from sklearn.naive_bayes import MultinomialNB from sklearn.feature_extraction.text import TfidfVectorizer from sklearn import metrics from operator import itemgetter from sklearn.metrics import classification_report import csv import os os.chdir('/Users/rweiss/Dropbox/presentations/MozFest2013/data/') #note that if you generated this from R, you will need to delete the row #"NYT_sample.Topic.Code","NYT_sample.Title" #from the top of the file. nyt = open('../data/nyt_title_data.csv') # check the structure of this file! nyt_data = [] nyt_labels = [] csv_reader = csv.reader(nyt) for line in csv_reader: nyt_labels.append(int(line[0])) nyt_data.append(line[1]) nyt.close() trainset_size = int(round(len(nyt_data)*0.75)) # i chose this threshold arbitrarily...to discuss print 'The training set size for this classifier is ' + str(trainset_size) + '\n' X_train = np.array([''.join(el) for el in nyt_data[0:trainset_size]]) y_train = np.array([el for el in nyt_labels[0:trainset_size]]) X_test = np.array([''.join(el) for el in nyt_data[trainset_size+1:len(nyt_data)]]) y_test = np.array([el for el in nyt_labels[trainset_size+1:len(nyt_labels)]]) #print(X_train) vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2') test_string = unicode(nyt_data[0]) print "Example string: " + test_string print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string) print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string)) print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string)) print "\n" X_train = vectorizer.fit_transform(X_train) X_test = vectorizer.transform(X_test) nb_classifier = MultinomialNB().fit(X_train, y_train) y_nb_predicted = nb_classifier.predict(X_test) print "MODEL: Multinomial Naive Bayes\n" print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted)) print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted)) print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted)) print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_nb_predicted)) print '\nHere is the classification report:' print classification_report(y_test, y_nb_predicted) #simple thing to do would be to up the n-grams to bigrams; try varying ngram_range from (1, 1) to (1, 2) #we could also modify the vectorizer to stem or lemmatize print '\nHere is the confusion matrix:' print metrics.confusion_matrix(y_test, y_nb_predicted, labels=unique(nyt_labels)) #What are the top N most predictive features per class? N = 10 vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary_.iteritems(), key=itemgetter(1))]) for i, label in enumerate(nyt_labels): if i == 7: # hack... break topN = np.argsort(nb_classifier.coef_[i])[-N:] print "\nThe top %d most informative features for topic code %s: \n%s" % (N, label, " ".join(vocabulary[topN])) #print topN from sklearn.svm import LinearSVC svm_classifier = LinearSVC().fit(X_train, y_train) y_svm_predicted = svm_classifier.predict(X_test) print "MODEL: Linear SVC\n" print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_svm_predicted)) print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_svm_predicted)) print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_svm_predicted)) print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_svm_predicted)) print '\nHere is the classification report:' print classification_report(y_test, y_svm_predicted) #simple thing to do would be to up the n-grams to bigrams; try varying ngram_range from (1, 1) to (1, 2) #we could also modify the vectorizer to stem or lemmatize print '\nHere is the confusion matrix:' print metrics.confusion_matrix(y_test, y_svm_predicted, labels=unique(nyt_labels)) #What are the top N most predictive features per class? N = 10 vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary_.iteritems(), key=itemgetter(1))]) for i, label in enumerate(nyt_labels): if i == 7: # hack... break topN = np.argsort(svm_classifier.coef_[i])[-N:] print "\nThe top %d most informative features for topic code %s: \n%s" % (N, label, " ".join(vocabulary[topN])) #print topN from sklearn.linear_model import LogisticRegression maxent_classifier = LogisticRegression().fit(X_train, y_train) y_maxent_predicted = maxent_classifier.predict(X_test) print "MODEL: Maximum Entropy\n" print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_maxent_predicted)) print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_maxent_predicted)) print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_maxent_predicted)) print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_maxent_predicted)) print '\nHere is the classification report:' print classification_report(y_test, y_maxent_predicted) #simple thing to do would be to up the n-grams to bigrams; try varying ngram_range from (1, 1) to (1, 2) #we could also modify the vectorizer to stem or lemmatize print '\nHere is the confusion matrix:' print metrics.confusion_matrix(y_test, y_maxent_predicted, labels=unique(nyt_labels)) #What are the top N most predictive features per class? N = 10 vocabulary = np.array([t for t, i in sorted(vectorizer.vocabulary_.iteritems(), key=itemgetter(1))]) for i, label in enumerate(nyt_labels): if i == 7: # hack... break topN = np.argsort(maxent_classifier.coef_[i])[-N:] print "\nThe top %d most informative features for topic code %s: \n%s" % (N, label, " ".join(vocabulary[topN])) #print topN from gensim import corpora, models, similarities from itertools import chain import nltk from nltk.corpus import stopwords from operator import itemgetter import re url_pattern = r'https?:\/\/(.*[\r\n]*)+' documents = [nltk.clean_html(document) for document in nyt_data] stoplist = stopwords.words('english') texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents] dictionary = corpora.Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] tfidf = models.TfidfModel(corpus) corpus_tfidf = tfidf[corpus] #lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100) #lsi.print_topics(20) n_topics = 60 lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics) for i in range(0, n_topics): temp = lda.show_topic(i, 10) terms = [] for term in temp: terms.append(term[1]) print "Top 10 terms for topic #" + str(i) + ": "+ ", ".join(terms) print print 'Which LDA topic maximally describes a document?\n' print 'Original document: ' + documents[1] print 'Preprocessed document: ' + str(texts[1]) print 'Matrix Market format: ' + str(corpus[1]) print 'Topic probability mixture: ' + str(lda[corpus[1]]) print 'Maximally probable topic: topic #' + str(max(lda[corpus[1]],key=itemgetter(1))[0])