%pylab inline from __future__ import division from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import BernoulliNB from sklearn.cross_validation import cross_val_score from sklearn.cross_validation import train_test_split from speech import Speech phrase = "abortion" num_speeches = Speech.get(0, 0, phrase=phrase, congress="", start_date="1995-05-04", speaker_party="*")['count'] print "Downloading %i speeches" % num_speeches speeches = Speech.get(start=0, rows=num_speeches, phrase=phrase, speaker_party="*")['speeches'] print len(speeches), "speeches downloaded" naive_bayes = MultinomialNB(alpha=1.0, fit_prior=True) vectorizer = TfidfVectorizer(min_df=.1, max_df=.6, stop_words='english' ) # Make an array of text objects. Each chunk of text is just the text of a congressional speech. data = [" ".join(speech['speaking']) for speech in speeches] # Transform speeches into vectors data = vectorizer.fit_transform(data) # Make an array of 0s and 1s that determine if each speech is democrat or republican. This is called the target vector. 
# Label encoding: 0 for Democrats ("D"), 1 for every other party value.
# NOTE(review): the download asks for speaker_party="*", so class 1 may hold
# more than just Republicans -- confirm, or filter to "D"/"R" upstream.
target = [0 if speech['speaker_party'] == "D" else 1 for speech in speeches]

# Notebook inspection of the feature matrix shape and the labels.
data.shape
target

# Hold out 20% of the speeches for a quick accuracy check.
X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.2)
print("%s %s %d %d" % (X_train.shape, X_test.shape, len(Y_train), len(Y_test)))

naive_bayes.fit(X_train, Y_train)
naive_bayes.score(X_test, Y_test)

# 5-fold cross-validated accuracy over the full data set.
cross_val_score(naive_bayes, data, target, scoring='accuracy', verbose=1, cv=5)

# Newer scikit-learn only offers get_feature_names_out(); older releases only
# offer get_feature_names().
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Per-term score: the class-conditional log probability scaled by the share of
# training speeches in that class.
# NOTE(review): scaling a log probability by the class share does not change
# the ranking within a class; adding class_log_prior_ may be what was intended.
class_share = naive_bayes.class_count_ / naive_bayes.class_count_.sum()
t1 = list(naive_bayes.feature_log_prob_[0] * class_share[0])
t2 = list(naive_bayes.feature_log_prob_[1] * class_share[1])

# Highest score first, so the most characteristic terms lead each list.
# Class 0 is "D" under the encoding above, so t1 ranks Democrat terms.
[(terms[i], t1[i]) for i in np.argsort(t1)[::-1]]  # Top terms for Democrats (class 0)
[(terms[i], t2[i]) for i in np.argsort(t2)[::-1]]  # Top terms for Republicans / non-Democrats (class 1)