%pylab inline
Populating the interactive namespace from numpy and matplotlib
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import naive_bayes
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)
critics = pd.read_csv('./data/rt_critics.csv')
critics.head()
| | critic | fresh | imdb | publication | quote | review_date | rtid | title |
|---|---|---|---|---|---|---|---|---|
| 0 | Derek Adams | fresh | 114709 | Time Out | So ingenious in concept, design and execution ... | 2009-10-04 | 9559 | Toy story |
| 1 | Richard Corliss | fresh | 114709 | TIME Magazine | The year's most inventive comedy. | 2008-08-31 | 9559 | Toy story |
| 2 | David Ansen | fresh | 114709 | Newsweek | A winning animated feature that has something ... | 2008-08-18 | 9559 | Toy story |
| 3 | Leonard Klady | fresh | 114709 | Variety | The film sports a provocative and appealing st... | 2008-06-09 | 9559 | Toy story |
| 4 | Jonathan Rosenbaum | fresh | 114709 | Chicago Reader | An entertaining computer-generated, hyperreali... | 2008-03-10 | 9559 | Toy story |
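Before building features it can help to check how balanced the two classes are and whether any quotes are missing. A minimal sketch (assuming the `fresh` column holds labels like 'fresh' and 'rotten' as in the rows above; the exact counts depend on the dataset snapshot):

# Class balance of the review labels and count of missing quotes
print critics.fresh.value_counts()
print critics.quote.isnull().sum()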
from sklearn.feature_extraction.text import CountVectorizer
text = ['Math is great', 'Math is really great', 'Exciting exciting Math']
vectorizer = CountVectorizer(ngram_range = (1,2))
vectorizer.fit(text)
print vectorizer.get_feature_names()
x = vectorizer.transform(text)
[u'exciting', u'exciting exciting', u'exciting math', u'great', u'is', u'is great', u'is really', u'math', u'math is', u'really', u'really great']
print x
  (0, 3)    1
  (0, 4)    1
  (0, 5)    1
  (0, 7)    1
  (0, 8)    1
  (1, 3)    1
  (1, 4)    1
  (1, 6)    1
  (1, 7)    1
  (1, 8)    1
  (1, 9)    1
  (1, 10)   1
  (2, 0)    2
  (2, 1)    1
  (2, 2)    1
  (2, 7)    1
print 'Matrix'
# Convert the sparse matrix to a dense numpy array for display
x_back = x.toarray()
print x_back
Matrix
[[0 0 0 1 1 1 0 1 1 0 0]
 [0 0 0 1 1 0 1 1 1 1 1]
 [2 1 1 0 0 0 0 1 0 0 0]]
pd.DataFrame(x_back, columns=vectorizer.get_feature_names())
| | exciting | exciting exciting | exciting math | great | is | is great | is really | math | math is | really | really great |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 |
| 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 |
| 2 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
rotten_vectorizer = vectorizer.fit(critics.quote)
# Document-term matrix of 1- and 2-gram counts for every quote (no need to refit)
x = rotten_vectorizer.transform(critics.quote)
# Target: 1 for a 'fresh' review, 0 otherwise
y = (critics.fresh == 'fresh').values.astype(int)
rotten_vectorizer
CountVectorizer(analyzer=u'word', binary=False, charset=None, charset_error=None, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 2), preprocessor=None, stop_words=None, strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)
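A quick sanity check on the fitted vectorizer and the resulting matrix (a sketch; the exact numbers depend on the dataset snapshot):

# Shape of the document-term matrix and fraction of non-zero entries
print x.shape
print len(rotten_vectorizer.get_feature_names())
print x.nnz / float(x.shape[0] * x.shape[1])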
def train_and_measure(classifier, x, y, tsize):
    """
    Fit an sklearn classifier on a random train/test split and report accuracy.
    classifier: an sklearn estimator
    x: a matrix of features
    y: a vector of targets
    tsize: fraction of the data held out for testing
    """
    from sklearn import cross_validation
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(x, y, test_size=tsize, random_state=1234)
    clf = classifier.fit(xtrain, ytrain)
    # Accuracy on the training and test sets
    training_accuracy = clf.score(xtrain, ytrain)
    test_accuracy = clf.score(xtest, ytest)
    print classifier
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data: %0.2f" % (test_accuracy)
train_and_measure(naive_bayes.MultinomialNB(), x, y, 0.3)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy on training data: 0.99
Accuracy on test data: 0.76
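The gap between training accuracy (0.99) and test accuracy (0.76) suggests the model is memorizing the large 1-2 gram vocabulary. One knob to try is the Laplace smoothing parameter `alpha`; a minimal sweep (the values below are arbitrary, not tuned):

# Larger alpha shrinks the per-word probability estimates toward uniform
for alpha in [0.1, 1.0, 5.0, 10.0]:
    train_and_measure(naive_bayes.MultinomialNB(alpha=alpha), x, y, 0.3)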
def train_and_measure_kfold(classifier, x, y, nfolds):
    """
    Fit an sklearn classifier with k-fold cross-validation and report test accuracy.
    classifier: an sklearn estimator
    x: a matrix of features
    y: a vector of targets
    nfolds: number of cross-validation folds
    """
    from sklearn import cross_validation
    kfold = cross_validation.KFold(n=x.shape[0], n_folds=nfolds, shuffle=True, random_state=1234)
    train_acc = []
    test_acc = []
    for train_index, test_index in kfold:
        clf = classifier.fit(x[train_index], y[train_index])
        train_acc.append(clf.score(x[train_index], y[train_index]))
        test_acc.append(clf.score(x[test_index], y[test_index]))
    # Mean and standard deviation of test accuracy across the folds
    print np.array(test_acc).mean()
    print np.array(test_acc).std()
    # Visualize a normal approximation to the fold-to-fold accuracy distribution
    plt.figure()
    sns.kdeplot(np.random.normal(loc=np.array(test_acc).mean(), scale=np.array(test_acc).std(), size=10000), shade=True)
train_and_measure_kfold(naive_bayes.MultinomialNB(), x, y, 5)
0.782121819222
0.0102653931697
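For reference, sklearn can produce the same kind of k-fold accuracy estimate in one call. A sketch using `cross_val_score` (same number of folds, but without the fixed shuffle seed used above, so the numbers will differ slightly):

from sklearn import cross_validation
# Mean and std of test accuracy across 5 cross-validation folds
scores = cross_validation.cross_val_score(naive_bayes.MultinomialNB(), x, y, cv=5)
print scores.mean(), scores.std()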
from sklearn import linear_model
train_and_measure_kfold(linear_model.LogisticRegression(), x, y, 5)
0.775228450491
0.0113577230339
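Logistic regression also makes it easy to see which n-grams drive the prediction. A sketch that refits on all of the data and prints the features with the most negative and most positive coefficients (the specific words will vary with the dataset):

# Rank vocabulary terms by their logistic regression coefficient
clf = linear_model.LogisticRegression().fit(x, y)
words = np.array(rotten_vectorizer.get_feature_names())
order = np.argsort(clf.coef_[0])
print 'most "rotten":', words[order[:10]]
print 'most "fresh":', words[order[-10:]]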
# Keep only n-grams that appear more than once in a quote; BernoulliNB then treats the result as binary features
x_ones = (x > 1)
train_and_measure_kfold(naive_bayes.BernoulliNB(), x_ones, y, 5)
0.612066993124
0.0101504812805
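Note that `x > 1` keeps only n-grams that occur more than once in a single quote, which discards most of the signal in short reviews; the usual input for BernoulliNB is simple presence/absence. A sketch comparing against the presence/absence threshold (accuracy will differ from the run above):

# Presence/absence features: 1 if the n-gram occurs at all in the quote
x_binary = (x > 0)
train_and_measure_kfold(naive_bayes.BernoulliNB(), x_binary, y, 5)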