%pylab inline
Populating the interactive namespace from numpy and matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import naive_bayes
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)
critics = pd.read_csv('./data/rt_critics.csv')
critics.head()
| | critic | fresh | imdb | publication | quote | review_date | rtid | title |
|---|---|---|---|---|---|---|---|---|
| 0 | Derek Adams | fresh | 114709 | Time Out | So ingenious in concept, design and execution ... | 2009-10-04 | 9559 | Toy story |
| 1 | Richard Corliss | fresh | 114709 | TIME Magazine | The year's most inventive comedy. | 2008-08-31 | 9559 | Toy story |
| 2 | David Ansen | fresh | 114709 | Newsweek | A winning animated feature that has something ... | 2008-08-18 | 9559 | Toy story |
| 3 | Leonard Klady | fresh | 114709 | Variety | The film sports a provocative and appealing st... | 2008-06-09 | 9559 | Toy story |
| 4 | Jonathan Rosenbaum | fresh | 114709 | Chicago Reader | An entertaining computer-generated, hyperreali... | 2008-03-10 | 9559 | Toy story |
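Before building features, it helps to check the label balance, since the target below will be fresh vs. everything else (a quick sanity check added here, not part of the original run):

# Class balance of the review labels
print critics.fresh.value_counts()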
# Helper function: fit the model with k-fold cross-validation and print the accuracy scores
def train_and_measure_kfold(classifier, x, y, nfolds):
    """
    Fits an sklearn classifier on k-fold train/test splits and reports
    the mean and standard deviation of the test-fold accuracies.
    classifier : an sklearn estimator
    x : a dataframe of features
    y : a vector of targets
    nfolds : number of folds
    """
    from sklearn import cross_validation
    kfold = cross_validation.KFold(n=x.shape[0], n_folds=nfolds, shuffle=True, random_state=1234)
    train_acc = []
    test_acc = []
    for train_index, test_index in kfold:
        clf = classifier.fit(x.iloc[train_index], y[train_index])
        train_acc.append(clf.score(x.iloc[train_index], y[train_index]))
        test_acc.append(clf.score(x.iloc[test_index], y[test_index]))
    print '----------------------------------------------------------------'
    print classifier
    print "Mean of accuracy on test data: %0.2f" % np.array(test_acc).mean()
    print "Std of accuracy on test data: %0.2f" % np.array(test_acc).std()
    print
    # Visualize the accuracy estimate as a normal with the observed mean and std
    plt.figure()
    sns.kdeplot(np.random.normal(loc=np.array(test_acc).mean(), scale=np.array(test_acc).std(), size=10000), shade=True)
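As a quick smoke test, the helper can be exercised on a small synthetic problem before the real feature matrices are built (a minimal sketch; the toy data below is made up for illustration):

# Hypothetical smoke test for train_and_measure_kfold on synthetic count data
toy_x = pd.DataFrame(np.random.randint(0, 3, size=(200, 5)), columns=['f%d' % i for i in range(5)])
toy_y = (toy_x.sum(axis=1) > 5).astype(int).values
train_and_measure_kfold(naive_bayes.MultinomialNB(), toy_x, toy_y, 5)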
## Helper function: takes in a string and returns it with stopwords and bare digits removed
def remove_stop_words(text):
    from nltk.corpus import stopwords
    stop = set(stopwords.words('english'))
    return ' '.join([w for w in text.split() if w not in stop and not w.isdigit()])
critics['quote_sw'] = critics.quote.apply(remove_stop_words)
print critics.loc[0, 'quote_sw']
print critics.loc[0, 'quote']
So ingenious concept, design execution could watch postage stamp-sized screen still engulfed charm.
So ingenious in concept, design and execution that you could watch it on a postage stamp-sized screen and still be engulfed by its charm.
# Generate the feature matrix (unigram + bigram counts) and the binary target
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2))
x = vectorizer.fit_transform(critics.quote_sw)
y = (critics.fresh == 'fresh').values.astype(int)
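To see what ngram_range=(1, 2) adds, a tiny vectorizer on two short strings shows both unigrams and bigrams landing in the vocabulary (illustrative strings only):

# Illustration: the (1, 2) range emits unigrams and bigrams
demo = CountVectorizer(ngram_range=(1, 2))
demo.fit(['inventive comedy', 'winning animated feature'])
print demo.get_feature_names()
# ['animated', 'animated feature', 'comedy', 'feature', 'inventive', 'inventive comedy', 'winning', 'winning animated']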
# Convert the sparse matrix back to a dense array to run per-column feature selection
x_back = x.toarray()
x_df = pd.DataFrame(x_back, columns=vectorizer.get_feature_names())
x_columns = list(x_df.columns)
print x_df.shape
(14072, 155299)
# Feature selection: keep terms whose counts differ significantly between classes (ANOVA F-test)
from sklearn import feature_selection as f_select
significant_features = []
pvals = []
for feature in x_columns:
    pval = f_select.f_classif(x_df[feature], y)
    if pval[1][0] < 0.05:
        significant_features.append(feature)
        pvals.append(pval[1][0])
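Looping over 155k columns one at a time is slow, and it required densifying the matrix; f_classif also accepts the whole (sparse) matrix in one call, so an equivalent version of the same 0.05 cutoff would look roughly like this (a sketch):

# Vectorized alternative: run the ANOVA F-test on every column at once (works on the sparse matrix directly)
F, p = f_select.f_classif(x, y)
significant_features = [c for c, pv in zip(x_columns, p) if pv < 0.05]
pvals = [pv for pv in p if pv < 0.05]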
print len(significant_features)
1954
from sklearn import naive_bayes, linear_model
new_features = x_df[significant_features]
## Test MultinomialNB()
train_and_measure_kfold(naive_bayes.MultinomialNB(), new_features, y, 5)
## Test BernoulliNB()
x_ones = (new_features > 1)  # BernoulliNB wants binary (0/1) features; this marks terms appearing more than once
train_and_measure_kfold(naive_bayes.BernoulliNB(), x_ones, y, 5)
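Alternatively, BernoulliNB can threshold the raw counts itself through its binarize parameter; binarize=1.0 reproduces the > 1 cut above, while the default of 0.0 would count any occurrence (a sketch of the equivalent call):

# Equivalent: let BernoulliNB binarize the raw counts (features > binarize map to 1)
train_and_measure_kfold(naive_bayes.BernoulliNB(binarize=1.0), new_features, y, 5)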
## Test LogisticRegression()
train_and_measure_kfold(linear_model.LogisticRegression(), new_features, y, 5)
----------------------------------------------------------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Mean of accuracy on test data: 0.83
Std of accuracy on test data: 0.01

----------------------------------------------------------------
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Mean of accuracy on test data: 0.61
Std of accuracy on test data: 0.01

----------------------------------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
Mean of accuracy on test data: 0.81
Std of accuracy on test data: 0.01
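To peek at which terms drive the decision, one can refit MultinomialNB on all rows and rank terms by the gap between the two per-class log probabilities (a sketch; feature_log_prob_ holds the fitted log P(term | class)):

# Rank significant terms by how strongly they lean fresh vs. rotten
clf = naive_bayes.MultinomialNB().fit(new_features, y)
log_odds = clf.feature_log_prob_[1] - clf.feature_log_prob_[0]
ranked = sorted(zip(log_odds, significant_features))
print ranked[:10]    # most rotten-leaning terms
print ranked[-10:]   # most fresh-leaning terms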
# Helper function: keep only the adjectives (JJ, JJR, JJS) and gerunds (VBG) in a quote
def get_adj(text):
    import nltk
    text_token = nltk.word_tokenize(text)
    tagger = nltk.pos_tag(text_token)
    bag_of_words = [j[0] for j in tagger if j[1] in ('JJ', 'JJR', 'JJS', 'VBG')]
    return ' '.join(list(set(bag_of_words))) if bag_of_words else ''
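For reference, nltk.pos_tag returns (token, tag) pairs, so the filter keeps words tagged as adjectives or gerunds; on the second quote above it looks roughly like this (exact tags depend on the tagger model shipped with your NLTK):

# Illustration of the (token, tag) pairs the filter operates on
import nltk
print nltk.pos_tag(nltk.word_tokenize("The year's most inventive comedy."))
# roughly [('The', 'DT'), ('year', 'NN'), ("'s", 'POS'), ('most', 'JJS'), ('inventive', 'JJ'), ('comedy', 'NN'), ('.', '.')]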
critics['pos'] = critics.quote.apply(get_adj)
critics['pos'].head()
0                     ingenious stamp-sized
1                            most inventive
2
3     technical appealing provocative equal
4                        computer-generated
Name: pos, dtype: object
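Note that row 2 came back empty: a quote with no adjectives or gerunds contributes an empty document to the vectorizer. A quick count of how often that happens (an added check, not in the original run):

# How many quotes lose every token under the POS filter?
print (critics['pos'] == '').sum()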
# Generate the feature matrix (unigram counts over the POS-filtered quotes) and the binary target
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 1))
x = vectorizer.fit_transform(critics.pos)
y = (critics.fresh == 'fresh').values.astype(int)
# Convert the sparse matrix back to a dense array to run per-column feature selection
x_back = x.toarray()
x_df = pd.DataFrame(x_back, columns=vectorizer.get_feature_names())
x_columns = list(x_df.columns)
print x_df.shape
(14072, 6040)
# Feature selection: keep terms whose counts differ significantly between classes (ANOVA F-test)
from sklearn import feature_selection as f_select
significant_features = []
pvals = []
for feature in x_columns:
    pval = f_select.f_classif(x_df[feature], y)
    if pval[1][0] < 0.05:
        significant_features.append(feature)
        pvals.append(pval[1][0])
print len(significant_features)
390
from sklearn import naive_bayes, linear_model
new_features = x_df[significant_features]
## Test MultinomialNB()
train_and_measure_kfold(naive_bayes.MultinomialNB(), new_features, y, 5)
## Test BernoulliNB()
x_ones = (new_features > 1)  # BernoulliNB wants binary (0/1) features; this marks terms appearing more than once
train_and_measure_kfold(naive_bayes.BernoulliNB(), x_ones, y, 5)
## Test LogisticRegression()
train_and_measure_kfold(linear_model.LogisticRegression(), new_features, y, 5)
----------------------------------------------------------------
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Mean of accuracy on test data: 0.70
Std of accuracy on test data: 0.01

----------------------------------------------------------------
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Mean of accuracy on test data: 0.61
Std of accuracy on test data: 0.01

----------------------------------------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
Mean of accuracy on test data: 0.69
Std of accuracy on test data: 0.01
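A useful yardstick for all of these numbers is the majority-class baseline: a classifier that always predicts the most common label already scores the corresponding fraction, which puts the weaker BernoulliNB runs in context (an added check, not in the original run):

# Majority-class baseline accuracy
print max(y.mean(), 1 - y.mean())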