%pylab inline
Populating the interactive namespace from numpy and matplotlib
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import naive_bayes
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)
critics = pd.read_csv('./data/rt_critics.csv')
critics.head()
| | critic | fresh | imdb | publication | quote | review_date | rtid | title |
|---|---|---|---|---|---|---|---|---|
| 0 | Derek Adams | fresh | 114709 | Time Out | So ingenious in concept, design and execution ... | 2009-10-04 | 9559 | Toy story |
| 1 | Richard Corliss | fresh | 114709 | TIME Magazine | The year's most inventive comedy. | 2008-08-31 | 9559 | Toy story |
| 2 | David Ansen | fresh | 114709 | Newsweek | A winning animated feature that has something ... | 2008-08-18 | 9559 | Toy story |
| 3 | Leonard Klady | fresh | 114709 | Variety | The film sports a provocative and appealing st... | 2008-06-09 | 9559 | Toy story |
| 4 | Jonathan Rosenbaum | fresh | 114709 | Chicago Reader | An entertaining computer-generated, hyperreali... | 2008-03-10 | 9559 | Toy story |
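Before building features it can help to check how balanced the two classes are and whether any quotes are missing. A minimal sketch (assuming the `fresh` column holds labels like 'fresh' and 'rotten' as in the rows above; the exact counts depend on the dataset snapshot):

# Class balance of the review labels and count of missing quotes
print critics.fresh.value_counts()
print critics.quote.isnull().sum()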
from sklearn.feature_extraction.text import CountVectorizer
text = ['Math is great', 'Math is really great', 'Exciting exciting Math']
vectorizer = CountVectorizer(ngram_range = (1,2))
vectorizer.fit(text)
print vectorizer.get_feature_names()
x = vectorizer.transform(text)
[u'exciting', u'exciting exciting', u'exciting math', u'great', u'is', u'is great', u'is really', u'math', u'math is', u'really', u'really great']
print x
  (0, 3)    1
  (0, 4)    1
  (0, 5)    1
  (0, 7)    1
  (0, 8)    1
  (1, 3)    1
  (1, 4)    1
  (1, 6)    1
  (1, 7)    1
  (1, 8)    1
  (1, 9)    1
  (1, 10)   1
  (2, 0)    2
  (2, 1)    1
  (2, 2)    1
  (2, 7)    1
print 'Matrix'
# Convert the sparse matrix to a dense numpy array for display
x_back = x.toarray()
print x_back
Matrix
[[0 0 0 1 1 1 0 1 1 0 0]
 [0 0 0 1 1 0 1 1 1 1 1]
 [2 1 1 0 0 0 0 1 0 0 0]]
pd.DataFrame(x_back, columns=vectorizer.get_feature_names())
| | exciting | exciting exciting | exciting math | great | is | is great | is really | math | math is | really | really great |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 |
| 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 |
| 2 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
rotten_vectorizer = vectorizer.fit(critics.quote)
# Document-term matrix of 1- and 2-gram counts for every quote (no need to refit)
x = rotten_vectorizer.transform(critics.quote)
# Target: 1 for a 'fresh' review, 0 otherwise
y = (critics.fresh == 'fresh').values.astype(int)
rotten_vectorizer
CountVectorizer(analyzer=u'word', binary=False, charset=None, charset_error=None, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 2), preprocessor=None, stop_words=None, strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)
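A quick sanity check on the fitted vectorizer and the resulting matrix (a sketch; the exact numbers depend on the dataset snapshot):

# Shape of the document-term matrix and fraction of non-zero entries
print x.shape
print len(rotten_vectorizer.get_feature_names())
print x.nnz / float(x.shape[0] * x.shape[1])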
def train_and_measure(classifier, x, y, tsize):
    """
    Fit an sklearn classifier on a random train/test split and report accuracy.
    classifier: an sklearn estimator
    x: a matrix of features
    y: a vector of targets
    tsize: fraction of the data held out for testing
    """
    from sklearn import cross_validation
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(x, y, test_size=tsize, random_state=1234)
    clf = classifier.fit(xtrain, ytrain)
    # Accuracy on the training and test sets
    training_accuracy = clf.score(xtrain, ytrain)
    test_accuracy = clf.score(xtest, ytest)
    print classifier
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data: %0.2f" % (test_accuracy)
train_and_measure(naive_bayes.MultinomialNB(), x, y, 0.3)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy on training data: 0.99
Accuracy on test data: 0.76
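The gap between training accuracy (0.99) and test accuracy (0.76) suggests the model is memorizing the large 1-2 gram vocabulary. One knob to try is the Laplace smoothing parameter `alpha`; a minimal sweep (the values below are arbitrary, not tuned):

# Larger alpha shrinks the per-word probability estimates toward uniform
for alpha in [0.1, 1.0, 5.0, 10.0]:
    train_and_measure(naive_bayes.MultinomialNB(alpha=alpha), x, y, 0.3)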
def train_and_measure_kfold(classifier, x, y, nfolds):
    """
    Fit an sklearn classifier with k-fold cross-validation and report test accuracy.
    classifier: an sklearn estimator
    x: a matrix of features
    y: a vector of targets
    nfolds: number of cross-validation folds
    """
    from sklearn import cross_validation
    kfold = cross_validation.KFold(n=x.shape[0], n_folds=nfolds, shuffle=True, random_state=1234)
    train_acc = []
    test_acc = []
    for train_index, test_index in kfold:
        clf = classifier.fit(x[train_index], y[train_index])
        train_acc.append(clf.score(x[train_index], y[train_index]))
        test_acc.append(clf.score(x[test_index], y[test_index]))
    # Mean and standard deviation of test accuracy across the folds
    print np.array(test_acc).mean()
    print np.array(test_acc).std()
    # Visualize a normal approximation to the fold-to-fold accuracy distribution
    plt.figure()
    sns.kdeplot(np.random.normal(loc=np.array(test_acc).mean(), scale=np.array(test_acc).std(), size=10000), shade=True)
train_and_measure_kfold(naive_bayes.MultinomialNB(), x, y, 5)
0.782121819222
0.0102653931697
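For reference, sklearn can produce the same kind of k-fold accuracy estimate in one call. A sketch using `cross_val_score` (same number of folds, but without the fixed shuffle seed used above, so the numbers will differ slightly):

from sklearn import cross_validation
# Mean and std of test accuracy across 5 cross-validation folds
scores = cross_validation.cross_val_score(naive_bayes.MultinomialNB(), x, y, cv=5)
print scores.mean(), scores.std()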
from sklearn import linear_model
train_and_measure_kfold(linear_model.LogisticRegression(), x, y, 5)
0.775228450491
0.0113577230339
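Logistic regression also makes it easy to see which n-grams drive the prediction. A sketch that refits on all of the data and prints the features with the most negative and most positive coefficients (the specific words will vary with the dataset):

# Rank vocabulary terms by their logistic regression coefficient
clf = linear_model.LogisticRegression().fit(x, y)
words = np.array(rotten_vectorizer.get_feature_names())
order = np.argsort(clf.coef_[0])
print 'most "rotten":', words[order[:10]]
print 'most "fresh":', words[order[-10:]]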
# Keep only n-grams that appear more than once in a quote; BernoulliNB then treats the result as binary features
x_ones = (x > 1)
train_and_measure_kfold(naive_bayes.BernoulliNB(), x_ones, y, 5)
0.612066993124
0.0101504812805
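Note that `x > 1` keeps only n-grams that occur more than once in a single quote, which discards most of the signal in short reviews; the usual input for BernoulliNB is simple presence/absence. A sketch comparing against the presence/absence threshold (accuracy will differ from the run above):

# Presence/absence features: 1 if the n-gram occurs at all in the quote
x_binary = (x > 0)
train_and_measure_kfold(naive_bayes.BernoulliNB(), x_binary, y, 5)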