This demo is partially based on the scipy2013 sklearn presentations
%pylab inline
Populating the interactive namespace from numpy and matplotlib
# Score function from slides
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
def score(clf, X, Y, folds=2, verbose=False, metric=accuracy_score):
    """Evaluate *clf* with k-fold cross-validation, out-of-fold style.

    The classifier is re-fit on each training split and used to predict
    the held-out fold, so every sample in Y receives exactly one
    out-of-fold prediction.

    Parameters
    ----------
    clf : estimator with fit(X, y) / predict(X)
    X, Y : indexable arrays of the same length
    folds : int, number of CV folds (default 2)
    verbose : bool, if True print the per-fold score
    metric : callable(y_true, y_pred) or falsy; when falsy, the raw
        (Y, predictions) pair is returned instead of a score

    Returns
    -------
    metric(Y, predictions) when `metric` is truthy, else (Y, predictions).
    """
    predictions = np.zeros(len(Y))
    # NOTE(review): sklearn.cross_validation.KFold is the legacy API
    # (removed in modern sklearn in favor of model_selection.KFold).
    for i, (train, test) in enumerate(KFold(len(X), n_folds=folds, shuffle=True)):
        clf.fit(X[train], Y[train])
        predictions[test] = clf.predict(X[test])
        if verbose:
            # Bug fix: report with the configured `metric` rather than the
            # hard-coded accuracy_score, so verbose output matches the
            # final returned score (fall back to accuracy if metric is falsy).
            fold_score = (metric or accuracy_score)(Y[test], predictions[test])
            print("Fold {}: {}".format(i + 1, fold_score))
    if metric:
        return metric(Y, predictions)
    return Y, predictions
from sklearn.datasets import fetch_20newsgroups
We will use a standard text dataset built into sklearn.
# Restrict to 4 of the 20 newsgroups for speed; set to None to use all.
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]
# remove=('headers', 'footers', 'quotes') strips post headers, signature
# footers, and quoted reply text, leaving only the message body.
# fetch_20newsgroups downloads the corpus on first use.
ng_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
ng_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))
The 20ng dataset consists of posts from 20 topical forums. The goal is to classify a post as coming from one of the forums.
ng_train.target_names
['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
ng_train.target.shape
(2034,)
ng_train.data[0]
u"Hi,\n\nI've noticed that if you only save a model (with all your mapping planes\npositioned carefully) to a .3DS file that when you reload it after restarting\n3DS, they are given a default position and orientation. But if you save\nto a .PRJ file their positions/orientation are preserved. Does anyone\nknow why this information is not stored in the .3DS file? Nothing is\nexplicitly said in the manual about saving texture rules in the .PRJ file. \nI'd like to be able to read the texture rule information, does anyone have \nthe format for the .PRJ file?\n\nIs the .CEL file format available from somewhere?\n\nRych"
Text is not in a form suitable for classification algorithms. We have to convert each post to a vector of features.
The simplest approach is bag of words. In its simplest form, each dimension of the feature vector corresponds to a word ("car", "body", etc), and the value is the number of times that word appears in the post.
It is more common to use ngram based models, in which each dimension corresponds to an ngram (a sequence of n words: "I was wondering", "was wondering if"). Values still correspond to the number of times the ngram appears.
sklearn has several feature extractors to make this easy
from sklearn.feature_extraction.text import CountVectorizer
# Learn ngrams of size 1,2,3
# Use the 20000 most common items as the vocabulary
counter = CountVectorizer(ngram_range=(1,3), max_features=20000)
# Transformers also have a fit_transform method:
# this learns a transform, then returns the transform
# applied to the training data
train_counts = counter.fit_transform(ng_train.data)
# After the vectorizer is fit, we can convert
# any posts to feature vectors using the vocabulary it learned
test_counts = counter.transform(ng_test.data)
# Inspect the result: (n_test_documents, vocabulary_size)
test_counts.shape
(1353, 20000)
# All nonzero terms are integers (raw occurrence counts);
# rows are sparse, so we index out only the nonzero entries.
test_counts[2][test_counts[2].nonzero()]
matrix([[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 3, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1]])
# Example vocabulary: a random sample of 100 learned ngrams
np.random.permutation(counter.get_feature_names())[:100]
array([u'exist', u'fake', u'exists and', u'recognize that', u'but there are', u'something', u'difference in', u'interaction', u'polygon', u'the graphics', u'pushing', u'of pc', u'know if anyone', u'patience', u'617', u'is never', u'of ways', u'you mention', u'and source', u'in by', u'are available', u'fractal', u'the believer', u'designers', u'the charge', u'an application', u'with some', u'fear', u'nick', u'didn make', u'by david', u'made', u'pretend', u'conference', u'so now', u'question and', u'in any case', u'this the', u'press release', u'in terms of', u'an enormous', u'proprietary', u'new version of', u'spacecraft attitude', u'on something', u'office kjenks', u'for over', u'll find', u'answers to', u'to face', u'would', u'explosion', u'are on', u'typical', u'someone is', u'headline', u'allow me to', u'of the word', u'figure', u'where can ftp', u'to implement', u'from us', u'on what', u'terminal', u'war on', u'will no', u'kind of like', u'more than one', u'venus', u'sorry don', u'are distributed', u'the uk', u'far more', u'that uses', u'exclusive', u'of users', u'kennedy space', u'alter', u'battle', u'problem in', u'cost of', u'will this', u'best of', u'planning to', u'the item', u'the committee', u'run on', u'find it rather', u'nor', u'this leads', u'latter day', u'to save', u'no real', u'heaven but', u'of the art', u'and tried to', u'mu', u'in the field', u'to acknowledge', u'63'], dtype='<U29')
Simple word counts do not give any information about how discriminative a word is. "the" may appear many times in a post, but it is so common that it provides very little information.
TFIDF (term frequency--inverse document frequency) attempts to remedy this. The score of a word in a document increases as the number of times it appears in the document increases, but it decreases if the word is very common across the corpus. This means that words that appear in few posts will have a higher score, indicating more discriminative power.
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn ngrams of size 1,2,3
# max_df=0.8 ignores terms that appear in more than 80% of documents
# (near-stopwords); min_df=1 keeps everything else.
tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=1, max_df=0.8, use_idf=True)
train_tfidf = tfidf.fit_transform(ng_train.data)
test_tfidf = tfidf.transform(ng_test.data)
# All nonzero terms are floats in tfidf (weighted scores, not counts)
test_tfidf[2][test_tfidf[2].nonzero()]
matrix([[ 0.13752008, 0.04263576, 0.09513934, 0.05312323, 0.11578155, 0.06981124, 0.0684628 , 0.12162018, 0.07394826, 0.19182137, 0.06731552, 0.12549227, 0.0369442 , 0.07350893, 0.09440082, 0.10959236, 0.13752008, 0.1050397 , 0.11346445, 0.13048426, 0.08293576, 0.11142063, 0.12162018, 0.13048426, 0.20996222, 0.08273834, 0.11578155, 0.10255655, 0.06195591, 0.13752008, 0.12162018, 0.04388796, 0.13752008, 0.12549227, 0.05227341, 0.05481063, 0.13752008, 0.04412567, 0.11346445, 0.07245694, 0.06691244, 0.13752008, 0.11346445, 0.11578155, 0.07486176, 0.11346445, 0.05274717, 0.11578155, 0.11346445, 0.1079385 , 0.0761524 , 0.13048426, 0.13752008, 0.08785384, 0.04679391, 0.13048426, 0.11845645, 0.07329337, 0.05373694, 0.13752008, 0.13048426, 0.06248572, 0.13752008, 0.13048426, 0.13048426, 0.1581784 , 0.1486863 , 0.08940883, 0.04248994, 0.12162018, 0.06430122, 0.03734435, 0.13752008, 0.092357 , 0.08785384, 0.02855599, 0.10959236, 0.05769601, 0.08887487, 0.13048426, 0.08469011, 0.13752008, 0.17377913, 0.06073809, 0.04227349, 0.12549227, 0.08785384, 0.02416798, 0.05312323]])
We can use PCA to visualize the dataset in 2d
def vis_proj(proj, targets=ng_train.target):
    """Scatter-plot a 2-d projection `proj`, coloring points by `targets`.

    NOTE(review): the default `targets` is bound to ng_train.target at
    definition time; pass `targets` explicitly when plotting any other
    dataset, or the colors will be wrong.
    """
    plt.scatter(proj[:, 0], proj[:, 1], c=targets)
    plt.colorbar();plt.show()
from sklearn.decomposition import TruncatedSVD
# Truncated SVD is similar to PCA, but operates on sparse matrices
# directly (it does not center the data first)
svd = TruncatedSVD(n_components=2)
# Project the tfidf features down to 2d and visualize
proj = svd.fit_transform(train_tfidf)
vis_proj(proj)
# Same visualization for the raw count features, for comparison
proj = svd.fit_transform(train_counts)
vis_proj(proj)
from sklearn.pipeline import Pipeline
from sklearn import svm
# Chain feature extraction and classification into a single estimator:
# fit/predict on raw text runs the tfidf transform first, then the SVM.
pipeline = Pipeline([
('tfidf', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
('clf', svm.LinearSVC())
])
pipeline.fit(ng_train.data, ng_train.target)
Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, charset=None, charset_error=None, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=0.8, max_features=None, min_df=1, ngram_range=(1, 1), nor...ling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))])
# Predict straight from raw text: the pipeline vectorizes internally.
pred = pipeline.predict(ng_test.data)
from sklearn import metrics
# Consistency fix: use the print() function (as in score() above)
# instead of Python 2 print statements; output is unchanged and the
# code now also runs under Python 3.
print(metrics.classification_report(ng_test.target, pred,
                                    target_names=ng_test.target_names))
# Confusion matrix: rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(ng_test.target, pred))
precision recall f1-score support alt.atheism 0.69 0.62 0.65 319 comp.graphics 0.88 0.90 0.89 389 sci.space 0.79 0.89 0.83 394 talk.religion.misc 0.68 0.61 0.64 251 avg / total 0.77 0.78 0.77 1353 [[197 14 47 61] [ 7 352 25 5] [ 18 21 349 6] [ 63 12 23 153]]