# Notebook converted to a plain script: the deprecated `%pylab inline` magic
# is replaced with the explicit imports it used to provide (np, plt).
import numpy as np
import matplotlib.pyplot as plt

# Score function from slides.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# the replacement lives in sklearn.model_selection with a new KFold API.
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


def score(clf, X, Y, folds=2, verbose=False, metric=accuracy_score):
    """Cross-validated out-of-fold scoring.

    Fits `clf` on each training split and predicts the held-out split,
    accumulating one prediction per sample.

    Parameters
    ----------
    clf : estimator with fit/predict
    X, Y : indexable arrays of samples and labels
    folds : number of KFold splits (default 2)
    verbose : if True, print the per-fold metric
    metric : callable(y_true, y_pred) -> float, or None/falsy to get
        the raw (Y, predictions) pair back instead of a score

    Returns
    -------
    float if `metric` is given, otherwise the tuple (Y, predictions).
    """
    predictions = np.zeros(len(Y))
    # New API: KFold takes n_splits and yields indices via .split(X)
    # (the old KFold(len(X), n_folds=...) constructor was removed).
    splitter = KFold(n_splits=folds, shuffle=True)
    for i, (train, test) in enumerate(splitter.split(X)):
        clf.fit(X[train], Y[train])
        predictions[test] = clf.predict(X[test])
        if verbose:
            # BUG FIX: report with the caller-supplied metric instead of
            # hard-coded accuracy_score (fall back to accuracy when
            # metric is None so verbose mode still works).
            fold_metric = metric if metric else accuracy_score
            print("Fold {}: {}".format(i + 1,
                                       fold_metric(Y[test], predictions[test])))
    if metric:
        return metric(Y, predictions)
    return Y, predictions


from sklearn.datasets import fetch_20newsgroups

# Set to None to use all categories -- use 4 for speed purposes
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
ng_train = fetch_20newsgroups(subset='train', categories=categories,
                              remove=('headers', 'footers', 'quotes'))
ng_test = fetch_20newsgroups(subset='test', categories=categories,
                             remove=('headers', 'footers', 'quotes'))

# Notebook-style inspection expressions (no effect when run as a script).
ng_train.target_names
ng_train.target.shape
ng_train.data[0]

from sklearn.feature_extraction.text import CountVectorizer

# Learn ngrams of size 1,2,3; use the 20000 most common items as the vocabulary
counter = CountVectorizer(ngram_range=(1, 3), max_features=20000)

# Transformers also have a fit_transform method: it learns a transform, then
# returns that transform applied to the training data.
train_counts = counter.fit_transform(ng_train.data)

# After the vectorizer is fit, we can convert any posts to feature vectors
# using the vocabulary it learned.
test_counts = counter.transform(ng_test.data)
test_counts.shape

# All nonzero terms are integers
test_counts[2][test_counts[2].nonzero()]

# Example vocabulary.
# FIX: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
np.random.permutation(counter.get_feature_names_out())[:100]

from sklearn.feature_extraction.text import TfidfVectorizer

# Learn ngrams of size 1,2,3
tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=1, max_df=0.8, use_idf=True)
train_tfidf = tfidf.fit_transform(ng_train.data)
test_tfidf = tfidf.transform(ng_test.data)

# All nonzero terms are floats in tfidf
test_tfidf[2][test_tfidf[2].nonzero()]


def vis_proj(proj, targets=ng_train.target):
    """Scatter-plot a 2-D projection, colored by class label.

    `targets` defaults to the training labels captured at definition time.
    """
    plt.scatter(proj[:, 0], proj[:, 1], c=targets)
    plt.colorbar()
    plt.show()


from sklearn.decomposition import TruncatedSVD

# Truncated SVD is similar to PCA (and works directly on sparse matrices).
svd = TruncatedSVD(n_components=2)
proj = svd.fit_transform(train_tfidf)
vis_proj(proj)
proj = svd.fit_transform(train_counts)
vis_proj(proj)

from sklearn.pipeline import Pipeline
from sklearn import svm

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
    ('clf', svm.LinearSVC()),
])
pipeline.fit(ng_train.data, ng_train.target)
pred = pipeline.predict(ng_test.data)

from sklearn import metrics

# FIX: `print` statements (Python 2) converted to the Python 3 function form.
print(metrics.classification_report(ng_test.target, pred,
                                    target_names=ng_test.target_names))
print(metrics.confusion_matrix(ng_test.target, pred))