# Notebook converted to a plain script: the deprecated `%pylab inline` magic
# is replaced with the explicit imports it used to provide (np, plt).
import numpy as np
import matplotlib.pyplot as plt

# Score function from slides.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# the replacement lives in sklearn.model_selection with a new KFold API.
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


def score(clf, X, Y, folds=2, verbose=False, metric=accuracy_score):
    """Cross-validated out-of-fold scoring.

    Fits `clf` on each training split and predicts the held-out split,
    accumulating one prediction per sample.

    Parameters
    ----------
    clf : estimator with fit/predict
    X, Y : indexable arrays of samples and labels
    folds : number of KFold splits (default 2)
    verbose : if True, print the per-fold metric
    metric : callable(y_true, y_pred) -> float, or None/falsy to get
        the raw (Y, predictions) pair back instead of a score

    Returns
    -------
    float if `metric` is given, otherwise the tuple (Y, predictions).
    """
    predictions = np.zeros(len(Y))
    # New API: KFold takes n_splits and yields indices via .split(X)
    # (the old KFold(len(X), n_folds=...) constructor was removed).
    splitter = KFold(n_splits=folds, shuffle=True)
    for i, (train, test) in enumerate(splitter.split(X)):
        clf.fit(X[train], Y[train])
        predictions[test] = clf.predict(X[test])
        if verbose:
            # BUG FIX: report with the caller-supplied metric instead of
            # hard-coded accuracy_score (fall back to accuracy when
            # metric is None so verbose mode still works).
            fold_metric = metric if metric else accuracy_score
            print("Fold {}: {}".format(i + 1,
                                       fold_metric(Y[test], predictions[test])))
    if metric:
        return metric(Y, predictions)
    return Y, predictions


from sklearn.datasets import fetch_20newsgroups

# Set to None to use all categories -- use 4 for speed purposes
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
ng_train = fetch_20newsgroups(subset='train', categories=categories,
                              remove=('headers', 'footers', 'quotes'))
ng_test = fetch_20newsgroups(subset='test', categories=categories,
                             remove=('headers', 'footers', 'quotes'))

# Notebook-style inspection expressions (no effect when run as a script).
ng_train.target_names
ng_train.target.shape
ng_train.data[0]

from sklearn.feature_extraction.text import CountVectorizer

# Learn ngrams of size 1,2,3; use the 20000 most common items as the vocabulary
counter = CountVectorizer(ngram_range=(1, 3), max_features=20000)

# Transformers also have a fit_transform method: it learns a transform, then
# returns that transform applied to the training data.
train_counts = counter.fit_transform(ng_train.data)

# After the vectorizer is fit, we can convert any posts to feature vectors
# using the vocabulary it learned.
test_counts = counter.transform(ng_test.data)
test_counts.shape

# All nonzero terms are integers
test_counts[2][test_counts[2].nonzero()]

# Example vocabulary.
# FIX: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
np.random.permutation(counter.get_feature_names_out())[:100]

from sklearn.feature_extraction.text import TfidfVectorizer

# Learn ngrams of size 1,2,3
tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=1, max_df=0.8, use_idf=True)
train_tfidf = tfidf.fit_transform(ng_train.data)
test_tfidf = tfidf.transform(ng_test.data)

# All nonzero terms are floats in tfidf
test_tfidf[2][test_tfidf[2].nonzero()]


def vis_proj(proj, targets=ng_train.target):
    """Scatter-plot a 2-D projection, colored by class label.

    `targets` defaults to the training labels captured at definition time.
    """
    plt.scatter(proj[:, 0], proj[:, 1], c=targets)
    plt.colorbar()
    plt.show()


from sklearn.decomposition import TruncatedSVD

# Truncated SVD is similar to PCA (and works directly on sparse matrices).
svd = TruncatedSVD(n_components=2)
proj = svd.fit_transform(train_tfidf)
vis_proj(proj)
proj = svd.fit_transform(train_counts)
vis_proj(proj)

from sklearn.pipeline import Pipeline
from sklearn import svm

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
    ('clf', svm.LinearSVC()),
])
pipeline.fit(ng_train.data, ng_train.target)
pred = pipeline.predict(ng_test.data)

from sklearn import metrics

# FIX: `print` statements (Python 2) converted to the Python 3 function form.
print(metrics.classification_report(ng_test.target, pred,
                                    target_names=ng_test.target_names))
print(metrics.confusion_matrix(ng_test.target, pred))