# Out-of-core learning walkthrough with scikit-learn.
#
# Three independent experiments, each training an SGDClassifier one batch at
# a time with ``partial_fit``:
#   1. raw digit features read back from pickled batches,
#   2. text comments vectorized with the stateless HashingVectorizer,
#   3. digit features passed through an RBF kernel approximation.
#
# The script expects a ``data/`` directory to exist; experiment 2 additionally
# reads ``data/train_<i>.csv`` and ``data/test_with_solutions.csv`` from it.

try:
    import cPickle  # Python 2
except ImportError:
    # Python 3 has no cPickle module; keep the original name bound so the
    # rest of the script reads the same on both interpreters.
    import pickle as cPickle

import pandas as pd
from sklearn.datasets import load_digits
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

# Path template for the pickled digit batches (batch number is zero-padded).
BATCH_PATH = "data/batch_%02d.pickle"

# partial_fit must be told the label set because a single batch is not
# guaranteed to contain every class. Materialised as a list so the same
# object works on Python 2 and Python 3.
DIGIT_CLASSES = list(range(10))


def _load_batch(index):
    """Return the ``(X, y)`` tuple pickled for batch number ``index``."""
    # Binary mode is required: the batches are written with a binary pickle
    # protocol, and the ``with`` block guarantees the handle is closed.
    with open(BATCH_PATH % index, "rb") as batch_file:
        return cPickle.load(batch_file)


# ---------------------------------------------------------------------------
# Write out some toy data: split the digits dataset into 10 interleaved
# batches (every 10th sample, starting at offset i).
# ---------------------------------------------------------------------------
digits = load_digits()
X, y = digits.data, digits.target
for i in range(10):
    with open(BATCH_PATH % i, "wb") as batch_file:
        # Protocol -1 selects the highest (binary) pickle protocol available.
        cPickle.dump((X[i::10], y[i::10]), batch_file, -1)

# ---------------------------------------------------------------------------
# Experiment 1: incremental training on batches 0-8, evaluation on batch 9.
# ---------------------------------------------------------------------------
sgd = SGDClassifier()
for i in range(9):
    X_batch, y_batch = _load_batch(i)
    sgd.partial_fit(X_batch, y_batch, classes=DIGIT_CLASSES)

X_test, y_test = _load_batch(9)
# Bare expression kept from the original: the value is only displayed when
# the script is run interactively (e.g. in a notebook cell).
sgd.score(X_test, y_test)

# ---------------------------------------------------------------------------
# Experiment 2: text classification. HashingVectorizer.transform is called
# without a prior fit, so each batch can be vectorized independently.
# ---------------------------------------------------------------------------
sgd = SGDClassifier()
hashing_vectorizer = HashingVectorizer()
for i in range(10):
    data_batch = pd.read_csv("data/train_%d.csv" % i)
    text_batch = data_batch.Comment.tolist()
    y_batch = data_batch.Insult.values
    X_batch = hashing_vectorizer.transform(text_batch)
    # NOTE(review): classes=range(10) looks carried over from the digits
    # example; the Insult column is presumably binary -- confirm against the
    # data and narrow this to [0, 1] if so.
    sgd.partial_fit(X_batch, y_batch, classes=DIGIT_CLASSES)

data_test = pd.read_csv("data/test_with_solutions.csv")
X_test = hashing_vectorizer.transform(data_test.Comment.tolist())
y_test = data_test.Insult.values
sgd.score(X_test, y_test)

# ---------------------------------------------------------------------------
# Experiment 3: approximate an RBF kernel with random features, then train
# the linear model on the transformed batches.
# ---------------------------------------------------------------------------
sgd = SGDClassifier()
kernel_approximation = RBFSampler(gamma=.001, n_components=400)
for i in range(9):
    X_batch, y_batch = _load_batch(i)
    if i == 0:
        # The sampler is fitted once, on the first batch only; every later
        # batch (and the test set) reuses the same projection.
        kernel_approximation.fit(X_batch)
    X_transformed = kernel_approximation.transform(X_batch)
    sgd.partial_fit(X_transformed, y_batch, classes=DIGIT_CLASSES)

X_test, y_test = _load_batch(9)
sgd.score(kernel_approximation.transform(X_test), y_test)