%pylab inline

import pylab as plt
import numpy as np

# A first look at the scikit-learn estimator API with a linear regression
from sklearn.linear_model import LinearRegression

model = LinearRegression(normalize=True)
print model.normalize
print model

x = np.array([0, 1, 2])
y = np.array([0, 1, 2])
_ = plt.plot(x, y, marker='o')

# The input data for sklearn is 2D: (samples == 3 x features == 1)
X = x[:, np.newaxis]
X

model.fit(X, y)
model.coef_

# Classification: k-nearest neighbors on the iris dataset
from sklearn import neighbors, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)

# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal?
print iris.target_names[knn.predict([[3, 5, 4, 2]])]

# A plot of the sepal space and the prediction of the KNN
from helpers import plot_iris_knn
plot_iris_knn()

# Regression: create some simple data
import numpy as np
np.random.seed(0)
X = np.random.random(size=(20, 1))
y = 3 * X.squeeze() + 2 + np.random.normal(size=20)

# Fit a linear regression to it
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
print "Model coefficient: %.5f, and intercept: %.5f" % (model.coef_,
                                                        model.intercept_)

# Plot the data and the model prediction
X_test = np.linspace(0, 1, 100)[:, np.newaxis]
y_test = model.predict(X_test)

import pylab as pl
print X.squeeze()
pl.plot(X.squeeze(), y, 'o')
pl.plot(X_test.squeeze(), y_test)

# Dimensionality reduction: PCA on the iris data
X, y = iris.data, iris.target

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(X)
X_reduced = pca.transform(X)
print "Reduced dataset shape:", X_reduced.shape

import pylab as pl
pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y)

print "Meaning of the 2 components:"
for component in pca.components_:
    print " + ".join("%.3f x %s" % (value, name)
                     for value, name in zip(component, iris.feature_names))

# Clustering: KMeans on the PCA-reduced data
from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=3, random_state=0)  # Fixing the RNG in kmeans
k_means.fit(X_reduced)
y_pred = k_means.predict(X_reduced)

pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_pred)

# Text classification on a four-category subset of the 20 newsgroups dataset
%pylab inline
import pylab as pl
import numpy as np

# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True
pl.gray()

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Load the text data
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
twenty_train_small = load_files('../data/twenty_newsgroups/20news-bydate-train/',
                                categories=categories, charset='latin-1')
twenty_test_small = load_files('../data/twenty_newsgroups/20news-bydate-test/',
                               categories=categories, charset='latin-1')

# Turn the text documents into vectors of word frequencies
vectorizer = TfidfVectorizer(min_df=2)
X_train = vectorizer.fit_transform(twenty_train_small.data)
y_train = twenty_train_small.target

# Fit a classifier on the training set
classifier = MultinomialNB().fit(X_train, y_train)
print("Training score: {0:.1f}%".format(
    classifier.score(X_train, y_train) * 100))

# Evaluate the classifier on the testing set
X_test = vectorizer.transform(twenty_test_small.data)
y_test = twenty_test_small.target
print("Testing score: {0:.1f}%".format(
    classifier.score(X_test, y_test) * 100))

# Inspect the data layout on disk
ls -l ../data/
ls -lh ../data/twenty_newsgroups/20news-bydate-train
ls -lh ../data/twenty_newsgroups/20news-bydate-train/alt.atheism/

# Load the full (unrestricted) training and testing sets
#print(load_files.__doc__)
all_twenty_train = load_files('../data/twenty_newsgroups/20news-bydate-train/',
                              charset='latin-1', random_state=42)
all_twenty_test = load_files('../data/twenty_newsgroups/20news-bydate-test/',
                             charset='latin-1', random_state=42)
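
# Possible alternative, sketched under the assumption that network access is
# available and the local ../data/twenty_newsgroups/ folder may be missing:
# scikit-learn's fetch_20newsgroups downloads and caches an equivalent bunch
# (with .data, .target and .target_names). The *_dl names are illustrative.
from sklearn.datasets import fetch_20newsgroups

twenty_train_small_dl = fetch_20newsgroups(subset='train', categories=categories,
                                           shuffle=True, random_state=42)
twenty_test_small_dl = fetch_20newsgroups(subset='test', categories=categories,
                                          shuffle=True, random_state=42)

print twenty_train_small_dl.target_names
print len(twenty_train_small_dl.data)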
all_target_names = all_twenty_train.target_names
all_target_names

all_twenty_train.target
all_twenty_train.target.shape
all_twenty_test.target.shape

len(all_twenty_train.data)
type(all_twenty_train.data[0])

# Display a sample document and its class name
def display_sample(i, dataset):
    print("Class name: " + dataset.target_names[dataset.target[i]])
    print("Text content:\n")
    print(dataset.data[i])

display_sample(0, all_twenty_train)
display_sample(1, all_twenty_train)

# Size in megabytes of a text once encoded as bytes
def text_size(text, charset='iso-8859-1'):
    return len(text.encode(charset)) * 1e-6

train_size_mb = sum(text_size(text) for text in all_twenty_train.data)
test_size_mb = sum(text_size(text) for text in all_twenty_test.data)

print("Training set size: {0} MB".format(int(train_size_mb)))
print("Testing set size: {0} MB".format(int(test_size_mb)))

train_small_size_mb = sum(text_size(text) for text in twenty_train_small.data)
test_small_size_mb = sum(text_size(text) for text in twenty_test_small.data)

print("Training set size: {0} MB".format(int(train_small_size_mb)))
print("Testing set size: {0} MB".format(int(test_small_size_mb)))

# Vectorize the text with TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVectorizer()

vectorizer = TfidfVectorizer(min_df=1)
%time X_train_small = vectorizer.fit_transform(twenty_train_small.data)

X_train_small

n_samples, n_features = X_train_small.shape
n_samples
len(twenty_train_small.data)
n_features

type(vectorizer.vocabulary_)
len(vectorizer.vocabulary_)
len(vectorizer.get_feature_names())
vectorizer.get_feature_names()[:10]
vectorizer.get_feature_names()[n_features / 2:n_features / 2 + 10]

# Project the TF-IDF vectors to 2D for visualization
from sklearn.decomposition import RandomizedPCA

%time X_train_small_pca = RandomizedPCA(n_components=2).fit_transform(X_train_small)

from itertools import cycle

colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
for i, c in zip(np.unique(y_train), cycle(colors)):
    pl.scatter(X_train_small_pca[y_train == i, 0],
               X_train_small_pca[y_train == i, 1],
               c=c, label=twenty_train_small.target_names[i], alpha=0.5)

_ = pl.legend(loc='best')

# Train and evaluate a Multinomial Naive Bayes classifier
y_train_small = twenty_train_small.target
y_train_small.shape
y_train_small

X_train_small.shape[0] == y_train_small.shape[0]

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.1)
clf

clf.fit(X_train_small, y_train_small)

X_test_small = vectorizer.transform(twenty_test_small.data)
y_test_small = twenty_test_small.target

X_test_small.shape
y_test_small.shape

clf.score(X_test_small, y_test_small)
clf.score(X_train_small, y_train_small)

# Look at how the vectorizer turns text into tokens
TfidfVectorizer()
print(TfidfVectorizer.__doc__)

analyzer = TfidfVectorizer().build_analyzer()
analyzer("I love scikit-learn: this is a cool Python lib!")

analyzer = TfidfVectorizer(
    preprocessor=lambda text: text,    # disable lowercasing
    token_pattern=ur'(?u)\b[\w-]+\b',  # treat hyphen as a letter
                                       # do not exclude single letter tokens
).build_analyzer()

analyzer("I love scikit-learn: this is a cool Python lib!")
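
# A possible follow-up, sketched under the assumption that chaining the steps
# is wanted: wire the vectorizer (with the hyphen-friendly token_pattern from
# above) and the Naive Bayes classifier into a single Pipeline, so raw
# documents can be fit and scored in one call. The text_clf name is
# illustrative.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vec', TfidfVectorizer(min_df=1, token_pattern=ur'(?u)\b[\w-]+\b')),
    ('clf', MultinomialNB(alpha=0.1)),
])

text_clf.fit(twenty_train_small.data, twenty_train_small.target)
print("Pipeline testing score: {0:.1f}%".format(
    text_clf.score(twenty_test_small.data, twenty_test_small.target) * 100))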