from __future__ import print_function

import numpy as np
from IPython.html.widgets import interact, RadioButtonsWidget, IntSliderWidget, TextWidget

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib

matplotlib.rcParams.update({
    "lines.linewidth": 2.0,
    "examples.download": True,
    "axes.edgecolor": "#bcbcbc",
    "patch.linewidth": 0.5,
    "legend.fancybox": True,
    "axes.color_cycle": ["#348ABD", "#A60628", "#7A68A6", "#467821",
                         "#CF4457", "#188487", "#E24A33"],
    "axes.facecolor": "#eeeeee",
    "axes.labelsize": "large",
    "axes.grid": True,
    "patch.edgecolor": "#eeeeee",
    "axes.titlesize": "x-large",
    "svg.embed_char_paths": "path",
    "examples.directory": ""
})

import sklearn
print("scikit-learn version is", sklearn.__version__)

# Simple linear regression on a single feature of the diabetes dataset.
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
features = diabetes.data[:, 2][:, np.newaxis]
X1 = features
y1 = diabetes.target

def plot_line(slope, bias):
    plt.scatter(X1, y1, color='y')
    plt.xlabel("$X_2$", fontsize=20); plt.ylabel("$y$", fontsize=20)
    _ = plt.title("$y\,\,vs\,\,X_2$", fontsize=20)
    predictions = slope * X1 + bias
    plt.plot(X1, predictions, color='blue', linewidth=3)
    print("Slope = {}, Bias = {}".format(slope, bias))
    print("Mean squared error: %.2f" % np.mean((predictions - y1) ** 2))

_ = interact(plot_line,
             slope=IntSliderWidget(min=0, max=2000, step=50, value=1000),
             bias=IntSliderWidget(min=0, max=300, step=30, value=100))

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model = model.fit(X1, y1)
predictions = model.predict(X1)

print("Model is trained with the following params: {}".format(model.get_params()))
print("Slope = {}, Bias = {}".format(model.coef_[0], model.intercept_))
# The mean squared error (computed on the training data)
print("Mean squared error: %.2f" % np.mean((predictions - y1) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % model.score(X1, y1))

# Plot outputs
plt.scatter(X1, y1, color='y')
plt.plot(X1, predictions, color='blue', linewidth=3)
plt.xlabel("$X_2$", fontsize=20); plt.ylabel("$y$", fontsize=20)
_ = plt.title("$y\,\,vs\,\,X_2$", fontsize=20)
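# A minimal sketch (my addition, not part of the original notebook): the mean squared error
# above is computed on the same data the model was fit on, so it understates the error on new
# data. One hedged way to check this is to hold out part of the diabetes data; the split ratio
# and random_state below are arbitrary choices for illustration only.
from sklearn.cross_validation import train_test_split

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=0)
held_out_model = LinearRegression().fit(X1_train, y1_train)
print("Held-out MSE: %.2f" % np.mean((held_out_model.predict(X1_test) - y1_test) ** 2))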
# Logistic regression on the first two iris features.
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()
X2 = iris.data[:, :2]  # we only take the first two features.
Y2 = iris.target

plt.figure(1, figsize=(6, 4.5))
plt.xlabel('Sepal length'); plt.ylabel('Sepal width')
_ = plt.scatter(X2[:, 0], X2[:, 1], c=Y2, edgecolors='k', cmap=plt.cm.RdYlGn)

model = LogisticRegression(C=1e5)
model = model.fit(X2, Y2)
predictions = model.predict(X2)

from sklearn.metrics import accuracy_score
print("Training accuracy of our classifier is {:.2f}%".format(accuracy_score(Y2, predictions) * 100))

h2 = .02  # step size in the mesh
x_min2, x_max2 = X2[:, 0].min() - .5, X2[:, 0].max() + .5
y_min2, y_max2 = X2[:, 1].min() - .5, X2[:, 1].max() + .5
xx2, yy2 = np.meshgrid(np.arange(x_min2, x_max2, h2), np.arange(y_min2, y_max2, h2))
Z2 = model.predict(np.c_[xx2.ravel(), yy2.ravel()])

# Put the result into a color plot
Z2 = Z2.reshape(xx2.shape)
plt.figure(1, figsize=(6, 4.5))
plt.pcolormesh(xx2, yy2, Z2, cmap=plt.cm.YlOrBr)
plt.xlabel('Sepal length'); plt.ylabel('Sepal width')
plt.xlim(xx2.min(), xx2.max()); plt.ylim(yy2.min(), yy2.max())
_ = plt.scatter(X2[:, 0], X2[:, 1], c=Y2, edgecolors='k', cmap=plt.cm.BrBG)

# Polynomial regression with a Pipeline: fit noisy samples of cos(1.5*pi*x).
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

np.random.seed(0)
n_samples = 20
true_fun = lambda X: np.cos(1.5 * np.pi * X)
X3 = np.sort(np.random.rand(n_samples))
y3 = true_fun(X3) + np.random.randn(n_samples) * 0.15

def regressor(degree):
    plt.figure(figsize=(6, 4.5))
    ax = plt.subplot(1, 1, 1)
    plt.setp(ax, xticks=(), yticks=())
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X3[:, np.newaxis], y3)
    plt.plot(X3, pipeline.predict(X3[:, np.newaxis]), label="Model")
    plt.plot(X3, true_fun(X3), label="True function")
    plt.scatter(X3, y3, label="Samples")
    plt.xlabel("$x$", fontsize=20); plt.ylabel("$y=cos(1.5\pi x)$", fontsize=20)
    plt.xlim((0, 1)); plt.ylim((-1.5, 1.5))
    plt.legend(loc="best")
    plt.title("Degree %d" % degree)

_ = interact(regressor, degree=IntSliderWidget(min=1, max=60, step=1, value=1))

# Compare several classifiers on a few toy 2D datasets.
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA

clfs_names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
              "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
clfs = [KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()]
classifiers = dict(zip(clfs_names, clfs))

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

ds_names = ['moons', 'circles', 'iris 2 flowers', 'sandwiches']
ds = [make_moons(noise=0.3, random_state=0),
      make_circles(noise=0.2, factor=0.5, random_state=1),
      (iris.data[:, :2], iris.target == 0),
      linearly_separable]
datasets = dict(zip(ds_names, ds))

cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
h = 0.2  # step size in the decision-surface mesh
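# A small aside (my addition, not in the original notebook): before looking at the decision
# surfaces below, the same classifiers can also be compared numerically with k-fold
# cross-validation on one of the toy datasets. The choice of the 'moons' dataset and cv=5 here
# is arbitrary and just for illustration.
from sklearn.cross_validation import cross_val_score

X_demo, y_demo = datasets['moons']
X_demo = StandardScaler().fit_transform(X_demo)
for name in clfs_names:
    scores = cross_val_score(classifiers[name], X_demo, y_demo, cv=5)
    print("{:<20s} mean accuracy = {:.2f}".format(name, scores.mean()))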
def plot_ds(ds_name):
    ds = datasets[ds_name]
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # just plot the dataset first
    ax = plt.subplot(1, 2, 1)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_title(ds_name)
    return ds, xx, yy

def classify(dataset_name, classifier_name):
    clf = classifiers[classifier_name]
    figure = plt.figure(figsize=(10, 5))
    (X, y), xx, yy = plot_ds(dataset_name)
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    ax = plt.subplot(1, 2, 2)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    if hasattr(clf, "decision_function"):
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    else:
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
    # Plot also the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_title(classifier_name)
    ax.text(xx.max() - .3, yy.min() + .3, ('Accuracy=%.2f' % score).lstrip('0'),
            size=15, horizontalalignment='right')

from IPython.html.widgets import interact, RadioButtonsWidget, IntSliderWidget, TextWidget, DropdownWidget

_ = interact(classify,
             classifier_name=DropdownWidget(values=clfs_names),
             dataset_name=RadioButtonsWidget(values=ds_names))

# Effect of regularization on an RBF-kernel SVM (alpha = 1/C).
from sklearn import svm

def svm_classify(alpha):
    xx, yy = np.meshgrid(np.linspace(3, 9, 500), np.linspace(1, 5, 500))
    X, Y = (iris.data[:, :2], iris.target == 2)

    # fit the model
    clf = svm.SVC(C=1.0 / alpha, gamma=5)
    clf.fit(X, Y)

    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
    # plot the decision function for each datapoint on the grid
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
    contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linestyles='--')
    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
    plt.axis([3, 9, 1, 5])

_ = interact(svm_classify, alpha=DropdownWidget(values=[3, 1, 0.3, 1e-1, 1e-2, 1e-3, 1e-6]))
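# A hedged side note (my addition, not in the original notebook): one way to see the effect of
# alpha without plots is to count support vectors. As C = 1/alpha grows, the margin gets harder
# and typically fewer training points end up as support vectors. The alphas below simply reuse
# the dropdown values from the widget above.
X_sv, Y_sv = iris.data[:, :2], iris.target == 2
for a in [3, 1, 0.3, 1e-1, 1e-2, 1e-3, 1e-6]:
    svc = svm.SVC(C=1.0 / a, gamma=5).fit(X_sv, Y_sv)
    print("alpha = {:g}: {} support vectors".format(a, svc.support_vectors_.shape[0]))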
np.random.seed(0)
n_samples = 15
true_fun = lambda X: np.cos(1.5 * np.pi * X)
X4 = np.sort(np.random.rand(n_samples))
y4 = true_fun(X4) + np.random.randn(n_samples) * 0.15

def regressor2(degree):
    if not degree:
        return
    degree = int(degree)
    fig, axes = plt.subplots(1, 3, sharey=False)
    fig.set_size_inches((18, 6))
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X4[:, np.newaxis], y4)

    train_predictions = pipeline.predict(X4[:, np.newaxis])
    Y_train = true_fun(X4)
    train_err = np.mean((train_predictions - Y_train) ** 2)

    X_test = np.linspace(0, 1, 100)
    test_predictions = pipeline.predict(X_test[:, np.newaxis])
    Y_test = true_fun(X_test)
    test_err = np.mean((test_predictions - Y_test) ** 2)

    axes[0].plot(X4, Y_train)
    axes[0].set_title("True function")
    axes[1].plot(X4, train_predictions)
    axes[1].set_title("Seen samples")
    axes[1].text(0, 0, ('Error=%.2f' % train_err), size=15,
                 horizontalalignment='left', verticalalignment='bottom')
    axes[2].plot(X_test, test_predictions)
    axes[2].text(0, 0, ('Error=%.2f' % test_err), size=15,
                 horizontalalignment='left', verticalalignment='bottom')
    _ = axes[2].set_title("Unseen samples")
    return test_err, train_err

_ = interact(regressor2, degree=TextWidget(value="1"))

def regressor3(degree):
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X4[:, np.newaxis], y4)
    train_predictions = pipeline.predict(X4[:, np.newaxis])
    Y_train = true_fun(X4)
    train_err = np.mean((train_predictions - Y_train) ** 2)
    X_test = np.linspace(0, 1, 100)
    test_predictions = pipeline.predict(X_test[:, np.newaxis])
    Y_test = true_fun(X_test)
    test_err = np.mean((test_predictions - Y_test) ** 2)
    return test_err, train_err

degrees = range(1, 8)
errors = np.array([regressor3(d) for d in degrees])
plt.plot(degrees, errors[:, 0], marker='^', c='r', label='Testing samples')
plt.plot(degrees, errors[:, 1], marker='o', c='b', label='Training samples')
plt.yscale('log')
plt.xlabel("degree"); plt.ylabel("Error")
_ = plt.legend(loc='best')

# Hyperparameter search over the polynomial degree and ridge regularization.
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, make_scorer

pipeline = Pipeline([("pre1", PolynomialFeatures()), ("regressor", Ridge())])
parameters = {
    'pre1__degree': range(1, 15),
    'pre1__interaction_only': [False, True],
    'regressor__alpha': (1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2),
    'regressor__fit_intercept': [False, True],
}
scorer = make_scorer(mean_squared_error, greater_is_better=False)
model = GridSearchCV(pipeline, parameters, scoring=scorer, n_jobs=1, cv=3, verbose=1)
model = model.fit(X4[:, np.newaxis], y4)

train_predictions = model.predict(X4[:, np.newaxis])
Y_train = true_fun(X4)
train_err = mean_squared_error(Y_train, train_predictions)
X_test = np.linspace(0, 1, 100)
test_predictions = model.predict(X_test[:, np.newaxis])
Y_test = true_fun(X_test)
test_err = mean_squared_error(Y_test, test_predictions)

print("\n\nReport of the grid search")
print("===========================")
print("Training error = {:.4f}\t Testing error = {:.4f}".format(train_err, test_err))
print("The best hyperparameter combination as chosen by the grid search\n", model.best_params_)
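# A brief aside (my addition, not from the original notebook): besides best_params_, the fitted
# GridSearchCV object also keeps the refitted pipeline and the best cross-validation score.
# Because the scorer above negates mean_squared_error, best_score_ is reported as a negative number.
best_pipeline = model.best_estimator_
print("Best CV score (negated MSE): {:.4f}".format(model.best_score_))
print("Best polynomial degree: {}".format(best_pipeline.named_steps["pre1"].degree))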
# Inspect the grid search results with pandas.
import pandas as pd
from pandas import DataFrame

def gs2df(model):
    """Convert grid search data to a dataframe."""
    records = []
    for l in model.grid_scores_:
        d = dict(l.parameters)
        d["mean"] = l.mean_validation_score
        d["scores"] = l.cv_validation_scores
        records.append(d)
    return DataFrame.from_records(records)

def plot_gsdf(df):
    """Plot the average performance of each hyperparameter."""
    for col in df.columns:
        if col == 'scores' or col == 'mean':
            continue
        temp = df.groupby(by=[col])[["mean"]].mean()
        kind = 'bar' if isinstance(temp.index.values[0], str) else 'line'
        y_lim = temp.values.min() * 0.975, temp.values.max() * 1.025
        temp.plot(kind=kind, ylim=y_lim, marker='o')

# Scores of the grid search.
df = gs2df(model)
# The validation scores are negative because the MSE scorer was built with
# greater_is_better=False, which makes scikit-learn negate the metric; flip the sign
# back before plotting. This is counter-intuitive and would not happen with an
# accuracy-style classification scorer. For more details on why the scores are
# negative, look at https://github.com/scikit-learn/scikit-learn/issues/2439
df['mean'] *= -1
df.head()

plot_gsdf(df)

# Learning curves for a Naive Bayes classifier on the digits dataset.
from sklearn.datasets import load_digits
from sklearn import cross_validation
from sklearn.learning_curve import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training accuracy")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation accuracy")
    plt.legend(loc="best")
    return plt

digits = load_digits()
X5, y5 = digits.data, digits.target
title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% of the data randomly selected as a validation set.
cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
__ = plot_learning_curve(estimator, title, X5, y5, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

# Text classification on a subset of the 20 newsgroups dataset.
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.target_names
len(twenty_train.data)
len(twenty_train.filenames)
print("\n".join(twenty_train.data[0].split("\n")[:3]))
twenty_train.target[:10]

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
count_vect.vocabulary_.get(u'algorithm')

from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
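# A hedged detour (my addition, not in the original tutorial): the MultinomialNB model above can
# also be scored on the held-out 20 newsgroups test split by pushing the test documents through
# the same count and tf-idf transformers that were fitted on the training data.
nb_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
nb_test_tfidf = tfidf_transformer.transform(count_vect.transform(nb_test.data))
nb_predicted = clf.predict(nb_test_tfidf)
print("Naive Bayes accuracy: {:.2f}%".format(np.mean(nb_predicted == nb_test.target) * 100))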
twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5)),
                     ])
_ = text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(twenty_test.data)
print("Accuracy of our SGD classifier is {:.2f}%".format(np.mean(predicted == twenty_test.target) * 100))

from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))

# Compare clustering algorithms on a few toy 2D datasets.
import time

from sklearn import datasets
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import (MeanShift, MiniBatchKMeans, AffinityPropagation,
                             AgglomerativeClustering, DBSCAN, SpectralClustering)
import sklearn.cluster

np.random.seed(0)

# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
n_samples = 150
datasets_collection = {
    "noisy_circles": datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05),
    "noisy_moons": datasets.make_moons(n_samples=n_samples, noise=.05),
    "blobs": datasets.make_blobs(n_samples=n_samples, random_state=8),
    "no_structure": (np.random.rand(n_samples, 2), None),
}

# create clustering estimators
clustering_algos = {
    "mean_shift": MeanShift(bin_seeding=True),
    "two_means": MiniBatchKMeans(n_clusters=2),
    "agglomerative": AgglomerativeClustering(n_clusters=2, linkage='ward'),
    "spectral": SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                   affinity="nearest_neighbors"),
    "dbscan": DBSCAN(eps=.2),
    "affinity_propagation": AffinityPropagation(damping=.9, preference=-200),
    "average_linkage": AgglomerativeClustering(linkage="average", affinity="cityblock",
                                               n_clusters=2)
}

colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)

def cluster(ds_name, algo_name):
    dataset = datasets_collection[ds_name]
    algorithm = clustering_algos[algo_name]
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    # (computed here but not passed to the pre-built estimators above)
    bandwidth = sklearn.cluster.estimate_bandwidth(X, quantile=0.3)
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    # Compute distances
    # distances = np.exp(-euclidean_distances(X))
    distances = euclidean_distances(X)

    # predict cluster memberships
    t0 = time.time()
    algorithm.fit(X)
    t1 = time.time()
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(np.int)
    else:
        y_pred = algorithm.predict(X)

    fig, axes = plt.subplots(1, 2)
    fig.set_size_inches((12, 4))
    axes[0].scatter(X[:, 0], X[:, 1], c='black', s=10)
    axes[0].set_title('Data')
    axes[1].scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
    axes[1].set_title('Clustered Data')
    if hasattr(algorithm, 'cluster_centers_'):
        centers = algorithm.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.xlim(-2, 2)
    plt.ylim(-2, 2)
    plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
             transform=plt.gca().transAxes, size=15, horizontalalignment='right')

from IPython.html.widgets import interact, RadioButtonsWidget, IntSliderWidget, TextWidget, DropdownWidget

_ = interact(cluster,
             algo_name=DropdownWidget(values=clustering_algos.keys()),
             ds_name=RadioButtonsWidget(values=datasets_collection.keys()))
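# A closing sketch (my addition, not in the original notebook): when a toy dataset ships with
# ground-truth labels, the visual comparison above can be complemented with a label-agnostic
# score such as the adjusted Rand index; the dataset choice below is arbitrary.
from sklearn.metrics import adjusted_rand_score

X_eval, y_eval = datasets_collection["noisy_moons"]
X_eval = StandardScaler().fit_transform(X_eval)
for name, algo in clustering_algos.items():
    algo.fit(X_eval)
    labels = algo.labels_ if hasattr(algo, 'labels_') else algo.predict(X_eval)
    print("{:<22s} ARI = {:.2f}".format(name, adjusted_rand_score(y_eval, labels)))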