from __future__ import print_function

import numpy as np
from IPython.html.widgets import interact, RadioButtonsWidget, IntSliderWidget, TextWidget

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib

matplotlib.rcParams.update({
    "lines.linewidth": 2.0,
    "examples.download": True,
    "axes.edgecolor": "#bcbcbc",
    "patch.linewidth": 0.5,
    "legend.fancybox": True,
    "axes.color_cycle": ["#348ABD", "#A60628", "#7A68A6", "#467821",
                         "#CF4457", "#188487", "#E24A33"],
    "axes.facecolor": "#eeeeee",
    "axes.labelsize": "large",
    "axes.grid": True,
    "patch.edgecolor": "#eeeeee",
    "axes.titlesize": "x-large",
    "svg.embed_char_paths": "path",
    "examples.directory": ""
})

import sklearn
print("scikit-learn version is", sklearn.__version__)

# Simple linear regression on a single feature of the diabetes dataset.
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
features = diabetes.data[:, 2][:, np.newaxis]
X1 = features
y1 = diabetes.target

def plot_line(slope, bias):
    plt.scatter(X1, y1, color='y')
    plt.xlabel("$X_2$", fontsize=20); plt.ylabel("$y$", fontsize=20)
    _ = plt.title("$y\,\,vs\,\,X_2$", fontsize=20)
    predictions = slope * X1 + bias
    plt.plot(X1, predictions, color='blue', linewidth=3)
    print("Slope = {}, Bias = {}".format(slope, bias))
    print("Mean squared error: %.2f" % np.mean((predictions - y1) ** 2))

_ = interact(plot_line,
             slope=IntSliderWidget(min=0, max=2000, step=50, value=1000),
             bias=IntSliderWidget(min=0, max=300, step=30, value=100))

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model = model.fit(X1, y1)
predictions = model.predict(X1)

print("Model is trained with the following params: {}".format(model.get_params()))
print("Slope = {}, Bias = {}".format(model.coef_[0], model.intercept_))
# The mean squared error (computed on the training data)
print("Mean squared error: %.2f" % np.mean((predictions - y1) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % model.score(X1, y1))

# Plot outputs
plt.scatter(X1, y1, color='y')
plt.plot(X1, predictions, color='blue', linewidth=3)
plt.xlabel("$X_2$", fontsize=20); plt.ylabel("$y$", fontsize=20)
_ = plt.title("$y\,\,vs\,\,X_2$", fontsize=20)
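# A minimal sketch (my addition, not part of the original notebook): the mean squared error
# above is computed on the same data the model was fit on, so it understates the error on new
# data. One hedged way to check this is to hold out part of the diabetes data; the split ratio
# and random_state below are arbitrary choices for illustration only.
from sklearn.cross_validation import train_test_split

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=0)
held_out_model = LinearRegression().fit(X1_train, y1_train)
print("Held-out MSE: %.2f" % np.mean((held_out_model.predict(X1_test) - y1_test) ** 2))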
# Logistic regression on the first two iris features.
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()
X2 = iris.data[:, :2]  # we only take the first two features.
Y2 = iris.target

plt.figure(1, figsize=(6, 4.5))
plt.xlabel('Sepal length'); plt.ylabel('Sepal width')
_ = plt.scatter(X2[:, 0], X2[:, 1], c=Y2, edgecolors='k', cmap=plt.cm.RdYlGn)

model = LogisticRegression(C=1e5)
model = model.fit(X2, Y2)
predictions = model.predict(X2)

from sklearn.metrics import accuracy_score
print("Training accuracy of our classifier is {:.2f}%".format(accuracy_score(Y2, predictions) * 100))

h2 = .02  # step size in the mesh
x_min2, x_max2 = X2[:, 0].min() - .5, X2[:, 0].max() + .5
y_min2, y_max2 = X2[:, 1].min() - .5, X2[:, 1].max() + .5
xx2, yy2 = np.meshgrid(np.arange(x_min2, x_max2, h2), np.arange(y_min2, y_max2, h2))
Z2 = model.predict(np.c_[xx2.ravel(), yy2.ravel()])

# Put the result into a color plot
Z2 = Z2.reshape(xx2.shape)
plt.figure(1, figsize=(6, 4.5))
plt.pcolormesh(xx2, yy2, Z2, cmap=plt.cm.YlOrBr)
plt.xlabel('Sepal length'); plt.ylabel('Sepal width')
plt.xlim(xx2.min(), xx2.max()); plt.ylim(yy2.min(), yy2.max())
_ = plt.scatter(X2[:, 0], X2[:, 1], c=Y2, edgecolors='k', cmap=plt.cm.BrBG)

# Polynomial regression with a Pipeline: fit noisy samples of cos(1.5*pi*x).
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

np.random.seed(0)
n_samples = 20
true_fun = lambda X: np.cos(1.5 * np.pi * X)
X3 = np.sort(np.random.rand(n_samples))
y3 = true_fun(X3) + np.random.randn(n_samples) * 0.15

def regressor(degree):
    plt.figure(figsize=(6, 4.5))
    ax = plt.subplot(1, 1, 1)
    plt.setp(ax, xticks=(), yticks=())
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X3[:, np.newaxis], y3)
    plt.plot(X3, pipeline.predict(X3[:, np.newaxis]), label="Model")
    plt.plot(X3, true_fun(X3), label="True function")
    plt.scatter(X3, y3, label="Samples")
    plt.xlabel("$x$", fontsize=20); plt.ylabel("$y=cos(1.5\pi x)$", fontsize=20)
    plt.xlim((0, 1)); plt.ylim((-1.5, 1.5))
    plt.legend(loc="best")
    plt.title("Degree %d" % degree)

_ = interact(regressor, degree=IntSliderWidget(min=1, max=60, step=1, value=1))

# Compare several classifiers on a few toy 2D datasets.
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA

clfs_names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
              "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
clfs = [KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()]
classifiers = dict(zip(clfs_names, clfs))

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

ds_names = ['moons', 'circles', 'iris 2 flowers', 'sandwiches']
ds = [make_moons(noise=0.3, random_state=0),
      make_circles(noise=0.2, factor=0.5, random_state=1),
      (iris.data[:, :2], iris.target == 0),
      linearly_separable]
datasets = dict(zip(ds_names, ds))

cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
h = 0.2  # step size in the decision-surface mesh
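# A small aside (my addition, not in the original notebook): before looking at the decision
# surfaces below, the same classifiers can also be compared numerically with k-fold
# cross-validation on one of the toy datasets. The choice of the 'moons' dataset and cv=5 here
# is arbitrary and just for illustration.
from sklearn.cross_validation import cross_val_score

X_demo, y_demo = datasets['moons']
X_demo = StandardScaler().fit_transform(X_demo)
for name in clfs_names:
    scores = cross_val_score(classifiers[name], X_demo, y_demo, cv=5)
    print("{:<20s} mean accuracy = {:.2f}".format(name, scores.mean()))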
def plot_ds(ds_name):
    ds = datasets[ds_name]
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # just plot the dataset first
    ax = plt.subplot(1, 2, 1)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_title(ds_name)
    return ds, xx, yy

def classify(dataset_name, classifier_name):
    clf = classifiers[classifier_name]
    figure = plt.figure(figsize=(10, 5))
    (X, y), xx, yy = plot_ds(dataset_name)
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    ax = plt.subplot(1, 2, 2)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    if hasattr(clf, "decision_function"):
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    else:
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
    # Plot also the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_title(classifier_name)
    ax.text(xx.max() - .3, yy.min() + .3, ('Accuracy=%.2f' % score).lstrip('0'),
            size=15, horizontalalignment='right')

from IPython.html.widgets import interact, RadioButtonsWidget, IntSliderWidget, TextWidget, DropdownWidget

_ = interact(classify,
             classifier_name=DropdownWidget(values=clfs_names),
             dataset_name=RadioButtonsWidget(values=ds_names))

# Effect of regularization on an RBF-kernel SVM (alpha = 1/C).
from sklearn import svm

def svm_classify(alpha):
    xx, yy = np.meshgrid(np.linspace(3, 9, 500), np.linspace(1, 5, 500))
    X, Y = (iris.data[:, :2], iris.target == 2)

    # fit the model
    clf = svm.SVC(C=1.0 / alpha, gamma=5)
    clf.fit(X, Y)

    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
    # plot the decision function for each datapoint on the grid
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
    contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linestyles='--')
    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
    plt.axis([3, 9, 1, 5])

_ = interact(svm_classify, alpha=DropdownWidget(values=[3, 1, 0.3, 1e-1, 1e-2, 1e-3, 1e-6]))
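# A hedged side note (my addition, not in the original notebook): one way to see the effect of
# alpha without plots is to count support vectors. As C = 1/alpha grows, the margin gets harder
# and typically fewer training points end up as support vectors. The alphas below simply reuse
# the dropdown values from the widget above.
X_sv, Y_sv = iris.data[:, :2], iris.target == 2
for a in [3, 1, 0.3, 1e-1, 1e-2, 1e-3, 1e-6]:
    svc = svm.SVC(C=1.0 / a, gamma=5).fit(X_sv, Y_sv)
    print("alpha = {:g}: {} support vectors".format(a, svc.support_vectors_.shape[0]))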
np.random.seed(0)
n_samples = 15
true_fun = lambda X: np.cos(1.5 * np.pi * X)
X4 = np.sort(np.random.rand(n_samples))
y4 = true_fun(X4) + np.random.randn(n_samples) * 0.15

def regressor2(degree):
    if not degree:
        return
    degree = int(degree)
    fig, axes = plt.subplots(1, 3, sharey=False)
    fig.set_size_inches((18, 6))
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X4[:, np.newaxis], y4)

    train_predictions = pipeline.predict(X4[:, np.newaxis])
    Y_train = true_fun(X4)
    train_err = np.mean((train_predictions - Y_train) ** 2)

    X_test = np.linspace(0, 1, 100)
    test_predictions = pipeline.predict(X_test[:, np.newaxis])
    Y_test = true_fun(X_test)
    test_err = np.mean((test_predictions - Y_test) ** 2)

    axes[0].plot(X4, Y_train)
    axes[0].set_title("True function")
    axes[1].plot(X4, train_predictions)
    axes[1].set_title("Seen samples")
    axes[1].text(0, 0, ('Error=%.2f' % train_err), size=15,
                 horizontalalignment='left', verticalalignment='bottom')
    axes[2].plot(X_test, test_predictions)
    axes[2].text(0, 0, ('Error=%.2f' % test_err), size=15,
                 horizontalalignment='left', verticalalignment='bottom')
    _ = axes[2].set_title("Unseen samples")
    return test_err, train_err

_ = interact(regressor2, degree=TextWidget(value="1"))

def regressor3(degree):
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X4[:, np.newaxis], y4)
    train_predictions = pipeline.predict(X4[:, np.newaxis])
    Y_train = true_fun(X4)
    train_err = np.mean((train_predictions - Y_train) ** 2)
    X_test = np.linspace(0, 1, 100)
    test_predictions = pipeline.predict(X_test[:, np.newaxis])
    Y_test = true_fun(X_test)
    test_err = np.mean((test_predictions - Y_test) ** 2)
    return test_err, train_err

degrees = range(1, 8)
errors = np.array([regressor3(d) for d in degrees])
plt.plot(degrees, errors[:, 0], marker='^', c='r', label='Testing samples')
plt.plot(degrees, errors[:, 1], marker='o', c='b', label='Training samples')
plt.yscale('log')
plt.xlabel("degree"); plt.ylabel("Error")
_ = plt.legend(loc='best')

# Hyperparameter search over the polynomial degree and ridge regularization.
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, make_scorer

pipeline = Pipeline([("pre1", PolynomialFeatures()), ("regressor", Ridge())])
parameters = {
    'pre1__degree': range(1, 15),
    'pre1__interaction_only': [False, True],
    'regressor__alpha': (1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2),
    'regressor__fit_intercept': [False, True],
}
scorer = make_scorer(mean_squared_error, greater_is_better=False)
model = GridSearchCV(pipeline, parameters, scoring=scorer, n_jobs=1, cv=3, verbose=1)
model = model.fit(X4[:, np.newaxis], y4)

train_predictions = model.predict(X4[:, np.newaxis])
Y_train = true_fun(X4)
train_err = mean_squared_error(Y_train, train_predictions)
X_test = np.linspace(0, 1, 100)
test_predictions = model.predict(X_test[:, np.newaxis])
Y_test = true_fun(X_test)
test_err = mean_squared_error(Y_test, test_predictions)

print("\n\nReport of the grid search")
print("===========================")
print("Training error = {:.4f}\t Testing error = {:.4f}".format(train_err, test_err))
print("The best hyperparameter combination as chosen by the grid search\n", model.best_params_)
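# A brief aside (my addition, not from the original notebook): besides best_params_, the fitted
# GridSearchCV object also keeps the refitted pipeline and the best cross-validation score.
# Because the scorer above negates mean_squared_error, best_score_ is reported as a negative number.
best_pipeline = model.best_estimator_
print("Best CV score (negated MSE): {:.4f}".format(model.best_score_))
print("Best polynomial degree: {}".format(best_pipeline.named_steps["pre1"].degree))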
# Inspect the grid search results with pandas.
import pandas as pd
from pandas import DataFrame

def gs2df(model):
    """Convert grid search data to a dataframe."""
    records = []
    for l in model.grid_scores_:
        d = dict(l.parameters)
        d["mean"] = l.mean_validation_score
        d["scores"] = l.cv_validation_scores
        records.append(d)
    return DataFrame.from_records(records)

def plot_gsdf(df):
    """Plot the average performance of each hyperparameter."""
    for col in df.columns:
        if col == 'scores' or col == 'mean':
            continue
        temp = df.groupby(by=[col])[["mean"]].mean()
        kind = 'bar' if isinstance(temp.index.values[0], str) else 'line'
        y_lim = temp.values.min() * 0.975, temp.values.max() * 1.025
        temp.plot(kind=kind, ylim=y_lim, marker='o')

# Scores of the grid search.
df = gs2df(model)
# The validation scores are negative because the MSE scorer was built with
# greater_is_better=False, which makes scikit-learn negate the metric; flip the sign
# back before plotting. This is counter-intuitive and would not happen with an
# accuracy-style classification scorer. For more details on why the scores are
# negative, look at https://github.com/scikit-learn/scikit-learn/issues/2439
df['mean'] *= -1
df.head()

plot_gsdf(df)

# Learning curves for a Naive Bayes classifier on the digits dataset.
from sklearn.datasets import load_digits
from sklearn import cross_validation
from sklearn.learning_curve import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training accuracy")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation accuracy")
    plt.legend(loc="best")
    return plt

digits = load_digits()
X5, y5 = digits.data, digits.target
title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% of the data randomly selected as a validation set.
cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
__ = plot_learning_curve(estimator, title, X5, y5, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

# Text classification on a subset of the 20 newsgroups dataset.
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.target_names
len(twenty_train.data)
len(twenty_train.filenames)
print("\n".join(twenty_train.data[0].split("\n")[:3]))
twenty_train.target[:10]

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
count_vect.vocabulary_.get(u'algorithm')

from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
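# A hedged detour (my addition, not in the original tutorial): the MultinomialNB model above can
# also be scored on the held-out 20 newsgroups test split by pushing the test documents through
# the same count and tf-idf transformers that were fitted on the training data.
nb_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
nb_test_tfidf = tfidf_transformer.transform(count_vect.transform(nb_test.data))
nb_predicted = clf.predict(nb_test_tfidf)
print("Naive Bayes accuracy: {:.2f}%".format(np.mean(nb_predicted == nb_test.target) * 100))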
twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5)),
                     ])
_ = text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(twenty_test.data)
print("Accuracy of our SGD classifier is {:.2f}%".format(np.mean(predicted == twenty_test.target) * 100))

from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))

# Compare clustering algorithms on a few toy 2D datasets.
import time

from sklearn import datasets
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import (MeanShift, MiniBatchKMeans, AffinityPropagation,
                             AgglomerativeClustering, DBSCAN, SpectralClustering)
import sklearn.cluster

np.random.seed(0)

# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
n_samples = 150
datasets_collection = {
    "noisy_circles": datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05),
    "noisy_moons": datasets.make_moons(n_samples=n_samples, noise=.05),
    "blobs": datasets.make_blobs(n_samples=n_samples, random_state=8),
    "no_structure": (np.random.rand(n_samples, 2), None),
}

# create clustering estimators
clustering_algos = {
    "mean_shift": MeanShift(bin_seeding=True),
    "two_means": MiniBatchKMeans(n_clusters=2),
    "agglomerative": AgglomerativeClustering(n_clusters=2, linkage='ward'),
    "spectral": SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                   affinity="nearest_neighbors"),
    "dbscan": DBSCAN(eps=.2),
    "affinity_propagation": AffinityPropagation(damping=.9, preference=-200),
    "average_linkage": AgglomerativeClustering(linkage="average", affinity="cityblock",
                                               n_clusters=2)
}

colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)

def cluster(ds_name, algo_name):
    dataset = datasets_collection[ds_name]
    algorithm = clustering_algos[algo_name]
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    # (computed here but not passed to the pre-built estimators above)
    bandwidth = sklearn.cluster.estimate_bandwidth(X, quantile=0.3)
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    # Compute distances
    # distances = np.exp(-euclidean_distances(X))
    distances = euclidean_distances(X)

    # predict cluster memberships
    t0 = time.time()
    algorithm.fit(X)
    t1 = time.time()
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(np.int)
    else:
        y_pred = algorithm.predict(X)

    fig, axes = plt.subplots(1, 2)
    fig.set_size_inches((12, 4))
    axes[0].scatter(X[:, 0], X[:, 1], c='black', s=10)
    axes[0].set_title('Data')
    axes[1].scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
    axes[1].set_title('Clustered Data')
    if hasattr(algorithm, 'cluster_centers_'):
        centers = algorithm.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.xlim(-2, 2)
    plt.ylim(-2, 2)
    plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
             transform=plt.gca().transAxes, size=15, horizontalalignment='right')

from IPython.html.widgets import interact, RadioButtonsWidget, IntSliderWidget, TextWidget, DropdownWidget

_ = interact(cluster,
             algo_name=DropdownWidget(values=clustering_algos.keys()),
             ds_name=RadioButtonsWidget(values=datasets_collection.keys()))
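# A closing sketch (my addition, not in the original notebook): when a toy dataset ships with
# ground-truth labels, the visual comparison above can be complemented with a label-agnostic
# score such as the adjusted Rand index; the dataset choice below is arbitrary.
from sklearn.metrics import adjusted_rand_score

X_eval, y_eval = datasets_collection["noisy_moons"]
X_eval = StandardScaler().fit_transform(X_eval)
for name, algo in clustering_algos.items():
    algo.fit(X_eval)
    labels = algo.labels_ if hasattr(algo, 'labels_') else algo.predict(X_eval)
    print("{:<22s} ARI = {:.2f}".format(name, adjusted_rand_score(y_eval, labels)))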