# Global imports and settings
from __future__ import print_function
from __future__ import division
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = (8, 8)  # default figure size in inches
import numpy as np
np.set_printoptions(precision=3)  # compact float printing throughout
# Configure the live-reveal (slideshow) extension for this profile.
# NOTE(review): IPython.html / locate_profile are legacy pre-Jupyter APIs;
# on a modern Jupyter install these live elsewhere -- confirm environment.
from IPython.html.services.config import ConfigManager
from IPython.utils.path import locate_profile
cm = ConfigManager(profile_dir=locate_profile(get_ipython().profile))
_ = cm.update('livereveal', {'width': 1440, 'height': 720 })
# This is an IPython notebook, with executable Python code inside
from __future__ import braces
File "<ipython-input-3-8ea16afe3b06>", line 2 from __future__ import braces SyntaxError: not a chance
Materials at https://github.com/glouppe/tutorial-sklearn-lhcb
Python distribution with scientific packages (NumPy, SciPy, Scikit-Learn, Pandas)
... or Anaconda http://continuum.io/downloads
Predict, understand or identify patterns in data from observations.
Applications include: Natural language processing, Computer vision, IR and advertisement, Robotics, Bioinformatics, High Energy Physics, ...
Supervised learning:
Unsupervised learning:
Model selection and evaluation:
... and many more! (See our Reference)
Data comes as a finite learning set ${\cal L} = (X, y)$ where
Input samples are given as an array $X$ of shape n_samples
$\times$ n_features
E.g. samples are events and features are their kinematic properties.
Output values are given as an array $y$
E.g. whether the event is background or signal.
# Toy example: random inputs X of shape (n_samples, n_features) and
# random binary labels y of length n_samples.
X = np.random.rand(4, 3)
y = np.random.randint(0, 2, len(X))
# E.g. for n_samples = 4, n_features = 3
print("X =")
print(X)
print("y =", y)
X = [[ 0.673 0.044 0.633] [ 0.417 0.876 0.088] [ 0.126 0.486 0.024] [ 0.479 0.378 0.557]] y = [1 1 0 1]
pandas is a library providing easy-to-use data structures and data analysis tools.
# Download simulated events from https://archive.ics.uci.edu/ml/datasets/SUSY
# Load CSV file with pandas (the raw file has no header row)
import pandas as pd
df = pd.read_csv("SUSY.csv", header=None)
# Name the 19 columns: the class label followed by 18 physics features
df.columns = [# 0 = background, 1 = signal
              "target",
              # Kinematic properties
              "lepton 1 pT", "lepton 1 eta", "lepton 1 phi",
              "lepton 2 pT", "lepton 2 eta", "lepton 2 phi",
              "missing energy magnitude", "missing energy phi",
              # Derived features
              "MET_rel","axial MET", "M_R", "M_TR_2", "R", "MT2",
              "S_R", "M_Delta_R", "dPhi_r_b", "cos(theta_r1)"]
df.target = df.target.astype(int)  # labels are parsed as floats; cast to int
df.head() # df.describe(), df.info(), ...
target | lepton 1 pT | lepton 1 eta | lepton 1 phi | lepton 2 pT | lepton 2 eta | lepton 2 phi | missing energy magnitude | missing energy phi | MET_rel | axial MET | M_R | M_TR_2 | R | MT2 | S_R | M_Delta_R | dPhi_r_b | cos(theta_r1) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.972861 | 0.653855 | 1.176225 | 1.157156 | -1.739873 | -0.874309 | 0.567765 | -0.175000 | 0.810061 | -0.252552 | 1.921887 | 0.889637 | 0.410772 | 1.145621 | 1.932632 | 0.994464 | 1.367815 | 0.040714 |
1 | 1 | 1.667973 | 0.064191 | -1.225171 | 0.506102 | -0.338939 | 1.672543 | 3.475464 | -1.219136 | 0.012955 | 3.775174 | 1.045977 | 0.568051 | 0.481928 | 0.000000 | 0.448410 | 0.205356 | 1.321893 | 0.377584 |
2 | 1 | 0.444840 | -0.134298 | -0.709972 | 0.451719 | -1.613871 | -0.768661 | 1.219918 | 0.504026 | 1.831248 | -0.431385 | 0.526283 | 0.941514 | 1.587535 | 2.024308 | 0.603498 | 1.562374 | 1.135454 | 0.180910 |
3 | 1 | 0.381256 | -0.976145 | 0.693152 | 0.448959 | 0.891753 | -0.677328 | 2.033060 | 1.533041 | 3.046260 | -1.005285 | 0.569386 | 1.015211 | 1.582217 | 1.551914 | 0.761215 | 1.715464 | 1.492257 | 0.090719 |
4 | 1 | 1.309996 | -0.690089 | -0.676259 | 1.589283 | -0.693326 | 0.622907 | 1.087562 | -0.381742 | 0.589204 | 1.365479 | 1.179295 | 0.968218 | 0.728563 | 0.000000 | 1.083158 | 0.043429 | 1.154854 | 0.094859 |
df_sample = df[:10000]  # Draw a sample to make things faster

# Visualize samples through a scatter matrix, colored by class.
# FIX: top-level ``pd.scatter_matrix`` was deprecated and later removed;
# the supported location is ``pd.plotting.scatter_matrix``.
colors = ["blue", "red"]  # blue = background, red = signal
features = ["lepton 1 pT", "lepton 1 eta", "missing energy magnitude", "R"]
_ = pd.plotting.scatter_matrix(df_sample[features], marker="x",
                               c=df_sample.target.apply(lambda x: colors[x]))
Need more? See pandas
visualization documentation.
# Get data as Numpy arrays from the pandas data frame
X = df_sample.drop("target", axis=1).values  # every column except the label
y = df_sample.target.values                  # labels as a 1-d array
print(X[:5])
print(X.shape)  # (n_samples, n_features)
print(X.dtype)
[[ 0.973 0.654 1.176 1.157 -1.74 -0.874 0.568 -0.175 0.81 -0.253 1.922 0.89 0.411 1.146 1.933 0.994 1.368 0.041] [ 1.668 0.064 -1.225 0.506 -0.339 1.673 3.475 -1.219 0.013 3.775 1.046 0.568 0.482 0. 0.448 0.205 1.322 0.378] [ 0.445 -0.134 -0.71 0.452 -1.614 -0.769 1.22 0.504 1.831 -0.431 0.526 0.942 1.588 2.024 0.603 1.562 1.135 0.181] [ 0.381 -0.976 0.693 0.449 0.892 -0.677 2.033 1.533 3.046 -1.005 0.569 1.015 1.582 1.552 0.761 1.715 1.492 0.091] [ 1.31 -0.69 -0.676 1.589 -0.693 0.623 1.088 -0.382 0.589 1.365 1.179 0.968 0.729 0. 1.083 0.043 1.155 0.095]] (10000, 18) float64
The goal of supervised learning is to build an estimator $\varphi_{\cal L}: {\cal X} \mapsto {\cal Y}$ minimizing
$$ Err(\varphi_{\cal L}) = \mathbb{E}_{X,Y}\{ L(Y, \varphi_{\cal L}(X)) \}. $$where $L$ is a loss function (e.g., the zero-one loss for classification).
A decision tree is a cut-based partition of the input space, where each region (= leaf) is fit with a (simple) predictive model.
def build(L):
    create node t
    if the stopping criterion is True:
        assign a predictive model to t
    else:
        Find the best binary split L = L_left + L_right
        t.left = build(L_left)
        t.right = build(L_right)
    return t
All learning algorithms in scikit-learn, including decision trees, share a uniform and limited API consisting of complementary interfaces:
- the estimator interface, for building and fitting models;
- the predictor interface, for making predictions;
- the transformer interface, for converting data.
An estimator is any object that learns from data; it may be a classification, regression or clustering algorithm, or a transformer that extracts/filters useful features from raw data.
class Estimator(object):
    """Skeleton illustrating the scikit-learn estimator/predictor API."""

    def fit(self, X, y=None):
        """Fits estimator to data."""
        # set state of ``self``
        return self

    def predict(self, X):
        """Predict response of ``X``."""
        # compute predictions ``pred``
        # (``pred`` is intentionally undefined -- this is API pseudocode)
        return pred
# Import the decision tree class
from sklearn.tree import DecisionTreeClassifier  # Change this to try
                                                 # something else

# Set hyper-parameters, for controlling the learning algorithm
clf = DecisionTreeClassifier(criterion="entropy")

# Learn a model from training data
clf.fit(X, y)
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None, max_features=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=None, splitter='best')
# Estimator state is stored in instance attributes
# (here, the tree structure built by ``fit``)
clf.tree_
<sklearn.tree._tree.Tree at 0x7fbe36f80648>
# Make predictions (hard class labels) for the first five samples
print(clf.predict(X[:5]))
[0 1 1 1 1]
# Compute class probabilities (one column per class, rows sum to 1)
print(clf.predict_proba(X[:5]))
[[ 1. 0.] [ 0. 1.] [ 0. 1.] [ 0. 1.] [ 0. 1.]]
# Export the first two levels of the fitted tree in Graphviz .dot format
from sklearn.tree import export_graphviz
export_graphviz(clf, out_file="tree.dot",
                feature_names=df.columns[1:], max_depth=2)
# Shell magic: render the .dot file to PNG with Graphviz
!dot -Tpng -o tree.png tree.dot
# Display the rendered image inline
from IPython.display import Image
Image("tree.png")
# FIX: ``sklearn.cross_validation`` was deprecated and removed; the
# supported module is ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split

# Hold out part of the data (default 25%) for an unbiased evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

clf = DecisionTreeClassifier(criterion="entropy",
                             random_state=1).fit(X_train, y_train)
print("Training accuracy =", clf.score(X_train, y_train))  # Biased estimate
print("Test accuracy =", clf.score(X_test, y_test))        # Unbiased estimate
Training accuracy = 1.0 Test accuracy = 0.7144
Beware of bias when you estimate model performance:
When ${\cal L}$ is small, prefer K-Fold cross-validation instead of train-test split for more accurate estimates.
# FIX: the old ``sklearn.cross_validation.KFold(n=..., n_folds=...)`` API
# was removed; ``sklearn.model_selection.KFold`` is data-independent and
# takes ``n_splits``. Without shuffle=True the folds are contiguous and
# ``random_state`` has no effect, matching the original behaviour.
from sklearn.model_selection import KFold

for train, test in KFold(n_splits=5).split(X):
    print(train)  # indices of the training fold
    print(test)   # indices of the held-out fold
    print()
[2000 2001 2002 ..., 9997 9998 9999] [ 0 1 2 ..., 1997 1998 1999] [ 0 1 2 ..., 9997 9998 9999] [2000 2001 2002 ..., 3997 3998 3999] [ 0 1 2 ..., 9997 9998 9999] [4000 4001 4002 ..., 5997 5998 5999] [ 0 1 2 ..., 9997 9998 9999] [6000 6001 6002 ..., 7997 7998 7999] [ 0 1 2 ..., 7997 7998 7999] [8000 8001 8002 ..., 9997 9998 9999]
# Manual 5-fold cross-validation of a decision tree.
# FIX: ported from the removed ``sklearn.cross_validation`` KFold API to
# ``sklearn.model_selection`` (contiguous folds, same behaviour).
from sklearn.model_selection import KFold

scores = []
for train, test in KFold(n_splits=5).split(X):
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    clf = DecisionTreeClassifier(criterion="entropy",
                                 random_state=1).fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
# Report mean +- standard deviation over the folds
print("%f +-%f" % (np.mean(scores), np.std(scores)))
0.704500 +-0.006804
# Shortcut: cross_val_score runs the fit/score loop for us.
# FIX: both helpers moved from the removed ``sklearn.cross_validation``
# module to ``sklearn.model_selection``; KFold now takes ``n_splits``.
from sklearn.model_selection import KFold, cross_val_score

scores = cross_val_score(DecisionTreeClassifier(criterion="entropy",
                                                random_state=1),
                         X, y, cv=KFold(n_splits=5),
                         scoring="accuracy")
print("%f +-%f" % (np.mean(scores), np.std(scores)))
0.704500 +-0.006804
# Score trees of increasing max_depth on training and validation folds.
# FIX: ``sklearn.learning_curve`` was removed; ``validation_curve`` now
# lives in ``sklearn.model_selection``.
from sklearn.model_selection import validation_curve

param_range = range(1, 16)
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(), X, y,
    param_name="max_depth",
    param_range=param_range, cv=5, n_jobs=-1)

# Mean/std over the CV folds (axis 1 indexes folds)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
# Plot training vs test score, with +-1 standard-deviation bands over folds
plt.xlabel("max_depth")
plt.ylabel("score")
plt.xlim(min(param_range), max(param_range))
plt.plot(param_range, train_scores_mean, color="red", label="training score")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color="red")
plt.plot(param_range, test_scores_mean, color="blue", label="test score")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color="blue")
plt.legend(loc="best")
<matplotlib.legend.Legend at 0x7fbe49376f50>
# Best trade-off: the depth achieving the highest mean validation score
print("max_depth = %d, accuracy = %f" % (param_range[np.argmax(test_scores_mean)],
                                         np.max(test_scores_mean)))
max_depth = 5, accuracy = 0.768000
GridSearchCV
# GridSearchCV exhaustively evaluates every hyper-parameter combination by
# cross-validation and keeps the best estimator.
# FIX: ``sklearn.grid_search`` was removed; use ``sklearn.model_selection``.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(DecisionTreeClassifier(),
                    param_grid={"max_depth": range(1, 16),
                                "criterion": ["gini", "entropy"],
                                "min_samples_leaf": [1, 5, 10, 50]},
                    scoring="accuracy",
                    cv=5, n_jobs=-1)
grid.fit(X, y)

# Warning: don't report these numbers in your experiments!
print("Best score = %f, Best parameters = %s" % (grid.best_score_,
                                                 grid.best_params_))
Best score = 0.773200, Best parameters = {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 50}
The resulting `grid.best_estimator_` model is not independent from `grid.best_score_`, since its construction was guided by the maximization of this quantity. As a result, the optimized `grid.best_score_` accuracy may in fact be a biased, optimistic estimate of the true performance of the model.
# Nested cross-validation: the inner GridSearchCV tunes hyper-parameters,
# while the outer cross_val_score measures the tuned model on unseen folds.
# FIX: both helpers moved from the removed ``sklearn.cross_validation`` /
# ``sklearn.grid_search`` modules to ``sklearn.model_selection``.
from sklearn.model_selection import GridSearchCV, cross_val_score

scores = cross_val_score(
    GridSearchCV(DecisionTreeClassifier(),
                 param_grid={"max_depth": range(1, 16),
                             "criterion": ["gini", "entropy"],
                             "min_samples_leaf": [1, 5, 10, 50]},
                 scoring="accuracy",
                 cv=5, n_jobs=-1),
    X, y, cv=5, scoring="accuracy")

# Unbiased estimate of the accuracy
print("%f +-%f" % (np.mean(scores), np.std(scores)))
0.770500 +-0.005477
# FIX: ``sklearn.cross_validation`` was removed; use ``model_selection``.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# An ensemble of 1000 randomized trees, averaged together
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000).fit(X_train, y_train)
print("Training accuracy =", rf.score(X_train, y_train))
print("Test accuracy =", rf.score(X_test, y_test))
Training accuracy = 1.0 Test accuracy = 0.8016
Feature importances can be retrieved through the `estimator.feature_importances_` attribute.
# Feature importances from totally randomized trees (TRT)
# max_features=1 with a shallow depth yields (totally) randomized splits
from sklearn.ensemble import ExtraTreesClassifier
trt = ExtraTreesClassifier(n_estimators=1000,
                           max_features=1, max_depth=3).fit(X_train, y_train)
# Plot importances: compare the tuned decision tree (DT), the random
# forest (RF) and the totally randomized trees (TRT), indexed by feature.
feature_names = df.columns[1:]  # skip the "target" column
importances = pd.DataFrame()
importances["DT"] = pd.Series(grid.best_estimator_.feature_importances_,
                              index=feature_names)
importances["RF"] = pd.Series(rf.feature_importances_, index=feature_names)
importances["TRT"] = pd.Series(trt.feature_importances_, index=feature_names)
importances.plot(kind="barh")
<matplotlib.axes.AxesSubplot at 0x7fbe496da290>
Disclaimer: Importances are measured only through the eyes of the model. They may not tell the entire nor the same story!
Relation between the response Y and a subset of features, marginalized over all other features.
# Partial dependence: marginal effect of selected feature(s) on the
# predicted response, averaged over all other features.
from sklearn.ensemble import GradientBoostingClassifier
# NOTE(review): ``sklearn.ensemble.partial_dependence`` is a legacy module;
# recent scikit-learn releases expose this via ``sklearn.inspection`` with
# a different API -- confirm the installed version before porting.
from sklearn.ensemble.partial_dependence import plot_partial_dependence
gbrt = GradientBoostingClassifier(n_estimators=100,
                                  learning_rate=0.1,
                                  max_leaf_nodes=5)
gbrt.fit(X, y)
# One-way dependence on features 0 and 6, then a two-way plot for (1, 4)
plot_partial_dependence(gbrt, X, feature_names=feature_names, features=[0, 6])
plot_partial_dependence(gbrt, X, feature_names=feature_names, features=[(1, 4)])
(<matplotlib.figure.Figure at 0x7fbe493c3a90>, [<matplotlib.axes.AxesSubplot at 0x7fbe35790750>])
More detailed information can be found here.
Importance of samples and classes can be controlled through the `sample_weight` and `class_weight` parameters.
from sklearn.ensemble import RandomForestClassifier
# Make classes equiprobable, even if classes are unbalanced
# (shortcut: class_weight="auto"; renamed "balanced" in later releases)
clf = RandomForestClassifier(class_weight={0: 0.5, 1: 0.5})
# Specify per-sample weights at fit time (random here, for illustration)
sample_weight = np.random.rand(X_train.shape[0])
clf.fit(X_train, y_train, sample_weight=sample_weight)
RandomForestClassifier(bootstrap=True, class_weight={0: 0.5, 1: 0.5}, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
As any Python objects, estimators can be saved to disk for future reuse using `pickle`.
clf = DecisionTreeClassifier().fit(X_train, y_train)

# Save to disk
# FIX: pickle streams are binary -- the file must be opened in "wb"/"rb"
# mode (text mode raises TypeError on Python 3). Context managers also
# guarantee the handles are closed.
import pickle
with open("my-model.dat", "wb") as f:
    pickle.dump(clf, f)

# Load from disk
# SECURITY: only unpickle files you trust -- pickle.load can execute
# arbitrary code.
with open("my-model.dat", "rb") as f:
    clf_loaded = pickle.load(f)
clf_loaded
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=None, splitter='best')
# FIX: ``sklearn.cross_validation`` was removed; use ``model_selection``.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Bagged, randomized decision trees trained by TMVA through the REP wrapper
from rep.estimators.tmva import TMVAClassifier
clf = TMVAClassifier(method="kBDT", BoostType="Bagging", NTrees=100,
                     UseRandomisedTrees=True, UseNVars=4, nCuts=-1,
                     PruneMethod="NoPruning", MaxDepth=10000, MinNodeSize=10e-7)
clf.fit(X_train, y_train)
print("Training accuracy =", clf.score(X_train, y_train))
print("Test accuracy =", clf.score(X_test, y_test))
Training accuracy = 0.956533333333 Test accuracy = 0.794
Scikit-Learn is more than training classifiers. It also covers:
questions?