%matplotlib inline
In this notebook we explore the confusion matrix and ROC/PR curves for evaluating classifiers.
See: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html and http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('bmh')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# first let's load some data
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data[:, :10]  # keep the first 10 features
y = data.target * 2 - 1  # map labels from {0, 1} to {-1, +1} for the perceptron below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
classifier = LogisticRegression(C=0.1)
_ = classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)
array([[ 62,  18],
       [ 10, 138]])
The confusion matrix is a matrix $C$ such that $C_{i, j}$ is equal to the number of observations known to be in group $i$ but predicted to be in group $j$.
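For a binary problem, the four entries can be unpacked directly; here is a minimal sketch using the predictions computed above (the derived metrics are shown for illustration):
# unpack the binary confusion matrix into its four counts;
# ravel() flattens the 2x2 array in row-major order, giving tn, fp, fn, tp
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print('accuracy:  %.3f' % ((tp + tn) / (tp + tn + fp + fn)))
print('precision: %.3f' % (tp / (tp + fp)))
print('recall:    %.3f' % (tp / (tp + fn)))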
class Perceptron:
    """An implementation of the perceptron algorithm.
    Note that this implementation does not include a bias term."""

    def __init__(self, max_iterations=100, learning_rate=0.2):
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate

    def fit(self, X, y):
        """
        Train a classifier using the perceptron training algorithm.
        After training, the attribute 'w' contains the perceptron weight vector.

        Parameters
        ----------
        X : ndarray, shape (n_examples, n_features)
            Training data.
        y : ndarray, shape (n_examples,)
            Array of labels in {-1, +1}.
        """
        self.w = np.zeros(X.shape[1])
        converged = False
        iterations = 0
        while not converged and iterations <= self.max_iterations:
            converged = True
            for i in range(len(X)):
                # a mistake: example i is on the wrong side of the hyperplane
                if y[i] * self.decision_function(X[i]) <= 0:
                    self.w = self.w + y[i] * self.learning_rate * X[i]
                    converged = False
            iterations += 1
        self.converged = converged
        if converged:
            print('converged in %d iterations' % iterations)

    def decision_function(self, x):
        return np.inner(self.w, x)

    def predict(self, X):
        """
        Make predictions using a trained linear classifier.

        Parameters
        ----------
        X : ndarray, shape (n_examples, n_features)
            Data to classify.
        """
        scores = np.inner(self.w, X)
        return np.sign(scores)
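As a quick sanity check, we can fit the perceptron on the same split (a minimal sketch; the features are unscaled, so the algorithm may hit max_iterations without converging):
perceptron = Perceptron()
perceptron.fit(X_train, y_train)
# fraction of correctly classified test examples
print('test accuracy: %.3f' % np.mean(perceptron.predict(X_test) == y_test))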
from sklearn.metrics import roc_curve, auc
classifiers = {'Perceptron': Perceptron(),
               'logistic regression C=0.1': LogisticRegression(C=0.1),
               'logistic regression C=10': LogisticRegression(C=10),
               'logistic regression C=100': LogisticRegression(C=100),
               }
# note that the parameter C of
# the scikit-learn LogisticRegression class is inversely
# proportional to the strength of regularization
results = {}
for description, classifier in classifiers.items():
    _ = classifier.fit(X_train, y_train)
    scores = classifier.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, scores)
    results[description] = (fpr, tpr)
plt.figure()
plt.plot([0, 1], [0, 1], 'k--');
for c in results:
    fpr, tpr = results[c]
    plt.plot(fpr, tpr, label=c);
plt.xlabel('False positive rate');
plt.ylabel('True positive rate');
plt.title('ROC curve');
plt.legend(loc='best');
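Since auc was imported above, each ROC curve can also be summarized as a single number, its area under the curve; this reuses the fpr/tpr arrays stored in results:
# the area under the ROC curve (AUC) condenses each curve into one number in [0, 1]
for c in results:
    fpr, tpr = results[c]
    print('%s: AUC = %.3f' % (c, auc(fpr, tpr)))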
from sklearn.metrics import precision_recall_curve
results = {}
for description, classifier in classifiers.items():
    _ = classifier.fit(X_train, y_train)
    scores = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, scores)
    results[description] = (precision, recall)
plt.figure()
for c in results:
    precision, recall = results[c]
    plt.step(recall, precision, where='post', label=c)
plt.xlabel('Recall');
plt.ylabel('Precision');
plt.title('PR curve');
plt.ylim([0.0, 1.05])
plt.legend(loc='best');
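Analogously to AUC for ROC curves, a PR curve can be summarized by the average precision; here is a minimal sketch using scikit-learn's average_precision_score on the classifiers fitted above:
from sklearn.metrics import average_precision_score

# average precision approximates the area under the PR curve
for description, classifier in classifiers.items():
    scores = classifier.decision_function(X_test)
    print('%s: AP = %.3f' % (description, average_precision_score(y_test, scores)))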