%matplotlib inline
In this notebook we explore the confusion matrix and ROC/PR curves for evaluating classifiers.
See: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html and http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('bmh')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# first let's load some data
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data[:, :10]  # keep the first 10 features
y = data.target * 2 - 1  # map labels from {0, 1} to {-1, +1} for the perceptron below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
classifier = LogisticRegression(C=0.1)
_ = classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)
array([[ 62,  18],
       [ 10, 138]])
The confusion matrix is a matrix $C$ such that $C_{i, j}$ is equal to the number of observations known to be in group $i$ but predicted to be in group $j$.
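For a binary problem, the four entries can be unpacked directly; here is a minimal sketch using the predictions computed above (the derived metrics are shown for illustration):
# unpack the binary confusion matrix into its four counts;
# ravel() flattens the 2x2 array in row-major order, giving tn, fp, fn, tp
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print('accuracy:  %.3f' % ((tp + tn) / (tp + tn + fp + fn)))
print('precision: %.3f' % (tp / (tp + fp)))
print('recall:    %.3f' % (tp / (tp + fn)))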
class Perceptron:
    """An implementation of the perceptron algorithm.
    Note that this implementation does not include a bias term."""

    def __init__(self, max_iterations=100, learning_rate=0.2):
        self.max_iterations = max_iterations
        self.learning_rate = learning_rate

    def fit(self, X, y):
        """
        Train a classifier using the perceptron training algorithm.
        After training, the attribute 'w' contains the perceptron weight vector.

        Parameters
        ----------
        X : ndarray, shape (n_examples, n_features)
            Training data.
        y : ndarray, shape (n_examples,)
            Array of labels in {-1, +1}.
        """
        self.w = np.zeros(X.shape[1])
        converged = False
        iterations = 0
        while not converged and iterations <= self.max_iterations:
            converged = True
            for i in range(len(X)):
                # a mistake: example i is on the wrong side of the hyperplane
                if y[i] * self.decision_function(X[i]) <= 0:
                    self.w = self.w + y[i] * self.learning_rate * X[i]
                    converged = False
            iterations += 1
        self.converged = converged
        if converged:
            print('converged in %d iterations' % iterations)

    def decision_function(self, x):
        return np.inner(self.w, x)

    def predict(self, X):
        """
        Make predictions using a trained linear classifier.

        Parameters
        ----------
        X : ndarray, shape (n_examples, n_features)
            Data to classify.
        """
        scores = np.inner(self.w, X)
        return np.sign(scores)
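As a quick sanity check, we can fit the perceptron on the same split (a minimal sketch; the features are unscaled, so the algorithm may hit max_iterations without converging):
perceptron = Perceptron()
perceptron.fit(X_train, y_train)
# fraction of correctly classified test examples
print('test accuracy: %.3f' % np.mean(perceptron.predict(X_test) == y_test))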
from sklearn.metrics import roc_curve, auc
classifiers = {'Perceptron': Perceptron(),
               'logistic regression C=0.1': LogisticRegression(C=0.1),
               'logistic regression C=10': LogisticRegression(C=10),
               'logistic regression C=100': LogisticRegression(C=100),
               }
# note that the parameter C of
# the scikit-learn LogisticRegression class is inversely
# proportional to the strength of regularization
results = {}
for description, classifier in classifiers.items():
    _ = classifier.fit(X_train, y_train)
    scores = classifier.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, scores)
    results[description] = (fpr, tpr)
plt.figure()
plt.plot([0, 1], [0, 1], 'k--');
for c in results:
    fpr, tpr = results[c]
    plt.plot(fpr, tpr, label=c);
plt.xlabel('False positive rate');
plt.ylabel('True positive rate');
plt.title('ROC curve');
plt.legend(loc='best');
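Since auc was imported above, each ROC curve can also be summarized as a single number, its area under the curve; this reuses the fpr/tpr arrays stored in results:
# the area under the ROC curve (AUC) condenses each curve into one number in [0, 1]
for c in results:
    fpr, tpr = results[c]
    print('%s: AUC = %.3f' % (c, auc(fpr, tpr)))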
from sklearn.metrics import precision_recall_curve
results = {}
for description, classifier in classifiers.items():
    _ = classifier.fit(X_train, y_train)
    scores = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, scores)
    results[description] = (precision, recall)
plt.figure()
for c in results:
    precision, recall = results[c]
    plt.step(recall, precision, where='post', label=c)
plt.xlabel('Recall');
plt.ylabel('Precision');
plt.title('PR curve');
plt.ylim([0.0, 1.05])
plt.legend(loc='best');
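Analogously to AUC for ROC curves, a PR curve can be summarized by the average precision; here is a minimal sketch using scikit-learn's average_precision_score on the classifiers fitted above:
from sklearn.metrics import average_precision_score

# average precision approximates the area under the PR curve
for description, classifier in classifiers.items():
    scores = classifier.decision_function(X_test)
    print('%s: AP = %.3f' % (description, average_precision_score(y_test, scores)))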