#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') # # Classifier accuracy # # In this notebook we'll explore the classifier confusion matrix and ROC/PR curves. # # See: # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html and # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html # In[2]: import numpy as np import matplotlib.pyplot as plt plt.style.use('bmh') from sklearn import cross_validation from sklearn.linear_model import LogisticRegression from sklearn.metrics import confusion_matrix # first let's load some data from sklearn.datasets import load_breast_cancer data = load_breast_cancer() X = data.data[:,:10] y = data.target * 2 - 1 X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=1) classifier = LogisticRegression(C=0.1) _ = classifier.fit(X_train, y_train) y_pred = classifier.predict(X_test) confusion_matrix(y_test, y_pred) # The confusion matrix is a matrix $C$ such that $C_{i, j}$ is equal to the # number of observations known to be in group $i$ but predicted to be in group $j$. # # In[3]: class Perceptron : """An implementation of the perceptron algorithm. Note that this implementation does not include a bias term""" def __init__(self, max_iterations=100, learning_rate=0.2) : self.max_iterations = max_iterations self.learning_rate = learning_rate def fit(self, X, y) : """ Train a classifier using the perceptron training algorithm. After training the attribute 'w' will contain the perceptron weight vector. Parameters ---------- X : ndarray, shape (num_examples, n_features) Training data. y : ndarray, shape (n_examples,) Array of labels. """ self.w = np.zeros(len(X[0])) converged = False iterations = 0 while (not converged and iterations <= self.max_iterations) : converged = True for i in range(len(X)) : if y[i] * self.decision_function(X[i]) <= 0 : self.w = self.w + y[i] * self.learning_rate * X[i] converged = False iterations += 1 self.converged = converged if converged : print ('converged in %d iterations ' % iterations) def decision_function(self, x) : return np.inner(self.w, x) def predict(self, X) : """ make predictions using a trained linear classifier Parameters ---------- X : ndarray, shape (num_examples, n_features) Training data. """ scores = np.inner(self.w, X) return np.sign(scores) # ## ROC curves # In[4]: from sklearn.metrics import roc_curve, auc classifiers = {'Perceptron' : Perceptron(), 'logistic regression C=0.1': LogisticRegression(C=0.1), 'logistic regression C=10': LogisticRegression(C=10), 'logistic regression C=100': LogisticRegression(C=100), } # note that the parameter C of # the scikit-learn LogisticRegression class is inversely # proportional to the strength of regularization results = {} for description,classifier in classifiers.items() : _ = classifier.fit(X_train, y_train) scores=classifier.decision_function(X_test) fpr, tpr, _ = roc_curve(y_test, scores) results[description] = (fpr, tpr) plt.figure() plt.plot([0, 1], [0, 1], 'k--'); for c in results : fpr, tpr = results[c] plt.plot(fpr, tpr, label=c); plt.xlabel('False positive rate'); plt.ylabel('True positive rate'); plt.title('ROC curve'); plt.legend(loc='best'); # In[8]: from sklearn.metrics import precision_recall_curve results = {} for description,classifier in classifiers.items() : _ = classifier.fit(X_train, y_train) scores = classifier.decision_function(X_test) precision, recall, _ = precision_recall_curve(y_test, scores) results[description] = (precision, recall) plt.figure() for c in results : precision, recall = results[c] #plt.plot(precision, recall, label=c); plt.step(recall, precision, where='post', label=c) plt.xlabel('Recall'); plt.ylabel('Precision'); plt.title('PR curve'); plt.ylim([0.0, 1.05]) plt.legend(loc='best'); # In[ ]: