import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import scipy.sparse.linalg as linalg
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist, squareform
import sklearn.datasets as datasets
import sklearn.metrics as metrics
import sklearn.utils as utils
import sklearn.linear_model as linear_model
import sklearn.svm as svm
import sklearn.model_selection as model_selection  # sklearn.cross_validation was removed in newer scikit-learn
import sklearn.cluster as cluster
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from patsy import dmatrices
import seaborn as sns
%matplotlib inline
Working with the wine dataset:
wine = pd.read_csv("datasets/wine.data", header=None)  # wine.data has no header row
attributes = ['region',
              'Alcohol',
              'Malic acid',
              'Ash',
              'Alcalinity of ash',
              'Magnesium',
              'Total phenols',
              'Flavanoids',
              'Nonflavanoid phenols',
              'Proanthocyanins',
              'Color intensity',
              'Hue',
              'OD280/OD315 of diluted wines',
              'Proline']
wine.columns = attributes
X = wine[['Alcohol', 'Proline']].values
grape = wine.pop('region')
y = grape.values
wine.info()
wine.head()
svc = svm.SVC(kernel='linear')
svc.fit(X, y)
Evaluating the fit of the classifier graphically
from matplotlib.colors import ListedColormap
# Create color maps for 3-class classification problem, as with iris
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
def plot_estimator(estimator, X, y):
    try:
        X, y = X.values, y.values
    except AttributeError:
        pass
    estimator.fit(X, y)
    x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
    y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.axis('tight')
    plt.axis('off')
    plt.tight_layout()
plot_estimator(svc, X, y)
The SVM gets its name from the training samples in each class that lie closest to the other class. These samples are called "support vectors" because moving them in p-dimensional space would move the decision boundary.
In scikit-learn, the support vectors of a fitted SVC object are stored in its support_vectors_ attribute (their indices are in support_). Here is a two-class problem using only classes 1 and 2 of the wine dataset.
The support vectors are circled.
# Extract classes 1 and 2
X, y = X[np.in1d(y, [1, 2])], y[np.in1d(y, [1, 2])]
plot_estimator(svc, X, y)
plt.scatter(svc.support_vectors_[:, 0],
            svc.support_vectors_[:, 1],
            s=120,
            facecolors='none',
            edgecolors='k',
            linewidths=2,
            zorder=10)
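The fitted classifier can also be inspected numerically. A small check on the svc object that plot_estimator just refit on the two-class data (support_, n_support_ and support_vectors_ are standard attributes of a fitted SVC):
# Indices of the support vectors, counts per class, and the vectors themselves
print(svc.support_)
print(svc.n_support_)
print(svc.support_vectors_[:3])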
These two classes do not appear to be linearly separable.
For classes that are not linearly separable, we turn to regularization, which is tuned via the C parameter. In practice, a large C value means few support vectors (less regularization), while a small C implies many support vectors (more regularization). scikit-learn uses a default of C=1.
svc = svm.SVC(kernel='linear', C=1e6)
plot_estimator(svc, X, y)
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1], s=80,
            facecolors='none', edgecolors='k', linewidths=2, zorder=10)
plt.title('High C value: small number of support vectors')
svc = svm.SVC(kernel='linear', C=1e-2)
plot_estimator(svc, X, y)
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1], s=80,
            facecolors='none', edgecolors='k', linewidths=2, zorder=10)
plt.title('Low C value: large number of support vectors')
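To make the effect of C concrete, we can simply count the support vectors for a few values of C (a quick sketch on the same two-class data; exact counts depend on the data and features used):
# Fewer support vectors as C grows (weaker regularization)
for C in (1e-2, 1e0, 1e6):
    model = svm.SVC(kernel='linear', C=C).fit(X, y)
    print(C, model.n_support_.sum())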
We can also choose from a suite of available kernels (linear, poly, rbf, sigmoid, precomputed), or pass a custom kernel as a function. Note that the radial basis function (rbf) kernel is just a Gaussian kernel, with parameter $\gamma = 1/\sigma^2$.
svc_lin = svm.SVC(kernel='linear')
plot_estimator(svc_lin, X, y)
plt.scatter(svc_lin.support_vectors_[:, 0], svc_lin.support_vectors_[:, 1],
            s=80, facecolors='none', edgecolors='k', linewidths=2, zorder=10)
plt.title('Linear kernel')
#svc_poly = svm.SVC(kernel='poly', degree=3)
#plot_estimator(svc_poly, X, y)
#plt.scatter(svc_poly.support_vectors_[:, 0], svc_poly.support_vectors_[:, 1],
# s=80, facecolors='none', linewidths=2, zorder=10)
#plt.title('Polynomial kernel')
#svc_rbf = svm.SVC(kernel='rbf', gamma=1e-2)
#plot_estimator(svc_rbf, X, y)
#plt.scatter(svc_rbf.support_vectors_[:, 0], svc_rbf.support_vectors_[:, 1],
# s=80, facecolors='none', linewidths=2, zorder=10)
#plt.title('RBF kernel')
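As noted above, SVC also accepts a callable as a custom kernel: the function receives two sample matrices and must return the Gram matrix between them. A minimal sketch with a hand-rolled linear kernel (equivalent to kernel='linear'), reusing X and y from above:
# Custom kernel: K(A, B) = A B^T, i.e. an explicit linear kernel
def my_linear_kernel(A, B):
    return np.dot(A, B.T)
svc_custom = svm.SVC(kernel=my_linear_kernel)
plot_estimator(svc_custom, X, y)
plt.title('Custom (linear) kernel')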
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    wine.values, grape.values, test_size=0.4, random_state=0)
f = svm.SVC(kernel='linear', C=1)
f.fit(X_train, y_train)
f.score(X_test, y_test)
# mean accuracy
scores = model_selection.cross_val_score(f, wine.values, grape.values, cv=5)
print(scores)
# precision (macro-averaged, since this is a multiclass problem)
model_selection.cross_val_score(f, wine.values, grape.values, cv=5, scoring='precision_macro')
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target
h = .02 # step size in the mesh
# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
C = 1.0 # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=C).fit(X, y)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)
lin_svc = svm.LinearSVC(C=C).fit(X, y)
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# title for the plots
titles = ['SVC with linear kernel',
'LinearSVC (linear kernel)',
'SVC with RBF kernel', 'SVC with poly kernel']
fig = plt.figure(figsize=(10,10))
for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(2, 2, i + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[i])
plt.show()
adaboost = AdaBoostClassifier(svm.SVC(kernel='linear'),
                              algorithm="SAMME",
                              n_estimators=200)
#adaboost.fit(wine.values, grape.values)
scores = model_selection.cross_val_score(adaboost, wine.values, grape.values, cv=5)
print(scores)
print(scores.mean())
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    styles = open("custom.css", "r").read()
    return HTML(styles)
css_styling()