#!/usr/bin/env python
# coding: utf-8

# Source available on [Nbviewer](http://nbviewer.ipython.org/github/stephanie-w/brainscribble/blob/master/source/classification-algorithms-on-iris-dataset.ipynb).

# In[123]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

# The iris dataset consists of measurements of three different species of irises.
# scikit-learn embeds a copy of the iris CSV file along with a helper function to load it into numpy arrays.

# In[124]:

from sklearn.datasets import load_iris
iris = load_iris()

# In[125]:

iris.keys()

# In[126]:

print(iris.data.shape)
print(iris.target.shape)

# ## Exploring the dataset

# The target classes to predict are the three iris species:
#
# * setosa
# * versicolor
# * virginica

# In[127]:

iris.target_names

# The features in the iris dataset are:
#
# * sepal length
# * sepal width
# * petal length
# * petal width

# In[128]:

iris.feature_names

# In[129]:

x_index = 2  # petal length
y_index = 3  # petal width

iris.data[:, x_index]  # petal length values

# In[130]:

iris.target  # iris class labels (0, 1, 2)

# In[131]:

from matplotlib.ticker import FuncFormatter

formatter = FuncFormatter(lambda i, *args: iris.target_names[int(i)])

plt.scatter(iris.data[:, x_index], iris.data[:, y_index], c=iris.target)
plt.colorbar(ticks=[0, 1, 2], format=formatter)
plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])
plt.title("Iris classification according to petal measurements")

# In[132]:

x_index = 0  # sepal length
y_index = 1  # sepal width

formatter = FuncFormatter(lambda i, *args: iris.target_names[int(i)])

plt.scatter(iris.data[:, x_index], iris.data[:, y_index], c=iris.target)
plt.colorbar(ticks=[0, 1, 2], format=formatter)
plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])
plt.title("Iris classification according to sepal measurements")

# ## Using classification algorithms

# ### KNeighborsClassifier
#
# The simplest possible classifier is the nearest neighbor: given a new observation, it predicts the label of the closest training sample.

# #### Fitting the model

# In[133]:

from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
knn.fit(iris.data, iris.target)

# #### Doing a prediction

# In[134]:

# What is the iris class for a 3cm x 5cm sepal and a 4cm x 2cm petal?
result = knn.predict([[3, 5, 4, 2]])
print(iris.target_names[result])
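# As a quick sanity check (this cell is not part of the original notebook), we can
# estimate how well the nearest-neighbor model generalizes by holding out part of
# the data. This is a minimal sketch using scikit-learn's train_test_split and the
# classifier's built-in accuracy score; the split ratio and random_state are
# arbitrary choices.

# In[ ]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)
knn_check = neighbors.KNeighborsClassifier(n_neighbors=1)
knn_check.fit(X_train, y_train)
print(knn_check.score(X_test, y_test))  # mean accuracy on the held-out set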
# #### Exploring the results

# Let's draw the classification regions according to sepal measurements.
# First, let's fit the model using the first two features only:

# In[135]:

X = iris.data[:, :2]  # working with the first two features: sepal length and sepal width
y = iris.target

knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)

# Then, let's build an input data matrix containing continuous values of sepal length and width (from min to max) and apply the predict function to it:

# In[136]:

x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1  # min and max sepal length
y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1  # min and max sepal width
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Let's put the result into a color plot:

# In[137]:

plt.figure()

# Create color maps for the 3-class classification problem
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Plot the classification map
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.axis('tight')

# Let's add the actual values of iris sepal length/width vs. classification to the color map:

# In[138]:

plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Add the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.axis('tight')

# ### Support Vector Classifier (SVC)

# In[139]:

from sklearn import svm
clf = svm.LinearSVC(loss='squared_hinge')  # 'squared_hinge' is the current name for the old loss='l2'

# #### Fitting the model

# In[140]:

clf.fit(iris.data, iris.target)

# #### Doing a prediction

# In[141]:

# What is the iris class for a 3cm x 5cm sepal and a 4cm x 2cm petal?
result = clf.predict([[3, 5, 4, 2]])
print(iris.target_names[result])

# ### Exploring different classifiers (kernels)

# In[142]:

X = iris.data[:, :2]  # working with the first two features: sepal length and sepal width
y = iris.target

# In[143]:

def plot_class_map(clf, X, y, title="", **params):
    clf.fit(X, y)

    x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
    y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure()
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot the training points as well
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlabel('sepal length (cm)')
    plt.ylabel('sepal width (cm)')
    plt.axis('tight')
    plt.title(title)

# Linear kernel
clf = svm.SVC(kernel='linear')
plot_class_map(clf, X, y, 'SVC with linear kernel')

# RBF kernel
clf = svm.SVC(kernel='rbf')
plot_class_map(clf, X, y, 'SVC with RBF kernel')

# Polynomial kernel
clf = svm.SVC(kernel='poly', degree=3)
plot_class_map(clf, X, y, 'SVC with polynomial kernel (degree 3)')

# Note:
#
# The linear models LinearSVC() and SVC(kernel='linear') yield slightly different decision boundaries. This can be a consequence of the following differences:
#
# * LinearSVC minimizes the squared hinge loss while SVC minimizes the regular hinge loss.
# * LinearSVC uses the One-vs-All (also known as One-vs-Rest) multiclass reduction while SVC uses the One-vs-One multiclass reduction.
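# To go beyond eyeballing the decision boundaries, we can also score the kernels
# numerically. This cell is not in the original notebook; it is a minimal sketch
# using 5-fold cross-validation on the same two sepal features, with default
# hyperparameters for each kernel.

# In[ ]:

from sklearn.model_selection import cross_val_score

for kernel in ('linear', 'rbf', 'poly'):
    clf = svm.SVC(kernel=kernel)
    scores = cross_val_score(clf, X, y, cv=5)  # accuracy on each of 5 folds
    print("%s: %.3f" % (kernel, scores.mean()))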
# In[144]:

clf = svm.SVC(kernel="linear")
clf.fit(iris.data, iris.target)

# What is the iris class for a 3cm x 5cm sepal and a 4cm x 2cm petal?
result = clf.predict([[3, 5, 4, 2]])
print(iris.target_names[result])

# ### Decision Tree

# In[145]:

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

# #### Fitting the model

# In[146]:

clf.fit(iris.data, iris.target)

# #### Doing a prediction

# In[147]:

# What is the iris class for a 3cm x 5cm sepal and a 4cm x 2cm petal?
result = clf.predict([[3, 5, 4, 2]])
print(iris.target_names[result])

# #### Decision trees and over-fitting

# In[111]:

X = iris.data[:, :2]  # working with the first two features: sepal length and sepal width
y = iris.target
clf.fit(X, y)
plot_class_map(clf, X, y)

# The issue with decision trees is that they tend to overfit the data:
# they are flexible enough to learn the structure of the noise in the data rather than the signal.
# Limiting the depth of the tree regularizes it:

# In[122]:

clf = DecisionTreeClassifier(max_depth=4)
clf.fit(X, y)
plot_class_map(clf, X, y)

# The model obtained by limiting the tree depth is a much better fit to the data.
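# A natural next step is the random forest, an ensemble that averages many
# randomized decision trees and is therefore much less prone to the overfitting
# seen above. This cell is not in the original notebook; it is a minimal sketch
# with an arbitrary choice of 100 trees.

# In[ ]:

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
plot_class_map(clf, X, y, 'Random forest (100 trees)')  # decision regions on the two sepal features

# The same prediction as before, using all four features:
clf.fit(iris.data, iris.target)
result = clf.predict([[3, 5, 4, 2]])
print(iris.target_names[result])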