In [1]:

# Use a wide, short default figure size for every plot in this notebook.
# NOTE(review): rcParams is never imported in the visible cells; presumably it
# comes from a pylab-style namespace (e.g. %pylab inline) -- confirm.
rcParams['figure.figsize'] = (16, 4)

# A lot of the code below is borrowed from the scikit-learn docs

http://peekaboo-vision.blogspot.com/2013/01/machine-learning-cheat-sheet-for-scikit.html

In [2]:

# Load the iris dataset bundled with scikit-learn. The cells below read the
# feature matrix from data.data, the integer class labels from data.target and
# the class names from data.target_names.
from sklearn.datasets import load_iris
data = load_iris()

Decision Trees¶

http://scikit-learn.org/stable/modules/tree.html

In [4]:

from sklearn import tree

In [5]:

# Fit a decision tree with default settings on the complete iris dataset
# (no held-out data at this stage).
clf = tree.DecisionTreeClassifier()
clf.fit(data.data, data.target)

Out[5]:

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, min_density=None,
            min_samples_leaf=1, min_samples_split=2, random_state=None,
            splitter='best')

In [6]:

# Predict the class of the first sample. predict() expects a 2-D array of
# shape (n_samples, n_features), so pass a one-row slice rather than the 1-D
# row data.data[0]; newer scikit-learn releases reject 1-D input.
clf.predict(data.data[0:1])

Out[6]:

array([0])

In [7]:

# Predict the class of the sample at index 10. A one-row 2-D slice is passed
# because predict() expects shape (n_samples, n_features); newer scikit-learn
# releases reject the 1-D row data.data[10].
clf.predict(data.data[10:11])

Out[7]:

array([0])

In [8]:

# Predict every sample in the dataset. These are the same samples the tree was
# fit on, so this shows training-set predictions, not generalisation.
clf.predict(data.data)

Out[8]:

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:

# Render the fitted decision tree inline as a PNG image.
from IPython.display import Image  # displays raw PNG bytes in the notebook
# StringIO is the Python 2 module name (Python 3 moved it into io);
# pydot converts Graphviz dot text into an image.
import StringIO, pydot

# export_graphviz writes the tree in dot format to a file-like object, so an
# in-memory buffer is used instead of a file on disk.
dot_data = StringIO.StringIO() 
tree.export_graphviz(clf, out_file=dot_data) 
# NOTE(review): some pydot versions return a list of graphs from
# graph_from_dot_data rather than a single graph -- confirm for the installed
# version before relying on graph.create_png().
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

Out[9]:

Cross-validation¶

http://scikit-learn.org/stable/modules/classes.html#module-sklearn.cross_validation

In [10]:

from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the complete iris dataset.
clf2 = GaussianNB()

clf2.fit(data.data, data.target)

Out[10]:

GaussianNB()

In [11]:

from sklearn import cross_validation

In [12]:

# Build a 10-fold splitter over the sample indices; iterating over kf yields
# (train_index, test_index) pairs.
# NOTE(review): no shuffling is requested, and the prediction arrays printed
# earlier suggest the samples are ordered by class, so each test fold would
# cover only one or two classes -- consider shuffle=True with a fixed
# random_state; confirm.
kf = cross_validation.KFold(len(data.data), n_folds=10)
len(kf)

Out[12]:

In [22]:

# Hand-rolled k-fold cross-validation of Gaussian naive Bayes: for each fold,
# fit on the training rows and record the fraction of held-out rows that are
# classified correctly.
# The names clf2 and predictions are deliberately rebound on every iteration,
# exactly as before; cells further down read whatever the last fold left behind.
pc_folds = []
for train_index, test_index in kf:
    train_X, train_y = data.data[train_index], data.target[train_index]
    test_X, test_y = data.data[test_index], data.target[test_index]

    clf2 = GaussianNB()
    clf2.fit(train_X, train_y)
    predictions = clf2.predict(test_X)

    # Correct predictions divided by fold size; float() keeps the division
    # from truncating under Python 2.
    pc_right = (predictions == test_y).sum() / float(len(test_index))
    pc_folds.append(pc_right)
    

In [23]:

# Per-fold accuracy (fraction of test samples classified correctly), one entry per fold.
pc_folds

Out[23]:

[1.0,
 1.0,
 1.0,
 0.93333333333333335,
 0.93333333333333335,
 0.8666666666666667,
 1.0,
 0.8666666666666667,
 0.8666666666666667,
 1.0]

In [24]:

# Average accuracy across the folds.
# NOTE(review): mean is not imported in the visible cells; presumably it is
# numpy's mean from a pylab-style namespace -- confirm.
mean(pc_folds)

Out[24]:

0.94666666666666688

In [25]:

# Worst-case accuracy over the folds.
min(pc_folds)

Out[25]:

0.8666666666666667

Metrics¶

In [26]:

from sklearn import metrics

In [27]:

# Predict every sample in the dataset so the metrics below can be computed.
# NOTE(review): if the cells ran top to bottom, clf2 here is the model left
# over from the final iteration of the cross-validation loop (fit on that
# fold's training rows only), not the earlier full-data fit -- confirm this is
# intended. Either way, most of these samples were seen during fitting, so the
# scores below are optimistic.
predictions = clf2.predict(data.data)
predictions

Out[27]:

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [28]:

# Fraction of samples whose predicted label equals the true label.
metrics.accuracy_score(data.target, predictions)

Out[28]:

0.96666666666666667

In [29]:

# Per-class precision, recall, F1 and support as a text table.
# Written as a single-argument print(...) call so the line is valid on both
# Python 2 and Python 3 and prints the same text on either.
print(metrics.classification_report(data.target, predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.94      0.96      0.95        50
          2       0.96      0.94      0.95        50

avg / total       0.97      0.97      0.97       150

In [30]:

# Confusion matrix: data.target is passed as the true labels (rows) and
# predictions as the predicted labels (columns).
metrics.confusion_matrix(data.target, predictions)

Out[30]:

array([[50,  0,  0],
       [ 0, 48,  2],
       [ 0,  3, 47]])

In [33]:

# Show the confusion matrix as an image, one cell per (true, predicted) pair.
# The reversed gray colormap maps larger counts to darker cells;
# 'nearest' interpolation keeps each cell a flat square.
# NOTE(review): imshow and cm are not imported in the visible cells;
# presumably they come from a pylab-style namespace -- confirm.
confmat = metrics.confusion_matrix(data.target, predictions)
imshow(confmat,interpolation='nearest', cmap=cm.gray_r)

Out[33]:

<matplotlib.image.AxesImage at 0x3d88690>

In [25]:

# Annotated confusion-matrix plot: each cell is labelled with the count
# divided by the number of true samples in that row's class.
confmat = metrics.confusion_matrix(data.target, predictions)

# Inverting the counts under the plain gray colormap draws large counts dark.
imshow(confmat.max() - confmat, interpolation='nearest', cmap=cm.gray)

for rownum, row in enumerate(confmat):
    # Number of true samples of this class; it depends only on the row.
    class_total = sum(data.target == rownum).astype(float)
    for colnum, val in enumerate(row):
        # Dark cells (large counts) get white text, light cells black text.
        if val < confmat.max()/2:
            label_color = 'black'
        else:
            label_color = 'white'
        text(colnum, rownum, str(val/class_total),
             fontsize=24, color=label_color, ha='center', va='center')

# Label both axes with the class names.
tick_positions = arange(len(data.target_names))
xticks(tick_positions, data.target_names)
yticks(tick_positions, data.target_names);

http://media.aau.dk/null_space_pursuits/2012/01/frustrations-with-music-genre.html

By Andrés Cabrera mantaraya36@gmail.com

For course MAT 240E at UCSB

This ipython notebook is licensed under the CC-BY-NC-SA license: http://creativecommons.org/licenses/by-nc-sa/4.0/