#!/usr/bin/env python
# coding: utf-8

# ## DecisionTree Classification
#
# [https://github.com/arundhaj](https://github.com/arundhaj)
#
# using vertebrate data set
#
# Notebook export: fits a scikit-learn decision tree on every row of
# data/vertebrate.csv except the last one, predicts the class of that
# held-out last row, and reports accuracy on the training rows.

# In[1]:

# get_ipython() is only defined inside IPython/Jupyter. Look it up defensively
# so the file can also be executed with a plain `python` interpreter, as the
# shebang suggests, instead of dying with a NameError on the first statement.
try:
    _ipython = get_ipython()
except NameError:
    _ipython = None
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# In[2]:

# Read the CSV file
data = pd.read_csv('data/vertebrate.csv')


# In[3]:

# List of all classes (only displayed when run as a notebook cell)
data['Class Label']


# In[4]:

# List of unique classes (only displayed when run as a notebook cell)
data['Class Label'].unique()


# In[5]:

# Number of entries for each unique class
class_group = data.groupby('Class Label').size()
class_group


# In[6]:

# Plot bar chart based on Class Label
class_group.plot(kind='bar', grid=False)


# In[7]:

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer

cols_to_retain = ['Body Temperature', 'Skin Cover', 'Gives Birth',
                  'Aquatic Creature', 'Aerial Creature', 'Has Legs',
                  'Hibernates']

X_feature = data[cols_to_retain]

# One dict per row, in row order. orient='records' avoids going through a
# dict keyed by index label, whose iteration order is not guaranteed on older
# interpreters and which would silently drop rows with duplicate labels --
# either of which would misalign the features with the labels below.
X_dict = X_feature.to_dict(orient='records')

# turn list of dicts into a numpy array
vect = DictVectorizer(sparse=False)
X_vector = vect.fit_transform(X_dict)

# print the features
# vect.get_feature_names()

# Every row except the last is the training set
X_Train = X_vector[:-1]
# The last row is held out as the test set
X_Test = X_vector[-1:]

# Used to vectorize the class label; fitted on the training rows only
le = LabelEncoder()
y_train = le.fit_transform(data['Class Label'].iloc[:-1])


# In[8]:

from sklearn import tree

clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(X_Train, y_train)


# In[9]:

# Predict the test data, not seen earlier, and map it back to a class name
# (only displayed when run as a notebook cell)
le.inverse_transform(clf.predict(X_Test))


# In[10]:

# prediction with the same training set
Train_predict = clf.predict(X_Train)


# In[11]:

# Check whether every training sample was predicted correctly
# (only displayed when run as a notebook cell)
(Train_predict == y_train).all()


# In[12]:

# Metrics related to the DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Single-argument print(...) calls behave the same on Python 2 and Python 3;
# the original print statements were a SyntaxError on Python 3.
print('Accuracy is: %s' % accuracy_score(y_train, Train_predict))
print(classification_report(y_train, Train_predict))


# In[13]:

# Disabled cell kept as a no-op string literal: renders the fitted tree to a
# PNG via graphviz. NOTE: it imports the Python 2-only `StringIO` module and
# would need porting before being re-enabled on Python 3.
'''
import pydot
import pyparsing
import StringIO

dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('data/vertebrate/tree.png')

from IPython.core.display import Image
Image(filename='data/vertebrate/tree.png')
'''


# In[ ]: