"""Train decision trees on the fruit data set and render them with Graphviz.

The script reads ``fruit.csv``, fits one classifier to report a confusion
matrix on a random hold-out set, and then three more times draws a fresh
random split, fits a tree, relabels the exported Graphviz source with feature
and fruit names, and renders it to ``simpletree.png``.
"""
import re
import subprocess
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython import get_ipython
from IPython.display import Image, display
from sklearn import tree

# Equivalent of the ``%matplotlib inline`` notebook magic, written as plain
# Python so the file also parses (and is skipped cleanly) outside IPython.
_ipython = get_ipython()
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')


def split_train_test(frame, train_fraction=.75):
    """Randomly split *frame* into training and testing rows.

    A boolean ``is_train`` column is (re)written on *frame* in place; each row
    lands in the training set with probability *train_fraction*.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    frame['is_train'] = np.random.uniform(0, 1, len(frame)) <= train_fraction
    return frame[frame['is_train']], frame[~frame['is_train']]


def label_dot_source(dot_source, features, class_names):
    """Make exported Graphviz tree source human readable.

    Three rewrites are applied, in this order:

    1. ``gini = 0.<digits>\\n`` fragments are removed.
    2. ``X[i]`` is replaced by ``features[i]``.
    3. Every per-class count array with ``len(class_names)`` entries, written
       like ``[ 10.  0.  5.]``, is replaced by the name of the class with the
       highest count. On a tie the class with the highest index wins.

    NOTE(review): the count-array pattern targets the ``[ 10.  0.  5.]``
    layout; other scikit-learn versions may format the array differently, in
    which case the arrays are left untouched -- confirm against the installed
    version.

    Args:
        dot_source: Graphviz source produced by ``tree.export_graphviz``.
        features: Feature names, indexed like the training columns.
        class_names: Class names, indexed like the classifier's class codes.

    Returns:
        The rewritten Graphviz source.
    """
    dot_source = re.sub(r'gini = 0\.[0-9]+\\n', '', dot_source)

    # Replace feature numbers with feature names.
    for index, feature in enumerate(features):
        dot_source = dot_source.replace('X[{}]'.format(index), feature)

    if not class_names:
        return dot_source

    # Replace lists of numeric label assignments with the label name. One
    # capture group per class, so the array length must match exactly.
    counts_re = re.compile(
        r'\[' + r'[ ]+(\d+)\.' * len(class_names) + r'\]')

    def majority_label(match):
        counts = [int(group) for group in match.groups()]
        # Sorting on (count, index) makes the last maximum win on ties.
        winner = max(range(len(counts)), key=lambda i: (counts[i], i))
        return class_names[winner]

    return counts_re.sub(majority_label, dot_source)


def export_labelled_dot(classifier, features, class_names):
    """Export *classifier* as Graphviz source and relabel it.

    Returns:
        The Graphviz source after ``label_dot_source`` has been applied.
    """
    dot_data = StringIO()
    tree.export_graphviz(classifier, out_file=dot_data)
    return label_dot_source(dot_data.getvalue(), features, class_names)


def render_png(dot_source, dot_path='simple.dotfile',
               png_path='simpletree.png'):
    """Write *dot_source* to *dot_path* and render it to *png_path*.

    Normally this would be done with libraries like pydot or networkx, but
    the Graphviz ``dot`` executable is called directly instead. It must be on
    the PATH.

    Raises:
        OSError: if the ``dot`` executable cannot be started.
        subprocess.CalledProcessError: if ``dot`` exits with an error.
    """
    with open(dot_path, 'w') as dot_file:
        dot_file.write(dot_source)
    # Argument list instead of a shell string: no quoting pitfalls, and a
    # failing render raises instead of leaving a stale image behind.
    subprocess.check_call(['dot', '-Tpng', dot_path, '-o', png_path])


df = pd.read_csv('fruit.csv')
fruitnames = {1: 'Orange', 2: 'Pear', 3: 'Apple'}
colors = {1: '#e09028', 2: '#55aa33', 3: '#cc3333'}
fruitlist = ['Orange', 'Pear', 'Apple']
features = ['color_id', 'elongatedness', 'weight', 'sweetness', 'acidity']

# This is important because the factorizer assigns numbers based on the order
# in which each label is first encountered, e.g. if the first instance had
# fruit = 3, its y value would be 0. (DataFrame.sort was removed from pandas;
# sort_values is its replacement.)
df.sort_values('fruit_id', inplace=True)

# Hold-out evaluation: fit on a random training split, predict the rest.
train, test = split_train_test(df)
y, class_ids = pd.factorize(train['fruit_id'])
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train[features], y)
preds = clf.predict(test[features])
test_result = pd.crosstab(
    np.array([fruitnames[x] for x in test['fruit_id']]),
    # Map predicted codes back through the factorizer's own uniques rather
    # than assuming that code + 1 equals the fruit id.
    np.array([fruitnames[class_ids[code]] for code in preds]),
    rownames=['actual'],
    colnames=['predicted'])
display(test_result)

# Repetitions 1-3: each run draws a new random split, so the fitted tree (and
# therefore the rendered image) can differ from run to run.
for _ in range(3):
    train, test = split_train_test(df)
    y, class_ids = pd.factorize(train['fruit_id'])
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train[features], y)
    tree_string = export_labelled_dot(
        clf, features, [fruitnames[class_id] for class_id in class_ids])
    render_png(tree_string)
    display(Image(filename='simpletree.png'))