import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier

try:
    # Notebook-only setting, equivalent to the `%matplotlib inline` magic.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Plain Python interpreter: `get_ipython` does not exist, nothing to do.
    pass

fruitnames = {1: 'Orange', 2: 'Pear', 3: 'Apple'}
colors = {1: '#e09028', 2: '#55aa33', 3: '#cc3333'}
fruitlist = ['Orange', 'Pear', 'Apple']


def split_train_test(df, train_fraction=.75):
    """Randomly split *df* into a training and a testing frame.

    A boolean ``is_train`` column is (re)written on *df* itself; each row is
    independently assigned to the training set with probability
    *train_fraction*.

    Returns:
        A ``(train, test)`` tuple of DataFrames selected from *df*.
    """
    df['is_train'] = np.random.uniform(0, 1, len(df)) <= train_fraction
    in_training = df['is_train']
    return df[in_training], df[~in_training]


def fit_forest(train, features):
    """Fit a random forest on the *features* columns of *train*.

    The target is the factorized ``fruit_id`` column.

    Returns:
        ``(clf, y, uniques)`` where ``y`` holds the integer codes used as the
        training target and ``uniques[code]`` is the ``fruit_id`` each code
        stands for.
    """
    y, uniques = pd.factorize(train['fruit_id'])
    clf = RandomForestClassifier(n_jobs=2)
    clf = clf.fit(train[features], y)
    return clf, y, uniques


def confusion_table(test, preds, uniques):
    """Cross-tabulate actual against predicted fruit names.

    Args:
        test: frame with the true ``fruit_id`` of every tested row.
        preds: predicted integer codes, one per row of *test*.
        uniques: code -> ``fruit_id`` lookup returned by ``pd.factorize``.

    Returns:
        A square DataFrame (rows = actual, columns = predicted) that always
        contains every fruit name, so tables from different runs can be
        added element-wise.
    """
    actual = np.array([fruitnames[fruit_id] for fruit_id in test['fruit_id']])
    # Decode through `uniques` instead of assuming code == fruit_id - 1; that
    # assumption is wrong whenever the training split is missing a class or
    # the labels were not first seen in ascending order.
    predicted_ids = np.asarray(uniques)[preds]
    predicted = np.array([fruitnames[fruit_id] for fruit_id in predicted_ids])
    table = pd.crosstab(actual, predicted,
                        rownames=['actual'], colnames=['predicted'])
    # A class that never shows up in one run would otherwise be missing from
    # the table and turn into NaN when the runs are summed.
    labels = sorted(fruitnames.values())
    return table.reindex(index=labels, columns=labels, fill_value=0)


def run_experiment(df, features, reps, title_suffix):
    """Train and test a forest *reps* times and report pooled accuracy.

    Every repetition draws a fresh random train/test split of *df*, fits a
    new forest on *features* and adds the resulting confusion table to a
    running total. A summary (run count, features, accuracy, elapsed
    seconds) is printed.

    Returns:
        ``(final_result, accuracy)``: the summed confusion table and the
        fraction of test rows on its diagonal.

    Raises:
        ValueError: if *reps* is smaller than 1.
    """
    if reps < 1:
        raise ValueError('reps must be at least 1')

    start = time.time()
    final_result = None
    for _ in range(reps):
        train, test = split_train_test(df)
        clf, _, uniques = fit_forest(train, features)
        preds = clf.predict(test[features])
        test_result = confusion_table(test, preds, uniques)
        if final_result is None:
            final_result = test_result
        else:
            final_result = final_result + test_result

    confmatrix = np.array(final_result)
    total = confmatrix.sum()
    # Correct predictions sit on the diagonal of the confusion matrix.
    accuracy = np.trace(confmatrix) / total if total else float('nan')
    print('{} runs {}\nFeatures: {}\nAccuracy: {}\ntime: {} sec'.format(
        reps, title_suffix, features, accuracy, int(time.time() - start)))
    return final_result, accuracy


df = pd.read_csv('fruit.csv')

# Sorting keeps the factorized codes in ascending fruit_id order whenever all
# classes are present. Predictions are decoded through the factorizer's own
# lookup table, so correctness no longer depends on this ordering.
df.sort_values('fruit_id', inplace=True)

# --- Single run with all features -------------------------------------------
train, test = split_train_test(df)  # randomly assign training and testing set
features = ['color_id', 'elongatedness', 'weight', 'sweetness', 'acidity']
clf, y, uniques = fit_forest(train, features)
preds = clf.predict(test[features])
test_result = confusion_table(test, preds, uniques)
test_result  # displayed when this is the last expression of a notebook cell

for score, feature in zip(clf.feature_importances_, features):
    print(round(100*score, 1), feature)

# --- Repeated runs: all features --------------------------------------------
reps = 100
features = ['color_id', 'elongatedness', 'weight', 'sweetness', 'acidity']
title_suffix = 'with all features'
final_result, accuracy = run_experiment(df, features, reps, title_suffix)
final_result

# --- Repeated runs: two most important features ------------------------------
reps = 100
features = ['elongatedness', 'sweetness']
title_suffix = 'with only 2 most important features'
final_result, accuracy = run_experiment(df, features, reps, title_suffix)
final_result

# --- Repeated runs: two least important features -----------------------------
reps = 100
features = ['color_id', 'acidity']
title_suffix = 'with only 2 least important features'
final_result, accuracy = run_experiment(df, features, reps, title_suffix)
final_result