"""Evaluate a k-nearest-neighbours classifier on the dating data set.

The script loads ``datingTestSet2.txt``, min-max scales the feature columns,
and plots the running average accuracy over repeated random train/test
splits.
"""
# The star import is kept for backward compatibility.  It stays first so that
# none of the names imported below can be shadowed by it (numpy's star import
# exposes names such as ``random``, which is why the stdlib module is aliased).
from numpy import *  # noqa: F401,F403

import operator
import random as rnd
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (not used below; kept as in the original)

sns.set(context='poster', style='dark')
sns.set_context(rc={'lines.markeredgewidth': 0.5})

dating_df = pd.read_csv('datingTestSet2.txt', sep='\t', header=None)
dating_df.columns = ['miles', 'games', 'ice_cream', 'opinion']

# Min-max scale every feature column to [0, 1] so that no single feature
# dominates the Euclidean distance.  The label column is copied unchanged.
# NOTE(review): a constant column (max == min) would produce NaN here.
labels = dating_df.columns[:-1]
_features = dating_df[labels]
norm_df = (_features - _features.min()) / (_features.max() - _features.min())
norm_df['opinion'] = dating_df.opinion
norm_df.head()  # no visible effect outside an interactive session


def find_kNN(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: 1-D feature vector to classify.
        dataSet: 2-D array with one training sample per row.
        labels: Sequence of class labels, aligned with the rows of
            ``dataSet``.
        k: Number of neighbours that vote.  Values larger than the number of
            training samples use every sample.

    Returns:
        A tuple ``(winner, votes)`` where ``winner`` is the label with the
        most votes and ``votes`` is a list of ``(label, count)`` pairs sorted
        by descending count.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {0!r}".format(k))

    # Euclidean distance from inX to every row; broadcasting replaces the
    # explicit tiling of inX.
    diff = np.asarray(inX) - dataSet
    distances = np.sqrt((diff ** 2).sum(axis=1))
    nearest = distances.argsort()[:k]

    class_count = {}
    for idx in nearest:
        vote = labels[idx]
        class_count[vote] = class_count.get(vote, 0) + 1

    ranked = sorted(class_count.items(), key=operator.itemgetter(1),
                    reverse=True)
    return ranked[0][0], ranked


def make_sets(data_df, test_portion):
    """Randomly split ``data_df`` into a training and a test frame.

    Args:
        data_df: DataFrame to split.
        test_portion: Fraction of the rows (0..1) to place in the test set;
            the row count is truncated to an integer.

    Returns:
        A tuple ``(train_df, test_df)``.  Rows are selected by position, so
        the split does not depend on the frame's index labels.
    """
    n_rows = len(data_df)
    test_ix = sorted(rnd.sample(range(n_rows), int(test_portion * n_rows)))
    held_out = set(test_ix)
    train_ix = [i for i in range(n_rows) if i not in held_out]
    # ``.ix`` was removed from pandas; ``.iloc`` is the positional equivalent.
    return data_df.iloc[train_ix], data_df.iloc[test_ix]


def test_model(data_df, test_portion, kNN):
    """Run one random train/test split and count correct predictions.

    Every column except the last is used as a feature; the ``opinion``
    column supplies the class labels.

    Args:
        data_df: DataFrame holding the features and an ``opinion`` column.
        test_portion: Fraction of rows held out for testing.
        kNN: Number of neighbours passed to ``find_kNN``.

    Returns:
        A tuple ``(pass_counter, fail_counter)`` with the number of correct
        and incorrect predictions on the test rows.
    """
    train_df, test_df = make_sets(data_df, test_portion)
    feature_cols = train_df.columns[:-1]
    train_array = train_df[feature_cols].values
    test_array = test_df[feature_cols].values
    train_labels = train_df.opinion.values
    test_labels = test_df.opinion.values

    pass_counter = 0
    for sample, expected in zip(test_array, test_labels):
        predicted, _votes = find_kNN(sample, train_array, train_labels, kNN)
        if predicted == expected:
            pass_counter += 1
    return pass_counter, len(test_array) - pass_counter


def plot_model_test(data_df, test_proportion, kNN, repeats):
    """Plot the running mean accuracy over ``repeats`` random splits.

    Args:
        data_df: DataFrame passed on to ``test_model``.
        test_proportion: Fraction of rows held out for testing in each trial.
        kNN: Number of neighbours used by the classifier.
        repeats: Number of independent trials to run.
    """
    pass_hist = []
    pass_avg = []
    for _ in range(repeats):
        # Use the frame that was passed in rather than the module-level one.
        passes, fails = test_model(data_df, test_proportion, kNN)
        # float() keeps this a true division on Python 2 as well.
        pass_hist.append(passes / float(passes + fails))
        pass_avg.append(np.mean(pass_hist))

    plt.plot(np.arange(1, len(pass_avg) + 1), pass_avg,
             label='average (kNN={knn}, t_p={t_p})'.format(
                 knn=kNN, t_p=test_proportion))
    plt.xlabel('trial number')
    plt.ylabel('accuracy')
    plt.legend(loc='best')


repeats = 10
t_prop = 0.25
kNN = [3]

t1 = time.time()
for _k in kNN:
    plot_model_test(norm_df, t_prop, _k, repeats)
t2 = time.time()
print("time elapsed:", t2-t1, "sec")
# NOTE(review): when run as a plain script (no inline backend), call
# plt.show() here to display the figure.