from numpy import *
import operator
import seaborn as sns
sns.set(context='poster', style='dark')
sns.set_context(rc={'lines.markeredgewidth': 0.5})
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
# Load the tab-separated dating data. The file is read without a header row,
# so the four column names are assigned by hand.
dating_df = pd.read_csv('datingTestSet2.txt', sep='\t', header=None)
dating_df.columns = ['miles', 'games', 'ice_cream', 'opinion']

# Min-max scale every column except the last one:
#     scaled = (value - column_min) / (column_max - column_min)
# The arithmetic is applied to all feature columns at once; pandas aligns the
# per-column minima/maxima with the matching columns.
labels = dating_df.columns[:-1]
features = dating_df[labels]
column_min = features.min()
column_span = features.max() - column_min
norm_df = (features - column_min) / column_span

# The class column is carried over unscaled.
norm_df['opinion'] = dating_df.opinion
norm_df.head()
| | miles | games | ice_cream | opinion |
|---|---|---|---|---|
| 0 | 0.448325 | 0.398051 | 0.562334 | 3 |
| 1 | 0.158733 | 0.341955 | 0.987244 | 2 |
| 2 | 0.285429 | 0.068925 | 0.474496 | 1 |
| 3 | 0.823201 | 0.628480 | 0.252489 | 1 |
| 4 | 0.420102 | 0.079820 | 0.078578 | 1 |
def find_kNN(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, measured between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify; one value per column of ``dataSet``.
        dataSet: 2-D array-like holding one reference sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of rows is clamped, so every row votes once.

    Returns:
        A ``(winner, votes)`` tuple. ``winner`` is the label with the most
        votes; ``votes`` is a list of ``(label, count)`` pairs sorted by
        count, highest first. Labels with equal counts keep the order in
        which they were first met when walking outwards from ``inX``.

    Raises:
        IndexError: If no neighbour votes (``k < 1`` or ``dataSet`` has no
            rows).
    """
    from collections import Counter

    import numpy as np

    samples = np.asarray(dataSet, dtype=float)
    # Broadcasting subtracts inX from every row without building a tiled copy.
    offsets = samples - np.asarray(inX, dtype=float)
    distances = np.sqrt((offsets ** 2).sum(axis=1))

    # A stable sort resolves equidistant neighbours by row order, so repeated
    # calls on the same data always pick the same voters.  Slicing clamps k
    # to the number of rows; max() keeps a negative k from slicing backwards.
    nearest = distances.argsort(kind="stable")[:max(k, 0)]

    votes = Counter(labels[i] for i in nearest).most_common()
    return votes[0][0], votes
def make_sets(data_df, test_portion):
    """Randomly split a DataFrame into a training part and a test part.

    Args:
        data_df: DataFrame to split. Rows are selected by position, so the
            frame's index labels do not matter.
        test_portion: Fraction of the rows to place in the test set. The
            number of test rows is ``int(test_portion * len(data_df))``.

    Returns:
        A ``(train_df, test_df)`` tuple. Every row of ``data_df`` appears in
        exactly one of the two frames, and both keep the original row order.

    Raises:
        ValueError: Raised by ``random.sample`` when the requested number of
            test rows is negative or exceeds the number of rows.
    """
    import random as rnd

    n_rows = len(data_df)
    n_test = int(test_portion * n_rows)
    test_ix = sorted(rnd.sample(range(n_rows), n_test))

    held_out = set(test_ix)  # set gives O(1) membership checks below
    train_ix = [pos for pos in range(n_rows) if pos not in held_out]

    # .iloc selects by position; the former .ix indexer is gone from pandas.
    return data_df.iloc[train_ix], data_df.iloc[test_ix]
import time
def test_model(data_df, test_portion, kNN):
    """Run one random hold-out evaluation of the kNN classifier.

    The frame is split with ``make_sets``; each test row is then classified
    with ``find_kNN`` against the training rows and compared with its true
    label.

    Args:
        data_df: DataFrame whose last column is excluded from the features
            and which has an ``opinion`` column holding the class labels.
        test_portion: Fraction of rows held out for testing (forwarded to
            ``make_sets``).
        kNN: Number of neighbours that vote (forwarded to ``find_kNN``).

    Returns:
        A ``(pass_counter, fail_counter)`` tuple: how many test rows were
        classified correctly and incorrectly.
    """
    train_df, test_df = make_sets(data_df, test_portion)

    # Every column but the last is treated as a feature.
    feature_cols = train_df.columns[:-1]
    train_array = train_df[feature_cols].values
    test_array = test_df[feature_cols].values
    train_labels = train_df.opinion.values
    # Plain arrays are walked in step, so no index-based row lookup is needed.
    true_labels = test_df.opinion.values

    pass_counter = 0
    for sample, truth in zip(test_array, true_labels):
        predicted, _ = find_kNN(sample, train_array, train_labels, kNN)
        if predicted == truth:
            pass_counter += 1

    fail_counter = len(test_array) - pass_counter
    return pass_counter, fail_counter
def plot_model_test(data_df, test_proportion, kNN, repeats):
    """Plot the running mean accuracy over repeated hold-out trials.

    Each trial calls ``test_model`` on ``data_df``; after every trial the
    mean accuracy of all trials so far is recorded, and the resulting curve
    is drawn against the trial number.

    Args:
        data_df: DataFrame handed to ``test_model`` for every trial.
        test_proportion: Fraction of rows held out per trial.
        kNN: Number of neighbours used by the classifier.
        repeats: Number of trials to run.

    Raises:
        ZeroDivisionError: If a trial reports zero passes and zero fails.
    """
    pass_avg = []
    accuracy_sum = 0.0
    for trial in range(1, repeats + 1):
        passes, fails = test_model(data_df, test_proportion, kNN)
        accuracy_sum += passes / (passes + fails)
        # Running mean of the accuracies seen so far.
        pass_avg.append(accuracy_sum / trial)

    # NOTE(review): plot/xlabel/ylabel/legend are not defined in this part of
    # the file; presumably pylab-style names are already in the namespace --
    # confirm.
    plot(arange(1, len(pass_avg)+1), pass_avg,
         label='average (kNN={knn}, t_p={t_p})'.format(knn=kNN, t_p=test_proportion))
    xlabel('trial number')
    ylabel('accuracy')
    legend(loc='best')
# Experiment settings: trials per curve, held-out fraction, and the list of
# neighbour counts to evaluate (one curve is plotted per entry).
repeats = 10
t_prop = 0.25
kNN = [3]

# perf_counter is the clock intended for measuring elapsed intervals; unlike
# time.time it is not affected by system clock adjustments.
t1 = time.perf_counter()
for n_neighbours in kNN:
    plot_model_test(norm_df, t_prop, n_neighbours, repeats)
t2 = time.perf_counter()
print("time elapsed:", t2-t1, "sec")
time elapsed: 0.9517326354980469 sec