import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
from sklearn import neighbors
from random import uniform as random_uniform

df = pd.read_csv('fruit.csv')
fruitnames = {1: 'Orange', 2: 'Pear', 3: 'Apple'}
colors = {1: '#e09028', 2: '#55aa33', 3: '#cc3333'}

# Normalize sweetness and acidity to z-scores (zero mean, unit variance)
df['sweetness_normal'] = (df.sweetness - df.sweetness.mean()) / df.sweetness.std()
df['acidity_normal'] = (df.acidity - df.acidity.mean()) / df.acidity.std()

# Build numpy arrays for the classifier
X = []
y = []
for i, row in df.iterrows():
    X.append([row.sweetness_normal, row.acidity_normal])
    y.append(row.fruit_id)
X = np.array(X)
y = np.array(y)

print(X[:10, :])  # first 10 feature rows: normalized sweetness vs. acidity
print(" ... etc.")
print(y)  # fruit labels, 1 to 3

# Adapted from http://scikit-learn.org/0.11/auto_examples/neighbors/plot_classification.html
h = 0.02  # step size in the mesh

# Create color maps: light shades for decision regions, bold for data points
cmap_light = ListedColormap(['#ffde9e', '#b8ea9d', '#ffbaba'])
cmap_bold = ListedColormap(['#ff8c28', '#11bb11', '#ff0000'])

for k in [1, 3, 5, 7, 9, 11, 99, 131, 177]:
    # Create an instance of the neighbors classifier and fit the data
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(X, y)

    # Plot the decision boundary. For that, we assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot the data points on top of the decision regions
    plt.scatter(X[:, 0], X[:, 1], s=40, c=y, cmap=cmap_bold)
    plt.xlim(-2.2, 2.2)  # hardcoded to match the normalized data range
    plt.ylim(-2, 3.4)
    plt.title("K-nearest neighbors of entire dataset (k = %i)" % k)
    plt.show()

# Divide 70:30 train:test
def kNN_test(k=15, reps=1, prop_test=0.3, plot=True):
    """Run k-nearest neighbors on df with the given value of k, repeated
    'reps' times.

    prop_test: the proportion of instances to sequester for the test set.
        All the rest are the training set.
    plot: if True, outputs charts. If False, returns a tuple of
        (training set error rate, test set error rate).
    Only the test set is shown on the chart, with misclassified instances
    drawn larger."""
    assert 0 < prop_test < 1  # prop_test must be a valid proportion
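    # ----------------------------------------------------------------
    # The body below is a hedged sketch reconstructing what the
    # docstring describes; it is not the original implementation.
    # It assumes the globals X, y, and cmap_bold defined above, and the
    # helper names train_idx/test_idx are illustrative.
    # ----------------------------------------------------------------
    train_errs, test_errs = [], []
    for _ in range(reps):
        # Randomly sequester prop_test of the instances as the test set
        idx = np.random.permutation(len(X))
        n_test = int(round(prop_test * len(X)))
        test_idx, train_idx = idx[:n_test], idx[n_test:]
        clf = neighbors.KNeighborsClassifier(n_neighbors=k)
        clf.fit(X[train_idx], y[train_idx])
        # Error rate = 1 - accuracy on each subset
        train_errs.append(1 - clf.score(X[train_idx], y[train_idx]))
        test_errs.append(1 - clf.score(X[test_idx], y[test_idx]))
        if plot:
            # Show only the test set; draw misclassified instances larger
            pred = clf.predict(X[test_idx])
            sizes = np.where(pred == y[test_idx], 40, 140)
            plt.figure()
            plt.scatter(X[test_idx, 0], X[test_idx, 1], s=sizes,
                        c=y[test_idx], cmap=cmap_bold)
            plt.title("k-NN test set predictions (k = %i)" % k)
            plt.show()
    if not plot:
        return (np.mean(train_errs), np.mean(test_errs))

# Example usage (illustrative): compare average train/test error rates
# over several random splits for a few values of k.
for k in [1, 5, 15]:
    train_err, test_err = kNN_test(k=k, reps=10, plot=False)
    print("k = %3i  train error: %.3f  test error: %.3f"
          % (k, train_err, test_err))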