"""KNN classification walkthrough on the iris dataset.

Loads the iris data from the UCI repository, trains KNeighborsClassifier
models, demonstrates train/test splitting, compares training vs. testing
error for K = 1..50, and makes an out-of-sample prediction.

Updated from Python 2 / pre-0.20 scikit-learn:
  * print statements -> print() function
  * sklearn.cross_validation -> sklearn.model_selection (removed in 0.20)
  * single-sample predict input wrapped to 2-D
  * "%matplotlib inline" IPython magic removed (invalid in a .py file)
  * positional row access via .iloc (train_test_split now preserves pandas objects)
"""

import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
# sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

plt.style.use('ggplot')

# read the iris data into a DataFrame
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
iris = pd.read_csv(url, header=None, names=col_names)

# map each iris species to a number
iris['species_num'] = iris.species.map(
    {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})

# store feature matrix in "X" and response vector in "y"
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = iris[feature_cols]
y = iris.species_num

# instantiate the model and train it on the entire dataset
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X, y)

# predict the response values for the observations in X ("test the model")
y_pred = knn.predict(X)

# compute classification accuracy
print(metrics.accuracy_score(y, y_pred))

# K=1 memorizes the training data, so training accuracy is (misleadingly) perfect
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
y_pred = knn.predict(X)
print(metrics.accuracy_score(y, y_pred))


def min_max(nums):
    """Return [smallest, largest] of the non-empty sequence *nums*."""
    return [min(nums), max(nums)]


min_and_max = min_max([1, 2, 3])
print(min_and_max)
print(type(min_and_max))

# the two-element return value can be unpacked directly
the_min, the_max = min_max([1, 2, 3])
print(the_min)
print(type(the_min))
print(the_max)
print(type(the_max))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

# shapes before and after splitting
print(X.shape)
print(X_train.shape)
print(X_test.shape)
print(y.shape)
print(y_train.shape)
print(y_test.shape)

# WITHOUT a random_state parameter: a different split on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# print the first element of each object
# NOTE: modern train_test_split preserves pandas objects, so positional
# access needs .iloc; plain [0] would be a column/label lookup (KeyError)
print(X_train.iloc[0])
print(X_test.iloc[0])
print(y_train.iloc[0])
print(y_test.iloc[0])

# WITH a random_state parameter: the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
print(X_train.iloc[0])
print(X_test.iloc[0])
print(y_train.iloc[0])
print(y_test.iloc[0])

# STEP 1: split X and y into training and testing sets (random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

# STEP 2: train the model on the training set (using K=1)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

# STEP 3: test the model on the testing set, and check the accuracy
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

# repeat with K=50 for comparison
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

# calculate TRAINING ERROR and TESTING ERROR for K=1 through 50
k_range = range(1, 51)
training_error = []
testing_error = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # training error: fit and score on the full dataset
    knn.fit(X, y)
    y_pred = knn.predict(X)
    training_error.append(1 - metrics.accuracy_score(y, y_pred))
    # testing error: fit on the training split, score on the held-out split
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    testing_error.append(1 - metrics.accuracy_score(y_test, y_pred))

# plot the relationship between K (HIGH TO LOW) and TESTING ERROR
plt.plot(k_range, testing_error)
plt.gca().invert_xaxis()
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Error')

# create a DataFrame of K, training error, and testing error
df = pd.DataFrame(
    {'K': k_range, 'train': training_error, 'test': testing_error}
).set_index('K').sort_index(ascending=False)
df.head()

# plot the relationship between K (HIGH TO LOW) and both TRAINING and TESTING ERROR
df.plot()

# instantiate the model with the best known parameters
knn = KNeighborsClassifier(n_neighbors=11)
# re-train with all of X and y (not X_train and y_train): once K is chosen,
# the final model should learn from every labeled observation available
knn.fit(X, y)

# make a prediction for an out-of-sample observation
# NOTE: predict expects a 2-D (n_samples, n_features) array, so the single
# sample must be wrapped in an outer list
knn.predict([[3, 5, 4, 2]])

# try different values for random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))