# Walk-through: load the iris dataset, visualize it, and classify new
# observations with K-nearest neighbors (scikit-learn).
#
# Originally notebook cells; bare expressions such as ``iris.head()`` only
# display output when they are the last statement of a notebook cell.
# ``print(x)`` with a single argument behaves the same on Python 2 and 3.

import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier

# read the iris data into a DataFrame
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
iris = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    names=col_names,
)
iris.head()

# allow plots to appear in the notebook
# (``%matplotlib inline`` is an IPython magic, not Python syntax; call it via
# the IPython API and skip it when running as a plain script)
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # not running under IPython

# create a custom colormap
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# map each iris species to a number
iris['species_num'] = iris.species.map(
    {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
)

# create a scatter plot of PETAL LENGTH versus PETAL WIDTH and color by SPECIES
iris.plot(kind='scatter', x='petal_length', y='petal_width', c='species_num', colormap=cmap_bold)

# create a scatter plot of SEPAL LENGTH versus SEPAL WIDTH and color by SPECIES
iris.plot(kind='scatter', x='sepal_length', y='sepal_width', c='species_num', colormap=cmap_bold)

iris.head()

# store feature matrix in "X"
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = iris[feature_cols]

# alternative ways to create "X" (each rebinds X to the same four columns)
X = iris.drop(['species', 'species_num'], axis=1)
X = iris.loc[:, 'sepal_length':'petal_width']
X = iris.iloc[:, 0:4]

# store response vector in "y"
y = iris.species_num

# check X's type
print(type(X))
print(type(X.values))

# check y's type
print(type(y))
print(type(y.values))

# check X's shape (n = number of observations, p = number of features)
print(X.shape)

# check y's shape (single dimension with length n)
print(y.shape)

# instantiate the model (using the value K=1)
knn = KNeighborsClassifier(n_neighbors=1)
print(knn)

# fit the model with data
knn.fit(X, y)

# predict the response for a single new observation
# (scikit-learn expects a 2-D array of shape (n_queries, n_features), so the
# single sample is wrapped in an outer list)
knn.predict([[3, 5, 4, 2]])

# predict the response for several new observations at once
X_new = [[3, 5, 4, 2], [5, 4, 3, 2]]
knn.predict(X_new)

# instantiate the model (using the value K=5)
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model with data
knn.fit(X, y)

# predict the response for new observations
knn.predict(X_new)

# calculate predicted probabilities of class membership
knn.predict_proba(X_new)

# print distances to nearest neighbors (and their identities)
# (single sample wrapped in an outer list: input must be 2-D)
knn.kneighbors([[3, 5, 4, 2]])