# Iris classification walkthrough (cells exported from an IPython notebook).
#
# NOTE(review): the name `pylab` is used below without being imported in this
# file; this presumably relies on an IPython pylab session -- confirm.
# Bare expressions (e.g. `iris.target_names`) are notebook cells whose value
# is shown interactively; in a plain script they have no visible effect.

from sklearn import datasets

# --- Load and inspect the data set ------------------------------------------
datasets.load_iris()
iris = datasets.load_iris()
# Parenthesised form: same output under the Python 2 print statement, and
# also valid as a function call on Python 3.
print(iris.DESCR)
iris.target_names

from IPython.core.display import Image

Image(filename='Iris_setosa.jpg')  # from Wikipedia
Image(filename='Iris_versicolor.jpg')
Image(filename='Iris_virginica.jpg')
iris.feature_names
Image(filename="Petal-sepal.jpg")
iris.data[:3]
iris.target[:3]

# --- Plot two features, coloured by class -----------------------------------
# One plot colour per class label (labels are used as indices into this tuple).
colors = ('r', '#66ff66', '#4444ff')
X = iris.data
Y = iris.target

# Column indices of the two features drawn on the x and y axes.
i, j = 2, 3
pylab.figure(figsize=(10, 10))
pylab.xlabel(iris.feature_names[i], fontsize=20)
pylab.ylabel(iris.feature_names[j], fontsize=20)
pylab.scatter(X[:, i], X[:, j], c=[colors[y] for y in Y], s=50)

# --- Per-class feature means -------------------------------------------------
# Boolean-mask selection instead of `for i, x in enumerate(...)`: the old
# comprehension rebound `i` (list comprehensions leak their loop variable on
# Python 2), silently clobbering the plot-axis index chosen above.
# Assumes iris.data / iris.target are NumPy arrays -- TODO confirm.
X[Y == 0].mean(axis=0)
X[Y == 1].mean(axis=0)
X[Y == 2].mean(axis=0)

# --- k-nearest-neighbours classifier -----------------------------------------
from sklearn import neighbors

# NOTE(review): `warn_on_equidistant` looks like a keyword from an old
# scikit-learn release -- verify it is accepted by the installed version.
knc = neighbors.KNeighborsClassifier(n_neighbors=5, warn_on_equidistant=False)
step = 5
# Train on every `step`-th sample only; the remaining samples are unseen.
knc.fit(X[::step], Y[::step])

# A single query is passed as a 2-D (one row x n_features) list; a flat 1-D
# list is ambiguous between "one sample" and "many one-feature samples".
knc.predict([[5.93, 2.77, 4.23, 1.3]])
knc.predict_proba([[5.93, 2.77, 4.23, 1.3]])

# --- Visualise correct vs. wrong predictions ---------------------------------
Y_pred = knc.predict(X)

i, j = 2, 3
# Mask indexing keeps X_ok / X_bad two-dimensional even when one of them is
# empty, so the column slices below never fail on a perfect classifier.
is_correct = (Y == Y_pred)
X_ok = X[is_correct]
Y_ok = list(Y[is_correct])
X_bad = X[~is_correct]

pylab.figure(figsize=(10, 10))
pylab.xlabel(iris.feature_names[i], fontsize=20)
pylab.ylabel(iris.feature_names[j], fontsize=20)
# Correctly classified samples, coloured by their true class.
pylab.scatter(X_ok[:, i], X_ok[:, j], c=[colors[y] for y in Y_ok], s=50)
# Small white squares mark the samples that were used for training.
pylab.scatter(X[::step, i], X[::step, j], c="w", marker="s", s=15)
# Crosses mark the misclassified samples.
pylab.scatter(X_bad[:, i], X_bad[:, j], marker="x", s=50)

# --- Cross-validation ----------------------------------------------------------
# NOTE(review): `sklearn.cross_validation`, `KFold(n=..., k=...)` and
# `score_func=` match an old scikit-learn API; later releases appear to provide
# this under `sklearn.model_selection` -- verify against the installed version.
from sklearn import cross_validation

cross_val = cross_validation.KFold(n=len(X), k=10, shuffle=True)


def avg_miss(Y_true, Y_pred):
    """Return the fraction of positions where the two label arrays differ.

    Both arguments must support element-wise ``!=`` producing an object with
    ``.sum()`` (e.g. NumPy arrays); ``Y_true`` must be non-empty.
    """
    return (Y_true != Y_pred).sum() / float(len(Y_true))


cross_validation.cross_val_score(knc, X, Y, cv=cross_val, score_func=avg_miss)