"""Scikit-learn walkthrough: two-moons classification and insult detection.

Part 1 generates the synthetic "two moons" dataset, splits it into train and
test sets, and fits a k-nearest-neighbours classifier and a linear SVM,
plotting the predictions of each.

Part 2 loads the comment data from ``kaggle_insult/``, turns the comments
into bag-of-words count vectors and fits the same linear SVM on them.

Values that the original notebook displayed as bare cell expressions are
printed explicitly so they remain visible when this runs as a script.  Call
``plt.show()`` (or save the figures) to view the plots outside a notebook.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons
from sklearn.feature_extraction.text import CountVectorizer
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists in current releases.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# ---------------------------------------------------------------------------
# Part 1: synthetic two-moons data
# ---------------------------------------------------------------------------
X, y = make_moons(noise=0.2, n_samples=200)
print(X.shape)
print(y.shape)
print(y)

# pyplot has no ``figsize`` function; the size is an argument of ``figure``
# and has to be set before anything is drawn on that figure.
plt.figure(figsize=(14, 10))
plt.scatter(X[:, 0], X[:, 1], c=y, s=100)

# Hold out 40% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
print(X_train.shape)
print(X_test.shape)

# --- k-nearest neighbours ---------------------------------------------------
knn = KNeighborsClassifier()
print(knn)
knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)

# Training points are drawn opaque with their true labels; test points are
# drawn translucent and coloured by the predicted label.
plt.figure(figsize=(14, 10))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
# Disabled in the original:
# plt.scatter(X_test[:, 0], X_test[:, 1], c='w', alpha=0.3, s=100)

print(y_predict)
print(y_test)
print(knn.score(X_test, y_test))

# --- linear SVM -------------------------------------------------------------
svm = LinearSVC()
svm.fit(X_train, y_train)
y_predict = svm.predict(X_test)

plt.figure(figsize=(14, 10))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)

# The decision boundary satisfies w[0]*x + w[1]*y + b = 0, i.e.
# y = -(w[0] / w[1]) * x - b / w[1]; draw it over the x-range of the data.
w = svm.coef_.ravel()
a = -w[0] / w[1]
b = svm.intercept_[0]
xx = np.linspace(-1, 2)
yy = a * xx - b / w[1]
plt.plot(xx, yy)

# ---------------------------------------------------------------------------
# Part 2: insult detection on text comments
# ---------------------------------------------------------------------------
train_data = pd.read_csv("kaggle_insult/train.csv")
test_data = pd.read_csv("kaggle_insult/test_with_solutions.csv")

y_train = np.array(train_data.Insult)
comments_train = np.array(train_data.Comment)
print(comments_train.shape)
print(y_train.shape)
print((comments_train[0], y_train[0]))
print((comments_train[5], y_train[5]))

# Learn the vocabulary on the training comments only, then reuse it to
# encode the test comments so both matrices share the same columns.
cv = CountVectorizer()
cv.fit(comments_train)
X_train = cv.transform(comments_train)

# Refit the LinearSVC instance from part 1 on the text features.
svm.fit(X_train, y_train)

comments_test = np.array(test_data.Comment)
y_test = np.array(test_data.Insult)
X_test = cv.transform(comments_test)
print(svm.score(X_test, y_test))

# Inspect one test comment: its text, true label and predicted label.
index = 8
print((comments_test[index], y_test[index],
       svm.predict(X_test.tocsr()[index])[0]))