import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_moons
# Generate a 2-class "two moons" toy dataset: 200 points in 2-D with
# Gaussian noise (std 0.2) added to each sample.
X, y = make_moons(noise=0.2, n_samples=200)
print(X.shape)  # (200, 2) -> 200 samples, 2 features
print(y.shape)  # (200,)   -> one 0/1 label per sample
print(y)

# Plot the raw data, coloured by class label.
# NOTE: plt.figsize() does not exist in matplotlib.pyplot (it was an old
# IPython pylab helper); the size must be given when the figure is created.
plt.figure(figsize=(14, 10))
plt.scatter(X[:, 0], X[:, 1], c=y, s=100)
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 40% of the data for testing (120 train / 80 test samples).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
print(X_train.shape)  # (120, 2)
print(X_test.shape)   # (80, 2)
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with the default k=5 neighbours.
knn = KNeighborsClassifier()
print(knn)
# fit() memorises the training set (KNN is a lazy learner — the real
# work happens at predict time).
knn.fit(X_train, y_train)
# Predict labels for the held-out points.
y_predict = knn.predict(X_test)

# Training points in full colour; test points (coloured by the *predicted*
# label) drawn semi-transparent on top so mistakes stand out.
# NOTE: plt.figsize() does not exist; size the figure at creation instead.
plt.figure(figsize=(14, 10))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
print(y_predict)
print(y_test)
# Mean accuracy on the held-out set (~0.94 in the recorded run;
# the exact value depends on the random train/test split).
print(knn.score(X_test, y_test))
from sklearn.svm import LinearSVC

# Linear support-vector classifier with default regularisation (C=1.0).
# A straight line cannot separate the moons perfectly, so this is a
# deliberately weaker baseline than KNN on this dataset.
svm = LinearSVC()
svm.fit(X_train, y_train)
y_predict = svm.predict(X_test)

# NOTE: plt.figsize() does not exist; size the figure at creation instead.
plt.figure(figsize=(14, 10))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)

# Draw the learned decision boundary w0*x + w1*y + b = 0,
# rewritten as y = -(w0/w1)*x - b/w1.
w = svm.coef_.ravel()
slope = -w[0] / w[1]
xx = np.linspace(-1, 2)
yy = slope * xx - svm.intercept_ / w[1]
plt.plot(xx, yy)
import pandas as pd

# Kaggle "Detecting Insults in Social Commentary" data: each row is a
# raw comment string plus a 0/1 Insult label.
train_data = pd.read_csv("kaggle_insult/train.csv")
test_data = pd.read_csv("kaggle_insult/test_with_solutions.csv")

y_train = np.array(train_data.Insult)
comments_train = np.array(train_data.Comment)
print(comments_train.shape)  # (3947,)
print(y_train.shape)         # (3947,)

# Peek at one insulting (label 1) and one innocuous (label 0) comment.
print(comments_train[0], y_train[0])
print(comments_train[5], y_train[5])
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding: learn the vocabulary from the training comments
# and map each comment to a sparse word-count vector in one pass.
cv = CountVectorizer()
X_train = cv.fit_transform(comments_train)  # equivalent to fit() then transform()

# Re-train the linear SVM on the text features.
svm.fit(X_train, y_train)
comments_test = np.array(test_data.Comment)
y_test = np.array(test_data.Insult)

# Re-use the vocabulary learned on the training set; words never seen
# during training are silently dropped by transform().
X_test = cv.transform(comments_test)

# Mean accuracy on the labelled test set (~0.82 in the recorded run).
print(svm.score(X_test, y_test))
# Inspect a single example: the test comment, its true label, and the
# model's prediction for it (tocsr() allows row indexing on the sparse matrix).
index = 8
print(comments_test[index], y_test[index], svm.predict(X_test.tocsr()[index])[0])