import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_moons
# Generate a 2-class "two moons" toy dataset: 200 points in 2-D with
# Gaussian noise (std 0.2) added to each sample.
X, y = make_moons(noise=0.2, n_samples=200)
print(X.shape)  # (200, 2) -> 200 samples, 2 features
print(y.shape)  # (200,)   -> one 0/1 label per sample
print(y)

# Plot the raw data, coloured by class label.
# NOTE: plt.figsize() does not exist in matplotlib.pyplot (it was an old
# IPython pylab helper); the size must be given when the figure is created.
plt.figure(figsize=(14, 10))
plt.scatter(X[:, 0], X[:, 1], c=y, s=100)
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 40% of the data for testing (120 train / 80 test samples).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
print(X_train.shape)  # (120, 2)
print(X_test.shape)   # (80, 2)
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with the default k=5 neighbours.
knn = KNeighborsClassifier()
print(knn)
# fit() memorises the training set (KNN is a lazy learner — the real
# work happens at predict time).
knn.fit(X_train, y_train)
# Predict labels for the held-out points.
y_predict = knn.predict(X_test)

# Training points in full colour; test points (coloured by the *predicted*
# label) drawn semi-transparent on top so mistakes stand out.
# NOTE: plt.figsize() does not exist; size the figure at creation instead.
plt.figure(figsize=(14, 10))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
print(y_predict)
print(y_test)
# Mean accuracy on the held-out set (~0.94 in the recorded run;
# the exact value depends on the random train/test split).
print(knn.score(X_test, y_test))
from sklearn.svm import LinearSVC

# Linear support-vector classifier with default regularisation (C=1.0).
# A straight line cannot separate the moons perfectly, so this is a
# deliberately weaker baseline than KNN on this dataset.
svm = LinearSVC()
svm.fit(X_train, y_train)
y_predict = svm.predict(X_test)

# NOTE: plt.figsize() does not exist; size the figure at creation instead.
plt.figure(figsize=(14, 10))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)

# Draw the learned decision boundary w0*x + w1*y + b = 0,
# rewritten as y = -(w0/w1)*x - b/w1.
w = svm.coef_.ravel()
slope = -w[0] / w[1]
xx = np.linspace(-1, 2)
yy = slope * xx - svm.intercept_ / w[1]
plt.plot(xx, yy)
import pandas as pd

# Kaggle "Detecting Insults in Social Commentary" data: each row is a
# raw comment string plus a 0/1 Insult label.
train_data = pd.read_csv("kaggle_insult/train.csv")
test_data = pd.read_csv("kaggle_insult/test_with_solutions.csv")

y_train = np.array(train_data.Insult)
comments_train = np.array(train_data.Comment)
print(comments_train.shape)  # (3947,)
print(y_train.shape)         # (3947,)

# Peek at one insulting (label 1) and one innocuous (label 0) comment.
print(comments_train[0], y_train[0])
print(comments_train[5], y_train[5])
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding: learn the vocabulary from the training comments
# and map each comment to a sparse word-count vector in one pass.
cv = CountVectorizer()
X_train = cv.fit_transform(comments_train)  # equivalent to fit() then transform()

# Re-train the linear SVM on the text features.
svm.fit(X_train, y_train)
comments_test = np.array(test_data.Comment)
y_test = np.array(test_data.Insult)

# Re-use the vocabulary learned on the training set; words never seen
# during training are silently dropped by transform().
X_test = cv.transform(comments_test)

# Mean accuracy on the labelled test set (~0.82 in the recorded run).
print(svm.score(X_test, y_test))
# Inspect a single example: the test comment, its true label, and the
# model's prediction for it (tocsr() allows row indexing on the sparse matrix).
index = 8
print(comments_test[index], y_test[index], svm.predict(X_test.tocsr()[index])[0])