In [1]:
import matplotlib.pyplot as plt
import numpy as np
In [2]:
from sklearn.datasets import make_moons
# random_state pins the noise draw so the notebook reproduces the same
# dataset (and all downstream scores) under Restart & Run All.
X, y = make_moons(noise=0.2, n_samples=200, random_state=42)
In [30]:
X.shape  # (n_samples, n_features) — expect (200, 2)
Out[30]:
(200, 2)
In [31]:
y.shape  # one label per sample — expect (200,)
Out[31]:
(200,)
In [32]:
print(y)  # binary class labels (0/1); a bare `y` would show the array repr instead
[0 1 0 1 0 0 1 1 1 0 0 1 1 0 0 0 1 1 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0
 1 0 1 0 1 0 0 0 1 1 1 1 1 1 0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 0
 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0
 1 1 1 0 0 1 1 0 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 0 1 1 1
 1 0 1 1 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 0 0 1 0 0
 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0]
In [33]:
# plt.figsize does not exist in matplotlib's pyplot API (it was an old
# IPython pylab convenience); the size must be set when creating the
# figure, *before* drawing on it.
plt.figure(figsize=(14, 10))
plt.scatter(X[:, 0], X[:, 1], c=y, s=100)
In [34]:
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
In [35]:
X_train.shape  # 60% of the 200 samples — expect (120, 2)
Out[35]:
(120, 2)
In [36]:
X_test.shape  # remaining 40% — expect (80, 2)
Out[36]:
(80, 2)
In [50]:
from sklearn.neighbors import KNeighborsClassifier

# Default k-NN classifier (n_neighbors=5, uniform weights).
knn = KNeighborsClassifier()
# Bare last expression uses the notebook's rich repr; print() is redundant here.
knn
KNeighborsClassifier(algorithm=auto, leaf_size=30, n_neighbors=5, p=2,
           warn_on_equidistant=True, weights=uniform)
In [54]:
knn.fit(X_train, y_train)  # k-NN "fit" just stores the training set (lazy learner)
Out[54]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, n_neighbors=5, p=2,
           warn_on_equidistant=True, weights='uniform')
In [55]:
y_predict = knn.predict(X_test)  # majority vote among the 5 nearest training points
In [56]:
# plt.figsize(...) is not a matplotlib API; the size must be set when the
# figure is created, before any artists are drawn.
plt.figure(figsize=(14, 10))
# Training points solid, colored by true label.
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
# Test points semi-transparent, colored by the k-NN prediction.
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
In [57]:
y_predict  # predicted labels for the 80 held-out points
Out[57]:
array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0])
In [58]:
y_test  # ground-truth labels, for eyeballing against y_predict above
Out[58]:
array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0])
In [59]:
knn.score(X_test, y_test)  # mean accuracy on the held-out set
Out[59]:
0.9375
In [102]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): the variable is named `svm` but holds a LogisticRegression —
# misleading. Every later cell refers to `svm`, so the name is kept as-is here;
# renaming it (everywhere) to e.g. `logreg` would be the proper fix.
svm = LogisticRegression()
In [103]:
svm.fit(X_train, y_train)  # learns a linear decision boundary on the moons data
Out[103]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
In [104]:
y_predict = svm.predict(X_test)  # linear-model predictions for the test points
In [105]:
# plt.figsize(...) is not a pyplot function — use plt.figure(figsize=...).
plt.figure(figsize=(14, 10))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)

# Draw the learned boundary w0*x + w1*y + b = 0 as y = -(w0/w1)*x - b/w1.
w = svm.coef_.ravel()
a = -w[0] / w[1]
xx = np.linspace(-1, 2)
yy = a * xx - svm.intercept_ / w[1]
plt.plot(xx, yy)
# NOTE(review): the traceback recorded below shows X_train was a sparse
# coo_matrix when this cell last ran — hidden state from executing the text
# cells (In[110]) first. Under Restart & Run All this cell works.
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-105-3427b9cdffe3> in <module>()
      1 plt.figsize(14,10)
----> 2 plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
      3 plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
      4 w = svm.coef_.ravel()
      5 a = -w[0] / w[1]

TypeError: 'coo_matrix' object is not subscriptable
In [106]:
import pandas as pd
# Kaggle "Detecting Insults in Social Commentary" data; the relative path
# assumes kaggle_insults/ sits next to this notebook's parent directory.
train_data = pd.read_csv("../kaggle_insults/train.csv")
test_data = pd.read_csv("../kaggle_insults/test_with_solutions.csv")
In [107]:
# Target: 1 = insult, 0 = not an insult; features: the raw comment strings.
y_train = np.array(train_data.Insult)
comments_train = np.array(train_data.Comment)
print(comments_train.shape)
print(y_train.shape)
(3947,)
(3947,)
In [108]:
comments_train[8], y_train[8]  # example comment together with its insult label
Out[108]:
('"Either you are fake or extremely stupid...maybe both..."', 1)
In [109]:
# BUG FIX: the comment and its label must use the same index — the original
# paired comments_train[10] with y_train[5], showing an unrelated label.
comments_train[10], y_train[10]
Out[109]:
('"@jdstorm dont wish him injury but it happened on its OWN and i DOUBT he\'s injured, he looked embarrassed to me"',
 0)
In [110]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding; min_df=1 keeps every token that appears at least once.
cv = CountVectorizer(min_df=1)
# fit_transform builds the vocabulary and encodes the corpus in one pass,
# instead of the separate fit() + transform() the original did.
X_train = cv.fit_transform(comments_train)
In [111]:
svm.fit(X_train, y_train)  # logistic regression on the sparse bag-of-words counts
Out[111]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
In [112]:
comments_test = np.array(test_data.Comment)
y_test = np.array(test_data.Insult)
# transform only — the vocabulary must come from the training data.
X_test = cv.transform(comments_test)
svm.score(X_test, y_test)  # accuracy on the held-out comments
Out[112]:
0.84548545523233853
In [113]:
# Inspect one test example: comment text, true label, predicted label.
# tocsr() because row indexing needs CSR, not the COO format transform returns.
index = 8
comments_test[index], y_test[index], svm.predict(X_test.tocsr()[index])[0]
Out[113]:
('"To engage in an intelligent debate with you is like debating to a retarded person.  It\'s useless.  It looks like you\'re bent on disregarding the efforts of the government."',
 1,
 1)
In [114]:
# Indices of the first ten training comments labeled as insults.
np.flatnonzero(y_train == 1)[:10]
Out[114]:
array([ 0,  7,  8,  9, 15, 16, 18, 19, 34, 37])
In [115]:
# Dense count vector for training comment 8 (CSR supports row slicing; COO does not).
x = X_train.tocsr()[8].toarray()
In [116]:
# Column indices of the words that actually occur in comment 8.
nonzero = np.flatnonzero(x)
In [117]:
# The tokens present in comment 8. NOTE(review): get_feature_names() was
# removed in sklearn 1.2 — modern code calls cv.get_feature_names_out().
np.array(cv.get_feature_names())[nonzero]
Out[117]:
array([u'are', u'both', u'either', u'extremely', u'fake', u'maybe', u'or',
       u'stupid', u'you'], 
      dtype='<U95')
In [118]:
nonzero  # vocabulary positions of the words present in comment 8
Out[118]:
array([  983,  1887,  4396,  4834,  4892,  8287, 10070, 13525, 16397])
In [119]:
X_test.shape
Out[119]:
(2647, 16469)
In [120]:
# NOTE(review): get_feature_names() was deprecated in sklearn 1.0 and removed
# in 1.2; the modern equivalent is cv.get_feature_names_out().
feats = cv.get_feature_names()
In [120]:
 
In [121]:
# A hand-picked spread of vocabulary indices, from low to high.
inds = [ 352, 983, 5000, 10004, 13525, 16397, 16440, 16468]
In [122]:
np.array(feats)[inds]  # the words at those vocabulary positions
Out[122]:
array([u'aaaah', u'are', u'feathers', u'olympic', u'stupid', u'you',
       u'zealot', u'zuckerberg'], 
      dtype='<U95')
In [123]:
x.ravel()[inds]  # how often each of those words occurs in comment 8
Out[123]:
array([0, 1, 0, 0, 1, 1, 0, 0], dtype=int64)
In [138]:
# Pick the 10 most negative and 20 most positive logistic-regression weights,
# then order the selection by weight so the bar chart below runs from the
# strongest non-insult indicators to the strongest insult indicators.
weights = svm.coef_.ravel()
ranked = np.argsort(weights)
selected = np.concatenate([ranked[:10], ranked[-20:]])
vocab = np.array(cv.get_feature_names())
order = np.argsort(weights[selected])
f_imp = vocab[selected][order]      # word for each selected weight, sorted
coef = weights[selected][order]     # the selected weights themselves, sorted
In [181]:
# plt.figsize does not exist — use plt.figure(figsize=...). np.int was
# removed in NumPy 1.24; the builtin int is the correct dtype argument.
plt.figure(figsize=(10, 3))
# Red bars mark positive (insult-indicative) weights, green bars negative;
# heights are magnitudes so both signs are visually comparable.
plt.bar(np.arange(len(coef)), np.abs(coef), width=.6,
        color=np.array(["green", "red"])[(coef > 0).astype(int)])
ax = plt.gca()
# Shift ticks by the bar half-width so labels sit under the bar centers.
ax.set_xticks(np.arange(len(coef)) + .4)
ax.set_xticklabels(f_imp, rotation=45, rotation_mode="anchor",
                   va="baseline", ha="right")
plt.savefig("presentation/logreg-pics/bow_coef.pdf", bbox_inches="tight")
plt.show()
In [131]:
 
In [155]:
import matplotlib.ticker
In [ ]:
matplotlib.ticker.TickHelper  # leftover API exploration; displays the class repr