#!/usr/bin/env python
# coding: utf-8

# # Scikit-Learn の練習
# 以下の資料を参考にしてやったところまで。途中で使えないattributeがあったので、最後まではやってないけど、使い方は大分分かった。
# https://www.kaggle.com/c/data-science-london-scikit-learn/visualization/1091

# In[1]:


# Third-party libraries used by the cells below.
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import load_digits

# Turn on inline plotting when running under IPython/Jupyter. `get_ipython`
# is only defined there, so skip the magic (instead of crashing with
# NameError) when this export is run as a plain Python script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Handwritten-digits dataset bundled with scikit-learn.
digits = load_digits()


# In[2]:


# Report the library versions in use. Written as print() calls so the lines
# are valid on Python 3 as well as Python 2, like every other print here.
print(matplotlib.__version__)
print(sklearn.__version__)


# In[3]:


# Show the shapes of the image array and of the label array.
print("images shape: %s" % str(digits.images.shape))
print("targets shape: %s" % str(digits.target.shape))


# In[4]:


# Draw the first sample as a grayscale image.
plt.matshow(digits.images[0], cmap=plt.cm.Greys)


# In[5]:


# Bare expression: shows the label array in a notebook, no effect in a script.
digits.target


# In[6]:


# Feature matrix with 64 columns per row.
X = digits.data.reshape((-1, 64))
print(X.shape)


# In[7]:


# Labels for the samples.
y = digits.target
print(y.shape)


# There are 1797 data points, each an 8x8 image, so the data is 64-dimensional.

# In[8]:


# Dump the raw feature matrix.
print(X)


# ## PCA

# In[9]:


from sklearn.decomposition import PCA


# In[10]:


# Model that keeps two components.
pca = PCA(n_components=2)


# In[11]:


pca.fit(X)


# In[12]:


# Project the samples with the fitted model.
X_pca = pca.transform(X)
# Bare expression: shows the projected shape in a notebook only.
X_pca.shape


# ここで２次元に落ちている。

# In[13]:


# Scatter plot of the two PCA coordinates, coloured by label.
fig = plt.figure(figsize=(16, 10))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)


# In[14]:


# Shapes of the fitted mean vector and of the component matrix.
print(pca.mean_.shape)
print(pca.components_.shape)


# In[15]:


# Show the mean and the two components, each reshaped to an 8x8 image,
# side by side. The figure handle is named `fig` (it was misspelled `fix`).
fig, ax = plt.subplots(1, 3)
ax[0].matshow(pca.mean_.reshape(8, 8), cmap=plt.cm.Greys)
ax[1].matshow(pca.components_[0, :].reshape(8, 8), cmap=plt.cm.Greys)
ax[2].matshow(pca.components_[1, :].reshape(8, 8), cmap=plt.cm.Greys)


# ## Manifold learning
# I don't know this topic well yet; I'll study it later.

# In[16]:


from sklearn.manifold import Isomap


# In[17]:


# Isomap model with two output components and 20 neighbours.
isomap = Isomap(n_neighbors=20, n_components=2)


# In[18]:


isomap.fit(X)


# In[19]:


# Embed the samples with the fitted model.
X_isomap = isomap.transform(X)
# Bare expression: shows the embedded shape in a notebook only.
X_isomap.shape


# In[20]:


# Scatter plot of the two Isomap coordinates, coloured by label.
fig = plt.figure(figsize=(16, 10))
plt.scatter(X_isomap[:, 0], X_isomap[:, 1], c=y)


# In[21]:


# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. Prefer `sklearn.model_selection`; fall back for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split into training and test parts; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


# In[22]:


# Report the shapes of the four resulting arrays.
print("X_train shape: %s" % repr(X_train.shape))
print("y_train shape: %s" % repr(y_train.shape))
print("X_test shape: %s" % repr(X_test.shape))
print("y_test shape: %s" % repr(y_test.shape))


# ## Linear SVC

# In[23]:


from sklearn.svm import LinearSVC

# Linear support-vector classifier with default settings.
svm = LinearSVC()


# In[24]:


svm.fit(X_train, y_train)


# In[25]:


# Bare expression: shows the training-set predictions in a notebook only.
svm.predict(X_train)


# In[26]:


svm.score(X_train, y_train)


# svm.score is the mean accuracy; the value above is the accuracy on the training data.

# In[27]:


svm.score(X_test, y_test)


# テストでの正解率は上記。

# ## Random Forest

# In[28]:


from sklearn.ensemble import RandomForestClassifier


# In[29]:


# Random forest with default settings.
rf = RandomForestClassifier()


# In[30]:


rf.fit(X_train, y_train)


# In[31]:


# Score on the training data (bare expression: displayed in a notebook only).
rf.score(X_train, y_train)


# In[32]:


# Score on the held-out test data.
rf.score(X_test, y_test)


# In[33]:


# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. Prefer `sklearn.model_selection`; fall back for older installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
import numpy as np


def _report_cv_scores(cv_scores):
    """Print the per-fold scores with their mean and standard deviation."""
    print("score: %s mean: %f std: %f"
          % (str(cv_scores), np.mean(cv_scores), np.std(cv_scores)))


# 5-fold cross-validation of the random forest on the training data.
scores = cross_val_score(rf, X_train, y_train, cv=5)
_report_cv_scores(scores)


# In[34]:


# Same estimator with 10 folds.
scores = cross_val_score(rf, X_train, y_train, cv=10)
_report_cv_scores(scores)


# In[35]:


# Same estimator with 3 folds.
scores = cross_val_score(rf, X_train, y_train, cv=3)
_report_cv_scores(scores)


# In[36]:


# 5 folds again (this cell repeats the first run).
scores = cross_val_score(rf, X_train, y_train, cv=5)
_report_cv_scores(scores)


# In[37]:


# A second forest with 50 trees, evaluated with 5 folds.
rf2 = RandomForestClassifier(n_estimators=50)
scores = cross_val_score(rf2, X_train, y_train, cv=5)
_report_cv_scores(scores)


# In[38]:


# `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer `sklearn.model_selection`; fall back for older installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV


# In[39]:


# Candidate values for C: the powers of ten from 10**-3 up to 10**3.
param_grid = {'C': np.power(10.0, np.arange(-3, 4))}
# Grid search over `param_grid` for the LinearSVC instance, using 3-fold
# cross-validation; verbose=3 requests detailed progress output.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, verbose=3)


# In[40]:


grid_search.fit(X_train, y_train)


# In[41]:


# Best parameter setting found and its cross-validated score.
print(grid_search.best_params_)
print(grid_search.best_score_)


# In[42]:


fig = plt.figure(figsize=(16, 10))
# GridSearchCV has no `cv_scores_` attribute, so the per-setting scores are
# read from whichever results attribute this scikit-learn version provides.
# NOTE: the plotted values are scores, although the labels say "error".
if hasattr(grid_search, "cv_results_"):
    # scikit-learn >= 0.18: a dict with one entry per parameter setting.
    plt.plot(grid_search.cv_results_["mean_test_score"], label="validation error")
    # Training scores are present only when the search was created with
    # return_train_score=True, so draw that curve only if it was recorded.
    if "mean_train_score" in grid_search.cv_results_:
        plt.plot(grid_search.cv_results_["mean_train_score"], label="training error")
else:
    # Older releases: a list of result tuples holding validation scores only.
    plt.plot([c.mean_validation_score for c in grid_search.grid_scores_], label="validation error")


# どうも上のcv_scoresが、scikit-learnの0.16.1には入っていないみたいで、grid_scoresで、mean_validation_scoreは動いたけど、mean_training_scoreは動かなかった。どうもブランチに入らないとか、そんな話らしいので、これ以上の深入りはやめて、とりあえず、gridでよかったコストパラメータのCを当てはめて、モデルを作った。

# In[44]:


# `svm` is rebound to the LinearSVC class itself (earlier it named an instance).
svm = LinearSVC
# Train a fresh classifier with the cost parameter fixed at C=0.001.
estimator = LinearSVC(C=0.001)
estimator.fit(X_train, y_train)


# In[45]:


# Score on the held-out test data (bare expression: displayed in a notebook only).
estimator.score(X_test, y_test)


# デフォルトのコストパラメータより良くなっていて、Random Forestよりも良い結果であった。やはりSVMはパラメータのチューニングが大事。

# In[ ]: