#!/usr/bin/env python
# coding: utf-8

# # Ensemble methods

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

# load the digits data and scale the features to [0, 1]
digits = load_digits()
X = digits.data
y = digits.target
X /= X.max()

cv_generator = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)


# Let's look at accuracy as a function of the number of trees used in the ensemble:

# In[2]:


n_estimators = [10, 20, 50, 100, 250, 500, 1000]
accuracy = []
for estimators in n_estimators:
    print("num estimators:", estimators)
    classifier = RandomForestClassifier(n_estimators=estimators)
    results = cross_validate(classifier, X, y, cv=cv_generator,
                             scoring='accuracy', return_train_score=False)
    accuracy.append(np.mean(results['test_score']))


# In[3]:


import matplotlib.pyplot as plt

plt.semilogx(n_estimators, accuracy, 'ob')
plt.title('performance of random forests on the digits data')
plt.xlabel('number of trees')
plt.ylabel('accuracy')
plt.show()


# Compared to a single decision tree:

# In[4]:


from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(max_depth=None)
results = cross_validate(classifier, X, y, cv=cv_generator,
                         scoring='accuracy', return_train_score=False)
np.mean(results['test_score'])


# Now let's compare random forests with two other tree ensembles, bagging and AdaBoost:

# In[5]:


from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

n_estimators = [10, 20, 50, 100, 250, 500, 1000]
accuracy_bagging = []
accuracy_adaboost = []
for estimators in n_estimators:
    print("num estimators:", estimators)
    # bagging over full-depth decision trees
    # (older scikit-learn versions called this parameter base_estimator)
    classifier = BaggingClassifier(estimator=DecisionTreeClassifier(),
                                   n_estimators=estimators, random_state=0)
    results = cross_validate(classifier, X, y, cv=cv_generator,
                             scoring='accuracy', return_train_score=False)
    accuracy_bagging.append(np.mean(results['test_score']))
    # boosting over shallow decision trees
    classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                    n_estimators=estimators)
    results = cross_validate(classifier, X, y, cv=cv_generator,
                             scoring='accuracy', return_train_score=False)
    accuracy_adaboost.append(np.mean(results['test_score']))


# In[6]:


plt.semilogx(n_estimators, accuracy, 'ob', label='Random Forests')
plt.semilogx(n_estimators, accuracy_bagging, 'dc', label='Bagging')
plt.semilogx(n_estimators, accuracy_adaboost, 'sr', label='AdaBoost')
plt.legend()
plt.title('performance of ensemble methods on the digits data')
plt.xlabel('number of trees')
plt.ylabel('accuracy')
plt.show()


# For comparison, an SVM whose hyperparameters are tuned by grid search inside each
# cross-validation fold (nested cross-validation):

# In[7]:


from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'C': [1, 10, 100], 'kernel': ['linear']},
    {'C': [1, 10, 100], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf']},
]
classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)
results = cross_validate(classifier, X, y, cv=cv_generator,
                         scoring='accuracy', return_train_score=False)
np.mean(results['test_score'])


# Finally, let's look at where the random forest makes its mistakes, using a
# classification report and a confusion matrix:

# In[11]:


from sklearn.model_selection import cross_val_predict
from sklearn import metrics

classifier = RandomForestClassifier(n_estimators=100)
y_pred = cross_val_predict(classifier, X, y, cv=cv_generator)
# classification_report expects the true labels first, then the predictions
print(metrics.classification_report(y, y_pred))

from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y, y_pred)

import seaborn as sns
# transpose so that true labels run along the x-axis, predictions along the y-axis
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap="Blues")
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.show()
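
# As an aside, a random forest can estimate its own generalization accuracy from the
# out-of-bag (OOB) samples, i.e. the training points left out of each tree's bootstrap
# sample, without a separate cross-validation loop. A minimal sketch (the cell below is
# not from the original notebook, and the random_state is illustrative):

# In[12]:


classifier = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
classifier.fit(X, y)
# fraction of training points correctly classified by the trees that did not see them
print("OOB accuracy estimate:", classifier.oob_score_)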