#!/usr/bin/env python
# coding: utf-8

# # Ensemble methods

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

# load the digits data and scale the features to [0, 1]
digits = load_digits()
X = digits.data
y = digits.target
X /= X.max()

cv_generator = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)


# Let's look at accuracy as a function of the number of trees used in the ensemble:

# In[2]:


n_estimators = [10, 20, 50, 100, 250, 500, 1000]
accuracy = []
for estimators in n_estimators:
    print("num estimators:", estimators)
    classifier = RandomForestClassifier(n_estimators=estimators)
    results = cross_validate(classifier, X, y, cv=cv_generator,
                             scoring='accuracy', return_train_score=False)
    accuracy.append(np.mean(results['test_score']))


# In[3]:


import matplotlib.pyplot as plt

plt.semilogx(n_estimators, accuracy, 'ob')
plt.title('performance of random forests on the digits data')
plt.xlabel('number of trees')
plt.ylabel('accuracy')
plt.show()


# Compared to a single decision tree:

# In[4]:


from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(max_depth=None)
results = cross_validate(classifier, X, y, cv=cv_generator,
                         scoring='accuracy', return_train_score=False)
np.mean(results['test_score'])


# Now let's compare random forests with two other tree ensembles, bagging and AdaBoost:

# In[5]:


from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

n_estimators = [10, 20, 50, 100, 250, 500, 1000]
accuracy_bagging = []
accuracy_adaboost = []
for estimators in n_estimators:
    print("num estimators:", estimators)
    # bagging over full-depth decision trees
    # (older scikit-learn versions called this parameter base_estimator)
    classifier = BaggingClassifier(estimator=DecisionTreeClassifier(),
                                   n_estimators=estimators, random_state=0)
    results = cross_validate(classifier, X, y, cv=cv_generator,
                             scoring='accuracy', return_train_score=False)
    accuracy_bagging.append(np.mean(results['test_score']))
    # boosting over shallow decision trees
    classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                    n_estimators=estimators)
    results = cross_validate(classifier, X, y, cv=cv_generator,
                             scoring='accuracy', return_train_score=False)
    accuracy_adaboost.append(np.mean(results['test_score']))


# In[6]:


plt.semilogx(n_estimators, accuracy, 'ob', label='Random Forests')
plt.semilogx(n_estimators, accuracy_bagging, 'dc', label='Bagging')
plt.semilogx(n_estimators, accuracy_adaboost, 'sr', label='AdaBoost')
plt.legend()
plt.title('performance of ensemble methods on the digits data')
plt.xlabel('number of trees')
plt.ylabel('accuracy')
plt.show()


# For comparison, an SVM whose hyperparameters are tuned by grid search inside each
# cross-validation fold (nested cross-validation):

# In[7]:


from sklearn import svm
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'C': [1, 10, 100], 'kernel': ['linear']},
    {'C': [1, 10, 100], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf']},
]
classifier = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)
results = cross_validate(classifier, X, y, cv=cv_generator,
                         scoring='accuracy', return_train_score=False)
np.mean(results['test_score'])


# Finally, let's look at where the random forest makes its mistakes, using a
# classification report and a confusion matrix:

# In[11]:


from sklearn.model_selection import cross_val_predict
from sklearn import metrics

classifier = RandomForestClassifier(n_estimators=100)
y_pred = cross_val_predict(classifier, X, y, cv=cv_generator)
# classification_report expects the true labels first, then the predictions
print(metrics.classification_report(y, y_pred))

from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y, y_pred)

import seaborn as sns
# transpose so that true labels run along the x-axis, predictions along the y-axis
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap="Blues")
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.show()
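
# As an aside, a random forest can estimate its own generalization accuracy from the
# out-of-bag (OOB) samples, i.e. the training points left out of each tree's bootstrap
# sample, without a separate cross-validation loop. A minimal sketch (the cell below is
# not from the original notebook, and the random_state is illustrative):

# In[12]:


classifier = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
classifier.fit(X, y)
# fraction of training points correctly classified by the trees that did not see them
print("OOB accuracy estimate:", classifier.oob_score_)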