#!/usr/bin/env python
# coding: utf-8

# # Chapter 8 - Tree-based Methods

# - [8.1.1 Regression Trees](#8.1.1-Regression-Trees)
# - [8.1.2 Classification Trees](#8.1.2-Classification-Trees)
# - [Lab: 8.3.1 Fitting Classification Trees](#8.3.1-Fitting-Classification-Trees)
# - [Lab: 8.3.2 Fitting Regression Trees](#8.3.2-Fitting-Regression-Trees)
# - [Lab: 8.3.3 Bagging and Random Forests](#8.3.3-Bagging-and-Random-Forests)
# - [Lab: 8.3.4 Boosting](#8.3.4-Boosting)

# In[1]:

# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pydot
from io import StringIO

from IPython.display import Image

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report

get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('seaborn-white')


# In[2]:

# This function creates an image of a fitted tree model using pydot and graphviz.
def print_tree(estimator, features, class_names=None, filled=True):
    dot_data = StringIO()
    export_graphviz(estimator, out_file=dot_data, feature_names=features,
                    class_names=class_names, filled=filled)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    return graph


# ### 8.1.1 Regression Trees

# In R, I exported the dataset from the 'ISLR' package to a csv file.

# In[3]:

df = pd.read_csv('Data/Hitters.csv').dropna()
df.info()


# In[4]:

X = df[['Years', 'Hits']].values
y = np.log(df.Salary.values)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4))
ax1.hist(df.Salary.values)
ax1.set_xlabel('Salary')
ax2.hist(y)
ax2.set_xlabel('Log(Salary)');


# In[5]:

regr = DecisionTreeRegressor(max_leaf_nodes=3)
regr.fit(X, y)


# ### Figure 8.1

# In[6]:

graph, = print_tree(regr, features=['Years', 'Hits'])
Image(graph.create_png())


# ### Figure 8.2

# In[7]:

df.plot('Years', 'Hits', kind='scatter', color='orange', figsize=(7, 6))
plt.xlim(0, 25)
plt.ylim(ymin=-5)
plt.xticks([1, 4.5, 24])
plt.yticks([1, 117.5, 238])
plt.vlines(4.5, ymin=-5, ymax=250)
plt.hlines(117.5, xmin=4.5, xmax=25)
plt.annotate('R1', xy=(2, 117.5), fontsize='xx-large')
plt.annotate('R2', xy=(11, 60), fontsize='xx-large')
plt.annotate('R3', xy=(11, 170), fontsize='xx-large');


# ### Pruning

# Pruning is currently not supported in scikit-learn; see the first point under
# 'Disadvantages of decision trees' in the documentation. An implementation has
# been discussed, but if I understand correctly, Random Forests generally give
# better predictive performance than a single pruned tree anyway.

# ### 8.1.2 Classification Trees

# Dataset available at http://www-bcf.usc.edu/~gareth/ISL/data.html

# In[8]:

df2 = pd.read_csv('Data/Heart.csv').drop('Unnamed: 0', axis=1).dropna()
df2.info()


# In[9]:

df2.ChestPain = pd.factorize(df2.ChestPain)[0]
df2.Thal = pd.factorize(df2.Thal)[0]


# In[10]:

X2 = df2.drop('AHD', axis=1)
y2 = pd.factorize(df2.AHD)[0]


# In[11]:

clf = DecisionTreeClassifier(max_depth=None, max_leaf_nodes=6, max_features=3)
clf.fit(X2, y2)


# In[12]:

clf.score(X2, y2)


# In[13]:

graph2, = print_tree(clf, features=X2.columns, class_names=['No', 'Yes'])
Image(graph2.create_png())
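# cross_val_score is imported above but never used. As a hedged sketch (not part of
# the original notebook), the cell below uses 5-fold cross-validation on the Heart
# data to compare a few candidate values of max_leaf_nodes; the grid of values and
# the default accuracy scoring are my own choices.

# In[ ]:

for leaves in [2, 4, 6, 8, 10]:
    cv_clf = DecisionTreeClassifier(max_leaf_nodes=leaves, random_state=0)
    # Mean 5-fold cross-validated accuracy for each candidate tree size
    scores = cross_val_score(cv_clf, X2, y2, cv=5)
    print('max_leaf_nodes={:2d}  mean CV accuracy: {:.3f}'.format(leaves, scores.mean()))
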
# ## Lab

# ### 8.3.1 Fitting Classification Trees

# In R, I exported the dataset from the 'ISLR' package to a csv file.

# In[14]:

df3 = pd.read_csv('Data/Carseats.csv').drop('Unnamed: 0', axis=1)
df3.head()


# In[15]:

df3['High'] = df3.Sales.map(lambda x: 1 if x > 8 else 0)
df3.ShelveLoc = pd.factorize(df3.ShelveLoc)[0]
df3.Urban = df3.Urban.map({'No': 0, 'Yes': 1})
df3.US = df3.US.map({'No': 0, 'Yes': 1})
df3.info()


# In[16]:

df3.head(5)


# In[18]:

X = df3.drop(['Sales', 'High'], axis=1)
y = df3.High

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)


# In[19]:

clf = DecisionTreeClassifier(max_depth=6)
clf.fit(X, y)


# In[20]:

print(classification_report(y, clf.predict(X)))


# In[22]:

graph3, = print_tree(clf, features=X.columns, class_names=['No', 'Yes'])
Image(graph3.create_png())


# In[23]:

clf.fit(X_train, y_train)
pred = clf.predict(X_test)


# In[24]:

cm = pd.DataFrame(confusion_matrix(y_test, pred).T, index=['No', 'Yes'], columns=['No', 'Yes'])
cm.index.name = 'Predicted'
cm.columns.name = 'True'
cm


# In[25]:

# Precision of the model using test data is 74%
print(classification_report(y_test, pred))


# Pruning is not implemented in scikit-learn.

# ### 8.3.2 Fitting Regression Trees

# In R, I exported the dataset from the 'MASS' package to a csv file.

# In[26]:

boston_df = pd.read_csv('Data/Boston.csv')
boston_df.info()


# In[28]:

X = boston_df.drop('medv', axis=1)
y = boston_df.medv

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)


# In[29]:

# Pruning is not supported, so we limit the tree to a maximum depth of 3.
regr2 = DecisionTreeRegressor(max_depth=3)
regr2.fit(X_train, y_train)
pred = regr2.predict(X_test)


# In[31]:

graph, = print_tree(regr2, features=X.columns)
Image(graph.create_png())


# In[32]:

plt.scatter(pred, y_test, label='medv')
plt.plot([0, 1], [0, 1], '--k', transform=plt.gca().transAxes)
plt.xlabel('pred')
plt.ylabel('y_test')


# In[33]:

mean_squared_error(y_test, pred)


# ### 8.3.3 Bagging and Random Forests

# In[34]:

# There are 13 features in the dataset
X.shape


# In[35]:

# Bagging: using all 13 features
regr1 = RandomForestRegressor(max_features=13, random_state=1)
regr1.fit(X_train, y_train)


# In[36]:

pred = regr1.predict(X_test)

plt.scatter(pred, y_test, label='medv')
plt.plot([0, 1], [0, 1], '--k', transform=plt.gca().transAxes)
plt.xlabel('pred')
plt.ylabel('y_test')


# In[37]:

mean_squared_error(y_test, pred)


# In[38]:

# Random forests: using 6 features
regr2 = RandomForestRegressor(max_features=6, random_state=1)
regr2.fit(X_train, y_train)


# In[39]:

pred = regr2.predict(X_test)
mean_squared_error(y_test, pred)


# In[40]:

Importance = pd.DataFrame({'Importance': regr2.feature_importances_*100}, index=X.columns)
Importance.sort_values('Importance', axis=0, ascending=True).plot(kind='barh', color='r')
plt.xlabel('Variable Importance')
plt.gca().legend_ = None


# ### 8.3.4 Boosting

# In[41]:

regr = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01, random_state=1)
regr.fit(X_train, y_train)


# In[42]:

feature_importance = regr.feature_importances_*100
rel_imp = pd.Series(feature_importance, index=X.columns).sort_values(inplace=False)
print(rel_imp)
rel_imp.T.plot(kind='barh', color='r')
plt.xlabel('Variable Importance')
plt.gca().legend_ = None


# In[43]:

mean_squared_error(y_test, regr.predict(X_test))
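
# GradientBoostingRegressor provides staged_predict(), which yields test-set
# predictions after each boosting iteration. As a hedged addition (not in the
# original notebook), the cell below traces test MSE against the number of trees
# for the boosted model fitted above; the plot styling is my own choice.

# In[ ]:

# Test MSE after each of the 500 boosting iterations
test_mse = [mean_squared_error(y_test, stage_pred)
            for stage_pred in regr.staged_predict(X_test)]

plt.plot(np.arange(1, len(test_mse) + 1), test_mse)
plt.xlabel('Number of boosting iterations')
plt.ylabel('Test MSE');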