import numpy as np

# set a seed for reproducibility
np.random.seed(1234)

# generate 1000 random numbers (between 0 and 1) for each model, representing 1000 observations
mod1 = np.random.rand(1000)
mod2 = np.random.rand(1000)
mod3 = np.random.rand(1000)
mod4 = np.random.rand(1000)
mod5 = np.random.rand(1000)

# each model independently predicts 1 (the "correct response") if its random number
# is greater than 0.3, so each model is right about 70% of the time
preds1 = np.where(mod1 > 0.3, 1, 0)
preds2 = np.where(mod2 > 0.3, 1, 0)
preds3 = np.where(mod3 > 0.3, 1, 0)
preds4 = np.where(mod4 > 0.3, 1, 0)
preds5 = np.where(mod5 > 0.3, 1, 0)

# print the first 20 predictions from each model
print(preds1[:20])
print(preds2[:20])
print(preds3[:20])
print(preds4[:20])
print(preds5[:20])

# average the predictions and then round to 0 or 1
# (rounding the average of five 0/1 predictions is a majority vote)
ensemble_preds = np.round((preds1 + preds2 + preds3 + preds4 + preds5) / 5.0).astype(int)

# print the ensemble's first 20 predictions
print(ensemble_preds[:20])

# how accurate was each individual model?
# (the correct response is always 1, so the mean of a 0/1 array is the accuracy)
print(preds1.mean())
print(preds2.mean())
print(preds3.mean())
print(preds4.mean())
print(preds5.mean())

# how accurate was the ensemble?
print(ensemble_preds.mean())

# set a seed for reproducibility
np.random.seed(1)

# create an array of 1 through 20
nums = np.arange(1, 21)
print(nums)

# sample that array 20 times with replacement
print(np.random.choice(a=nums, size=20, replace=True))

# read in and prepare the vehicle training data
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/vehicles_train.csv'
train = pd.read_csv(url)
train['vtype'] = train.vtype.map({'car':0, 'truck':1})
train

# set a seed for reproducibility
np.random.seed(123)

# create ten bootstrap samples (will be used to select rows from the DataFrame)
samples = [np.random.choice(a=14, size=14, replace=True) for _ in range(10)]
samples

# show the rows for the first decision tree
train.iloc[samples[0], :]

# read in and prepare the vehicle testing data
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/vehicles_test.csv'
test = pd.read_csv(url)
test['vtype'] = test.vtype.map({'car':0, 'truck':1})
test

from sklearn.tree import DecisionTreeRegressor

# grow each tree deep
treereg = DecisionTreeRegressor(max_depth=None, random_state=123)

# list for storing predicted price from each tree
predictions = []

# define testing data
X_test = test.iloc[:, 1:]
y_test = test.iloc[:, 0]

# grow one tree for each bootstrap sample and make predictions on testing data
for sample in samples:
    X_train = train.iloc[sample, 1:]
    y_train = train.iloc[sample, 0]
    treereg.fit(X_train, y_train)
    y_pred = treereg.predict(X_test)
    predictions.append(y_pred)

# convert predictions from list to NumPy array
predictions = np.array(predictions)
predictions

# average predictions
np.mean(predictions, axis=0)

# calculate RMSE
from sklearn import metrics
y_pred = np.mean(predictions, axis=0)
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# define the training and testing sets
X_train = train.iloc[:, 1:]
y_train = train.iloc[:, 0]
X_test = test.iloc[:, 1:]
y_test = test.iloc[:, 0]

# instruct BaggingRegressor to use DecisionTreeRegressor as the "base estimator"
from sklearn.ensemble import BaggingRegressor
bagreg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=500,
                          bootstrap=True, oob_score=True, random_state=1)

# fit and predict
bagreg.fit(X_train, y_train)
y_pred = bagreg.predict(X_test)
y_pred

# calculate RMSE
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# show the first bootstrap sample
samples[0]
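# (added aside, not part of the original walkthrough) As a quick sanity check on
# the variance-reduction claim, compare the bagged ensemble's RMSE above against
# a single unpruned tree trained on the full training set; the bagged RMSE is
# typically (though not always, on a dataset this small) lower.
single_tree = DecisionTreeRegressor(max_depth=None, random_state=1)
single_tree.fit(X_train, y_train)
single_pred = single_tree.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, single_pred)))   # single-tree RMSE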
# show the "in-bag" observations for each sample
for sample in samples:
    print(set(sample))

# show the "out-of-bag" observations for each sample
for sample in samples:
    print(sorted(set(range(14)) - set(sample)))

# compute the out-of-bag R-squared score (not MSE, unfortunately!) for B=500
bagreg.oob_score_

# read in the baseball salary data
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/hitters.csv'
hitters = pd.read_csv(url)
hitters.head()

# show a cross-tabulation of League and NewLeague
pd.crosstab(hitters.League, hitters.NewLeague)

# check for missing values
hitters.isnull().sum()

# remove rows with missing values
hitters.dropna(inplace=True)

# factorize encodes categorical values as integers
pd.factorize(hitters.League)

# encode the categoricals as 0/1 (each has only two levels, so the integer
# codes are equivalent to dummy variables)
hitters['League'] = pd.factorize(hitters.League)[0]
hitters['Division'] = pd.factorize(hitters.Division)[0]
hitters['NewLeague'] = pd.factorize(hitters.NewLeague)[0]
hitters.head()

%matplotlib inline

# histogram of Salary
hitters.Salary.plot(kind='hist')

# scatter plot of Years versus Hits colored by Salary
hitters.plot(kind='scatter', x='Years', y='Hits', c='Salary', colormap='jet',
             xlim=(0, 25), ylim=(0, 250))

# exclude columns which represent career statistics (they all start with 'C')
feature_cols = hitters.columns[~hitters.columns.str.startswith('C')]

# exclude the response
feature_cols = feature_cols.drop('Salary')

# define X and y
X = hitters[feature_cols]
y = hitters.Salary

# list of values to try for max_depth
max_depth_range = range(1, 21)

# list to store the average RMSE for each value of max_depth
RMSE_scores = []

# use 10-fold cross-validation with each value of max_depth
# (cross_val_score lives in sklearn.model_selection, and MSE is exposed as the
# negated scorer 'neg_mean_squared_error', hence the minus sign below)
from sklearn.model_selection import cross_val_score
for depth in max_depth_range:
    treereg = DecisionTreeRegressor(max_depth=depth, random_state=1)
    MSE_scores = cross_val_score(treereg, X, y, cv=10, scoring='neg_mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

# plot max_depth (x-axis) versus RMSE (y-axis)
import matplotlib.pyplot as plt
plt.plot(max_depth_range, RMSE_scores)
plt.xlabel('max_depth')
plt.ylabel('RMSE (lower is better)')

# show the best RMSE and the corresponding max_depth
sorted(zip(RMSE_scores, max_depth_range))[0]

# max_depth=2 was best, so fit a tree using that parameter
treereg = DecisionTreeRegressor(max_depth=2, random_state=1)
treereg.fit(X, y)

# compute feature importances
pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_}).sort_values('importance')

from sklearn.ensemble import RandomForestRegressor
rfreg = RandomForestRegressor()
rfreg

# list of values to try for n_estimators
estimator_range = range(10, 310, 10)

# list to store the average RMSE for each value of n_estimators
RMSE_scores = []

# use 5-fold cross-validation with each value of n_estimators
for estimator in estimator_range:
    rfreg = RandomForestRegressor(n_estimators=estimator, random_state=1)
    MSE_scores = cross_val_score(rfreg, X, y, cv=5, scoring='neg_mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

# plot n_estimators (x-axis) versus RMSE (y-axis)
plt.plot(estimator_range, RMSE_scores)
plt.xlabel('n_estimators')
plt.ylabel('RMSE (lower is better)')
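# (added aside) The manual loops over n_estimators (above) and max_features
# (below) can also be combined into a single grid search. This is a sketch
# using scikit-learn's standard GridSearchCV; the parameter values listed in
# param_grid are illustrative, not taken from the original lesson.
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [50, 150, 300], 'max_features': [2, 4, 6, 8]}
grid = GridSearchCV(RandomForestRegressor(random_state=1), param_grid,
                    cv=5, scoring='neg_mean_squared_error')
grid.fit(X, y)
print(grid.best_params_)
print(np.sqrt(-grid.best_score_))   # best cross-validated RMSE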
# list of values to try for max_features
feature_range = range(1, len(feature_cols) + 1)

# list to store the average RMSE for each value of max_features
RMSE_scores = []

# use 10-fold cross-validation with each value of max_features
for feature in feature_range:
    rfreg = RandomForestRegressor(n_estimators=150, max_features=feature, random_state=1)
    MSE_scores = cross_val_score(rfreg, X, y, cv=10, scoring='neg_mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

# plot max_features (x-axis) versus RMSE (y-axis)
plt.plot(feature_range, RMSE_scores)
plt.xlabel('max_features')
plt.ylabel('RMSE (lower is better)')

# show the best RMSE and the corresponding max_features
sorted(zip(RMSE_scores, feature_range))[0]

# max_features=8 was best, so fit a Random Forest using that parameter
rfreg = RandomForestRegressor(n_estimators=150, max_features=8, oob_score=True, random_state=1)
rfreg.fit(X, y)

# compute feature importances
pd.DataFrame({'feature':feature_cols, 'importance':rfreg.feature_importances_}).sort_values('importance')

# compute the out-of-bag R-squared score
rfreg.oob_score_

# check the shape of X
X.shape

# set a threshold for which features to include
# (the old rfreg.transform(X, threshold=...) shortcut was removed from
# scikit-learn; SelectFromModel is the current equivalent)
from sklearn.feature_selection import SelectFromModel
print(SelectFromModel(rfreg, threshold=0.1, prefit=True).transform(X).shape)
print(SelectFromModel(rfreg, threshold='mean', prefit=True).transform(X).shape)
print(SelectFromModel(rfreg, threshold='median', prefit=True).transform(X).shape)

# create a new feature matrix that only includes important features
X_important = SelectFromModel(rfreg, threshold='mean', prefit=True).transform(X)

# check the RMSE for a Random Forest that only uses important features
rfreg = RandomForestRegressor(n_estimators=150, max_features=3, random_state=1)
scores = cross_val_score(rfreg, X_important, y, cv=10, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-scores))
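# (added caveat, offered as a suggestion rather than part of the original
# lesson) Selecting features from a forest fit on all of X and then
# cross-validating on the reduced matrix lets the selection step peek at every
# fold. Wrapping selection and regression in a Pipeline keeps the selection
# inside each training fold; the hyperparameters below simply reuse the ones
# chosen above and may need adjusting if fewer features pass the threshold.
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('select', SelectFromModel(RandomForestRegressor(n_estimators=150, random_state=1),
                               threshold='mean')),
    ('rf', RandomForestRegressor(n_estimators=150, max_features=3, random_state=1)),
])
scores = cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_squared_error')
print(np.mean(np.sqrt(-scores)))   # leakage-free cross-validated RMSE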