import numpy as np

# Simulate five independent classifiers and show that a majority-vote
# ensemble is more accurate than any single model.

# set a seed for reproducibility
np.random.seed(1234)

# generate 1000 random numbers (between 0 and 1) for each model,
# representing 1000 observations
mod1 = np.random.rand(1000)
mod2 = np.random.rand(1000)
mod3 = np.random.rand(1000)
mod4 = np.random.rand(1000)
mod5 = np.random.rand(1000)

# each model independently predicts 1 (the "correct response") when its
# random number is strictly greater than 0.3, so each individual model
# is roughly 70% accurate
preds1 = np.where(mod1 > 0.3, 1, 0)
preds2 = np.where(mod2 > 0.3, 1, 0)
preds3 = np.where(mod3 > 0.3, 1, 0)
preds4 = np.where(mod4 > 0.3, 1, 0)
preds5 = np.where(mod5 > 0.3, 1, 0)

# print the first 20 predictions from each model
print(preds1[:20])
print(preds2[:20])
print(preds3[:20])
print(preds4[:20])
print(preds5[:20])

# add the predictions together: for each observation this counts how
# many of the five models predicted 1
sum_of_preds = preds1 + preds2 + preds3 + preds4 + preds5

# ensemble predicts 1 (the "correct response") if at least 3 of the 5
# models predict 1 (i.e. a majority vote)
ensemble_preds = np.where(sum_of_preds >= 3, 1, 0)

# print the ensemble's first 20 predictions
print(ensemble_preds[:20])

# how accurate was the ensemble?
# how accurate was the ensemble? (should exceed any single model's ~0.7)
print(ensemble_preds.mean())

# set a seed for reproducibility
np.random.seed(1)

# create an array of 0 to 9, then sample 10 times with replacement
# (this is the basic mechanic behind a bootstrap sample)
print(np.random.choice(a=10, size=10, replace=True))

import pandas as pd

# read in vehicle data
vehicles = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/used_vehicles.csv')

# convert car to 0 and truck to 1
vehicles['type'] = vehicles.type.map({'car': 0, 'truck': 1})

# print out data
print(vehicles)

# calculate the number of rows in vehicles
n_rows = vehicles.shape[0]

# set a seed for reproducibility
np.random.seed(123)

# create three bootstrap samples (will be used to select rows from the DataFrame)
sample1 = np.random.choice(a=n_rows, size=n_rows, replace=True)
sample2 = np.random.choice(a=n_rows, size=n_rows, replace=True)
sample3 = np.random.choice(a=n_rows, size=n_rows, replace=True)

# print samples
print(sample1)
print(sample2)
print(sample3)

# use sample1 to select rows from the DataFrame
print(vehicles.iloc[sample1, :])

from sklearn.tree import DecisionTreeRegressor

# grow one regression tree with each bootstrapped training set;
# column 0 is the target (price) and columns 1+ are the features
treereg1 = DecisionTreeRegressor(random_state=123)
treereg1.fit(vehicles.iloc[sample1, 1:], vehicles.iloc[sample1, 0])
treereg2 = DecisionTreeRegressor(random_state=123)
treereg2.fit(vehicles.iloc[sample2, 1:], vehicles.iloc[sample2, 0])
treereg3 = DecisionTreeRegressor(random_state=123)
treereg3.fit(vehicles.iloc[sample3, 1:], vehicles.iloc[sample3, 0])

# read in out-of-sample data
oos = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/used_vehicles_oos.csv')

# convert car to 0 and truck to 1
oos['type'] = oos.type.map({'car': 0, 'truck': 1})

# print data
print(oos)

# select feature columns (every column except for the 0th column)
feature_cols = vehicles.columns[1:]

# make predictions on out-of-sample data with each bagged tree
preds1 = treereg1.predict(oos[feature_cols])
preds2 = treereg2.predict(oos[feature_cols])
preds3 = treereg3.predict(oos[feature_cols])

# print predictions
print(preds1)
print(preds2)
print(preds3)

# average the three trees' predictions and compare to the actual values
print((preds1 + preds2 + preds3) / 3)
print(oos.price.values)

# set is a data structure used to identify unique elements
print(set(range(14)))

# only show the unique elements in sample1
print(set(sample1))

# use the "set difference" to identify the out-of-bag observations
# (rows never drawn into the bootstrap sample) for each tree
print(sorted(set(range(14)) - set(sample1)))
print(sorted(set(range(14)) - set(sample2)))
print(sorted(set(range(14)) - set(sample3)))

# read in the Titanic data
titanic = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/titanic.csv')

# encode sex feature as 0/1
titanic['sex'] = titanic.sex.map({'female': 0, 'male': 1})

# fill in missing values for age with the mean age
# (direct assignment instead of chained `inplace=True` fillna, which is
# deprecated and unreliable in modern pandas)
titanic['age'] = titanic.age.fillna(titanic.age.mean())

# create three dummy variables, drop the first dummy variable, and store this as a DataFrame
embarked_dummies = pd.get_dummies(titanic.embarked, prefix='embarked').iloc[:, 1:]

# concatenate the two dummy variable columns onto the original DataFrame
# note: axis=0 means rows, axis=1 means columns
titanic = pd.concat([titanic, embarked_dummies], axis=1)

# create a list of feature columns
feature_cols = ['pclass', 'sex', 'age', 'embarked_Q', 'embarked_S']

# print the updated DataFrame
print(titanic.head(10))

# import class, instantiate estimator, fit with all data
from sklearn.ensemble import RandomForestClassifier

# max_features='sqrt' replaces the removed 'auto' option (they were
# equivalent for classifiers); oob_score=True enables the out-of-bag
# accuracy estimate below
rfclf = RandomForestClassifier(n_estimators=100, max_features='sqrt', oob_score=True, random_state=1)
rfclf.fit(titanic[feature_cols], titanic.survived)

# compute the feature importances
print(pd.DataFrame({'feature': feature_cols, 'importance': rfclf.feature_importances_}))

# compute the out-of-bag classification accuracy
print(rfclf.oob_score_)