import numpy as np

# Simulate five independent classifiers and show that a majority-vote
# ensemble is more accurate than any single model.

# set a seed for reproducibility
np.random.seed(1234)

# generate 1000 random numbers (between 0 and 1) for each model,
# representing 1000 observations
mod1 = np.random.rand(1000)
mod2 = np.random.rand(1000)
mod3 = np.random.rand(1000)
mod4 = np.random.rand(1000)
mod5 = np.random.rand(1000)

# each model independently predicts 1 (the "correct response") when its
# random number is strictly greater than 0.3, so each individual model
# is roughly 70% accurate
preds1 = np.where(mod1 > 0.3, 1, 0)
preds2 = np.where(mod2 > 0.3, 1, 0)
preds3 = np.where(mod3 > 0.3, 1, 0)
preds4 = np.where(mod4 > 0.3, 1, 0)
preds5 = np.where(mod5 > 0.3, 1, 0)

# print the first 20 predictions from each model
print(preds1[:20])
print(preds2[:20])
print(preds3[:20])
print(preds4[:20])
print(preds5[:20])

# add the predictions together: for each observation this counts how
# many of the five models predicted 1
sum_of_preds = preds1 + preds2 + preds3 + preds4 + preds5

# ensemble predicts 1 (the "correct response") if at least 3 of the 5
# models predict 1 (i.e. a majority vote)
ensemble_preds = np.where(sum_of_preds >= 3, 1, 0)

# print the ensemble's first 20 predictions
print(ensemble_preds[:20])

# how accurate was the ensemble?
# how accurate was the ensemble? (should exceed any single model's ~0.7)
print(ensemble_preds.mean())

# set a seed for reproducibility
np.random.seed(1)

# create an array of 0 to 9, then sample 10 times with replacement
# (this is the basic mechanic behind a bootstrap sample)
print(np.random.choice(a=10, size=10, replace=True))

import pandas as pd

# read in vehicle data
vehicles = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/used_vehicles.csv')

# convert car to 0 and truck to 1
vehicles['type'] = vehicles.type.map({'car': 0, 'truck': 1})

# print out data
print(vehicles)

# calculate the number of rows in vehicles
n_rows = vehicles.shape[0]

# set a seed for reproducibility
np.random.seed(123)

# create three bootstrap samples (will be used to select rows from the DataFrame)
sample1 = np.random.choice(a=n_rows, size=n_rows, replace=True)
sample2 = np.random.choice(a=n_rows, size=n_rows, replace=True)
sample3 = np.random.choice(a=n_rows, size=n_rows, replace=True)

# print samples
print(sample1)
print(sample2)
print(sample3)

# use sample1 to select rows from the DataFrame
print(vehicles.iloc[sample1, :])

from sklearn.tree import DecisionTreeRegressor

# grow one regression tree with each bootstrapped training set;
# column 0 is the target (price) and columns 1+ are the features
treereg1 = DecisionTreeRegressor(random_state=123)
treereg1.fit(vehicles.iloc[sample1, 1:], vehicles.iloc[sample1, 0])
treereg2 = DecisionTreeRegressor(random_state=123)
treereg2.fit(vehicles.iloc[sample2, 1:], vehicles.iloc[sample2, 0])
treereg3 = DecisionTreeRegressor(random_state=123)
treereg3.fit(vehicles.iloc[sample3, 1:], vehicles.iloc[sample3, 0])

# read in out-of-sample data
oos = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/used_vehicles_oos.csv')

# convert car to 0 and truck to 1
oos['type'] = oos.type.map({'car': 0, 'truck': 1})

# print data
print(oos)

# select feature columns (every column except for the 0th column)
feature_cols = vehicles.columns[1:]

# make predictions on out-of-sample data with each bagged tree
preds1 = treereg1.predict(oos[feature_cols])
preds2 = treereg2.predict(oos[feature_cols])
preds3 = treereg3.predict(oos[feature_cols])

# print predictions
print(preds1)
print(preds2)
print(preds3)

# average the three trees' predictions and compare to the actual values
print((preds1 + preds2 + preds3) / 3)
print(oos.price.values)

# set is a data structure used to identify unique elements
print(set(range(14)))

# only show the unique elements in sample1
print(set(sample1))

# use the "set difference" to identify the out-of-bag observations
# (rows never drawn into the bootstrap sample) for each tree
print(sorted(set(range(14)) - set(sample1)))
print(sorted(set(range(14)) - set(sample2)))
print(sorted(set(range(14)) - set(sample3)))

# read in the Titanic data
titanic = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/titanic.csv')

# encode sex feature as 0/1
titanic['sex'] = titanic.sex.map({'female': 0, 'male': 1})

# fill in missing values for age with the mean age
# (direct assignment instead of chained `inplace=True` fillna, which is
# deprecated and unreliable in modern pandas)
titanic['age'] = titanic.age.fillna(titanic.age.mean())

# create three dummy variables, drop the first dummy variable, and store this as a DataFrame
embarked_dummies = pd.get_dummies(titanic.embarked, prefix='embarked').iloc[:, 1:]

# concatenate the two dummy variable columns onto the original DataFrame
# note: axis=0 means rows, axis=1 means columns
titanic = pd.concat([titanic, embarked_dummies], axis=1)

# create a list of feature columns
feature_cols = ['pclass', 'sex', 'age', 'embarked_Q', 'embarked_S']

# print the updated DataFrame
print(titanic.head(10))

# import class, instantiate estimator, fit with all data
from sklearn.ensemble import RandomForestClassifier

# max_features='sqrt' replaces the removed 'auto' option (they were
# equivalent for classifiers); oob_score=True enables the out-of-bag
# accuracy estimate below
rfclf = RandomForestClassifier(n_estimators=100, max_features='sqrt', oob_score=True, random_state=1)
rfclf.fit(titanic[feature_cols], titanic.survived)

# compute the feature importances
print(pd.DataFrame({'feature': feature_cols, 'importance': rfclf.feature_importances_}))

# compute the out-of-bag classification accuracy
print(rfclf.oob_score_)