# vehicle data
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/vehicles_train.csv'
train = pd.read_csv(url)

# before splitting anything, just predict the mean of the entire dataset
train['prediction'] = train.price.mean()
train

# calculate RMSE for those predictions
from sklearn import metrics
import numpy as np
np.sqrt(metrics.mean_squared_error(train.price, train.prediction))

# define a function that calculates the RMSE for a given split of miles
def mileage_split(miles):
    lower_mileage_price = train[train.miles < miles].price.mean()
    higher_mileage_price = train[train.miles >= miles].price.mean()
    train['prediction'] = np.where(train.miles < miles, lower_mileage_price, higher_mileage_price)
    return np.sqrt(metrics.mean_squared_error(train.price, train.prediction))

# calculate RMSE for a tree that splits on miles < 50000
print('RMSE:', mileage_split(50000))
train

# calculate RMSE for a tree that splits on miles < 100000
print('RMSE:', mileage_split(100000))
train

# check all possible mileage splits
mileage_range = range(train.miles.min(), train.miles.max(), 1000)
RMSE = [mileage_split(miles) for miles in mileage_range]

# plot mileage cutpoint (x-axis) versus RMSE (y-axis)
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(mileage_range, RMSE)
plt.xlabel('Mileage cutpoint')
plt.ylabel('RMSE (lower is better)')
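# the plot shows the RMSE for every candidate cutpoint; as a convenience, the
# short sketch below reads the best cutpoint straight out of the list
# (using np.argmin here is my addition, not part of the original walkthrough)
best_index = int(np.argmin(RMSE))
print('lowest RMSE:', RMSE[best_index], 'at cutpoint miles <', mileage_range[best_index])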
# encode car as 0 and truck as 1
train['vtype'] = train.vtype.map({'car':0, 'truck':1})

# define X and y
feature_cols = ['year', 'miles', 'doors', 'vtype']
X = train[feature_cols]
y = train.price

# instantiate a DecisionTreeRegressor (with random_state=1)
from sklearn.tree import DecisionTreeRegressor
treereg = DecisionTreeRegressor(random_state=1)
treereg

# use leave-one-out cross-validation (LOOCV) to estimate the RMSE for this model
# (the training data has 14 rows, so cv=14 leaves out one observation per fold)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(treereg, X, y, cv=14, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-scores))

# try different values of max_depth one-by-one
treereg = DecisionTreeRegressor(max_depth=1, random_state=1)
scores = cross_val_score(treereg, X, y, cv=14, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-scores))

# list of values to try
max_depth_range = range(1, 8)

# list to store the average RMSE for each value of max_depth
RMSE_scores = []

# use LOOCV with each value of max_depth
for depth in max_depth_range:
    treereg = DecisionTreeRegressor(max_depth=depth, random_state=1)
    MSE_scores = cross_val_score(treereg, X, y, cv=14, scoring='neg_mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

# plot max_depth (x-axis) versus RMSE (y-axis)
plt.plot(max_depth_range, RMSE_scores)
plt.xlabel('max_depth')
plt.ylabel('RMSE (lower is better)')

# max_depth=3 was best, so fit a tree using that parameter
treereg = DecisionTreeRegressor(max_depth=3, random_state=1)
treereg.fit(X, y)

# feature importances: the (normalized) total reduction of error (here, MSE) brought by each feature
pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_})

# create a GraphViz file
from sklearn.tree import export_graphviz
export_graphviz(treereg, out_file='tree_vehicles.dot', feature_names=feature_cols)

# at the command line, run this to convert to PNG:
# dot -Tpng tree_vehicles.dot -o tree_vehicles.png

# read the testing data
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/vehicles_test.csv'
test = pd.read_csv(url)
test['vtype'] = test.vtype.map({'car':0, 'truck':1})
test

# use the fitted model to make predictions on the testing data
X_test = test[feature_cols]
y_test = test.price
y_pred = treereg.predict(X_test)
y_pred

# calculate RMSE
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# calculate RMSE for your own tree!
# (the y_pred values below are placeholders: replace them with the predictions from your hand-built tree)
y_test = [3000, 6000, 12000]
y_pred = [0, 0, 0]
from sklearn import metrics
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# read in the Titanic data
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/titanic.csv'
titanic = pd.read_csv(url)
titanic.head(10)

# encode female as 0 and male as 1
titanic['Sex'] = titanic.Sex.map({'female':0, 'male':1})

# fill in the missing values for Age with the mean age
titanic.Age.fillna(titanic.Age.mean(), inplace=True)

# create three dummy variables, drop the first one, and store the two remaining columns as a DataFrame
embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked').iloc[:, 1:]

# concatenate the two dummy variable columns onto the original DataFrame
titanic = pd.concat([titanic, embarked_dummies], axis=1)

# print the updated DataFrame
titanic.head(10)

# define X and y
feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']
X = titanic[feature_cols]
y = titanic.Survived

# fit a classification tree with max_depth=3 on all of the data
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X, y)

# create a GraphViz file
export_graphviz(treeclf, out_file='tree_titanic.dot', feature_names=feature_cols)

# at the command line, run this to convert to PNG:
# dot -Tpng tree_titanic.dot -o tree_titanic.png

# compute the feature importances
pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})
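# as an alternative to the command-line GraphViz step above, newer versions of
# scikit-learn can draw the tree inline; this is a minimal sketch, assuming
# scikit-learn >= 0.21 (which added sklearn.tree.plot_tree)
from sklearn.tree import plot_tree
plt.figure(figsize=(12, 6))
plot_tree(treeclf, feature_names=feature_cols, class_names=['died', 'survived'], filled=True)  # class 0 = died, class 1 = survived
plt.show()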
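# the classification tree above was fit on all of the data, so its performance
# hasn't been estimated on unseen data; the lines below are a minimal sketch of
# a cross-validated accuracy estimate (the fold count and metric are my
# assumptions, chosen to mirror the LOOCV pattern used for the regression tree)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(treeclf, X, y, cv=10, scoring='accuracy')
print('cross-validated accuracy:', scores.mean())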