# import pandas
import pandas as pd

# read in vehicle data
vehicles = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/used_vehicles.csv')

# print out data
vehicles

# convert car to 0 and truck to 1
vehicles['type'] = vehicles.type.map({'car':0, 'truck':1})

# select feature columns (every column except for the 0th column)
feature_cols = vehicles.columns[1:]

# define X (features) and y (response)
X = vehicles[feature_cols]
y = vehicles.price

# split into train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# print out each of the arrays
print(X_train)
print(y_train)
print(X_test)
print(y_test)

# import class, instantiate estimator, fit with training set
from sklearn.tree import DecisionTreeRegressor
treereg = DecisionTreeRegressor(random_state=1)
treereg.fit(X_train, y_train)

# make predictions on the test set
preds = treereg.predict(X_test)

# print predictions and actual values
print(preds)
print(y_test)

# print RMSE
from sklearn import metrics
import numpy as np
np.sqrt(metrics.mean_squared_error(y_test, preds))

# use cross-validation to find the best max_depth
from sklearn.model_selection import cross_val_score

# try max_depth=2
treereg = DecisionTreeRegressor(max_depth=2, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-scores))

# try max_depth=3
treereg = DecisionTreeRegressor(max_depth=3, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-scores))

# try max_depth=4 (see the loop sketch at the end of this section for a more compact search)
treereg = DecisionTreeRegressor(max_depth=4, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-scores))

# max_depth=3 was best, so fit a tree using that parameter with ALL DATA
treereg = DecisionTreeRegressor(max_depth=3, random_state=1)
treereg.fit(X, y)

# compute the "Gini importance" of each feature: the (normalized) total reduction of MSE brought by that feature
pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_})

# create a Graphviz file (open in text mode, since export_graphviz writes strings)
from sklearn.tree import export_graphviz
with open("15_vehicles.dot", 'w') as f:
    export_graphviz(treereg, out_file=f, feature_names=feature_cols)

# at the command line, run this to convert to PNG:
# dot -Tpng 15_vehicles.dot -o 15_vehicles.png

# read in out-of-sample data
oos = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/used_vehicles_oos.csv')

# convert car to 0 and truck to 1
oos['type'] = oos.type.map({'car':0, 'truck':1})

# print data
oos

# define X and y
X_oos = oos[feature_cols]
y_oos = oos.price

# make predictions on out-of-sample data
preds = treereg.predict(X_oos)

# print predictions and actual values
print(preds)
print(y_oos.values)

# print RMSE
np.sqrt(metrics.mean_squared_error(y_oos, preds))
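
# A minimal sketch (an addition, not part of the original walkthrough) of the same
# max_depth search written as a loop, so each candidate depth isn't a separate
# copy-pasted block. It assumes X, y, and the imports above are still in scope;
# the search range of 1-7 is an arbitrary illustrative choice.
for depth in range(1, 8):
    treereg_cv = DecisionTreeRegressor(max_depth=depth, random_state=1)
    scores = cross_val_score(treereg_cv, X, y, cv=3, scoring='neg_mean_squared_error')
    print(depth, np.mean(np.sqrt(-scores)))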
# print RMSE for the tree you created!
your_preds = [4000, 5000, 13500]
np.sqrt(metrics.mean_squared_error(y_oos, your_preds))

# read in the Titanic data
titanic = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/titanic.csv')
titanic.head(10)

# look for missing values
titanic.isnull().sum()

# encode sex feature
titanic['sex'] = titanic.sex.map({'female':0, 'male':1})

# fill in missing values for age with the mean age
titanic['age'] = titanic.age.fillna(titanic.age.mean())

# print the updated DataFrame
titanic.head(10)

# create three dummy variables using get_dummies
pd.get_dummies(titanic.embarked, prefix='embarked').head(10)

# create three dummy variables, drop the first dummy variable, and store this as a DataFrame
embarked_dummies = pd.get_dummies(titanic.embarked, prefix='embarked').iloc[:, 1:]

# concatenate the two remaining dummy variable columns onto the original DataFrame
# note: axis=0 means rows, axis=1 means columns
titanic = pd.concat([titanic, embarked_dummies], axis=1)

# print the updated DataFrame
titanic.head(10)

# create a list of feature columns
feature_cols = ['pclass', 'sex', 'age', 'embarked_Q', 'embarked_S']

# define X and y
X = titanic[feature_cols]
y = titanic.survived

# fit a classification tree with max_depth=3 on all data
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X, y)

# create a Graphviz file
with open("15_titanic.dot", 'w') as f:
    export_graphviz(treeclf, out_file=f, feature_names=feature_cols)

# compute the feature importances
pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})
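
# A minimal sketch (an addition, not part of the original walkthrough) of
# estimating how well this classification tree generalizes, using cross-validated
# accuracy; it assumes X, y, and the imports above are in scope, and cv=3 simply
# mirrors the regression example rather than being a tuned choice.
treeclf_cv = DecisionTreeClassifier(max_depth=3, random_state=1)
scores = cross_val_score(treeclf_cv, X, y, cv=3, scoring='accuracy')
print(np.mean(scores))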