%pylab inline
Populating the interactive namespace from numpy and matplotlib
import pandas as pd
from matplotlib import pyplot as plt
vehicles = pd.read_csv('./data/used_vehicles.csv')
# keep a copy with the original string labels for exploration
df = vehicles.copy()
# encode the type feature as an integer for modeling later
vehicles['type'] = vehicles.type.map({'car':0, 'truck':1})
df_car = df[df.type=='car']
df_car_2_door = df_car[df_car.doors==2]
df_car_2_door
| | price | year | miles | doors | type
---|---|---|---|---|---
0 | 22000 | 2012 | 13000 | 2 | car |
1 | 14000 | 2010 | 30000 | 2 | car |
5 | 4000 | 2006 | 124000 | 2 | car |
8 | 3000 | 2003 | 138000 | 2 | car |
df_2d = df[df.doors==2]
df_4d = df[df.doors==4]
df_2d.sort('miles')
| | price | year | miles | doors | type
---|---|---|---|---|---
0 | 22000 | 2012 | 13000 | 2 | car |
1 | 14000 | 2010 | 30000 | 2 | car |
5 | 4000 | 2006 | 124000 | 2 | car |
8 | 3000 | 2003 | 138000 | 2 | car |
12 | 1800 | 1999 | 163000 | 2 | truck |
10 | 2500 | 2003 | 190000 | 2 | truck |
df_4d.sort('miles').mean()
price      5587.500
year       2004.375
miles    118062.500
doors         4.000
dtype: float64
df.groupby('miles').price.mean()
miles
13000     22000
30000     14000
47000      9000
62000      5000
73500     13000
78000      9500
124000     4000
138000     2150
160000     1900
163000     1800
177000     3000
190000     2500
209000     2000
Name: price, dtype: int64
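As an aside, `groupby` isn't limited to a single key/aggregate pair; a minimal sketch (on the same exploration frame) of the average price and mileage per vehicle type:
# sketch: average price and mileage, broken out by vehicle type
df.groupby('type')[['price', 'miles']].mean()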
feature_cols = vehicles.columns[1:]
X = vehicles[feature_cols]
y = vehicles.price
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
from sklearn.tree import DecisionTreeRegressor
treereg = DecisionTreeRegressor(random_state=1)
treereg.fit(X_train, y_train)
DecisionTreeRegressor(compute_importances=None, criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, random_state=1, splitter='best')
preds = treereg.predict(X_test)
# print predictions and actual values
print preds
print y_test
[ 5000.  1900.  1900.  5000.]
[ 9500  2000  3000 13000]
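Eyeballing four predictions only goes so far. As a sketch, the same comparison can be summarized as RMSE on the test set (the metric the cross-validation below also uses):
# root mean squared error of the test-set predictions
from sklearn import metrics
print np.sqrt(metrics.mean_squared_error(y_test, preds))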
from sklearn.cross_validation import cross_val_score
treereg = DecisionTreeRegressor(max_depth=2, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
np.mean(np.sqrt(-scores))
4804.3767888427128
treereg = DecisionTreeRegressor(max_depth=3, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
np.mean(np.sqrt(-scores))
4592.1554255755254
# try max_depth=4
treereg = DecisionTreeRegressor(max_depth=4, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
np.mean(np.sqrt(-scores))
4704.0052694797387
# max_depth=3 was best, so fit a tree using that parameter with ALL DATA
treereg = DecisionTreeRegressor(max_depth=3, random_state=1)
treereg.fit(X, y)
DecisionTreeRegressor(compute_importances=None, criterion='mse', max_depth=3, max_features=None, max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, random_state=1, splitter='best')
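To see what the fitted tree actually learned, one option is exporting it to Graphviz; a sketch, where the filename tree_vehicles.dot is just an illustrative choice:
# write the tree structure to a .dot file;
# render it from a shell with: dot -Tpng tree_vehicles.dot -o tree_vehicles.png
from sklearn.tree import export_graphviz
export_graphviz(treereg, out_file='tree_vehicles.dot', feature_names=list(feature_cols))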
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
all_scores = []
best_score = -1
best_depth = 0
for i in range(1, 6):
    treereg = DecisionTreeRegressor(max_depth=i, random_state=1)
    scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
    current_score = np.mean(np.sqrt(-scores))
    # if this RMSE is better (lower) than the current best,
    # or best is still the default (-1), then update!
    if current_score < best_score or best_score == -1:
        best_score = current_score
        best_depth = i
    # store every score so we can plot them all
    all_scores.append(current_score)
print "Best score: %s" % best_score
print "Best depth: %s" % best_depth
# now actually fit the model
treereg = DecisionTreeRegressor(max_depth=best_depth, random_state=1)
treereg.fit(X, y)
plt.figure()
plt.plot(range(1, 6), all_scores)
plt.xlabel('max tree depth')
Best score: 4592.15542558
Best depth: 3
<matplotlib.text.Text at 0x10a217610>
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
all_scores = []
best_score = -1
best_min_samples_leaf = 0
for i in range(1, 12):
    treereg = DecisionTreeRegressor(min_samples_leaf=i, random_state=1)
    scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
    current_score = np.mean(np.sqrt(-scores))
    # if this RMSE is better (lower) than the current best,
    # or best is still the default (-1), then update!
    if current_score < best_score or best_score == -1:
        best_score = current_score
        best_min_samples_leaf = i
    # store every score so we can plot them all
    all_scores.append(current_score)
print "Best score: %s" % best_score
print "min_samples_leaf: %s" % best_min_samples_leaf
# now actually fit the model
treereg = DecisionTreeRegressor(min_samples_leaf=best_min_samples_leaf, random_state=1)
treereg.fit(X, y)
plt.figure()
plt.plot(range(1, 12), all_scores)
plt.xlabel('min_samples_leaf')
Best score: 4607.62855534
min_samples_leaf: 3
<matplotlib.text.Text at 0x10a66a0d0>
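The two manual loops above can also be collapsed into a single exhaustive search. A sketch using GridSearchCV, assuming the same older sklearn API used throughout this notebook (grid search lived in sklearn.grid_search before being renamed model_selection):
from sklearn.grid_search import GridSearchCV
# search both parameters jointly with 3-fold cross-validation
param_grid = {'max_depth': range(1, 6), 'min_samples_leaf': range(1, 12)}
grid = GridSearchCV(DecisionTreeRegressor(random_state=1), param_grid,
                    cv=3, scoring='mean_squared_error')
grid.fit(X, y)
print grid.best_params_
# note: sqrt of the mean MSE, so it can differ slightly from the mean-of-RMSEs above
print np.sqrt(-grid.best_score_)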
# read in the data
titanic = pd.read_csv('./data/titanic.csv')
titanic.head(10)
| | survived | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked
---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 1 | 3 | Heikkinen, Miss. Laina | female | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 0 | 3 | Allen, Mr. William Henry | male | 35 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14 | 1 | 0 | 237736 | 30.0708 | NaN | C |
# look for missing values
titanic.isnull().sum()
survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
ticket        0
fare          0
cabin       687
embarked      2
dtype: int64
# make a copy so the raw titanic DataFrame stays untouched
titanic_c = titanic.copy()
# encode sex feature
titanic_c['sex'] = titanic.sex.map({'female':0, 'male':1})
# fill in missing values for age
titanic_c.age.fillna(titanic.age.mean(), inplace=True)
# is there a more intelligent way we might handle age?
#titanic_c.age.fillna(, inplace=True)
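One arguably smarter alternative (a sketch only, left commented out since age was already filled with the global mean above): impute each missing age with the median age of passengers in the same pclass/sex group.
# titanic_c['age'] = titanic.groupby(['pclass', 'sex']).age.transform(
#     lambda s: s.fillna(s.median()))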
# print the updated DataFrame
titanic_c.head(10)
| | survived | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked
---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 0 | 3 | Moran, Mr. James | 1 | 29.699118 | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 0 | 1 | McCarthy, Mr. Timothy J | 1 | 54.000000 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 0 | 3 | Palsson, Master. Gosta Leonard | 1 | 2.000000 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | 0 | 27.000000 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | 0 | 14.000000 | 1 | 0 | 237736 | 30.0708 | NaN | C |
# create three dummy variables using get_dummies
pd.get_dummies(titanic_c.embarked, prefix='embarked').head(10)
| | embarked_C | embarked_Q | embarked_S
---|---|---|---
0 | 0 | 0 | 1 |
1 | 1 | 0 | 0 |
2 | 0 | 0 | 1 |
3 | 0 | 0 | 1 |
4 | 0 | 0 | 1 |
5 | 0 | 1 | 0 |
6 | 0 | 0 | 1 |
7 | 0 | 0 | 1 |
8 | 0 | 0 | 1 |
9 | 1 | 0 | 0 |
# create three dummy variables, drop the first dummy variable, and store this as a DataFrame
embarked_dummies = pd.get_dummies(titanic_c.embarked, prefix='embarked').iloc[:, 1:]
# join the two dummy variable columns onto the original DataFrame
titanic_c = titanic_c.join(embarked_dummies)
# print the updated DataFrame
titanic_c.head(10)
| | survived | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | embarked_Q | embarked_S
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 | 1 |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 0 |
2 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 1 |
3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 | 1 |
4 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S | 0 | 1 |
5 | 0 | 3 | Moran, Mr. James | 1 | 29.699118 | 0 | 0 | 330877 | 8.4583 | NaN | Q | 1 | 0 |
6 | 0 | 1 | McCarthy, Mr. Timothy J | 1 | 54.000000 | 0 | 0 | 17463 | 51.8625 | E46 | S | 0 | 1 |
7 | 0 | 3 | Palsson, Master. Gosta Leonard | 1 | 2.000000 | 3 | 1 | 349909 | 21.0750 | NaN | S | 0 | 1 |
8 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | 0 | 27.000000 | 0 | 2 | 347742 | 11.1333 | NaN | S | 0 | 1 |
9 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | 0 | 14.000000 | 1 | 0 | 237736 | 30.0708 | NaN | C | 0 | 0 |
# create a list of feature columns
feature_cols = ['pclass', 'sex', 'age', 'embarked_Q', 'embarked_S']
# define X and y
X = titanic_c[feature_cols]
y = titanic_c.survived
# fit a classification tree with max_depth=3 on all data
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X, y)
DecisionTreeClassifier(compute_importances=None, criterion='gini', max_depth=3, max_features=None, max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, random_state=1, splitter='best')
# compute the feature importances
pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})
| | feature | importance
---|---|---
0 | pclass | 0.242664 |
1 | sex | 0.655584 |
2 | age | 0.064494 |
3 | embarked_Q | 0.000000 |
4 | embarked_S | 0.037258 |
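The importances come from a tree fit on all the data; as a quick generalization check, a sketch of 3-fold cross-validated accuracy for the same classifier (cross_val_score was imported earlier):
# cross-validated accuracy of the classification tree
scores = cross_val_score(treeclf, X, y, cv=3, scoring='accuracy')
print np.mean(scores)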
from sklearn import metrics
# Compute the confusion matrix
conf = metrics.confusion_matrix(y, treeclf.predict(X))
print conf
[[524  25]
 [133 209]]
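Rows are the true classes (died, survived) and columns the predicted ones, so the diagonal counts correct predictions: (524 + 209) / 891 ≈ 0.823 training accuracy. The same figure, as a sketch, straight from the matrix:
# fraction of correct (diagonal) predictions on the training data
print conf.diagonal().sum() / float(conf.sum())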
import numpy as np
def find_expected_value(confusion, cost_benefit):
    # if you pass in a probability matrix instead, this next line returns the same matrix back
    probabilities = confusion.astype('float') / confusion.sum()
    return (probabilities * cost_benefit.astype('float')).sum()
cb = np.array([[45.0, -30.0], [0, 0]])
conf = np.array([[45423, 13041], [98724, 12324]])
print find_expected_value(conf, cb)
9.7503716551
lemons_confusion = np.array([[292, 66], [5997, 44733]])
lemons_cb = np.array([[0, 0], [-6278.22, 672.82]])
find_expected_value(lemons_confusion, lemons_cb)
-147.84740604447222
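As a sanity check of the arithmetic: (5997 × -6278.22 + 44733 × 672.82) / 51088 ≈ -147.85, so under these assumed costs and benefits the classifier loses roughly 148 per vehicle on average (in whatever currency units the cost-benefit matrix is expressed in).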