%pylab inline
Populating the interactive namespace from numpy and matplotlib
import pandas as pd
from matplotlib import pyplot as plt
vehicles = pd.read_csv('./data/used_vehicles.csv')
# keep a copy with the original string labels for exploration
df = vehicles.copy()
# encode the type feature as an integer for modeling later
vehicles['type'] = vehicles.type.map({'car':0, 'truck':1})
df_car = df[df.type=='car']
df_car_2_door = df_car[df_car.doors==2]
df_car_2_door
| | price | year | miles | doors | type
---|---|---|---|---|---
0 | 22000 | 2012 | 13000 | 2 | car |
1 | 14000 | 2010 | 30000 | 2 | car |
5 | 4000 | 2006 | 124000 | 2 | car |
8 | 3000 | 2003 | 138000 | 2 | car |
df_2d = df[df.doors==2]
df_4d = df[df.doors==4]
df_2d.sort('miles')
| | price | year | miles | doors | type
---|---|---|---|---|---
0 | 22000 | 2012 | 13000 | 2 | car |
1 | 14000 | 2010 | 30000 | 2 | car |
5 | 4000 | 2006 | 124000 | 2 | car |
8 | 3000 | 2003 | 138000 | 2 | car |
12 | 1800 | 1999 | 163000 | 2 | truck |
10 | 2500 | 2003 | 190000 | 2 | truck |
df_4d.sort('miles').mean()
price      5587.500
year       2004.375
miles    118062.500
doors         4.000
dtype: float64
df.groupby('miles').price.mean()
miles
13000     22000
30000     14000
47000      9000
62000      5000
73500     13000
78000      9500
124000     4000
138000     2150
160000     1900
163000     1800
177000     3000
190000     2500
209000     2000
Name: price, dtype: int64
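As an aside, `groupby` isn't limited to a single key/aggregate pair; a minimal sketch (on the same exploration frame) of the average price and mileage per vehicle type:
# sketch: average price and mileage, broken out by vehicle type
df.groupby('type')[['price', 'miles']].mean()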
feature_cols = vehicles.columns[1:]
X = vehicles[feature_cols]
y = vehicles.price
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
from sklearn.tree import DecisionTreeRegressor
treereg = DecisionTreeRegressor(random_state=1)
treereg.fit(X_train, y_train)
DecisionTreeRegressor(compute_importances=None, criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, random_state=1, splitter='best')
preds = treereg.predict(X_test)
# print predictions and actual values
print preds
print y_test
[ 5000.  1900.  1900.  5000.]
[ 9500  2000  3000 13000]
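Eyeballing four predictions only goes so far. As a sketch, the same comparison can be summarized as RMSE on the test set (the metric the cross-validation below also uses):
# root mean squared error of the test-set predictions
from sklearn import metrics
print np.sqrt(metrics.mean_squared_error(y_test, preds))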
from sklearn.cross_validation import cross_val_score
treereg = DecisionTreeRegressor(max_depth=2, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
np.mean(np.sqrt(-scores))
4804.3767888427128
treereg = DecisionTreeRegressor(max_depth=3, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
np.mean(np.sqrt(-scores))
4592.1554255755254
# try max_depth=4
treereg = DecisionTreeRegressor(max_depth=4, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
np.mean(np.sqrt(-scores))
4704.0052694797387
# max_depth=3 was best, so fit a tree using that parameter with ALL DATA
treereg = DecisionTreeRegressor(max_depth=3, random_state=1)
treereg.fit(X, y)
DecisionTreeRegressor(compute_importances=None, criterion='mse', max_depth=3, max_features=None, max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, random_state=1, splitter='best')
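To see what the fitted tree actually learned, one option is exporting it to Graphviz; a sketch, where the filename tree_vehicles.dot is just an illustrative choice:
# write the tree structure to a .dot file;
# render it from a shell with: dot -Tpng tree_vehicles.dot -o tree_vehicles.png
from sklearn.tree import export_graphviz
export_graphviz(treereg, out_file='tree_vehicles.dot', feature_names=list(feature_cols))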
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
all_scores = []
best_score = -1
best_depth = 0
for i in range(1, 6):
    treereg = DecisionTreeRegressor(max_depth=i, random_state=1)
    scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
    current_score = np.mean(np.sqrt(-scores))
    # if this RMSE is better (lower) than the current best,
    # or best is still the default (-1), then update!
    if current_score < best_score or best_score == -1:
        best_score = current_score
        best_depth = i
    # store every score so we can plot them all
    all_scores.append(current_score)
print "Best score: %s" % best_score
print "Best depth: %s" % best_depth
# now actually fit the model
treereg = DecisionTreeRegressor(max_depth=best_depth, random_state=1)
treereg.fit(X, y)
plt.figure()
plt.plot(range(1, 6), all_scores)
plt.xlabel('max tree depth')
Best score: 4592.15542558
Best depth: 3
<matplotlib.text.Text at 0x10a217610>
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
all_scores = []
best_score = -1
best_min_samples_leaf = 0
for i in range(1, 12):
    treereg = DecisionTreeRegressor(min_samples_leaf=i, random_state=1)
    scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
    current_score = np.mean(np.sqrt(-scores))
    # if this RMSE is better (lower) than the current best,
    # or best is still the default (-1), then update!
    if current_score < best_score or best_score == -1:
        best_score = current_score
        best_min_samples_leaf = i
    # store every score so we can plot them all
    all_scores.append(current_score)
print "Best score: %s" % best_score
print "min_samples_leaf: %s" % best_min_samples_leaf
# now actually fit the model
treereg = DecisionTreeRegressor(min_samples_leaf=best_min_samples_leaf, random_state=1)
treereg.fit(X, y)
plt.figure()
plt.plot(range(1, 12), all_scores)
plt.xlabel('min_samples_leaf')
Best score: 4607.62855534
min_samples_leaf: 3
<matplotlib.text.Text at 0x10a66a0d0>
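The two manual loops above can also be collapsed into a single exhaustive search. A sketch using GridSearchCV, assuming the same older sklearn API used throughout this notebook (grid search lived in sklearn.grid_search before being renamed model_selection):
from sklearn.grid_search import GridSearchCV
# search both parameters jointly with 3-fold cross-validation
param_grid = {'max_depth': range(1, 6), 'min_samples_leaf': range(1, 12)}
grid = GridSearchCV(DecisionTreeRegressor(random_state=1), param_grid,
                    cv=3, scoring='mean_squared_error')
grid.fit(X, y)
print grid.best_params_
# note: sqrt of the mean MSE, so it can differ slightly from the mean-of-RMSEs above
print np.sqrt(-grid.best_score_)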
# read in the data
titanic = pd.read_csv('./data/titanic.csv')
titanic.head(10)
| | survived | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked
---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 1 | 3 | Heikkinen, Miss. Laina | female | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 0 | 3 | Allen, Mr. William Henry | male | 35 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14 | 1 | 0 | 237736 | 30.0708 | NaN | C |
# look for missing values
titanic.isnull().sum()
survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
ticket        0
fare          0
cabin       687
embarked      2
dtype: int64
# make a copy so the raw titanic DataFrame stays untouched
titanic_c = titanic.copy()
# encode sex feature
titanic_c['sex'] = titanic.sex.map({'female':0, 'male':1})
# fill in missing values for age
titanic_c.age.fillna(titanic.age.mean(), inplace=True)
# is there a more intelligent way we might handle age?
#titanic_c.age.fillna(, inplace=True)
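One arguably smarter alternative (a sketch only, left commented out since age was already filled with the global mean above): impute each missing age with the median age of passengers in the same pclass/sex group.
# titanic_c['age'] = titanic.groupby(['pclass', 'sex']).age.transform(
#     lambda s: s.fillna(s.median()))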
# print the updated DataFrame
titanic_c.head(10)
| | survived | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked
---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 0 | 3 | Moran, Mr. James | 1 | 29.699118 | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 0 | 1 | McCarthy, Mr. Timothy J | 1 | 54.000000 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 0 | 3 | Palsson, Master. Gosta Leonard | 1 | 2.000000 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | 0 | 27.000000 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | 0 | 14.000000 | 1 | 0 | 237736 | 30.0708 | NaN | C |
# create three dummy variables using get_dummies
pd.get_dummies(titanic_c.embarked, prefix='embarked').head(10)
| | embarked_C | embarked_Q | embarked_S
---|---|---|---
0 | 0 | 0 | 1 |
1 | 1 | 0 | 0 |
2 | 0 | 0 | 1 |
3 | 0 | 0 | 1 |
4 | 0 | 0 | 1 |
5 | 0 | 1 | 0 |
6 | 0 | 0 | 1 |
7 | 0 | 0 | 1 |
8 | 0 | 0 | 1 |
9 | 1 | 0 | 0 |
# create three dummy variables, drop the first dummy variable, and store this as a DataFrame
embarked_dummies = pd.get_dummies(titanic_c.embarked, prefix='embarked').iloc[:, 1:]
# join the two dummy variable columns onto the original DataFrame
titanic_c = titanic_c.join(embarked_dummies)
# print the updated DataFrame
titanic_c.head(10)
| | survived | pclass | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | embarked_Q | embarked_S
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 0 | 1 |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | 0 |
2 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 | 1 |
3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 | 1 |
4 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S | 0 | 1 |
5 | 0 | 3 | Moran, Mr. James | 1 | 29.699118 | 0 | 0 | 330877 | 8.4583 | NaN | Q | 1 | 0 |
6 | 0 | 1 | McCarthy, Mr. Timothy J | 1 | 54.000000 | 0 | 0 | 17463 | 51.8625 | E46 | S | 0 | 1 |
7 | 0 | 3 | Palsson, Master. Gosta Leonard | 1 | 2.000000 | 3 | 1 | 349909 | 21.0750 | NaN | S | 0 | 1 |
8 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | 0 | 27.000000 | 0 | 2 | 347742 | 11.1333 | NaN | S | 0 | 1 |
9 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | 0 | 14.000000 | 1 | 0 | 237736 | 30.0708 | NaN | C | 0 | 0 |
# create a list of feature columns
feature_cols = ['pclass', 'sex', 'age', 'embarked_Q', 'embarked_S']
# define X and y
X = titanic_c[feature_cols]
y = titanic_c.survived
# fit a classification tree with max_depth=3 on all data
from sklearn.tree import DecisionTreeClassifier
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X, y)
DecisionTreeClassifier(compute_importances=None, criterion='gini', max_depth=3, max_features=None, max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, random_state=1, splitter='best')
# compute the feature importances
pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})
| | feature | importance
---|---|---
0 | pclass | 0.242664 |
1 | sex | 0.655584 |
2 | age | 0.064494 |
3 | embarked_Q | 0.000000 |
4 | embarked_S | 0.037258 |
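The importances come from a tree fit on all the data; as a quick generalization check, a sketch of 3-fold cross-validated accuracy for the same classifier (cross_val_score was imported earlier):
# cross-validated accuracy of the classification tree
scores = cross_val_score(treeclf, X, y, cv=3, scoring='accuracy')
print np.mean(scores)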
from sklearn import metrics
# Compute the confusion matrix
conf = metrics.confusion_matrix(y, treeclf.predict(X))
print conf
[[524  25]
 [133 209]]
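Rows are the true classes (died, survived) and columns the predicted ones, so the diagonal counts correct predictions: (524 + 209) / 891 ≈ 0.823 training accuracy. The same figure, as a sketch, straight from the matrix:
# fraction of correct (diagonal) predictions on the training data
print conf.diagonal().sum() / float(conf.sum())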
import numpy as np
def find_expected_value(confusion, cost_benefit):
    # if you pass in a probability matrix instead, this next line returns the same matrix back
    probabilities = confusion.astype('float') / confusion.sum()
    return (probabilities * cost_benefit.astype('float')).sum()
cb = np.array([[45.0, -30.0], [0, 0]])
conf = np.array([[45423, 13041], [98724, 12324]])
print find_expected_value(conf, cb)
9.7503716551
lemons_confusion = np.array([[292, 66], [5997, 44733]])
lemons_cb = np.array([[0, 0], [-6278.22, 672.82]])
find_expected_value(lemons_confusion, lemons_cb)
-147.84740604447222
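As a sanity check of the arithmetic: (5997 × -6278.22 + 44733 × 672.82) / 51088 ≈ -147.85, so under these assumed costs and benefits the classifier loses roughly 148 per vehicle on average (in whatever currency units the cost-benefit matrix is expressed in).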