In [34]:

%pylab inline

Populating the interactive namespace from numpy and matplotlib

WARNING: pylab import has clobbered these variables: ['clf']
`%matplotlib` prevents importing * from pylab and numpy

In [35]:

import pandas as pd

In [36]:

import numpy as np
from __future__ import division
class Transformations(object):
    """since these transformations are all related, we'll nest them all under a feature norm class"""
    def mean_at_zero(self, arr):
        return np.array([i - np.mean(a) for i in arr])

    def norm_to_min_zero(self, arr):
        return np.array([i / max(a) for i in arr])
    
    def norm_to_absolute_min_zero(self, arr):
        """should be a range of 0 to 1, where 0 maintains its 0 value"""
        return np.array([(i-min(arr))/(max(arr)-min(arr)) for i in arr])
    
    def norm_to_neg_pos(self, arr):
        """should be a range of -1 to 1, where 0 represents the mean"""
        return np.array([(i-mean(arr))/(max(arr)-mean(arr)) for i in arr])
    
    def norm_by_std(self, arr):
        """should be a range where 0 represents the mean"""
        return np.array([(i-mean(arr))/std(arr) for i in arr])

In [37]:

transformer = Transformations()
a = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print transformer.norm_to_absolute_min_zero(a) == np.array([0.0, 0.25, 0.5, 0.75, 1.0])
print transformer.norm_to_neg_pos(a) == np.array([-1.0, -0.5, 0.0, 0.5, 1.0])
print transformer.norm_by_std(a) == np.array([-1.414213562373095, -0.7071067811865475, 0.0, 0.7071067811865475, 1.414213562373095])

[ True  True  True  True  True]
[ True  True  True  True  True]
[ True  True  True  True  True]

In [44]:

import pandas as pd
from sklearn import tree
from sklearn.cross_validation import cross_val_score

# Load in data and create sets. dropping all na columns on the live data set.
lemons = pd.read_csv('./data/lemons.csv')
lemons_oos = pd.read_csv('./data/lemons_oos.csv')

print lemons.dtypes

RefId                                  int64
IsBadBuy                               int64
PurchDate                             object
Auction                               object
VehYear                                int64
VehicleAge                             int64
Make                                  object
Model                                 object
Trim                                  object
SubModel                              object
Color                                 object
Transmission                          object
WheelTypeID                          float64
WheelType                             object
VehOdo                                 int64
Nationality                           object
Size                                  object
TopThreeAmericanName                  object
MMRAcquisitionAuctionAveragePrice    float64
MMRAcquisitionAuctionCleanPrice      float64
MMRAcquisitionRetailAveragePrice     float64
MMRAcquisitonRetailCleanPrice        float64
MMRCurrentAuctionAveragePrice        float64
MMRCurrentAuctionCleanPrice          float64
MMRCurrentRetailAveragePrice         float64
MMRCurrentRetailCleanPrice           float64
PRIMEUNIT                             object
AUCGUART                              object
BYRNO                                  int64
VNZIP1                                 int64
VNST                                  object
VehBCost                             float64
IsOnlineSale                           int64
WarrantyCost                           int64
dtype: object

In [45]:

lemons = lemons.dropna(axis=1)
# Generating a list of continuous data features from the describe dataframe. 
# Then, removing the two non-features (RefId is an index, IsBadBuy is the prediction value)
features = list(lemons.describe().columns)
features.remove('RefId')
features.remove('IsBadBuy')

best_score = -1
for depth in range(1, 10):
    scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=depth, random_state=1234),
                                lemons[features],
                                lemons.IsBadBuy,
                                scoring='roc_auc',
                                cv=5)
    if scores.mean() > best_score:
        best_depth = depth
        best_score = scores.mean()

# Is the best score we have better than each DummyClassifier type?
from sklearn import dummy, metrics
for strat in ['stratified', 'most_frequent', 'uniform']:
    dummyclf = dummy.DummyClassifier(strategy=strat).fit(lemons[features], lemons.IsBadBuy)
    print 'did better than %s?' % strat, metrics.roc_auc_score(lemons.IsBadBuy, dummyclf.predict(lemons[features])) < best_score

# seems so!

# Create a classifier and prediction.
clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=1234).fit(lemons[features], lemons.IsBadBuy)

y_pred = clf.predict(lemons_oos[features])

# Create a submission
submission = pd.DataFrame({ 'RefId' : lemons_oos.RefId, 'prediction' : y_pred })
submission.to_csv('submission.csv')

did better than stratified? True
did better than most_frequent? True
did better than uniform? True

In [46]:

lemons[features]

Out[46]:

	VehYear	VehicleAge	VehOdo	BYRNO	VNZIP1	VehBCost	IsOnlineSale	WarrantyCost
0	2006	3	89046	21973	33619	7100	0	1113
1	2004	5	93593	19638	33619	7600	0	1053
2	2005	4	69367	19638	33619	4000	0	1020
3	2004	5	81054	19638	33619	5600	0	594
4	2004	5	65328	19638	33619	4200	0	533
5	2005	4	79315	19638	33619	5400	0	1623
6	2006	3	74722	19638	33619	6900	0	1623
7	2003	6	72132	5546	33619	3300	0	1455
8	2005	4	80736	19638	33619	6800	0	1243
9	2003	6	75156	5546	33619	4900	0	1923
10	2004	5	84498	19638	33619	7100	0	1243
11	2002	7	66536	5546	33619	5800	0	2003
12	2006	3	59789	5546	33619	7700	0	671
13	2004	5	52106	5546	33619	4500	0	754
14	2002	7	88958	5546	33619	8000	0	2452
15	2004	5	76173	19638	33619	8800	0	920
16	2005	4	80064	19638	33619	5600	0	1763
17	2003	6	77694	19638	33619	8300	0	1923
18	2004	5	57723	19638	33619	7000	0	671
19	2005	4	78434	19638	33619	10700	0	1272
20	2001	8	82944	19638	33619	3600	0	2322
21	2003	6	55711	19638	33619	5100	0	971
22	2005	5	76586	19638	33619	8200	0	1389
23	2005	5	86889	19638	33619	5200	0	594
24	2004	6	68990	19619	33619	8500	0	1215
25	2008	2	80949	19619	33619	7900	0	2152
26	2003	7	59858	5546	33619	4600	0	1220
27	2006	4	50227	19619	33619	7500	0	1003
28	2006	4	58024	20928	33619	5600	0	671
29	2006	4	40919	20928	33619	7700	0	623
...	...	...	...	...	...	...	...	...
51058	2002	7	81794	18881	30212	3400	0	2063
51059	2008	1	49069	18881	30212	8000	0	482
51060	2006	3	69007	18111	30212	5600	0	728
51061	2006	3	62228	18111	30212	4525	0	533
51062	2001	8	72553	18111	30212	4325	0	1220
51063	2005	4	72224	18111	30212	6100	0	1038
51064	2006	3	64020	18111	30212	7440	0	1703
51065	2005	4	48447	18111	30212	7340	0	1328
51066	2004	5	81403	18111	30212	8900	0	983
51067	2005	4	77249	18111	30212	5330	0	1389
51068	2006	3	60549	18111	30212	5600	0	533
51069	2002	7	82568	18111	30212	4015	0	1543
51070	2006	3	64990	18111	30212	6835	0	1703
51071	2007	2	60074	18111	30212	5700	0	533
51072	2006	3	84168	18881	30212	4800	0	1243
51073	2007	2	72802	18881	30212	9500	0	1389
51074	2005	4	59383	18111	30212	9000	0	1417
51075	2002	7	75700	18881	30212	7000	0	1455
51076	2006	3	70004	18881	30212	5000	0	1155
51077	2005	4	48642	18881	30212	5500	0	482
51078	2006	3	57444	18881	30212	9800	0	1251
51079	2004	5	69098	18881	30212	4500	0	533
51080	2004	5	76391	18111	30212	4200	0	803
51081	2007	2	44622	18881	30212	6000	0	482
51082	2006	3	69941	18111	30212	10400	0	1606
51083	2002	7	93744	18111	30212	7500	0	1353
51084	2007	2	74407	18111	30212	8000	0	803
51085	2004	5	82563	18881	30212	7000	0	1243
51086	2006	3	65399	18111	30212	7900	0	1508
51087	2006	3	79554	18881	30212	7000	0	1974

51088 rows × 8 columns

In [27]:

lemons = pd.read_csv('./data/lemons.csv')
lemons = lemons.drop('PRIMEUNIT',1)
lemons = lemons.drop('AUCGUART',1)
lemons = lemons.dropna(axis=0)
# Generating a list of continuous data features from the describe dataframe. 
# Then, removing the two non-features (RefId is an index, IsBadBuy is the prediction value)
features = list(lemons.describe().columns)
features.remove('RefId')
features.remove('IsBadBuy')

In [28]:

lemons[features].head()

Out[28]:

	VehYear	VehicleAge	WheelTypeID	VehOdo	MMRAcquisitionAuctionAveragePrice	MMRAcquisitionAuctionCleanPrice	MMRAcquisitionRetailAveragePrice	MMRAcquisitonRetailCleanPrice	MMRCurrentAuctionAveragePrice	MMRCurrentAuctionCleanPrice	MMRCurrentRetailAveragePrice	MMRCurrentRetailCleanPrice	BYRNO	VNZIP1	VehBCost	WarrantyCost
0	2006	3	1	89046	8155	9829	11636	13600	7451	8552	11597	12409	21973	33619	7100	1113
1	2004	5	1	93593	6854	8383	10897	12572	7456	9222	11374	12791	19638	33619	7600	1053
2	2005	4	2	69367	3913	5054	7723	8707	3247	4384	6739	7911	19638	33619	4000	1020
3	2004	5	2	81054	3901	4908	6706	8577	4709	5827	8149	9451	19638	33619	5600	594
4	2004	5	2	65328	2966	4038	6240	8496	2980	4115	6230	8603	19638	33619	4200	533

In [30]:

from sklearn import feature_selection as f_select
significant_features = []
pvals = []
for feature in features:
    pval = f_select.f_classif(lemons[feature],lemons.IsBadBuy)
    if pval[1][0] < 0.05:
        significant_features.append(feature)
        pvals.append(pval[1][0])

In [33]:

print features
significant_features

['VehYear', 'VehicleAge', 'WheelTypeID', 'VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'BYRNO', 'VNZIP1', 'VehBCost', 'IsOnlineSale', 'WarrantyCost']

Out[33]:

['VehYear',
 'VehicleAge',
 'WheelTypeID',
 'VehOdo',
 'MMRAcquisitionAuctionAveragePrice',
 'MMRAcquisitionAuctionCleanPrice',
 'MMRAcquisitionRetailAveragePrice',
 'MMRAcquisitonRetailCleanPrice',
 'MMRCurrentAuctionAveragePrice',
 'MMRCurrentAuctionCleanPrice',
 'MMRCurrentRetailAveragePrice',
 'MMRCurrentRetailCleanPrice',
 'BYRNO',
 'VehBCost',
 'IsOnlineSale',
 'WarrantyCost']

In [ ]: