%pylab inline
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['clf'] `%matplotlib` prevents importing * from pylab and numpy
import pandas as pd
import numpy as np
from __future__ import division
class Transformations(object):
"""since these transformations are all related, we'll nest them all under a feature norm class"""
def mean_at_zero(self, arr):
return np.array([i - np.mean(a) for i in arr])
def norm_to_min_zero(self, arr):
return np.array([i / max(a) for i in arr])
def norm_to_absolute_min_zero(self, arr):
"""should be a range of 0 to 1, where 0 maintains its 0 value"""
return np.array([(i-min(arr))/(max(arr)-min(arr)) for i in arr])
def norm_to_neg_pos(self, arr):
"""should be a range of -1 to 1, where 0 represents the mean"""
return np.array([(i-mean(arr))/(max(arr)-mean(arr)) for i in arr])
def norm_by_std(self, arr):
"""should be a range where 0 represents the mean"""
return np.array([(i-mean(arr))/std(arr) for i in arr])
transformer = Transformations()
a = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print transformer.norm_to_absolute_min_zero(a) == np.array([0.0, 0.25, 0.5, 0.75, 1.0])
print transformer.norm_to_neg_pos(a) == np.array([-1.0, -0.5, 0.0, 0.5, 1.0])
print transformer.norm_by_std(a) == np.array([-1.414213562373095, -0.7071067811865475, 0.0, 0.7071067811865475, 1.414213562373095])
[ True True True True True] [ True True True True True] [ True True True True True]
import pandas as pd
from sklearn import tree
from sklearn.cross_validation import cross_val_score
# Load in data and create sets. dropping all na columns on the live data set.
lemons = pd.read_csv('./data/lemons.csv')
lemons_oos = pd.read_csv('./data/lemons_oos.csv')
print lemons.dtypes
RefId int64 IsBadBuy int64 PurchDate object Auction object VehYear int64 VehicleAge int64 Make object Model object Trim object SubModel object Color object Transmission object WheelTypeID float64 WheelType object VehOdo int64 Nationality object Size object TopThreeAmericanName object MMRAcquisitionAuctionAveragePrice float64 MMRAcquisitionAuctionCleanPrice float64 MMRAcquisitionRetailAveragePrice float64 MMRAcquisitonRetailCleanPrice float64 MMRCurrentAuctionAveragePrice float64 MMRCurrentAuctionCleanPrice float64 MMRCurrentRetailAveragePrice float64 MMRCurrentRetailCleanPrice float64 PRIMEUNIT object AUCGUART object BYRNO int64 VNZIP1 int64 VNST object VehBCost float64 IsOnlineSale int64 WarrantyCost int64 dtype: object
lemons = lemons.dropna(axis=1)
# Generating a list of continuous data features from the describe dataframe.
# Then, removing the two non-features (RefId is an index, IsBadBuy is the prediction value)
features = list(lemons.describe().columns)
features.remove('RefId')
features.remove('IsBadBuy')
best_score = -1
for depth in range(1, 10):
scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=depth, random_state=1234),
lemons[features],
lemons.IsBadBuy,
scoring='roc_auc',
cv=5)
if scores.mean() > best_score:
best_depth = depth
best_score = scores.mean()
# Is the best score we have better than each DummyClassifier type?
from sklearn import dummy, metrics
for strat in ['stratified', 'most_frequent', 'uniform']:
dummyclf = dummy.DummyClassifier(strategy=strat).fit(lemons[features], lemons.IsBadBuy)
print 'did better than %s?' % strat, metrics.roc_auc_score(lemons.IsBadBuy, dummyclf.predict(lemons[features])) < best_score
# seems so!
# Create a classifier and prediction.
clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=1234).fit(lemons[features], lemons.IsBadBuy)
y_pred = clf.predict(lemons_oos[features])
# Create a submission
submission = pd.DataFrame({ 'RefId' : lemons_oos.RefId, 'prediction' : y_pred })
submission.to_csv('submission.csv')
did better than stratified? True did better than most_frequent? True did better than uniform? True
lemons[features]
VehYear | VehicleAge | VehOdo | BYRNO | VNZIP1 | VehBCost | IsOnlineSale | WarrantyCost | |
---|---|---|---|---|---|---|---|---|
0 | 2006 | 3 | 89046 | 21973 | 33619 | 7100 | 0 | 1113 |
1 | 2004 | 5 | 93593 | 19638 | 33619 | 7600 | 0 | 1053 |
2 | 2005 | 4 | 69367 | 19638 | 33619 | 4000 | 0 | 1020 |
3 | 2004 | 5 | 81054 | 19638 | 33619 | 5600 | 0 | 594 |
4 | 2004 | 5 | 65328 | 19638 | 33619 | 4200 | 0 | 533 |
5 | 2005 | 4 | 79315 | 19638 | 33619 | 5400 | 0 | 1623 |
6 | 2006 | 3 | 74722 | 19638 | 33619 | 6900 | 0 | 1623 |
7 | 2003 | 6 | 72132 | 5546 | 33619 | 3300 | 0 | 1455 |
8 | 2005 | 4 | 80736 | 19638 | 33619 | 6800 | 0 | 1243 |
9 | 2003 | 6 | 75156 | 5546 | 33619 | 4900 | 0 | 1923 |
10 | 2004 | 5 | 84498 | 19638 | 33619 | 7100 | 0 | 1243 |
11 | 2002 | 7 | 66536 | 5546 | 33619 | 5800 | 0 | 2003 |
12 | 2006 | 3 | 59789 | 5546 | 33619 | 7700 | 0 | 671 |
13 | 2004 | 5 | 52106 | 5546 | 33619 | 4500 | 0 | 754 |
14 | 2002 | 7 | 88958 | 5546 | 33619 | 8000 | 0 | 2452 |
15 | 2004 | 5 | 76173 | 19638 | 33619 | 8800 | 0 | 920 |
16 | 2005 | 4 | 80064 | 19638 | 33619 | 5600 | 0 | 1763 |
17 | 2003 | 6 | 77694 | 19638 | 33619 | 8300 | 0 | 1923 |
18 | 2004 | 5 | 57723 | 19638 | 33619 | 7000 | 0 | 671 |
19 | 2005 | 4 | 78434 | 19638 | 33619 | 10700 | 0 | 1272 |
20 | 2001 | 8 | 82944 | 19638 | 33619 | 3600 | 0 | 2322 |
21 | 2003 | 6 | 55711 | 19638 | 33619 | 5100 | 0 | 971 |
22 | 2005 | 5 | 76586 | 19638 | 33619 | 8200 | 0 | 1389 |
23 | 2005 | 5 | 86889 | 19638 | 33619 | 5200 | 0 | 594 |
24 | 2004 | 6 | 68990 | 19619 | 33619 | 8500 | 0 | 1215 |
25 | 2008 | 2 | 80949 | 19619 | 33619 | 7900 | 0 | 2152 |
26 | 2003 | 7 | 59858 | 5546 | 33619 | 4600 | 0 | 1220 |
27 | 2006 | 4 | 50227 | 19619 | 33619 | 7500 | 0 | 1003 |
28 | 2006 | 4 | 58024 | 20928 | 33619 | 5600 | 0 | 671 |
29 | 2006 | 4 | 40919 | 20928 | 33619 | 7700 | 0 | 623 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
51058 | 2002 | 7 | 81794 | 18881 | 30212 | 3400 | 0 | 2063 |
51059 | 2008 | 1 | 49069 | 18881 | 30212 | 8000 | 0 | 482 |
51060 | 2006 | 3 | 69007 | 18111 | 30212 | 5600 | 0 | 728 |
51061 | 2006 | 3 | 62228 | 18111 | 30212 | 4525 | 0 | 533 |
51062 | 2001 | 8 | 72553 | 18111 | 30212 | 4325 | 0 | 1220 |
51063 | 2005 | 4 | 72224 | 18111 | 30212 | 6100 | 0 | 1038 |
51064 | 2006 | 3 | 64020 | 18111 | 30212 | 7440 | 0 | 1703 |
51065 | 2005 | 4 | 48447 | 18111 | 30212 | 7340 | 0 | 1328 |
51066 | 2004 | 5 | 81403 | 18111 | 30212 | 8900 | 0 | 983 |
51067 | 2005 | 4 | 77249 | 18111 | 30212 | 5330 | 0 | 1389 |
51068 | 2006 | 3 | 60549 | 18111 | 30212 | 5600 | 0 | 533 |
51069 | 2002 | 7 | 82568 | 18111 | 30212 | 4015 | 0 | 1543 |
51070 | 2006 | 3 | 64990 | 18111 | 30212 | 6835 | 0 | 1703 |
51071 | 2007 | 2 | 60074 | 18111 | 30212 | 5700 | 0 | 533 |
51072 | 2006 | 3 | 84168 | 18881 | 30212 | 4800 | 0 | 1243 |
51073 | 2007 | 2 | 72802 | 18881 | 30212 | 9500 | 0 | 1389 |
51074 | 2005 | 4 | 59383 | 18111 | 30212 | 9000 | 0 | 1417 |
51075 | 2002 | 7 | 75700 | 18881 | 30212 | 7000 | 0 | 1455 |
51076 | 2006 | 3 | 70004 | 18881 | 30212 | 5000 | 0 | 1155 |
51077 | 2005 | 4 | 48642 | 18881 | 30212 | 5500 | 0 | 482 |
51078 | 2006 | 3 | 57444 | 18881 | 30212 | 9800 | 0 | 1251 |
51079 | 2004 | 5 | 69098 | 18881 | 30212 | 4500 | 0 | 533 |
51080 | 2004 | 5 | 76391 | 18111 | 30212 | 4200 | 0 | 803 |
51081 | 2007 | 2 | 44622 | 18881 | 30212 | 6000 | 0 | 482 |
51082 | 2006 | 3 | 69941 | 18111 | 30212 | 10400 | 0 | 1606 |
51083 | 2002 | 7 | 93744 | 18111 | 30212 | 7500 | 0 | 1353 |
51084 | 2007 | 2 | 74407 | 18111 | 30212 | 8000 | 0 | 803 |
51085 | 2004 | 5 | 82563 | 18881 | 30212 | 7000 | 0 | 1243 |
51086 | 2006 | 3 | 65399 | 18111 | 30212 | 7900 | 0 | 1508 |
51087 | 2006 | 3 | 79554 | 18881 | 30212 | 7000 | 0 | 1974 |
51088 rows × 8 columns
lemons = pd.read_csv('./data/lemons.csv')
lemons = lemons.drop('PRIMEUNIT',1)
lemons = lemons.drop('AUCGUART',1)
lemons = lemons.dropna(axis=0)
# Generating a list of continuous data features from the describe dataframe.
# Then, removing the two non-features (RefId is an index, IsBadBuy is the prediction value)
features = list(lemons.describe().columns)
features.remove('RefId')
features.remove('IsBadBuy')
lemons[features].head()
VehYear | VehicleAge | WheelTypeID | VehOdo | MMRAcquisitionAuctionAveragePrice | MMRAcquisitionAuctionCleanPrice | MMRAcquisitionRetailAveragePrice | MMRAcquisitonRetailCleanPrice | MMRCurrentAuctionAveragePrice | MMRCurrentAuctionCleanPrice | MMRCurrentRetailAveragePrice | MMRCurrentRetailCleanPrice | BYRNO | VNZIP1 | VehBCost | IsOnlineSale | WarrantyCost | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2006 | 3 | 1 | 89046 | 8155 | 9829 | 11636 | 13600 | 7451 | 8552 | 11597 | 12409 | 21973 | 33619 | 7100 | 0 | 1113 |
1 | 2004 | 5 | 1 | 93593 | 6854 | 8383 | 10897 | 12572 | 7456 | 9222 | 11374 | 12791 | 19638 | 33619 | 7600 | 0 | 1053 |
2 | 2005 | 4 | 2 | 69367 | 3913 | 5054 | 7723 | 8707 | 3247 | 4384 | 6739 | 7911 | 19638 | 33619 | 4000 | 0 | 1020 |
3 | 2004 | 5 | 2 | 81054 | 3901 | 4908 | 6706 | 8577 | 4709 | 5827 | 8149 | 9451 | 19638 | 33619 | 5600 | 0 | 594 |
4 | 2004 | 5 | 2 | 65328 | 2966 | 4038 | 6240 | 8496 | 2980 | 4115 | 6230 | 8603 | 19638 | 33619 | 4200 | 0 | 533 |
from sklearn import feature_selection as f_select
significant_features = []
pvals = []
for feature in features:
pval = f_select.f_classif(lemons[feature],lemons.IsBadBuy)
if pval[1][0] < 0.05:
significant_features.append(feature)
pvals.append(pval[1][0])
print features
significant_features
['VehYear', 'VehicleAge', 'WheelTypeID', 'VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'BYRNO', 'VNZIP1', 'VehBCost', 'IsOnlineSale', 'WarrantyCost']
['VehYear', 'VehicleAge', 'WheelTypeID', 'VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'BYRNO', 'VehBCost', 'IsOnlineSale', 'WarrantyCost']