import pandas as pd

# Directory holding the Tradeshift competition CSVs.
# BUG FIX: in the original transcript data_dir was used here but only
# defined later in the session.
data_dir = 'tradeshift/'

# Load the full training set; in the original session: shape (1700000, 146).
train = pd.read_csv(data_dir + 'train.csv')

# Deterministically downsample to roughly `sample_size` rows by hashing the
# row id: a row is kept when hash(id) % ratio == 0, so the same subset is
# selected on every run.
sample_size = 170000
ratio = train.shape[0] // sample_size  # integer ratio (10 for the full set)
train_sample = train[
    [hash(row_id) % ratio == 0 for row_id in train['id']]  # avoid shadowing builtin `id`
]
# In the original session: train_sample.shape -> (170000, 146)
train_sample.to_csv(data_dir + 'train_sample.csv', index=False)

# Free memory — the full frame is no longer needed.
del train
import pandas as pd
from collections import Counter

data_dir = 'tradeshift/'

# Reload the cached sample and the label file.
train_sample = pd.read_csv(data_dir + 'train_sample.csv')
labels = pd.read_csv(data_dir + 'trainLabels.csv')
# labels.columns in the original session: ['id', 'y1', ..., 'y33']

# Attach the 33 binary targets to the sampled feature rows, joined on 'id'.
train_with_labels = pd.merge(train_sample, labels, on='id')
# In the original session: train_with_labels.shape -> (170000, 179)

# Sanity check the column families by first letter:
# 145 feature columns 'x*', 33 target columns 'y*', and 1 'id'.
print(Counter(name[0] for name in train_with_labels.columns))

# Free intermediates before loading the (much larger) test set.
del labels
del train_sample

test = pd.read_csv(data_dir + 'test.csv')
import numpy as np  # BUG FIX: np was used below but never imported in the original
from sklearn.feature_extraction import DictVectorizer

# Normalize boolean-like strings and the literal string 'nan' in both frames.
# (np.NaN was removed in NumPy 2.0; np.nan is the canonical spelling.)
for df in (train_with_labels, test):
    df.replace('YES', 1, inplace=True)
    df.replace('NO', 0, inplace=True)
    df.replace('nan', np.nan, inplace=True)

vec = DictVectorizer()
X_numerical = []
X_test_numerical = []
names_categorical = []

# Split the 'x*' feature columns into numerical vs categorical by looking at
# the majority Python type of the column's values.
for name in train_with_labels.columns:
    if not name.startswith('x'):
        continue  # skip 'id' and the 'y*' targets
    type_counts = Counter(str(type(value)) for value in train_with_labels[name])
    column_type, _ = max(type_counts.items(), key=lambda kv: kv[1])
    if column_type == str(str):
        # Categorical: coerce every value (incl. NaN -> 'nan') to str so the
        # DictVectorizer sees uniform string categories.
        # BUG FIX: the original assigned map(str, ...), which is a lazy map
        # object under Python 3; .astype(str) works on both versions.
        train_with_labels[name] = train_with_labels[name].astype(str)
        test[name] = test[name].astype(str)
        names_categorical.append(name)
        print('%s %d' % (name, len(np.unique(train_with_labels[name]))))
    else:
        # Numerical: impute missing values with a sentinel far outside the data.
        X_numerical.append(train_with_labels[name].fillna(-999))
        X_test_numerical.append(test[name].fillna(-999))

X_numerical = np.column_stack(X_numerical)
X_test_numerical = np.column_stack(X_test_numerical)

# One-hot encode the categorical columns into sparse matrices; the vectorizer
# is fitted on train and reused on test so the columns align.
X_sparse = vec.fit_transform(train_with_labels[names_categorical].T.to_dict().values())
X_test_sparse = vec.transform(test[names_categorical].T.to_dict().values())

# Original session shapes:
# (170000, 135) (170000, 306426) (545082, 135) (545082, 306426)
print(X_numerical.shape, X_sparse.shape, X_test_numerical.shape, X_test_sparse.shape)

# Belt-and-braces: replace any NaN that survived fillna.
X_numerical = np.nan_to_num(X_numerical)
X_test_numerical = np.nan_to_num(X_test_numerical)
# joblib was removed from sklearn.externals in scikit-learn 0.23; prefer the
# standalone package (always installed alongside sklearn) with a fallback.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Cache the prepared matrices so preprocessing doesn't need to be re-run.
joblib.dump(
    (X_numerical, X_sparse, X_test_numerical, X_test_sparse),
    data_dir + 'X.dump',
    compress=1,
)
from sklearn.metrics import roc_auc_score, f1_score, log_loss, make_scorer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was renamed to sklearn.model_selection in 0.18.
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_score, train_test_split

# Scorer wrapper so cross_val_score evaluates log-loss on predicted probabilities.
log_loss_scorer = make_scorer(log_loss, needs_proba=True)

# The 33 binary target columns.
y_columns = [name for name in train_with_labels.columns if name.startswith('y')]

# 50/50 split for stacking: the 'base' half trains first-stage models whose
# predictions become meta-features; the 'meta' half trains the second stage.
X_numerical_base, X_numerical_meta, X_sparse_base, X_sparse_meta, y_base, y_meta = train_test_split(
    X_numerical,
    X_sparse,
    train_with_labels[y_columns].values,
    test_size=0.5,
)
# First stage: for every target with both classes present in the base half,
# train a RandomForest on the numeric features and a linear SVM on the sparse
# one-hot features. Their outputs on the meta half / test set become
# meta-features for the second stage.
X_meta = []
X_test_meta = []
print("Build meta")
for i in range(y_base.shape[1]):
    print(i)
    y = y_base[:, i]
    # Skip degenerate targets (single class): the classifiers can't be fit.
    if len(np.unique(y)) == 2:
        rf = RandomForestClassifier(n_estimators=10, n_jobs=1)
        rf.fit(X_numerical_base, y)
        X_meta.append(rf.predict_proba(X_numerical_meta))
        X_test_meta.append(rf.predict_proba(X_test_numerical))

        svm = LinearSVC()
        svm.fit(X_sparse_base, y)
        # decision_function gives a real-valued margin, fine as a meta-feature.
        X_meta.append(svm.decision_function(X_sparse_meta))
        X_test_meta.append(svm.decision_function(X_test_sparse))

X_meta = np.column_stack(X_meta)
X_test_meta = np.column_stack(X_test_meta)
print(X_meta.shape, X_test_meta.shape)
# Second stage: per-target RandomForest on [meta-features, raw numeric
# features], trained on the held-out 'meta' half. Targets that are (almost)
# constant get the best constant prediction under log-loss instead.
p_test = []
for i in range(y_base.shape[1]):
    y = y_meta[:, i]
    class_counts = Counter(y)
    # Treat the target as constant when either class has fewer than 4 examples
    # (too few for a meaningful fit or 4-fold CV).
    is_constant = class_counts[0] < 4 or class_counts[1] < 4
    if is_constant:
        # Best constant under log-loss is the empirical positive rate over
        # both halves of the training data.
        constant_pred = np.mean(list(y_base[:, i]) + list(y_meta[:, i]))
        predicted = np.ones(X_test_meta.shape[0]) * constant_pred
        print("%d is constant like: %f" % (i, constant_pred))
    else:
        rf = RandomForestClassifier(n_estimators=30, n_jobs=1)
        rf.fit(np.hstack([X_meta, X_numerical_meta]), y)
        # Keep only the positive-class probability column.
        predicted = rf.predict_proba(np.hstack([X_test_meta, X_test_numerical]))[:, 1]

        # Cross-validated log-loss of the same model family, as a quality gauge.
        rf = RandomForestClassifier(n_estimators=30, n_jobs=1)
        scores = cross_val_score(rf, np.hstack([X_meta, X_numerical_meta]), y,
                                 cv=4, n_jobs=1, scoring=log_loss_scorer)
        print(i, 'RF log-loss: %.4f ± %.4f, mean = %.6f'
                 % (np.mean(scores), np.std(scores), np.mean(predicted)))
    p_test.append(predicted)

# One column per target: (n_test_rows, 33).
p_test = np.column_stack(p_test)
import gzip


def save_predictions(name, ids, predictions):
    """Write a gzipped Kaggle submission file.

    Parameters
    ----------
    name : str
        Output path, e.g. 'quick_start.csv.gz'.
    ids : iterable of int
        Test-set row ids, aligned with `predictions`.
    predictions : iterable of per-row sequences
        predictions[k][j] is the probability for label y{j+1} of row ids[k].

    Each data row has the form '<id>_y<j>,<pred>'. Exact 0/1 predictions are
    written as bare integers to keep the file small; everything else uses six
    decimal places.
    """
    # BUG FIX: the original ignored its `ids`/`predictions` parameters and
    # read the globals `test` / `p_test` instead, and never closed the file.
    # 'wt' opens the gzip stream in text mode so str writes work on Python 3.
    with gzip.open(name, 'wt') as out:
        out.write('id_label,pred\n')
        for row_id, row_predictions in zip(ids, predictions):
            for y_index, pred in enumerate(row_predictions):
                if pred == 0 or pred == 1:
                    formatted = str(int(pred))
                else:
                    formatted = '%.6f' % pred
                out.write('%d_y%d,%s\n' % (row_id, y_index + 1, formatted))
# Write the final submission for all test ids.
save_predictions('quick_start.csv.gz', test['id'].values, p_test)
# IPython shell magic from the original session — not valid Python:
# !ls -l -h quick_start*.csv.gz