In [2]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA


from autodiff import optimize
from scipy import stats
import pylab as pl

%pylab inline --no-import-all
Populating the interactive namespace from numpy and matplotlib

In [3]:
def soft_absolute(u):
    """Smooth approximation of |u|: sqrt(u^2 + eps), differentiable at 0."""
    eps = 1e-8
    return np.sqrt(np.square(u) + eps)

def logistic(u):
    """Standard sigmoid: 1 / (1 + e^{-u}); maps R into (0, 1) elementwise."""
    return np.reciprocal(1. + np.exp(-u))

class SparseFilter(BaseEstimator, TransformerMixin):
    """Unsupervised sparse-filtering feature learner.

    Learns a linear map (W_, b_) whose activations, after L2
    normalization first across examples (columns) and then across
    features (rows), have minimal total activation -- driving the
    learned codes toward sparsity.
    """
    def __init__(self, n_features = 200, n_iterations = 300, activate=soft_absolute):
        self.epsilon = 1e-8  # guards every normalization against division by zero
        self.n_features = n_features
        self.n_iterations = n_iterations
        self.activate = activate

    def _normalize(self, A):
        # Per-feature (axis=0) L2 normalization followed by per-example
        # (axis=1) L2 normalization; the order matters.
        A = A / np.sqrt(np.sum(A * A, axis=0) + self.epsilon)
        return A / np.sqrt(np.sum(A * A, axis=1)[:, np.newaxis] + self.epsilon)

    def fit(self, X, y = None):
        """Learn W_ ((n_dim, n_features)) and b_ ((n_features,)) by
        minimizing the sparse-filtering objective with autodiff's L-BFGS.
        Starting point is random normal, so results vary run to run
        unless the global numpy seed is fixed."""
        n_dim = X.shape[1]
        W0 = np.random.randn(n_dim, self.n_features)
        b0 = np.random.randn(self.n_features)
        self.W_, self.b_ = optimize.fmin_l_bfgs_b(self.get_objective_fn(X), (W0, b0),
                                                  iprint = 1,
                                                  maxfun = self.n_iterations)
        return self

    def get_objective_fn(self, X):
        """Return the scalar objective (closed over X) that L-BFGS minimizes."""
        # Kept self-contained (normalization inlined) because autodiff
        # traces this function's computation.
        def _objective_fn(W, b):
            A = self.activate(np.dot(X, W) + b)
            A = A / np.sqrt(np.sum(A * A, axis = 0) + self.epsilon)
            A = A / np.sqrt(np.sum(A * A, axis = 1)[:, np.newaxis] + self.epsilon)
            return np.sum(A)
        return _objective_fn

    def transform(self, X):
        """Map X through the learned filters and apply the same two-way
        L2 normalization used during training.

        NOTE(review): the axis=0 normalization uses statistics of the X
        passed here, not of the training set -- presumably intentional,
        matching the objective; verify if transforming tiny batches."""
        return self._normalize(self.activate(np.dot(X, self.W_) + self.b_))

## Load data

In [19]:
train_data = pd.read_csv('data/train.csv', header=None)
train_labels = pd.read_csv('data/trainLabels.csv', header=None)
test_data = pd.read_csv('data/test.csv', header=None)

train_X = np.asarray(train_data)
train_y = np.asarray(train_labels).ravel()
test_X = np.asarray(test_data)

## shuffle train data
train_X, train_y = shuffle(train_X, train_y)

print train_X.shape, test_X.shape, train_y.shape
(1000, 40) (9000, 40) (1000,)

## Unsupervised feature engineering

In [44]:
## Learn sparse-filtering features on the pooled train + test rows
## (unsupervised, so using the unlabeled test data is legitimate here).
pooled_X = np.r_[train_X, test_X]
sf = SparseFilter(n_features=50, n_iterations=1000)
sf.fit(pooled_X)
train_sf_X = sf.transform(train_X)
test_sf_X = sf.transform(test_X)
In [45]:
## train a pca on both train and test data
## but before that, see how many components are suitable
pca = PCA()
pca.fit(np.r_[train_X, test_X])
pd.DataFrame(pca.explained_variance_ratio_).plot(kind = 'bar')
n_components = np.where(np.cumsum(pca.explained_variance_ratio_) >= 0.85)[0][0]
print n_components

pca = PCA(n_components=n_components)
pca.fit(np.r_[train_X, test_X])
train_pca_X = pca.transform(train_X)
test_pca_X = pca.transform(test_X)
15

In [46]:
## combine two features
## standarization
ss = StandardScaler()
train_combined_X = ss.fit_transform(np.c_[train_sf_X, train_pca_X])
test_combined_X = ss.transform(np.c_[test_sf_X, test_pca_X])
print train_combined_X.shape, test_combined_X.shape
(1000, 65) (9000, 65)

In [47]:
## filter features by forest model
trees = ExtraTreesClassifier(n_estimators=100)
trees.fit(train_combined_X, train_y)
pd.DataFrame(trees.feature_importances_).plot(kind='bar')
selected_features = np.where(trees.feature_importances_ > 0.005)[0]

train_selected_X = train_combined_X[:, selected_features]
test_selected_X = test_combined_X[:, selected_features]
print train_selected_X.shape, test_selected_X.shape
(1000, 49) (9000, 49)

In [55]:
svc = SVC(probability=True)
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, scoring = 'accuracy', cv = 10, n_jobs=-1)
gs.fit(train_selected_X, train_y)
print gs.best_params_
print gs.best_score_
{'gamma': 0.03}
0.97

In [56]:
## Refit the SVC on the whole training set using the best gamma found above.
best_params = dict(probability=True, **gs.best_params_)
svc = SVC(**best_params)
svc.fit(train_selected_X, train_y)
Out[56]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.03,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
In [57]:
## Predict labels for the test set with the tuned SVC.
test_yhat = svc.predict(test_selected_X)
print test_yhat.shape
(9000,)

In [62]:
## Build the submission file: 1-based Id column plus the predicted Solution.
ids = np.arange(1, len(test_yhat) + 1)
test_yhat_df = pd.DataFrame({'Id': ids, 'Solution': test_yhat})
test_yhat_df.to_csv('submission/submission1.csv', header=True, index=False)
In [52]:
forest = ExtraTreesClassifier(n_estimators=50)
max_features_choices = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
gs = GridSearchCV(forest, {'max_features': max_features_choices}, scoring = 'accuracy', cv = 10, n_jobs=-1)
gs.fit(train_selected_X, train_y)
print gs.best_params_
print gs.best_score_
{'max_features': 0.1}
0.966

The cross-validation result is very close to the evaluation score on the public leaderboard, which implies the i.i.d. assumption is quite solid, and thus confirms that the data is most likely "simulated".

In [ ]: