import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from autodiff import optimize
from scipy import stats
import pylab as pl
%pylab inline --no-import-all
def soft_absolute(u):
    """Smooth approximation of |u|: sqrt(u^2 + eps), differentiable at 0."""
    eps = 1e-8
    return np.sqrt(np.square(u) + eps)
def logistic(u):
    """Sigmoid squashing function: 1 / (1 + e^(-u)), mapping reals into (0, 1)."""
    denom = 1. + np.exp(-u)
    return 1. / denom
class SparseFilter(BaseEstimator, TransformerMixin):
    """Sparse-filtering feature learner (Ngiam et al., NIPS 2011).

    Learns a linear map (W_, b_) whose activations, after L2
    normalization across features (columns) and then across samples
    (rows), have minimal total absolute value -- encouraging sparse,
    evenly-distributed features.

    Parameters
    ----------
    n_features : int, number of learned features (columns of W_).
    n_iterations : int, max L-BFGS function evaluations.
    activate : callable, elementwise nonlinearity (default: smooth |.|).
    """
    def __init__(self, n_features = 200, n_iterations = 300, activate=soft_absolute):
        # small constant guarding against division by zero in normalization
        self.epsilon = 1e-8
        self.n_features = n_features
        self.n_iterations = n_iterations
        self.activate = activate

    def _normalize(self, Y):
        """L2-normalize Y per feature (columns), then per sample (rows)."""
        Y = Y / np.sqrt(np.sum(Y * Y, axis=0) + self.epsilon)
        Y = Y / np.sqrt(np.sum(Y * Y, axis=1)[:, np.newaxis] + self.epsilon)
        return Y

    def fit(self, X, y = None):
        """Learn W_ and b_ by minimizing the objective with L-BFGS.

        y is ignored (unsupervised); returns self for chaining.
        """
        n_samples, n_dim = X.shape
        # random Gaussian initialization of weights and biases
        W = np.random.randn(n_dim, self.n_features)
        b = np.random.randn(self.n_features)
        obj_fn = self.get_objective_fn(X)
        # autodiff's fmin_l_bfgs_b differentiates obj_fn automatically
        self.W_, self.b_ = optimize.fmin_l_bfgs_b(obj_fn, (W, b),
                                                  iprint = 1,
                                                  maxfun = self.n_iterations)
        return self

    def get_objective_fn(self, X):
        """Return objective(W, b) = sum of doubly L2-normalized activations."""
        def _objective_fn(W, b):
            Y = self.activate(np.dot(X, W) + b)
            # normalization kept inline (mirror of _normalize) so autodiff
            # traces plain numpy ops on its own function arguments
            Y = Y / np.sqrt(np.sum(Y*Y, axis = 0) + self.epsilon)
            Y = Y / np.sqrt(np.sum(Y*Y, axis = 1)[:, np.newaxis] + self.epsilon)
            return np.sum(Y)
        return _objective_fn

    def transform(self, X):
        """Project X with the learned (W_, b_) and normalize activations."""
        return self._normalize(self.activate(np.dot(X, self.W_) + self.b_))
## load train/test matrices and labels; the CSV files carry no header row
train_data = pd.read_csv('data/train.csv', header=None)
train_labels = pd.read_csv('data/trainLabels.csv', header=None)
test_data = pd.read_csv('data/test.csv', header=None)
## convert DataFrames to plain numpy arrays; labels flattened to 1-D
train_X = np.asarray(train_data)
train_y = np.asarray(train_labels).ravel()
test_X = np.asarray(test_data)
## shuffle train data (rows and labels permuted in sync)
train_X, train_y = shuffle(train_X, train_y)
## sanity-check dimensions (Python 2 print statement)
print train_X.shape, test_X.shape, train_y.shape
## train a sparse filter on both train and test data
## (unsupervised, so pooling train + test rows is legitimate here)
sf = SparseFilter(n_features=50, n_iterations=1000)
sf.fit(np.r_[train_X, test_X])
## project each set separately with the learned filter
train_sf_X = sf.transform(train_X)
test_sf_X = sf.transform(test_X)
## train a pca on both train and test data
## but before that, see how many components are suitable
pca = PCA()
pca.fit(np.r_[train_X, test_X])
## bar plot of per-component explained variance (notebook output)
pd.DataFrame(pca.explained_variance_ratio_).plot(kind = 'bar')
## smallest number of components reaching 85% cumulative explained variance
n_components = np.where(np.cumsum(pca.explained_variance_ratio_) >= 0.85)[0][0]
print n_components
## refit PCA at the chosen dimensionality and project both sets
pca = PCA(n_components=n_components)
pca.fit(np.r_[train_X, test_X])
train_pca_X = pca.transform(train_X)
test_pca_X = pca.transform(test_X)
## combine the sparse-filter and PCA features column-wise
## standardization: scaler is fit on train only, then reused on test
ss = StandardScaler()
train_combined_X = ss.fit_transform(np.c_[train_sf_X, train_pca_X])
test_combined_X = ss.transform(np.c_[test_sf_X, test_pca_X])
print train_combined_X.shape, test_combined_X.shape
## filter features by forest model (importance-based selection)
trees = ExtraTreesClassifier(n_estimators=100)
trees.fit(train_combined_X, train_y)
## visualize per-feature importances (notebook output)
pd.DataFrame(trees.feature_importances_).plot(kind='bar')
## keep features whose importance exceeds an ad-hoc 0.005 threshold
selected_features = np.where(trees.feature_importances_ > 0.005)[0]
train_selected_X = train_combined_X[:, selected_features]
test_selected_X = test_combined_X[:, selected_features]
print train_selected_X.shape, test_selected_X.shape
## RBF SVM: tune gamma by 10-fold CV over a log-spaced grid
svc = SVC(probability=True)
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, scoring = 'accuracy', cv = 10, n_jobs=-1)
gs.fit(train_selected_X, train_y)
print gs.best_params_
print gs.best_score_
## refit on the full training set with the best gamma, then predict test labels
svc = SVC(probability=True, **gs.best_params_)
svc.fit(train_selected_X, train_y)
test_yhat = svc.predict(test_selected_X)
print test_yhat.shape
## Kaggle submission format: 1-based Id column plus Solution column
test_yhat_df = pd.DataFrame(dict(Id = np.arange(1, test_yhat.shape[0]+1), Solution=test_yhat))
test_yhat_df.to_csv('submission/submission1.csv', header = True, index=False)
## baseline comparison: extra-trees with max_features tuned by 10-fold CV
forest = ExtraTreesClassifier(n_estimators=50)
max_features_choices = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
gs = GridSearchCV(forest, {'max_features': max_features_choices}, scoring = 'accuracy', cv = 10, n_jobs=-1)
gs.fit(train_selected_X, train_y)
print gs.best_params_
print gs.best_score_
The cross-validation result is very close to the evaluation score on the public leaderboard, which implies the i.i.d. assumption is quite solid, and thus confirms that the data was most likely "simulated".