import numpy as np
import math
from math import log
from sklearn import metrics, preprocessing, cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.linear_model as lm
import pandas as p
from time import gmtime, strftime
import scipy
import sys
import sklearn.decomposition
from sklearn.metrics import mean_squared_error
from string import punctuation
from sklearn.neighbors import RadiusNeighborsRegressor, KNeighborsRegressor
import time
from scipy import sparse
from matplotlib import *
from itertools import combinations
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
import operator
from sklearn import svm


def tied_rank(x):
    """
    This function is by Ben Hamner and taken from
    https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py

    Computes the tied rank of elements in x.

    Parameters
    ----------
    x : list of numbers, numpy array

    Returns
    -------
    score : list of numbers
            The tied rank of each element in x
    """
    sorted_x = sorted(zip(x, range(len(x))))
    r = [0 for k in x]
    cur_val = sorted_x[0][0]
    last_rank = 0
    for i in range(len(sorted_x)):
        if cur_val != sorted_x[i][0]:
            cur_val = sorted_x[i][0]
            for j in range(last_rank, i):
                r[sorted_x[j][1]] = float(last_rank + 1 + i) / 2.0
            last_rank = i
        if i == len(sorted_x) - 1:
            for j in range(last_rank, i + 1):
                r[sorted_x[j][1]] = float(last_rank + i + 2) / 2.0
    return r


def auc(actual, posterior):
    """
    This function is by Ben Hamner and taken from
    https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py

    Computes the area under the receiver-operating characteristic curve (AUC),
    the evaluation metric used here for binary classification.

    Parameters
    ----------
    actual : list of binary numbers, numpy array
             The ground truth value
    posterior : same type as actual
                Defines a ranking on the binary numbers, from most likely to
                be positive to least likely to be positive.

    Returns
    -------
    score : double
            The AUC between actual and posterior
    """
    r = tied_rank(posterior)
    num_positive = len([0 for x in actual if x == 1])
    num_negative = len(actual) - num_positive
    sum_positive = sum([r[i] for i in range(len(r)) if actual[i] == 1])
    auc = ((sum_positive - num_positive * (num_positive + 1) / 2.0) /
           (num_negative * num_positive))
    sys.stdout.write('.')
    return auc


def auc_scorer(estimator, X, y):
    # scikit-learn scorer signature: score the estimator's positive-class probabilities
    predicted = estimator.predict_proba(X)[:, 1]
    return auc(y, predicted)


def normalize10day(stocks):
    # For each 46-column window: zero out the columns whose position mod 5 is
    # 1, 2 or 4, and scale every other column by the first column of the window.
    def process_column(i):
        if operator.mod(i, 5) == 1:
            return stocks[:, i] * 0
        if operator.mod(i, 5) == 2:
            return stocks[:, i] * 0
        if operator.mod(i, 5) == 4:
            return stocks[:, i] * 0
            #return np.log(stocks[:,i] + 1)
        else:
            return stocks[:, i] / stocks[:, 0]
    n = stocks.shape[0]
    stocks_dat = np.array([process_column(i) for i in range(46)]).transpose()
    #stocks_movingavgO9O10 = np.array([int(i > j) for i,j in zip(stocks_dat[:,45], stocks_dat[:,40])]).reshape((n, 1))
    #stocks_movingavgC9O10 = np.array([int(i > j) for i,j in zip(stocks_dat[:,45], stocks_dat[:,43])]).reshape((n, 1))
    #return np.hstack((stocks_dat, stocks_movingavgO9O10, stocks_movingavgC9O10))
    return stocks_dat
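
# --- illustrative sanity check (not part of the original pipeline) ---
# A minimal sketch of what tied_rank() and auc() above return on toy inputs;
# the expected values are worked out by hand (tied scores share the average of
# the ranks they occupy, and the toy example below has AUC = 0.75).
assert tied_rank([1, 2, 2, 3]) == [1.0, 2.5, 2.5, 4.0]
assert abs(auc([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]) - 0.75) < 1e-9
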
print "loading data.."
train = np.array(p.read_table('./training.csv', sep = ","))
test = np.array(p.read_table('./test.csv', sep = ","))

################################################################################
# READ IN THE TEST DATA
################################################################################
# all data from opening 1 straight through to opening 10
X_test_stockdata = normalize10day(test[:, range(2, 48)])
# per-stock indicator dummies (not used below; see the commented-out hstack)
X_test_stockindicators = np.vstack([np.identity(94)[:, range(93)] for i in range(25)])
#X_test = np.hstack((X_test_stockindicators, X_test_stockdata))
X_test = X_test_stockdata
#np.identity(94)[:,range(93)]

################################################################################
# READ IN THE TRAIN DATA
################################################################################
n_windows = 490
windows = range(n_windows)
X_windows = [train[:, range(1 + 5*w, 47 + 5*w)] for w in windows]
X_windows_normalized = [normalize10day(w) for w in X_windows]
X_stockdata = np.vstack(X_windows_normalized)
X_stockindicators = np.vstack([np.identity(94)[:, range(93)] for i in range(n_windows)])
#X = np.hstack((X_stockindicators, X_stockdata))
X = X_stockdata

# read in the response variable: 1 if the day-10 close is above the day-10 open
y_stockdata = np.vstack([train[:, [46 + 5*w, 49 + 5*w]] for w in windows])
y = (y_stockdata[:, 1] - y_stockdata[:, 0] > 0) + 0

print "this step done"
print "preparing models"

modelname = "lasso"

if modelname == "ridge":
    C = np.linspace(300, 5000, num = 10)[::-1]
    models = [lm.LogisticRegression(penalty = "l2", C = c) for c in C]
if modelname == "lasso":
    C = np.linspace(300, 5000, num = 10)[::-1]
    models = [lm.LogisticRegression(penalty = "l1", C = c) for c in C]
if modelname == "sgd":
    C = np.linspace(0.00005, .01, num = 5)
    models = [lm.SGDClassifier(loss = "log", penalty = "l2", alpha = c, warm_start = False) for c in C]
if modelname == "randomforest":
    C = np.linspace(50, 300, num = 10)
    models = [RandomForestClassifier(n_estimators = int(c)) for c in C]

print "calculating cv scores"
cv_scores = [0] * len(models)
for i, model in enumerate(models):
    # for each model, save the mean cross-validation AUC into cv_scores
    cv_scores[i] = np.mean(cross_validation.cross_val_score(model, X, y, cv = 5, scoring = auc_scorer))
    #cv_scores[i] = np.mean(cross_validation.cross_val_score(model, X, y, cv=5, score_func = auc))
    print " (%d/%d) C = %f: CV = %f" % (i + 1, len(C), C[i], cv_scores[i])

# find which model and C is the best
best = cv_scores.index(max(cv_scores))
best_model = models[best]
best_cv = cv_scores[best]
best_C = C[best]
print "BEST %f: %f" % (best_C, best_cv)

print "training on full data"
# fit the best model on the full data
best_model.fit(X, y)

print "prediction"
# do a prediction and save it
pred = best_model.predict_proba(X_test)[:, 1]
testfile = p.read_csv('./test.csv', sep = ",", na_values = ['?'], index_col = [0, 1])

# submission id is the day D multiplied by 100 plus the stock id
testindices = [100 * D + StId for (D, StId) in testfile.index]

pred_df = p.DataFrame(np.vstack((testindices, pred)).transpose(), columns = ["Id", "Prediction"])
pred_df.to_csv('./predictions/' + modelname + '/' + modelname + ' ' +
               strftime("%m-%d %X") + " C-" + str(round(best_C, 4)) +
               " CV-" + str(round(best_cv, 4)) + ".csv", index = False)

print "submission file created"
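
# --- optional: inspect the fitted model (illustrative sketch only) ---
# The linear models ("ridge"/"lasso" logistic regression, "sgd") expose coef_
# after fitting, so a quick sparsity report is possible; the hasattr guard
# skips this for the random forest. This is an add-on sketch, not part of the
# submission pipeline above.
if hasattr(best_model, "coef_"):
    nonzero = np.sum(best_model.coef_ != 0)
    print "nonzero coefficients: %d of %d" % (nonzero, best_model.coef_.size)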