```python
from numpy import array, hstack
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations
import numpy as np
import pandas as pd

SEED = 25
train = 'data/train.csv'
test = 'data/test.csv'
submit = 'submission.csv'  # output file for the predictions (name is illustrative)

train_data = pd.read_csv(train)
test_data = pd.read_csv(test)

# Stack train and test, dropping the first column (ACTION / id) and the
# last column (ROLE_CODE) from each
all_data = np.vstack((train_data.ix[:, 1:-1], test_data.ix[:, 1:-1]))
num_train = np.shape(train_data)[0]

print "train_data.shape:", train_data.shape
print "train_data:"
print train_data.ix[:1, ]

print "\nTransforming data..."
dp = group_data(all_data, degree=2)  # all column pairs, each hashed into one new column (group_data is defined below)
dt = group_data(all_data, degree=3)  # all column triples, hashed the same way

y = array(train_data.ACTION)
X = all_data[:num_train]        # rows up to num_train: the training data
X_2 = dp[:num_train]
X_3 = dt[:num_train]

X_test = all_data[num_train:]   # rows from num_train on: the test data
X_test_2 = dp[num_train:]
X_test_3 = dt[num_train:]

X_train_all = np.hstack((X, X_2, X_3))
X_test_all = np.hstack((X_test, X_test_2, X_test_3))
print "X_train_all[0,5:11]:"
print X_train_all[0, 5:11]

num_features = X_train_all.shape[1]
print "num_features:", num_features  # 8 + C(8,2) + C(8,3) = 8 + 28 + 56 = 92

model = linear_model.LogisticRegression()  # the logistic regression model, used as "model" throughout

# One-hot encode each feature column separately (OneHotEncoder is defined below)
Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

print "Performing greedy feature selection..."
score_hist = []
N = 10
good_features = set([])
# Run at least two rounds, and keep going until the newest score stops
# improving on the previous one. This loop takes 3-4 hours.
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []
    # Try adding each feature not yet selected and cross-validate
    for f in range(len(Xts)):
        if f not in good_features:
            feats = list(good_features) + [f]  # candidate set = selected features + f
            Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
            score = cv_loop(Xt, y, model, N)
            scores.append((score, f))
            print "Feature: %i Mean AUC: %f" % (f, score)  # AUC with feature f added
    good_features.add(sorted(scores)[-1][1])   # keep the best-scoring feature
    score_hist.append(sorted(scores)[-1])      # record the best (score, feature)
    print "Current features: %s" % sorted(list(good_features))

# Remove the last added feature from good_features: it is the one that
# made the score drop
good_features.remove(score_hist[-1][1])
good_features = sorted(list(good_features))
print "Selected features %s" % good_features

print "Performing hyperparameter selection..."
# Hyperparameter selection loop
score_hist = []
Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
Cvals = np.logspace(-4, 4, 15, base=2)
for C in Cvals:
    model.C = C
    score = cv_loop(Xt, y, model, N)
    score_hist.append((score, C))
    print "C: %f Mean AUC: %f" % (C, score)
bestC = sorted(score_hist)[-1][1]
print "Best C value: %f" % (bestC)

print "Performing One Hot Encoding on entire dataset..."
Xt = np.vstack((X_train_all[:, good_features], X_test_all[:, good_features]))
Xt, keymap = OneHotEncoder(Xt)
X_train = Xt[:num_train]
X_test = Xt[num_train:]

print "Training full model..."
model.C = bestC  # use the best C found above (missing in the original snippet)
model.fit(X_train, y)

print "Making prediction and saving results..."
preds = model.predict_proba(X_test)[:, 1]
create_test_submission(submit, preds)
```
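The helpers `group_data`, `OneHotEncoder`, `cv_loop`, and `create_test_submission` used above are walked through below. As a quick preview of what `group_data` does, here is a minimal self-contained toy run; the `toy` array and its values are made up for illustration (Python's `hash` is stable for tuples of ints within a run, which is all that matters here):

```python
import numpy as np
from itertools import combinations

def group_data(data, degree=3, hash=hash):
    new_data = []
    m, n = data.shape
    for indicies in combinations(range(n), degree):
        new_data.append([hash(tuple(v)) for v in data[:, indicies]])
    return np.array(new_data).T

toy = np.array([[1, 2, 3, 4],
                [1, 2, 3, 5],
                [9, 2, 3, 4]])

print(group_data(toy, degree=2).shape)  # (3, 6): one column per pair, C(4,2) = 6
print(group_data(toy, degree=3).shape)  # (3, 4): one column per triple, C(4,3) = 4

# Rows 0 and 1 agree on columns (0,1,2), so they get the same hash in the
# first triple column; row 2 differs in column 0 and gets a different one.
triples = group_data(toy, degree=3)
print(triples[0, 0] == triples[1, 0])  # True
print(triples[0, 0] == triples[2, 0])  # False
```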
Now for the helper functions, in the order they are used.

`group_data` hashes every combination of `degree` columns into a single new column. For `degree=3`, the first two data rows turn into one hash per column triple, something like:

| row | (1,2,3) | (1,3,4) | (2,3,4) | (1,2,4) |
|----:|--------:|--------:|--------:|--------:|
| 1 | 898855606 | 89885293890 | 529389012 | 2938913012 |
| 2 | 895677706 | 892345693890 | 123573464 | 23458913012 |

```python
def group_data(data, degree=3, hash=hash):
    """
    numpy.array -> numpy.array

    Groups all columns of data into all combinations of `degree`-tuples,
    hashing each tuple of values into a single identifier.
    """
    new_data = []
    # m: number of rows (train + test stacked), n: number of columns (8 here)
    m, n = data.shape
    for indicies in combinations(range(n), degree):
        new_data.append([hash(tuple(v)) for v in data[:, indicies]])
    return array(new_data).T  # transposing works because new_data becomes a numpy.array
```

Walking through `for indicies in combinations(range(n), degree)`:

- `combinations(range(8), 3)` returns the C(8,3) = 56 index triples `(0,1,2), (0,1,3), ..., (5,6,7)`.
- When `indicies = (1,3,4)`, each `v` is one row's values from columns 1, 3 and 4, e.g. `v = [85475, 123472, 118300]`.
- `hash(tuple(v))` returns something like `8988556068844375206`: a hash (an ID of sorts) standing in for that combination of values. Note that `hash(v)` on the list itself raises an error, since plain lists are unhashable.
- Doing this for every combination gives `[[hashes for columns (0,1,2)], ..., [hashes for columns (5,6,7)]]`, and `array(new_data).T` transposes it so each output row holds the 56 triple-hashes for one data row: `[[row 1's hashes], [row 2's hashes], ..., [row 32769's hashes], ...]` (the test rows follow).
- For `degree=2` the same thing happens with pairs, e.g. `hash((81625, 21375))`.

`OneHotEncoder` returns two values:

- `outdat`: a dummy-variable matrix with one row per data row and one column per category value
- `keymap`: a list of dicts mapping each column's category values to column indices, e.g. `{19721: 0, 118667: 1, 119695: 2, ..., 117887: 66}`

The output looks like this (values are examples), returned as a compressed sparse row (CSR) matrix:

| value \ keymap | 123 | 345 | 124 | ... | 24025 |
|---------------:|----:|----:|----:|:---:|------:|
| 3240 | 0 | 0 | 0 | ... | 0 |
| 123 | 1 | 0 | 0 | ... | 0 |
| 24025 | 0 | 0 | 0 | ... | 1 |

```python
def OneHotEncoder(data, keymap=None):
    """
    OneHotEncoder takes a data matrix with categorical columns and
    converts it to a sparse binary matrix.

    Returns the sparse binary matrix and a keymap mapping categories
    to indices. If a keymap is supplied on input it will be used
    instead of creating one, and any categories appearing in the data
    that are not in the keymap are ignored.
    """
    if keymap is None:
        keymap = []
        for col in data.T:
            uniques = set(list(col))
            keymap.append(dict((key, i) for i, key in enumerate(uniques)))
    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        num_labels = len(km)
        spmat = sparse.lil_matrix((total_pts, num_labels))
        for j, val in enumerate(col):
            if val in km:
                spmat[j, km[val]] = 1
        outdat.append(spmat)
    outdat = sparse.hstack(outdat).tocsr()
    return outdat, keymap
```

Running the same body by hand on a single column (column 7 of `X_train_all`):

```python
data = X_train_all[:, [7]]
keymap = None
if keymap is None:
    keymap = []
    for col in data.T:
        uniques = set(list(col))
        keymap.append(dict((key, i) for i, key in enumerate(uniques)))
total_pts = data.shape[0]
outdat = []
for i, col in enumerate(data.T):
    km = keymap[i]
    num_labels = len(km)
    spmat = sparse.lil_matrix((total_pts, num_labels))  # all-zero sparse matrix
    for j, val in enumerate(col):
        if val in km:
            spmat[j, km[val]] = 1
    outdat.append(spmat)
    print spmat.toarray()

print "output = "
print sparse.hstack(outdat).tocsr()[:3]
print keymap[0]
```

`cv_loop` returns the mean AUC averaged over N cross-validation rounds. Its arguments:

- `X`: the one-hot encoded data (which feature columns go in is decided by `feats`)
- `y`: the 32769 training labels
- `model`: the logistic regression model
- `N`: the number of cross-validation rounds

```python
# This loop essentially from Paul's starter code
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = metrics.auc_score(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc += auc
    return mean_auc / N
```
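Note that `sklearn.cross_validation` and `metrics.auc_score` were removed from scikit-learn long ago. Here is a minimal sketch of the same loop against a modern scikit-learn (assuming version >= 0.18; the name `cv_loop_modern` is mine, not the author's):

```python
from sklearn.model_selection import train_test_split  # replaces sklearn.cross_validation
from sklearn.metrics import roc_auc_score             # replaces metrics.auc_score

def cv_loop_modern(X, y, model, N, seed=25):
    """Mean ROC AUC over N random 80/20 splits, mirroring cv_loop above."""
    mean_auc = 0.0
    for i in range(N):
        X_train, X_cv, y_train, y_cv = train_test_split(
            X, y, test_size=0.20, random_state=i * seed)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        mean_auc += roc_auc_score(y_cv, preds)
    return mean_auc / N
```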
The same loop can be unrolled by hand with the `Xt` built from the selected features:

```python
mean_auc = 0.
X = Xt
N = 10
for i in range(N):
    X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
        X, y, test_size=.20, random_state=i*SEED)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_cv)[:, 1]
    auc = metrics.auc_score(y_cv, preds)
    print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
    mean_auc += auc
```

`create_test_submission` writes the predictions in the `id,ACTION` format the competition expects:

```python
def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):  # enumerate attaches the index i to each prediction
        content.append('%i,%f' % (i + 1, p))
    f = open(filename, 'w')
    f.write('\n'.join(content))
    f.close()
    print 'Saved'
```

Finally, the C candidates come from `np.logspace(-4, 4, 15, base=2)`: 15 values evenly spaced on a log scale between 2^-4 and 2^4. Plotting them shows how they bunch up at the small end:

```python
import matplotlib.pyplot as plt

n_points = 15
x1 = np.logspace(-4, 4, n_points, base=2)
print x1
plt.plot(x1, np.zeros(n_points), 'o')  # dense near 2**-4, sparse toward 2**4
plt.show()
```
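For anyone reproducing this today: the script targets Python 2 and a circa-2013 scikit-learn. Below is a rough sketch of the core encode-train-evaluate pipeline on a current scikit-learn (assuming >= 1.0), using the library's own `sklearn.preprocessing.OneHotEncoder` in place of the hand-rolled one and skipping the pair/triple hashing and greedy selection for brevity; it is an illustration, not the author's exact method:

```python
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

train_df = pd.read_csv('data/train.csv')
y = train_df['ACTION'].values
X_cat = train_df.iloc[:, 1:-1]  # the same 8 categorical columns; .ix is gone, iloc replaces it

# handle_unknown='ignore' plays the role of the keymap check above:
# categories unseen at fit time encode as all-zero rows instead of erroring.
enc = OneHotEncoder(handle_unknown='ignore')
Xt = enc.fit_transform(X_cat)   # CSR sparse matrix, like the hand-rolled encoder

model = LogisticRegression(C=1.0, solver='liblinear')
print(cross_val_score(model, Xt, y, cv=10, scoring='roc_auc').mean())
```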