from numpy import array, hstack
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations
import numpy as np
import pandas as pd
# Fixed seed used to derive per-fold random states in cv_loop.
SEED = 25
# Paths to the Kaggle train/test CSVs (relative to the notebook's cwd).
train = 'data/train.csv'
test = 'data/test.csv'
train_data = pd.read_csv(train)
test_data = pd.read_csv(test)
# Stack train and test rows, dropping the first column (ACTION label in
# train) and the last column (ROLE_CODE), so both share the same features.
# NOTE(review): DataFrame.ix was removed in modern pandas — .iloc would be
# the current equivalent; confirm the pandas version this targets.
all_data = np.vstack((train_data.ix[:,1:-1], test_data.ix[:,1:-1]))
# Number of training rows; used below to split all_data back apart.
num_train = np.shape(train_data)[0]
print "train_data.shape:", train_data.shape
print "train_data:"
print train_data.ix[:1,]
train_data.shape: (32769, 10) train_data: ACTION RESOURCE MGR_ID ROLE_ROLLUP_1 ROLE_ROLLUP_2 ROLE_DEPTNAME \ 0 1 39353 85475 117961 118300 123472 1 1 17183 1540 117961 118343 123125 ROLE_TITLE ROLE_FAMILY_DESC ROLE_FAMILY ROLE_CODE 0 117905 117906 290919 117908 1 118536 118536 308574 118539
def group_data
print "\nTransforming data…"
# Hash-based interaction features: every pair / triple of the original
# categorical columns becomes one new synthetic column (see group_data).
dp = group_data(all_data, degree=2)  # columns grouped two at a time
dt = group_data(all_data, degree=3)  # columns grouped three at a time
# Target vector (ACTION) from the training frame.
y = array(train_data.ACTION)
# Split the stacked matrix back into train rows (first num_train) and
# test rows (the remainder), for the raw, pairwise and triple features.
X = all_data[:num_train]        # training rows
X_2 = dp[:num_train]
X_3 = dt[:num_train]
X_test = all_data[num_train:]   # test rows
X_test_2 = dp[num_train:]
X_test_3 = dt[num_train:]
# Concatenate raw + degree-2 + degree-3 features column-wise.
X_train_all = np.hstack((X, X_2, X_3))
X_test_all = np.hstack((X_test, X_test_2, X_test_3))
print "X_train_all[0,5:10]: "
print X_train_all[0,5:11]
Transforming data… X_train_all[0,5:10]: [ 117905 117906 290919 3746691918769517531 3746691954495007581 3746691955141275006]
def OneHotEncoder
# Total feature count: 8 raw columns + C(8,2) pairs + C(8,3) triples = 92.
num_features = X_train_all.shape[1]
print "num_features:", num_features  # features = (8 + 8C2 + 8C3)
# Logistic regression classifier reused throughout the selection loops.
model = linear_model.LogisticRegression()
# One-hot encode each feature column separately so the greedy selection
# below can hstack arbitrary subsets; [0] keeps the sparse matrix and
# discards the keymap.
Xts = [OneHotEncoder(X_train_all[:,[i]])[0] for i in range(num_features)]
num_features: 92
def cv_loop
説明変数を順番に増やしていく
3-4時間くらいかかる. 92個から26個に変数が減る。
print "Performing greedy feature selection..."
score_hist = []
# Number of cross-validation repeats per candidate feature set.
N = 10
good_features = set([])
# Greedy forward selection: keep adding the single best new feature until
# the latest best score stops improving on the previous round (at least
# two rounds are always run). Takes roughly 3-4 hours per the author.
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []
    # Try every feature not yet selected, added on top of the current set.
    for f in range(len(Xts)):
        if f not in good_features:
            feats = list(good_features) + [f]  # candidate feature set
            Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
            score = cv_loop(Xt, y, model, N)
            scores.append((score, f))
            print "Feature: %i Mean AUC: %f" % (f, score)  # AUC with feature f added
    # Record the best-scoring candidate of this round.
    good_features.add(sorted(scores)[-1][1])
    score_hist.append(sorted(scores)[-1])
    print "Current features: %s" % sorted(list(good_features))
# Remove last added feature from good_features — the loop only exits after
# a round whose best score did NOT improve, so its feature is dropped.
good_features.remove(score_hist[-1][1])
good_features = sorted(list(good_features))
print "Selected features %s" % good_features
Performing greedy feature selection...
def cv_loop
4. と同様。ハイパーパラメータ決定後、
選ばれた説明変数で再びOneHotCoding
print "Performing hyperparameter selection..."
# Hyperparameter selection loop: sweep the logistic-regression C value
# over 15 points log-spaced between 2^-4 and 2^4, scoring each with the
# same repeated CV used for feature selection.
score_hist = []
Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
Cvals = np.logspace(-4, 4, 15, base=2)
for C in Cvals:
    model.C = C
    score = cv_loop(Xt, y, model, N)
    score_hist.append((score,C))
    print "C: %f Mean AUC: %f" %(C, score)
# Best C = the one paired with the highest mean AUC.
bestC = sorted(score_hist)[-1][1]
print "Best C value: %f" % (bestC)
print "Performing One Hot Encoding on entire dataset..."
# Re-encode train+test together on the selected columns so both share a
# single keymap (consistent one-hot column layout).
Xt = np.vstack((X_train_all[:,good_features], X_test_all[:,good_features]))
Xt, keymap = OneHotEncoder(Xt)
X_train = Xt[:num_train]
X_test = Xt[num_train:]
def create_test_submission でcsvファイル作成
print "Training full model..."
# NOTE(review): the traceback captured below shows this cell was once run
# before X_train existed — cells must be executed in order.
model.fit(X_train, y)
print "Making prediction and saving results..."
# Probability of the positive class (ACTION == 1) for each test row.
preds = model.predict_proba(X_test)[:,1]
# NOTE(review): `submit` (the output filename) is not defined anywhere in
# the visible source — presumably set in a missing cell; verify.
create_test_submission(submit, preds)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-33-d9906fe572f1> in <module>() 1 print "Training full model..." ----> 2 model.fit(X_train, y) 3 4 print "Making prediction and saving results..." 5 preds = model.predict_proba(X_test)[:,1] NameError: name 'X_train' is not defined
Training full model...
それぞれの人のdegree種類のデータの組み合せを、変数化した行列を返す(= 組み合わせたデータの数値が同じとき以外重複しない)
def group_data(data, degree=3, hash=hash):
    """
    Hash every combination of `degree` columns into one synthetic column.

    For each size-`degree` subset of columns, each row's values for those
    columns are packed into a tuple and hashed, yielding a single new
    categorical feature per subset. Two rows collide in a synthetic column
    only when their underlying value tuples hash equally.

    Parameters
    ----------
    data : numpy.ndarray of shape (m, n)
        Matrix of categorical values.
    degree : int
        Size of each column combination (default 3).
    hash : callable
        Function applied to each row tuple (default: builtin hash).

    Returns
    -------
    numpy.ndarray of shape (m, C(n, degree)), one hashed column per
    column combination.
    """
    n_cols = data.shape[1]
    grouped = [
        [hash(tuple(row)) for row in data[:, list(cols)]]
        for cols in combinations(range(n_cols), degree)
    ]
    # Built combination-major, so transpose back to row-major.
    return array(grouped).T
hash((81625,21375))
3789057782055395431
数値データを、ダミー変数化する
def OneHotEncoder(data, keymap=None):
    """
    Expand a matrix of categorical columns into a sparse one-hot matrix.

    Parameters
    ----------
    data : 2-D array
        Categorical values, one column per feature.
    keymap : list of dict, optional
        One dict per column mapping category value -> one-hot column index.
        When None, a keymap is built from the categories present in `data`.
        With a supplied keymap, values missing from it are ignored (their
        row stays all-zero for that feature).

    Returns
    -------
    (scipy.sparse.csr_matrix, keymap)
        The concatenated binary matrix and the keymap used.
    """
    if keymap is None:
        # One dict per column: category value -> dense column index.
        keymap = []
        for column in data.T:
            categories = set(list(column))
            keymap.append({cat: idx for idx, cat in enumerate(categories)})

    n_rows = data.shape[0]
    blocks = []
    for col_idx, column in enumerate(data.T):
        mapping = keymap[col_idx]
        # lil_matrix allows cheap per-cell assignment while filling.
        block = sparse.lil_matrix((n_rows, len(mapping)))
        for row_idx, value in enumerate(column):
            if value in mapping:
                block[row_idx, mapping[value]] = 1
        blocks.append(block)

    # Concatenate all per-column blocks and convert to CSR for fast math.
    return sparse.hstack(blocks).tocsr(), keymap
if keymap is None
キーマップがなかったら, 与えたデータの要素のインデックスつき辞書を作成
インデックスは enumerate()でつくられる
ROLE_FAMILY (67種類) のデータを与えたときの例↓
# Worked example of OneHotEncoder's internals on a single column
# (ROLE_FAMILY, column 7 of X_train_all — 67 distinct categories).
data = X_train_all[:,[7]]
keymap = None
if keymap is None:
    # Build the keymap exactly as OneHotEncoder does: one dict per column,
    # category value -> index, indices assigned by enumerate over the set.
    keymap = []
    for col in data.T:
        uniques = set(list(col))
        keymap.append(dict((key, i) for i, key in enumerate(uniques)))
total_pts = data.shape[0]
outdat = []
for i, col in enumerate(data.T):
    km = keymap[i]
    num_labels = len(km)
    # Sparse matrix of all zeros, one row per sample, one column per category.
    spmat = sparse.lil_matrix((total_pts, num_labels))
    for j, val in enumerate(col):
        if val in km:
            spmat[j, km[val]] = 1
    outdat.append(spmat)
print spmat.toarray()
print "output = "
print sparse.hstack(outdat).tocsr()[:3]
print keymap[0]
[[ 0. 0. 0. ..., 0. 0. 0.] [ 0. 0. 0. ..., 0. 0. 0.] [ 1. 0. 0. ..., 0. 0. 0.] ..., [ 0. 0. 0. ..., 0. 0. 0.] [ 0. 0. 0. ..., 0. 0. 0.] [ 1. 0. 0. ..., 0. 0. 0.]] output = (0, 51) 1.0 (1, 46) 1.0 (2, 0) 1.0 {19721: 0, 118667: 1, 119695: 2, 119184: 48, 270488: 4, 249618: 5, 121620: 7, 118638: 57, 136398: 41, 118295: 9, 118424: 10, 119784: 58, 131999: 12, 118704: 20, 118363: 43, 120518: 32, 143398: 16, 123689: 18, 118960: 13, 118504: 52, 122032: 19, 118643: 59, 308574: 46, 119221: 22, 119095: 23, 3130: 24, 118331: 25, 121916: 26, 118205: 27, 118453: 28, 4673: 29, 118467: 30, 6725: 31, 120134: 15, 124487: 33, 118474: 34, 118347: 35, 161100: 36, 118478: 37, 118736: 38, 19793: 39, 118612: 40, 127957: 11, 118870: 42, 123611: 14, 119772: 44, 119006: 3, 155173: 47, 292795: 49, 118372: 50, 290919: 51, 124136: 45, 149353: 53, 118762: 54, 119788: 55, 121069: 6, 120302: 8, 124145: 17, 118131: 21, 151277: 56, 132725: 60, 125407: 61, 130364: 62, 159679: 63, 254395: 64, 118398: 65, 117887: 66}
テスト用の標本を、データからランダムに20%選び、交差検定をN回行う。
# This loop essentially from Paul's starter code
def cv_loop(X, y, model, N):
    """Return the mean AUC over N random 80/20 train/validation splits.

    Each repeat draws a fresh split seeded with i*SEED, fits `model` on
    the 80% portion and scores AUC on the held-out 20%.

    NOTE(review): sklearn.cross_validation and metrics.auc_score were
    removed in later sklearn releases (replaced by
    model_selection.train_test_split and metrics.roc_auc_score) — this
    code targets an old sklearn version; confirm before upgrading.
    """
    mean_auc = 0.
    for i in range(N):
        # random_state = i*SEED makes each repeat deterministic; note
        # repeat i=0 always uses random_state 0 regardless of SEED.
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20,
            random_state = i*SEED)
        model.fit(X_train, y_train)
        # Probability of the positive class for the held-out rows.
        preds = model.predict_proba(X_cv)[:,1]
        auc = metrics.auc_score(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc += auc
    return mean_auc/N
# Inline walkthrough of cv_loop's body on the current Xt / y / model,
# printing the per-fold AUC for 10 repeated random 80/20 splits.
mean_auc = 0.
X = Xt
N = 10
for i in range(N):
    X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
        X, y, test_size=.20,
        random_state = i*SEED)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_cv)[:,1]
    auc = metrics.auc_score(y_cv, preds)
    print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
    mean_auc += auc
AUC (fold 1/10): 0.619964 AUC (fold 2/10): 0.639762 AUC (fold 3/10): 0.624336 AUC (fold 4/10): 0.635603 AUC (fold 5/10): 0.625756 AUC (fold 6/10): 0.638609 AUC (fold 7/10): 0.604183 AUC (fold 8/10): 0.610715 AUC (fold 9/10): 0.623856 AUC (fold 10/10): 0.609694
def create_test_submission(filename, prediction):
    """Write `prediction` to `filename` as a Kaggle submission CSV.

    Produces a header line 'id,ACTION' followed by one '<id>,<prob>'
    line per prediction, with ids starting at 1.
    """
    content = ['id,ACTION']
    for i, p in enumerate(prediction):  # enumerate gives 0-based i; ids are i+1
        content.append('%i,%f' %(i+1,p))
    # NOTE(review): plain open/close — the handle leaks if write() raises;
    # a `with` block would be safer.
    f = open(filename, 'w')
    f.write('\n'.join(content))
    f.close()
    print 'Saved'
# Visualize the candidate C values: N points log-spaced from 2^-4 to 2^4.
x1 = np.logspace( -4, 4, N, base = 2)
print x1
# All-zero y just to scatter the points along a horizontal line.
y = np.zeros(N)
# NOTE(review): `plt` (matplotlib.pyplot) is not imported in the visible
# source — presumably imported in a missing cell; verify.
plt.plot(x1, y, 'o')
[ 0.0625 0.11573434 0.214311 0.39685026 0.73486725 1.36079 2.5198421 4.66611616 8.64047791 16. ]
[<matplotlib.lines.Line2D at 0x108abe9d0>]
np.logspace(-4, 4, 15, base = 2)
array([ 0.0625 , 0.09287464, 0.13801119, 0.20508384, 0.30475341, 0.45286183, 0.6729501 , 1. , 1.48599429, 2.20817903, 3.28134142, 4.87605462, 7.24578931, 10.76720154, 16. ])