```python
from numpy import array, hstack
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations
import numpy as np
import pandas as pd

SEED = 25
train = 'data/train.csv'
test = 'data/test.csv'
submit = 'submission.csv'  # output file for the predictions (name is illustrative)

train_data = pd.read_csv(train)
test_data = pd.read_csv(test)

# Stack train and test, dropping the first column (ACTION / id) and the
# last column (ROLE_CODE) from each
all_data = np.vstack((train_data.ix[:, 1:-1], test_data.ix[:, 1:-1]))
num_train = np.shape(train_data)[0]

print "train_data.shape:", train_data.shape
print "train_data:"
print train_data.ix[:1, ]

print "\nTransforming data..."
dp = group_data(all_data, degree=2)  # all column pairs, each hashed into one new column (group_data is defined below)
dt = group_data(all_data, degree=3)  # all column triples, hashed the same way

y = array(train_data.ACTION)
X = all_data[:num_train]        # rows up to num_train: the training data
X_2 = dp[:num_train]
X_3 = dt[:num_train]

X_test = all_data[num_train:]   # rows from num_train on: the test data
X_test_2 = dp[num_train:]
X_test_3 = dt[num_train:]

X_train_all = np.hstack((X, X_2, X_3))
X_test_all = np.hstack((X_test, X_test_2, X_test_3))
print "X_train_all[0,5:11]:"
print X_train_all[0, 5:11]

num_features = X_train_all.shape[1]
print "num_features:", num_features  # 8 + C(8,2) + C(8,3) = 8 + 28 + 56 = 92

model = linear_model.LogisticRegression()  # the logistic regression model, used as "model" throughout

# One-hot encode each feature column separately (OneHotEncoder is defined below)
Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

print "Performing greedy feature selection..."
score_hist = []
N = 10
good_features = set([])
# Run at least two rounds, and keep going until the newest score stops
# improving on the previous one. This loop takes 3-4 hours.
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []
    # Try adding each feature not yet selected and cross-validate
    for f in range(len(Xts)):
        if f not in good_features:
            feats = list(good_features) + [f]  # candidate set = selected features + f
            Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
            score = cv_loop(Xt, y, model, N)
            scores.append((score, f))
            print "Feature: %i Mean AUC: %f" % (f, score)  # AUC with feature f added
    good_features.add(sorted(scores)[-1][1])   # keep the best-scoring feature
    score_hist.append(sorted(scores)[-1])      # record the best (score, feature)
    print "Current features: %s" % sorted(list(good_features))

# Remove the last added feature from good_features: it is the one that
# made the score drop
good_features.remove(score_hist[-1][1])
good_features = sorted(list(good_features))
print "Selected features %s" % good_features

print "Performing hyperparameter selection..."
# Hyperparameter selection loop
score_hist = []
Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
Cvals = np.logspace(-4, 4, 15, base=2)
for C in Cvals:
    model.C = C
    score = cv_loop(Xt, y, model, N)
    score_hist.append((score, C))
    print "C: %f Mean AUC: %f" % (C, score)
bestC = sorted(score_hist)[-1][1]
print "Best C value: %f" % (bestC)

print "Performing One Hot Encoding on entire dataset..."
Xt = np.vstack((X_train_all[:, good_features], X_test_all[:, good_features]))
Xt, keymap = OneHotEncoder(Xt)
X_train = Xt[:num_train]
X_test = Xt[num_train:]

print "Training full model..."
model.C = bestC  # use the best C found above (missing in the original snippet)
model.fit(X_train, y)

print "Making prediction and saving results..."
preds = model.predict_proba(X_test)[:, 1]
create_test_submission(submit, preds)
```
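The helpers `group_data`, `OneHotEncoder`, `cv_loop`, and `create_test_submission` used above are walked through below. As a quick preview of what `group_data` does, here is a minimal self-contained toy run; the `toy` array and its values are made up for illustration (Python's `hash` is stable for tuples of ints within a run, which is all that matters here):

```python
import numpy as np
from itertools import combinations

def group_data(data, degree=3, hash=hash):
    new_data = []
    m, n = data.shape
    for indicies in combinations(range(n), degree):
        new_data.append([hash(tuple(v)) for v in data[:, indicies]])
    return np.array(new_data).T

toy = np.array([[1, 2, 3, 4],
                [1, 2, 3, 5],
                [9, 2, 3, 4]])

print(group_data(toy, degree=2).shape)  # (3, 6): one column per pair, C(4,2) = 6
print(group_data(toy, degree=3).shape)  # (3, 4): one column per triple, C(4,3) = 4

# Rows 0 and 1 agree on columns (0,1,2), so they get the same hash in the
# first triple column; row 2 differs in column 0 and gets a different one.
triples = group_data(toy, degree=3)
print(triples[0, 0] == triples[1, 0])  # True
print(triples[0, 0] == triples[2, 0])  # False
```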
Now for the helper functions, in the order they are used.

`group_data` hashes every combination of `degree` columns into a single new column. For `degree=3`, the first two data rows turn into one hash per column triple, something like:

| row | (1,2,3) | (1,3,4) | (2,3,4) | (1,2,4) |
|----:|--------:|--------:|--------:|--------:|
| 1 | 898855606 | 89885293890 | 529389012 | 2938913012 |
| 2 | 895677706 | 892345693890 | 123573464 | 23458913012 |

```python
def group_data(data, degree=3, hash=hash):
    """
    numpy.array -> numpy.array

    Groups all columns of data into all combinations of `degree`-tuples,
    hashing each tuple of values into a single identifier.
    """
    new_data = []
    # m: number of rows (train + test stacked), n: number of columns (8 here)
    m, n = data.shape
    for indicies in combinations(range(n), degree):
        new_data.append([hash(tuple(v)) for v in data[:, indicies]])
    return array(new_data).T  # transposing works because new_data becomes a numpy.array
```

Walking through `for indicies in combinations(range(n), degree)`:

- `combinations(range(8), 3)` returns the C(8,3) = 56 index triples `(0,1,2), (0,1,3), ..., (5,6,7)`.
- When `indicies = (1,3,4)`, each `v` is one row's values from columns 1, 3 and 4, e.g. `v = [85475, 123472, 118300]`.
- `hash(tuple(v))` returns something like `8988556068844375206`: a hash (an ID of sorts) standing in for that combination of values. Note that `hash(v)` on the list itself raises an error, since plain lists are unhashable.
- Doing this for every combination gives `[[hashes for columns (0,1,2)], ..., [hashes for columns (5,6,7)]]`, and `array(new_data).T` transposes it so each output row holds the 56 triple-hashes for one data row: `[[row 1's hashes], [row 2's hashes], ..., [row 32769's hashes], ...]` (the test rows follow).
- For `degree=2` the same thing happens with pairs, e.g. `hash((81625, 21375))`.

`OneHotEncoder` returns two values:

- `outdat`: a dummy-variable matrix with one row per data row and one column per category value
- `keymap`: a list of dicts mapping each column's category values to column indices, e.g. `{19721: 0, 118667: 1, 119695: 2, ..., 117887: 66}`

The output looks like this (values are examples), returned as a compressed sparse row (CSR) matrix:

| value \ keymap | 123 | 345 | 124 | ... | 24025 |
|---------------:|----:|----:|----:|:---:|------:|
| 3240 | 0 | 0 | 0 | ... | 0 |
| 123 | 1 | 0 | 0 | ... | 0 |
| 24025 | 0 | 0 | 0 | ... | 1 |

```python
def OneHotEncoder(data, keymap=None):
    """
    OneHotEncoder takes a data matrix with categorical columns and
    converts it to a sparse binary matrix.

    Returns the sparse binary matrix and a keymap mapping categories
    to indices. If a keymap is supplied on input it will be used
    instead of creating one, and any categories appearing in the data
    that are not in the keymap are ignored.
    """
    if keymap is None:
        keymap = []
        for col in data.T:
            uniques = set(list(col))
            keymap.append(dict((key, i) for i, key in enumerate(uniques)))
    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        num_labels = len(km)
        spmat = sparse.lil_matrix((total_pts, num_labels))
        for j, val in enumerate(col):
            if val in km:
                spmat[j, km[val]] = 1
        outdat.append(spmat)
    outdat = sparse.hstack(outdat).tocsr()
    return outdat, keymap
```

Running the same body by hand on a single column (column 7 of `X_train_all`):

```python
data = X_train_all[:, [7]]
keymap = None
if keymap is None:
    keymap = []
    for col in data.T:
        uniques = set(list(col))
        keymap.append(dict((key, i) for i, key in enumerate(uniques)))
total_pts = data.shape[0]
outdat = []
for i, col in enumerate(data.T):
    km = keymap[i]
    num_labels = len(km)
    spmat = sparse.lil_matrix((total_pts, num_labels))  # all-zero sparse matrix
    for j, val in enumerate(col):
        if val in km:
            spmat[j, km[val]] = 1
    outdat.append(spmat)
    print spmat.toarray()

print "output = "
print sparse.hstack(outdat).tocsr()[:3]
print keymap[0]
```

`cv_loop` returns the mean AUC averaged over N cross-validation rounds. Its arguments:

- `X`: the one-hot encoded data (which feature columns go in is decided by `feats`)
- `y`: the 32769 training labels
- `model`: the logistic regression model
- `N`: the number of cross-validation rounds

```python
# This loop essentially from Paul's starter code
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = metrics.auc_score(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
        mean_auc += auc
    return mean_auc / N
```
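Note that `sklearn.cross_validation` and `metrics.auc_score` were removed from scikit-learn long ago. Here is a minimal sketch of the same loop against a modern scikit-learn (assuming version >= 0.18; the name `cv_loop_modern` is mine, not the author's):

```python
from sklearn.model_selection import train_test_split  # replaces sklearn.cross_validation
from sklearn.metrics import roc_auc_score             # replaces metrics.auc_score

def cv_loop_modern(X, y, model, N, seed=25):
    """Mean ROC AUC over N random 80/20 splits, mirroring cv_loop above."""
    mean_auc = 0.0
    for i in range(N):
        X_train, X_cv, y_train, y_cv = train_test_split(
            X, y, test_size=0.20, random_state=i * seed)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        mean_auc += roc_auc_score(y_cv, preds)
    return mean_auc / N
```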
The same loop can be unrolled by hand with the `Xt` built from the selected features:

```python
mean_auc = 0.
X = Xt
N = 10
for i in range(N):
    X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
        X, y, test_size=.20, random_state=i*SEED)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_cv)[:, 1]
    auc = metrics.auc_score(y_cv, preds)
    print "AUC (fold %d/%d): %f" % (i + 1, N, auc)
    mean_auc += auc
```

`create_test_submission` writes the predictions in the `id,ACTION` format the competition expects:

```python
def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):  # enumerate attaches the index i to each prediction
        content.append('%i,%f' % (i + 1, p))
    f = open(filename, 'w')
    f.write('\n'.join(content))
    f.close()
    print 'Saved'
```

Finally, the C candidates come from `np.logspace(-4, 4, 15, base=2)`: 15 values evenly spaced on a log scale between 2^-4 and 2^4. Plotting them shows how they bunch up at the small end:

```python
import matplotlib.pyplot as plt

n_points = 15
x1 = np.logspace(-4, 4, n_points, base=2)
print x1
plt.plot(x1, np.zeros(n_points), 'o')  # dense near 2**-4, sparse toward 2**4
plt.show()
```
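For anyone reproducing this today: the script targets Python 2 and a circa-2013 scikit-learn. Below is a rough sketch of the core encode-train-evaluate pipeline on a current scikit-learn (assuming >= 1.0), using the library's own `sklearn.preprocessing.OneHotEncoder` in place of the hand-rolled one and skipping the pair/triple hashing and greedy selection for brevity; it is an illustration, not the author's exact method:

```python
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

train_df = pd.read_csv('data/train.csv')
y = train_df['ACTION'].values
X_cat = train_df.iloc[:, 1:-1]  # the same 8 categorical columns; .ix is gone, iloc replaces it

# handle_unknown='ignore' plays the role of the keymap check above:
# categories unseen at fit time encode as all-zero rows instead of erroring.
enc = OneHotEncoder(handle_unknown='ignore')
Xt = enc.fit_transform(X_cat)   # CSR sparse matrix, like the hand-rolled encoder

model = LogisticRegression(C=1.0, solver='liblinear')
print(cross_val_score(model, Xt, y, cv=10, scoring='roc_auc').mean())
```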