# %matplotlib inline   (IPython magic -- restore when running in a notebook)
# -- Plotting defaults and chart helpers ----------------------------------
import csv
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Colorbrewer2 "Dark2" qualitative palette; fall back to matplotlib's
# bundled Dark2 colors when brewer2mpl is not installed.
try:
    import brewer2mpl
    dark2_colors = brewer2mpl.get_map('Dark2', 'Qualitative', 7).mpl_colors
except ImportError:
    dark2_colors = list(plt.get_cmap('Dark2').colors)

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was removed in matplotlib 2.0; the prop cycler is the
# supported way to set the default color sequence.
rcParams['axes.prop_cycle'] = plt.cycler(color=dark2_colors)
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis
    ticks.

    The top/right/left/bottom keywords toggle whether the corresponding
    plot border is drawn.
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # Turn off all ticks ...
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # ... then re-enable only the visible sides.
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()


# -- Load the first 5000 lines of the click log in 1000-line chunks -------
path = "train_rev2"
# Read the header once; the original left this file handle open.
with open(path, "r", newline="") as fp:
    header = next(csv.reader(fp))

# Count of lines in the full file (measured once with enumerate): 47686352
results = []    # lines 0..999 of the file (line 0 is the header row)
results2 = []   # lines 1000..1999
results3 = []   # lines 2000..2999
results4 = []   # lines 3000..3999
results5 = []   # lines 4000..4999
chunks = [results, results2, results3, results4, results5]
with open(path, "r") as data:
    for counter, line in enumerate(data):
        if counter >= 5000:
            break  # only the first five 1000-line chunks are needed
        # NOTE: the original used overlapping bounds (>= 1000 and <= 1000),
        # which duplicated lines 1000/2000/3000/4000 into two chunks each;
        # integer division yields disjoint chunks.
        chunks[counter // 1000].append(line.strip().split(","))

# First chunk -> typed frame; results[0] is the header row, so skip it.
testing = pd.DataFrame(data=np.asarray(results[1:]), columns=header)
testing.click = testing.click.astype(int)
print(testing.iloc[0])  # DataFrame.irow() was removed from pandas
testing.head()


def to_timestamp(hour_str):
    """Parse a YYMMDDHH 'hour' string into a unix timestamp (seconds).

    Replaces strftime("%s"), which is a non-portable glibc extension.
    """
    return int(datetime.strptime(
        hour_str[4:6] + "/" + hour_str[2:4] + "/" + hour_str[0:2] + " "
        + hour_str[6:8], "%d/%m/%y %H").timestamp())


testing['timestamp'] = testing['hour'].map(to_timestamp)
testing.tail()

# -- One-hot frame for the categorical columns (header[3:-1]) -------------
# The get_dummies pass over an empty frame contributes the column set only
# (joining onto an empty index yields zero rows); rows are added below.
trial_text = header[3:-1]
df = pd.DataFrame()
for elem in trial_text:
    interim = pd.get_dummies(testing[elem])
    if "d41d8cd9" in interim.columns.values:
        # 'd41d8cd9' is the md5 prefix of the empty string, i.e. "missing"
        interim.drop("d41d8cd9", axis=1, inplace=True)
    interim.rename(columns=lambda x: elem + "_" + x, inplace=True)
    df = df.join(interim)

# One sparse indicator row per impression.  Collect the per-row frames and
# concat once: DataFrame.append was removed in pandas 2.0 and appending in
# a loop was quadratic anyway.
rows = []
for ix, value in enumerate(testing.iloc[:, 3:-1].values):  # .ix was removed
    mydict = {header[j + 3] + "_" + title: 1
              for j, title in enumerate(value)}
    rows.append(pd.DataFrame(mydict, index=[ix]))
df = pd.concat([df] + rows)
df.fillna(0, inplace=True)
df['timestamp'] = testing['timestamp']
print(df.shape)
df.head()

# Sampling the scatter plot functionality
fig, axes = plt.subplots(figsize=(20, 15), nrows=6, ncols=5)
fig.subplots_adjust(hspace=0.8)
fig.subplots_adjust(wspace=0.8)
bar_width = 1.
# -- Per-category impression counts with click-through rate overlaid ------
for ix in np.arange(6):
    for ix2 in np.arange(5):
        if (ix * 5 + ix2) < len(header):
            col = header[ix * 5 + ix2]
            count_data = testing.groupby(col).click.count()  # impressions
            sum_data = testing.groupby(col).click.sum()      # clicks
            percentage_data = sum_data / count_data          # CTR per level
            index = np.arange(len(count_data))
            axes[ix][ix2].bar(index, count_data, bar_width, color='b',
                              label='category count')
            axes[ix][ix2].set_ylabel("Category count")
            axes[ix][ix2].set_xlabel("Sequential Categories")
            axes[ix][ix2].set_title(col)
            remove_border(axes[ix][ix2], top=False, right=True, left=True,
                          bottom=True)
            # CTR lives on a twin axis so both scales stay readable.
            secondAxis = axes[ix][ix2].twinx()
            secondAxis.plot(index + bar_width / 2., percentage_data,
                            color='r',
                            label='percentage from category who clicked')
            secondAxis.set_ylabel("Percentage Click")
        else:
            fig.delaxes(axes[ix][ix2])

# -- Baseline models on the full one-hot frame ----------------------------
# sklearn.cross_validation was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

y = testing.click
xtrain, xtest, ytrain, ytest = train_test_split(df, y)

clf = MultinomialNB().fit(xtrain, ytrain)
print("Training accuracy: %0.2f%%" % (100 * clf.score(xtrain, ytrain)))
print("Test accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest)))
prob = clf.predict_proba(xtest)
plt.hist(prob[:, 1])
remove_border()
plt.ylabel("Count")
plt.xlabel("Probability")
plt.title("Naive Bayes Probability Histogram")

clf2 = LogisticRegression().fit(xtrain, ytrain)
print("Training accuracy: %0.2f%%" % (100 * clf2.score(xtrain, ytrain)))
print("Test accuracy: %0.2f%%" % (100 * clf2.score(xtest, ytest)))
prob = clf2.predict_proba(xtest)
plt.hist(prob[:, 1])
remove_border()
plt.ylabel("Count")
plt.xlabel("Probability")
plt.title("Logistic Regression Probability Histogram")


def _to_unix(hour_str):
    """Parse a YYMMDDHH 'hour' string into a unix timestamp (seconds)."""
    return int(datetime.strptime(
        hour_str[4:6] + "/" + hour_str[2:4] + "/" + hour_str[0:2] + " "
        + hour_str[6:8], "%d/%m/%y %H").timestamp())


def _load_chunk(rows):
    """Build a typed chunk frame: int 'click' plus a unix 'timestamp'."""
    frame = pd.DataFrame(data=np.asarray(rows), columns=header)
    frame.click = frame.click.astype(int)
    frame['timestamp'] = frame['hour'].map(_to_unix)
    return frame


testing2 = _load_chunk(results2)

# -- How many category levels in chunk 2 were never seen in chunk 1? ------
fig, axes = plt.subplots(figsize=(20, 15), nrows=6, ncols=5)
fig.subplots_adjust(hspace=0.8)
fig.subplots_adjust(wspace=0.6)
for ix in np.arange(6):
    for ix2 in np.arange(5):
        if (ix * 5 + ix2) < len(header):
            col = header[ix * 5 + ix2]
            first_count = testing[col].unique()
            first_count_len = len(first_count)
            first_count_set = set(first_count)
            second_count_set = set(testing2[col].unique())
            # levels appearing in chunk 2 only
            second_count_len = len(
                second_count_set.difference(first_count_set))
            axes[ix][ix2].bar(np.arange(2),
                              [first_count_len,
                               first_count_len + second_count_len],
                              color='b', label='count')
            axes[ix][ix2].set_title(col)
            axes[ix][ix2].set_ylabel("Count")
            axes[ix][ix2].set_xlabel("Index")
            remove_border(axes[ix][ix2])
        else:
            fig.delaxes(axes[ix][ix2])

testing3 = _load_chunk(results3)
testing4 = _load_chunk(results4)
testing5 = _load_chunk(results5)

# -- Cumulative count of distinct levels across the five chunks -----------
fig, axes = plt.subplots(figsize=(20, 15), nrows=6, ncols=5)
fig.subplots_adjust(hspace=0.8)
fig.subplots_adjust(wspace=0.6)
for ix in np.arange(6):
    for ix2 in np.arange(5):
        if (ix * 5 + ix2) < len(header):
            col = header[ix * 5 + ix2]
            seen = set()
            cumulative = []   # running union size after each chunk
            for chunk in (testing, testing2, testing3, testing4, testing5):
                seen = seen.union(set(chunk[col].unique()))
                cumulative.append(len(seen))
            axes[ix][ix2].bar(np.arange(5), cumulative, color='b',
                              label='count')
            axes[ix][ix2].set_title(col)
            axes[ix][ix2].set_ylabel("Count")
            axes[ix][ix2].set_xlabel("Index")
            remove_border(axes[ix][ix2])
        else:
            fig.delaxes(axes[ix][ix2])

# -- Reduced feature set: drop the very-high-cardinality device columns ---
trial_text_new = header[3:]
df_new = pd.DataFrame()
for elem_new in trial_text_new:
    if elem_new not in ("device_ip", "device_id", "device_model"):
        interim_new = pd.get_dummies(testing[elem_new])
        if "d41d8cd9" in interim_new.columns.values:
            # md5 prefix of the empty string == missing value
            interim_new.drop("d41d8cd9", axis=1, inplace=True)
        interim_new.rename(columns=lambda x: elem_new + "_" + x,
                           inplace=True)
        df_new = df_new.join(interim_new)
COLnames = df_new.columns.values
df_new.head()

rows_new = []
for ix, value in enumerate(testing.iloc[:, 3:-1].values):  # .ix was removed
    mydict = {}
    for j, title in enumerate(value):
        # columns 8/9/12 of the slice are the device_* fields; also skip
        # the empty-string hash level.
        if j != 8 and j != 9 and j != 12 and title != "d41d8cd9":
            mydict[header[j + 3] + "_" + title] = 1
    rows_new.append(pd.DataFrame(mydict, index=[ix]))
# DataFrame.append was removed in pandas 2.0; concat once instead.
df_new = pd.concat([df_new] + rows_new)
df_new['timestamp'] = testing['timestamp']
df_new.fillna(0, inplace=True)

y = testing.click
xtrain, xtest, ytrain, ytest = train_test_split(df_new, y)

clf = MultinomialNB().fit(xtrain, ytrain)
print("Testing Accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest)))
print("Training Accuracy: %0.2f%%" % (100 * clf.score(xtrain, ytrain)))
prob = clf.predict_proba(xtest)
plt.hist(prob[:, 1])
remove_border()
plt.ylabel("Count")
plt.xlabel("Probability")
plt.title("Naive Bayes Probability Histogram")

clf2 = LogisticRegression().fit(xtrain, ytrain)
print("Testing Accuracy: %0.2f%%" % (100 * clf2.score(xtest, ytest)))
print("Training Accuracy: %0.2f%%" % (100 * clf2.score(xtrain, ytrain)))
prob = clf2.predict_proba(xtest)
plt.hist(prob[:, 1])
remove_border()
plt.ylabel("Count")
plt.xlabel("Probability")
plt.title("Logistic Regression Probability Histogram")

# -- Variable-importance report (varinfo.txt) -----------------------------
tester = []
with open("varinfo.txt", "r") as infile:
    for row in infile:
        tester.append(row.strip("c^").split("\n")[0])

the_brain = {}     # feature name -> list of per-level scores
feature_set = []   # distinct feature names in first-seen order
totalScores = []   # (full level name, raw relative-score string e.g. '0.4%')
for value in tester[1:]:
    fields = value.split()
    totalScores.append((fields[0], fields[5]))
    feature = fields[0].rsplit('_', 1)[0]
    score = float(fields[5].strip("%"))
    if feature not in the_brain:
        feature_set.append(feature)
        the_brain[feature] = [score]
    else:
        the_brain[feature].append(score)
print(len(feature_set))

fig, axes = plt.subplots(figsize=(23, 15), nrows=6, ncols=5)
fig.subplots_adjust(hspace=0.8)
fig.subplots_adjust(wspace=0.6)
for ix in np.arange(6):
    for ix2 in np.arange(5):
        # BUG FIX: the flat index stride must equal ncols (5); the original
        # used ix*6 + ix2, skipping every sixth feature and leaving gaps.
        if (ix * 5 + ix2) < len(feature_set):
            axes[ix][ix2].hist(the_brain[feature_set[ix * 5 + ix2]],
                               bins=10)
            axes[ix][ix2].set_title(feature_set[ix * 5 + ix2])
            axes[ix][ix2].set_ylabel("Count")
            axes[ix][ix2].set_xlabel("Weight")
            remove_border(axes[ix][ix2])
        else:
            fig.delaxes(axes[ix][ix2])

# Levels whose relative score lies inside (-1%, 1%) are removal candidates.
# BUG FIX: the score is a string like '0.43%'; it must be parsed before the
# numeric comparison (the original compared str to int, which was silently
# always False in Python 2 and raises TypeError in Python 3).
lessThanAbsOne = [name for name, score in totalScores
                  if -1 < float(score.strip("%")) < 1]
with open('featureRemove.csv', 'w', newline='') as fp:
    csv.writer(fp, delimiter='\n').writerow(lessThanAbsOne)

# -- Learning curve: log loss vs number of training examples --------------
boot_res_x, boot_res_y = [], []    # bootstrap-sample runs
first_res_x, first_res_y = [], []  # single simple-model run (tag '26')
with open("log_loss_.csv", "r") as data:
    for counter, line in enumerate(data):
        parts = [e.strip("\n")
                 for e in line.replace(" ", ",").split(",") if e != ""]
        if counter > 1:  # first two lines are header/metadata
            # col 0: log loss, col 3: #examples, col 6: run tag.
            # BUG FIX: convert to float -- modern matplotlib treats string
            # inputs as categorical, which breaks ylim below.
            if parts[6] != '26':
                boot_res_y.append(float(parts[0]))
                boot_res_x.append(float(parts[3]))
            else:
                first_res_y.append(float(parts[0]))
                first_res_x.append(float(parts[3]))

# PLOTs
plt.plot(boot_res_x, boot_res_y, color="b", label="bootstrap samples")
plt.plot(first_res_x, first_res_y, color="r", label="simple model")
plt.ylim(0.2, 0.5)
remove_border()
plt.title("Log loss versus number of training examples")
plt.ylabel("Log loss")
plt.xlabel("Number of training examples")
plt.legend()