# %matplotlib inline   (IPython magic -- restore when running in a notebook)
# -- Plotting defaults and chart helpers ----------------------------------
import csv
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Colorbrewer2 "Dark2" qualitative palette; fall back to matplotlib's
# bundled Dark2 colors when brewer2mpl is not installed.
try:
    import brewer2mpl
    dark2_colors = brewer2mpl.get_map('Dark2', 'Qualitative', 7).mpl_colors
except ImportError:
    dark2_colors = list(plt.get_cmap('Dark2').colors)

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was removed in matplotlib 2.0; the prop cycler is the
# supported way to set the default color sequence.
rcParams['axes.prop_cycle'] = plt.cycler(color=dark2_colors)
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis
    ticks.

    The top/right/left/bottom keywords toggle whether the corresponding
    plot border is drawn.
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # Turn off all ticks ...
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # ... then re-enable only the visible sides.
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()


# -- Load the first 5000 lines of the click log in 1000-line chunks -------
path = "train_rev2"
# Read the header once; the original left this file handle open.
with open(path, "r", newline="") as fp:
    header = next(csv.reader(fp))

# Count of lines in the full file (measured once with enumerate): 47686352
results = []    # lines 0..999 of the file (line 0 is the header row)
results2 = []   # lines 1000..1999
results3 = []   # lines 2000..2999
results4 = []   # lines 3000..3999
results5 = []   # lines 4000..4999
chunks = [results, results2, results3, results4, results5]
with open(path, "r") as data:
    for counter, line in enumerate(data):
        if counter >= 5000:
            break  # only the first five 1000-line chunks are needed
        # NOTE: the original used overlapping bounds (>= 1000 and <= 1000),
        # which duplicated lines 1000/2000/3000/4000 into two chunks each;
        # integer division yields disjoint chunks.
        chunks[counter // 1000].append(line.strip().split(","))

# First chunk -> typed frame; results[0] is the header row, so skip it.
testing = pd.DataFrame(data=np.asarray(results[1:]), columns=header)
testing.click = testing.click.astype(int)
print(testing.iloc[0])  # DataFrame.irow() was removed from pandas
testing.head()


def to_timestamp(hour_str):
    """Parse a YYMMDDHH 'hour' string into a unix timestamp (seconds).

    Replaces strftime("%s"), which is a non-portable glibc extension.
    """
    return int(datetime.strptime(
        hour_str[4:6] + "/" + hour_str[2:4] + "/" + hour_str[0:2] + " "
        + hour_str[6:8], "%d/%m/%y %H").timestamp())


testing['timestamp'] = testing['hour'].map(to_timestamp)
testing.tail()

# -- One-hot frame for the categorical columns (header[3:-1]) -------------
# The get_dummies pass over an empty frame contributes the column set only
# (joining onto an empty index yields zero rows); rows are added below.
trial_text = header[3:-1]
df = pd.DataFrame()
for elem in trial_text:
    interim = pd.get_dummies(testing[elem])
    if "d41d8cd9" in interim.columns.values:
        # 'd41d8cd9' is the md5 prefix of the empty string, i.e. "missing"
        interim.drop("d41d8cd9", axis=1, inplace=True)
    interim.rename(columns=lambda x: elem + "_" + x, inplace=True)
    df = df.join(interim)

# One sparse indicator row per impression.  Collect the per-row frames and
# concat once: DataFrame.append was removed in pandas 2.0 and appending in
# a loop was quadratic anyway.
rows = []
for ix, value in enumerate(testing.iloc[:, 3:-1].values):  # .ix was removed
    mydict = {header[j + 3] + "_" + title: 1
              for j, title in enumerate(value)}
    rows.append(pd.DataFrame(mydict, index=[ix]))
df = pd.concat([df] + rows)
df.fillna(0, inplace=True)
df['timestamp'] = testing['timestamp']
print(df.shape)
df.head()

# Sampling the scatter plot functionality
fig, axes = plt.subplots(figsize=(20, 15), nrows=6, ncols=5)
fig.subplots_adjust(hspace=0.8)
fig.subplots_adjust(wspace=0.8)
bar_width = 1.
# -- Per-category impression counts with click-through rate overlaid ------
for ix in np.arange(6):
    for ix2 in np.arange(5):
        if (ix * 5 + ix2) < len(header):
            col = header[ix * 5 + ix2]
            count_data = testing.groupby(col).click.count()  # impressions
            sum_data = testing.groupby(col).click.sum()      # clicks
            percentage_data = sum_data / count_data          # CTR per level
            index = np.arange(len(count_data))
            axes[ix][ix2].bar(index, count_data, bar_width, color='b',
                              label='category count')
            axes[ix][ix2].set_ylabel("Category count")
            axes[ix][ix2].set_xlabel("Sequential Categories")
            axes[ix][ix2].set_title(col)
            remove_border(axes[ix][ix2], top=False, right=True, left=True,
                          bottom=True)
            # CTR lives on a twin axis so both scales stay readable.
            secondAxis = axes[ix][ix2].twinx()
            secondAxis.plot(index + bar_width / 2., percentage_data,
                            color='r',
                            label='percentage from category who clicked')
            secondAxis.set_ylabel("Percentage Click")
        else:
            fig.delaxes(axes[ix][ix2])

# -- Baseline models on the full one-hot frame ----------------------------
# sklearn.cross_validation was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

y = testing.click
xtrain, xtest, ytrain, ytest = train_test_split(df, y)

clf = MultinomialNB().fit(xtrain, ytrain)
print("Training accuracy: %0.2f%%" % (100 * clf.score(xtrain, ytrain)))
print("Test accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest)))
prob = clf.predict_proba(xtest)
plt.hist(prob[:, 1])
remove_border()
plt.ylabel("Count")
plt.xlabel("Probability")
plt.title("Naive Bayes Probability Histogram")

clf2 = LogisticRegression().fit(xtrain, ytrain)
print("Training accuracy: %0.2f%%" % (100 * clf2.score(xtrain, ytrain)))
print("Test accuracy: %0.2f%%" % (100 * clf2.score(xtest, ytest)))
prob = clf2.predict_proba(xtest)
plt.hist(prob[:, 1])
remove_border()
plt.ylabel("Count")
plt.xlabel("Probability")
plt.title("Logistic Regression Probability Histogram")


def _to_unix(hour_str):
    """Parse a YYMMDDHH 'hour' string into a unix timestamp (seconds)."""
    return int(datetime.strptime(
        hour_str[4:6] + "/" + hour_str[2:4] + "/" + hour_str[0:2] + " "
        + hour_str[6:8], "%d/%m/%y %H").timestamp())


def _load_chunk(rows):
    """Build a typed chunk frame: int 'click' plus a unix 'timestamp'."""
    frame = pd.DataFrame(data=np.asarray(rows), columns=header)
    frame.click = frame.click.astype(int)
    frame['timestamp'] = frame['hour'].map(_to_unix)
    return frame


testing2 = _load_chunk(results2)

# -- How many category levels in chunk 2 were never seen in chunk 1? ------
fig, axes = plt.subplots(figsize=(20, 15), nrows=6, ncols=5)
fig.subplots_adjust(hspace=0.8)
fig.subplots_adjust(wspace=0.6)
for ix in np.arange(6):
    for ix2 in np.arange(5):
        if (ix * 5 + ix2) < len(header):
            col = header[ix * 5 + ix2]
            first_count = testing[col].unique()
            first_count_len = len(first_count)
            first_count_set = set(first_count)
            second_count_set = set(testing2[col].unique())
            # levels appearing in chunk 2 only
            second_count_len = len(
                second_count_set.difference(first_count_set))
            axes[ix][ix2].bar(np.arange(2),
                              [first_count_len,
                               first_count_len + second_count_len],
                              color='b', label='count')
            axes[ix][ix2].set_title(col)
            axes[ix][ix2].set_ylabel("Count")
            axes[ix][ix2].set_xlabel("Index")
            remove_border(axes[ix][ix2])
        else:
            fig.delaxes(axes[ix][ix2])

testing3 = _load_chunk(results3)
testing4 = _load_chunk(results4)
testing5 = _load_chunk(results5)

# -- Cumulative count of distinct levels across the five chunks -----------
fig, axes = plt.subplots(figsize=(20, 15), nrows=6, ncols=5)
fig.subplots_adjust(hspace=0.8)
fig.subplots_adjust(wspace=0.6)
for ix in np.arange(6):
    for ix2 in np.arange(5):
        if (ix * 5 + ix2) < len(header):
            col = header[ix * 5 + ix2]
            seen = set()
            cumulative = []   # running union size after each chunk
            for chunk in (testing, testing2, testing3, testing4, testing5):
                seen = seen.union(set(chunk[col].unique()))
                cumulative.append(len(seen))
            axes[ix][ix2].bar(np.arange(5), cumulative, color='b',
                              label='count')
            axes[ix][ix2].set_title(col)
            axes[ix][ix2].set_ylabel("Count")
            axes[ix][ix2].set_xlabel("Index")
            remove_border(axes[ix][ix2])
        else:
            fig.delaxes(axes[ix][ix2])

# -- Reduced feature set: drop the very-high-cardinality device columns ---
trial_text_new = header[3:]
df_new = pd.DataFrame()
for elem_new in trial_text_new:
    if elem_new not in ("device_ip", "device_id", "device_model"):
        interim_new = pd.get_dummies(testing[elem_new])
        if "d41d8cd9" in interim_new.columns.values:
            # md5 prefix of the empty string == missing value
            interim_new.drop("d41d8cd9", axis=1, inplace=True)
        interim_new.rename(columns=lambda x: elem_new + "_" + x,
                           inplace=True)
        df_new = df_new.join(interim_new)
COLnames = df_new.columns.values
df_new.head()

rows_new = []
for ix, value in enumerate(testing.iloc[:, 3:-1].values):  # .ix was removed
    mydict = {}
    for j, title in enumerate(value):
        # columns 8/9/12 of the slice are the device_* fields; also skip
        # the empty-string hash level.
        if j != 8 and j != 9 and j != 12 and title != "d41d8cd9":
            mydict[header[j + 3] + "_" + title] = 1
    rows_new.append(pd.DataFrame(mydict, index=[ix]))
# DataFrame.append was removed in pandas 2.0; concat once instead.
df_new = pd.concat([df_new] + rows_new)
df_new['timestamp'] = testing['timestamp']
df_new.fillna(0, inplace=True)

y = testing.click
xtrain, xtest, ytrain, ytest = train_test_split(df_new, y)

clf = MultinomialNB().fit(xtrain, ytrain)
print("Testing Accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest)))
print("Training Accuracy: %0.2f%%" % (100 * clf.score(xtrain, ytrain)))
prob = clf.predict_proba(xtest)
plt.hist(prob[:, 1])
remove_border()
plt.ylabel("Count")
plt.xlabel("Probability")
plt.title("Naive Bayes Probability Histogram")

clf2 = LogisticRegression().fit(xtrain, ytrain)
print("Testing Accuracy: %0.2f%%" % (100 * clf2.score(xtest, ytest)))
print("Training Accuracy: %0.2f%%" % (100 * clf2.score(xtrain, ytrain)))
prob = clf2.predict_proba(xtest)
plt.hist(prob[:, 1])
remove_border()
plt.ylabel("Count")
plt.xlabel("Probability")
plt.title("Logistic Regression Probability Histogram")

# -- Variable-importance report (varinfo.txt) -----------------------------
tester = []
with open("varinfo.txt", "r") as infile:
    for row in infile:
        tester.append(row.strip("c^").split("\n")[0])

the_brain = {}     # feature name -> list of per-level scores
feature_set = []   # distinct feature names in first-seen order
totalScores = []   # (full level name, raw relative-score string e.g. '0.4%')
for value in tester[1:]:
    fields = value.split()
    totalScores.append((fields[0], fields[5]))
    feature = fields[0].rsplit('_', 1)[0]
    score = float(fields[5].strip("%"))
    if feature not in the_brain:
        feature_set.append(feature)
        the_brain[feature] = [score]
    else:
        the_brain[feature].append(score)
print(len(feature_set))

fig, axes = plt.subplots(figsize=(23, 15), nrows=6, ncols=5)
fig.subplots_adjust(hspace=0.8)
fig.subplots_adjust(wspace=0.6)
for ix in np.arange(6):
    for ix2 in np.arange(5):
        # BUG FIX: the flat index stride must equal ncols (5); the original
        # used ix*6 + ix2, skipping every sixth feature and leaving gaps.
        if (ix * 5 + ix2) < len(feature_set):
            axes[ix][ix2].hist(the_brain[feature_set[ix * 5 + ix2]],
                               bins=10)
            axes[ix][ix2].set_title(feature_set[ix * 5 + ix2])
            axes[ix][ix2].set_ylabel("Count")
            axes[ix][ix2].set_xlabel("Weight")
            remove_border(axes[ix][ix2])
        else:
            fig.delaxes(axes[ix][ix2])

# Levels whose relative score lies inside (-1%, 1%) are removal candidates.
# BUG FIX: the score is a string like '0.43%'; it must be parsed before the
# numeric comparison (the original compared str to int, which was silently
# always False in Python 2 and raises TypeError in Python 3).
lessThanAbsOne = [name for name, score in totalScores
                  if -1 < float(score.strip("%")) < 1]
with open('featureRemove.csv', 'w', newline='') as fp:
    csv.writer(fp, delimiter='\n').writerow(lessThanAbsOne)

# -- Learning curve: log loss vs number of training examples --------------
boot_res_x, boot_res_y = [], []    # bootstrap-sample runs
first_res_x, first_res_y = [], []  # single simple-model run (tag '26')
with open("log_loss_.csv", "r") as data:
    for counter, line in enumerate(data):
        parts = [e.strip("\n")
                 for e in line.replace(" ", ",").split(",") if e != ""]
        if counter > 1:  # first two lines are header/metadata
            # col 0: log loss, col 3: #examples, col 6: run tag.
            # BUG FIX: convert to float -- modern matplotlib treats string
            # inputs as categorical, which breaks ylim below.
            if parts[6] != '26':
                boot_res_y.append(float(parts[0]))
                boot_res_x.append(float(parts[3]))
            else:
                first_res_y.append(float(parts[0]))
                first_res_x.append(float(parts[3]))

# PLOTs
plt.plot(boot_res_x, boot_res_y, color="b", label="bootstrap samples")
plt.plot(first_res_x, first_res_y, color="r", label="simple model")
plt.ylim(0.2, 0.5)
remove_border()
plt.title("Log loss versus number of training examples")
plt.ylabel("Log loss")
plt.xlabel("Number of training examples")
plt.legend()