# !pip install Quandl
%matplotlib inline
import numpy as np
import numpy
import pandas as pd
import math
import scipy
import random
import Quandl
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import string
import calendar
import datetime
from pandas.tseries.offsets import *
import operator
from itertools import combinations, permutations
from matplotlib import rcParams
import scipy.stats as stats
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
import sklearn

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

#http://colorbrewer2.org/
brewer_rg = ['#E41A1C', '#4DAF4A']

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'

def remove_border(ax=None, axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    if ax is None:
        ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    #remove grid
    ax.grid(False)

    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

# Quandl API key
authtoken = 'Cx1CtXeu61zjTzpehmNV'
'''
input: sources - list of data source titles from Quandl, ex. 'FRED' for Federal Reserve Economic Data
function: scrapes the Quandl website using its search API to find the Quandl code for every variable
          from a given source
return: data - dictionary with names and Quandl codes for every variable from the supplied sources
'''
def scrape_quandl(sources):
    data = {}
    for source in sources:
        for j in range(1, 100):
            data_options = Quandl.search(query="", source=source, page=j)
            for i in data_options:
                data[i["code"]] = i["code"]
    #Adds the S and P to the dataset so that it is always included
    data['s_and_p_500'] = 'YAHOO/INDEX_GSPC.6'
    return data

'''
get_quandl_data
input: dict of Quandl codes
function: downloads Quandl data
          sets the column name of each dataframe to its respective key from the dict
return: list of dataframes
'''
def get_quandl_data(quandl_codes, cols=True):
    data_list = []
    for number, code in enumerate(quandl_codes.values()):
        data_holder = Quandl.get([code], authtoken=authtoken)
        if (cols == True):
            data_holder.columns = [quandl_codes.keys()[number]]
        data_list.append(data_holder)
    return data_list

'''
collapser
input: dataframe of Quandl data
function: adds rows for every business day during the timeframe of that dataset,
          performs linear interpolation to fill in all missing values,
          then drops all values except for Mondays
return: cleaned dataframe of Quandl data
'''
def collapser(df):
    df = df.asfreq(BDay())
    df = df.apply(pd.Series.interpolate)
    df = df.asfreq(Week(weekday=1))
    return df

'''
merger
input: list of dataframes of Quandl data
function: runs collapser on each dataframe,
          finds the latest start date and earliest end date across all dataframes and truncates at those dates,
          concatenates all datasets into a single dataframe
return: dataframe of Quandl data of uniform length with no missing values
        start and end date of dataframe
'''
def merger(data_list):
    begin_date = datetime.datetime(1900, 1, 1, 12, 13, 14)
    end_date = datetime.datetime(2100, 1, 1, 12, 13, 14)
    for i in reversed(range(len(data_list))):
        try:
            data_list[i] = collapser(data_list[i])
            if (data_list[i].index[0] > datetime.datetime(2000, 1, 1, 12, 13, 14)):
                del data_list[i]
            # NOTE: the cutoff in the next test was lost from this copy; 2012 is assumed here,
            # matching the "at least 12 years of data" rule described in prepare_data
            elif (data_list[i].index[-1] < datetime.datetime(2012, 1, 1, 12, 13, 14)):
                del data_list[i]
            elif (data_list[i].index[0] > begin_date):
                begin_date = data_list[i].index[0]
            elif (data_list[i].index[-1] < end_date):
                end_date = data_list[i].index[-1]
        except:
            del data_list[i]
    # NOTE: the tail of this function was garbled in this copy; reconstructed from the
    # docstring above and from how merger is called below
    for i in range(len(data_list)):
        data_list[i] = data_list[i].truncate(before=begin_date, after=end_date)
    df = pd.concat(data_list, axis=1)
    return df, (begin_date, end_date)

# NOTE: the original definitions of percentageDifference, dataframe_to_array, and the start
# of biner were garbled in this copy; the versions below are reconstructed from their call
# sites and the surviving fragment of biner, so treat them as sketches of the originals.

'''
percentageDifference
input: dataframe and a list of column names
function: adds a week-over-week percent change column for each listed column
return: dataframe with the new '_pdiff' columns (the first row, which has no previous
        week to difference against, is dropped)
'''
def percentageDifference(df, cols):
    for col in cols:
        title = '%s_pdiff' % col
        df[title] = float('nan')
        for i in range(1, len(df)):
            df[title][i] = (df[col][i] - df[col][i-1]) / df[col][i-1]
    return df[1:]

'''
dataframe_to_array
input: dataframe, list of feature columns, list holding the target column
function: pulls the named columns out of the dataframe, target last
return: numpy array whose final column is the target
'''
def dataframe_to_array(df, cols, target):
    return df[list(cols) + list(target)].values

'''
biner
input: dataframe, list of column names, list of bin edges, fill value for rows outside a bin
function: for each column and each pair of adjacent bin edges, adds a 0/1 column that is 1
          where the value falls inside the bin
return: dataframe with the new columns and the list of added column names
'''
def biner(df, cols, bins, fill=0):
    add_cols = []
    for col in cols:
        for i in range(len(bins) - 1):
            title = '%s_b_%.2f-%.2f' % (col, bins[i], bins[i+1])
            df[title] = fill
            df[title][(df[col] >= bins[i]) & (df[col] < bins[i+1])] = 1
            add_cols.append(title)
    return df, add_cols

def advance_cols(df, cols):
    for col in cols:
        title = '%s_adv' % col
        df[title] = float('nan')
        for i in range(len(df) - 1):
            df[title][i] = df[col][i+1]
    return df[:-1]

'''
input: data - dictionary with names and Quandl codes for every variable from the supplied sources
function: calls for a pd dataframe, cleans and interpolates the data so that it is all weekly,
          deletes datasets without at least 12 years of data or any that throw an error,
          then adds the prediction for next week column
return: dataset - Pandas dataframe with columns for every variable
'''
def prepare_data(data):
    list_of_data = get_quandl_data(data, cols=False)
    dataset, dates = merger(list_of_data)
    dataset = percentageDifference(dataset, ['YAHOO.INDEX_GSPC - Adjusted Close'])
    dataset, add_cols = biner(dataset, ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff'], [0.000001, 100.], 0)
    dataset = advance_cols(dataset, add_cols)
    return dataset
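# Illustration (not from the original notebook): a toy run of the labelling chain above
# with made-up prices, showing how pdiff -> biner -> advance_cols turns a weekly price
# series into a 0/1 "does it rise next week?" target.
toy = pd.DataFrame({'price': [100., 101., 99., 102.]},
                   index=pd.date_range('2013-01-01', periods=4, freq='W'))
toy = percentageDifference(toy, ['price'])                         # weekly percent change
toy, toy_cols = biner(toy, ['price_pdiff'], [0.000001, 100.], 0)   # 1 = up week
toy = advance_cols(toy, toy_cols)                                  # shift the label back one week
print toy[['price_pdiff', toy_cols[0] + '_adv']]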
'''
input: dataset - Pandas dataframe with columns for every variable
function: Goes variable by variable and tests if adding that variable improves the score of the
          Naive Bayes model. If a variable improves the model it is kept; if it leaves the model
          the same or makes it worse, it is ignored
return: columns_keep - list of Quandl codes that made it into the final model
                       (the columns that were useful for prediction)
        k - float with the final score of the most accurate model
'''
def NB_Add_Drop(dataset):
    validate = 0
    #list_of_data = get_quandl_data(data)
    #dataset, dates, num_col = merger(list_of_data)
    #dataset = percentageDifference(dataset, ['s_and_p_500'])
    #dataset, add_cols = biner(dataset, ['s_and_p_500_pdiff'], [0.000001, 100.])
    #dataset = advance_cols(dataset, add_cols)
    columns_keep = ['YAHOO.INDEX_GSPC - Adjusted Close']
    for i in dataset.columns:
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff'):
            continue
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'):
            continue
        try:
            columns_keep.append(i)
            data_array = dataframe_to_array(dataset, columns_keep,
                                            ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'])
            data_array_in = data_array[:, :-1]
            data_array_out = data_array[:, -1:]
            #Converts the x by 1 by 1 array into an x by 1 array
            data_array_out = data_array_out.reshape(data_array_out.shape[0])
            mean_list = []
            for j in range(0, 1000):
                xtrain, xtest, ytrain, ytest = train_test_split(data_array_in, data_array_out)
                clf = MultinomialNB().fit(xtrain, ytrain)
                mean_list.append(clf.score(xtest, ytest))
            k = sum(mean_list) / float(len(mean_list))
            if (k <= validate):
                columns_keep.remove(i)
            else:
                validate = k
        except:
            columns_keep.remove(i)
            continue
    return columns_keep, k

'''
NB_All_Vars
input: dataset - Pandas dataframe with columns for every variable
       tops - int, number of top variables you want returned
function: Runs a Naive Bayes that looks at each variable and the S and P and returns the
          top x variables as specified
return: top_regressions - dict mapping each of the top variables to the score it achieved
'''
def NB_All_Vars(dataset, tops):
    columns_keep = ['YAHOO.INDEX_GSPC - Adjusted Close']
    regressions = {}
    for i in dataset.columns:
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff'):
            continue
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'):
            continue
        try:
            columns_keep.append(i)
            data_array = dataframe_to_array(dataset, columns_keep,
                                            ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'])
            data_array_in = data_array[:, :-1]
            data_array_out = data_array[:, -1:]
            #Converts the x by 1 by 1 array into an x by 1 array
            data_array_out = data_array_out.reshape(data_array_out.shape[0])
            mean_list = []
            for j in range(0, 1000):
                xtrain, xtest, ytrain, ytest = train_test_split(data_array_in, data_array_out)
                clf = MultinomialNB().fit(xtrain, ytrain)
                mean_list.append(clf.score(xtest, ytest))
            k = sum(mean_list) / float(len(mean_list))
            regressions[i] = k
            columns_keep.remove(i)
        except:
            columns_keep.remove(i)
            continue
    top_regressions = dict(sorted(regressions.iteritems(), key=operator.itemgetter(1), reverse=True)[:tops])
    return top_regressions
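# NB_Add_Drop and NB_All_Vars above (and NB_Add_Drop_Top below) all score a candidate column
# set the same way: MultinomialNB accuracy averaged over many random train/test splits.
# This helper is not part of the original notebook, just a sketch of that shared step under
# the same assumptions (target column last, labels 0/1).
def nb_mean_score(dataset, columns_keep, target_col, n_splits=1000):
    data_array = dataframe_to_array(dataset, columns_keep, [target_col])
    features = data_array[:, :-1]
    labels = data_array[:, -1:].reshape(data_array.shape[0])
    scores = []
    for _ in range(n_splits):
        xtrain, xtest, ytrain, ytest = train_test_split(features, labels)
        scores.append(MultinomialNB().fit(xtrain, ytrain).score(xtest, ytest))
    return sum(scores) / float(len(scores))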
'''
NB_Add_Drop_Top
input: dataset - Pandas dataframe with columns for every variable
       top_regressions - dict of the top variables returned by NB_All_Vars
function: Runs the Add_Drop Naive Bayes model but only on the variables returned in top_regressions
return: columns_keep - list of Quandl codes that made it into the final model
                       (the columns that were useful for prediction)
        k - float with the final score of the most accurate model
'''
def NB_Add_Drop_Top(dataset, top_regressions):
    validate = 0
    columns_keep = ['YAHOO.INDEX_GSPC - Adjusted Close']
    for i in top_regressions.keys():
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff'):
            continue
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'):
            continue
        try:
            columns_keep.append(i)
            print columns_keep
            data_array = dataframe_to_array(dataset, columns_keep,
                                            ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'])
            data_array_in = data_array[:, :-1]
            data_array_out = data_array[:, -1:]
            #Converts the x by 1 by 1 array into an x by 1 array
            data_array_out = data_array_out.reshape(data_array_out.shape[0])
            mean_list = []
            for j in range(0, 1000):
                xtrain, xtest, ytrain, ytest = train_test_split(data_array_in, data_array_out)
                clf = MultinomialNB().fit(xtrain, ytrain)
                mean_list.append(clf.score(xtest, ytest))
            k = sum(mean_list) / float(len(mean_list))
            if (k <= validate):
                columns_keep.remove(i)
            else:
                validate = k
        except:
            columns_keep.remove(i)
            continue
    return columns_keep, k

data_NB = scrape_quandl(["FRED", "OFDP"])
datalist = prepare_data(data_NB)
top_regressions = NB_All_Vars(datalist, 20)
final_NB = NB_Add_Drop_Top(datalist, top_regressions)

'''
dict of all the Quandl datasets we will use
the key will be used as the column name for convenience
'''
data = {'gold_price': 'BUNDESBANK/BBK01_WT5511.1',
        'usd_to_pound': 'QUANDL/USDGBP.1',
        'cpi': 'FRED/CPIAUCSL.1',
        'unemployment': 'FRED/UNRATE.1',
        'gas_price': 'BTS_MM/RETAILGAS.1',
        'volatility': 'YAHOO/INDEX_VIX.6',
        'house_sales': 'FRED/HSN1F.1',
        'usd_to_euro': 'QUANDL/USDEUR.1',
        'crude_futures': 'OFDP/FUTURE_CL2.1',
        'housing_prices': 'FRED/ASPTFC.1',
        'gas_futures': 'OFDP/FUTURE_NG1.2',
        '10-year_treasury': 'FRED/DGS10',
        'corporate_profits': 'FRED/CP',
        'wheat_futures': 'OFDP/FUTURE_W1.5',
        'gdp': 'FRED/GDP',
        'treasury_futures': 'OFDP/FUTURE_US1.1',
        'inventories_sales_ratio': 'FRED/ISRATIO',
        'iron_ore': 'WORLDBANK/WLD_IRON_ORE.1',
        'retail_and_food': 'FRED/RSAFS.1',
        's_and_p_500': 'YAHOO/INDEX_GSPC.6'}

list_of_data = get_quandl_data(data)
dataset, dates = merger(list_of_data)
dataset = percentageDifference(dataset, data.keys())
dataset, add_cols = biner(dataset, ['s_and_p_500_pdiff'], [0.000001, 100.])
dataset = advance_cols(dataset, add_cols)

# sanity check
assert len(dataset[np.logical_not(np.isfinite(dataset)).any(axis=1)]) == 0

inc = dataset['s_and_p_500_pdiff'][dataset['s_and_p_500_pdiff'] > 0]
dec = dataset['s_and_p_500_pdiff'][dataset['s_and_p_500_pdiff'] <= 0]
mean = dataset['s_and_p_500_pdiff'].mean()
std = dataset['s_and_p_500_pdiff'].std()
in_one_std = dataset['s_and_p_500_pdiff'][(dataset['s_and_p_500_pdiff'] >= (mean - std)) &
                                          (dataset['s_and_p_500_pdiff'] <= (mean + std))]
out_one_std = dataset['s_and_p_500_pdiff'][(dataset['s_and_p_500_pdiff'] <= (mean - std)) |
                                           (dataset['s_and_p_500_pdiff'] >= (mean + std))]
int_inc = pd.concat([inc, out_one_std], axis=1, join='inner').ix[:, 0]
int_dec = pd.concat([dec, out_one_std], axis=1, join='inner').ix[:, 0]
extreme = dataset.sort_index(by=['s_and_p_500_pdiff'], ascending=False).s_and_p_500_pdiff
print_num = 5
extreme = pd.concat([extreme[:print_num], extreme[-print_num:]])
print 'the s&p 500 increased %d weeks (%.2f%%) and decreased %d weeks (%.2f%%) between %s and %s' % (
    len(inc), (float(len(inc)) / len(dataset)) * 100, len(dec), (float(len(dec)) / len(dataset)) * 100,
    dates[0].strftime('%B %d, %Y'), dates[1].strftime('%B %d, %Y'))
print 'over the whole period the s & p 500 increased by %.2f%%' % (
    (dataset['s_and_p_500'][-1] - dataset['s_and_p_500'][0]) / dataset['s_and_p_500'][0] * 100)
print 'the average weekly increase was %.2f%% with a standard deviation of %.2f%%' % (
    (inc.mean() * 100), inc.std() * 100)
print 'the average weekly decrease was %.2f%% with a standard deviation of %.2f%%' % (
    abs(dec.mean() * 100), dec.std() * 100)
print 'the average weekly change over the whole period was %.2f%% with a standard deviation of %.2f%%' % (
    mean * 100, std * 100)
print 'the weekly change was outside 1 standard deviation of the mean %d times' % len(out_one_std)
print 'of those %d of them were increases and had an average change of %.2f%% and a standard deviation of %.2f%%' % (
    len(int_inc), int_inc.mean() * 100, int_inc.std() * 100)
print 'and %d of them were decreases and had an average change of %.2f%% and a standard deviation of %.2f%%' % (
    len(int_dec), int_dec.mean() * 100, int_dec.std() * 100)
print
print 'the %d most extreme increases/decreases were' % print_num
for i in range(len(extreme)):
    print '-' * 25
    print extreme.index[i].strftime('%B %d, %Y')
    print '%.2f%%' % (extreme[i] * 100)

plt.figure()
ax = dataset['s_and_p_500_pdiff'].hist(bins=int(np.sqrt(len(dataset))))
plt.xlabel('weekly percent change of the s&p 500')
plt.ylabel('frequency')
plt.title('weekly percent change of the s&p 500')
plt.vlines((mean + std), 0, 100, label=('+/-1 standard deviation'))
plt.vlines((mean - std), 0, 100)
plt.legend(frameon=False)
remove_border(ax)

'''
input: column name
function: removes '_' from column name
return: clean column name
'''
def clean_column_name(col):
    return ' '.join(col.split('_'))

'''
input: dataframe
       a list of columns (y axis)
       target column (x axis)
function: plots each column of the dataframe against the target column
          overplots a linear fit
return: a list of column names sorted by correlation coefficient
        dict of the slopes of the linear regression line for each column
'''
def scatter_plot(df, cols, target):
    xaxis = target
    yaxis = cols
    ff = 5 * (len(yaxis) - 1)
    fig, axes = plt.subplots(nrows=len(yaxis), figsize=(5, ff))
    Rs = []
    slope = {}
    for i, y in enumerate(yaxis):
        axes[i].scatter(df[xaxis], df[y])
        m, b, r, p, se = scipy.stats.linregress(df[xaxis], df[y])
        lab = 'r^2 = %.2f \np = %.2f' % (r * r, p)
        axes[i].plot(df[xaxis], np.array(df[xaxis]) * m + b, label=lab)
        axes[i].set_xlabel(clean_column_name(xaxis))
        axes[i].set_ylabel(clean_column_name(y))
        axes[i].legend(frameon=False)
        # axes[i].set_title(lab)
        remove_border(axes[i])
        Rs.append((y, (r * r)))
        slope[y] = m
    fig.tight_layout()
    return sorted(Rs, key=operator.itemgetter(1), reverse=True), slope

r, slope = scatter_plot(dataset, data.keys(), 's_and_p_500')
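# The sorted R^2 list returned by scatter_plot is never displayed in the original; purely as
# an illustration, the variables with the strongest linear relationship to the S&P 500 could
# be listed like this:
for name, r_squared in r[:3]:
    print '%s: r^2 = %.2f' % (clean_column_name(name), r_squared)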
'''
input: dataset
function: checks for collinearity between all variables
          excludes percent difference, binned, and advanced columns by default
return: sorted list of tuples of column names and their correlation coefficient
'''
def colin(df, extra=False):
    cols = list(df.columns)
    if extra == False:
        cols = [c for c in list(df.columns)
                if not (c.endswith('_pdiff') or '_b_' in c or c.endswith('_adv'))]
    cols = combinations(cols, 2)
    Rs = []
    for col in cols:
        m, b, r, p, se = scipy.stats.linregress(df[col[0]], df[col[1]])
        Rs.append((col, (r * r)))
    return sorted(Rs, key=operator.itemgetter(1), reverse=True)

print_num = 5
print 'top %d most collinear' % print_num
for i in colin(dataset)[:print_num]:
    print '-' * 25
    print clean_column_name(i[0][0])
    print clean_column_name(i[0][1])
    print i[1]

'''
input: dataframe
       column name
       optional begin and end date
function: creates a line graph of a specific column
          if the data increases week to week, fill green
          if the data decreases week to week, fill red
return: none
'''
def filled_plt(df, col, begin_date=None, end_date=None):
    if begin_date == None:
        begin_date = df.index[0]
    if end_date == None:
        end_date = df.index[-1]
    temp = df.truncate(before=begin_date, after=end_date)
    for i in range(len(temp) - 1):
        x = temp.index[i:i+2]
        y = temp[col][i:i+2]
        c = brewer_rg[1]
        if y[0] >= y[1]:
            c = brewer_rg[0]
        plt.fill_between(x, y, color=c)
    ymin = min([0, temp[col].min()])
    plt.ylim(ymin, temp[col].max())
    plt.xticks(rotation='vertical')
    plt.xlabel('date')
    plt.ylabel('value')
    plt.title('%s between %s and %s' % (clean_column_name(col),
                                        begin_date.strftime('%B %d, %Y'),
                                        end_date.strftime('%B %d, %Y')))
    remove_border()

filled_plt(dataset, 's_and_p_500', begin_date=datetime.datetime(2011,6,7,0,0,0))#, end_date=datetime.datetime(2010,1,1,0,0,0))

'''
input: dataframe
       column name to compare to (x axis)
       list of column names to check agreement with
       dict of slopes of regression lines for each dataset against the target
       optional begin and end date
function: creates a line graph of a specific column
          if the target data is in agreement with all specified other datasets, fill green
          agreement is defined by the slope of the regression line
          (+ means the two datapoints change in the same direction week over week,
           - means the two datapoints change in opposite directions week over week)
return: the number of times all specified datasets were in agreement with the target dataset
        total number of weeks that were checked
'''
def agreement_plt(df, target, var, dct, begin_date=None, end_date=None):
    if begin_date == None:
        begin_date = df.index[0]
    if end_date == None:
        end_date = df.index[-1]
    temp = df.truncate(before=begin_date, after=end_date)
    agree = 0.
    total = 0.
    title = ''
    for v in var:
        title += ' %s,' % clean_column_name(v)
    for i in range(len(temp) - 1):
        x = temp.index[i:i+2]
        y = temp[target][i:i+2]
        c = dark2_colors[0]
        t = []
        for v in var:
            if (dct[v] > 0):
                if ((y[0] >= y[1]) & (temp[v][i] >= temp[v][i+1])) | ((y[0] <= y[1]) & (temp[v][i] <= temp[v][i+1])):
                    t.append(0)
                else:
                    t.append(1)
            else:
                if ((y[0] >= y[1]) & (temp[v][i] <= temp[v][i+1])) | ((y[0] <= y[1]) & (temp[v][i] >= temp[v][i+1])):
                    t.append(0)
                else:
                    t.append(1)
        if (sum(t) == 0):
            plt.fill_between(x, y, color=c)
            agree += 1
            total += 1
        else:
            plt.plot(x, y, color=dark2_colors[1])
            total += 1
    ymin = min([0, temp[target].min()])
    plt.ylim(ymin, temp[target].max())
    plt.xticks(rotation='vertical')
    plt.xlabel('date')
    plt.ylabel('value')
    plt.title('trends of %s and %s between %s and %s' % (title, clean_column_name(target),
                                                         begin_date.strftime('%B %d, %Y'),
                                                         end_date.strftime('%B %d, %Y')))
    remove_border()
    return (agree, total)

agree = agreement_plt(dataset, 's_and_p_500', data.keys()[0:2], slope,
                      begin_date=datetime.datetime(2007,6,7,0,0,0),
                      end_date=datetime.datetime(2010,1,1,0,0,0))
print 'they are in agreement %.2f%% of the time' % (agree[0] / agree[1] * 100)

datakeys = [x + "_pdiff" for x in data.keys()]
data_array = dataframe_to_array(dataset, datakeys, ['s_and_p_500_pdiff_b_0.00-100.00_adv'])
training = data_array[0:-100]
testing = data_array[-100:]
training_rows = len(training)
training_columns = len(training[0])
testing_rows = len(testing)
testing_columns = len(testing[0])
split = len(dataset) - len(testing)
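# Illustrative check (not in the original): each row of training/testing holds the weekly
# pdiff features followed, in the last column, by the 0/1 label produced by biner and
# advance_cols, i.e. whether the S&P 500 rises the following week.
assert set(numpy.unique(training[:, -1])) <= set([0.0, 1.0])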
#Neural Network Class
class Neural_Net:

    #constructor initializes a new neural network with randomly selected weights and
    #pre-specified height, and number of neurons per layer
    def __init__(self, non, height):
        #list to store the number of neurons in each layer of the network
        self.num_of_neurons = non
        #height of the network
        self.L = height
        #array to store the weights of the network, indexed by layer, output neuron, input neuron
        self.weights = numpy.zeros(shape=((non[0]+1), (non[0]+1), (non[0]+1)))
        #delta_matrix: stores the gradient that is used in backpropagation
        self.deltas = numpy.zeros(shape=((non[0]+1), (non[0]+1)))
        #matrix that stores thresholded signals
        self.signals = numpy.zeros(shape=((non[0]+1), (non[0]+1)))
        #(tunable) learning_rate used in backpropagation
        self.learning_rate = .00001
        #initialize weights to be between -2 and 2
        for i in range(1, self.L+1):
            for j in range(1, self.num_of_neurons[i]+1):
                for k in range(self.num_of_neurons[i-1]+1):
                    self.weights[i][j][k] = (random.randrange(-2, 2)) #changed to adjust

    #forward_pass computes the output of the neural network given an input
    def forward_pass(self, x):
        #(for convenience, we index neurons starting at 1 instead of zero)
        self.signals[0][0] = -1
        for i in range(1, self.num_of_neurons[0]+1):
            self.signals[0][i] = x[i-1]
        for i in range(1, self.L+1):
            self.signals[i][0] = -1
            for j in range(1, self.num_of_neurons[i]+1):
                self.signals[i][j] = self.compute_signal(i, j)
        return self.signals[self.L][1]

    #tune_weights performs the backpropagation algorithm given a training example as input
    def tune_weights(self, y):
        self.deltas[self.L][1] = 2*(self.signals[self.L][1]-y)*(1-math.pow(self.signals[self.L][1], 2))
        for i in range(self.L-1, 0, -1):
            for j in range(1, self.num_of_neurons[i]+1):
                self.deltas[i][j] = self.compute_delta(i, j)
        for i in range(1, self.L+1):
            for j in range(1, self.num_of_neurons[i]+1):
                for k in range(self.num_of_neurons[i-1]+1):
                    self.weights[i][j][k] = self.weights[i][j][k] - self.learning_rate*self.signals[i-1][k]*self.deltas[i][j]

    #compute_signal: computes the signal s for a given neuron at a given level
    def compute_signal(self, level, neuron):
        s = 0
        for i in range(self.num_of_neurons[level-1]+1):
            s += self.weights[level][neuron][i]*self.signals[level-1][i]
        return self.g(s)

    #compute_delta: computes the delta for a given neuron at a given level
    def compute_delta(self, level, neuron):
        s = 0
        for j in range(1, self.num_of_neurons[level+1]+1):
            s += self.weights[level+1][j][neuron]*self.deltas[level+1][j]
        return (1-math.pow(self.signals[level][neuron], 2))*s

    #soft threshold function
    def g(self, s):
        #print s
        try:
            return (math.exp(s)-math.exp(-s))/(math.exp(s)+math.exp(-s))
        except OverflowError:
            return 0.

num_of_neurons = [(len(testing[0])-1), 13, 1]
network = Neural_Net(num_of_neurons, 2)
e = 500
train = numpy.zeros(shape=(e))
test = numpy.zeros(shape=(e))
for epoch in range(e):
    training_error = 0
    test_error = 0
    #compute the test errors
    for j in range(testing_rows):
        test_error = test_error + math.pow(network.forward_pass(testing[j]) - testing[j][testing_columns-1], 2)
    #compute the training errors, SEQUENTIALLY. In other words, we perform backpropagation
    #for *every* example instead of all at once.
    for i in range(training_rows):
        training_error = training_error + math.pow(network.forward_pass(training[i]) - training[i][training_columns-1], 2)
        network.tune_weights(training[i][training_columns-1])
    training_error = training_error/training_rows
    test_error = test_error/testing_rows
    train[epoch] = training_error
    test[epoch] = test_error

nn_results = []
for j in range(testing_rows):
    nn_results.append(network.forward_pass(testing[j]))

plt.plot(numpy.arange(e), test, lw=2, label='test')
plt.plot(numpy.arange(e), train, lw=2, label='train')
plt.legend(loc=0, frameon=False)
plt.xlabel('Epoch')
plt.ylabel('MSE')
remove_border()
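# The network's output neuron goes through the soft threshold g(s), so each entry of
# nn_results lies in (-1, 1) and is later thresholded at 0 to call an up or down week.
# Side note (not from the original): g(s) is just the hyperbolic tangent written out
# explicitly, so math.tanh gives the same value without the overflow the try/except guards:
for s in (-3., -0.5, 0., 0.5, 3.):
    assert abs(math.tanh(s) - network.g(s)) < 1e-12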
def results(df, graph, nn_results, sp, title="prediction"):
    temp = df.copy()
    for i in range(len(temp) - 1):
        temp[graph][i] = temp[graph][i+1]
    temp = temp[-len(nn_results):]
    mi = temp[graph].min() - .005
    ma = temp[graph].max() + .005
    ci = 0.
    cd = 0.
    ti = 0.
    td = 0.
    pi = 0.
    pde = 0.
    for i in range(len(nn_results) - 1):
        t = temp[graph][i]
        x1 = temp.index[i]
        x2 = temp.index[i+1]
        y1 = temp[graph][i]
        y2 = temp[graph][i+1]
        if (t > 0 and nn_results[i] > sp):
            plt.fill_between([x1, x2], [ma, ma], color=brewer_rg[1], label=('accurate prediction'))
            ci += 1
            ti += 1
            pi += 1
        elif (t < 0 and nn_results[i] <= sp):
            plt.fill_between([x1, x2], [mi, mi], color=brewer_rg[1], label=('accurate prediction'))
            cd += 1
            td += 1
            pde += 1
        elif (t > 0 and nn_results[i] <= sp):
            plt.fill_between([x1, x2], [mi, mi], color=brewer_rg[0], label=('inaccurate prediction'))
            ti += 1
            pde += 1
        elif (t < 0 and nn_results[i] > sp):
            plt.fill_between([x1, x2], [ma, ma], color=brewer_rg[0], label=('inaccurate prediction'))
            td += 1
            pi += 1
        plt.fill_between([x1, x2], [y1, y2], color='k', label='s&p 500 pdiff')
    plt.ylim(mi, ma)
    plt.xticks(rotation='vertical')
    plt.xlabel('date')
    plt.ylabel('s&p 500 pdiff')
    plt.title(title)
    remove_border()
    return ci, cd, ti, td, pi, pde

data_array = dataframe_to_array(datalist, top_regressions.keys(),
                                ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'])
training = data_array[0:-100]
testing = data_array[-100:]
train_in = training[:, :-1]
train_out = training[:, -1:]
train_out = train_out.reshape(train_out.shape[0])
test_in = testing[:, :-1]
test_out = testing[:, -1:]
test_out = test_out.reshape(test_out.shape[0])
clf = MultinomialNB().fit(train_in, train_out)
k = clf.score(test_in, test_out)
predictions = clf.predict(test_in)
cinb, cdnb, tinb, tdnb, _, _ = results(datalist, 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff',
                                       predictions, .5, title="accuracy of NB predictions")
print 'average accuracy: %.2f%%' % ((cinb+cdnb)/(tinb+tdnb)*100)
print 'accuracy on decrease: %.2f%%' % (cdnb/tdnb*100)
print 'accuracy on increase: %.2f%%' % (cinb/tinb*100)

ci, cd, ti, td, _, _ = results(dataset, 's_and_p_500_pdiff', nn_results, 0., title='accuracy of NN predictions')
print 'average accuracy: %.2f%%' % ((ci+cd)/(ti+td)*100)
print 'accuracy on decrease: %.2f%%' % (cd/td*100)
print 'accuracy on increase: %.2f%%' % (ci/ti*100)

def score(df, split, nn_results):
    t = df[-split:].copy()
    t['out'] = float('nan')
    for i in range(split):
        if nn_results[i] < 0:
            t['out'][i] = -1
        else:
            t['out'][i] = 1
    return t

dataset_out = score(dataset, len(testing), nn_results)

def buy_pass(df_score, nn_results):
    my_list = []
    market_list = []
    start = 100000000
    my_money = start
    market_money = start
    for i in range(1, (len(nn_results)-1)):
        market_money = market_money * (1 + df_score.s_and_p_500_pdiff[i+1])
        market_list.append(market_money)
        if (nn_results[i] > 0):
            my_money = my_money * (1 + df_score.s_and_p_500_pdiff[i+1])
        my_list.append(my_money)
    my_list = [(i - start)/start for i in my_list]
    market_list = [(i - start)/start for i in market_list]
    x = df_score.index[1:-1]
    return my_list, market_list, x

my_list, market_list, N = buy_pass(dataset_out, nn_results)
my_list = [i*100 for i in my_list]
market_list = [i*100 for i in market_list]

def buy_pass_plot(list_data, list_lab, N, x, y, title):
    mi = np.inf
    ma = -np.inf
    if len(list_data) != len(list_lab):
        raise Exception('length mismatch')
    for i in range(len(list_data)):
        plt.plot(N, list_data[i], label=list_lab[i])
        if min(list_data[i]) < mi:
            mi = min(list_data[i])
        if max(list_data[i]) > ma:
            ma = max(list_data[i])
    plt.axhline(y=.05, linewidth=1.5, color='k')
    plt.xticks(rotation='vertical')
    plt.xlabel(x)
    plt.ylabel(y)
    plt.title(title)
    plt.legend(loc=0, frameon=False)
    if 0 < mi:
        mi = 0
    plt.ylim(mi, ma)
    remove_border(bottom=False)
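# Toy illustration (made-up numbers, not from the original) of the compounding buy_pass
# performs above: the strategy only participates in weeks the network predicted an
# increase, while the market line compounds every week.
toy_pdiffs = [0.02, -0.01, 0.03]   # hypothetical weekly percent changes
toy_preds = [1., -1., 1.]          # hypothetical network outputs (only the sign matters)
toy_strategy = toy_market = 100.
for d, p in zip(toy_pdiffs, toy_preds):
    toy_market *= (1 + d)
    if p > 0:
        toy_strategy *= (1 + d)
print 'strategy: %.2f, market: %.2f' % (toy_strategy, toy_market)  # 105.06 vs 104.01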
buy_pass_plot([my_list, market_list], ['strategy', 'no strategy'], N, 'date', '% change',
              'basic trading strategy based on nn prediction')
print 'our model made %.2f%% while the market increased by %.2f%% over a period of %d weeks' % (
    my_list[-1], market_list[-1], len(my_list))

my_list_pdiff = []
for i in range(len(my_list)):
    my_list_pdiff.append(my_list[i] - market_list[i])

buy_pass_plot([my_list_pdiff], ['%'], N, 'date', '% better/worse', '% better or worse than market')
print 'our model performed %.2f%% better than the market' % (my_list[-1] - market_list[-1])