Notebook

Artificial Neural Networks and Stock Price Predictions¶

Jared Rosen, Jake Silberg, Kevin Sun¶

Table of Contents¶

Introduction and Goal¶

Background
Assumptions
Setup

Data¶

Selection
Gathering and Processing
Exploration

Model¶

Building the Model
Validating the Model
Cross Validation

Results¶

Preformance

Introduction and Goal####¶

In [1]:

# !pip install Quandl
# !pip install PyBrain

In [90]:

%matplotlib inline

import numpy as np
import numpy
import pandas as pd
import math
import scipy
import random
import Quandl
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import string
import calendar
import datetime
from pandas.tseries.offsets import *
import operator
from itertools import combinations, permutations
from matplotlib import rcParams
import scipy.stats as stats
import pybrain
from pybrain.datasets import ClassificationDataSet
from pybrain.utilities import percentError
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SoftmaxLayer
from pybrain.structure import TanhLayer
from pybrain.structure import SigmoidLayer
from sklearn.cross_validation import train_test_split 
from sklearn.naive_bayes import MultinomialNB
import sklearn

In [178]:

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

#http://colorbrewer2.org/
brewer_rg = ['#E41A1C', '#4DAF4A']

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(ax=None, axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    if ax == None:
        ax = axes or plt.gca()
    
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #remove grid
    ax.grid(False)
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
        
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

# Quandl API key
authtoken='Cx1CtXeu61zjTzpehmNV'

Data Gathering###¶

Selection¶

In [48]:

'''
dict of of all the Quandl datasets we will use
the key will be used as the column name for convenience
'''
data = {'gold_price':'BUNDESBANK/BBK01_WT5511.1',
        'usd_to_pound':'QUANDL/USDGBP.1',
        'cpi':'FRED/CPIAUCSL.1',
        'unemployment':'FRED/UNRATE.1',
        'gas_price':'BTS_MM/RETAILGAS.1',
        'volatility':'YAHOO/INDEX_VIX.6',
        'house_sales':'FRED/HSN1F.1',
        'usd_to_euro':'QUANDL/USDEUR.1',
        'corporate_profits':'FRED/CP.1',
        'Dax':'YAHOO/INDEX_GDAXI.1',
        'heating_oil':'OFDP/FUTURE_HO1.1',
        'dow_jones':'BCB/UDJIAD1.1',
        'consumer_credit':'FRED/TOTALSL.1',
        'bank_interest_rate':'FRED/TERMCBCCALLNS.1',
        '10_year_treasury':'FRED/DGS10.1',
        'population':'FRED/POP.1',
        '3_month_treasury':'FRED/DTB3.1',
        'corn':'OFDP/FUTURE_C1.4',
        'infants_toddlers':'FRED/CUSR0000SEAF.1',
        'retail':'FRED/RSAFS.1',
        's_and_p_500':'YAHOO/INDEX_GSPC.6'}

Gathering and Processing¶

In [49]:

'''
input:     dict of Quandl codes

function:  downloads Quandl data
           set column name for each as its repective key from the dict
           
return:    list of dataframes
'''
def get_quandl_data(quandl_codes):
    list_of_data = []
    for number, code in enumerate(quandl_codes.values()):
        data = Quandl.get([code], authtoken=authtoken)
        data.columns = [quandl_codes.keys()[number]]
        list_of_data.append(data)
    return list_of_data

In [50]:

list_of_data = get_quandl_data(data)

Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'OFDP.FUTURE_HO1.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.CPIAUCSL.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'QUANDL.USDEUR.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.RSAFS.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'BTS_MM.RETAILGAS.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.POP.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'BUNDESBANK.BBK01_WT5511.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'QUANDL.USDGBP.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.CP.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.DTB3.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'OFDP.FUTURE_C1.4']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.UNRATE.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.TOTALSL.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.HSN1F.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.CUSR0000SEAF.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'BCB.UDJIAD1.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.DGS10.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'YAHOO.INDEX_GDAXI.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'YAHOO.INDEX_GSPC.6']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'FRED.TERMCBCCALLNS.1']
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use.
Returning Dataframe for  [u'YAHOO.INDEX_VIX.6']

In [51]:

'''
input:     dataframe of Quandl data

function:  and adds rows for every business day during the timeframe of that dataset
           performs linear interpolation to fill in all missing values
           then drops all values except for Mondays
           
return:    cleaned dataframe of Quandle data
'''
def collapser(df):
    df = df.asfreq(BDay())
    df = df.apply(pd.Series.interpolate)
    df = df.asfreq(Week(weekday=1))
    return df

In [52]:

'''
input:     tlist of dataframes of Quandle data

function:  runs collapser on each dataframe
           finds the latest start date and earliest end date accross ass dataframes and truncates at those date
           concatonats all dataset into single dataframe

return:    dataframe of Quandl data of uniform length with no missing values
           start and end date of dataframe
'''
def merger(list_of_data):
    begin_date=datetime.datetime(1900,1,1,12,13,14)
    end_date=datetime.datetime(2100,1,1,12,13,14)
    for i in range(len(list_of_data)):
        list_of_data[i] = collapser(list_of_data[i])
        if (list_of_data[i].index[1]>begin_date):
            begin_date=list_of_data[i].index[0]
        if (list_of_data[i].index[-1]<end_date):
            end_date=list_of_data[i].index[-1]
    for i in range(len(list_of_data)):
        list_of_data[i] = list_of_data[i].truncate(before=begin_date, after=end_date)
    data=pd.concat(list_of_data, axis=1)
    return data, (begin_date, end_date)

In [53]:

dataset, dates = merger(list_of_data)

In [54]:

'''
input:     dataframe
           array x number of column names that need percent difference calculations
           
function:  calulates the percent difference between each week of data for each of the specificed columns
           new columns are named by appending '_pdiff'
           first date is dropped becuase there is no percentage difference information on that date.
           
return:    dataframe with x new columns 
'''
def percentageDifference(df, columns):
    diff_columns=[]
    df_columns=list(df.columns)
    for column in columns:
        if column in df_columns:
            diff_columns.append(column)
            df[column+"_pdiff"]= float('nan')
        else:
            continue
    for i in range(1, len(df)):
        for ncolumn in diff_columns:
            df[ncolumn+"_pdiff"].ix[i] = (df[ncolumn].ix[i] - df[ncolumn].ix[i-1]) / df[ncolumn].ix[i-1]
    return df[1:]

In [55]:

dataset = percentageDifference(dataset, data.keys())

In [56]:

'''
input:     dataframe
           an array of column names
           and a array of bins
           
funtion:   places each piece of data into a bin (0 = not in bin; 1 = in bin)
           new column are named with bin range
           
return:    datatframe with indicators of if the element is in the bin  
           array of new column names
'''
def biner(df, cols, bins):
    add_cols = []
    for col in cols:
        for i in range(len(bins)-1):
            title = '%s_b_%.2f-%.2f' % (col, bins[i], bins[i+1])
            df[title] = 0
            df[title][(df[col] >= bins[i]) & (df[col] < bins[i+1])] = 1
            add_cols.append(title)
    return df, add_cols

In [57]:

dataset, add_cols = biner(dataset, ['s_and_p_500_pdiff'], [0.000001, 100.])

In [58]:

'''
input:     dataframe
           array of columns names
           
function:  shifts all values for each specificed column forward up by one
           new columns are named with '_adv'
That is, the _adv column contains the value of the NEXT week.
'''
def advance_cols(df, cols):
    for col in cols:
        title = '%s_adv' % col
        df[title] = float('nan')
        for i in range(len(df)-1):
            df[title][i] = df[col][i+1]
    return df[:-1]

In [59]:

dataset = advance_cols(dataset, add_cols)

Exploration¶

In [60]:

inc = dataset['s_and_p_500_pdiff'][dataset['s_and_p_500_pdiff'] > 0]
dec = dataset['s_and_p_500_pdiff'][dataset['s_and_p_500_pdiff'] <= 0]
mean = dataset['s_and_p_500_pdiff'].mean()
std = dataset['s_and_p_500_pdiff'].std()
in_one_std = dataset['s_and_p_500_pdiff'][(dataset['s_and_p_500_pdiff'] >= (mean - std)) & (dataset['s_and_p_500_pdiff'] <= (mean + std))]
out_one_std = dataset['s_and_p_500_pdiff'][(dataset['s_and_p_500_pdiff'] <= (mean - std)) | (dataset['s_and_p_500_pdiff'] >= (mean + std))]

int_inc = pd.concat([inc, out_one_std], axis=1, join='inner').ix[:,0]
int_dec = pd.concat([dec, out_one_std], axis=1, join='inner').ix[:,0]


extreme = dataset.sort_index(by=['s_and_p_500_pdiff'], ascending=False).s_and_p_500_pdiff
print_num = 5
extreme = pd.concat([extreme[:print_num], extreme[-print_num:]])



print 'the s&p 500 increased %d weeks (%.2f%%) and decreased %d weeks (%.2f%%) between %s and %s' % (len(inc), (float(len(inc))/len(dataset))*100, len(dec), (float(len(dec))/len(dataset))*100, dates[0].strftime('%B %d, %Y'), dates[1].strftime('%B %d, %Y')) 
print 'over the whole period the s & p 500 incresaed by %.2f%%' % ((dataset['s_and_p_500'][-1] - dataset['s_and_p_500'][0])/dataset['s_and_p_500'][0]*100)
print 'the average weekly increase was %.2f%% with a standard deviation of %.2f%%' % ((inc.mean() * 100), inc.std()*100)
print 'the average weekly decrease was %.2f%% with a standard deviation of %.2f%%' % (abs(dec.mean() * 100), dec.std()*100)
print 'the average weekly change over the whole period was %.2f%% with a standard deviation of %.2f%%' % (mean*100, std*100)
print 'the weekly change was outside 1 standard deviation of the mean %d times' % len(out_one_std)
print 'of those %d of them were increses and had an average change of %.2f%% and a standard deviation of %.2f%%' % (len(int_inc), int_inc.mean()*100, int_inc.std()*100)
print 'and %d of them were decreases and had an average change of %.2f%% and a standard deviation of %.2f%%' % (len(int_dec), int_dec.mean()*100, int_dec.std()*100)
print
print 'the %d most extreme increses/decreces were' % print_num
for i in range(len(extreme)):
    print '-' * 25
    print extreme.index[i].strftime('%B %d, %Y')
    print '%.2f%%' % (extreme[i]*100)

the s&p 500 increased 370 weeks (54.25%) and decreased 312 weeks (45.75%) between September 07, 1999 and October 09, 2012
over the whole period the s & p 500 incresaed by 8.19%
the average weekly increase was 1.80% with a standard deviation of 1.67%
the average weekly decrease was 2.03% with a standard deviation of 1.91%
the average weekly change over the whole period was 0.04% with a standard deviation of 2.61%
the weekly change was outside 1 standard deviation of the mean 166 times
of those 77 of them were increses and had an average change of 4.30% and a standard deviation of 1.88%
and 89 of them were decreases and had an average change of -4.30% and a standard deviation of 2.02%

the 5 most extreme increses/decreces were
-------------------------
July 30, 2002
13.17%
-------------------------
October 15, 2002
10.36%
-------------------------
March 21, 2000
9.91%
-------------------------
March 18, 2003
8.21%
-------------------------
March 17, 2009
8.13%
-------------------------
January 20, 2009
-7.64%
-------------------------
March 03, 2009
-9.93%
-------------------------
November 11, 2008
-10.62%
-------------------------
July 23, 2002
-11.46%
-------------------------
October 07, 2008
-14.59%

In [179]:

plt.figure()
ax = dataset['s_and_p_500_pdiff'].hist(bins=np.sqrt(len(dataset)))
plt.xlabel('weekly percent change of the s&p 500')
plt.ylabel('frequency')
plt.title('weekly percent change of the s&p 500')
plt.vlines((mean+std), 0, 100, label=('+/-1 stadard deviation'))
plt.vlines((mean-std), 0, 100)
plt.legend(frameon=False)
remove_border(ax)

In [62]:

'''
input: column name

function: removes '_' from column name

return: clean column name
'''
def clean_column_name(col):
    return ' '.join(col.split('_'))

In [63]:

'''
inpit:     dataframe
           a list of columns (y axis)
           target column (x axis)

function:  plots each column of the dataframe agaist the target column
           overplots a liner fit

return:   a sorted list column names by correlation coefficient
          dict of the slopes of the liner regression line
'''
def scatter_plot(df, cols, target):
    xaxis = target
    yaxis = cols
    ff = 5 * (len(yaxis)-1)
    fig, axes=plt.subplots(nrows=len(yaxis), figsize=(5,ff))
    Rs = []
    slope = {}
    for i, y in enumerate(yaxis):
        axes[i].scatter(df[xaxis], df[y])
        m, b, r, p, se = scipy.stats.linregress(df[xaxis], df[y])
        lab = 'r^2 = %.2f \np = %.2f' % (r*r, p)
        axes[i].plot(df[xaxis], np.array(df[xaxis]) * m + b, label=lab)
        axes[i].set_xlabel(clean_column_name(xaxis))
        axes[i].set_ylabel(clean_column_name(y))
        axes[i].legend(frameon=False)
#         axes[i].set_title(lab)
        remove_border(axes[i]) 
        Rs.append((y, (r*r)))
        slope[y] = m
    fig.tight_layout()
    return sorted(Rs, key=operator.itemgetter(1), reverse=True), slope
    
        

r, slope = scatter_plot(dataset, data.keys(), 's_and_p_500')

In [64]:

'''
input:     dataset

function:  checks for colinerity between all variables
           excults percent difference, bined, and advanced columns by default

return:    sorted list of tuples of column names and their corolation coeficent 
'''
def colin(df, extra=False):
    cols = list(df.columns)
    if extra == False:
        cols = [c for c in list(df.columns) if c[-6:] != '_pdiff' or  '_bins' or '_adv']
    cols = combinations(cols, 2)
    Rs = []
    for col in cols:
        m, b, r, p, se = scipy.stats.linregress(df[col[0]], df[col[1]])
        Rs.append((col, (r*r)))
    return sorted(Rs, key=operator.itemgetter(1), reverse=True)

print_num = 5
print 'top %d most collinear' % print_num
for i in colin(dataset)[:print_num]:
    print '-' * 25
    print clean_column_name(i[0][0])
    print clean_column_name(i[0][1])
    print i[1]

top 5 most collinear
-------------------------
cpi
population
0.988564121493
-------------------------
heating oil
gas price
0.954903320034
-------------------------
cpi
consumer credit
0.925192316073
-------------------------
population
consumer credit
0.922901527615
-------------------------
dow jones pdiff
s and p 500 pdiff
0.919088304642

In [180]:

'''
input:     dataframe
           column name
           optional begin and end date

funtion:   creats a line graph of a specific coulumn name
           if data increses week to week fill green
           if data devreases week to week fill red

return: none
'''
def filled_plt(df, col, begin_date=None, end_date=None):
    if begin_date == None:
        begin_date = df.index[0]
        
    if end_date == None:
        end_date = df.index[-1]

    temp = df.truncate(before=begin_date, after=end_date)
    for i in range(len(temp)-1):
        x = temp.index[i:i+2]
        y = temp[col][i:i+2]
        c = brewer_rg[1]
        if y[0] >= y[1]:
            c = brewer_rg[0]
        plt.fill_between(x, y, color=c)
    ymin = min([0, temp[col].min()])
    plt.ylim(ymin, temp[col].max())
    plt.xticks(rotation='vertical')
    plt.xlabel('date')
    plt.ylabel('value')
    plt.title('%s between %s and %s' % (clean_column_name(col), begin_date.strftime('%B %d, %Y'), end_date.strftime('%B %d, %Y')))
    remove_border()

filled_plt(dataset, 's_and_p_500', begin_date=datetime.datetime(2007,6,7,0,0,0), end_date=datetime.datetime(2010,1,1,0,0,0)) 

In [181]:

'''
input:     dataframe
           column name to compare to (x axis)
           list of column names to check agrement with
           dict of slopes of regression lines for each dataset agisnst the target
           optional begin and end date

function:  creats a line graph of specific column name
           if target data is in agreement with all specified other datasets fill green
               agreement is defined by the slope of the regression line 
               (+ the two datapoints change in the same direction WoW
               - the two datapoints chage in opposite directions WoW)

return: the number of times all specifed datasets were in agreement with the target dataset
        total number of weeks that were checked
'''    
def agreement_plt(df, target, var, dct, begin_date=None, end_date=None):
    if begin_date == None:
        begin_date = df.index[0]
    if end_date == None:
        end_date = df.index[-1]
    temp = df.truncate(before=begin_date, after=end_date)
    agree = 0.
    total = 0.
    title = ''
    for v in var:
        title += ' %s,' % clean_column_name(v)
    for i in range(len(temp)-1):
        x = temp.index[i:i+2]
        y = temp[target][i:i+2]
        c = dark2_colors[0]
        t = []
        for v in var:
            if (dct[v] > 0):
                if ((y[0] >= y[1]) & (temp[v][i] >= temp[v][i+1])) | ((y[0] <= y[1]) & (temp[v][i] <= temp[v][i+1])):
                    t.append(0)
                else:
                    t.append(1)
            else:
                if ((y[0] >= y[1]) & (temp[v][i] <= temp[v][i+1])) | ((y[0] <= y[1]) & (temp[v][i] >= temp[v][i+1])):
                    t.append(0)
                else:
                    t.append(1)
        if(sum(t) == 0):
            plt.fill_between(x, y, color=c)
            agree += 1
            total += 1
        else:
            plt.plot(x, y, color=dark2_colors[1])
            total += 1
    ymin = min([0, temp[target].min()])
    plt.ylim(ymin, temp[target].max())
    plt.xticks(rotation='vertical')
    plt.xlabel('date')
    plt.ylabel('value')
    plt.title('trends of %s and %s between %s and %s' % (title, clean_column_name(target), begin_date.strftime('%B %d, %Y'), end_date.strftime('%B %d, %Y')))
    remove_border()
    return (agree, total)

agree = agreement_plt(dataset, 's_and_p_500', ['volatility', 'gold_price'], slope, begin_date=datetime.datetime(2007,6,7,0,0,0), end_date=datetime.datetime(2010,1,1,0,0,0)) 
print 'they are in agreement %.2f%% of the time' % (agree[0]/agree[1]*100)

they are in agreement 45.86% of the time

Model¶

Building the Model¶

In [67]:

def dataframe_to_array(df, input_cols, output_cols):
    return df[input_cols+output_cols].values

In [109]:

datakeys = [x+"_pdiff" for x in data.keys()]

data_array = dataframe_to_array(dataset, datakeys, ['s_and_p_500_pdiff_b_0.00-100.00_adv'])

training = data_array[0:-100]
testing = data_array[-100:]

# training, testing = sklearn.cross_validation.train_test_split(data_array)
split = len(dataset) - len(testing)

training_rows = len(training)
training_columns = len(training[0])

testing_rows = len(testing)
testing_columns = len(testing[0])

In [234]:

#Neural Network Class 
class Neural_Net:
    #constructor initializes a new neural network with randomly selected weights and pre-specified height, and number of neurons per layer
    def __init__(self,non,height):
        #list to store the number of neurons in each layer of the network
        self.num_of_neurons = non
        #height of the network
        self.L = height
        #list to store number of weights in each layer of the network, indexed by layer, output neuron, input neuron
        self.weights = numpy.zeros(shape=((non[0]+1),(non[0]+1),(non[0]+1)))
        #delta_matrix: stores the gradient that is used in backpropagation
        self.deltas = numpy.zeros(shape=((non[0]+1),(non[0]+1)))
        #matrix that stores thresholded signals
        self.signals = numpy.zeros(shape=((non[0]+1),(non[0]+1)))
        #(tunable) learning_rate used in backpropagation
        self.learning_rate = .2
        #initialize weights to be between -2 and 2
        for i in range(1,self.L+1):
            for j in range(1,self.num_of_neurons[i]+1):
                for k in range(self.num_of_neurons[i-1]+1):
                    self.weights[i][j][k] = (random.random()-.5)*3 #changed to adjust        
                                            
    #forward_pass computes the output of the neural network given an input
    def forward_pass(self,x):
        #(for convenience, we index neurons starting at 1 instead of zero)
        self.signals[0][0] = -1
        for i in range(1,self.num_of_neurons[0]+1):
            self.signals[0][i] = x[i-1]
        for i in range(1,self.L+1):
            self.signals[i][0] = -1
            for j in range(1,self.num_of_neurons[i]+1):
                self.signals[i][j] = self.compute_signal(i,j)
        return self.signals[self.L][1]
                    
    #tune_weights performs the backpropagation algorithm given a training example as input
    def tune_weights(self,y):
        self.deltas[self.L][1] = 2*(self.signals[self.L][1]-y)*(1-math.pow(self.signals[self.L][1],2))
        for i in range(self.L-1,0,-1):
            for j in range(1,self.num_of_neurons[i]+1):
                self.deltas[i][j] = self.compute_delta(i,j)
        for i in range(1,self.L+1):
            for j in range(1,self.num_of_neurons[i]+1):
                for k in range(self.num_of_neurons[i-1]+1):
                    self.weights[i][j][k] = self.weights[i][j][k]-self.learning_rate*self.signals[i-1][k]*self.deltas[i][j]
    
    #compute_signal: computes the delta for a given neuron at a given level
    def compute_signal(self,level,neuron):
        s = 0
        for i in range(self.num_of_neurons[level-1]+1):
            s += self.weights[level][neuron][i]*self.signals[level-1][i]
        return self.g(s)
    
    #compute_delta: computes the signal s for a given neuron at a given level
    def compute_delta(self,level,neuron):
        s = 0
        for j in range(1,self.num_of_neurons[level+1]+1):
            s += self.weights[level+1][j][neuron]*self.deltas[level+1][j]
        return (1-math.pow(self.signals[level][neuron],2))*s
    
    #soft threshold function
    def g(self,s):
        #print s
        return (math.exp(s)-math.exp(-s))/(math.exp(s)+math.exp(-s))

In [235]:

num_of_neurons = [(len(testing[0])-1),7,1]
network = Neural_Net(num_of_neurons,2)
training_error = 0
test_error = 0
train = numpy.zeros(shape = (1000))
test = numpy.zeros(shape = (1000))

In [236]:

for epoch in range(200):
    training_error = 0
    test_error = 0
    for j in range(testing_rows):
        test_error = test_error+math.pow(network.forward_pass(testing[j]) - testing[j][testing_columns-1], 2)
    #compute the test errors
    #compute the training errors, SEQUENTIALLY. In other words, we perform backpropagation for *every* example
    #instead of all at once. 
    for i in range(training_rows):
        training_error = training_error+math.pow(network.forward_pass(training[i])- training[i][training_columns-1], 2)
    network.tune_weights(training[i][training_columns-1])
    training_error = training_error/training_rows
    test_error = test_error/testing_rows
    train[epoch] = training_error
    test[epoch]  = test_error
    

In [237]:

nn_results = []
for j in range(testing_rows):
    nn_results.append(network.forward_pass(testing[j]))
    

In [238]:

plt.plot(numpy.arange(1000), test, lw=2, label = 'test')
plt.plot(numpy.arange(1000), train, lw=2, label = 'train')
plt.legend(loc=0, frameon=False)
plt.xlabel('Epoch')
plt.ylabel('MSE')
remove_border()

Results¶

In [276]:

def score(df, split, nn_results):
    t = df[-(split):]
    t['out'] = float('nan')
    for i in range(split):
        if nn_results[i] < .5:
            t['out'][i] = 0
        else:
            t['out'][i] = 1
    return t

dataset_score = score(dataset, len(testing), nn_results) 
print 'Kevin / Lab 10'
print len(dataset_score[-(split):][dataset_score['s_and_p_500_pdiff_b_0.00-100.00_adv'] == dataset_score['out']]) / float(len(dataset_score)) * 100
dataset_score[['out', 's_and_p_500_pdiff_b_0.00-100.00_adv']][dataset_score['out'] == 0]
# print len(dataset_score[['out', 's_and_p_500_pdiff_b_0.00-100.00_adv']][dataset_score['s_and_p_500_pdiff_b_0.00-100.00_adv'] == 0])
# print len(dataset_score)

Kevin / Lab 10
64.0

Out[276]:

	out	s_and_p_500_pdiff_b_0.00-100.00_adv
2010-11-09	0	0

In [279]:

def results(df, target, nn_results):
    
    temp = df
    for i in range(1,len(df)):
        temp[nn_results][i-1] = temp[nn_results][i]
    temp = temp[:-1]
    
    mi = temp[target].min()-.01
    ma = temp[target].max()+.01
    temp[nn_results][temp[nn_results] == 1] = ma
    temp[nn_results][temp[nn_results] == 0] = mi
    ap = 0
    tp = 0
    for i in range(len(temp)-1):
        t = temp['s_and_p_500'].ix[i] - temp['s_and_p_500'].ix[i+1]
        x = temp.index[i:i+2]
        y1 = temp[target][i:i+2]
        y2 = temp[nn_results][i]
        plt.vlines(x[0], mi, ma, label=('week'), linewidth=.75, color='w')
        if(t > 0 and temp[nn_results][i] < 0) or (t < 0 and temp[nn_results][i] > 0):
            plt.fill_between(x, [y2,y2], color=brewer_rg[1], label=('accurate prediction'))
            ap += 1
            tp += 1
        else:
            plt.fill_between(x, [y2,y2], color=brewer_rg[0], label=('inacurate prediction'))
            tp += 1
        plt.fill_between(x, y1, color='k', label='s&p 500 pdiff')
    
    plt.ylim(mi, ma)
    plt.xticks(rotation='vertical')
    plt.xlabel('date')
    plt.ylabel('s&p 500 pdiff')
    plt.title('accuracey of nn prediction')
#     plt.legend(frameon=False)
    remove_border()
    return (ap, tp)
        
ap, tp = results(dataset_score, 's_and_p_500_pdiff', 'out')
print 'total predictions: %d' % tp
print 'accurate predictions: %d (%.2f%%)' % (ap, (ap/float(tp))*100)
print 'inacurate predictions: %d (%.2f%%)' % (tp-ap, ((tp-ap)/float(tp))*100)

total predictions: 98
accurate predictions: 62 (63.27%)
inacurate predictions: 36 (36.73%)

In [ ]: