# !pip install Quandl
# !pip install PyBrain
%matplotlib inline
import numpy as np
import numpy
import pandas as pd
import math
import scipy
import random
import Quandl
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import string
import calendar
import datetime
from pandas.tseries.offsets import *
import operator
from itertools import combinations, permutations
from matplotlib import rcParams
import scipy.stats as stats
import pybrain
from pybrain.datasets import ClassificationDataSet
from pybrain.utilities import percentError
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SoftmaxLayer
from pybrain.structure import TanhLayer
from pybrain.structure import SigmoidLayer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
import sklearn
#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
(0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
(0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
(0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
(0.4, 0.6509803921568628, 0.11764705882352941),
(0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
(0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]
#http://colorbrewer2.org/
brewer_rg = ['#E41A1C', '#4DAF4A']
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'
def remove_border(ax=None, axes=None, top=False, right=False, left=True, bottom=True):
"""
Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
"""
if ax == None:
ax = axes or plt.gca()
ax.spines['top'].set_visible(top)
ax.spines['right'].set_visible(right)
ax.spines['left'].set_visible(left)
ax.spines['bottom'].set_visible(bottom)
#turn off all ticks
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')
#remove grid
ax.grid(False)
#now re-enable visibles
if top:
ax.xaxis.tick_top()
if bottom:
ax.xaxis.tick_bottom()
if left:
ax.yaxis.tick_left()
if right:
ax.yaxis.tick_right()
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Quandl API key
authtoken='Cx1CtXeu61zjTzpehmNV'
'''
dict of of all the Quandl datasets we will use
the key will be used as the column name for convenience
'''
data = {'gold_price':'BUNDESBANK/BBK01_WT5511.1',
'usd_to_pound':'QUANDL/USDGBP.1',
'cpi':'FRED/CPIAUCSL.1',
'unemployment':'FRED/UNRATE.1',
'gas_price':'BTS_MM/RETAILGAS.1',
'volatility':'YAHOO/INDEX_VIX.6',
'house_sales':'FRED/HSN1F.1',
'usd_to_euro':'QUANDL/USDEUR.1',
'corporate_profits':'FRED/CP.1',
'Dax':'YAHOO/INDEX_GDAXI.1',
'heating_oil':'OFDP/FUTURE_HO1.1',
'dow_jones':'BCB/UDJIAD1.1',
'consumer_credit':'FRED/TOTALSL.1',
'bank_interest_rate':'FRED/TERMCBCCALLNS.1',
'10_year_treasury':'FRED/DGS10.1',
'population':'FRED/POP.1',
'3_month_treasury':'FRED/DTB3.1',
'corn':'OFDP/FUTURE_C1.4',
'infants_toddlers':'FRED/CUSR0000SEAF.1',
'retail':'FRED/RSAFS.1',
's_and_p_500':'YAHOO/INDEX_GSPC.6'}
'''
input: dict of Quandl codes
function: downloads Quandl data
set column name for each as its repective key from the dict
return: list of dataframes
'''
def get_quandl_data(quandl_codes):
list_of_data = []
for number, code in enumerate(quandl_codes.values()):
data = Quandl.get([code], authtoken=authtoken)
data.columns = [quandl_codes.keys()[number]]
list_of_data.append(data)
return list_of_data
list_of_data = get_quandl_data(data)
Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'OFDP.FUTURE_HO1.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.CPIAUCSL.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'QUANDL.USDEUR.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.RSAFS.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'BTS_MM.RETAILGAS.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.POP.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'BUNDESBANK.BBK01_WT5511.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'QUANDL.USDGBP.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.CP.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.DTB3.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'OFDP.FUTURE_C1.4'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.UNRATE.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.TOTALSL.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.HSN1F.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.CUSR0000SEAF.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'BCB.UDJIAD1.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.DGS10.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'YAHOO.INDEX_GDAXI.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'YAHOO.INDEX_GSPC.6'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'FRED.TERMCBCCALLNS.1'] Token Cx1CtXeu61zjTzpehmNV activated and saved for later use. Returning Dataframe for [u'YAHOO.INDEX_VIX.6']
'''
input: dataframe of Quandl data
function: and adds rows for every business day during the timeframe of that dataset
performs linear interpolation to fill in all missing values
then drops all values except for Mondays
return: cleaned dataframe of Quandle data
'''
def collapser(df):
df = df.asfreq(BDay())
df = df.apply(pd.Series.interpolate)
df = df.asfreq(Week(weekday=1))
return df
'''
input: tlist of dataframes of Quandle data
function: runs collapser on each dataframe
finds the latest start date and earliest end date accross ass dataframes and truncates at those date
concatonats all dataset into single dataframe
return: dataframe of Quandl data of uniform length with no missing values
start and end date of dataframe
'''
def merger(list_of_data):
begin_date=datetime.datetime(1900,1,1,12,13,14)
end_date=datetime.datetime(2100,1,1,12,13,14)
for i in range(len(list_of_data)):
list_of_data[i] = collapser(list_of_data[i])
if (list_of_data[i].index[1]>begin_date):
begin_date=list_of_data[i].index[0]
if (list_of_data[i].index[-1]<end_date):
end_date=list_of_data[i].index[-1]
for i in range(len(list_of_data)):
list_of_data[i] = list_of_data[i].truncate(before=begin_date, after=end_date)
data=pd.concat(list_of_data, axis=1)
return data, (begin_date, end_date)
dataset, dates = merger(list_of_data)
'''
input: dataframe
array x number of column names that need percent difference calculations
function: calulates the percent difference between each week of data for each of the specificed columns
new columns are named by appending '_pdiff'
first date is dropped becuase there is no percentage difference information on that date.
return: dataframe with x new columns
'''
def percentageDifference(df, columns):
diff_columns=[]
df_columns=list(df.columns)
for column in columns:
if column in df_columns:
diff_columns.append(column)
df[column+"_pdiff"]= float('nan')
else:
continue
for i in range(1, len(df)):
for ncolumn in diff_columns:
df[ncolumn+"_pdiff"].ix[i] = (df[ncolumn].ix[i] - df[ncolumn].ix[i-1]) / df[ncolumn].ix[i-1]
return df[1:]
dataset = percentageDifference(dataset, data.keys())
'''
input: dataframe
an array of column names
and a array of bins
funtion: places each piece of data into a bin (0 = not in bin; 1 = in bin)
new column are named with bin range
return: datatframe with indicators of if the element is in the bin
array of new column names
'''
def biner(df, cols, bins):
add_cols = []
for col in cols:
for i in range(len(bins)-1):
title = '%s_b_%.2f-%.2f' % (col, bins[i], bins[i+1])
df[title] = 0
df[title][(df[col] >= bins[i]) & (df[col] < bins[i+1])] = 1
add_cols.append(title)
return df, add_cols
dataset, add_cols = biner(dataset, ['s_and_p_500_pdiff'], [0.000001, 100.])
'''
input: dataframe
array of columns names
function: shifts all values for each specificed column forward up by one
new columns are named with '_adv'
That is, the _adv column contains the value of the NEXT week.
'''
def advance_cols(df, cols):
for col in cols:
title = '%s_adv' % col
df[title] = float('nan')
for i in range(len(df)-1):
df[title][i] = df[col][i+1]
return df[:-1]
dataset = advance_cols(dataset, add_cols)
inc = dataset['s_and_p_500_pdiff'][dataset['s_and_p_500_pdiff'] > 0]
dec = dataset['s_and_p_500_pdiff'][dataset['s_and_p_500_pdiff'] <= 0]
mean = dataset['s_and_p_500_pdiff'].mean()
std = dataset['s_and_p_500_pdiff'].std()
in_one_std = dataset['s_and_p_500_pdiff'][(dataset['s_and_p_500_pdiff'] >= (mean - std)) & (dataset['s_and_p_500_pdiff'] <= (mean + std))]
out_one_std = dataset['s_and_p_500_pdiff'][(dataset['s_and_p_500_pdiff'] <= (mean - std)) | (dataset['s_and_p_500_pdiff'] >= (mean + std))]
int_inc = pd.concat([inc, out_one_std], axis=1, join='inner').ix[:,0]
int_dec = pd.concat([dec, out_one_std], axis=1, join='inner').ix[:,0]
extreme = dataset.sort_index(by=['s_and_p_500_pdiff'], ascending=False).s_and_p_500_pdiff
print_num = 5
extreme = pd.concat([extreme[:print_num], extreme[-print_num:]])
print 'the s&p 500 increased %d weeks (%.2f%%) and decreased %d weeks (%.2f%%) between %s and %s' % (len(inc), (float(len(inc))/len(dataset))*100, len(dec), (float(len(dec))/len(dataset))*100, dates[0].strftime('%B %d, %Y'), dates[1].strftime('%B %d, %Y'))
print 'over the whole period the s & p 500 incresaed by %.2f%%' % ((dataset['s_and_p_500'][-1] - dataset['s_and_p_500'][0])/dataset['s_and_p_500'][0]*100)
print 'the average weekly increase was %.2f%% with a standard deviation of %.2f%%' % ((inc.mean() * 100), inc.std()*100)
print 'the average weekly decrease was %.2f%% with a standard deviation of %.2f%%' % (abs(dec.mean() * 100), dec.std()*100)
print 'the average weekly change over the whole period was %.2f%% with a standard deviation of %.2f%%' % (mean*100, std*100)
print 'the weekly change was outside 1 standard deviation of the mean %d times' % len(out_one_std)
print 'of those %d of them were increses and had an average change of %.2f%% and a standard deviation of %.2f%%' % (len(int_inc), int_inc.mean()*100, int_inc.std()*100)
print 'and %d of them were decreases and had an average change of %.2f%% and a standard deviation of %.2f%%' % (len(int_dec), int_dec.mean()*100, int_dec.std()*100)
print
print 'the %d most extreme increses/decreces were' % print_num
for i in range(len(extreme)):
print '-' * 25
print extreme.index[i].strftime('%B %d, %Y')
print '%.2f%%' % (extreme[i]*100)
the s&p 500 increased 370 weeks (54.25%) and decreased 312 weeks (45.75%) between September 07, 1999 and October 09, 2012 over the whole period the s & p 500 incresaed by 8.19% the average weekly increase was 1.80% with a standard deviation of 1.67% the average weekly decrease was 2.03% with a standard deviation of 1.91% the average weekly change over the whole period was 0.04% with a standard deviation of 2.61% the weekly change was outside 1 standard deviation of the mean 166 times of those 77 of them were increses and had an average change of 4.30% and a standard deviation of 1.88% and 89 of them were decreases and had an average change of -4.30% and a standard deviation of 2.02% the 5 most extreme increses/decreces were ------------------------- July 30, 2002 13.17% ------------------------- October 15, 2002 10.36% ------------------------- March 21, 2000 9.91% ------------------------- March 18, 2003 8.21% ------------------------- March 17, 2009 8.13% ------------------------- January 20, 2009 -7.64% ------------------------- March 03, 2009 -9.93% ------------------------- November 11, 2008 -10.62% ------------------------- July 23, 2002 -11.46% ------------------------- October 07, 2008 -14.59%
plt.figure()
ax = dataset['s_and_p_500_pdiff'].hist(bins=np.sqrt(len(dataset)))
plt.xlabel('weekly percent change of the s&p 500')
plt.ylabel('frequency')
plt.title('weekly percent change of the s&p 500')
plt.vlines((mean+std), 0, 100, label=('+/-1 stadard deviation'))
plt.vlines((mean-std), 0, 100)
plt.legend(frameon=False)
remove_border(ax)
'''
input: column name
function: removes '_' from column name
return: clean column name
'''
def clean_column_name(col):
return ' '.join(col.split('_'))
'''
inpit: dataframe
a list of columns (y axis)
target column (x axis)
function: plots each column of the dataframe agaist the target column
overplots a liner fit
return: a sorted list column names by correlation coefficient
dict of the slopes of the liner regression line
'''
def scatter_plot(df, cols, target):
xaxis = target
yaxis = cols
ff = 5 * (len(yaxis)-1)
fig, axes=plt.subplots(nrows=len(yaxis), figsize=(5,ff))
Rs = []
slope = {}
for i, y in enumerate(yaxis):
axes[i].scatter(df[xaxis], df[y])
m, b, r, p, se = scipy.stats.linregress(df[xaxis], df[y])
lab = 'r^2 = %.2f \np = %.2f' % (r*r, p)
axes[i].plot(df[xaxis], np.array(df[xaxis]) * m + b, label=lab)
axes[i].set_xlabel(clean_column_name(xaxis))
axes[i].set_ylabel(clean_column_name(y))
axes[i].legend(frameon=False)
# axes[i].set_title(lab)
remove_border(axes[i])
Rs.append((y, (r*r)))
slope[y] = m
fig.tight_layout()
return sorted(Rs, key=operator.itemgetter(1), reverse=True), slope
r, slope = scatter_plot(dataset, data.keys(), 's_and_p_500')
'''
input: dataset
function: checks for colinerity between all variables
excults percent difference, bined, and advanced columns by default
return: sorted list of tuples of column names and their corolation coeficent
'''
def colin(df, extra=False):
cols = list(df.columns)
if extra == False:
cols = [c for c in list(df.columns) if c[-6:] != '_pdiff' or '_bins' or '_adv']
cols = combinations(cols, 2)
Rs = []
for col in cols:
m, b, r, p, se = scipy.stats.linregress(df[col[0]], df[col[1]])
Rs.append((col, (r*r)))
return sorted(Rs, key=operator.itemgetter(1), reverse=True)
print_num = 5
print 'top %d most collinear' % print_num
for i in colin(dataset)[:print_num]:
print '-' * 25
print clean_column_name(i[0][0])
print clean_column_name(i[0][1])
print i[1]
top 5 most collinear ------------------------- cpi population 0.988564121493 ------------------------- heating oil gas price 0.954903320034 ------------------------- cpi consumer credit 0.925192316073 ------------------------- population consumer credit 0.922901527615 ------------------------- dow jones pdiff s and p 500 pdiff 0.919088304642
'''
input: dataframe
column name
optional begin and end date
funtion: creats a line graph of a specific coulumn name
if data increses week to week fill green
if data devreases week to week fill red
return: none
'''
def filled_plt(df, col, begin_date=None, end_date=None):
if begin_date == None:
begin_date = df.index[0]
if end_date == None:
end_date = df.index[-1]
temp = df.truncate(before=begin_date, after=end_date)
for i in range(len(temp)-1):
x = temp.index[i:i+2]
y = temp[col][i:i+2]
c = brewer_rg[1]
if y[0] >= y[1]:
c = brewer_rg[0]
plt.fill_between(x, y, color=c)
ymin = min([0, temp[col].min()])
plt.ylim(ymin, temp[col].max())
plt.xticks(rotation='vertical')
plt.xlabel('date')
plt.ylabel('value')
plt.title('%s between %s and %s' % (clean_column_name(col), begin_date.strftime('%B %d, %Y'), end_date.strftime('%B %d, %Y')))
remove_border()
filled_plt(dataset, 's_and_p_500', begin_date=datetime.datetime(2007,6,7,0,0,0), end_date=datetime.datetime(2010,1,1,0,0,0))
'''
input: dataframe
column name to compare to (x axis)
list of column names to check agrement with
dict of slopes of regression lines for each dataset agisnst the target
optional begin and end date
function: creats a line graph of specific column name
if target data is in agreement with all specified other datasets fill green
agreement is defined by the slope of the regression line
(+ the two datapoints change in the same direction WoW
- the two datapoints chage in opposite directions WoW)
return: the number of times all specifed datasets were in agreement with the target dataset
total number of weeks that were checked
'''
def agreement_plt(df, target, var, dct, begin_date=None, end_date=None):
if begin_date == None:
begin_date = df.index[0]
if end_date == None:
end_date = df.index[-1]
temp = df.truncate(before=begin_date, after=end_date)
agree = 0.
total = 0.
title = ''
for v in var:
title += ' %s,' % clean_column_name(v)
for i in range(len(temp)-1):
x = temp.index[i:i+2]
y = temp[target][i:i+2]
c = dark2_colors[0]
t = []
for v in var:
if (dct[v] > 0):
if ((y[0] >= y[1]) & (temp[v][i] >= temp[v][i+1])) | ((y[0] <= y[1]) & (temp[v][i] <= temp[v][i+1])):
t.append(0)
else:
t.append(1)
else:
if ((y[0] >= y[1]) & (temp[v][i] <= temp[v][i+1])) | ((y[0] <= y[1]) & (temp[v][i] >= temp[v][i+1])):
t.append(0)
else:
t.append(1)
if(sum(t) == 0):
plt.fill_between(x, y, color=c)
agree += 1
total += 1
else:
plt.plot(x, y, color=dark2_colors[1])
total += 1
ymin = min([0, temp[target].min()])
plt.ylim(ymin, temp[target].max())
plt.xticks(rotation='vertical')
plt.xlabel('date')
plt.ylabel('value')
plt.title('trends of %s and %s between %s and %s' % (title, clean_column_name(target), begin_date.strftime('%B %d, %Y'), end_date.strftime('%B %d, %Y')))
remove_border()
return (agree, total)
agree = agreement_plt(dataset, 's_and_p_500', ['volatility', 'gold_price'], slope, begin_date=datetime.datetime(2007,6,7,0,0,0), end_date=datetime.datetime(2010,1,1,0,0,0))
print 'they are in agreement %.2f%% of the time' % (agree[0]/agree[1]*100)
they are in agreement 45.86% of the time
def dataframe_to_array(df, input_cols, output_cols):
return df[input_cols+output_cols].values
datakeys = [x+"_pdiff" for x in data.keys()]
data_array = dataframe_to_array(dataset, datakeys, ['s_and_p_500_pdiff_b_0.00-100.00_adv'])
training = data_array[0:-100]
testing = data_array[-100:]
# training, testing = sklearn.cross_validation.train_test_split(data_array)
split = len(dataset) - len(testing)
training_rows = len(training)
training_columns = len(training[0])
testing_rows = len(testing)
testing_columns = len(testing[0])
#Neural Network Class
class Neural_Net:
#constructor initializes a new neural network with randomly selected weights and pre-specified height, and number of neurons per layer
def __init__(self,non,height):
#list to store the number of neurons in each layer of the network
self.num_of_neurons = non
#height of the network
self.L = height
#list to store number of weights in each layer of the network, indexed by layer, output neuron, input neuron
self.weights = numpy.zeros(shape=((non[0]+1),(non[0]+1),(non[0]+1)))
#delta_matrix: stores the gradient that is used in backpropagation
self.deltas = numpy.zeros(shape=((non[0]+1),(non[0]+1)))
#matrix that stores thresholded signals
self.signals = numpy.zeros(shape=((non[0]+1),(non[0]+1)))
#(tunable) learning_rate used in backpropagation
self.learning_rate = .2
#initialize weights to be between -2 and 2
for i in range(1,self.L+1):
for j in range(1,self.num_of_neurons[i]+1):
for k in range(self.num_of_neurons[i-1]+1):
self.weights[i][j][k] = (random.random()-.5)*3 #changed to adjust
#forward_pass computes the output of the neural network given an input
def forward_pass(self,x):
#(for convenience, we index neurons starting at 1 instead of zero)
self.signals[0][0] = -1
for i in range(1,self.num_of_neurons[0]+1):
self.signals[0][i] = x[i-1]
for i in range(1,self.L+1):
self.signals[i][0] = -1
for j in range(1,self.num_of_neurons[i]+1):
self.signals[i][j] = self.compute_signal(i,j)
return self.signals[self.L][1]
#tune_weights performs the backpropagation algorithm given a training example as input
def tune_weights(self,y):
self.deltas[self.L][1] = 2*(self.signals[self.L][1]-y)*(1-math.pow(self.signals[self.L][1],2))
for i in range(self.L-1,0,-1):
for j in range(1,self.num_of_neurons[i]+1):
self.deltas[i][j] = self.compute_delta(i,j)
for i in range(1,self.L+1):
for j in range(1,self.num_of_neurons[i]+1):
for k in range(self.num_of_neurons[i-1]+1):
self.weights[i][j][k] = self.weights[i][j][k]-self.learning_rate*self.signals[i-1][k]*self.deltas[i][j]
#compute_signal: computes the delta for a given neuron at a given level
def compute_signal(self,level,neuron):
s = 0
for i in range(self.num_of_neurons[level-1]+1):
s += self.weights[level][neuron][i]*self.signals[level-1][i]
return self.g(s)
#compute_delta: computes the signal s for a given neuron at a given level
def compute_delta(self,level,neuron):
s = 0
for j in range(1,self.num_of_neurons[level+1]+1):
s += self.weights[level+1][j][neuron]*self.deltas[level+1][j]
return (1-math.pow(self.signals[level][neuron],2))*s
#soft threshold function
def g(self,s):
#print s
return (math.exp(s)-math.exp(-s))/(math.exp(s)+math.exp(-s))
num_of_neurons = [(len(testing[0])-1),7,1]
network = Neural_Net(num_of_neurons,2)
training_error = 0
test_error = 0
train = numpy.zeros(shape = (1000))
test = numpy.zeros(shape = (1000))
for epoch in range(200):
training_error = 0
test_error = 0
for j in range(testing_rows):
test_error = test_error+math.pow(network.forward_pass(testing[j]) - testing[j][testing_columns-1], 2)
#compute the test errors
#compute the training errors, SEQUENTIALLY. In other words, we perform backpropagation for *every* example
#instead of all at once.
for i in range(training_rows):
training_error = training_error+math.pow(network.forward_pass(training[i])- training[i][training_columns-1], 2)
network.tune_weights(training[i][training_columns-1])
training_error = training_error/training_rows
test_error = test_error/testing_rows
train[epoch] = training_error
test[epoch] = test_error
nn_results = []
for j in range(testing_rows):
nn_results.append(network.forward_pass(testing[j]))
plt.plot(numpy.arange(1000), test, lw=2, label = 'test')
plt.plot(numpy.arange(1000), train, lw=2, label = 'train')
plt.legend(loc=0, frameon=False)
plt.xlabel('Epoch')
plt.ylabel('MSE')
remove_border()
def score(df, split, nn_results):
t = df[-(split):]
t['out'] = float('nan')
for i in range(split):
if nn_results[i] < .5:
t['out'][i] = 0
else:
t['out'][i] = 1
return t
dataset_score = score(dataset, len(testing), nn_results)
print 'Kevin / Lab 10'
print len(dataset_score[-(split):][dataset_score['s_and_p_500_pdiff_b_0.00-100.00_adv'] == dataset_score['out']]) / float(len(dataset_score)) * 100
dataset_score[['out', 's_and_p_500_pdiff_b_0.00-100.00_adv']][dataset_score['out'] == 0]
# print len(dataset_score[['out', 's_and_p_500_pdiff_b_0.00-100.00_adv']][dataset_score['s_and_p_500_pdiff_b_0.00-100.00_adv'] == 0])
# print len(dataset_score)
Kevin / Lab 10 64.0
out | s_and_p_500_pdiff_b_0.00-100.00_adv | |
---|---|---|
2010-11-09 | 0 | 0 |
def results(df, target, nn_results):
temp = df
for i in range(1,len(df)):
temp[nn_results][i-1] = temp[nn_results][i]
temp = temp[:-1]
mi = temp[target].min()-.01
ma = temp[target].max()+.01
temp[nn_results][temp[nn_results] == 1] = ma
temp[nn_results][temp[nn_results] == 0] = mi
ap = 0
tp = 0
for i in range(len(temp)-1):
t = temp['s_and_p_500'].ix[i] - temp['s_and_p_500'].ix[i+1]
x = temp.index[i:i+2]
y1 = temp[target][i:i+2]
y2 = temp[nn_results][i]
plt.vlines(x[0], mi, ma, label=('week'), linewidth=.75, color='w')
if(t > 0 and temp[nn_results][i] < 0) or (t < 0 and temp[nn_results][i] > 0):
plt.fill_between(x, [y2,y2], color=brewer_rg[1], label=('accurate prediction'))
ap += 1
tp += 1
else:
plt.fill_between(x, [y2,y2], color=brewer_rg[0], label=('inacurate prediction'))
tp += 1
plt.fill_between(x, y1, color='k', label='s&p 500 pdiff')
plt.ylim(mi, ma)
plt.xticks(rotation='vertical')
plt.xlabel('date')
plt.ylabel('s&p 500 pdiff')
plt.title('accuracey of nn prediction')
# plt.legend(frameon=False)
remove_border()
return (ap, tp)
ap, tp = results(dataset_score, 's_and_p_500_pdiff', 'out')
print 'total predictions: %d' % tp
print 'accurate predictions: %d (%.2f%%)' % (ap, (ap/float(tp))*100)
print 'inacurate predictions: %d (%.2f%%)' % (tp-ap, ((tp-ap)/float(tp))*100)
total predictions: 98 accurate predictions: 62 (63.27%) inacurate predictions: 36 (36.73%)