# !pip install Quandl
%matplotlib inline
import numpy as np
import numpy
import pandas as pd
import math
import scipy
import random
import Quandl
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import string
import calendar
import datetime
from pandas.tseries.offsets import *
import operator
from itertools import combinations, permutations
from matplotlib import rcParams
import scipy.stats as stats
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
import sklearn

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

#http://colorbrewer2.org/
brewer_rg = ['#E41A1C', '#4DAF4A']

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'

def remove_border(ax=None, axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    if ax is None:
        ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    #remove grid
    ax.grid(False)

    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

# Quandl API key
authtoken = 'Cx1CtXeu61zjTzpehmNV'
'''
input: sources - list of data source titles from Quandl, ex. 'FRED' for Federal Reserve Economic Data
function: scrapes the Quandl website using its search API to find the Quandl code for every variable
          from a given source
return: data - dictionary with names and Quandl codes for every variable from the supplied sources
'''
def scrape_quandl(sources):
    data = {}
    for source in sources:
        for j in range(1, 100):
            data_options = Quandl.search(query="", source=source, page=j)
            for i in data_options:
                data[i["code"]] = i["code"]
    #Adds the S and P to the dataset so that it is always included
    data['s_and_p_500'] = 'YAHOO/INDEX_GSPC.6'
    return data

'''
get_quandl_data
input: dict of Quandl codes
function: downloads Quandl data
          sets the column name of each dataframe to its respective key from the dict
return: list of dataframes
'''
def get_quandl_data(quandl_codes, cols=True):
    data_list = []
    for number, code in enumerate(quandl_codes.values()):
        data_holder = Quandl.get([code], authtoken=authtoken)
        if (cols == True):
            data_holder.columns = [quandl_codes.keys()[number]]
        data_list.append(data_holder)
    return data_list

'''
collapser
input: dataframe of Quandl data
function: adds rows for every business day during the timeframe of that dataset,
          performs linear interpolation to fill in all missing values,
          then drops all values except for Mondays
return: cleaned dataframe of Quandl data
'''
def collapser(df):
    df = df.asfreq(BDay())
    df = df.apply(pd.Series.interpolate)
    df = df.asfreq(Week(weekday=1))
    return df

'''
merger
input: list of dataframes of Quandl data
function: runs collapser on each dataframe,
          finds the latest start date and earliest end date across all dataframes and truncates at those dates,
          concatenates all datasets into a single dataframe
return: dataframe of Quandl data of uniform length with no missing values
        start and end date of dataframe
'''
def merger(data_list):
    begin_date = datetime.datetime(1900, 1, 1, 12, 13, 14)
    end_date = datetime.datetime(2100, 1, 1, 12, 13, 14)
    for i in reversed(range(len(data_list))):
        try:
            data_list[i] = collapser(data_list[i])
            if (data_list[i].index[0] > datetime.datetime(2000, 1, 1, 12, 13, 14)):
                del data_list[i]
            # NOTE: the cutoff in the next test was lost from this copy; 2012 is assumed here,
            # matching the "at least 12 years of data" rule described in prepare_data
            elif (data_list[i].index[-1] < datetime.datetime(2012, 1, 1, 12, 13, 14)):
                del data_list[i]
            elif (data_list[i].index[0] > begin_date):
                begin_date = data_list[i].index[0]
            elif (data_list[i].index[-1] < end_date):
                end_date = data_list[i].index[-1]
        except:
            del data_list[i]
    # NOTE: the tail of this function was garbled in this copy; reconstructed from the
    # docstring above and from how merger is called below
    for i in range(len(data_list)):
        data_list[i] = data_list[i].truncate(before=begin_date, after=end_date)
    df = pd.concat(data_list, axis=1)
    return df, (begin_date, end_date)

# NOTE: the original definitions of percentageDifference, dataframe_to_array, and the start
# of biner were garbled in this copy; the versions below are reconstructed from their call
# sites and the surviving fragment of biner, so treat them as sketches of the originals.

'''
percentageDifference
input: dataframe and a list of column names
function: adds a week-over-week percent change column for each listed column
return: dataframe with the new '_pdiff' columns (the first row, which has no previous
        week to difference against, is dropped)
'''
def percentageDifference(df, cols):
    for col in cols:
        title = '%s_pdiff' % col
        df[title] = float('nan')
        for i in range(1, len(df)):
            df[title][i] = (df[col][i] - df[col][i-1]) / df[col][i-1]
    return df[1:]

'''
dataframe_to_array
input: dataframe, list of feature columns, list holding the target column
function: pulls the named columns out of the dataframe, target last
return: numpy array whose final column is the target
'''
def dataframe_to_array(df, cols, target):
    return df[list(cols) + list(target)].values

'''
biner
input: dataframe, list of column names, list of bin edges, fill value for rows outside a bin
function: for each column and each pair of adjacent bin edges, adds a 0/1 column that is 1
          where the value falls inside the bin
return: dataframe with the new columns and the list of added column names
'''
def biner(df, cols, bins, fill=0):
    add_cols = []
    for col in cols:
        for i in range(len(bins) - 1):
            title = '%s_b_%.2f-%.2f' % (col, bins[i], bins[i+1])
            df[title] = fill
            df[title][(df[col] >= bins[i]) & (df[col] < bins[i+1])] = 1
            add_cols.append(title)
    return df, add_cols

def advance_cols(df, cols):
    for col in cols:
        title = '%s_adv' % col
        df[title] = float('nan')
        for i in range(len(df) - 1):
            df[title][i] = df[col][i+1]
    return df[:-1]

'''
input: data - dictionary with names and Quandl codes for every variable from the supplied sources
function: calls for a pd dataframe, cleans and interpolates the data so that it is all weekly,
          deletes datasets without at least 12 years of data or any that throw an error,
          then adds the prediction for next week column
return: dataset - Pandas dataframe with columns for every variable
'''
def prepare_data(data):
    list_of_data = get_quandl_data(data, cols=False)
    dataset, dates = merger(list_of_data)
    dataset = percentageDifference(dataset, ['YAHOO.INDEX_GSPC - Adjusted Close'])
    dataset, add_cols = biner(dataset, ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff'], [0.000001, 100.], 0)
    dataset = advance_cols(dataset, add_cols)
    return dataset
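# Illustration (not from the original notebook): a toy run of the labelling chain above
# with made-up prices, showing how pdiff -> biner -> advance_cols turns a weekly price
# series into a 0/1 "does it rise next week?" target.
toy = pd.DataFrame({'price': [100., 101., 99., 102.]},
                   index=pd.date_range('2013-01-01', periods=4, freq='W'))
toy = percentageDifference(toy, ['price'])                         # weekly percent change
toy, toy_cols = biner(toy, ['price_pdiff'], [0.000001, 100.], 0)   # 1 = up week
toy = advance_cols(toy, toy_cols)                                  # shift the label back one week
print toy[['price_pdiff', toy_cols[0] + '_adv']]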
'''
input: dataset - Pandas dataframe with columns for every variable
function: Goes variable by variable and tests if adding that variable improves the score of the
          Naive Bayes model. If a variable improves the model it is kept; if it leaves the model
          the same or makes it worse, it is ignored
return: columns_keep - list of Quandl codes that made it into the final model
                       (the columns that were useful for prediction)
        k - float with the final score of the most accurate model
'''
def NB_Add_Drop(dataset):
    validate = 0
    #list_of_data = get_quandl_data(data)
    #dataset, dates, num_col = merger(list_of_data)
    #dataset = percentageDifference(dataset, ['s_and_p_500'])
    #dataset, add_cols = biner(dataset, ['s_and_p_500_pdiff'], [0.000001, 100.])
    #dataset = advance_cols(dataset, add_cols)
    columns_keep = ['YAHOO.INDEX_GSPC - Adjusted Close']
    for i in dataset.columns:
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff'):
            continue
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'):
            continue
        try:
            columns_keep.append(i)
            data_array = dataframe_to_array(dataset, columns_keep,
                                            ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'])
            data_array_in = data_array[:, :-1]
            data_array_out = data_array[:, -1:]
            #Converts the x by 1 by 1 array into an x by 1 array
            data_array_out = data_array_out.reshape(data_array_out.shape[0])
            mean_list = []
            for j in range(0, 1000):
                xtrain, xtest, ytrain, ytest = train_test_split(data_array_in, data_array_out)
                clf = MultinomialNB().fit(xtrain, ytrain)
                mean_list.append(clf.score(xtest, ytest))
            k = sum(mean_list) / float(len(mean_list))
            if (k <= validate):
                columns_keep.remove(i)
            else:
                validate = k
        except:
            columns_keep.remove(i)
            continue
    return columns_keep, k

'''
NB_All_Vars
input: dataset - Pandas dataframe with columns for every variable
       tops - int, number of top variables you want returned
function: Runs a Naive Bayes that looks at each variable and the S and P and returns the
          top x variables as specified
return: top_regressions - dict mapping each of the top variables to the score it achieved
'''
def NB_All_Vars(dataset, tops):
    columns_keep = ['YAHOO.INDEX_GSPC - Adjusted Close']
    regressions = {}
    for i in dataset.columns:
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff'):
            continue
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'):
            continue
        try:
            columns_keep.append(i)
            data_array = dataframe_to_array(dataset, columns_keep,
                                            ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'])
            data_array_in = data_array[:, :-1]
            data_array_out = data_array[:, -1:]
            #Converts the x by 1 by 1 array into an x by 1 array
            data_array_out = data_array_out.reshape(data_array_out.shape[0])
            mean_list = []
            for j in range(0, 1000):
                xtrain, xtest, ytrain, ytest = train_test_split(data_array_in, data_array_out)
                clf = MultinomialNB().fit(xtrain, ytrain)
                mean_list.append(clf.score(xtest, ytest))
            k = sum(mean_list) / float(len(mean_list))
            regressions[i] = k
            columns_keep.remove(i)
        except:
            columns_keep.remove(i)
            continue
    top_regressions = dict(sorted(regressions.iteritems(), key=operator.itemgetter(1), reverse=True)[:tops])
    return top_regressions
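# NB_Add_Drop and NB_All_Vars above (and NB_Add_Drop_Top below) all score a candidate column
# set the same way: MultinomialNB accuracy averaged over many random train/test splits.
# This helper is not part of the original notebook, just a sketch of that shared step under
# the same assumptions (target column last, labels 0/1).
def nb_mean_score(dataset, columns_keep, target_col, n_splits=1000):
    data_array = dataframe_to_array(dataset, columns_keep, [target_col])
    features = data_array[:, :-1]
    labels = data_array[:, -1:].reshape(data_array.shape[0])
    scores = []
    for _ in range(n_splits):
        xtrain, xtest, ytrain, ytest = train_test_split(features, labels)
        scores.append(MultinomialNB().fit(xtrain, ytrain).score(xtest, ytest))
    return sum(scores) / float(len(scores))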
'''
NB_Add_Drop_Top
input: dataset - Pandas dataframe with columns for every variable
       top_regressions - dict of the top variables returned by NB_All_Vars
function: Runs the Add_Drop Naive Bayes model but only on the variables returned in top_regressions
return: columns_keep - list of Quandl codes that made it into the final model
                       (the columns that were useful for prediction)
        k - float with the final score of the most accurate model
'''
def NB_Add_Drop_Top(dataset, top_regressions):
    validate = 0
    columns_keep = ['YAHOO.INDEX_GSPC - Adjusted Close']
    for i in top_regressions.keys():
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff'):
            continue
        if (i == 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'):
            continue
        try:
            columns_keep.append(i)
            print columns_keep
            data_array = dataframe_to_array(dataset, columns_keep,
                                            ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'])
            data_array_in = data_array[:, :-1]
            data_array_out = data_array[:, -1:]
            #Converts the x by 1 by 1 array into an x by 1 array
            data_array_out = data_array_out.reshape(data_array_out.shape[0])
            mean_list = []
            for j in range(0, 1000):
                xtrain, xtest, ytrain, ytest = train_test_split(data_array_in, data_array_out)
                clf = MultinomialNB().fit(xtrain, ytrain)
                mean_list.append(clf.score(xtest, ytest))
            k = sum(mean_list) / float(len(mean_list))
            if (k <= validate):
                columns_keep.remove(i)
            else:
                validate = k
        except:
            columns_keep.remove(i)
            continue
    return columns_keep, k

data_NB = scrape_quandl(["FRED", "OFDP"])
datalist = prepare_data(data_NB)
top_regressions = NB_All_Vars(datalist, 20)
final_NB = NB_Add_Drop_Top(datalist, top_regressions)

'''
dict of all the Quandl datasets we will use
the key will be used as the column name for convenience
'''
data = {'gold_price': 'BUNDESBANK/BBK01_WT5511.1',
        'usd_to_pound': 'QUANDL/USDGBP.1',
        'cpi': 'FRED/CPIAUCSL.1',
        'unemployment': 'FRED/UNRATE.1',
        'gas_price': 'BTS_MM/RETAILGAS.1',
        'volatility': 'YAHOO/INDEX_VIX.6',
        'house_sales': 'FRED/HSN1F.1',
        'usd_to_euro': 'QUANDL/USDEUR.1',
        'crude_futures': 'OFDP/FUTURE_CL2.1',
        'housing_prices': 'FRED/ASPTFC.1',
        'gas_futures': 'OFDP/FUTURE_NG1.2',
        '10-year_treasury': 'FRED/DGS10',
        'corporate_profits': 'FRED/CP',
        'wheat_futures': 'OFDP/FUTURE_W1.5',
        'gdp': 'FRED/GDP',
        'treasury_futures': 'OFDP/FUTURE_US1.1',
        'inventories_sales_ratio': 'FRED/ISRATIO',
        'iron_ore': 'WORLDBANK/WLD_IRON_ORE.1',
        'retail_and_food': 'FRED/RSAFS.1',
        's_and_p_500': 'YAHOO/INDEX_GSPC.6'}

list_of_data = get_quandl_data(data)
dataset, dates = merger(list_of_data)
dataset = percentageDifference(dataset, data.keys())
dataset, add_cols = biner(dataset, ['s_and_p_500_pdiff'], [0.000001, 100.])
dataset = advance_cols(dataset, add_cols)

# sanity check
assert len(dataset[np.logical_not(np.isfinite(dataset)).any(axis=1)]) == 0

inc = dataset['s_and_p_500_pdiff'][dataset['s_and_p_500_pdiff'] > 0]
dec = dataset['s_and_p_500_pdiff'][dataset['s_and_p_500_pdiff'] <= 0]
mean = dataset['s_and_p_500_pdiff'].mean()
std = dataset['s_and_p_500_pdiff'].std()
in_one_std = dataset['s_and_p_500_pdiff'][(dataset['s_and_p_500_pdiff'] >= (mean - std)) &
                                          (dataset['s_and_p_500_pdiff'] <= (mean + std))]
out_one_std = dataset['s_and_p_500_pdiff'][(dataset['s_and_p_500_pdiff'] <= (mean - std)) |
                                           (dataset['s_and_p_500_pdiff'] >= (mean + std))]
int_inc = pd.concat([inc, out_one_std], axis=1, join='inner').ix[:, 0]
int_dec = pd.concat([dec, out_one_std], axis=1, join='inner').ix[:, 0]
extreme = dataset.sort_index(by=['s_and_p_500_pdiff'], ascending=False).s_and_p_500_pdiff
print_num = 5
extreme = pd.concat([extreme[:print_num], extreme[-print_num:]])
print 'the s&p 500 increased %d weeks (%.2f%%) and decreased %d weeks (%.2f%%) between %s and %s' % (
    len(inc), (float(len(inc)) / len(dataset)) * 100, len(dec), (float(len(dec)) / len(dataset)) * 100,
    dates[0].strftime('%B %d, %Y'), dates[1].strftime('%B %d, %Y'))
print 'over the whole period the s & p 500 increased by %.2f%%' % (
    (dataset['s_and_p_500'][-1] - dataset['s_and_p_500'][0]) / dataset['s_and_p_500'][0] * 100)
print 'the average weekly increase was %.2f%% with a standard deviation of %.2f%%' % (
    (inc.mean() * 100), inc.std() * 100)
print 'the average weekly decrease was %.2f%% with a standard deviation of %.2f%%' % (
    abs(dec.mean() * 100), dec.std() * 100)
print 'the average weekly change over the whole period was %.2f%% with a standard deviation of %.2f%%' % (
    mean * 100, std * 100)
print 'the weekly change was outside 1 standard deviation of the mean %d times' % len(out_one_std)
print 'of those %d of them were increases and had an average change of %.2f%% and a standard deviation of %.2f%%' % (
    len(int_inc), int_inc.mean() * 100, int_inc.std() * 100)
print 'and %d of them were decreases and had an average change of %.2f%% and a standard deviation of %.2f%%' % (
    len(int_dec), int_dec.mean() * 100, int_dec.std() * 100)
print
print 'the %d most extreme increases/decreases were' % print_num
for i in range(len(extreme)):
    print '-' * 25
    print extreme.index[i].strftime('%B %d, %Y')
    print '%.2f%%' % (extreme[i] * 100)

plt.figure()
ax = dataset['s_and_p_500_pdiff'].hist(bins=int(np.sqrt(len(dataset))))
plt.xlabel('weekly percent change of the s&p 500')
plt.ylabel('frequency')
plt.title('weekly percent change of the s&p 500')
plt.vlines((mean + std), 0, 100, label=('+/-1 standard deviation'))
plt.vlines((mean - std), 0, 100)
plt.legend(frameon=False)
remove_border(ax)

'''
input: column name
function: removes '_' from column name
return: clean column name
'''
def clean_column_name(col):
    return ' '.join(col.split('_'))

'''
input: dataframe
       a list of columns (y axis)
       target column (x axis)
function: plots each column of the dataframe against the target column
          overplots a linear fit
return: a list of column names sorted by correlation coefficient
        dict of the slopes of the linear regression line for each column
'''
def scatter_plot(df, cols, target):
    xaxis = target
    yaxis = cols
    ff = 5 * (len(yaxis) - 1)
    fig, axes = plt.subplots(nrows=len(yaxis), figsize=(5, ff))
    Rs = []
    slope = {}
    for i, y in enumerate(yaxis):
        axes[i].scatter(df[xaxis], df[y])
        m, b, r, p, se = scipy.stats.linregress(df[xaxis], df[y])
        lab = 'r^2 = %.2f \np = %.2f' % (r * r, p)
        axes[i].plot(df[xaxis], np.array(df[xaxis]) * m + b, label=lab)
        axes[i].set_xlabel(clean_column_name(xaxis))
        axes[i].set_ylabel(clean_column_name(y))
        axes[i].legend(frameon=False)
        # axes[i].set_title(lab)
        remove_border(axes[i])
        Rs.append((y, (r * r)))
        slope[y] = m
    fig.tight_layout()
    return sorted(Rs, key=operator.itemgetter(1), reverse=True), slope

r, slope = scatter_plot(dataset, data.keys(), 's_and_p_500')
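# The sorted R^2 list returned by scatter_plot is never displayed in the original; purely as
# an illustration, the variables with the strongest linear relationship to the S&P 500 could
# be listed like this:
for name, r_squared in r[:3]:
    print '%s: r^2 = %.2f' % (clean_column_name(name), r_squared)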
'''
input: dataset
function: checks for collinearity between all variables
          excludes percent difference, binned, and advanced columns by default
return: sorted list of tuples of column names and their correlation coefficient
'''
def colin(df, extra=False):
    cols = list(df.columns)
    if extra == False:
        cols = [c for c in list(df.columns)
                if not (c.endswith('_pdiff') or '_b_' in c or c.endswith('_adv'))]
    cols = combinations(cols, 2)
    Rs = []
    for col in cols:
        m, b, r, p, se = scipy.stats.linregress(df[col[0]], df[col[1]])
        Rs.append((col, (r * r)))
    return sorted(Rs, key=operator.itemgetter(1), reverse=True)

print_num = 5
print 'top %d most collinear' % print_num
for i in colin(dataset)[:print_num]:
    print '-' * 25
    print clean_column_name(i[0][0])
    print clean_column_name(i[0][1])
    print i[1]

'''
input: dataframe
       column name
       optional begin and end date
function: creates a line graph of a specific column
          if the data increases week to week, fill green
          if the data decreases week to week, fill red
return: none
'''
def filled_plt(df, col, begin_date=None, end_date=None):
    if begin_date == None:
        begin_date = df.index[0]
    if end_date == None:
        end_date = df.index[-1]
    temp = df.truncate(before=begin_date, after=end_date)
    for i in range(len(temp) - 1):
        x = temp.index[i:i+2]
        y = temp[col][i:i+2]
        c = brewer_rg[1]
        if y[0] >= y[1]:
            c = brewer_rg[0]
        plt.fill_between(x, y, color=c)
    ymin = min([0, temp[col].min()])
    plt.ylim(ymin, temp[col].max())
    plt.xticks(rotation='vertical')
    plt.xlabel('date')
    plt.ylabel('value')
    plt.title('%s between %s and %s' % (clean_column_name(col),
                                        begin_date.strftime('%B %d, %Y'),
                                        end_date.strftime('%B %d, %Y')))
    remove_border()

filled_plt(dataset, 's_and_p_500', begin_date=datetime.datetime(2011,6,7,0,0,0))#, end_date=datetime.datetime(2010,1,1,0,0,0))

'''
input: dataframe
       column name to compare to (x axis)
       list of column names to check agreement with
       dict of slopes of regression lines for each dataset against the target
       optional begin and end date
function: creates a line graph of a specific column
          if the target data is in agreement with all specified other datasets, fill green
          agreement is defined by the slope of the regression line
          (+ means the two datapoints change in the same direction week over week,
           - means the two datapoints change in opposite directions week over week)
return: the number of times all specified datasets were in agreement with the target dataset
        total number of weeks that were checked
'''
def agreement_plt(df, target, var, dct, begin_date=None, end_date=None):
    if begin_date == None:
        begin_date = df.index[0]
    if end_date == None:
        end_date = df.index[-1]
    temp = df.truncate(before=begin_date, after=end_date)
    agree = 0.
    total = 0.
    title = ''
    for v in var:
        title += ' %s,' % clean_column_name(v)
    for i in range(len(temp) - 1):
        x = temp.index[i:i+2]
        y = temp[target][i:i+2]
        c = dark2_colors[0]
        t = []
        for v in var:
            if (dct[v] > 0):
                if ((y[0] >= y[1]) & (temp[v][i] >= temp[v][i+1])) | ((y[0] <= y[1]) & (temp[v][i] <= temp[v][i+1])):
                    t.append(0)
                else:
                    t.append(1)
            else:
                if ((y[0] >= y[1]) & (temp[v][i] <= temp[v][i+1])) | ((y[0] <= y[1]) & (temp[v][i] >= temp[v][i+1])):
                    t.append(0)
                else:
                    t.append(1)
        if (sum(t) == 0):
            plt.fill_between(x, y, color=c)
            agree += 1
            total += 1
        else:
            plt.plot(x, y, color=dark2_colors[1])
            total += 1
    ymin = min([0, temp[target].min()])
    plt.ylim(ymin, temp[target].max())
    plt.xticks(rotation='vertical')
    plt.xlabel('date')
    plt.ylabel('value')
    plt.title('trends of %s and %s between %s and %s' % (title, clean_column_name(target),
                                                         begin_date.strftime('%B %d, %Y'),
                                                         end_date.strftime('%B %d, %Y')))
    remove_border()
    return (agree, total)

agree = agreement_plt(dataset, 's_and_p_500', data.keys()[0:2], slope,
                      begin_date=datetime.datetime(2007,6,7,0,0,0),
                      end_date=datetime.datetime(2010,1,1,0,0,0))
print 'they are in agreement %.2f%% of the time' % (agree[0] / agree[1] * 100)

datakeys = [x + "_pdiff" for x in data.keys()]
data_array = dataframe_to_array(dataset, datakeys, ['s_and_p_500_pdiff_b_0.00-100.00_adv'])
training = data_array[0:-100]
testing = data_array[-100:]
training_rows = len(training)
training_columns = len(training[0])
testing_rows = len(testing)
testing_columns = len(testing[0])
split = len(dataset) - len(testing)
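# Illustrative check (not in the original): each row of training/testing holds the weekly
# pdiff features followed, in the last column, by the 0/1 label produced by biner and
# advance_cols, i.e. whether the S&P 500 rises the following week.
assert set(numpy.unique(training[:, -1])) <= set([0.0, 1.0])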
#Neural Network Class
class Neural_Net:

    #constructor initializes a new neural network with randomly selected weights and
    #pre-specified height, and number of neurons per layer
    def __init__(self, non, height):
        #list to store the number of neurons in each layer of the network
        self.num_of_neurons = non
        #height of the network
        self.L = height
        #array to store the weights of the network, indexed by layer, output neuron, input neuron
        self.weights = numpy.zeros(shape=((non[0]+1), (non[0]+1), (non[0]+1)))
        #delta_matrix: stores the gradient that is used in backpropagation
        self.deltas = numpy.zeros(shape=((non[0]+1), (non[0]+1)))
        #matrix that stores thresholded signals
        self.signals = numpy.zeros(shape=((non[0]+1), (non[0]+1)))
        #(tunable) learning_rate used in backpropagation
        self.learning_rate = .00001
        #initialize weights to be between -2 and 2
        for i in range(1, self.L+1):
            for j in range(1, self.num_of_neurons[i]+1):
                for k in range(self.num_of_neurons[i-1]+1):
                    self.weights[i][j][k] = (random.randrange(-2, 2)) #changed to adjust

    #forward_pass computes the output of the neural network given an input
    def forward_pass(self, x):
        #(for convenience, we index neurons starting at 1 instead of zero)
        self.signals[0][0] = -1
        for i in range(1, self.num_of_neurons[0]+1):
            self.signals[0][i] = x[i-1]
        for i in range(1, self.L+1):
            self.signals[i][0] = -1
            for j in range(1, self.num_of_neurons[i]+1):
                self.signals[i][j] = self.compute_signal(i, j)
        return self.signals[self.L][1]

    #tune_weights performs the backpropagation algorithm given a training example as input
    def tune_weights(self, y):
        self.deltas[self.L][1] = 2*(self.signals[self.L][1]-y)*(1-math.pow(self.signals[self.L][1], 2))
        for i in range(self.L-1, 0, -1):
            for j in range(1, self.num_of_neurons[i]+1):
                self.deltas[i][j] = self.compute_delta(i, j)
        for i in range(1, self.L+1):
            for j in range(1, self.num_of_neurons[i]+1):
                for k in range(self.num_of_neurons[i-1]+1):
                    self.weights[i][j][k] = self.weights[i][j][k] - self.learning_rate*self.signals[i-1][k]*self.deltas[i][j]

    #compute_signal: computes the signal s for a given neuron at a given level
    def compute_signal(self, level, neuron):
        s = 0
        for i in range(self.num_of_neurons[level-1]+1):
            s += self.weights[level][neuron][i]*self.signals[level-1][i]
        return self.g(s)

    #compute_delta: computes the delta for a given neuron at a given level
    def compute_delta(self, level, neuron):
        s = 0
        for j in range(1, self.num_of_neurons[level+1]+1):
            s += self.weights[level+1][j][neuron]*self.deltas[level+1][j]
        return (1-math.pow(self.signals[level][neuron], 2))*s

    #soft threshold function
    def g(self, s):
        #print s
        try:
            return (math.exp(s)-math.exp(-s))/(math.exp(s)+math.exp(-s))
        except OverflowError:
            return 0.

num_of_neurons = [(len(testing[0])-1), 13, 1]
network = Neural_Net(num_of_neurons, 2)
e = 500
train = numpy.zeros(shape=(e))
test = numpy.zeros(shape=(e))
for epoch in range(e):
    training_error = 0
    test_error = 0
    #compute the test errors
    for j in range(testing_rows):
        test_error = test_error + math.pow(network.forward_pass(testing[j]) - testing[j][testing_columns-1], 2)
    #compute the training errors, SEQUENTIALLY. In other words, we perform backpropagation
    #for *every* example instead of all at once.
    for i in range(training_rows):
        training_error = training_error + math.pow(network.forward_pass(training[i]) - training[i][training_columns-1], 2)
        network.tune_weights(training[i][training_columns-1])
    training_error = training_error/training_rows
    test_error = test_error/testing_rows
    train[epoch] = training_error
    test[epoch] = test_error

nn_results = []
for j in range(testing_rows):
    nn_results.append(network.forward_pass(testing[j]))

plt.plot(numpy.arange(e), test, lw=2, label='test')
plt.plot(numpy.arange(e), train, lw=2, label='train')
plt.legend(loc=0, frameon=False)
plt.xlabel('Epoch')
plt.ylabel('MSE')
remove_border()
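# The network's output neuron goes through the soft threshold g(s), so each entry of
# nn_results lies in (-1, 1) and is later thresholded at 0 to call an up or down week.
# Side note (not from the original): g(s) is just the hyperbolic tangent written out
# explicitly, so math.tanh gives the same value without the overflow the try/except guards:
for s in (-3., -0.5, 0., 0.5, 3.):
    assert abs(math.tanh(s) - network.g(s)) < 1e-12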
def results(df, graph, nn_results, sp, title="prediction"):
    temp = df.copy()
    for i in range(len(temp) - 1):
        temp[graph][i] = temp[graph][i+1]
    temp = temp[-len(nn_results):]
    mi = temp[graph].min() - .005
    ma = temp[graph].max() + .005
    ci = 0.
    cd = 0.
    ti = 0.
    td = 0.
    pi = 0.
    pde = 0.
    for i in range(len(nn_results) - 1):
        t = temp[graph][i]
        x1 = temp.index[i]
        x2 = temp.index[i+1]
        y1 = temp[graph][i]
        y2 = temp[graph][i+1]
        if (t > 0 and nn_results[i] > sp):
            plt.fill_between([x1, x2], [ma, ma], color=brewer_rg[1], label=('accurate prediction'))
            ci += 1
            ti += 1
            pi += 1
        elif (t < 0 and nn_results[i] <= sp):
            plt.fill_between([x1, x2], [mi, mi], color=brewer_rg[1], label=('accurate prediction'))
            cd += 1
            td += 1
            pde += 1
        elif (t > 0 and nn_results[i] <= sp):
            plt.fill_between([x1, x2], [mi, mi], color=brewer_rg[0], label=('inaccurate prediction'))
            ti += 1
            pde += 1
        elif (t < 0 and nn_results[i] > sp):
            plt.fill_between([x1, x2], [ma, ma], color=brewer_rg[0], label=('inaccurate prediction'))
            td += 1
            pi += 1
        plt.fill_between([x1, x2], [y1, y2], color='k', label='s&p 500 pdiff')
    plt.ylim(mi, ma)
    plt.xticks(rotation='vertical')
    plt.xlabel('date')
    plt.ylabel('s&p 500 pdiff')
    plt.title(title)
    remove_border()
    return ci, cd, ti, td, pi, pde

data_array = dataframe_to_array(datalist, top_regressions.keys(),
                                ['YAHOO.INDEX_GSPC - Adjusted Close_pdiff_b_0.00-100.00_adv'])
training = data_array[0:-100]
testing = data_array[-100:]
train_in = training[:, :-1]
train_out = training[:, -1:]
train_out = train_out.reshape(train_out.shape[0])
test_in = testing[:, :-1]
test_out = testing[:, -1:]
test_out = test_out.reshape(test_out.shape[0])
clf = MultinomialNB().fit(train_in, train_out)
k = clf.score(test_in, test_out)
predictions = clf.predict(test_in)
cinb, cdnb, tinb, tdnb, _, _ = results(datalist, 'YAHOO.INDEX_GSPC - Adjusted Close_pdiff',
                                       predictions, .5, title="accuracy of NB predictions")
print 'average accuracy: %.2f%%' % ((cinb+cdnb)/(tinb+tdnb)*100)
print 'accuracy on decrease: %.2f%%' % (cdnb/tdnb*100)
print 'accuracy on increase: %.2f%%' % (cinb/tinb*100)

ci, cd, ti, td, _, _ = results(dataset, 's_and_p_500_pdiff', nn_results, 0., title='accuracy of NN predictions')
print 'average accuracy: %.2f%%' % ((ci+cd)/(ti+td)*100)
print 'accuracy on decrease: %.2f%%' % (cd/td*100)
print 'accuracy on increase: %.2f%%' % (ci/ti*100)

def score(df, split, nn_results):
    t = df[-split:].copy()
    t['out'] = float('nan')
    for i in range(split):
        if nn_results[i] < 0:
            t['out'][i] = -1
        else:
            t['out'][i] = 1
    return t

dataset_out = score(dataset, len(testing), nn_results)

def buy_pass(df_score, nn_results):
    my_list = []
    market_list = []
    start = 100000000
    my_money = start
    market_money = start
    for i in range(1, (len(nn_results)-1)):
        market_money = market_money * (1 + df_score.s_and_p_500_pdiff[i+1])
        market_list.append(market_money)
        if (nn_results[i] > 0):
            my_money = my_money * (1 + df_score.s_and_p_500_pdiff[i+1])
        my_list.append(my_money)
    my_list = [(i - start)/start for i in my_list]
    market_list = [(i - start)/start for i in market_list]
    x = df_score.index[1:-1]
    return my_list, market_list, x

my_list, market_list, N = buy_pass(dataset_out, nn_results)
my_list = [i*100 for i in my_list]
market_list = [i*100 for i in market_list]

def buy_pass_plot(list_data, list_lab, N, x, y, title):
    mi = np.inf
    ma = -np.inf
    if len(list_data) != len(list_lab):
        raise Exception('length mismatch')
    for i in range(len(list_data)):
        plt.plot(N, list_data[i], label=list_lab[i])
        if min(list_data[i]) < mi:
            mi = min(list_data[i])
        if max(list_data[i]) > ma:
            ma = max(list_data[i])
    plt.axhline(y=.05, linewidth=1.5, color='k')
    plt.xticks(rotation='vertical')
    plt.xlabel(x)
    plt.ylabel(y)
    plt.title(title)
    plt.legend(loc=0, frameon=False)
    if 0 < mi:
        mi = 0
    plt.ylim(mi, ma)
    remove_border(bottom=False)
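# Toy illustration (made-up numbers, not from the original) of the compounding buy_pass
# performs above: the strategy only participates in weeks the network predicted an
# increase, while the market line compounds every week.
toy_pdiffs = [0.02, -0.01, 0.03]   # hypothetical weekly percent changes
toy_preds = [1., -1., 1.]          # hypothetical network outputs (only the sign matters)
toy_strategy = toy_market = 100.
for d, p in zip(toy_pdiffs, toy_preds):
    toy_market *= (1 + d)
    if p > 0:
        toy_strategy *= (1 + d)
print 'strategy: %.2f, market: %.2f' % (toy_strategy, toy_market)  # 105.06 vs 104.01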
buy_pass_plot([my_list, market_list], ['strategy', 'no strategy'], N, 'date', '% change',
              'basic trading strategy based on nn prediction')
print 'our model made %.2f%% while the market increased by %.2f%% over a period of %d weeks' % (
    my_list[-1], market_list[-1], len(my_list))

my_list_pdiff = []
for i in range(len(my_list)):
    my_list_pdiff.append(my_list[i] - market_list[i])

buy_pass_plot([my_list_pdiff], ['%'], N, 'date', '% better/worse', '% better or worse than market')
print 'our model performed %.2f%% better than the market' % (my_list[-1] - market_list[-1])