# Code to allow inline plotting, derived from psets
import warnings
warnings.filterwarnings('ignore')

from helperfile import *
plt.rc('figure', figsize=(10, 6))
%matplotlib inline

import json
import requests
import datetime
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

# set some nicer defaults for matplotlib
from matplotlib import rcParams

# these colors come from colorbrewer2.org; each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border is drawn.
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

with open('SPX.csv', 'r') as ff:
    print ff.readline()  # headers
    print ff.readline()  # first row

data = pd.read_csv('SPX.csv', parse_dates={'Timestamp': ['DATE']}, index_col='Timestamp')
data

history = data.ix[:, ['OPEN', 'VOLUME']]
history.head()

# weekly volume-weighted average price (VWAP): sum(price * volume) / sum(volume)
volume = history.VOLUME.resample('1w', how='sum')
value = history.prod(axis=1).resample('1w', how='sum')  # OPEN * VOLUME, summed per week
vwap = value / volume
print vwap

vwap.ix['1993-01-31':'2013-09-15']
selection = vwap.truncate('1993-01-31', '2013-09-15')
vol = pd.Series(volume.truncate('1993-01-31', '2013-09-15'), dtype=float)
# vol.head(20)

# attempt to fill possible missing data
selection.ix['1993-01-31':'2013-09-15'].head(20)
filled = selection.fillna(method='pad', limit=1)
filled.ix['1993-01-31':'2013-09-15'].head(20)
vol = vol.fillna(0.)
# vol.head(20)

# now for visualization: plot the volume-weighted average price, along with volume, against time
vol.plot(style='y')
filled.plot(secondary_y=True, style='r')
remove_border()

BB = history.OPEN.resample('1w', how='ohlc')
week_range = BB.high - BB.low
# week_range.describe()
week_return = BB.close / BB.open - 1
# week_return.describe()
# week_return.head()

MM = week_return.truncate('1993-01-31', '2013-09-15')
# lagged = MM.shift(1)
lagged = week_return.tshift(1, 'w').truncate('1993-01-31', '2013-09-15')

# ordinary least squares: regress this week's return on last week's
pd.ols(y=MM, x=lagged)

MM = vwap / BB.open - 1
MM = MM.truncate('1993-01-31', '2013-09-15')
lagged = MM.tshift(1, 'w').truncate('1993-01-31', '2013-09-15')
pd.ols(y=MM, x=lagged)
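# As a sanity check on the lag-one regressions above, the same question
# (do this week's returns predict next week's?) can be asked with a plain
# correlation. A minimal sketch using only numpy and pandas; MM and lagged
# are the aligned weekly return series defined above.
aligned = pd.DataFrame({'current': MM, 'lagged': lagged}).dropna()
print np.corrcoef(aligned['current'], aligned['lagged'])[0, 1]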
# numpy/matplotlib
%pylab inline

history = randn(10000)
plot(history)  # plot the 10,000 simulated daily returns
title('returns')
xlabel('day')
ylabel('return [%]')
remove_border()

h = hist(history, 100)
print 'Mean: %.2f' % history.mean()
print 'Standard deviation: %.2f' % history.std()

price = 100 * (history / 100 + 1).cumprod()  # normalized price with a starting asset base of $100
plot(price)  # cumulative product of the daily returns
title('normalized price')
xlabel('day number')
ylabel('price [$]')
remove_border()

history = randn(1000, 5000)
prices = 100 * (history / 100 + 1).cumprod(axis=0)  # normalized price along the time axis
h = plot(prices[:, :300])  # plot the first 300 paths
xlabel('day number')
ylabel('price [$]')
remove_border()

finalPrice = prices[-1, :]  # last row of the prices table: final price of each path
h = hist(finalPrice, 50)
xlabel('price [$]')
ylabel('frequency [#]')
print 'Mean:', finalPrice.mean()
print 'Std:', finalPrice.std()

history3x = history * 3.  # leveraged returns
prices3x = 100 * (history3x / 100 + 1).cumprod(axis=0)  # normalized price
finalPrice3x = prices3x[-1, :]
h1 = hist(finalPrice3x, 500)  # 3x-leveraged final prices
h2 = hist(finalPrice, 100)    # unleveraged final prices
xlim([0, 400])
legend(['3x', '1x'])
print 'Mean 3x:', finalPrice3x.mean()

warnings.filterwarnings('ignore')
import signal_processing as sp
sp.example()

# make a new dataframe from the original dataset, with the index reversed
prices = pd.DataFrame(data.reindex(index=data.index[::-1]).OPEN)
prices.plot()
remove_border()

prices['SMA50'] = sp.sma(prices.OPEN, 50)
prices['SMA200'] = sp.sma(prices.OPEN, 200)
prices.plot()
remove_border()
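# sp.sma lives in our signal_processing helper module; its exact code is
# there. As a reference point only, a simple moving average of a pandas
# Series can be written with the era-appropriate pd.rolling_mean -- this
# is an illustrative sketch, not the helper's actual implementation:
def sma_sketch(series, period):
    # average of the current value and the (period - 1) preceding values
    return pd.rolling_mean(series, window=period)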
equity_curves = pd.DataFrame(sp.dmac(prices.OPEN, prices.OPEN, sp.sma, 2, 5, 5, True),
                             columns=['Value SMA'], index=prices.index)
equity_curves.plot(); remove_border()

equity_curves['Technical SMA'] = sp.dmac(prices.OPEN, prices.OPEN, sp.sma, 2, 5, 5, False)
equity_curves['Baseline'] = 100 * prices.OPEN / prices.OPEN[0]  # S&P normalized to start at 100
equity_curves.plot(); remove_border()

stats = sp.performance(equity_curves['Baseline'], time_frame=251)
pd.DataFrame([stats.values()], columns=stats.keys(), index=['Baseline'])

stats = []
for strategy in ['Baseline', 'Value SMA', 'Technical SMA']:
    stats.append(sp.performance(equity_curves[strategy], time_frame=251))
pd.DataFrame([stat.values() for stat in stats], columns=stats[0].keys(),
             index=['Baseline', 'Value SMA', 'Technical SMA'])

from mpl_toolkits.mplot3d import Axes3D

# sweep short/long SMA periods and surface-plot the resulting returns
values = []
for short_period in range(1, 5):
    for long_period in range(short_period + 1, short_period + 5):
        values.append([short_period, long_period,
                       sp.dmac(prices.OPEN, prices.OPEN, sp.sma, short_period, long_period, 5, True, end_value=True)])
X, Y, Z = zip(*values)
print max(values, key=lambda x: x[2])
fig = plt.figure()
ax = Axes3D(fig)
ax.plot_trisurf(X, Y, Z)

# repeat the sweep for the other filters, in both value and technical modes
for filt, name in [(sp.ema, 'EMA'), (sp.gaussian, 'Gaussian'), (sp.butterworth, 'Butterworth')]:
    values = []
    for short_period in range(1, 5):
        for long_period in range(short_period + 1, short_period + 5):
            values.append([short_period, long_period,
                           sp.dmac(prices.OPEN, prices.OPEN, filt, short_period, long_period, 5, True, end_value=True)])
    optimal = max(values, key=lambda x: x[2])
    print "%s Value, Returns: %d, Periods: %d, %d" % (name, optimal[2], optimal[0], optimal[1])
    values = []
    for short_period in range(1, 5):
        for long_period in range(short_period + 1, short_period + 5):
            values.append([short_period, long_period,
                           sp.dmac(prices.OPEN, prices.OPEN, filt, short_period, long_period, 5, False, end_value=True)])
    optimal = max(values, key=lambda x: x[2])
    print "%s Technical, Returns: %d, Periods: %d, %d" % (name, optimal[2], optimal[0], optimal[1])

equity_curves['Value EMA'] = sp.dmac(prices.OPEN, prices.OPEN, sp.ema, 1, 5, 5, True)
equity_curves['Value Gaussian'] = sp.dmac(prices.OPEN, prices.OPEN, sp.gaussian, 2, 3, 5, True)
del equity_curves['Technical SMA']
equity_curves.plot()
remove_border()

stats = []
for strategy in ['Baseline', 'Value SMA', 'Value EMA', 'Value Gaussian']:
    stats.append(sp.performance(equity_curves[strategy], time_frame=251))
pd.DataFrame([stat.values() for stat in stats], columns=stats[0].keys(),
             index=['Baseline', 'Value SMA', 'Value EMA', 'Value Gaussian'])

stats = []
for strategy in ['Value SMA', 'Value EMA', 'Value Gaussian']:
    stats.append(sp.stats(equity_curves[strategy], equity_curves['Baseline'], time_frame=251))
pd.DataFrame([stat.values() for stat in stats], columns=stats[0].keys(),
             index=['Value SMA', 'Value EMA', 'Value Gaussian'])

equity_curves = pd.DataFrame(sp.rsi(prices.OPEN, 50), columns=['RSI'], index=prices.index)
equity_curves.plot(); remove_border()

equity_curves['Upper Bound'] = [70] * len(equity_curves.RSI)
equity_curves['Lower Bound'] = [30] * len(equity_curves.RSI)
# normalize the original S&P to start at 100
equity_curves['Baseline'] = 100 * prices.OPEN / prices.OPEN[0]
equity_curves.plot(); remove_border(); plt.legend(loc='upper left')

# sweep the RSI lookback period and plot returns against it
values = []
for period in range(1, 101):
    values.append([period, sp.equity_curve(sp.rsi(prices.OPEN, period), prices.OPEN,
                                           [70] * len(prices.OPEN), [30] * len(prices.OPEN),
                                           5, end_value=True)])
print max(values, key=lambda x: x[1])
periods, returns = zip(*values)
plt.plot(periods, returns)

equity_curves = pd.DataFrame(sp.equity_curve(sp.rsi(prices.OPEN, 13), prices.OPEN,
                                             [70] * len(prices.OPEN), [30] * len(prices.OPEN), 5),
                             columns=['RSI'], index=prices.index)
# normalize the original S&P to start at 100
equity_curves['Baseline'] = 100 * prices.OPEN / prices.OPEN[0]
equity_curves.plot(); remove_border()  # plt.legend(loc='upper left')
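# sp.rsi is implemented in our signal_processing module. For reference
# only, here is the textbook relative strength index for a pandas Series
# of prices -- an illustrative sketch, not the module's actual code:
def rsi_sketch(prices, period):
    delta = prices.diff()
    gains = delta.clip_lower(0.)    # upward moves
    losses = -delta.clip_upper(0.)  # magnitudes of downward moves
    rs = pd.rolling_mean(gains, period) / pd.rolling_mean(losses, period)
    return 100. - 100. / (1. + rs)  # oscillates between 0 and 100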
# close_px (a DataFrame of closing prices by ticker) comes in via the helper file
RR = close_px / close_px.shift(1) - 1
daily_index = (1 + RR).cumprod()
daily_index['MSFT'].plot()
remove_border()

# let's sample and plot monthly returns
monthly_index = daily_index.asfreq('EOM', method='ffill')
monthly_RR = monthly_index / monthly_index.shift(1) - 1
monthly_RR.plot()
remove_border()

subRR = RR.ix[:, ['AAPL', 'IBM', 'XOM', 'MSFT']]
(1 + subRR).cumprod().plot()
remove_border()

RR[-250:].hist(bins=50)
remove_border()

RR.ix[-1].plot(kind='bar')
remove_border()

# we have the following plot for Apple:
px = close_px['AAPL']
close_px['AAPL'].plot()
remove_border()

aapl_RR = px / px.shift(1) - 1
vol = pd.rolling_std(aapl_RR, 250) * np.sqrt(250)  # trailing 250-day annualized volatility
vol.plot()
remove_border()

evol = pd.ewmstd(aapl_RR, span=250) * np.sqrt(250)  # exponentially weighted version
vol.plot()
evol.plot()
plt.xlim([datetime.datetime(1991, 1, 1), datetime.datetime(2012, 1, 1)])

RR.corr()
plot_corr(RR.corr())
plot_corr(monthly_RR.corr())

# note: the function leave_one_out is included in the helper file
leave_one_out(RR)
pd.DataFrame(leave_one_out(RR))

grouped = RR.groupby(lambda x: x.year)
for year, group in grouped:
    print year
    print group[:5]

def get_beta(RR):
    # regress MSFT returns on AAPL returns (plus an intercept)
    RR = RR.dropna()
    RR['intercept'] = 1.
    model = sm.OLS(RR['MSFT'], RR.ix[:, ['AAPL', 'intercept']]).fit()
    return model.params

grouped = RR.groupby([lambda x: x.year, lambda x: x.month])
beta_by_ym = grouped.apply(get_beta)
beta_by_ym.unstack(0)['AAPL']

RR = close_px / close_px.shift(1) - 1
y = RR['AAPL']
x = RR.ix[:, ['MSFT']]
model = pd.ols(y=y, x=x)  # OLS model

# rolling least squares
model = pd.ols(y=y, x=x, window=250)
model.beta.info()
model.beta['MSFT'].plot()
remove_border()

def RB(returns, benchmark, window=250):
    betas = {}
    for col, ts in returns.iteritems():
        betas[col] = _r_b(ts, benchmark, window=window)
    return pd.DataFrame(betas)

def _r_b(returns, benchmark, window=250):
    model = pd.ols(y=returns, x={'bmk': benchmark}, window=window, min_periods=100)
    return model.beta['bmk']

def GMR(daily_returns):
    # compound daily returns, sample at month end, then re-difference
    daily_index = T(daily_returns)
    monthly_index = daily_index.asfreq('EOM', method='pad')
    return TR(monthly_index)

def T(returns):
    return (1 + returns).cumprod()

def TR(prices):
    return prices / prices.shift(1) - 1

rets = GMR(TR(close_px))
spx = rets.pop('SPX')
betas = RB(rets, spx, window=36)
betas['MSFT'].plot()
remove_border()

# benchmark is the S&P 500
px = close_px.ix[datetime.datetime(1992, 1, 1):]
RR = px / px.shift(1) - 1
annualizer = np.sqrt(250)
bmk = RR.pop('SPX')
tickers = RR.columns
weights = pd.Series(1. / len(tickers), index=tickers)  # equal-weight portfolio
weights

# plot the weighted portfolio versus the benchmark
port_returns = (RR * weights).sum(1)
plot_returns(port_returns, bmk)

calc_te(weights, RR, bmk) * annualizer  # annualized tracking error

clean_RR = RR.fillna(0)
clean_bmk = bmk.fillna(0)
opt_weights = get_opt_holdings_opt(clean_RR, clean_bmk)
calc_te(opt_weights, RR, bmk) * annualizer

def rolling_min_te_portfolio(UNIRET, TRARET, window=250, T_intervals=100):
    ports = {}
    for i, cur_date in enumerate(UNIRET.index):
        if i < T_intervals:
            continue
        if i % 100 == 0:
            print >> sys.stdout, '*',  # progress marker
        UNI = UNIRET[max(0, i - window + 1):i]
        TRA = TRARET[max(0, i - window + 1):i]
        UNI = UNI.ix[:, UNI.count() > 0]  # keep only names with data in the window
        UNI = UNI.fillna(0)
        TRA = TRA.fillna(0)
        ports[cur_date] = get_opt_holdings_opt(UNI, TRA)
    return pd.DataFrame(ports).T

ports = rolling_min_te_portfolio(RR, bmk)
ports
ports['JNJ'].plot()
remove_border()

# lag the weights one day so each day's return uses the prior day's holdings
port_return = (ports.shift(1) * RR).sum(1)
plot_returns(port_return, bmk)

def sharpe(RR, ann=250):
    return np.sqrt(ann) * RR.mean() / RR.std()

sharpe(port_return)
sharpe(bmk)
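# calc_te and get_opt_holdings_opt come from the helper file; their exact
# code is there. Conceptually, tracking error is the standard deviation of
# the active return (portfolio minus benchmark). A sketch of that idea --
# an assumption about the helper, not its actual implementation:
def calc_te_sketch(weights, returns, benchmark):
    active = (returns * weights).sum(1) - benchmark  # daily active return
    return active.std()  # multiply by sqrt(250) to annualize, as above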
# ours
import twitter3 as tw3
import performance1 as perf1
import performance2 as perf2
import sentiment1 as sent

# standard
import pandas as pd
import numpy as np
import datetime
from pattern.web import Element
import requests
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
# import ystockquote as ysq

# scrape the S&P 500 ticker list from Wikipedia (run once, then cached to CSV):
# spy = requests.get('http://en.wikipedia.org/wiki/List_of_S&P_500_companies').text
# tickers = []
# dom = Element(spy)
# for a in dom('tr td:first-child a'):
#     ticker = a.content
#     if len(ticker) < 5:
#         tickers.append(str(ticker))
# tickers.append('BRK.B')
# tickers.append('CMCSA')
# tickers.append('DISCA')
# tickers = np.sort(tickers)
# tickerlist = tickers

spx = pd.read_csv('SPXSymbolsPD.csv', delimiter=',')
tickerlist = list(spx['Ticker'])

datelist = [datetime.date(2013, 12, 5), datetime.date(2013, 12, 6),
            datetime.date(2013, 12, 7), datetime.date(2013, 12, 8),
            datetime.date(2013, 12, 9), datetime.date(2013, 12, 10)]
weekdays_datetime = [datetime.date(2013, 12, 5), datetime.date(2013, 12, 6),
                     datetime.date(2013, 12, 9), datetime.date(2013, 12, 10)]
weekdays_str = [str(day) for day in weekdays_datetime]
weekend_datetime = [datetime.date(2013, 12, 7), datetime.date(2013, 12, 8)]
weekend_str = [str(day) for day in weekend_datetime]

# loadfull = tw3.safemultisearch(tickerlist, datelist)
# loadfull.to_csv('data.csv')
data = pd.read_csv('data.csv')

# remove weekend days, as the market is closed
data = data[data['date'] != weekend_str[0]]
data = data[data['date'] != weekend_str[1]]

# data, performance = perf1.check_performance(data)
# data = data[data['performance'] > -1]  # only keep if performance is 0 or 1, not NaN
# data.to_csv('data_perf.csv')
data = pd.read_csv('data_perf.csv')
print data.irow(0)
print data.irow(0)['text']

X, Y = sent.make_xy(data, vectorizer=None)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=22)
clf = MultinomialNB()
clf.fit(X_train, Y_train)
clf.predict(X_test)
print "accuracy on testing set", 1 - float(sum(abs(Y_test - clf.predict(X_test)))) / float(len(Y_test))
print "accuracy on training set", 1 - float(sum(abs(Y_train - clf.predict(X_train)))) / float(len(Y_train))
print "fraction that are positive on the day", float(len(Y_test[Y_test == 1])) / float(len(Y_test))

# find the best values for alpha and min_df, and hence the best classifier
alphas = [0, .1, 1, 5, 10, 50, 100, 150, 200]
min_dfs = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
best_alpha = None
best_min_df = None
max_loglike = -np.inf
for alpha in alphas:
    print alpha  # the grid search takes a while; print progress
    clf = MultinomialNB(alpha=alpha)
    for min_df in min_dfs:
        vectorizer = CountVectorizer(min_df=min_df)
        X, Y = sent.make_xy(data, vectorizer)
        loglike = sent.cv_score(clf, X, Y, sent.log_likelihood)
        # print loglike
        if loglike > max_loglike:
            max_loglike = loglike
            print "max_loglike", max_loglike, "alpha", alpha, "min_df", min_df
            best_alpha = alpha
            best_min_df = min_df
print "best max_loglike", max_loglike, "best alpha", best_alpha, "best min_df", best_min_df
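# sent.cv_score and sent.log_likelihood are defined in our sentiment1
# helper. As a sketch of the standard pattern they follow (an assumption,
# not the helper's actual code): score a classifier by the total log
# probability it assigns to the true labels, averaged over CV folds.
from sklearn.cross_validation import KFold

def log_likelihood_sketch(clf, x, y):
    # sum of log P(true label | tweet) over all tweets
    prob = clf.predict_log_proba(x)
    neg = (y == 0)
    return prob[neg, 0].sum() + prob[~neg, 1].sum()

def cv_score_sketch(clf, x, y, score_func, n_folds=5):
    result = 0.
    for train, test in KFold(len(y), n_folds=n_folds):
        clf.fit(x[train], y[train])          # fit on the training fold
        result += score_func(clf, x[test], y[test])
    return result / n_folds                  # average over folds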
# rebuild the dataset with scaled (0-to-1) performance labels:
# data = pd.read_csv('data.csv')
# data = data[data['date'] != weekend_str[0]]
# data = data[data['date'] != weekend_str[1]]
# data, performance = perf2.check_performance_scaled(data)
# data = data[data['performance'] >= 0]  # only keep if performance is 0 to 1, not NaN
# data.to_csv('data_perf_scaled.csv')
data = pd.read_csv('data_perf_scaled.csv')

X, Y = sent.make_xy(data, vectorizer=None)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=22)
clf = MultinomialNB()
clf.fit(X_train, Y_train)

predvec = clf.predict(X_test)
indexpred = np.abs(predvec - .5) > 0  # drop predictions sitting exactly at .5
# map the scaled labels and predictions to hard up/down (1/0) by their sign around .5
actual = np.divide(Y_test[indexpred] - .5, np.abs(Y_test[indexpred] - .5)) * .5 + .5
predict = np.divide(predvec[indexpred] - .5, np.abs(predvec[indexpred] - .5)) * .5 + .5
print "accuracy on testing set", 1 - float(sum(abs(actual - predict))) / float(len(actual))
print "portion of stocks with motion going up in testing set", float(len(predict[predict > 0])) / float(len(predict))

predvec = clf.predict(X_train)
indexpred = np.abs(predvec - .5) > 0
actual = np.divide(Y_train[indexpred] - .5, np.abs(Y_train[indexpred] - .5)) * .5 + .5
predict = np.divide(predvec[indexpred] - .5, np.abs(predvec[indexpred] - .5)) * .5 + .5
print "accuracy on training set", 1 - float(sum(abs(actual - predict))) / float(len(actual))

# which individual words does the classifier read as most bullish or bearish?
vec = CountVectorizer(min_df=1e-3)
# vec = CountVectorizer()
text = [words for i, words in data.text.iteritems()]
vec.fit(text)
words = np.array(vec.get_feature_names())
singles = np.eye(len(words))  # one one-hot row per vocabulary word

X_df, Y_df = sent.make_xy(data, vectorizer=vec)
X_train_df, X_test_df, Y_train_df, Y_test_df = train_test_split(X_df, Y_df, test_size=0.33, random_state=22)
clf_df = MultinomialNB()
clf_df.fit(X_train_df, Y_train_df)
# clf_df = clf

negativeindices = clf_df.predict_proba(sparse.csc_matrix(singles))[:, 0].argsort()
positiveindices = clf_df.predict_proba(sparse.csc_matrix(singles))[:, 1].argsort()
badwords = words[negativeindices][::-1]   # most negative words first
badprobs = clf_df.predict_proba(singles)[negativeindices, 1][::-1]
goodwords = words[positiveindices][::-1]  # most positive words first
goodprobs = clf_df.predict_proba(singles)[positiveindices, 1][::-1]
print "negative words", [(badwords[i], badprobs[i]) for i in range(10)]
print "positive words", [(goodwords[i], goodprobs[i]) for i in range(10)]
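# usage sketch: score a fresh tweet with the trained classifier. The tweet
# text below is invented for illustration only, and the P(down)/P(up)
# reading of the two probability columns follows the usage above.
new_tweet = ["$AAPL beats expectations, shares rally"]
x_new = vec.transform(new_tweet)  # reuse the already-fitted vectorizer
print "P(down), P(up):", clf_df.predict_proba(x_new)[0]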