# Code to allow inline plotting, derived from psets
import warnings
warnings.filterwarnings('ignore')

from helperfile import *
plt.rc('figure', figsize=(10, 6))
%matplotlib inline

import json
import requests
import datetime
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

# set some nicer defaults for matplotlib
from matplotlib import rcParams

# these colors come from colorbrewer2.org; each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border is drawn.
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

with open('SPX.csv', 'r') as ff:
    print ff.readline()  # headers
    print ff.readline()  # first row

data = pd.read_csv('SPX.csv', parse_dates={'Timestamp': ['DATE']}, index_col='Timestamp')
data

history = data.ix[:, ['OPEN', 'VOLUME']]
history.head()

# weekly volume-weighted average price (VWAP): sum(price * volume) / sum(volume)
volume = history.VOLUME.resample('1w', how='sum')
value = history.prod(axis=1).resample('1w', how='sum')  # OPEN * VOLUME, summed per week
vwap = value / volume
print vwap

vwap.ix['1993-01-31':'2013-09-15']
selection = vwap.truncate('1993-01-31', '2013-09-15')
vol = pd.Series(volume.truncate('1993-01-31', '2013-09-15'), dtype=float)
# vol.head(20)

# attempt to fill possible missing data
selection.ix['1993-01-31':'2013-09-15'].head(20)
filled = selection.fillna(method='pad', limit=1)
filled.ix['1993-01-31':'2013-09-15'].head(20)
vol = vol.fillna(0.)
# vol.head(20)

# now for visualization: plot the volume-weighted average price, along with volume, against time
vol.plot(style='y')
filled.plot(secondary_y=True, style='r')
remove_border()

BB = history.OPEN.resample('1w', how='ohlc')
week_range = BB.high - BB.low
# week_range.describe()
week_return = BB.close / BB.open - 1
# week_return.describe()
# week_return.head()

MM = week_return.truncate('1993-01-31', '2013-09-15')
# lagged = MM.shift(1)
lagged = week_return.tshift(1, 'w').truncate('1993-01-31', '2013-09-15')

# ordinary least squares: regress this week's return on last week's
pd.ols(y=MM, x=lagged)

MM = vwap / BB.open - 1
MM = MM.truncate('1993-01-31', '2013-09-15')
lagged = MM.tshift(1, 'w').truncate('1993-01-31', '2013-09-15')
pd.ols(y=MM, x=lagged)
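# As a sanity check on the lag-one regressions above, the same question
# (do this week's returns predict next week's?) can be asked with a plain
# correlation. A minimal sketch using only numpy and pandas; MM and lagged
# are the aligned weekly return series defined above.
aligned = pd.DataFrame({'current': MM, 'lagged': lagged}).dropna()
print np.corrcoef(aligned['current'], aligned['lagged'])[0, 1]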
# numpy/matplotlib
%pylab inline

history = randn(10000)
plot(history)  # plot the 10,000 simulated daily returns
title('returns')
xlabel('day')
ylabel('return [%]')
remove_border()

h = hist(history, 100)
print 'Mean: %.2f' % history.mean()
print 'Standard deviation: %.2f' % history.std()

price = 100 * (history / 100 + 1).cumprod()  # normalized price with a starting asset base of $100
plot(price)  # cumulative product of the daily returns
title('normalized price')
xlabel('day number')
ylabel('price [$]')
remove_border()

history = randn(1000, 5000)
prices = 100 * (history / 100 + 1).cumprod(axis=0)  # normalized price along the time axis
h = plot(prices[:, :300])  # plot the first 300 paths
xlabel('day number')
ylabel('price [$]')
remove_border()

finalPrice = prices[-1, :]  # last row of the prices table: final price of each path
h = hist(finalPrice, 50)
xlabel('price [$]')
ylabel('frequency [#]')
print 'Mean:', finalPrice.mean()
print 'Std:', finalPrice.std()

history3x = history * 3.  # leveraged returns
prices3x = 100 * (history3x / 100 + 1).cumprod(axis=0)  # normalized price
finalPrice3x = prices3x[-1, :]
h1 = hist(finalPrice3x, 500)  # 3x-leveraged final prices
h2 = hist(finalPrice, 100)    # unleveraged final prices
xlim([0, 400])
legend(['3x', '1x'])
print 'Mean 3x:', finalPrice3x.mean()

warnings.filterwarnings('ignore')
import signal_processing as sp
sp.example()

# make a new dataframe from the original dataset, with the index reversed
prices = pd.DataFrame(data.reindex(index=data.index[::-1]).OPEN)
prices.plot()
remove_border()

prices['SMA50'] = sp.sma(prices.OPEN, 50)
prices['SMA200'] = sp.sma(prices.OPEN, 200)
prices.plot()
remove_border()
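# sp.sma lives in our signal_processing helper module; its exact code is
# there. As a reference point only, a simple moving average of a pandas
# Series can be written with the era-appropriate pd.rolling_mean -- this
# is an illustrative sketch, not the helper's actual implementation:
def sma_sketch(series, period):
    # average of the current value and the (period - 1) preceding values
    return pd.rolling_mean(series, window=period)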
equity_curves = pd.DataFrame(sp.dmac(prices.OPEN, prices.OPEN, sp.sma, 2, 5, 5, True),
                             columns=['Value SMA'], index=prices.index)
equity_curves.plot(); remove_border()

equity_curves['Technical SMA'] = sp.dmac(prices.OPEN, prices.OPEN, sp.sma, 2, 5, 5, False)
equity_curves['Baseline'] = 100 * prices.OPEN / prices.OPEN[0]  # S&P normalized to start at 100
equity_curves.plot(); remove_border()

stats = sp.performance(equity_curves['Baseline'], time_frame=251)
pd.DataFrame([stats.values()], columns=stats.keys(), index=['Baseline'])

stats = []
for strategy in ['Baseline', 'Value SMA', 'Technical SMA']:
    stats.append(sp.performance(equity_curves[strategy], time_frame=251))
pd.DataFrame([stat.values() for stat in stats], columns=stats[0].keys(),
             index=['Baseline', 'Value SMA', 'Technical SMA'])

from mpl_toolkits.mplot3d import Axes3D

# sweep short/long SMA periods and surface-plot the resulting returns
values = []
for short_period in range(1, 5):
    for long_period in range(short_period + 1, short_period + 5):
        values.append([short_period, long_period,
                       sp.dmac(prices.OPEN, prices.OPEN, sp.sma, short_period, long_period, 5, True, end_value=True)])
X, Y, Z = zip(*values)
print max(values, key=lambda x: x[2])
fig = plt.figure()
ax = Axes3D(fig)
ax.plot_trisurf(X, Y, Z)

# repeat the sweep for the other filters, in both value and technical modes
for filt, name in [(sp.ema, 'EMA'), (sp.gaussian, 'Gaussian'), (sp.butterworth, 'Butterworth')]:
    values = []
    for short_period in range(1, 5):
        for long_period in range(short_period + 1, short_period + 5):
            values.append([short_period, long_period,
                           sp.dmac(prices.OPEN, prices.OPEN, filt, short_period, long_period, 5, True, end_value=True)])
    optimal = max(values, key=lambda x: x[2])
    print "%s Value, Returns: %d, Periods: %d, %d" % (name, optimal[2], optimal[0], optimal[1])
    values = []
    for short_period in range(1, 5):
        for long_period in range(short_period + 1, short_period + 5):
            values.append([short_period, long_period,
                           sp.dmac(prices.OPEN, prices.OPEN, filt, short_period, long_period, 5, False, end_value=True)])
    optimal = max(values, key=lambda x: x[2])
    print "%s Technical, Returns: %d, Periods: %d, %d" % (name, optimal[2], optimal[0], optimal[1])

equity_curves['Value EMA'] = sp.dmac(prices.OPEN, prices.OPEN, sp.ema, 1, 5, 5, True)
equity_curves['Value Gaussian'] = sp.dmac(prices.OPEN, prices.OPEN, sp.gaussian, 2, 3, 5, True)
del equity_curves['Technical SMA']
equity_curves.plot()
remove_border()

stats = []
for strategy in ['Baseline', 'Value SMA', 'Value EMA', 'Value Gaussian']:
    stats.append(sp.performance(equity_curves[strategy], time_frame=251))
pd.DataFrame([stat.values() for stat in stats], columns=stats[0].keys(),
             index=['Baseline', 'Value SMA', 'Value EMA', 'Value Gaussian'])

stats = []
for strategy in ['Value SMA', 'Value EMA', 'Value Gaussian']:
    stats.append(sp.stats(equity_curves[strategy], equity_curves['Baseline'], time_frame=251))
pd.DataFrame([stat.values() for stat in stats], columns=stats[0].keys(),
             index=['Value SMA', 'Value EMA', 'Value Gaussian'])

equity_curves = pd.DataFrame(sp.rsi(prices.OPEN, 50), columns=['RSI'], index=prices.index)
equity_curves.plot(); remove_border()

equity_curves['Upper Bound'] = [70] * len(equity_curves.RSI)
equity_curves['Lower Bound'] = [30] * len(equity_curves.RSI)
# normalize the original S&P to start at 100
equity_curves['Baseline'] = 100 * prices.OPEN / prices.OPEN[0]
equity_curves.plot(); remove_border(); plt.legend(loc='upper left')

# sweep the RSI lookback period and plot returns against it
values = []
for period in range(1, 101):
    values.append([period, sp.equity_curve(sp.rsi(prices.OPEN, period), prices.OPEN,
                                           [70] * len(prices.OPEN), [30] * len(prices.OPEN),
                                           5, end_value=True)])
print max(values, key=lambda x: x[1])
periods, returns = zip(*values)
plt.plot(periods, returns)

equity_curves = pd.DataFrame(sp.equity_curve(sp.rsi(prices.OPEN, 13), prices.OPEN,
                                             [70] * len(prices.OPEN), [30] * len(prices.OPEN), 5),
                             columns=['RSI'], index=prices.index)
# normalize the original S&P to start at 100
equity_curves['Baseline'] = 100 * prices.OPEN / prices.OPEN[0]
equity_curves.plot(); remove_border()  # plt.legend(loc='upper left')
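# sp.rsi is implemented in our signal_processing module. For reference
# only, here is the textbook relative strength index for a pandas Series
# of prices -- an illustrative sketch, not the module's actual code:
def rsi_sketch(prices, period):
    delta = prices.diff()
    gains = delta.clip_lower(0.)    # upward moves
    losses = -delta.clip_upper(0.)  # magnitudes of downward moves
    rs = pd.rolling_mean(gains, period) / pd.rolling_mean(losses, period)
    return 100. - 100. / (1. + rs)  # oscillates between 0 and 100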
# close_px (a DataFrame of closing prices by ticker) comes in via the helper file
RR = close_px / close_px.shift(1) - 1
daily_index = (1 + RR).cumprod()
daily_index['MSFT'].plot()
remove_border()

# let's sample and plot monthly returns
monthly_index = daily_index.asfreq('EOM', method='ffill')
monthly_RR = monthly_index / monthly_index.shift(1) - 1
monthly_RR.plot()
remove_border()

subRR = RR.ix[:, ['AAPL', 'IBM', 'XOM', 'MSFT']]
(1 + subRR).cumprod().plot()
remove_border()

RR[-250:].hist(bins=50)
remove_border()

RR.ix[-1].plot(kind='bar')
remove_border()

# we have the following plot for Apple:
px = close_px['AAPL']
close_px['AAPL'].plot()
remove_border()

aapl_RR = px / px.shift(1) - 1
vol = pd.rolling_std(aapl_RR, 250) * np.sqrt(250)  # trailing 250-day annualized volatility
vol.plot()
remove_border()

evol = pd.ewmstd(aapl_RR, span=250) * np.sqrt(250)  # exponentially weighted version
vol.plot()
evol.plot()
plt.xlim([datetime.datetime(1991, 1, 1), datetime.datetime(2012, 1, 1)])

RR.corr()
plot_corr(RR.corr())
plot_corr(monthly_RR.corr())

# note: the function leave_one_out is included in the helper file
leave_one_out(RR)
pd.DataFrame(leave_one_out(RR))

grouped = RR.groupby(lambda x: x.year)
for year, group in grouped:
    print year
    print group[:5]

def get_beta(RR):
    # regress MSFT returns on AAPL returns (plus an intercept)
    RR = RR.dropna()
    RR['intercept'] = 1.
    model = sm.OLS(RR['MSFT'], RR.ix[:, ['AAPL', 'intercept']]).fit()
    return model.params

grouped = RR.groupby([lambda x: x.year, lambda x: x.month])
beta_by_ym = grouped.apply(get_beta)
beta_by_ym.unstack(0)['AAPL']

RR = close_px / close_px.shift(1) - 1
y = RR['AAPL']
x = RR.ix[:, ['MSFT']]
model = pd.ols(y=y, x=x)  # OLS model

# rolling least squares
model = pd.ols(y=y, x=x, window=250)
model.beta.info()
model.beta['MSFT'].plot()
remove_border()

def RB(returns, benchmark, window=250):
    betas = {}
    for col, ts in returns.iteritems():
        betas[col] = _r_b(ts, benchmark, window=window)
    return pd.DataFrame(betas)

def _r_b(returns, benchmark, window=250):
    model = pd.ols(y=returns, x={'bmk': benchmark}, window=window, min_periods=100)
    return model.beta['bmk']

def GMR(daily_returns):
    # compound daily returns, sample at month end, then re-difference
    daily_index = T(daily_returns)
    monthly_index = daily_index.asfreq('EOM', method='pad')
    return TR(monthly_index)

def T(returns):
    return (1 + returns).cumprod()

def TR(prices):
    return prices / prices.shift(1) - 1

rets = GMR(TR(close_px))
spx = rets.pop('SPX')
betas = RB(rets, spx, window=36)
betas['MSFT'].plot()
remove_border()

# benchmark is the S&P 500
px = close_px.ix[datetime.datetime(1992, 1, 1):]
RR = px / px.shift(1) - 1
annualizer = np.sqrt(250)
bmk = RR.pop('SPX')
tickers = RR.columns
weights = pd.Series(1. / len(tickers), index=tickers)  # equal-weight portfolio
weights

# plot the weighted portfolio versus the benchmark
port_returns = (RR * weights).sum(1)
plot_returns(port_returns, bmk)

calc_te(weights, RR, bmk) * annualizer  # annualized tracking error

clean_RR = RR.fillna(0)
clean_bmk = bmk.fillna(0)
opt_weights = get_opt_holdings_opt(clean_RR, clean_bmk)
calc_te(opt_weights, RR, bmk) * annualizer

def rolling_min_te_portfolio(UNIRET, TRARET, window=250, T_intervals=100):
    ports = {}
    for i, cur_date in enumerate(UNIRET.index):
        if i < T_intervals:
            continue
        if i % 100 == 0:
            print >> sys.stdout, '*',  # progress marker
        UNI = UNIRET[max(0, i - window + 1):i]
        TRA = TRARET[max(0, i - window + 1):i]
        UNI = UNI.ix[:, UNI.count() > 0]  # keep only names with data in the window
        UNI = UNI.fillna(0)
        TRA = TRA.fillna(0)
        ports[cur_date] = get_opt_holdings_opt(UNI, TRA)
    return pd.DataFrame(ports).T

ports = rolling_min_te_portfolio(RR, bmk)
ports
ports['JNJ'].plot()
remove_border()

# lag the weights one day so each day's return uses the prior day's holdings
port_return = (ports.shift(1) * RR).sum(1)
plot_returns(port_return, bmk)

def sharpe(RR, ann=250):
    return np.sqrt(ann) * RR.mean() / RR.std()

sharpe(port_return)
sharpe(bmk)
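# calc_te and get_opt_holdings_opt come from the helper file; their exact
# code is there. Conceptually, tracking error is the standard deviation of
# the active return (portfolio minus benchmark). A sketch of that idea --
# an assumption about the helper, not its actual implementation:
def calc_te_sketch(weights, returns, benchmark):
    active = (returns * weights).sum(1) - benchmark  # daily active return
    return active.std()  # multiply by sqrt(250) to annualize, as above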
# ours
import twitter3 as tw3
import performance1 as perf1
import performance2 as perf2
import sentiment1 as sent

# standard
import pandas as pd
import numpy as np
import datetime
from pattern.web import Element
import requests
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
# import ystockquote as ysq

# scrape the S&P 500 ticker list from Wikipedia (run once, then cached to CSV):
# spy = requests.get('http://en.wikipedia.org/wiki/List_of_S&P_500_companies').text
# tickers = []
# dom = Element(spy)
# for a in dom('tr td:first-child a'):
#     ticker = a.content
#     if len(ticker) < 5:
#         tickers.append(str(ticker))
# tickers.append('BRK.B')
# tickers.append('CMCSA')
# tickers.append('DISCA')
# tickers = np.sort(tickers)
# tickerlist = tickers

spx = pd.read_csv('SPXSymbolsPD.csv', delimiter=',')
tickerlist = list(spx['Ticker'])

datelist = [datetime.date(2013, 12, 5), datetime.date(2013, 12, 6),
            datetime.date(2013, 12, 7), datetime.date(2013, 12, 8),
            datetime.date(2013, 12, 9), datetime.date(2013, 12, 10)]
weekdays_datetime = [datetime.date(2013, 12, 5), datetime.date(2013, 12, 6),
                     datetime.date(2013, 12, 9), datetime.date(2013, 12, 10)]
weekdays_str = [str(day) for day in weekdays_datetime]
weekend_datetime = [datetime.date(2013, 12, 7), datetime.date(2013, 12, 8)]
weekend_str = [str(day) for day in weekend_datetime]

# loadfull = tw3.safemultisearch(tickerlist, datelist)
# loadfull.to_csv('data.csv')
data = pd.read_csv('data.csv')

# remove weekend days, as the market is closed
data = data[data['date'] != weekend_str[0]]
data = data[data['date'] != weekend_str[1]]

# data, performance = perf1.check_performance(data)
# data = data[data['performance'] > -1]  # only keep if performance is 0 or 1, not NaN
# data.to_csv('data_perf.csv')
data = pd.read_csv('data_perf.csv')
print data.irow(0)
print data.irow(0)['text']

X, Y = sent.make_xy(data, vectorizer=None)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=22)
clf = MultinomialNB()
clf.fit(X_train, Y_train)
clf.predict(X_test)
print "accuracy on testing set", 1 - float(sum(abs(Y_test - clf.predict(X_test)))) / float(len(Y_test))
print "accuracy on training set", 1 - float(sum(abs(Y_train - clf.predict(X_train)))) / float(len(Y_train))
print "fraction that are positive on the day", float(len(Y_test[Y_test == 1])) / float(len(Y_test))

# find the best values for alpha and min_df, and hence the best classifier
alphas = [0, .1, 1, 5, 10, 50, 100, 150, 200]
min_dfs = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
best_alpha = None
best_min_df = None
max_loglike = -np.inf
for alpha in alphas:
    print alpha  # the grid search takes a while; print progress
    clf = MultinomialNB(alpha=alpha)
    for min_df in min_dfs:
        vectorizer = CountVectorizer(min_df=min_df)
        X, Y = sent.make_xy(data, vectorizer)
        loglike = sent.cv_score(clf, X, Y, sent.log_likelihood)
        # print loglike
        if loglike > max_loglike:
            max_loglike = loglike
            print "max_loglike", max_loglike, "alpha", alpha, "min_df", min_df
            best_alpha = alpha
            best_min_df = min_df
print "best max_loglike", max_loglike, "best alpha", best_alpha, "best min_df", best_min_df
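# sent.cv_score and sent.log_likelihood are defined in our sentiment1
# helper. As a sketch of the standard pattern they follow (an assumption,
# not the helper's actual code): score a classifier by the total log
# probability it assigns to the true labels, averaged over CV folds.
from sklearn.cross_validation import KFold

def log_likelihood_sketch(clf, x, y):
    # sum of log P(true label | tweet) over all tweets
    prob = clf.predict_log_proba(x)
    neg = (y == 0)
    return prob[neg, 0].sum() + prob[~neg, 1].sum()

def cv_score_sketch(clf, x, y, score_func, n_folds=5):
    result = 0.
    for train, test in KFold(len(y), n_folds=n_folds):
        clf.fit(x[train], y[train])          # fit on the training fold
        result += score_func(clf, x[test], y[test])
    return result / n_folds                  # average over folds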
# rebuild the dataset with scaled (0-to-1) performance labels:
# data = pd.read_csv('data.csv')
# data = data[data['date'] != weekend_str[0]]
# data = data[data['date'] != weekend_str[1]]
# data, performance = perf2.check_performance_scaled(data)
# data = data[data['performance'] >= 0]  # only keep if performance is 0 to 1, not NaN
# data.to_csv('data_perf_scaled.csv')
data = pd.read_csv('data_perf_scaled.csv')

X, Y = sent.make_xy(data, vectorizer=None)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=22)
clf = MultinomialNB()
clf.fit(X_train, Y_train)

predvec = clf.predict(X_test)
indexpred = np.abs(predvec - .5) > 0  # drop predictions sitting exactly at .5
# map the scaled labels and predictions to hard up/down (1/0) by their sign around .5
actual = np.divide(Y_test[indexpred] - .5, np.abs(Y_test[indexpred] - .5)) * .5 + .5
predict = np.divide(predvec[indexpred] - .5, np.abs(predvec[indexpred] - .5)) * .5 + .5
print "accuracy on testing set", 1 - float(sum(abs(actual - predict))) / float(len(actual))
print "portion of stocks with motion going up in testing set", float(len(predict[predict > 0])) / float(len(predict))

predvec = clf.predict(X_train)
indexpred = np.abs(predvec - .5) > 0
actual = np.divide(Y_train[indexpred] - .5, np.abs(Y_train[indexpred] - .5)) * .5 + .5
predict = np.divide(predvec[indexpred] - .5, np.abs(predvec[indexpred] - .5)) * .5 + .5
print "accuracy on training set", 1 - float(sum(abs(actual - predict))) / float(len(actual))

# which individual words does the classifier read as most bullish or bearish?
vec = CountVectorizer(min_df=1e-3)
# vec = CountVectorizer()
text = [words for i, words in data.text.iteritems()]
vec.fit(text)
words = np.array(vec.get_feature_names())
singles = np.eye(len(words))  # one one-hot row per vocabulary word

X_df, Y_df = sent.make_xy(data, vectorizer=vec)
X_train_df, X_test_df, Y_train_df, Y_test_df = train_test_split(X_df, Y_df, test_size=0.33, random_state=22)
clf_df = MultinomialNB()
clf_df.fit(X_train_df, Y_train_df)
# clf_df = clf

negativeindices = clf_df.predict_proba(sparse.csc_matrix(singles))[:, 0].argsort()
positiveindices = clf_df.predict_proba(sparse.csc_matrix(singles))[:, 1].argsort()
badwords = words[negativeindices][::-1]   # most negative words first
badprobs = clf_df.predict_proba(singles)[negativeindices, 1][::-1]
goodwords = words[positiveindices][::-1]  # most positive words first
goodprobs = clf_df.predict_proba(singles)[positiveindices, 1][::-1]
print "negative words", [(badwords[i], badprobs[i]) for i in range(10)]
print "positive words", [(goodwords[i], goodprobs[i]) for i in range(10)]
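# usage sketch: score a fresh tweet with the trained classifier. The tweet
# text below is invented for illustration only, and the P(down)/P(up)
# reading of the two probability columns follows the usage above.
new_tweet = ["$AAPL beats expectations, shares rally"]
x_new = vec.transform(new_tweet)  # reuse the already-fitted vectorizer
print "P(down), P(up):", clf_df.predict_proba(x_new)[0]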