import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline import plotly.plotly as py from plotly.graph_objs import * import urllib.request # 1. Get a stock market dataset of your choice -- pick an index # The New York Stock Exchange (NYSE) is by far the world's largest stock exchange # by market capitalization of its listed companies at US$16.6 trillion as of February 2015 # urllib.request.urlretrieve("http://www1.nyse.com/indexes/nyahist.csv", "nyahist.csv") # Read in the downloaded NYSE index history file as is df = pd.read_csv('nyahist.csv',skiprows=1,header=0,usecols=[0,1],names=['dt','price'],parse_dates=[0]) # 2. Get historical data on a stock (or two) that is a part of that index ibm = pd.read_csv('IBM_Exxon.csv',usecols=[0,1],names=['dt','price'],header=0,parse_dates=[0]) exx = pd.read_csv('IBM_Exxon.csv',usecols=[0,2],names=['dt','price'],header=0,parse_dates=[0]) # 3. get daily,monthly and yearly absolute & relative deltas def get_deltas(df,delta='dy'): """delta can be 'dy', 'mo' or 'yr'""" if delta=='mo': df = df.groupby(df.dt.dt.to_period('M'),as_index=False).nth(0).loc[:,['dt','price']] elif delta=='yr': df = df.groupby(df.dt.dt.to_period('Y'),as_index=False).nth(0).loc[:,['dt','price']] df._metadata = {'delta':delta} df.loc[df.index[0],'absolute'] = 0 df.loc[df.index[1]:,'absolute'] = df.loc[df.index[1]:,'price'].values - df.loc[:df.index[-2],'price'].values df.absolute = df.absolute.abs() df['relative'] = df.absolute / df.price return df # get standard deviations for daily, monthly and yearly changes... def get_std(df): header = ['sample size','price std','abs std','rel std'] return pd.DataFrame([[len(df),df.price.std(),df.absolute.std(),df.relative.std()]], index=[df._metadata['delta']],columns=header) # get deltas deltas=[] for delta in ('dy','mo','yr'): deltas.append(get_deltas(df,delta=delta)) # expecting higher variance as sample size gets smaller std = pd.DataFrame() for delta in deltas: std = std.append(get_std(delta)) std # 4. Are deltas power-law distributed? # 5. Compute alphas for all of them # 6. Are the alphas similar? Where are the differences? # 7. Compute C for day over day distribution on an index def get_alphaC(df,absolute=True): """header = ['sample size','α using aggregation method','Normalization constant C for α>1']""" if absolute: alpha = 1 + len(df.absolute[df.absolute>0])/sum([np.log(x) for x in df.absolute[df.absolute>0]]) C = (alpha-1)*(min(df.absolute[df.absolute>0])**(alpha-1)) else: alpha = 1 + len(df.relative[df.relative>0])/sum([np.log(x) for x in df.relative[df.relative>0]]) C = (alpha-1)*(min(df.relative[df.relative>0])**(alpha-1)) return pd.DataFrame([[alpha,C]], index=[df._metadata['delta']], columns=['alpha','C']) print('NYSE index using absolute deltas:') alphaC = pd.DataFrame() for delta in deltas: alphaC = alphaC.append(get_alphaC(delta)) alphaC ibm_d = [] for delta in ('dy','mo','yr'): ibm_d.append(get_deltas(ibm,delta=delta)) print('IBM stock absolute deltas:') alphaC = pd.DataFrame() for delta in ibm_d: alphaC = alphaC.append(get_alphaC(delta)) alphaC exx_d = [] for delta in ('dy','mo','yr'): exx_d.append(get_deltas(exx,delta=delta)) print('Exxon stock absolute deltas:') alphaC = pd.DataFrame() for delta in exx_d: alphaC = alphaC.append(get_alphaC(delta)) alphaC # 4. Using package https://pypi.python.org/pypi/powerlaw import powerlaw fit = [] for delta in deltas: fit.append(powerlaw.Fit(delta.absolute[delta.absolute>0])) pd.DataFrame([fit[0].alpha,fit[1].alpha,fit[2].alpha],index=['dy','mo','yr'],columns=['alpha by pypi/powerlaw']) # 4.1 Plot CCDF & fitted power law fig = fit[0].plot_ccdf(color='b', linewidth=3, label='Empirical Daily') fit[1].plot_ccdf(ax=fig, color='g', linewidth=3, label='Empirical Monthly') fit[2].plot_ccdf(ax=fig, color='r', linewidth=3, label='Empirical Yearly') fit[0].power_law.plot_ccdf(ax=fig, color='b', linestyle='--', label='Power law fit dy') fit[1].power_law.plot_ccdf(ax=fig, color='g', linestyle='--', label='Power law fit mo') fit[2].power_law.plot_ccdf(ax=fig, color='r', linestyle='--', label='Power law fit yr') fig.set_ylabel(u'p(X≥x)') fig.set_xlabel("NYSE daily closing price absolute change frequency") handles, labels = fig.get_legend_handles_labels() fig.legend(handles, labels, loc=3) # 8. Compute probability of a stock market crash (losing > 30% value) # prob_down = len(df.absolute[df.absolute>0]) /len(df.absolute) alphaC = pd.DataFrame() for delta in deltas: alphaC = alphaC.append(get_alphaC(delta)) alpha = alphaC.loc['dy','alpha'] C = alphaC.loc['dy','C'] prob_crash= (C/(alpha-1))*(0.3**(-(alpha-1))) print('Probability of crash (>30% value loss):',prob_crash) # 3.1 Plot daily changes y,x = np.histogram(df.absolute, bins=len(df)/50) trace1 = Scatter(x=x,y=y,name='absolute') y2,x2 = np.histogram(df.relative, bins=len(df)/50) trace2 = Scatter(x=x2,y=y2,name='relative') data = Data([trace1,trace2]) layout = Layout( title='NYSE Daily Closing Price Change', xaxis=XAxis(type='log',autorange=True,title='Daily price change'), yaxis=YAxis(type='log',autorange=True,title='Frequency')) fig = Figure(data=data, layout=layout) py.iplot(fig, filename='NYSE-daily-absolute-changes')