%%html
<link rel="stylesheet" href="static/hyrule.css" type="text/css">
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (20, 10)
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['hist', 'figure', 'show'] `%matplotlib` prevents importing * from pylab and numpy
# Pandas URL method
# target URL: http://www.quandl.com/api/v1/datasets/YAHOO/INDEX_GSPC.csv?trim_start=2007-01-01&trim_end=2015-02-02
# Build the Quandl request URL. Use https — the other Quandl request in this
# notebook already does, and plain http is redirected/rejected by the API.
url = "https://www.quandl.com/api/v1/datasets/YAHOO/"
url += "INDEX_GSPC"
url += ".csv?trim_start=%s-%s-%s&trim_end=%s-%s-%s" % ('2007', '01', '01', '2015', '02', '02')
sp500 = pd.read_csv(url, parse_dates = ['Date'], dayfirst = True, index_col = 'Date')
# parse_dates: parse the "Date" column as datetime objects instead of strings
# index_col: use the "Date" column as the index instead of the default RangeIndex
# dayfirst: interpret ambiguous dates as DD/MM rather than the default MM/DD
sp500.head()
Open | High | Low | Close | Volume | Adjusted Close | |
---|---|---|---|---|---|---|
Date | ||||||
2015-02-02 | 1996.67 | 2021.66 | 1980.90 | 2020.85 | 4008330000 | 2020.85 |
2015-01-30 | 2019.35 | 2023.32 | 1993.38 | 1994.99 | 4538650000 | 1994.99 |
2015-01-29 | 2002.45 | 2024.64 | 1989.18 | 2021.25 | 4127140000 | 2021.25 |
2015-01-28 | 2032.34 | 2042.49 | 2001.49 | 2002.16 | 4067530000 | 2002.16 |
2015-01-27 | 2047.86 | 2047.86 | 2019.91 | 2029.55 | 3329810000 | 2029.55 |
import pandas.io.data as web
# NOTE(review): pandas.io.data was deprecated in pandas 0.17 and later removed;
# the same DataReader API now lives in the separate pandas_datareader package.
# Confirm the pandas version pinned for this notebook before upgrading.
# Fetch daily S&P 500 (^GSPC) data from Yahoo Finance for 2007-01-01..2015-02-02;
# the tail() output below shows the columns: Open/High/Low/Close/Volume/Adj Close,
# indexed by Date.
spc = web.DataReader(name = "^GSPC", data_source = 'yahoo', start = '2007-1-1', end = '2015-2-2')
#spc.to_csv("SP500.csv") # Function to save a data frame for offline analysis
spc.tail()
Open | High | Low | Close | Volume | Adj Close | |
---|---|---|---|---|---|---|
Date | ||||||
2015-01-27 | 2047.86 | 2047.86 | 2019.91 | 2029.55 | 3329810000 | 2029.55 |
2015-01-28 | 2032.34 | 2042.49 | 2001.49 | 2002.16 | 4067530000 | 2002.16 |
2015-01-29 | 2002.45 | 2024.64 | 1989.18 | 2021.25 | 4127140000 | 2021.25 |
2015-01-30 | 2019.35 | 2023.32 | 1993.38 | 1994.99 | 4538650000 | 1994.99 |
2015-02-02 | 1996.67 | 2021.66 | 1980.90 | 2020.85 | 4008330000 | 2020.85 |
# Simulate a 1,000-step Gaussian random walk: walk[0] = 0 and
# walk[i] = walk[i-1] + z[i] for i >= 1. Note z[0] is deliberately unused,
# matching the original loop that started at i = 1.
N = 1000
z = np.random.randn(N)
# Vectorized cumulative sum replaces the Python-level accumulation loop:
# walk[i] = z[1] + ... + z[i], with a leading 0 for the starting position.
walk = np.concatenate(([0.0], np.cumsum(z[1:])))
plt.plot(walk)
[<matplotlib.lines.Line2D at 0x1038b67d0>]
# Short- and long-horizon moving averages of the S&P 500 opening price,
# plotted against the raw Open/Close series. min_periods=30 lets the
# averages start before a full window of data is available.
for column, window in (('MA42', 42), ('MA252', 252)):
    spc[column] = pd.rolling_mean(spc['Open'], window = window, min_periods = 30)
spc[['Open', 'MA252', 'MA42','Close']].plot()
plt.legend(["Actual", "Long term", "Short term","Close"])
plt.title('Trends in the SP500 data', fontsize = 20)
<matplotlib.text.Text at 0x1096ab050>
# AAPL daily prices from the Quandl WIKI dataset, 2007-01-01 .. 2015-02-02.
start = ('2007', '01', '01')
end = ('2015', '02', '02')
url = ("https://www.quandl.com/api/v1/datasets/WIKI/" + "AAPL"
       + ".csv?trim_start=%s-%s-%s&trim_end=%s-%s-%s" % (start + end))
# Parse "Date" as DD/MM datetimes and use it as the index.
AAPL = pd.read_csv(url, parse_dates = ['Date'], dayfirst = True, index_col = 'Date')
AAPL.head()
Open | High | Low | Close | Volume | Ex-Dividend | Split Ratio | Adj. Open | Adj. High | Adj. Low | Adj. Close | Adj. Volume | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | ||||||||||||
2015-02-02 | 118.03 | 119.1700 | 116.08 | 118.63 | 62347947 | 0 | 1 | 118.03 | 119.1700 | 116.08 | 118.63 | 62347947 |
2015-01-30 | 118.28 | 120.0000 | 116.85 | 117.16 | 83532038 | 0 | 1 | 118.28 | 120.0000 | 116.85 | 117.16 | 83532038 |
2015-01-29 | 116.36 | 119.1900 | 115.56 | 118.90 | 83985866 | 0 | 1 | 116.36 | 119.1900 | 115.56 | 118.90 | 83985866 |
2015-01-28 | 117.56 | 118.1200 | 115.31 | 115.31 | 146420662 | 0 | 1 | 117.56 | 118.1200 | 115.31 | 115.31 | 146420662 |
2015-01-26 | 113.74 | 114.3626 | 112.80 | 113.10 | 55457420 | 0 | 1 | 113.74 | 114.3626 | 112.80 | 113.10 | 55457420 |
# Google Finance historical CSV for the GOOGLEINDEX_US:RENTAL series,
# then a 30-day rolling mean of the close (NaN until 30 observations exist).
base = "https://www.google.com/finance/historical"
params = "?q=GOOGLEINDEX_US:RENTAL&output=csv&startdate=20080101&ei=_znYVJjfMZOHsgeHv4G4Cg"
url = base + params
r = pd.read_csv(url, parse_dates = [0], index_col = 0)
r['MA30'] = pd.rolling_mean(r['Close'], window = 30, min_periods = 30)
r[['Close', 'MA30']].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x108af4ed0>
import datetime as dt
# Adjusted closing prices for six tickers over 2010-01-01 .. 2013-01-01.
# Selecting ['Adj Close'] yields a single DataFrame with one column per
# ticker (the head() output below shows AAPL/GE/IBM/KO/MSFT/PEP columns,
# indexed by Date).
df = web.get_data_yahoo(
['AAPL', 'GE', 'IBM', 'KO', 'MSFT', 'PEP'],
start = dt.datetime(2010, 1, 1),
end = dt.datetime(2013, 1, 1))['Adj Close']
df.head()
AAPL | GE | IBM | KO | MSFT | PEP | |
---|---|---|---|---|---|---|
Date | ||||||
2010-01-04 | 28.84 | 13.10 | 119.53 | 24.67 | 27.14 | 52.81 |
2010-01-05 | 28.89 | 13.16 | 118.09 | 24.37 | 27.14 | 53.45 |
2010-01-06 | 28.43 | 13.10 | 117.32 | 24.36 | 26.98 | 52.92 |
2010-01-07 | 28.38 | 13.77 | 116.92 | 24.30 | 26.70 | 52.58 |
2010-01-08 | 28.56 | 14.07 | 118.09 | 23.85 | 26.88 | 52.41 |
# Heat map of the pairwise correlations between each ticker's daily returns.
daily_returns = df.pct_change()   # percent change over a 1-day period
corr = daily_returns.corr()       # symmetric ticker-by-ticker correlation matrix
ticks = range(len(corr))
plt.imshow(corr, cmap = 'Reds', interpolation = 'none')
plt.colorbar()
plt.xticks(ticks, corr.columns)
plt.yticks(ticks, corr.columns)
plt.title("Correlation Matrix: Tech Stocks")
<matplotlib.text.Text at 0x10e52ac50>
from math import pi
from bokeh.plotting import *
# Candlestick chart of the first 50 rows of the sp500 frame.
# NOTE(review): this uses the old Bokeh procedural/global-state API
# (hold(), curplot(), module-level figure/rect/segment), which modern Bokeh
# removed — confirm the pinned Bokeh version before running.
df = pd.DataFrame(sp500)[:50]
mids = (df.Open + df.Close) / 2     # vertical centre of each candle body
spans = abs(df.Close - df.Open)     # candle body height
inc = df.Close > df.Open            # up days: close above open
dec = df.Open > df.Close            # down days: open above close
w = 12 * 60 * 60 * 1000 # sample 12 hours in ms
output_notebook() # Load the Bokeh Java-Script interactive browser
figure(x_axis_type="datetime", plot_width=1000, name="candlestick",
tools="pan,wheel_zoom,box_zoom,reset,previewsave")
hold()
# High-low wick first, then the body rectangles (light for up, red for down).
segment(df.index, df.High, df.index, df.Low, color='black')
rect(df.index[inc], mids[inc], w, spans[inc], fill_color="#D5E1DD", line_color="black")
rect(df.index[dec], mids[dec], w, spans[dec], fill_color="#F2583E", line_color="black")
curplot().title = "SP500 Candlestick"
xaxis().major_label_orientation = pi/4
grid().grid_line_alpha = 0.3
show()
import statsmodels.api as sm
import statsmodels.graphics.tsaplots as tsa
# Correlograms of the simulated random walk: autocorrelation (top) and
# partial autocorrelation (bottom), each out to 42 lags.
fig = plt.figure()
ax1 = fig.add_subplot(211)
fig = tsa.plot_acf(walk, lags = 42, ax = ax1)
ax2 = fig.add_subplot(212)
fig = tsa.plot_pacf(walk, lags = 42, ax = ax2)
# Fit an ARMA(2, 2) model to the walk and report the fit diagnostics.
arma_mod22 = sm.tsa.ARMA(walk, (2, 2)).fit()
print(arma_mod22.summary())
# Parenthesised call with %-formatting works on both Python 2 and 3; the
# original Python-2 print statement is a SyntaxError on Python 3.
print("Durbin Watson score: %s" % sm.stats.durbin_watson(arma_mod22.resid))
ARMA Model Results ============================================================================== Dep. Variable: y No. Observations: 1000 Model: ARMA(2, 2) Log Likelihood -1422.275 Method: css-mle S.D. of innovations 1.001 Date: Wed, 11 Feb 2015 AIC 2856.551 Time: 20:31:19 BIC 2885.997 Sample: 0 HQIC 2867.743 ============================================================================== coef std err z P>|z| [95.0% Conf. Int.] ------------------------------------------------------------------------------ const 5.9467 4.081 1.457 0.145 -2.052 13.945 ar.L1.y 1.9103 0.016 119.067 0.000 1.879 1.942 ar.L2.y -0.9107 0.016 -57.126 0.000 -0.942 -0.879 ma.L1.y -0.9337 0.035 -26.842 0.000 -1.002 -0.866 ma.L2.y -0.0063 0.031 -0.202 0.840 -0.067 0.055 Roots ============================================================================= Real Imaginary Modulus Frequency ----------------------------------------------------------------------------- AR.1 1.0049 +0.0000j 1.0049 0.0000 AR.2 1.0927 +0.0000j 1.0927 0.0000 MA.1 1.0633 +0.0000j 1.0633 0.0000 MA.2 -149.2299 +0.0000j 149.2299 0.5000 ----------------------------------------------------------------------------- Durbin Watson score: 1.96427270972
# CPI Data set: Explore this data set in class
# Plot and Compute the various statistics
# Explain the Correlograms
# Load the statsmodels macroeconomic sample data and index it by quarter
# (1959Q1 through 2009Q3) so the inflation series plots on a time axis.
macrodta = sm.datasets.macrodata.load_pandas().data
macrodta.index = pd.Index(sm.tsa.datetools.dates_from_range('1959Q1', '2009Q3'))
# Parenthesised print works on both Python 2 and 3 (the original
# `print macrodta.head()` statement is a SyntaxError on Python 3).
print(macrodta.head())
cpi = macrodta["infl"]
macrodta  # bare expression: echoes the frame in a notebook cell, no other effect
fig = plt.figure()
ax = fig.add_subplot(111)
ax = cpi.plot(ax = ax)
ax.legend()
year quarter realgdp realcons realinv realgovt realdpi \ 1959-03-31 1959 1 2710.349 1707.4 286.898 470.045 1886.9 1959-06-30 1959 2 2778.801 1733.7 310.859 481.301 1919.7 1959-09-30 1959 3 2775.488 1751.8 289.226 491.260 1916.4 1959-12-31 1959 4 2785.204 1753.7 299.356 484.052 1931.3 1960-03-31 1960 1 2847.699 1770.5 331.722 462.199 1955.5 cpi m1 tbilrate unemp pop infl realint 1959-03-31 28.98 139.7 2.82 5.8 177.146 0.00 0.00 1959-06-30 29.15 141.7 3.08 5.1 177.830 2.34 0.74 1959-09-30 29.35 140.5 3.82 5.3 178.657 2.74 1.09 1959-12-31 29.37 140.0 4.33 5.6 179.386 0.27 4.06 1960-03-31 29.54 139.6 3.50 5.2 180.007 2.31 1.19
<matplotlib.legend.Legend at 0x1119448d0>
# Correlograms for the inflation (CPI) series: autocorrelation on the top
# axes, partial autocorrelation below, each out to 42 lags.
fig = plt.figure()
acf_axes = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(cpi, lags=42, ax=acf_axes)
pacf_axes = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(cpi, lags=42, ax=pacf_axes)