from pandas import read_csv, DatetimeIndex, ols
from urllib import urlopen
def get_index(gindex, startdate=20040101):
    """
    API wrapper for Google Domestic Trends data.
    https://www.google.com/finance/domestic_trends

    Available Indices:
    'ADVERT', 'AIRTVL', 'AUTOBY', 'AUTOFI', 'AUTO', 'BIZIND', 'BNKRPT',
    'COMLND', 'COMPUT', 'CONSTR', 'CRCARD', 'DURBLE', 'EDUCAT', 'INVEST',
    'FINPLN', 'FURNTR', 'INSUR', 'JOBS', 'LUXURY', 'MOBILE', 'MTGE',
    'RLEST', 'RENTAL', 'SHOP', 'TRAVEL', 'UNEMPL'
    """
    base_url = 'http://www.google.com/finance/historical?q=GOOGLEINDEX_US:'
    full_url = '%s%s&output=csv&startdate=%s' % (base_url, gindex, startdate)
    dframe = read_csv(urlopen(full_url), index_col=0)
    dframe.index = DatetimeIndex(dframe.index)
    dframe = dframe.sort_index()
    # Drop constant columns -- they carry no information
    for col in dframe.columns:
        if len(dframe[col].unique()) == 1:
            dframe.pop(col)
    # If only 'Close' survives, rename it after the index and return it as a Series
    if len(dframe.columns) == 1 and dframe.columns[0] == 'Close':
        dframe.columns = [gindex]
    return dframe[gindex]
autobuyers = get_index('AUTOBY') # https://www.google.com/finance?q=GOOGLEINDEX_US:AUTOBY
autofinancing = get_index('AUTOFI') # https://www.google.com/finance?q=GOOGLEINDEX_US:AUTOFI
# Run OLS to test whether searches related to buying a car are predictive
# of searches for automotive financing.
model = ols(y=autofinancing, x={'Automotive Buyers': autobuyers})
print model.summary
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <Automotive Buyers> + <intercept>

Number of Observations:         3351
Number of Degrees of Freedom:   2

R-squared:         0.6429
Adj R-squared:     0.6428

Rmse:              0.0648

F-stat (1, 3349):  6028.2607, p-value:     0.0000

Degrees of Freedom: model 1, resid 3349

-----------------------Summary of Estimated Coefficients------------------------
          Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
 Automotive Buyers     0.8114     0.0105      77.64     0.0000     0.7909     0.8319
         intercept     0.2166     0.0094      23.11     0.0000     0.1982     0.2350
---------------------------------End of Summary---------------------------------
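# For reference, the same fit via statsmodels -- a sketch, assuming statsmodels
# is installed (pandas.ols was later removed in favor of it); the coefficient
# estimates should match the summary above:
import statsmodels.api as sm
X = sm.add_constant(autobuyers)
sm_model = sm.OLS(autofinancing, X, missing='drop').fit()
print sm_model.summary()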
# Plot actual Y vs. predicted Y
pred = model.predict()
pred.plot(color='b')
autofinancing.plot(color='g')
# [figure: predicted (blue) vs. actual (green) autofinancing index]
# Plot the residual
err = model.resid
err.plot(color='r')
# [figure: regression residual over time]
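# A quick formal check before modeling the residual directly: the Durbin-Watson
# statistic (a sketch, assuming statsmodels). Values near 2 indicate no
# first-order serial correlation; values near 0 indicate strong positive
# autocorrelation:
from statsmodels.stats.stattools import durbin_watson
print durbin_watson(err.values)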
# Test a simple model of 1-period autocorrelation
err_t1 = err.tshift(1, freq='D')
autocorr_model = ols(y=err, x={'err_t1': err_t1})
print autocorr_model.summary
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <err_t1> + <intercept>

Number of Observations:         3350
Number of Degrees of Freedom:   2

R-squared:         0.9389
Adj R-squared:     0.9389

Rmse:              0.0160

F-stat (1, 3348):  51439.6180, p-value:     0.0000

Degrees of Freedom: model 1, resid 3348

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
        err_t1     0.9689     0.0043     226.80     0.0000     0.9606     0.9773
     intercept     0.0000     0.0003       0.06     0.9555    -0.0005     0.0006
---------------------------------End of Summary---------------------------------
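# The lag-1 fit above only looks one step back; an autocorrelation plot shows
# the whole lag structure at once (a sketch; autocorrelation_plot ships with
# pandas in this era under pandas.tools.plotting):
from pandas.tools.plotting import autocorrelation_plot
autocorrelation_plot(err)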
# Test a model with up to 14 periods of autocorrelation
err_terms = {}
for lag in xrange(1, 15):
    err_terms['err_%s' % lag] = err.tshift(lag, freq='D')
autocorr_model = ols(y=err, x=err_terms)
print autocorr_model.summary
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <err_1> + <err_10> + <err_11> + <err_12> + <err_13> + <err_14>
             + <err_2> + <err_3> + <err_4> + <err_5> + <err_6> + <err_7>
             + <err_8> + <err_9> + <intercept>

Number of Observations:         3337
Number of Degrees of Freedom:   15

R-squared:         0.9544
Adj R-squared:     0.9542

Rmse:              0.0139

F-stat (14, 3322):  4965.7113, p-value:     0.0000

Degrees of Freedom: model 14, resid 3322

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
         err_1     1.2159     0.0173      70.32     0.0000     1.1820     1.2498
        err_10    -0.0440     0.0272      -1.61     0.1067    -0.0973     0.0094
        err_11     0.0378     0.0272       1.39     0.1656    -0.0156     0.0912
        err_12    -0.0087     0.0272      -0.32     0.7494    -0.0620     0.0447
        err_13    -0.0877     0.0271      -3.24     0.0012    -0.1409    -0.0346
--------------------------------------------------------------------------------
        err_14     0.0724     0.0172       4.20     0.0000     0.0386     0.1061
         err_2    -0.1353     0.0272      -4.97     0.0000    -0.1887    -0.0820
         err_3    -0.0437     0.0273      -1.60     0.1096    -0.0972     0.0098
         err_4    -0.0123     0.0273      -0.45     0.6536    -0.0658     0.0413
         err_5    -0.0436     0.0272      -1.60     0.1094    -0.0970     0.0098
--------------------------------------------------------------------------------
         err_6    -0.0504     0.0272      -1.85     0.0642    -0.1038     0.0030
         err_7    -0.3302     0.0262     -12.61     0.0000    -0.3815    -0.2789
         err_8     0.4320     0.0262      16.49     0.0000     0.3806     0.4833
         err_9    -0.0325     0.0272      -1.19     0.2329    -0.0859     0.0209
     intercept     0.0000     0.0002       0.03     0.9755    -0.0005     0.0005
---------------------------------End of Summary---------------------------------
# Find the lagged error terms that are significant at the 95% level
significantvals = []
for pval, term, paramval in zip(autocorr_model.p_value,
                                autocorr_model.beta.index,
                                autocorr_model.beta):
    if pval < 0.05:
        significantvals.append((term, paramval))
# Sort them by coefficient value
from operator import itemgetter
sorted_significantvals = sorted(significantvals, key=itemgetter(1))
for sv in sorted_significantvals:
    print '%s: %s' % (sv[0], sv[1])
err_7: -0.330199121438
err_2: -0.135321015861
err_13: -0.087741039288
err_14: 0.0723747974828
err_8: 0.43198713133
err_1: 1.21586269622
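# With residuals this strongly autocorrelated, the OLS standard errors in the
# first regression are too optimistic. One standard remedy is to refit with
# autocorrelation-robust (Newey-West / HAC) standard errors -- a sketch,
# assuming statsmodels; the 14-lag window mirrors the test above:
import statsmodels.api as sm
X = sm.add_constant(autobuyers)
hac_model = sm.OLS(autofinancing, X, missing='drop').fit(
    cov_type='HAC', cov_kwds={'maxlags': 14})
print hac_model.summary()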