from pandas import read_csv, DatetimeIndex, ols
from urllib import urlopen
def get_index(gindex, startdate=20040101):
    """
    API wrapper for Google Domestic Trends data.
    https://www.google.com/finance/domestic_trends

    Available Indices:
    'ADVERT', 'AIRTVL', 'AUTOBY', 'AUTOFI', 'AUTO', 'BIZIND', 'BNKRPT',
    'COMLND', 'COMPUT', 'CONSTR', 'CRCARD', 'DURBLE', 'EDUCAT', 'INVEST',
    'FINPLN', 'FURNTR', 'INSUR', 'JOBS', 'LUXURY', 'MOBILE', 'MTGE',
    'RLEST', 'RENTAL', 'SHOP', 'TRAVEL', 'UNEMPL'
    """
    base_url = 'http://www.google.com/finance/historical?q=GOOGLEINDEX_US:'
    full_url = '%s%s&output=csv&startdate=%s' % (base_url, gindex, startdate)
    dframe = read_csv(urlopen(full_url), index_col=0)
    dframe.index = DatetimeIndex(dframe.index)
    dframe = dframe.sort_index()
    # Drop constant columns -- they carry no information
    for col in dframe.columns:
        if len(dframe[col].unique()) == 1:
            dframe.pop(col)
    # If only 'Close' survives, rename it after the index and return it as a Series
    if len(dframe.columns) == 1 and dframe.columns[0] == 'Close':
        dframe.columns = [gindex]
    return dframe[gindex]
autobuyers = get_index('AUTOBY') # https://www.google.com/finance?q=GOOGLEINDEX_US:AUTOBY
autofinancing = get_index('AUTOFI') # https://www.google.com/finance?q=GOOGLEINDEX_US:AUTOFI
# Run OLS to test whether searches related to buying a car are predictive
# of searches for automotive financing.
model = ols(y=autofinancing, x={'Automotive Buyers': autobuyers})
print model.summary
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <Automotive Buyers> + <intercept>

Number of Observations:         3351
Number of Degrees of Freedom:   2

R-squared:         0.6429
Adj R-squared:     0.6428

Rmse:              0.0648

F-stat (1, 3349):  6028.2607, p-value:     0.0000

Degrees of Freedom: model 1, resid 3349

-----------------------Summary of Estimated Coefficients------------------------
          Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
 Automotive Buyers     0.8114     0.0105      77.64     0.0000     0.7909     0.8319
         intercept     0.2166     0.0094      23.11     0.0000     0.1982     0.2350
---------------------------------End of Summary---------------------------------
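# For reference, the same fit via statsmodels -- a sketch, assuming statsmodels
# is installed (pandas.ols was later removed in favor of it); the coefficient
# estimates should match the summary above:
import statsmodels.api as sm
X = sm.add_constant(autobuyers)
sm_model = sm.OLS(autofinancing, X, missing='drop').fit()
print sm_model.summary()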
# Plot actual Y vs. predicted Y
pred = model.predict()
pred.plot(color='b')
autofinancing.plot(color='g')
# [figure: predicted (blue) vs. actual (green) autofinancing index]
# Plot the residual
err = model.resid
err.plot(color='r')
# [figure: regression residual over time]
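# A quick formal check before modeling the residual directly: the Durbin-Watson
# statistic (a sketch, assuming statsmodels). Values near 2 indicate no
# first-order serial correlation; values near 0 indicate strong positive
# autocorrelation:
from statsmodels.stats.stattools import durbin_watson
print durbin_watson(err.values)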
# Test a simple model of 1-period autocorrelation
err_t1 = err.tshift(1, freq='D')
autocorr_model = ols(y=err, x={'err_t1': err_t1})
print autocorr_model.summary
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <err_t1> + <intercept>

Number of Observations:         3350
Number of Degrees of Freedom:   2

R-squared:         0.9389
Adj R-squared:     0.9389

Rmse:              0.0160

F-stat (1, 3348):  51439.6180, p-value:     0.0000

Degrees of Freedom: model 1, resid 3348

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
        err_t1     0.9689     0.0043     226.80     0.0000     0.9606     0.9773
     intercept     0.0000     0.0003       0.06     0.9555    -0.0005     0.0006
---------------------------------End of Summary---------------------------------
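# The lag-1 fit above only looks one step back; an autocorrelation plot shows
# the whole lag structure at once (a sketch; autocorrelation_plot ships with
# pandas in this era under pandas.tools.plotting):
from pandas.tools.plotting import autocorrelation_plot
autocorrelation_plot(err)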
# Test a model with up to 14 periods of autocorrelation
err_terms = {}
for lag in xrange(1, 15):
    err_terms['err_%s' % lag] = err.tshift(lag, freq='D')
autocorr_model = ols(y=err, x=err_terms)
print autocorr_model.summary
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <err_1> + <err_10> + <err_11> + <err_12> + <err_13> + <err_14>
             + <err_2> + <err_3> + <err_4> + <err_5> + <err_6> + <err_7>
             + <err_8> + <err_9> + <intercept>

Number of Observations:         3337
Number of Degrees of Freedom:   15

R-squared:         0.9544
Adj R-squared:     0.9542

Rmse:              0.0139

F-stat (14, 3322):  4965.7113, p-value:     0.0000

Degrees of Freedom: model 14, resid 3322

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
         err_1     1.2159     0.0173      70.32     0.0000     1.1820     1.2498
        err_10    -0.0440     0.0272      -1.61     0.1067    -0.0973     0.0094
        err_11     0.0378     0.0272       1.39     0.1656    -0.0156     0.0912
        err_12    -0.0087     0.0272      -0.32     0.7494    -0.0620     0.0447
        err_13    -0.0877     0.0271      -3.24     0.0012    -0.1409    -0.0346
--------------------------------------------------------------------------------
        err_14     0.0724     0.0172       4.20     0.0000     0.0386     0.1061
         err_2    -0.1353     0.0272      -4.97     0.0000    -0.1887    -0.0820
         err_3    -0.0437     0.0273      -1.60     0.1096    -0.0972     0.0098
         err_4    -0.0123     0.0273      -0.45     0.6536    -0.0658     0.0413
         err_5    -0.0436     0.0272      -1.60     0.1094    -0.0970     0.0098
--------------------------------------------------------------------------------
         err_6    -0.0504     0.0272      -1.85     0.0642    -0.1038     0.0030
         err_7    -0.3302     0.0262     -12.61     0.0000    -0.3815    -0.2789
         err_8     0.4320     0.0262      16.49     0.0000     0.3806     0.4833
         err_9    -0.0325     0.0272      -1.19     0.2329    -0.0859     0.0209
     intercept     0.0000     0.0002       0.03     0.9755    -0.0005     0.0005
---------------------------------End of Summary---------------------------------
# Find the lagged error terms that are significant at the 95% level
significantvals = []
for pval, term, paramval in zip(autocorr_model.p_value,
                                autocorr_model.beta.index,
                                autocorr_model.beta):
    if pval < 0.05:
        significantvals.append((term, paramval))
# Sort them by coefficient value
from operator import itemgetter
sorted_significantvals = sorted(significantvals, key=itemgetter(1))
for sv in sorted_significantvals:
    print '%s: %s' % (sv[0], sv[1])
err_7: -0.330199121438
err_2: -0.135321015861
err_13: -0.087741039288
err_14: 0.0723747974828
err_8: 0.43198713133
err_1: 1.21586269622
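# With residuals this strongly autocorrelated, the OLS standard errors in the
# first regression are too optimistic. One standard remedy is to refit with
# autocorrelation-robust (Newey-West / HAC) standard errors -- a sketch,
# assuming statsmodels; the 14-lag window mirrors the test above:
import statsmodels.api as sm
X = sm.add_constant(autobuyers)
hac_model = sm.OLS(autofinancing, X, missing='drop').fit(
    cov_type='HAC', cov_kwds={'maxlags': 14})
print hac_model.summary()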