In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from StringIO import StringIO 
import requests

print 'Numpy:', np.__version__
print 'Pandas:', pd.__version__
import statsmodels
print 'Statsmodels:', statsmodels.__version__

np.set_printoptions(precision=2)
Numpy: 1.8.0
Pandas: 0.12.0-1047-g2d2e8b5
Statsmodels: 0.6.0.dev-Unknown
In [2]:
act = requests.get('https://docs.google.com/spreadsheet/ccc?key=0Ak_wF7ZGeMmHdFZtQjI1a1hhUWR2UExCa2E4MFhiWWc&output=csv&gid=1')
data = pd.read_csv(StringIO(act.content),index_col=0,parse_dates=['date'], thousands=',').sort_index() #converts to numbers
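A quick sanity check (not part of the original answer) can confirm the download succeeded and that the thousands separator was parsed away; the commented-out fallback filename is hypothetical:
In [ ]:
act.raise_for_status()                     # raise if the Google Sheets request failed
print data['Unit Sales'].dtype             # should be numeric, thanks to thousands=','
# Offline fallback (hypothetical local copy of the same CSV export):
# data = pd.read_csv('unit_sales.csv', index_col=0, parse_dates=['date'],
#                    thousands=',').sort_index()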
In [3]:
# In your example, you weren't able to predict because your DataFrame was not sorted
# by date: if you look at your model summary table, you'll see the sample runs
# backwards, from 08-01-2013 to 05-01-2000. The sort_index() call above fixes that.
data['Unit Sales'].plot()
Out[3]:
<matplotlib.axes.AxesSubplot at 0x5623250>
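Since the unsorted index was the root of the original problem, a one-line check (a sketch added here, using the index's is_monotonic property) verifies that the dates now run forward:
In [ ]:
print data.index.is_monotonic              # True once sort_index() has been applied
print data.index[0], '->', data.index[-1]  # should run from 2000 to 2013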
In [4]:
# Augmented Dickey-Fuller test for unit roots
# Null Hypothesis: Unit Root
# Alternative Hypothesis: AR(p), where here p is selected as the lag <= 12 that minimizes AIC
print 'Augmented Dickey-Fuller Test (p <= 12)'
print sm.tsa.adfuller(data['Unit Sales'], 12)[1], '=> we cannot reject unit root in data'
print sm.tsa.adfuller(data['Unit Sales'].diff()[1:], 11)[1], '=> we can reject unit root in differenced data at 1%'
Augmented Dickey-Fuller Test (p <= 12)
0.453907629823 => we cannot reject unit root in data
0.00470416125858 => we can reject unit root in differenced data at 1%
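adfuller returns more than just the p-value; unpacking the full result (a sketch assuming the default autolag='AIC') shows the test statistic, the lag actually chosen, and the critical values behind the "at 1%" statement above:
In [ ]:
adf_stat, pvalue, usedlag, nobs, crit_values, icbest = sm.tsa.adfuller(
    data['Unit Sales'].diff()[1:], 11)
print 'ADF statistic:  ', adf_stat
print 'p-value:        ', pvalue
print 'lags used:      ', usedlag
print 'critical values:', crit_values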
In [5]:
# So it looks like an appropriate model might be ARIMA(12,1,0)
p=12
d=1
q=0
mod = sm.tsa.ARIMA(data['Unit Sales'], order=[p,d,q], freq='M') # mod is an object of class ARIMA
res = mod.fit() # res is an object of class ARIMAResults
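The p=12 here matches the monthly seasonality; if you want to double-check that choice, here is a rough sketch of comparing a few candidate orders by AIC (the candidate list is illustrative only, not part of the original analysis):
In [ ]:
for cand_p in (1, 6, 12):                       # illustrative candidates only
    try:
        cand = sm.tsa.ARIMA(data['Unit Sales'], order=(cand_p, 1, 0), freq='M').fit()
        print 'ARIMA(%2d,1,0) AIC: %.1f' % (cand_p, cand.aic)
    except Exception as err:                    # some orders may fail to converge
        print 'ARIMA(%2d,1,0) failed: %s' % (cand_p, err)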
In [6]:
print res.summary() # sometimes IPython's display() isn't as nice as plain print

# The Roots table gives you information about the stationarity of the estimated ARIMA
# model (i.e. whether you chose a high enough `d`). All of the AR roots are greater
# than 1 in modulus, so our model is stationary. We could have guessed this from the
# ADF tests above: the differenced series rejects a unit root, so the original series
# has a single unit root and d=1 is enough.
                             ARIMA Model Results                              
==============================================================================
Dep. Variable:           D.Unit Sales   No. Observations:                  160
Model:                ARIMA(12, 1, 0)   Log Likelihood               -1150.617
Method:                       css-mle   S.D. of innovations            314.595
Date:                Wed, 20 Nov 2013   AIC                           2329.233
Time:                        07:29:34   BIC                           2372.285
Sample:                    06-01-2000   HQIC                          2346.715
                         - 09-01-2013                                         
=======================================================================================
                          coef    std err          z      P>|z|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------
const                   7.6884     12.774      0.602      0.548       -17.348    32.725
ar.L1.D.Unit Sales     -0.2200      0.068     -3.228      0.002        -0.354    -0.086
ar.L2.D.Unit Sales     -0.1089      0.070     -1.550      0.123        -0.247     0.029
ar.L3.D.Unit Sales     -0.0310      0.070     -0.440      0.660        -0.169     0.107
ar.L4.D.Unit Sales     -0.1597      0.070     -2.270      0.025        -0.298    -0.022
ar.L5.D.Unit Sales     -0.1137      0.070     -1.635      0.104        -0.250     0.023
ar.L6.D.Unit Sales     -0.2565      0.066     -3.861      0.000        -0.387    -0.126
ar.L7.D.Unit Sales     -0.2117      0.068     -3.114      0.002        -0.345    -0.078
ar.L8.D.Unit Sales     -0.2040      0.069     -2.942      0.004        -0.340    -0.068
ar.L9.D.Unit Sales     -0.0047      0.070     -0.067      0.947        -0.143     0.133
ar.L10.D.Unit Sales    -0.0956      0.071     -1.355      0.177        -0.234     0.043
ar.L11.D.Unit Sales    -0.0327      0.070     -0.465      0.642        -0.170     0.105
ar.L12.D.Unit Sales     0.4781      0.068      7.082      0.000         0.346     0.610
                                    Roots                                     
==============================================================================
                  Real           Imaginary           Modulus         Frequency
------------------------------------------------------------------------------
AR.1            -1.1018           -0.0000j            1.1018           -0.5000
AR.2            -0.9003           -0.5025j            1.0311           -0.4190
AR.3            -0.9003           +0.5025j            1.0311            0.4190
AR.4            -0.5342           -0.8814j            1.0307           -0.3367
AR.5            -0.5342           +0.8814j            1.0307            0.3367
AR.6             0.0008           -1.0515j            1.0515           -0.2499
AR.7             0.0008           +1.0515j            1.0515            0.2499
AR.8             0.5202           -0.9593j            1.0913           -0.1709
AR.9             0.5202           +0.9593j            1.0913            0.1709
AR.10            0.8766           -0.5080j            1.0131           -0.0836
AR.11            0.8766           +0.5080j            1.0131            0.0836
AR.12            1.2439           -0.0000j            1.2439           -0.0000
------------------------------------------------------------------------------
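The same stationarity check can be done programmatically from the results object; a short sketch using the fitted AR roots (the arroots attribute of statsmodels' ARMA/ARIMA results):
In [ ]:
print np.abs(res.arroots)                  # moduli of the AR roots
print (np.abs(res.arroots) > 1).all()      # True => AR part is stationary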
In [7]:
predict = res.predict('2012-1-01','2015-01-01', dynamic=True) # dynamic=True: in-sample forecasts feed back in as lags after the start date

fig = plt.figure(figsize=(10,3))
ax = fig.add_subplot(111)
#ax.plot(data.index, data['Unit Sales'])
ax.plot(data.index, data['Unit Sales']-data['Unit Sales'].mean())
ax.plot(predict.index, predict, 'r-');
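Note that with d=1, res.predict defaults to typ='linear', i.e. predictions on the differenced scale; to plot forecasts directly against the raw series, a sketch using typ='levels':
In [ ]:
predict_levels = res.predict('2012-01-01', '2015-01-01', dynamic=True, typ='levels')

fig = plt.figure(figsize=(10, 3))
ax = fig.add_subplot(111)
ax.plot(data.index, data['Unit Sales'])
ax.plot(predict_levels.index, predict_levels, 'r-');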