import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from pandas import *
import statsmodels.api as sm
from statsmodels.tsa import *
tsa = sm.tsa
# NOTE(review): pandas.tools.plotting was removed in pandas 0.25; the
# plotting helpers now live in pandas.plotting.
from pandas.plotting import autocorrelation_plot
# Daily app-download counts (one row per day, iOS and Android columns).
path = "../data/raw/appdownloads.csv"
df = pd.read_csv(path)
df
<class 'pandas.core.frame.DataFrame'> Int64Index: 120 entries, 0 to 119 Data columns: Date 120 non-null values iOS downloads 120 non-null values android downloads 120 non-null values dtypes: object(3)
df.head(30)
Date | iOS downloads | android downloads | |
---|---|---|---|
0 | 11/1/2012 | 10,763 | 6,254 |
1 | 11/2/2012 | 13,444 | 7,276 |
2 | 11/3/2012 | 12,775 | 8,109 |
3 | 11/4/2012 | 12,912 | 7,878 |
4 | 11/5/2012 | 12,168 | 6,563 |
5 | 11/6/2012 | 12,668 | 6,804 |
6 | 11/7/2012 | 10,932 | 6,134 |
7 | 11/8/2012 | 12,686 | 8,300 |
8 | 11/9/2012 | 12,533 | 8,053 |
9 | 11/10/2012 | 13,012 | 8,222 |
10 | 11/11/2012 | 13,148 | 8,200 |
11 | 11/12/2012 | 12,568 | 8,175 |
12 | 11/13/2012 | 12,538 | 8,512 |
13 | 11/14/2012 | 12,028 | 7,973 |
14 | 11/15/2012 | 11,748 | 8,490 |
15 | 11/16/2012 | 12,580 | 9,046 |
16 | 11/17/2012 | 13,312 | 8,354 |
17 | 11/18/2012 | 12,967 | 8,956 |
18 | 11/19/2012 | 12,219 | 7,302 |
19 | 11/20/2012 | 13,003 | 8,249 |
20 | 11/21/2012 | 13,030 | 8,528 |
21 | 11/22/2012 | 12,557 | 8,119 |
22 | 11/23/2012 | 12,052 | 9,330 |
23 | 11/24/2012 | 12,221 | 9,145 |
24 | 11/25/2012 | 12,769 | 8,344 |
25 | 11/26/2012 | 11,661 | 8,285 |
26 | 11/27/2012 | 12,508 | 9,196 |
27 | 11/28/2012 | 12,904 | 9,069 |
28 | 11/29/2012 | 12,357 | 9,141 |
29 | 11/30/2012 | 12,553 | 8,994 |
# Re-read with the Date column parsed as datetimes and used as the index.
# NOTE(review): the dict form parse_dates={'date': ['Date']} was deprecated
# and then removed in pandas 2.x; parse the column directly and rename the
# index afterwards -- the resulting frame is identical.
df = pd.read_csv(path, parse_dates=['Date'], index_col='Date')
df.index.name = 'date'
df.head()
iOS downloads | android downloads | |
---|---|---|
date | ||
2012-11-01 | 10,763 | 6,254 |
2012-11-02 | 13,444 | 7,276 |
2012-11-03 | 12,775 | 8,109 |
2012-11-04 | 12,912 | 7,878 |
2012-11-05 | 12,168 | 6,563 |
# .ix was removed in pandas 1.0; label-based selection is .loc.
df.loc[:, 'iOS downloads']
date 2012-11-01 10,763 2012-11-02 13,444 2012-11-03 12,775 2012-11-04 12,912 2012-11-05 12,168 2012-11-06 12,668 2012-11-07 10,932 2012-11-08 12,686 2012-11-09 12,533 2012-11-10 13,012 2012-11-11 13,148 2012-11-12 12,568 2012-11-13 12,538 2012-11-14 12,028 2012-11-15 11,748 ... 2013-02-14 20,534 2013-02-15 24,296 2013-02-16 27,225 2013-02-17 27,377 2013-02-18 24,408 2013-02-19 24,783 2013-02-20 24,052 2013-02-21 24,823 2013-02-22 24,889 2013-02-23 26,345 2013-02-24 26,304 2013-02-25 23,113 2013-02-26 21,788 2013-02-27 22,733 2013-02-28 22,665 Name: iOS downloads, Length: 120
# .ix was removed in pandas 1.0; label-based selection is .loc.
df.loc[:, 'android downloads']
date 2012-11-01 6,254 2012-11-02 7,276 2012-11-03 8,109 2012-11-04 7,878 2012-11-05 6,563 2012-11-06 6,804 2012-11-07 6,134 2012-11-08 8,300 2012-11-09 8,053 2012-11-10 8,222 2012-11-11 8,200 2012-11-12 8,175 2012-11-13 8,512 2012-11-14 7,973 2012-11-15 8,490 ... 2013-02-14 12,421 2013-02-15 13,205 2013-02-16 14,787 2013-02-17 14,763 2013-02-18 13,562 2013-02-19 13,832 2013-02-20 13,823 2013-02-21 14,690 2013-02-22 15,015 2013-02-23 15,691 2013-02-24 15,845 2013-02-25 16,361 2013-02-26 14,243 2013-02-27 14,659 2013-02-28 15,080 Name: android downloads, Length: 120
# The columns are still strings (comma-separated thousands), so describe()
# reports count/unique/top/freq rather than numeric summary statistics --
# see the unique == 120 in the output below.
df.describe()
iOS downloads | android downloads | |
---|---|---|
count | 120 | 120 |
unique | 120 | 120 |
top | 18,286 | 7,973 |
freq | 1 | 1 |
# Shorten the column labels for convenience.
df = df.rename(columns={'iOS downloads': 'ios', 'android downloads': 'android'})
df.head()
ios | android | |
---|---|---|
date | ||
2012-11-01 | 10,763 | 6,254 |
2012-11-02 | 13,444 | 7,276 |
2012-11-03 | 12,775 | 8,109 |
2012-11-04 | 12,912 | 7,878 |
2012-11-05 | 12,168 | 6,563 |
# .ix was removed in pandas 1.0, and .ix[5, 1] on a DatetimeIndex resolved
# positionally anyway -- be explicit with .iloc.
type(df.iloc[5, 1])
str
# Strip the thousands separators and convert every cell to int.
# (read_csv(..., thousands=',') on load would make this step unnecessary.)
# str(v) keeps the lambda safe if a column is already numeric.
df = df.applymap(lambda v: int(str(v).replace(',', '')))
type(df.iloc[30, 1]) # that's better
/Users/charlie/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:3576: FutureWarning: rename with inplace=True will return None from pandas 0.11 onward " from pandas 0.11 onward", FutureWarning)
numpy.int64
# Day-over-day percentage change ("returns") for each platform; the first
# row is NaN by construction.
df['ios returns'] = df['ios'].pct_change()
df['android returns'] = df['android'].pct_change()
df.head()
ios | android | ios returns | android returns | |
---|---|---|---|---|
date | ||||
2012-11-01 | 10763 | 6254 | NaN | NaN |
2012-11-02 | 13444 | 7276 | 0.249094 | 0.163415 |
2012-11-03 | 12775 | 8109 | -0.049762 | 0.114486 |
2012-11-04 | 12912 | 7878 | 0.010724 | -0.028487 |
2012-11-05 | 12168 | 6563 | -0.057621 | -0.166921 |
# Plot both download series (.ix removed in pandas 1.0 -> .loc).
df.loc[:, 'ios':'android'].plot()
<matplotlib.axes.AxesSubplot at 0x1076054d0>
# November window (.ix removed in pandas 1.0 -> .loc).
df.loc['2012-11-01':'2012-12-1', 'ios':'android'].plot()
<matplotlib.axes.AxesSubplot at 0x107637b90>
# December window (.ix removed in pandas 1.0 -> .loc).
df.loc['2012-12-07':'2013-01-01', 'ios':'android'].plot()
<matplotlib.axes.AxesSubplot at 0x107a0b210>
# Daily returns over Nov-Dec (.ix removed in pandas 1.0 -> .loc).
df.loc['2012-11-01':'2013-01-01', 'ios returns':'android returns'].plot()
<matplotlib.axes.AxesSubplot at 0x107aebf10>
# Same selection plotted again (.ix removed in pandas 1.0 -> .loc).
df.loc['2012-11-01':'2013-01-01', 'ios returns':'android returns'].plot()
<matplotlib.axes.AxesSubplot at 0x107dc9ad0>
mpl.__version__
'1.2.0'
# Cumulative download totals per platform (.ix removed in pandas 1.0 -> .loc).
cumdf = df.loc[:, 'ios':'android'].cumsum()
'''
ios is clearly growing at a faster clip than android, although we do
note a nontrivial correction at the end of February
(and, unfortunately, the end of our data...)
'''
cumdf.plot()
<matplotlib.axes.AxesSubplot at 0x1081b3750>
del df['ios returns']
del df['android returns']
# 14- and 30-day simple moving averages. pd.rolling_mean was removed from
# pandas (0.18 deprecation); the rolling-window API is the replacement and
# uses the same default min_periods == window.
df['ios_sma-14'] = df['ios'].rolling(14).mean()
df['android_sma-14'] = df['android'].rolling(14).mean()
df['ios_sma-30'] = df['ios'].rolling(30).mean()
df['android_sma-30'] = df['android'].rolling(30).mean()
df.loc[:, 'ios':'android_sma-14'].plot()
<matplotlib.axes.AxesSubplot at 0x1081a7a50>
df.plot()
<matplotlib.axes.AxesSubplot at 0x1081e70d0>
# 14-day rolling standard deviation (pd.rolling_std was removed from
# pandas; use the rolling-window API).
df.loc[:, 'ios':'android'].rolling(14).std().plot()
'''
14-day moving standard deviation. note the wild abberation of the iOS numbers in Feb.
'''
'\n14-day moving standard deviation. note the wild abberation of the iOS numbers in Feb.\n'
# 14-day exponentially weighted moving average (pd.ewma was removed from
# pandas; use the .ewm accessor).
df.loc[:, 'ios':'android'].ewm(span=14).mean().plot()
'''
14-day exponentially weighted moving average
'''
'\n14-day exponentially weighted moving average\n'
# rolling quantile function (median); pd.rolling_quantile was removed from
# pandas -- use the rolling-window API.
plt.plot(df.loc[:, 'ios':'android'].rolling(14).quantile(0.5))
[<matplotlib.lines.Line2D at 0x10891cd90>, <matplotlib.lines.Line2D at 0x10891c790>]
'''
correlation between android and iOS appears to be cyclical. what are possible explanations for this?
I'm trying to picture one crowd of people all on iOS downloading the app; then when all their iOS friends have
been exhausted, the android friends all follow suit... but this seems a bit far-fetched.
would be most instructive to compare the download figures to user data for the same period-- number of tracks posted,
number of plays, pagevisits, etc. what drives these app download numbers?
'''
# 14-day rolling correlation between the two platforms' daily returns
# (pd.rolling_corr was removed from pandas; pair .rolling with .corr).
df['ios'].pct_change().rolling(14).corr(df['android'].pct_change()).plot()
<matplotlib.axes.AxesSubplot at 0x1084a4a10>
# Pull out each platform's series (.ix removed in pandas 1.0 -> .loc).
ios = df.loc[:, 'ios']
android = df.loc[:, 'android']
# for convenience. note that these are pandas timeseries objects:
type(ios)
pandas.core.series.TimeSeries
'''
log transform
'''
# Element-wise natural log of both series (.ix with a label list -> .loc).
logged = np.log(df.loc[:, ['ios', 'android']])
logged.plot(subplots=True)
array([<matplotlib.axes.AxesSubplot object at 0x1089aad90>, <matplotlib.axes.AxesSubplot object at 0x108b790d0>], dtype=object)
# differencing
# First differences of the logged series, i.e. approximate daily log
# returns; diff() leaves one leading NaN row, which dropna() removes.
log_difference = logged.diff().dropna()
log_difference.plot()
<matplotlib.axes.AxesSubplot at 0x10892fd10>
'''
let's fit a vector autoregression model on these two time series just for fun.
'''
# Bivariate VAR on the daily log returns; select_order() tabulates the
# information criteria (AIC/BIC/FPE/HQIC) over a range of lag lengths.
model = tsa.VAR(log_difference)
model.select_order()
VAR Order Selection ====================================================== aic bic fpe hqic ------------------------------------------------------ 0 -10.83 -10.78* 1.985e-05 -10.81 1 -10.88 -10.72 1.891e-05 -10.81 2 -10.98* -10.73 1.710e-05* -10.87* 3 -10.92 -10.57 1.807e-05 -10.78 4 -10.86 -10.41 1.916e-05 -10.68 5 -10.91 -10.36 1.821e-05 -10.69 6 -10.91 -10.26 1.827e-05 -10.65 7 -10.87 -10.12 1.908e-05 -10.57 8 -10.82 -9.963 2.016e-05 -10.47 9 -10.80 -9.843 2.060e-05 -10.41 10 -10.79 -9.734 2.084e-05 -10.36 11 -10.85 -9.694 1.968e-05 -10.38 12 -10.87 -9.614 1.937e-05 -10.36 13 -10.83 -9.468 2.035e-05 -10.28 ====================================================== * Minimum
{'aic': 2, 'bic': 0, 'fpe': 2, 'hqic': 2}
# Fit the VAR, letting AIC choose the lag length (search up to 15 lags;
# the criteria table above points to 2 lags).
res = model.fit(maxlags=15, ic='aic')
res.summary()
Summary of Regression Results ================================== Model: VAR Method: OLS Date: Mon, 27, May, 2013 Time: 13:55:23 -------------------------------------------------------------------- No. of Equations: 2.00000 BIC: -10.6177 Nobs: 117.000 HQIC: -10.7579 Log likelihood: 312.913 FPE: 1.93340e-05 AIC: -10.8537 Det(Omega_mle): 1.77817e-05 -------------------------------------------------------------------- Results for equation ios ============================================================================= coefficient std. error t-stat prob ----------------------------------------------------------------------------- const 0.005871 0.006771 0.867 0.388 L1.ios -0.120026 0.114893 -1.045 0.298 L1.android 0.071809 0.110910 0.647 0.519 L2.ios -0.276473 0.111556 -2.478 0.015 L2.android 0.114437 0.110886 1.032 0.304 ============================================================================= Results for equation android ============================================================================= coefficient std. error t-stat prob ----------------------------------------------------------------------------- const 0.008455 0.006972 1.213 0.228 L1.ios 0.071271 0.118311 0.602 0.548 L1.android -0.319343 0.114209 -2.796 0.006 L2.ios -0.053506 0.114874 -0.466 0.642 L2.android -0.176008 0.114185 -1.541 0.126 ============================================================================= Correlation matrix of residuals ios android ios 1.000000 0.628803 android 0.628803 1.000000
res.is_stable()
True
'''
not great. let's compute some more descriptive statistics.
'''
"\nnot great. let's compute some more descriptive statistics.\n"
# Impulse-response functions out to 20 periods.
irf = res.irf(20)
irf.plot()
# Forecast-error variance decomposition.
fevd = res.fevd()
fevd.plot()
# Check the residuals for leftover autocorrelation (whiteness); the FAIL
# printed below means some autocorrelations exceed the bound.
res.test_whiteness()
FAIL: Some autocorrelations exceed 0.1849 bound. See plot
res.k_ar
2
'''
separates the cyclical component of the time series (here thought of as random noise) from the trend component.
the pickup in February is easier to observe at this scale. Should we call this a seasonal effect?
'''
# Hodrick-Prescott filter: series = cycle + trend.
ios_cycle, ios_trend = tsa.filters.hpfilter(ios)
android_cycle, android_trend = tsa.filters.hpfilter(android)
# Use pd.DataFrame explicitly rather than relying on the bare name pulled
# in by `from pandas import *`.
ios_decomp = pd.DataFrame(ios)
ios_decomp['cycle'] = ios_cycle
ios_decomp['trend'] = ios_trend
android_decomp = pd.DataFrame(android)
android_decomp['cycle'] = android_cycle
android_decomp['trend'] = android_trend
ios_decomp.plot()
android_decomp.plot()
<matplotlib.axes.AxesSubplot at 0x109a3d190>
'''
subtract the trend to get a stationary process so we can fit an ARMA model and predict.
'''
# Detrend by removing the HP-filter trend; what remains is the cyclical
# component (equal to *_cycle up to floating-point error).
ios_stationary = ios - ios_trend
android_stationary = android - android_trend
ios_stationary.plot()
'''
still seeing this increasingly unstable variance for iOS over time.
'''
'\nstill seeing this increasingly unstable variance for iOS over time.\n'
ios_stationary.describe()
count 1.200000e+02 mean 1.022030e-09 std 1.315325e+03 min -2.803730e+03 25% -6.917183e+02 50% 6.702424e+01 75% 7.028854e+02 max 4.274033e+03
android_stationary.plot()
'''
android is nicer.
'''
android_stationary.describe()
count 1.200000e+02 mean 4.641834e-10 std 7.156557e+02 min -1.719127e+03 25% -5.102045e+02 50% -8.334378e+01 75% 4.496744e+02 max 1.955972e+03
'''
fit an ARMA model to ios_stationary
'''
# Pass the (p, q) order to the constructor: passing it to fit() was
# deprecated, per the FutureWarning this cell originally emitted.
# NOTE(review): tsa.ARMA itself was removed in statsmodels 0.13; newer
# code would use tsa.ARIMA(ios_stationary, order=(2, 0, 2)).
arma_mod_ios = tsa.ARMA(ios_stationary, order=(2, 2), freq='D')
arma_res_ios = arma_mod_ios.fit(trend='c', method='css-mle', disp=-1)
arma_res_ios.summary()
/Users/charlie/anaconda/lib/python2.7/site-packages/statsmodels/tsa/arima_model.py:226: FutureWarning: In the next release order will not be optional in the model constructor. "in the model constructor.", FutureWarning) /Users/charlie/anaconda/lib/python2.7/site-packages/statsmodels/tsa/arima_model.py:657: FutureWarning: The order argument to fit is deprecated. Please use the model constructor argument order. This will overwrite any order given in the model constructor. "constructor.", FutureWarning)
Dep. Variable: | ios | No. Observations: | 120 |
---|---|---|---|
Model: | ARMA(2, 2) | Log Likelihood | -1005.214 |
Method: | css-mle | S.D. of innovations | 1048.461 |
Date: | Mon, 27 May 2013 | AIC | 2022.429 |
Time: | 13:55:53 | BIC | 2039.154 |
Sample: | 11-01-2012 | HQIC | 2029.221 |
- 02-28-2013 |
coef | std err | z | P>|z| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
const | -25.9799 | 187.335 | -0.139 | 0.890 | -393.150 341.190 |
ar.L1.ios | -0.5362 | 0.206 | -2.599 | 0.011 | -0.940 -0.132 |
ar.L2.ios | 0.1470 | 0.170 | 0.867 | 0.388 | -0.185 0.479 |
ma.L1.ios | 1.2428 | 0.186 | 6.674 | 0.000 | 0.878 1.608 |
ma.L2.ios | 0.4900 | 0.121 | 4.046 | 0.000 | 0.253 0.727 |
Real | Imaginary | Modulus | Frequency | |
---|---|---|---|---|
AR.1 | -1.3588 +0.0000j 1.3588 0.5000||||
AR.2 | 5.0052 +0.0000j 5.0052 0.0000||||
MA.1 | -1.2682 -0.6576j 1.4286 -0.4239||||
MA.2 | -1.2682 +0.6576j 1.4286 0.4239
type(arma_res_ios.params)
pandas.core.series.Series
# ARMA(2,2) for android as well; the order goes in the constructor
# (passing it to fit() was deprecated in statsmodels).
arma_mod_android = tsa.ARMA(android_stationary, order=(2, 2), freq='D')
arma_res_android = arma_mod_android.fit(trend='c', method='css-mle', disp=-1)
arma_res_android.summary()
Dep. Variable: | android | No. Observations: | 120 |
---|---|---|---|
Model: | ARMA(2, 2) | Log Likelihood | -942.400 |
Method: | css-mle | S.D. of innovations | 619.688 |
Date: | Mon, 27 May 2013 | AIC | 1896.801 |
Time: | 13:55:59 | BIC | 1913.526 |
Sample: | 11-01-2012 | HQIC | 1903.593 |
- 02-28-2013 |
coef | std err | z | P>|z| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
const | -2.5502 | 65.466 | -0.039 | 0.969 | -130.861 125.760 |
ar.L1.android | 1.2026 | 0.071 | 17.007 | 0.000 | 1.064 1.341 |
ar.L2.android | -0.9194 | 0.065 | -14.157 | 0.000 | -1.047 -0.792 |
ma.L1.android | -0.9618 | 0.121 | -7.981 | 0.000 | -1.198 -0.726 |
ma.L2.android | 0.7901 | 0.086 | 9.206 | 0.000 | 0.622 0.958 |
Real | Imaginary | Modulus | Frequency | |
---|---|---|---|---|
AR.1 | 0.6540 -0.8124j 1.0429 -0.1421||||
AR.2 | 0.6540 +0.8124j 1.0429 0.1421||||
MA.1 | 0.6086 -0.9461j 1.1250 -0.1590||||
MA.2 | 0.6086 +0.9461j 1.1250 0.1590
'''
iOS download numbers grow increasingly volatile towards the end of Feb.
Looks like it's cyclical and positively correlated with time.
'''
# 7-day moving standard deviation of the iOS cycle component. Uses the
# explicit pd.Series name (not the wildcard import) and the rolling-window
# API, since pd.rolling_std was removed from pandas.
pd.Series(ios_cycle, index=df.index).rolling(7).std().plot()
<matplotlib.axes.AxesSubplot at 0x1081797d0>
'''
android looks a little better behaved, but these samples are really too small to say
'''
# 7-day moving standard deviation of the android cycle component (explicit
# pd.Series; rolling-window API replaces the removed pd.rolling_std).
pd.Series(android_cycle, index=df.index).rolling(7).std().plot()
<matplotlib.axes.AxesSubplot at 0x109b0aed0>
'''
hard to tell if iOS volatility is a seasonal effect because we only have 3 months of
data, and it doesn't really make sense to compare last year's figures anyway as SC is
undergoing hypergrowth, the product is totally different now, etc., etc.
here we just examine the change on the previous day.
'''
# First difference of the iOS series; .diff() is equivalent to
# `ios - ios.shift(1)` and matches how the android cell below is written.
iosdiff = ios.diff()
iosdiff.plot()
<matplotlib.axes.AxesSubplot at 0x109a1bdd0>
# Daily change for android, for comparison with the iOS differences above.
androiddiff = android.diff()
androiddiff.plot()
<matplotlib.axes.AxesSubplot at 0x109b47b10>
# Autocorrelation plots of the raw (undifferenced) level series; the strong
# autocorrelation here largely reflects the trend.
autocorrelation_plot(ios)
autocorrelation_plot(android) # autocorrelation
<matplotlib.axes.AxesSubplot at 0x109a560d0>
# Autocorrelation of the differenced android series.
autocorrelation_plot(androiddiff)
<matplotlib.axes.AxesSubplot at 0x109a65bd0>
'''
try forecasting from the ARMA models
the first array is the forecasted values, the second is stderr, the third is the confidence interval.
this functionality is still under development (i.e. ugly). I should really get in there and write some stuff...
'''
# Two-step-ahead forecast: (point forecasts, standard errors, conf. intervals).
arma_res_android.forecast(steps=2)
(array([-123.25006604, 224.73585785]), array([ 619.6875904 , 637.40716521]), array([[-1337.81542489, 1091.31529281], [-1024.55922944, 1474.03094515]]))
'''
forecasting more than a few steps into the future out-of-sample breaks the ARMA model.
'''
# This call raised "ValueError: matrices are not aligned" in the
# statsmodels version used here (see the traceback below); guard it so
# the rest of the script still runs while demonstrating the failure.
try:
    arma_res_android.forecast(steps=30)
except ValueError as e:
    print('30-step forecast failed: %s' % e)
'''
the statsmodels library is still under development, unfortunately. There seems to be quite
a few people on stackoverflow and elsewhere who have struggled with forecasting time series
lately as well. I understand that this is a solved problem in R... I should probably get on that.
'''
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-53-086b6d2ebf0d> in <module>() 3 ''' 4 ----> 5 arma_res_android.forecast(steps=30) 6 7 ''' /Users/charlie/anaconda/lib/python2.7/site-packages/statsmodels/tsa/arima_model.pyc in forecast(self, steps, exog, alpha) 1300 forecast = _arma_predict_out_of_sample(self.params, 1301 steps, self.resid, self.k_ar, self.k_ma, self.k_trend, -> 1302 self.k_exog, self.model.endog, exog, method=self.model.method) 1303 # compute the standard errors 1304 sigma2 = self.sigma2 /Users/charlie/anaconda/lib/python2.7/site-packages/statsmodels/tsa/arima_model.pyc in _arma_predict_out_of_sample(params, steps, errors, p, q, k_trend, k_exog, endog, exog, start, method) 125 for i in range(min(q,steps-1)): 126 fcast = mu[i] + np.dot(arparams,endog[i:i+p]) + \ --> 127 np.dot(maparams,resid[i:i+q]) 128 forecast[i] = fcast 129 endog[i+p] = fcast ValueError: matrices are not aligned
# Trend-only OLS: regress android downloads on t = 1..120.
# NOTE(review): no intercept term is included (sm.OLS does not add one),
# which is why the summary reports a negative R-squared. Adding
# sm.add_constant(...) would give a proper linear-trend fit, but would also
# change the shape expected by the predict() calls below, so it is left
# as-is here. print is written as a function call for Python 3 compatibility.
androidOLSresults = sm.OLS(android, range(1, 121)).fit()
print(androidOLSresults.summary())
OLS Regression Results ============================================================================== Dep. Variable: android R-squared: -1.747 Model: OLS Adj. R-squared: -1.747 Method: Least Squares F-statistic: -inf Date: Mon, 27 May 2013 Prob (F-statistic): nan Time: 13:56:34 Log-Likelihood: -1153.9 No. Observations: 120 AIC: 2310. Df Residuals: 119 BIC: 2313. Df Model: 0 ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ x1 146.0510 4.774 30.591 0.000 136.598 155.505 ============================================================================== Omnibus: 109.056 Durbin-Watson: 0.046 Prob(Omnibus): 0.000 Jarque-Bera (JB): 9.901 Skew: 0.178 Prob(JB): 0.00708 Kurtosis: 1.638 Cond. No. 1.00 ==============================================================================
androidOLSresults.predict(240)
# this looks about right, if a little low.
# 240 is 4 months after 3-1-2013 where the data stops, i.e. the end of June.
array([ 35052.2464936])
# Trend-only OLS for iOS on t = 1..120 (again with no intercept term,
# hence the negative R-squared in the summary). print written as a
# function call for Python 3 compatibility.
iosOLSresults = sm.OLS(ios, range(1, 121)).fit()
print(iosOLSresults.summary())
OLS Regression Results ============================================================================== Dep. Variable: ios R-squared: -0.856 Model: OLS Adj. R-squared: -0.856 Method: Least Squares F-statistic: -inf Date: Mon, 27 May 2013 Prob (F-statistic): nan Time: 13:56:38 Log-Likelihood: -1202.6 No. Observations: 120 AIC: 2407. Df Residuals: 119 BIC: 2410. Df Model: 0 ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ x1 222.0453 7.163 30.999 0.000 207.862 236.229 ============================================================================== Omnibus: 21.343 Durbin-Watson: 0.053 Prob(Omnibus): 0.000 Jarque-Bera (JB): 7.334 Skew: 0.330 Prob(JB): 0.0255 Kurtosis: 1.984 Cond. No. 1.00 ==============================================================================
iosOLSresults.predict(240) # this looks about right.
array([ 53290.88069682])