import io

import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# %matplotlib inline  # IPython notebook magic -- not valid plain-Python syntax,
# kept as a comment so the file parses outside a notebook.

# Yearly sample data (note: sample data is from Statewide).
# 'Units' is the year's total; 'lastqu' is the last quarter's figure.
content2 = '''\
Units lastqu
2000-12-31 19391 NaN
2001-12-31 35068 5925
2002-12-31 39279 8063
2003-12-31 47517 9473
2004-12-31 51439 11226
2005-12-31 59674 11667
2006-12-31 58664 14016
2007-12-31 55698 13186
2008-12-31 42235 11343
2009-12-31 40478 7867
2010-12-31 38722 8114
2011-12-31 36965 8361
2012-12-31 39132 8608
2013-12-31 43160 9016
2014-12-31 NaN 9785
'''

# BUG FIX: content2 is a str, so it must be wrapped in StringIO, not BytesIO
# (pd.read_table on BytesIO-of-a-str fails under Python 3).  The columns are
# space-separated, so pass a whitespace separator instead of read_table's
# default tab; the extra leading date field then becomes the index.
df2 = pd.read_table(io.StringIO(content2), sep=r'\s+')

# Make sure that the columns are int (cast the non-NaN portion of each).
# NOTE(review): assigning the sliced-and-cast Series back realigns on the
# index, reintroducing the NaN, so the stored dtype remains float -- confirm
# whether the int cast is actually needed downstream.
df2['Units'] = df2['Units'][:-1].astype('int')
df2['lastqu'] = df2['lastqu'][1:].astype('int')
df2


def fit_line2(x, y):
    """Return the fitted OLS results for the best-fit line of y on x.

    Parameters
    ----------
    x, y : array-like
        Predictor and response; rows with missing values are dropped.

    Returns
    -------
    statsmodels RegressionResults
    """
    # Add a column of ones so the model estimates an intercept.
    # BUG FIX: removed the dead `X = sm.add_constant(x)` reassignment that sat
    # between the fit and the return in the original (it was never used), and
    # moved the stray mid-body docstring to the top of the function.
    X = sm.add_constant(x, prepend=True)
    return sm.OLS(y, X, missing='drop').fit()


# Use the lastqu to fit to Units (so we can forecast future values).
ols_test = fit_line2(df2['lastqu'][1:-1], df2['Units'][1:-1])
ols_test.summary()

# The prediction for 2014, from the last available 'lastqu' value.
nextq = ols_test.predict(sm.add_constant(df2['lastqu'][-1:], prepend=True))
nextq

fig = plt.figure(figsize=(12, 8))
fig = sm.graphics.plot_regress_exog(ols_test, 'lastqu', fig=fig)
fig.clf()  # clears the fig!
sm.graphics.plot_partregress_grid(ols_test, fig=fig)
# fig.clf()  # clears the fig!
# Re-fit via the stored model object and show the summary again.
result = ols_test.model.fit()
result.summary()

# This uses the statsmodels formula API (same results).
import statsmodels.formula.api as smf

# formula: response ~ predictors
est = smf.ols(formula='Units ~ lastqu', data=df2).fit()
est.summary()

fig = plt.figure(figsize=(12, 8))
fig = sm.graphics.plot_regress_exog(est, 'lastqu', fig=fig)

# Remove NaN's (drop the first and last rows, which hold the NaNs).
df2 = df2[1:-1]

import pymc as pm
import numpy as np

x = df2['lastqu']
y = df2['Units']

trace = None
with pm.Model() as model:
    # Weakly-informative priors.  NOTE(review): 'sd=', NUTS(state=...) and
    # trace.point() below are PyMC3-era API; modern PyMC uses 'sigma=' and a
    # different sampler interface -- kept as written for the targeted version.
    alpha = pm.Normal('alpha', mu=0, sd=20)
    beta = pm.Normal('beta', mu=0, sd=20)
    sigma = pm.Uniform('sigma', lower=0, upper=12000)

    # Linear model and Gaussian likelihood around it.
    y_est = alpha + beta * x
    likelihood = pm.Normal('y', mu=y_est, sd=sigma, observed=y)

    # Start sampling from the MAP estimate.
    start = pm.find_MAP()
    step = pm.NUTS(state=start)
    trace = pm.sample(2000, step, start=start, progressbar=False)

pm.traceplot(trace);


def graph(formula, x_range, color='black', alpha=1):
    """Evaluate *formula* (a Python expression in ``x``) over x_range and plot it.

    WARNING: uses eval() -- only ever pass trusted, locally-built formula
    strings (here they are built from sampled scalars, which is safe).
    """
    x = np.array(x_range)
    y = eval(formula)
    plt.plot(x, y, color=color, alpha=alpha)
    plt.scatter(x, y)


# Overlay one faint regression line per posterior sample.
# BUG FIX: xrange is Python 2 only -- use range under Python 3.
for i in range(0, 2000):
    point = trace.point(i)
    graph('{0} + {1}*x'.format(point['alpha'], point['beta']),
          range(6000, 15000), color='black', alpha=.0098035)
plt.show()

x = df2['lastqu']
y = df2['Units']
data = dict(x=x, y=y)
data

with pm.Model() as model:
    # Specify glm and pass in data.  The resulting linear model, its
    # likelihood and all its parameters are automatically added to our model.
    pm.glm.glm('y ~ x', data)
    step = pm.NUTS()  # Instantiate MCMC sampling algorithm
    trace = pm.sample(2000, step, progressbar=False)  # draw 2000 posterior samples using NUTS sampling

plt.figure(figsize=(7, 7))
pm.traceplot(trace)
plt.tight_layout();

# Inspect the last sampled point from the first model's loop above.
point['sigma']
point['beta']
point['alpha']

# !pip list