import numpy as np
f, (ax1, ax2) = subplots(ncols=2)
np.random.seed(3433)
poisData1 = np.random.poisson(lam=50, size=100)
poisData2 = np.random.poisson(lam=100, size=100)
ax1.hist(poisData1)
ax1.set_xlim([0, 150])
ax2.hist(poisData2)
ax2.set_xlim([0, 150]);
print np.mean(poisData1), np.var(poisData1)
49.77 47.2171
print np.mean(poisData2), np.var(poisData2)
100.11 93.1579
%load_ext rmagic
%%R
download.file("https://dl.dropbox.com/u/7710864/data/gaData.rda",destfile="../data/gaData.rda",method="curl")
load("../data/gaData.rda")
gaData$julian <- julian(gaData$date)
write.csv(gaData,"../data/gaData.csv",row.names=FALSE)
import pandas as pd
gaData = pd.read_csv('../data/gaData.csv')
gaData.head()
date | visits | simplystats | julian | |
---|---|---|---|---|
0 | 2011-01-01 | 0 | 0 | 14975 |
1 | 2011-01-02 | 0 | 0 | 14976 |
2 | 2011-01-03 | 0 | 0 | 14977 |
3 | 2011-01-04 | 0 | 0 | 14978 |
4 | 2011-01-05 | 0 | 0 | 14979 |
def abline(intercept, gradient, *args, **kwargs):
a = gca()
xlim = a.get_xlim()
ylim = a.get_ylim()
if args:
sty = args[0]
else:
sty = 'r'
if kwargs:
lw = kwargs['linewidth']
else:
lw = 5
a.plot(xlim, [intercept + gradient * x for x in xlim], sty, linewidth=lw)
a.set_xlim(xlim)
a.set_ylim(ylim)
from statsmodels.formula.api import ols, glm
from statsmodels.api import families as f
plot(gaData['julian'], gaData['visits'], 'o', color='darkgrey')
xlabel('Julian')
ylabel('Visits')
lm1 = ols('visits ~ julian', gaData).fit()
abline(lm1.params['Intercept'], lm1.params['julian'], linewidth=3)
glm1 = glm('visits ~ julian', gaData, family=f.Poisson()).fit()
plot(gaData['julian'], glm1.fittedvalues, linewidth=3);
plot(glm1.fittedvalues, glm1.resid_deviance, 'o', color='grey')
xlabel('Fitted')
ylabel('Residuals');
glm1.conf_int()
0 | 1 | |
---|---|---|
Intercept | -34.343231 | -31.156440 |
julian | 0.002190 | 0.002396 |
%%R
load("../data/gaData.rda")
gaData$julian <- julian(gaData$date)
glm1 <- glm(gaData$visits ~ gaData$julian,family="poisson")
library(sandwich)
confint.agnostic <- function (object, parm, level = 0.95, ...)
{
cf <- coef(object); pnames <- names(cf)
if (missing(parm))
parm <- pnames
else if (is.numeric(parm))
parm <- pnames[parm]
a <- (1 - level)/2; a <- c(a, 1 - a)
pct <- stats:::format.perc(a, 3)
fac <- qnorm(a)
ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm,pct))
ses <- sqrt(diag(sandwich::vcovHC(object)))[parm]
ci[] <- cf[parm] + ses %o% fac
ci
}
print(confint.agnostic(glm1))
2.5 % 97.5 % (Intercept) -36.362674594 -29.136997254 gaData$julian 0.002058147 0.002527955
glm2 = glm('simplystats ~ julian', gaData, offset=np.log(gaData['visits']+1), family=f.Poisson()).fit()
plot(gaData['julian'], glm2.fittedvalues, 'o')
plot(gaData['julian'], glm1.fittedvalues, 'r', linewidth=3)
xlabel('Date')
ylabel('Fitted Counts');
plot(gaData['julian'], gaData['simplystats'].astype(float)/(gaData['visits'].astype(float)+1), 'o', color='grey')
plot(gaData['julian'], glm2.fittedvalues/(gaData['visits']+1), linewidth=3)
xlabel('Date')
ylabel('Fitted Rates');