import numpy as np f, (ax1, ax2) = subplots(ncols=2) np.random.seed(3433) poisData1 = np.random.poisson(lam=50, size=100) poisData2 = np.random.poisson(lam=100, size=100) ax1.hist(poisData1) ax1.set_xlim([0, 150]) ax2.hist(poisData2) ax2.set_xlim([0, 150]); print np.mean(poisData1), np.var(poisData1) print np.mean(poisData2), np.var(poisData2) %load_ext rmagic %%R download.file("https://dl.dropbox.com/u/7710864/data/gaData.rda",destfile="../data/gaData.rda",method="curl") load("../data/gaData.rda") gaData$julian <- julian(gaData$date) write.csv(gaData,"../data/gaData.csv",row.names=FALSE) import pandas as pd gaData = pd.read_csv('../data/gaData.csv') gaData.head() def abline(intercept, gradient, *args, **kwargs): a = gca() xlim = a.get_xlim() ylim = a.get_ylim() if args: sty = args[0] else: sty = 'r' if kwargs: lw = kwargs['linewidth'] else: lw = 5 a.plot(xlim, [intercept + gradient * x for x in xlim], sty, linewidth=lw) a.set_xlim(xlim) a.set_ylim(ylim) from statsmodels.formula.api import ols, glm from statsmodels.api import families as f plot(gaData['julian'], gaData['visits'], 'o', color='darkgrey') xlabel('Julian') ylabel('Visits') lm1 = ols('visits ~ julian', gaData).fit() abline(lm1.params['Intercept'], lm1.params['julian'], linewidth=3) glm1 = glm('visits ~ julian', gaData, family=f.Poisson()).fit() plot(gaData['julian'], glm1.fittedvalues, linewidth=3); plot(glm1.fittedvalues, glm1.resid_deviance, 'o', color='grey') xlabel('Fitted') ylabel('Residuals'); glm1.conf_int() %%R load("../data/gaData.rda") gaData$julian <- julian(gaData$date) glm1 <- glm(gaData$visits ~ gaData$julian,family="poisson") library(sandwich) confint.agnostic <- function (object, parm, level = 0.95, ...) { cf <- coef(object); pnames <- names(cf) if (missing(parm)) parm <- pnames else if (is.numeric(parm)) parm <- pnames[parm] a <- (1 - level)/2; a <- c(a, 1 - a) pct <- stats:::format.perc(a, 3) fac <- qnorm(a) ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm,pct)) ses <- sqrt(diag(sandwich::vcovHC(object)))[parm] ci[] <- cf[parm] + ses %o% fac ci } print(confint.agnostic(glm1)) glm2 = glm('simplystats ~ julian', gaData, offset=np.log(gaData['visits']+1), family=f.Poisson()).fit() plot(gaData['julian'], glm2.fittedvalues, 'o') plot(gaData['julian'], glm1.fittedvalues, 'r', linewidth=3) xlabel('Date') ylabel('Fitted Counts'); plot(gaData['julian'], gaData['simplystats'].astype(float)/(gaData['visits'].astype(float)+1), 'o', color='grey') plot(gaData['julian'], glm2.fittedvalues/(gaData['visits']+1), linewidth=3) xlabel('Date') ylabel('Fitted Rates');