In [4]:

import pandas as pd
import numpy as np

In [3]:

# Load the IPython extension that provides the %%R cell magic used by later cells.
# NOTE(review): newer IPython releases reportedly no longer ship `rmagic`; the
# extension is provided by rpy2 as `rpy2.ipython` -- confirm against the
# installed IPython/rpy2 versions before changing this line.
%load_ext rmagic

Robust (heteroskedasticity-consistent) covariance estimates¶

In [5]:

%%R -o data
# Fix the RNG state so the simulated sample is reproducible.
set.seed(3433)
# Request a 1x2 plotting layout (no plot is drawn in this cell).
par(mfrow = c(1, 2))
# 100 normal draws whose mean rises from 0 to 3 while the standard deviation
# grows from 0.1 to 3; `data` is handed back to Python through `-o data`.
data <- rnorm(100,
              mean = seq(0, 3, length.out = 100),
              sd = seq(0.1, 3, length.out = 100))

In [33]:

# Pair the simulated response exported from the R cell with its predictor grid.
# The columns are supplied by name instead of through `zip(...)`: a bare zip
# iterator is rejected by older pandas releases on Python 3, and zip would
# silently truncate if the two inputs ever differed in length.
# Assumes `data` is a 1-D numeric vector of length 100, matching the grid below.
df = pd.DataFrame(
    {'data': np.asarray(data), 'x': np.linspace(0, 3, num=100)},
    columns=['data', 'x'],  # pin the column order explicitly
)

In [31]:

from statsmodels.formula.api import ols
from statsmodels.stats.sandwich_covariance import cov_hc3

# Ordinary least squares fit of the simulated response on x.
lm1 = ols(formula='data ~ x', data=df).fit()

# HC3 sandwich covariance of the fitted coefficients; left as the final
# expression so the notebook renders the matrix.
cov_hc3(lm1)

Out[31]:

array([[ 0.05410024, -0.0476481 ],
       [-0.0476481 ,  0.05368603]])

In [32]:

# Shown for comparison with the HC3 matrix from the previous cell.
# NOTE(review): `normalized_cov_params` is presumably the unscaled (X'X)^-1
# matrix, i.e. not multiplied by the residual variance; `lm1.cov_params()` would
# be the scaled counterpart -- confirm which of the two is intended here.
lm1.normalized_cov_params

Out[32]:

	Intercept	x
Intercept	0.039406	-0.019604
x	-0.019604	0.013069

Robust linear modelling¶

In [35]:

%%R -o x,y
# Fix the RNG state so the heavy-tailed sample is reproducible.
set.seed(343)
# Evenly spaced predictor on [0, 3].
x <- seq(0, 3, length.out = 100)
# Response drawn from a Cauchy distribution with no dependence on x.
y <- rcauchy(100)

In [36]:

# Collect the two vectors exported from the R cell into one frame.
# The columns are supplied by name instead of through `zip(x, y)`: a bare zip
# iterator is rejected by older pandas releases on Python 3, and zip would
# silently truncate if the two inputs ever differed in length.
# Assumes `x` and `y` are 1-D numeric vectors of equal length.
df = pd.DataFrame(
    {'x': np.asarray(x), 'y': np.asarray(y)},
    columns=['x', 'y'],  # pin the column order explicitly
)

In [38]:

from statsmodels.formula.api import rlm

# Fit the same formula twice: once by ordinary least squares and once with the
# robust linear model, so the two sets of estimates can be compared below.
model_formula = 'y ~ x'
lm1 = ols(formula=model_formula, data=df).fit()
rlm1 = rlm(formula=model_formula, data=df).fit()

In [39]:

# Coefficient estimates from the ordinary least squares fit of y on x.
lm1.params

Out[39]:

Intercept    0.352298
x           -0.401058

In [40]:

# Coefficient estimates from the robust (rlm) fit of the same formula.
rlm1.params

Out[40]:

Intercept    0.008537
x           -0.017875

In [55]:

# `subplots` was previously picked up from a %pylab-style namespace and raised
# NameError on a fresh kernel; import pyplot explicitly instead.
import matplotlib.pyplot as plt

# Two side-by-side panels: full y-range on the left, zoomed view on the right.
f, (ax1, ax2) = plt.subplots(ncols=2)

# Both panels carry the same three layers: raw points in grey, the OLS fit in
# the default line colour, and the robust fit in green.
for ax in (ax1, ax2):
    ax.plot(df['x'], df['y'], 'o', color='grey')
    ax.plot(df['x'], lm1.fittedvalues, linewidth=3)
    ax.plot(df['x'], rlm1.fittedvalues, 'g', linewidth=3)
    ax.set_xlabel('x')
    ax.set_ylabel('y')

# Only the right-hand panel is clipped, so the fitted lines stay visible
# despite the extreme y values.
ax2.set_ylim([-5, 5])
ax2.set_title('Zoomed in')

f.set_size_inches(9, 3)
f.tight_layout();  # trailing semicolon suppresses the return-value repr

Model selection¶

In [58]:

# Read the tab-delimited movies table, then replace the file's header labels
# with the identifier-friendly names used in the model formula further down.
movies = pd.read_csv('../data/movies.txt', delimiter='\t')
movies.columns = [
    'X',
    'score',
    'rating',
    'genre',
    'box_office',
    'running_time',
]
# Preview the first rows.
movies.head()

Out[58]:

	X	score	rating	genre	box_office	running_time
0	2 Fast 2 Furious	48.9	PG-13	action/adventure	127.146	107
1	28 Days Later	78.2	R	horror	45.065	113
2	A Guy Thing	39.5	PG-13	rom comedy	15.545	101
3	A Man Apart	42.9	R	action/adventure	26.248	110
4	A Mighty Wind	79.9	PG-13	comedy	17.781	91

In [74]:

# No stepwise regression is performed here; a single fixed specification
# (score explained by box office and running time) is fitted instead.
lm1 = ols(formula='score ~ box_office + running_time', data=movies).fit()
# Render the full regression table as the cell output.
lm1.summary()

Out[74]:

OLS Regression Results
Dep. Variable:	score	R-squared:	0.212
Model:	OLS	Adj. R-squared:	0.201
Method:	Least Squares	F-statistic:	18.46
Date:	Thu, 21 Feb 2013	Prob (F-statistic):	7.98e-08
Time:	23:49:35	Log-Likelihood:	-554.61
No. Observations:	140	AIC:	1115.
Df Residuals:	137	BIC:	1124.
Df Model:	2

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	37.2364	5.606	6.643	0.000	26.152 48.321
box_office	0.0824	0.018	4.495	0.000	0.046 0.119
running_time	0.1275	0.054	2.379	0.019	0.022 0.234

Omnibus:	2.420	Durbin-Watson:	2.124
Prob(Omnibus):	0.298	Jarque-Bera (JB):	2.230
Skew:	0.309	Prob(JB):	0.328
Kurtosis:	2.991	Cond. No.	677.

In [ ]:

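# Not ported: R's `regsubsets` (best-subset selection) has no counterpart in this notebook.
# Not ported: R's `bic.glm` (Bayesian model averaging) has no counterpart in this notebook.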