import pandas as pd
import numpy as np
%load_ext rmagic
%%R -o data
# Simulate 100 heteroskedastic normal draws: the mean rises linearly from 0 to 3
# and the standard deviation from 0.1 to 3 across the observations.
# `-o data` exports the vector to the Python side, where it is used to build `df`.
# The seed is fixed so the draws are reproducible.
set.seed(3433); par(mfrow=c(1,2));
data <- rnorm(100,mean=seq(0,3,length=100),sd=seq(0.1,3,length=100))
# Build the regression frame column by column. `data` is the vector exported
# by the R cell; `x` is the matching evenly spaced grid on [0, 3].
# A dict of columns is used instead of `zip(...)`: on Python 3 `zip` yields a
# one-shot iterator, which older pandas versions refuse as DataFrame input.
df = pd.DataFrame(
    {'data': np.asarray(data), 'x': np.linspace(0, 3, num=100)},
    columns=['data', 'x'],
)
from statsmodels.formula.api import ols
from statsmodels.stats.sandwich_covariance import cov_hc3
# Ordinary least squares of the simulated response on x.
lm1 = ols('data ~ x', df).fit()
# HC3 covariance matrix of the coefficient estimates (displayed as cell output).
cov_hc3(lm1)
array([[ 0.05410024, -0.0476481 ], [-0.0476481 , 0.05368603]])
# Covariance attribute stored on the fitted OLS result, displayed for
# comparison with the HC3 matrix computed by cov_hc3(lm1).
# NOTE(review): presumably the unscaled (X'X)^-1 rather than the scaled
# covariance of the estimates - confirm against the statsmodels docs.
lm1.normalized_cov_params
| | Intercept | x |
---|---|---|
Intercept | 0.039406 | -0.019604 |
x | -0.019604 | 0.013069 |
%%R -o x,y
# Heavy-tailed example: x is an evenly spaced grid of 100 points on [0, 3] and
# y is 100 Cauchy draws generated without reference to x.
# `-o x,y` exports both vectors to the Python side.
set.seed(343)
x <- seq(0,3,length=100); y <- rcauchy(100)
# Assemble the frame from the two vectors exported by the R cell. A dict of
# columns replaces `zip(x, y)`: on Python 3 `zip` yields a one-shot iterator,
# which older pandas versions refuse as DataFrame input.
df = pd.DataFrame({'x': np.asarray(x), 'y': np.asarray(y)}, columns=['x', 'y'])
from statsmodels.formula.api import rlm
# Fit the same formula twice: ordinary least squares and the robust linear model.
lm1 = ols('y ~ x', df).fit()
rlm1 = rlm('y ~ x', df).fit()
# OLS coefficients (displayed as cell output).
lm1.params
Intercept 0.352298 x -0.401058
# Coefficients of the rlm fit, to set against the OLS coefficients in lm1.params.
rlm1.params
Intercept 0.008537 x -0.017875
# Two-panel comparison of the OLS and rlm fits: the left panel shows the full
# y range, the right panel is the same picture clipped to [-5, 5].
# NOTE(review): `subplots` is not imported in the visible source; presumably
# the notebook runs in pylab mode (e.g. `%pylab inline`) - confirm.
f, (ax1, ax2) = subplots(ncols=2)
# Both panels get identical content, so draw them in one loop rather than
# repeating the five calls per axis.
for panel in (ax1, ax2):
    panel.plot(df['x'], df['y'], 'o', color='grey')           # raw observations
    panel.plot(df['x'], lm1.fittedvalues, linewidth=3)        # OLS fit
    panel.plot(df['x'], rlm1.fittedvalues, 'g', linewidth=3)  # rlm fit, green
    panel.set_xlabel('x')
    panel.set_ylabel('y')
# Only the right-hand panel is zoomed and titled.
ax2.set_ylim([-5, 5])
ax2.set_title('Zoomed in')
f.set_size_inches(9, 3)
# Trailing semicolon keeps the notebook from echoing the call's return value.
f.tight_layout();
# Read the tab-delimited movies file, then replace its header with six
# identifier-friendly column names so they can be used in model formulas.
movies = pd.read_csv('../data/movies.txt', delimiter='\t')
movies.columns = [
    'X',
    'score',
    'rating',
    'genre',
    'box_office',
    'running_time',
]
# Preview the first rows (displayed as cell output).
movies.head()
| | X | score | rating | genre | box_office | running_time |
---|---|---|---|---|---|---|
0 | 2 Fast 2 Furious | 48.9 | PG-13 | action/adventure | 127.146 | 107 |
1 | 28 Days Later | 78.2 | R | horror | 45.065 | 113 |
2 | A Guy Thing | 39.5 | PG-13 | rom comedy | 15.545 | 101 |
3 | A Man Apart | 42.9 | R | action/adventure | 26.248 | 110 |
4 | A Mighty Wind | 79.9 | PG-13 | comedy | 17.781 | 91 |
# No stepwise regression here: the two-predictor model is specified by hand.
lm1 = ols(formula='score ~ box_office + running_time', data=movies).fit()
# Full regression report (displayed as cell output).
lm1.summary()
Dep. Variable: | score | R-squared: | 0.212 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.201 |
Method: | Least Squares | F-statistic: | 18.46 |
Date: | Thu, 21 Feb 2013 | Prob (F-statistic): | 7.98e-08 |
Time: | 23:49:35 | Log-Likelihood: | -554.61 |
No. Observations: | 140 | AIC: | 1115. |
Df Residuals: | 137 | BIC: | 1124. |
Df Model: | 2 |
| | coef | std err | t | P>\|t\| | [95.0% Conf. Int.] |
---|---|---|---|---|---|
Intercept | 37.2364 | 5.606 | 6.643 | 0.000 | 26.152 48.321 |
box_office | 0.0824 | 0.018 | 4.495 | 0.000 | 0.046 0.119 |
running_time | 0.1275 | 0.054 | 2.379 | 0.019 | 0.022 0.234 |
Omnibus: | 2.420 | Durbin-Watson: | 2.124 |
---|---|---|---|
Prob(Omnibus): | 0.298 | Jarque-Bera (JB): | 2.230 |
Skew: | 0.309 | Prob(JB): | 0.328 |
Kurtosis: | 2.991 | Cond. No. | 677. |
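# No counterpart to R's regsubsets (best-subset selection) is shown here.
# No counterpart to R's bic.glm (Bayesian model averaging) is shown here.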