import pandas as pd
import numpy as np
import pandas.rpy.common as com
# Load R's built-in "faithful" dataset (Old Faithful eruptions) via the
# pandas/rpy bridge.  NOTE(review): pandas.rpy was removed in pandas 0.16+;
# modern code would use rpy2's pandas2ri or
# statsmodels.datasets.get_rdataset('faithful') -- confirm target environment.
faithful = com.load_data('faithful')
# 272 observations, 2 columns (eruptions, waiting).
faithful.shape
(272, 2)
# Use the IPython R magic to draw a reproducible random sample in R, then
# export the resulting indices back to Python (-o trainSamples).
%load_ext rmagic
%%R -o trainSamples
set.seed(333)
# Half the 272 rows (136), sampled without replacement, form the training set.
trainSamples <- sample(1:272,size=(272/2),replace=F)
# Label-based split.  NOTE(review): .ix is deprecated (use .loc), and
# `faithful.index - trainSamples` relies on old-pandas Index set difference
# (now Index.difference).  Assumes faithful's index matches R's 1-based row
# numbers produced by sample() -- confirm after the R->pandas conversion.
trainFaith = faithful.ix[trainSamples,:]
testFaith = faithful.ix[faithful.index - trainSamples,:]
trainFaith.head()
eruptions | waiting | |
---|---|---|
128 | 4.500 | 82 |
23 | 3.450 | 78 |
263 | 1.850 | 58 |
154 | 4.600 | 81 |
6 | 2.883 | 55 |
# Simple linear regression of eruption duration on waiting time, fitted on
# the training rows only, using the R-style formula interface.
from statsmodels.formula.api import ols
lm1 = ols('eruptions ~ waiting', trainFaith).fit()
# Full OLS regression table (coefficients, R^2, diagnostics) shown below.
lm1.summary()
Dep. Variable: | eruptions | R-squared: | 0.814 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.812 |
Method: | Least Squares | F-statistic: | 584.8 |
Date: | Thu, 28 Feb 2013 | Prob (F-statistic): | 1.02e-50 |
Time: | 22:56:59 | Log-Likelihood: | -95.927 |
No. Observations: | 136 | AIC: | 195.9 |
Df Residuals: | 134 | BIC: | 201.7 |
Df Model: | 1 |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | -1.9249 | 0.229 | -8.397 | 0.000 | -2.378 -1.472 |
waiting | 0.0764 | 0.003 | 24.182 | 0.000 | 0.070 0.083 |
Omnibus: | 2.318 | Durbin-Watson: | 1.995 |
---|---|---|---|
Prob(Omnibus): | 0.314 | Jarque-Bera (JB): | 2.305 |
Skew: | -0.310 | Prob(JB): | 0.316 |
Kurtosis: | 2.850 | Cond. No. | 393. |
# Scatter of the training data with the fitted regression line overlaid.
# Bare pylab names (plot/xlabel/ylabel) -- assumes %pylab inline or
# `from pylab import *` earlier in the session.
plot(trainFaith['waiting'], trainFaith['eruptions'], 'o')
plot(trainFaith['waiting'], lm1.fittedvalues, 'k', linewidth=3)
xlabel('Waiting')
ylabel('Duration');
# Manual prediction at waiting=80: intercept + slope * 80
# (Python 2 print statements -- this is an old notebook transcript).
print lm1.params['Intercept'] + lm1.params['waiting'] * 80
4.18615649492
# ...which agrees with the model's own predict().
print lm1.predict({'waiting' : [80]})
[ 4.18615649]
# Side-by-side panels: training fit (left) vs held-out test data with the
# same fitted line (right), to eyeball out-of-sample performance.
f, (ax1, ax2) = subplots(ncols=2)
ax1.plot(trainFaith['waiting'], trainFaith['eruptions'], 'o')
ax1.plot(trainFaith['waiting'], lm1.predict({'waiting' : trainFaith['waiting'].values}), 'k')
ax1.set_xlabel('Waiting')
ax1.set_ylabel('Duration')
ax2.plot(testFaith['waiting'], testFaith['eruptions'], 'o')
ax2.plot(testFaith['waiting'], lm1.predict({'waiting' : testFaith['waiting'].values}), 'k')
ax2.set_xlabel('Waiting')
ax2.set_ylabel('Duration')
f.set_size_inches(8,3)
f.tight_layout();
# Training-set error.  NOTE(review): despite the original "RMSE" label this
# is sqrt(SUM(squared errors)) -- a root *sum* of squares; true RMSE would
# divide by n before the sqrt.  The printed values below match the sum form,
# so the code (not the output) is what the label misdescribes.
print sqrt(sum((lm1.fittedvalues - trainFaith['eruptions']) ** 2))
5.71292527997
# Same root-sum-of-squares measure on the held-out test set; slightly
# larger, as expected out of sample.
print sqrt(sum((lm1.predict({'waiting' : testFaith['waiting'].values}) - testFaith['eruptions']) ** 2))
5.82715154245
# no prediction interval -- statsmodels' predict() here gives point
# estimates only (unlike R's predict(..., interval="prediction")).
# Logistic-regression example: Baltimore Ravens game outcomes.
# NOTE(review): path is relative to the notebook's working directory.
ravensData = pd.read_csv('../data/ravensData.csv')
ravensData.head(6)
ravenWinNum | ravenWin | ravenScore | opponentScore | |
---|---|---|---|---|
0 | 1 | W | 24 | 9 |
1 | 1 | W | 38 | 35 |
2 | 1 | W | 28 | 13 |
3 | 1 | W | 34 | 31 |
4 | 1 | W | 44 | 13 |
5 | 0 | L | 23 | 24 |
from statsmodels.formula.api import glm
# NOTE(review): `f` here shadows the matplotlib figure handle `f` created
# earlier in the session.
from statsmodels.api import families as f
# Logistic regression (Binomial family, default logit link): P(win) as a
# function of points scored.
glm1 = glm('ravenWinNum ~ ravenScore', ravensData, family=f.Binomial()).fit()
# Fitted win probabilities grouped by actual outcome; the boxplot shows how
# well in-sample probabilities separate wins from losses.
df = pd.DataFrame({'pred' : glm1.predict(), 'ravenWinNum' : ravensData['ravenWinNum'].values})
df.boxplot(by='ravenWinNum');
# Misclassification count as a function of the probability cutoff used to
# turn fitted win probabilities into win/loss calls.
xx = np.linspace(0, 1, num=10)
err = []
for cutoff in xx:
    misclassified = (df['pred'] > cutoff) != df['ravenWinNum']
    err.append(sum(misclassified))
plot(xx, err, 'ok')
xlabel('Cutoff')
ylabel('Error');
def cost(win, pred=0):
    """0/1 loss: the fraction of cases where |win - pred| exceeds 0.5.

    With binary outcomes `win` and fitted probabilities `pred`, this is the
    misclassification rate at a 0.5 probability cutoff.
    """
    misses = np.abs(win - pred) > 0.5
    return np.mean(misses)
# ???? is this correct?
# 3-fold cross-validation.  NOTE(review): sklearn.cross_validation was
# removed in scikit-learn 0.20 (use sklearn.model_selection.KFold), and
# indices=False (yield boolean masks instead of index arrays) disappeared
# even earlier -- this transcript pins a very old sklearn.
from sklearn.cross_validation import KFold
kf = KFold(len(ravensData), n_folds=3, indices=False)
# Per-fold training and test error for two competing models.
cv1train = []
cv1test = []
cv2train = []
cv2test = []
# `train`/`test` are boolean masks (indices=False above), so
# ravensData[train] selects rows by mask.  Loop-body indentation restored
# here -- the transcript export had stripped it.
for train, test in kf:
    # Model 1: logistic regression; Model 2: same formula with a Gaussian
    # family (i.e., least squares on the 0/1 outcome).
    # NOTE(review): glm1 rebinds the earlier full-data fit of the same name.
    glm1 = glm('ravenWinNum ~ ravenScore', ravensData[train], family=f.Binomial()).fit()
    glm2 = glm('ravenWinNum ~ ravenScore', ravensData[train], family=f.Gaussian()).fit()
    # In-sample error: predict() with no argument returns fitted values.
    cv1train.append( cost(ravensData[train]['ravenWinNum'], glm1.predict()) )
    cv2train.append( cost(ravensData[train]['ravenWinNum'], glm2.predict()) )
    # Out-of-sample error on the held-out fold.
    cv1test.append( cost(ravensData[test]['ravenWinNum'], glm1.predict(ravensData[test])) )
    cv2test.append( cost(ravensData[test]['ravenWinNum'], glm2.predict(ravensData[test])) )
# (train error, test error) per fold for the logistic model.
zip( cv1train, cv1test )
[(0.42857142857142855, 0.16666666666666666), (0.2857142857142857, 0.16666666666666666), (0.25, 0.5)]
# Mean train/test misclassification for the Gaussian-family model.
np.mean(cv2train), np.mean(cv2test)
(0.29365079365079361, 0.27777777777777773)