import pandas as pd
import numpy as np
import pandas.rpy.common as com
# Load R's built-in "faithful" dataset (Old Faithful eruptions) via the
# pandas/rpy bridge.  NOTE(review): pandas.rpy was removed in pandas 0.16+;
# modern code would use rpy2's pandas2ri or
# statsmodels.datasets.get_rdataset('faithful') -- confirm target environment.
faithful = com.load_data('faithful')
# 272 observations, 2 columns (eruptions, waiting).
faithful.shape
(272, 2)
# Use the IPython R magic to draw a reproducible random sample in R, then
# export the resulting indices back to Python (-o trainSamples).
%load_ext rmagic
%%R -o trainSamples
set.seed(333)
# Half the 272 rows (136), sampled without replacement, form the training set.
trainSamples <- sample(1:272,size=(272/2),replace=F)
# Label-based split.  NOTE(review): .ix is deprecated (use .loc), and
# `faithful.index - trainSamples` relies on old-pandas Index set difference
# (now Index.difference).  Assumes faithful's index matches R's 1-based row
# numbers produced by sample() -- confirm after the R->pandas conversion.
trainFaith = faithful.ix[trainSamples,:]
testFaith = faithful.ix[faithful.index - trainSamples,:]
trainFaith.head()
eruptions | waiting | |
---|---|---|
128 | 4.500 | 82 |
23 | 3.450 | 78 |
263 | 1.850 | 58 |
154 | 4.600 | 81 |
6 | 2.883 | 55 |
# Simple linear regression of eruption duration on waiting time, fitted on
# the training rows only, using the R-style formula interface.
from statsmodels.formula.api import ols
lm1 = ols('eruptions ~ waiting', trainFaith).fit()
# Full OLS regression table (coefficients, R^2, diagnostics) shown below.
lm1.summary()
Dep. Variable: | eruptions | R-squared: | 0.814 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.812 |
Method: | Least Squares | F-statistic: | 584.8 |
Date: | Thu, 28 Feb 2013 | Prob (F-statistic): | 1.02e-50 |
Time: | 22:56:59 | Log-Likelihood: | -95.927 |
No. Observations: | 136 | AIC: | 195.9 |
Df Residuals: | 134 | BIC: | 201.7 |
Df Model: | 1 |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | -1.9249 | 0.229 | -8.397 | 0.000 | -2.378 -1.472 |
waiting | 0.0764 | 0.003 | 24.182 | 0.000 | 0.070 0.083 |
Omnibus: | 2.318 | Durbin-Watson: | 1.995 |
---|---|---|---|
Prob(Omnibus): | 0.314 | Jarque-Bera (JB): | 2.305 |
Skew: | -0.310 | Prob(JB): | 0.316 |
Kurtosis: | 2.850 | Cond. No. | 393. |
# Scatter of the training data with the fitted regression line overlaid.
# Bare pylab names (plot/xlabel/ylabel) -- assumes %pylab inline or
# `from pylab import *` earlier in the session.
plot(trainFaith['waiting'], trainFaith['eruptions'], 'o')
plot(trainFaith['waiting'], lm1.fittedvalues, 'k', linewidth=3)
xlabel('Waiting')
ylabel('Duration');
# Manual prediction at waiting=80: intercept + slope * 80
# (Python 2 print statements -- this is an old notebook transcript).
print lm1.params['Intercept'] + lm1.params['waiting'] * 80
4.18615649492
# ...which agrees with the model's own predict().
print lm1.predict({'waiting' : [80]})
[ 4.18615649]
# Side-by-side panels: training fit (left) vs held-out test data with the
# same fitted line (right), to eyeball out-of-sample performance.
f, (ax1, ax2) = subplots(ncols=2)
ax1.plot(trainFaith['waiting'], trainFaith['eruptions'], 'o')
ax1.plot(trainFaith['waiting'], lm1.predict({'waiting' : trainFaith['waiting'].values}), 'k')
ax1.set_xlabel('Waiting')
ax1.set_ylabel('Duration')
ax2.plot(testFaith['waiting'], testFaith['eruptions'], 'o')
ax2.plot(testFaith['waiting'], lm1.predict({'waiting' : testFaith['waiting'].values}), 'k')
ax2.set_xlabel('Waiting')
ax2.set_ylabel('Duration')
f.set_size_inches(8,3)
f.tight_layout();
# Training-set error.  NOTE(review): despite the original "RMSE" label this
# is sqrt(SUM(squared errors)) -- a root *sum* of squares; true RMSE would
# divide by n before the sqrt.  The printed values below match the sum form,
# so the code (not the output) is what the label misdescribes.
print sqrt(sum((lm1.fittedvalues - trainFaith['eruptions']) ** 2))
5.71292527997
# Same root-sum-of-squares measure on the held-out test set; slightly
# larger, as expected out of sample.
print sqrt(sum((lm1.predict({'waiting' : testFaith['waiting'].values}) - testFaith['eruptions']) ** 2))
5.82715154245
# no prediction interval -- statsmodels' predict() here gives point
# estimates only (unlike R's predict(..., interval="prediction")).
# Logistic-regression example: Baltimore Ravens game outcomes.
# NOTE(review): path is relative to the notebook's working directory.
ravensData = pd.read_csv('../data/ravensData.csv')
ravensData.head(6)
ravenWinNum | ravenWin | ravenScore | opponentScore | |
---|---|---|---|---|
0 | 1 | W | 24 | 9 |
1 | 1 | W | 38 | 35 |
2 | 1 | W | 28 | 13 |
3 | 1 | W | 34 | 31 |
4 | 1 | W | 44 | 13 |
5 | 0 | L | 23 | 24 |
from statsmodels.formula.api import glm
# NOTE(review): `f` here shadows the matplotlib figure handle `f` created
# earlier in the session.
from statsmodels.api import families as f
# Logistic regression (Binomial family, default logit link): P(win) as a
# function of points scored.
glm1 = glm('ravenWinNum ~ ravenScore', ravensData, family=f.Binomial()).fit()
# Fitted win probabilities grouped by actual outcome; the boxplot shows how
# well in-sample probabilities separate wins from losses.
df = pd.DataFrame({'pred' : glm1.predict(), 'ravenWinNum' : ravensData['ravenWinNum'].values})
df.boxplot(by='ravenWinNum');
# Misclassification count as a function of the probability cutoff used to
# turn fitted win probabilities into win/loss calls.
xx = np.linspace(0, 1, num=10)
err = []
for cutoff in xx:
    misclassified = (df['pred'] > cutoff) != df['ravenWinNum']
    err.append(sum(misclassified))
plot(xx, err, 'ok')
xlabel('Cutoff')
ylabel('Error');
def cost(win, pred=0):
    """0/1 loss: the fraction of cases where |win - pred| exceeds 0.5.

    With binary outcomes `win` and fitted probabilities `pred`, this is the
    misclassification rate at a 0.5 probability cutoff.
    """
    misses = np.abs(win - pred) > 0.5
    return np.mean(misses)
# ???? is this correct?
# 3-fold cross-validation.  NOTE(review): sklearn.cross_validation was
# removed in scikit-learn 0.20 (use sklearn.model_selection.KFold), and
# indices=False (yield boolean masks instead of index arrays) disappeared
# even earlier -- this transcript pins a very old sklearn.
from sklearn.cross_validation import KFold
kf = KFold(len(ravensData), n_folds=3, indices=False)
# Per-fold training and test error for two competing models.
cv1train = []
cv1test = []
cv2train = []
cv2test = []
# `train`/`test` are boolean masks (indices=False above), so
# ravensData[train] selects rows by mask.  Loop-body indentation restored
# here -- the transcript export had stripped it.
for train, test in kf:
    # Model 1: logistic regression; Model 2: same formula with a Gaussian
    # family (i.e., least squares on the 0/1 outcome).
    # NOTE(review): glm1 rebinds the earlier full-data fit of the same name.
    glm1 = glm('ravenWinNum ~ ravenScore', ravensData[train], family=f.Binomial()).fit()
    glm2 = glm('ravenWinNum ~ ravenScore', ravensData[train], family=f.Gaussian()).fit()
    # In-sample error: predict() with no argument returns fitted values.
    cv1train.append( cost(ravensData[train]['ravenWinNum'], glm1.predict()) )
    cv2train.append( cost(ravensData[train]['ravenWinNum'], glm2.predict()) )
    # Out-of-sample error on the held-out fold.
    cv1test.append( cost(ravensData[test]['ravenWinNum'], glm1.predict(ravensData[test])) )
    cv2test.append( cost(ravensData[test]['ravenWinNum'], glm2.predict(ravensData[test])) )
# (train error, test error) per fold for the logistic model.
zip( cv1train, cv1test )
[(0.42857142857142855, 0.16666666666666666), (0.2857142857142857, 0.16666666666666666), (0.25, 0.5)]
# Mean train/test misclassification for the Gaussian-family model.
np.mean(cv2train), np.mean(cv2test)
(0.29365079365079361, 0.27777777777777773)