import pandas as pd
import numpy as np
import pandas.rpy.common as com   # pandas.rpy needs an older pandas built against rpy2

# R's Old Faithful data: eruption duration and waiting time for 272 eruptions
faithful = com.load_data('faithful')
faithful.shape

# draw the training sample in R (via rmagic) so the split follows set.seed(333)
%load_ext rmagic

%%R -o trainSamples
set.seed(333)
trainSamples <- sample(1:272, size=(272/2), replace=F)

# split into training and test halves (136 rows each);
# .ix and the "-" set difference on an Index are old-pandas idioms
trainFaith = faithful.ix[trainSamples, :]
testFaith = faithful.ix[faithful.index - trainSamples, :]
trainFaith.head()

from statsmodels.formula.api import ols

# linear model: eruption duration as a function of waiting time, fit on the training set
lm1 = ols('eruptions ~ waiting', trainFaith).fit()
lm1.summary()

# training data with the fitted line (plotting assumes %pylab inline, so
# plot, xlabel, subplots, ... live in the notebook namespace)
plot(trainFaith['waiting'], trainFaith['eruptions'], 'o')
plot(trainFaith['waiting'], lm1.fittedvalues, 'k', linewidth=3)
xlabel('Waiting')
ylabel('Duration');

# predicted duration for an 80-minute wait, by hand and via predict()
print(lm1.params['Intercept'] + lm1.params['waiting'] * 80)
print(lm1.predict({'waiting': [80]}))

# fitted line over the training set (left) and the held-out test set (right)
f, (ax1, ax2) = subplots(ncols=2)
ax1.plot(trainFaith['waiting'], trainFaith['eruptions'], 'o')
ax1.plot(trainFaith['waiting'], lm1.predict({'waiting': trainFaith['waiting'].values}), 'k')
ax1.set_xlabel('Waiting')
ax1.set_ylabel('Duration')
ax2.plot(testFaith['waiting'], testFaith['eruptions'], 'o')
ax2.plot(testFaith['waiting'], lm1.predict({'waiting': testFaith['waiting'].values}), 'k')
ax2.set_xlabel('Waiting')
ax2.set_ylabel('Duration')
f.set_size_inches(8, 3)
f.tight_layout();

# in-sample error: root of the summed squared residuals on the training set
print(np.sqrt(np.sum((lm1.fittedvalues - trainFaith['eruptions']) ** 2)))
# the same error on the test set (both halves have 136 rows, so the two are comparable);
# predict() gives point predictions only (see the closing note below for a prediction interval)
print(np.sqrt(np.sum((lm1.predict({'waiting': testFaith['waiting'].values}) - testFaith['eruptions']) ** 2)))

# logistic regression: did the Ravens win, as a function of the points they scored?
ravensData = pd.read_csv('../data/ravensData.csv')
ravensData.head(6)

from statsmodels.formula.api import glm
from statsmodels.api import families

glm1 = glm('ravenWinNum ~ ravenScore', ravensData, family=families.Binomial()).fit()

# predicted win probabilities, grouped by the actual outcome
df = pd.DataFrame({'pred': glm1.predict(),
                   'ravenWinNum': ravensData['ravenWinNum'].values})
df.boxplot(by='ravenWinNum');

# number of misclassified games as a function of the probability cutoff
xx = np.linspace(0, 1, num=10)
err = [sum((df['pred'] > x) != df['ravenWinNum']) for x in xx]
plot(xx, err, 'ok')
xlabel('Cutoff')
ylabel('Error');

# cost: misclassification rate at a 0.5 cutoff (win is 0/1, pred is a probability)
def cost(win, pred=0):
    return np.mean(np.abs(win - pred) > 0.5)

# 3-fold cross-validation comparing a binomial and a gaussian GLM;
# sklearn.cross_validation is the old scikit-learn module, and
# indices=False makes its KFold yield boolean masks
from sklearn.cross_validation import KFold

kf = KFold(len(ravensData), n_folds=3, indices=False)
cv1train = []
cv1test = []
cv2train = []
cv2test = []
for train, test in kf:
    glm1 = glm('ravenWinNum ~ ravenScore', ravensData[train], family=families.Binomial()).fit()
    glm2 = glm('ravenWinNum ~ ravenScore', ravensData[train], family=families.Gaussian()).fit()
    cv1train.append(cost(ravensData[train]['ravenWinNum'], glm1.predict()))
    cv2train.append(cost(ravensData[train]['ravenWinNum'], glm2.predict()))
    cv1test.append(cost(ravensData[test]['ravenWinNum'], glm1.predict(ravensData[test])))
    cv2test.append(cost(ravensData[test]['ravenWinNum'], glm2.predict(ravensData[test])))

# per-fold (train, test) error for the binomial model, then mean errors for the gaussian one
zip(cv1train, cv1test)
np.mean(cv2train), np.mean(cv2test)
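A closing note on the prediction-interval comment above: for an OLS fit, one option in statsmodels is wls_prediction_std, which returns pointwise prediction standard errors and interval bounds. A minimal sketch, evaluated at the training points used to fit lm1 (95% interval by default):

from statsmodels.sandbox.regression.predstd import wls_prediction_std

# prediction standard error plus lower/upper 95% prediction bounds,
# one value per training observation
prstd, iv_lower, iv_upper = wls_prediction_std(lm1)

# overlay the interval on the training scatter (sort by waiting time so the
# dashed bounds draw as lines rather than a scribble)
order = np.argsort(trainFaith['waiting'].values)
plot(trainFaith['waiting'], trainFaith['eruptions'], 'o')
plot(trainFaith['waiting'].values[order], iv_lower[order], 'r--')
plot(trainFaith['waiting'].values[order], iv_upper[order], 'r--');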
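If you would rather avoid the rmagic / pandas.rpy dependency, the data set and the train/test split can be built in pure Python. A rough sketch; note that numpy's random stream is not R's, so set.seed(333) is not reproduced and the chosen rows will differ:

import numpy as np
import statsmodels.api as sm

# fetch R's 'faithful' data from the online Rdatasets repository (needs network access)
faithful = sm.datasets.get_rdataset('faithful').data

# put half of the 272 rows in the training set, the rest in the test set
rng = np.random.RandomState(333)
train_idx = rng.choice(faithful.index, size=len(faithful) // 2, replace=False)
trainFaith = faithful.loc[train_idx]
testFaith = faithful.drop(train_idx)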
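Similarly, sklearn.cross_validation has since been replaced by sklearn.model_selection, where KFold yields integer index arrays instead of boolean masks. A sketch of the same cross-validation loop against the newer API, reusing glm, families and cost from above:

from sklearn.model_selection import KFold

kf = KFold(n_splits=3)
for train_idx, test_idx in kf.split(ravensData):
    train_df = ravensData.iloc[train_idx]
    test_df = ravensData.iloc[test_idx]
    fit = glm('ravenWinNum ~ ravenScore', train_df, family=families.Binomial()).fit()
    # in-sample and held-out misclassification rates for this fold
    print(cost(train_df['ravenWinNum'], fit.predict()),
          cost(test_df['ravenWinNum'], fit.predict(test_df)))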