%pylab inline
Populating the interactive namespace from numpy and matplotlib
from collections import defaultdict
import cPickle as pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import seaborn
import statsmodels.api as sm
from matplotlib import rc
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
colors = seaborn.color_palette()
from matplotlib import rc
rc("figure", facecolor="white")
rc("axes", facecolor="white")
rc("axes", edgecolor="grey")
rc("grid", alpha=0.9)
rc("grid", linewidth=0.2)
rc("grid", linestyle=":")
rc('font',**{'family':'serif','serif':['Palatino']})
# Load the pre-built train/test splits; column 0 of each CSV is the row index.
trainDF = pd.read_csv('trainNFL.csv', index_col=0)
testDF = pd.read_csv('testNFL.csv', index_col=0)
# Regression target: positive means the home team won by that many points.
trainDF['score_diff'] = trainDF.home_score - trainDF.away_score
testDF['score_diff'] = testDF.home_score - testDF.away_score
# sort_values(by=...) replaces the removed DataFrame.sort(column=...) call
# (the one that triggered the FutureWarning below); games stay in
# chronological order by game_id with a fresh 0..n-1 index.
trainDF = trainDF.sort_values(by='game_id').reset_index(drop=True)
testDF = testDF.sort_values(by='game_id').reset_index(drop=True)
/Library/Python/2.7/site-packages/pandas-0.13.1_213_gc174c3d-py2.7-macosx-10.9-intel.egg/pandas/core/frame.py:2542: FutureWarning: column is deprecated, use columns warnings.warn("column is deprecated, use columns", FutureWarning)
def get_correct(score_diff, predicted_score):
    """Return 1 when the predicted margin picks the actual winner, else 0.

    A prediction counts as correct only when both margins are non-zero and
    share the same sign; a tie or a zero prediction scores 0.
    """
    return 1 if predicted_score * score_diff > 0 else 0
def check_accuracy(allStats):
    """Summarise prediction accuracy per year.

    Parameters
    ----------
    allStats : dict
        Maps year -> DataFrame carrying a 0/1 ``correct_prediction`` column.

    Returns
    -------
    dict mapping year -> [num_games, num_correct, pct_correct].
    """
    accuracy = dict()
    # .items() instead of the Python-2-only .iteritems() so this also runs on Python 3.
    for year, df in allStats.items():
        numGames = len(df.correct_prediction)
        numCorrect = sum(df.correct_prediction)
        pctCorrect = numCorrect * 100 / float(numGames)
        accuracy[year] = [numGames, numCorrect, pctCorrect]
    return accuracy
def initialize(data):
    """Fit a normalized OLS regression of ``score_diff`` on the module-level
    ``features`` columns of ``data`` and return the fitted model.
    """
    # .loc replaces the long-deprecated (and since removed) DataFrame.ix indexer.
    X = data.loc[:, features]
    y = data.loc[:, 'score_diff']
    # NOTE(review): the `normalize=` kwarg was removed from sklearn >= 1.2;
    # wrap X in a StandardScaler pipeline if running on a modern sklearn.
    reg = LinearRegression(fit_intercept=True, normalize=True)
    model = reg.fit(X, y)
    return model
def get_games_till_now(game, df):
    """Return every row of ``df`` played strictly before ``game`` (by game_id)."""
    cutoff = int(game.game_id)
    return df[df.game_id < cutoff]
def get_game_features(g, gamesTillNow):
    """Average the ``features`` columns over the last 15 prior games in which
    either team of matchup ``g`` played.

    Parameters
    ----------
    g : row with ``home_team`` / ``away_team`` attributes.
    gamesTillNow : DataFrame of games played before ``g``.

    Returns
    -------
    (numGames, average) where ``average`` is a one-row DataFrame of rounded
    per-feature means over at most the 15 most recent relevant games.
    """
    teams = [g.home_team, g.away_team]
    # Vectorized filter replaces the original per-row iterrows() loop (which
    # also coerced every row to object dtype via DataFrame(game).T): keep any
    # game in which either side of the upcoming matchup participated.
    mask = gamesTillNow.home_team.isin(teams) | gamesTillNow.away_team.isin(teams)
    recent = gamesTillNow[mask].tail(15)
    numGames = len(recent)
    # Rounded column means; with no relevant history this yields a NaN row
    # instead of the ValueError the concat-based version raised.
    average = pd.DataFrame(recent[features].mean().round()).T
    return (numGames, average[features])
def get_updated_data(game, df, g):
    """Return the most recent ``g`` rows of ``df`` up to and including ``game``."""
    cutoff = int(game.game_id)
    window = df[df.game_id <= cutoff]
    return window.iloc[-g:]
def train_updated_model(data):
    """Fit a normalized OLS regression of ``score_diff`` on ``features``.

    This was a line-for-line duplicate of ``initialize``; delegate to it so
    there is a single definition of the model-fitting recipe.
    """
    return initialize(data)
def run_model(model, gameFeatures):
    """Predict the rounded integer score differential for one feature row.

    Positive = home team predicted to win by that margin.
    """
    # Extract the scalar from the 1-element prediction array explicitly:
    # int() applied to an ndarray is deprecated (now an error) in recent NumPy.
    prediction = model.predict(gameFeatures[features])[0]
    # np.round keeps the original (half-to-even) rounding behaviour.
    return int(np.round(prediction))
# Per-team box-score stat codes; each appears once with a '_home' and once
# with an '_away' suffix in `features`.  Rebuilding the list programmatically
# fixes the original source line, which had been wrapped mid-token
# ('RZA_away','R / ZC_away') and was therefore a syntax error, and removes
# the duplicated home/away transcription.
_TEAM_STATS = [
    '1QP', '2QP', '3QP', '4QP', 'RFD', 'PFD', 'IFD', 'RY', 'RA', 'PY', 'PA',
    'PC', 'SK', 'INT', 'FUM', 'PU', 'GPY', 'PR', 'PRY', 'KR', 'KRY', 'IR',
    'IRY', 'PEN', 'TOP', 'TD', 'TDR', 'TDP', 'TDT', 'FGM', 'FGAT', 'FGY',
    'RZA', 'RZC', 'BRY', 'BPY', 'SRP', 'S1RP', 'S2RP', 'S3RP', 'SPP', 'S1PP',
    'S2PP', 'S3PP', 'LEA', 'LEY', 'LTA', 'LTY', 'LGA', 'LGY', 'MDA', 'MDY',
    'RGA', 'RGY', 'RTA', 'RTY', 'REA', 'REY', 'R1A', 'R1Y', 'R2A', 'R2Y',
    'R3A', 'R3Y', 'QBA', 'QBY', 'SLA', 'SLY', 'SMA', 'SMY', 'SRA', 'SRY',
    'DLA', 'DLY', 'DMA', 'DMY', 'DRA', 'DRY', 'WR1A', 'WR1Y', 'WR3A', 'WR3Y',
    'TEA', 'TEY', 'RBA', 'RBY', 'SGA', 'SGY', 'P1A', 'P1Y', 'P2A', 'P2Y',
    'P3A', 'P3Y', 'SPC', 'MPC', 'LPC', 'Q1RA', 'Q1RY', 'Q1PA', 'Q1PY',
    'LCRA', 'LCRY', 'LCPA', 'LCPY', 'RZRA', 'RZRY', 'RZPA', 'RZPY', 'SKY',
    'LBS', 'DBS', 'SFPY', 'DRV', 'NPY', 'TB', 'I20', 'RTD', 'LNR', 'LNP',
    'LBR', 'LBP', 'DBR', 'DBP', 'NHA', 'S3A', 'S3C', 'L3A', 'L3C', 'STF',
    'DP', 'FSP', 'OHP', 'PBEP', 'DLP', 'DSP', 'DMP', 'PFN',
]

# Game-level context columns that are not team stats.
# NOTE(review): 'home_score' and 'away_score' are included as features while
# the target is score_diff = home_score - away_score — this target leakage is
# what produces the ~100% accuracies in the first result tables. Confirm
# whether that is intentional before trusting those numbers.
_GAME_CONTEXT = ['temperature', 'humidity', 'wind_speed', 'over_or_under',
                 'visitor_point_spread', 'away_score', 'home_score']

features = (_GAME_CONTEXT
            + [s + '_home' for s in _TEAM_STATS]
            + [s + '_away' for s in _TEAM_STATS])
# (display name, estimator) pairs keep each model permanently aligned with
# its label; the two parallel lists below are derived from this single table.
_model_specs = [
    ('Linear Regression', LinearRegression(fit_intercept=True)),
    ('Linear Regression Normalized', LinearRegression(fit_intercept=True, normalize=True)),
    ('Ridge Regression', Ridge(fit_intercept=True)),
    ('Ridge Regression Normalized', Ridge(fit_intercept=True, normalize=True)),
    ('Lasso Regression', Lasso(fit_intercept=True)),
    ('Lasso Regression Normalized', Lasso(fit_intercept=True, normalize=True)),
]
clfs = [estimator for _, estimator in _model_specs]
clf_names = [name for name, _ in _model_specs]
# Fit each candidate regressor on seasons 2001+ and tabulate its per-year
# accuracy on the SAME data it was trained on (in-sample sanity check).
train_results = []
models = {}
for (i, clf_) in enumerate(clfs):
    # .copy() so the prediction columns added below don't write into a view
    # of trainDF (pandas SettingWithCopy ambiguity).
    data = trainDF[trainDF.year >= 2001].copy()
    # .loc replaces the removed DataFrame.ix indexer.
    X = data.loc[:, features]
    y = data.loc[:, 'score_diff']
    clf = clf_.fit(X, y)
    models[clf_names[i]] = clf
    predicted_train = clf.predict(X)
    p = [int(round(val)) for val in predicted_train]
    data['predicted_score_diff'] = pd.Series(p, index=data.index)
    data['correct_prediction'] = data.apply(
        lambda x: get_correct(x['score_diff'], x['predicted_score_diff']), axis=1)
    # np.unique instead of the bare `unique` that only existed via %pylab.
    years = [int(yr) for yr in np.unique(data.year)]
    ansdf = {}
    for yr in years:
        ansdf[yr] = data[data.year == yr]
    accuracy = check_accuracy(ansdf)
    accuracy = pd.DataFrame(accuracy).T
    accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
    accuracy = accuracy[['pct_correct_prediction']]
    accuracy.columns = [clf_names[i]]
    train_results.append(accuracy.T)
train_resultdf = pd.concat(train_results).T
train_resultdf  # notebook echo of the per-year accuracy table
Linear Regression | Linear Regression Normalized | Ridge Regression | Ridge Regression Normalized | Lasso Regression | Lasso Regression Normalized | |
---|---|---|---|---|---|---|
2001 | 100.000 | 100.000 | 100.000 | 94.83871 | 100.000 | 54.193548 |
2002 | 100.000 | 100.000 | 100.000 | 93.75000 | 100.000 | 59.375000 |
2003 | 100.000 | 100.000 | 100.000 | 95.00000 | 100.000 | 61.875000 |
2004 | 100.000 | 100.000 | 100.000 | 95.00000 | 100.000 | 55.000000 |
2005 | 100.000 | 100.000 | 100.000 | 93.12500 | 100.000 | 56.875000 |
2006 | 100.000 | 100.000 | 100.000 | 93.75000 | 100.000 | 53.125000 |
2007 | 100.000 | 100.000 | 100.000 | 98.12500 | 100.000 | 54.375000 |
2008 | 100.000 | 100.000 | 100.000 | 94.37500 | 100.000 | 56.250000 |
2009 | 100.000 | 100.000 | 100.000 | 93.75000 | 100.000 | 58.125000 |
2010 | 100.000 | 100.000 | 100.000 | 96.87500 | 100.000 | 57.500000 |
2011 | 100.000 | 100.000 | 100.000 | 92.50000 | 100.000 | 54.375000 |
2012 | 100.000 | 100.000 | 100.000 | 92.50000 | 100.000 | 52.500000 |
2013 | 99.375 | 99.375 | 99.375 | 91.87500 | 99.375 | 57.500000 |
13 rows × 6 columns
# Bar chart of per-year in-sample accuracy, one bar group per season.
# `plt` is matplotlib.pyplot (injected by %pylab in the original notebook).
train_resultdf.plot(kind='bar', figsize=(17, 8))
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Years')
plt.ylabel('Prediction accuracy')  # fixed typo: was "accurcay"
plt.ylim([0, 100])
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True, shadow=True, ncol=3)
plt.show()
/Library/Python/2.7/site-packages/matplotlib-1.4.x-py2.7-macosx-10.9-intel.egg/matplotlib/font_manager.py:1240: UserWarning: findfont: Font family [u'serif'] not found. Falling back to Bitstream Vera Sans (prop.get_family(), self.defaultFamily[fontext]))
# Score every trained model on the held-out games (seasons 2001+) using the
# true per-game feature rows, and tabulate per-year accuracy.
test_results = []
# .items() instead of the Python-2-only .iteritems().
for model_name, model in models.items():
    # .copy() so the column inserts below don't write into a view of testDF.
    data_test = testDF[testDF.year > 2000].copy()
    X_test = data_test.loc[:, features]  # .loc replaces the removed .ix
    predicted_test = model.predict(X_test)
    pt = [int(round(val)) for val in predicted_test]
    data_test['predicted_score_diff'] = pd.Series(pt, index=data_test.index)
    data_test['correct_prediction'] = data_test.apply(
        lambda x: get_correct(x['score_diff'], x['predicted_score_diff']), axis=1)
    # np.unique instead of the bare `unique` that only existed via %pylab.
    years = [int(yr) for yr in np.unique(data_test.year)]
    ansdf = {}
    for yr in years:
        ansdf[yr] = data_test[data_test.year == yr]
    accuracy = check_accuracy(ansdf)
    accuracy = pd.DataFrame(accuracy).T
    accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
    accuracy = accuracy[['pct_correct_prediction']]
    accuracy.columns = [model_name]
    test_results.append(accuracy.T)
test_resultdf = pd.concat(test_results).T
test_resultdf  # notebook echo of the per-year accuracy table
Ridge Regression Normalized | Lasso Regression Normalized | Lasso Regression | Ridge Regression | Linear Regression | Linear Regression Normalized | |
---|---|---|---|---|---|---|
2001 | 96.153846 | 57.692308 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2002 | 94.392523 | 57.943925 | 99.065421 | 99.065421 | 99.065421 | 99.065421 |
2003 | 94.392523 | 60.747664 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2004 | 93.457944 | 58.878505 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2005 | 94.392523 | 60.747664 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2006 | 97.196262 | 55.140187 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2007 | 95.327103 | 60.747664 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2008 | 92.523364 | 57.009346 | 99.065421 | 99.065421 | 99.065421 | 99.065421 |
2009 | 93.457944 | 56.074766 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2010 | 96.261682 | 52.336449 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2011 | 97.196262 | 61.682243 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2012 | 92.523364 | 63.551402 | 99.065421 | 99.065421 | 99.065421 | 99.065421 |
2013 | 96.261682 | 62.616822 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
13 rows × 6 columns
# Bar chart of per-year held-out accuracy, one bar group per season.
test_resultdf.plot(kind='bar', figsize=(17, 8))
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Years')
plt.ylabel('Prediction accuracy')  # fixed typo: was "accurcay"
plt.ylim([0, 100])
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True, shadow=True, ncol=3)
plt.show()
def get_game_features(g, gamesTillNow, itr, alpha):
    """Exponentially-weighted average of the ``features`` columns over recent
    games involving either team of matchup ``g``.

    Replaces the earlier flat 15-game average: keeps the last ``itr`` games
    in which either team played, and weights game j (j = 1 for the most
    recent) by alpha**j before averaging, so newer games count more.

    Returns (numGames, one-row DataFrame of rounded weighted means).
    """
    teams = [g.home_team, g.away_team]
    # Vectorized filter replaces the original per-row iterrows() loop.
    mask = gamesTillNow.home_team.isin(teams) | gamesTillNow.away_team.isin(teams)
    recent = gamesTillNow[mask].tail(itr)[features]
    # numGames was computed twice in the original (identical values); once is enough.
    numGames = len(recent.index)
    # weights = [alpha**1, ..., alpha**numGames]; reversed below so the most
    # recent game (last row) receives the largest weight alpha**1.
    weights = alpha ** np.arange(1, numGames + 1)
    average = recent.mul(weights[::-1], axis=0) / float(weights.sum())
    average = pd.DataFrame(average.sum()).T.apply(np.round)
    return (numGames, average[features])
test2_results = []
# History pool for feature averaging: all training games from 2001 on.
# (The original relied on `data` leaking out of the training loop above;
# make the dependency explicit.)
data = trainDF[trainDF.year >= 2001]
# .items() instead of the Python-2-only .iteritems().
for model_name, model in models.items():
    itr = 15      # window: how many recent relevant games to average over
    alpha = 0.9   # exponential decay applied to older games
    mtemp = []
    # .copy() so the column insert below doesn't write into a view of testDF.
    data_test = testDF[testDF.year > 2007].copy()
    # Predict each test game from the weighted average of its teams' history
    # (no peeking at the game's own box score, unlike the previous section).
    for ix, game in data_test.iterrows():
        gamesTillNow = get_games_till_now(game, data)
        numGames, gameFeatures = get_game_features(game, gamesTillNow, itr, alpha)
        gameResult = run_model(model, gameFeatures)
        mtemp.append(gameResult)
    data_test['predicted_score_diff'] = pd.Series(mtemp, index=data_test.index)
    cols = ['year', 'week', 'home_team', 'away_team', 'home_score', 'away_score',
            'winner', 'score_diff', 'predicted_score_diff']
    process = data_test[cols].copy()
    process['error'] = process.score_diff - process.predicted_score_diff
    process['correct_prediction'] = process.apply(
        lambda x: get_correct(x['score_diff'], x['predicted_score_diff']), axis=1)
    # np.unique instead of the bare `unique` that only existed via %pylab.
    years = [int(yr) for yr in np.unique(process.year)]
    ansdf = {}
    for yr in years:
        ansdf[yr] = process[process.year == yr]
    accuracy = check_accuracy(ansdf)
    accuracy = pd.DataFrame(accuracy).T
    accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
    accuracy = accuracy[['pct_correct_prediction']]
    accuracy.columns = [model_name]
    test2_results.append(accuracy.T)
test2_resultdf = pd.concat(test2_results).T
test2_resultdf  # notebook echo of the per-year accuracy table
Ridge Regression Normalized | Lasso Regression Normalized | Lasso Regression | Ridge Regression | Linear Regression | Linear Regression Normalized | |
---|---|---|---|---|---|---|
2008 | 46.728972 | 57.009346 | 44.859813 | 44.859813 | 44.859813 | 44.859813 |
2009 | 46.728972 | 56.074766 | 46.728972 | 46.728972 | 46.728972 | 46.728972 |
2010 | 44.859813 | 52.336449 | 44.859813 | 44.859813 | 44.859813 | 44.859813 |
2011 | 40.186916 | 61.682243 | 42.056075 | 42.056075 | 42.056075 | 42.056075 |
2012 | 53.271028 | 63.551402 | 51.401869 | 51.401869 | 51.401869 | 51.401869 |
2013 | 50.467290 | 62.616822 | 52.336449 | 52.336449 | 52.336449 | 52.336449 |
6 rows × 6 columns
# Bar chart of per-year accuracy when predicting from team-history averages.
test2_resultdf.plot(kind='bar', figsize=(17, 8))
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Years')
plt.ylabel('Prediction accuracy')  # fixed typo: was "accurcay"
plt.ylim([0, 100])
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True, shadow=True, ncol=3)
plt.show()
def get_game_features(g, gamesTillNow, itr, alpha):
    """Exponentially-weighted average of the ``features`` columns over the
    most recent games involving either team of matchup ``g``.

    NOTE(review): this is a byte-identical redefinition of the function
    defined earlier in the file; only this last definition is in effect.

    g : row with home_team / away_team / year attributes.
    gamesTillNow : DataFrame of games played before ``g``.
    itr : window size (number of recent relevant games kept).
    alpha : decay factor (< 1) so newer games weigh more.
    Returns (numGames, one-row DataFrame of rounded weighted means).
    """
    hTeam = g.home_team
    aTeam = g.away_team
    gameYear = int(g.year)  # unused in the body
    games = []
    # Keep every prior game in which either team of this matchup played.
    for row, game in gamesTillNow.iterrows():
        rhTeam = game.home_team
        raTeam = game.away_team
        if ((rhTeam == hTeam) or (rhTeam == aTeam) or (raTeam == hTeam) or (raTeam == aTeam)):
            games.append(pd.DataFrame(game).T)
    games = games[-itr:]  # last `itr` relevant games only
    numGames = len(games)
    games = pd.concat(games)  # raises ValueError if no relevant history exists
    games = games[features]
    numGames = len(games.index)  # recomputed; same value as three lines above
    weights = []
    # weights = [alpha**1, ..., alpha**numGames]
    for i in xrange(1, numGames+1):
        weights.append(alpha**i)
    weights = np.array(weights)
    # Reversed so the most recent game (last row) gets the largest weight
    # alpha**1, then normalised by the weight total before summing rows.
    average = (games.mul(weights[::-1], axis=0)/float(sum(weights)))
    average = pd.DataFrame(average.sum()).T.apply(np.round)
    return (numGames, average[features])
# Repeat the history-average experiment on TRAINING-set games from 2008 on,
# to compare against the held-out numbers in the previous section.
# History pool for feature averaging: all training games from 2001 on.
data = trainDF[trainDF.year >= 2001]
train2_results = []
# .items() instead of the Python-2-only .iteritems().
for model_name, model in models.items():
    itr = 15      # window: how many recent relevant games to average over
    alpha = 0.9   # exponential decay applied to older games
    mtemp = []
    # .copy() so the column insert below doesn't write into a view of trainDF.
    data_train = trainDF[trainDF.year > 2007].copy()
    for ix, game in data_train.iterrows():
        gamesTillNow = get_games_till_now(game, data)
        numGames, gameFeatures = get_game_features(game, gamesTillNow, itr, alpha)
        gameResult = run_model(model, gameFeatures)
        mtemp.append(gameResult)
    data_train['predicted_score_diff'] = pd.Series(mtemp, index=data_train.index)
    cols = ['year', 'week', 'home_team', 'away_team', 'home_score', 'away_score',
            'winner', 'score_diff', 'predicted_score_diff']
    process = data_train[cols].copy()
    process['error'] = process.score_diff - process.predicted_score_diff
    process['correct_prediction'] = process.apply(
        lambda x: get_correct(x['score_diff'], x['predicted_score_diff']), axis=1)
    # np.unique instead of the bare `unique` that only existed via %pylab.
    years = [int(yr) for yr in np.unique(process.year)]
    ansdf = {}
    for yr in years:
        ansdf[yr] = process[process.year == yr]
    accuracy = check_accuracy(ansdf)
    accuracy = pd.DataFrame(accuracy).T
    accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
    accuracy = accuracy[['pct_correct_prediction']]
    accuracy.columns = [model_name]
    train2_results.append(accuracy.T)
train2_resultdf = pd.concat(train2_results).T
train2_resultdf  # notebook echo of the per-year accuracy table
Ridge Regression Normalized | Lasso Regression Normalized | Lasso Regression | Ridge Regression | Linear Regression | Linear Regression Normalized | |
---|---|---|---|---|---|---|
2008 | 47.500 | 56.250 | 49.375 | 49.375 | 49.375 | 49.375 |
2009 | 48.125 | 58.125 | 50.625 | 50.625 | 50.625 | 50.625 |
2010 | 46.250 | 57.500 | 46.875 | 46.875 | 46.875 | 46.875 |
2011 | 50.625 | 54.375 | 51.250 | 51.250 | 51.250 | 51.250 |
2012 | 44.375 | 52.500 | 43.750 | 43.750 | 43.750 | 43.750 |
2013 | 43.750 | 57.500 | 46.875 | 46.875 | 46.875 | 46.875 |
6 rows × 6 columns