Importing necessary libraries.
%pylab inline
from collections import defaultdict
import cPickle as pickle
import math
import numpy as np
import pandas as pd
import pylab
import seaborn
Populating the interactive namespace from numpy and matplotlib
colors = seaborn.color_palette()
from matplotlib import rc
rc("figure", facecolor="white")
rc("axes", facecolor="white")
rc("axes", edgecolor="grey")
rc("grid", alpha=0.9)
rc("grid", linewidth=0.2)
rc("grid", linestyle=":")
rc('font',**{'family':'serif','serif':['Palatino']})
def side_by_side(*objs, **kwds):
from pandas.core.common import adjoin
space = kwds.get('space', 4)
reprs = [repr(obj).split('\n') for obj in objs]
print adjoin(space, *reprs)
# Creating a dictionary containing team name keyed by its abbreviation.
team_dict = {'ARI': 'Arizona Cardinals',
'ATL': 'Atlanta Falcons',
'BAL': 'Baltimore Ravens',
'BUF': 'Buffalo Bills',
'CAR': 'Carolina Panthers',
'CHI': 'Chicago Bears',
'CIN': 'Cincinnati Bengals',
'CLE': 'Cleveland Browns',
'DAL': 'Dallas Cowboys',
'DEN': 'Denver Broncos',
'DET': 'Detroit Lions',
'GB' : 'Green Bay Packers',
'HOU': 'Houston Texans',
'IND': 'Indianapolis Colts',
'JAC': 'Jacksonville Jaguars',
'KC' : 'Kansas City Chiefs',
'MIA': 'Miami Dolphins',
'MIN': 'Minnesota Vikings',
'NE' : 'New England Patriots',
'NO' : 'New Orleans Saints',
'NYG': 'New York Giants',
'NYJ': 'New York Jets',
'OAK': 'Oakland Raiders',
'PHI': 'Philadelphia Eagles',
'PIT': 'Pittsburgh Steelers',
'SD' : 'San Diego Chargers',
'SEA': 'Seattle Seahawks',
'SF' : 'San Francisco 49ers',
'STL': 'St. Louis Rams',
'TB' : 'Tampa Bay Buccaneers',
'TEN': 'Tennessee Titans',
'WAS': 'Washington Redskins'}
def get_team_name(abbr):
return team_dict[abbr]
trainDF = pd.read_csv('trainNFL.csv', index_col=0)
testDF = pd.read_csv('testNFL.csv', index_col=0)
trainDF = trainDF[['game_id','year','week','home_team','away_team','home_score','away_score','visitor_point_spread','over_or_under','winner']]
trainDF = trainDF.sort(column='game_id').reset_index(drop=True)
testDF = testDF.sort(column='game_id').reset_index(drop=True)
/Library/Python/2.7/site-packages/pandas-0.13.1_213_gc174c3d-py2.7-macosx-10.9-intel.egg/pandas/core/frame.py:2542: FutureWarning: column is deprecated, use columns warnings.warn("column is deprecated, use columns", FutureWarning)
trainDF.head()
game_id | year | week | home_team | away_team | home_score | away_score | visitor_point_spread | over_or_under | winner | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2000 | 1 | ATL | SF | 36 | 28 | 7.0 | 42.5 | ATL |
1 | 2 | 2000 | 1 | CLE | JAC | 7 | 27 | -10.0 | 38.0 | JAC |
2 | 3 | 2000 | 1 | DAL | PHI | 14 | 41 | 6.0 | 40.0 | PHI |
3 | 4 | 2000 | 1 | GB | NYJ | 16 | 20 | 2.5 | 36.0 | NYJ |
4 | 9 | 2000 | 1 | NO | DET | 10 | 14 | 1.0 | 39.5 | DET |
5 rows × 10 columns
trdf = trainDF
def get_historic_games(team, gameId, trdf):
df = trdf[trdf.game_id < gameId]
hdf = df[df.home_team == team]
adf = df[df.away_team == team]
newdf = pd.concat([hdf, adf])
newdf = newdf.sort_index()
try:
return pd.concat([hdf, adf])
except:
return False
def get_points(team, df):
pFor = []
pAgainst = []
for r, v in df.iterrows():
hTeam = v.home_team
aTeam = v.away_team
hScore = v.home_score
aScore = v.away_score
if (team == hTeam):
pFor.append(hScore)
pAgainst.append(aScore)
elif (team == aTeam):
pFor.append(aScore)
pAgainst.append(hScore)
return (sum(pFor)/len(df), sum(pAgainst)/len(df))
for i in xrange(1, 50):
lst = []
for r, v in trdf.iterrows():
gameId = v.game_id
home = v.home_team
away = v.away_team
homehGames = get_historic_games(home, gameId, trdf)
awayhGames = get_historic_games(away, gameId, trdf)
if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
homehGames = homehGames[-i:]
awayhGames = awayhGames[-i:]
hFor, hAgainst = get_points(home, homehGames)
aFor, aAgainst = get_points(away, awayhGames)
else:
hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
else:
hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
lst.append(pd.DataFrame(v).T)
trdfn = pd.concat(lst)
csv_name = "baseline2/data_iter_"+str(i)+".csv"
trdfn.to_csv(csv_name)
def get_diff(hFor, hAgainst, aFor, aAgainst):
if sum([hFor, hAgainst, aFor, aAgainst]) > 0:
return (hFor + aAgainst)/2 - (aFor + hAgainst)/2 + 3
else:
return 3
def get_predicted_winner(diff_xy, home_team, away_team):
if diff_xy >= 0:
return home_team
else:
return away_team
def is_prediction_correct(winner, predicted_winner):
if winner == predicted_winner:
return True
else:
return False
dfs = []
for i in xrange(1, 50):
csv_name = "baseline2/data_iter_"+str(i)+".csv"
trdfn = pd.read_csv(csv_name, index_col=0)
years = set(trdfn.year)
lst = []
for year in years:
trdfny = trdfn[trdfn.year == year]
trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
ans = pd.concat(dfs)
tmp = ans.T.mean()
tmp = ans.T.mean()
pd.DataFrame(tmp, columns=[1])
1 | |
---|---|
1 | 55.911578 |
2 | 56.487615 |
3 | 57.393433 |
4 | 58.688076 |
5 | 58.873848 |
6 | 58.654954 |
7 | 58.964574 |
8 | 59.229551 |
9 | 59.362039 |
10 | 60.037442 |
11 | 59.680300 |
12 | 59.542051 |
13 | 59.949597 |
14 | 59.998560 |
15 | 61.019585 |
16 | 60.439228 |
17 | 59.805588 |
18 | 60.033122 |
19 | 59.226671 |
20 | 59.269873 |
21 | 59.314516 |
22 | 59.135945 |
23 | 58.912730 |
24 | 59.091302 |
25 | 58.912730 |
26 | 58.689516 |
27 | 58.957373 |
28 | 58.823445 |
29 | 58.689516 |
30 | 58.510945 |
31 | 58.466302 |
32 | 58.421659 |
33 | 57.841302 |
34 | 58.153802 |
35 | 58.243088 |
36 | 58.243088 |
37 | 58.019873 |
38 | 58.198445 |
39 | 57.707373 |
40 | 57.528802 |
41 | 57.618088 |
42 | 57.618088 |
43 | 57.662730 |
44 | 57.618088 |
45 | 57.618088 |
46 | 57.841302 |
47 | 58.064516 |
48 | 58.153802 |
49 | 57.930588 |
49 rows × 1 columns
tmp.plot(label="Baseline 2")
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accurcay')
plt.ylim([0, 65])
plt.legend()
plt.show()
/Library/Python/2.7/site-packages/matplotlib-1.4.x-py2.7-macosx-10.9-intel.egg/matplotlib/font_manager.py:1240: UserWarning: findfont: Font family [u'serif'] not found. Falling back to Bitstream Vera Sans (prop.get_family(), self.defaultFamily[fontext]))
tsdf = testDF
def get_train_games(team, gameId, trdf):
df = trdf[trdf.game_id < gameId]
hdf = df[df.home_team == team]
adf = df[df.away_team == team]
newdf = pd.concat([hdf, adf])
newdf = newdf.sort_index()
try:
return pd.concat([hdf, adf])
except:
return False
def get_points(team, df):
pFor = []
pAgainst = []
for r, v in df.iterrows():
hTeam = v.home_team
aTeam = v.away_team
hScore = v.home_score
aScore = v.away_score
if (team == hTeam):
pFor.append(hScore)
pAgainst.append(aScore)
elif (team == aTeam):
pFor.append(aScore)
pAgainst.append(hScore)
return (sum(pFor)/len(df), sum(pAgainst)/len(df))
i = 15
lst = []
for r, v in tsdf.iterrows():
gameId = v.game_id
home = v.home_team
away = v.away_team
homehGames = get_train_games(home, gameId, trdf)
awayhGames = get_train_games(away, gameId, trdf)
if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
homehGames = homehGames[-i:]
awayhGames = awayhGames[-i:]
hFor, hAgainst = get_points(home, homehGames)
aFor, aAgainst = get_points(away, awayhGames)
else:
hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
else:
hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
lst.append(pd.DataFrame(v).T)
trdfn = pd.concat(lst)
csv_name = "baseline2/test_data_iter_"+str(i)+".csv"
trdfn.to_csv(csv_name)
dfs = []
csv_name = "baseline2/test_data_iter_"+str(i)+".csv"
tsdfn = pd.read_csv(csv_name, index_col=0)
years = set(tsdfn.year)
for year in years:
tsdfny = tsdfn[tsdfn.year == year]
tsdfny['diff_xy'] = tsdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
tsdfny['predicted_winner'] = tsdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
tsdfny['correct_prediction'] = tsdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
accuracy = float(sum(tsdfny.correct_prediction))*100/len(tsdfny.index)
dfs.append(pd.DataFrame([accuracy], index=[year], columns=["accuracy"]))
ans = pd.concat(dfs)
ans
accuracy | |
---|---|
2000 | 52.884615 |
2001 | 64.423077 |
2002 | 63.551402 |
2003 | 60.747664 |
2004 | 60.747664 |
2005 | 65.420561 |
2006 | 59.813084 |
2007 | 64.485981 |
2008 | 58.878505 |
2009 | 68.224299 |
2010 | 55.140187 |
2011 | 61.682243 |
2012 | 61.682243 |
2013 | 62.616822 |
14 rows × 1 columns
float(mean(ans))
61.44988189380713
trdf_alpha = trainDF
def get_historic_games(team, gameId, trdf):
df = trdf[trdf.game_id < gameId]
hdf = df[df.home_team == team]
adf = df[df.away_team == team]
newdf = pd.concat([hdf, adf])
newdf = newdf.sort_index()
try:
return pd.concat([hdf, adf])
except:
return False
def get_points(team, df, alpha):
pFor = []
pAgainst = []
for r, v in df.iterrows():
hTeam = v.home_team
aTeam = v.away_team
hScore = v.home_score
aScore = v.away_score
if (team == hTeam):
pFor.append(hScore)
pAgainst.append(aScore)
elif (team == aTeam):
pFor.append(aScore)
pAgainst.append(hScore)
numGames = len(pFor)
weights = []
for i in xrange(1, numGames+1):
weights.append(alpha**i)
return (int(round(np.dot(pFor, weights)/float(sum(weights)))), int(round(np.dot(pAgainst, weights)/float(sum(weights)))))
for alpha in np.arange(1, 1.01, 0.1):
for i in xrange(1, 50):
lst = []
for r, v in trdf_alpha.iterrows():
gameId = v.game_id
home = v.home_team
away = v.away_team
homehGames = get_historic_games(home, gameId, trdf_alpha)
awayhGames = get_historic_games(away, gameId, trdf_alpha)
if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
homehGames = homehGames[-i:]
awayhGames = awayhGames[-i:]
hFor, hAgainst = get_points(home, homehGames, alpha)
aFor, aAgainst = get_points(away, awayhGames, alpha)
else:
hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
else:
hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
lst.append(pd.DataFrame(v).T)
trdf_alpha_n = pd.concat(lst)
csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
trdf_alpha_n.to_csv(csv_name)
def get_diff(hFor, hAgainst, aFor, aAgainst):
if sum([hFor, hAgainst, aFor, aAgainst]) > 0:
return (hFor + aAgainst)/2 - (aFor + hAgainst)/2 + 3
else:
return 3
def get_predicted_winner(diff_xy, home_team, away_team):
if diff_xy >= 0:
return home_team
else:
return away_team
def is_prediction_correct(winner, predicted_winner):
if winner == predicted_winner:
return True
else:
return False
adfs = []
for alpha in np.arange(0.5, 1.01, 0.1):
dfs = []
for i in xrange(1, 50):
csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
trdfn = pd.read_csv(csv_name, index_col=0)
years = set(trdfn.year)
lst = []
for year in years:
trdfny = trdfn[trdfn.year == year]
trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
tmp = pd.concat(dfs)
tmp = tmp.T.mean()
adfs.append(pd.DataFrame(tmp, columns=[alpha]).T)
ans = pd.concat(adfs).T
ans.T
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0.5 | 55.911578 | 56.445853 | 56.189516 | 58.660714 | 57.353111 | 55.387385 | 56.984447 | 57.260945 | 56.810196 | 57.896025 | 57.936348 | 55.697005 | 58.198445 | 56.419931 | 57.932028 | 57.347350 | 56.722350 | 55.432028 | 54.717742 | 56.368088 | ... |
0.6 | 55.911578 | 56.267281 | 56.275922 | 58.464862 | 57.534562 | 56.281682 | 57.926267 | 57.841302 | 57.165899 | 58.794643 | 58.610311 | 57.399194 | 58.109159 | 57.226382 | 58.424539 | 57.883065 | 56.723790 | 56.680588 | 55.119528 | 56.189516 | ... |
0.7 | 55.911578 | 56.182316 | 57.214862 | 59.095622 | 57.756336 | 57.226382 | 58.202765 | 57.930588 | 57.747696 | 59.863191 | 59.235311 | 58.028514 | 58.379896 | 58.882488 | 59.144585 | 58.286290 | 57.884505 | 57.348790 | 56.682028 | 56.546659 | ... |
0.8 | 55.911578 | 56.408410 | 56.900922 | 59.183468 | 58.472062 | 57.682892 | 58.510945 | 58.283410 | 58.063076 | 60.262097 | 59.500288 | 58.873848 | 59.408122 | 59.194988 | 59.459965 | 58.423099 | 58.555588 | 57.707373 | 57.887385 | 57.752016 | ... |
0.9 | 55.911578 | 56.358007 | 57.802419 | 59.000576 | 58.922811 | 58.034274 | 59.146025 | 58.696717 | 59.279954 | 60.437788 | 59.229551 | 59.052419 | 60.260657 | 59.955357 | 60.574597 | 59.411002 | 59.848790 | 59.180588 | 59.182028 | 58.957373 | ... |
1.0 | 55.911578 | 56.538018 | 57.173099 | 58.417339 | 58.742800 | 58.568548 | 58.738479 | 59.236751 | 59.274194 | 60.030242 | 59.369240 | 59.766705 | 60.125288 | 60.220334 | 60.708525 | 60.303859 | 59.940956 | 59.675979 | 59.583813 | 59.314516 | ... |
6 rows × 49 columns
ans.plot()
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accurcay')
plt.ylim([0, 63])
plt.show()
ans.plot()
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accurcay')
plt.ylim([52, 62])
plt.show()
# Plotting just the max values for all alpha values
ans.max().plot()
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Alpha values')
plt.ylabel('Prediction accurcay')
plt.ylim([0, 65])
plt.show()
tsdf_alpha = trainDF
def get_train_games(team, gameId, trdf):
df = trdf[trdf.game_id < gameId]
hdf = df[df.home_team == team]
adf = df[df.away_team == team]
newdf = pd.concat([hdf, adf])
newdf = newdf.sort_index()
try:
return pd.concat([hdf, adf])
except:
return False
def get_points(team, df, alpha):
pFor = []
pAgainst = []
for r, v in df.iterrows():
hTeam = v.home_team
aTeam = v.away_team
hScore = v.home_score
aScore = v.away_score
if (team == hTeam):
pFor.append(hScore)
pAgainst.append(aScore)
elif (team == aTeam):
pFor.append(aScore)
pAgainst.append(hScore)
numGames = len(pFor)
weights = []
for i in xrange(1, numGames+1):
weights.append(alpha**i)
return (int(round(np.dot(pFor, weights)/float(sum(weights)))), int(round(np.dot(pAgainst, weights)/float(sum(weights)))))
i = 15
alpha = 1.0
lst = []
for r, v in tsdf_alpha.iterrows():
gameId = v.game_id
home = v.home_team
away = v.away_team
homehGames = get_historic_games(home, gameId, tsdf_alpha)
awayhGames = get_historic_games(away, gameId, tsdf_alpha)
if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
homehGames = homehGames[-i:]
awayhGames = awayhGames[-i:]
hFor, hAgainst = get_points(home, homehGames, alpha)
aFor, aAgainst = get_points(away, awayhGames, alpha)
else:
hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
else:
hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
lst.append(pd.DataFrame(v).T)
tsdf_alpha_n = pd.concat(lst)
csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
tsdf_alpha_n.to_csv(csv_name)
dfs = []
csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
tsdfn = pd.read_csv(csv_name, index_col=0)
years = set(tsdfn.year)
for year in years:
tsdfny = tsdfn[tsdfn.year == year]
tsdfny['diff_xy'] = tsdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
tsdfny['predicted_winner'] = tsdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
tsdfny['correct_prediction'] = tsdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
accuracy = float(sum(tsdfny.correct_prediction))*100/len(tsdfny.index)
dfs.append(pd.DataFrame([accuracy], index=[year], columns=["accuracy"]))
ans = pd.concat(dfs)
ans
accuracy | |
---|---|
2000 | 58.709677 |
2001 | 58.709677 |
2002 | 61.875000 |
2003 | 64.375000 |
2004 | 60.000000 |
2005 | 61.875000 |
2006 | 58.125000 |
2007 | 62.500000 |
2008 | 58.125000 |
2009 | 65.000000 |
2010 | 63.750000 |
2011 | 59.375000 |
2012 | 59.375000 |
2013 | 58.125000 |
14 rows × 1 columns
float(mean(ans))
60.70852534562213