NFL Baseline 2 Model¶

Importing necessary libraries.

In [1]:

%pylab inline

from collections import defaultdict

import cPickle as pickle
import math
import numpy as np
import pandas as pd
import pylab
import seaborn

Populating the interactive namespace from numpy and matplotlib

In [2]:

colors = seaborn.color_palette()

from matplotlib import rc

rc("figure", facecolor="white")
rc("axes", facecolor="white")
rc("axes", edgecolor="grey")


rc("grid", alpha=0.9)
rc("grid", linewidth=0.2)
rc("grid", linestyle=":")


rc('font',**{'family':'serif','serif':['Palatino']})

In [3]:

def side_by_side(*objs, **kwds):
    from pandas.core.common import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print adjoin(space, *reprs)

In [4]:

# Creating a dictionary containing team name keyed by its abbreviation.
team_dict = {'ARI': 'Arizona Cardinals',
             'ATL': 'Atlanta Falcons',
             'BAL': 'Baltimore Ravens',
             'BUF': 'Buffalo Bills',
             'CAR': 'Carolina Panthers',
             'CHI': 'Chicago Bears',
             'CIN': 'Cincinnati Bengals',
             'CLE': 'Cleveland Browns',
             'DAL': 'Dallas Cowboys',
             'DEN': 'Denver Broncos',
             'DET': 'Detroit Lions',
             'GB' : 'Green Bay Packers',
             'HOU': 'Houston Texans',
             'IND': 'Indianapolis Colts',
             'JAC': 'Jacksonville Jaguars',
             'KC' : 'Kansas City Chiefs',
             'MIA': 'Miami Dolphins',
             'MIN': 'Minnesota Vikings',
             'NE' : 'New England Patriots',
             'NO' : 'New Orleans Saints',
             'NYG': 'New York Giants',
             'NYJ': 'New York Jets',
             'OAK': 'Oakland Raiders',
             'PHI': 'Philadelphia Eagles',
             'PIT': 'Pittsburgh Steelers',
             'SD' : 'San Diego Chargers',
             'SEA': 'Seattle Seahawks',
             'SF' : 'San Francisco 49ers',
             'STL': 'St. Louis Rams',
             'TB' : 'Tampa Bay Buccaneers',
             'TEN': 'Tennessee Titans',
             'WAS': 'Washington Redskins'}

In [5]:

def get_team_name(abbr):
    return team_dict[abbr]

In [6]:

trainDF = pd.read_csv('trainNFL.csv', index_col=0)
testDF = pd.read_csv('testNFL.csv', index_col=0)

In [7]:

trainDF = trainDF[['game_id','year','week','home_team','away_team','home_score','away_score','visitor_point_spread','over_or_under','winner']]

In [8]:

trainDF = trainDF.sort(column='game_id').reset_index(drop=True)
testDF = testDF.sort(column='game_id').reset_index(drop=True)

/Library/Python/2.7/site-packages/pandas-0.13.1_213_gc174c3d-py2.7-macosx-10.9-intel.egg/pandas/core/frame.py:2542: FutureWarning: column is deprecated, use columns
  warnings.warn("column is deprecated, use columns", FutureWarning)

In [9]:

trainDF.head()

Out[9]:

	game_id	year	week	home_team	away_team	home_score	away_score	visitor_point_spread	over_or_under	winner
0	1	2000	1	ATL	SF	36	28	7.0	42.5	ATL
1	2	2000	1	CLE	JAC	7	27	-10.0	38.0	JAC
2	3	2000	1	DAL	PHI	14	41	6.0	40.0	PHI
3	4	2000	1	GB	NYJ	16	20	2.5	36.0	NYJ
4	9	2000	1	NO	DET	10	14	1.0	39.5	DET

5 rows × 10 columns

Train Data Operations¶

In [15]:

trdf = trainDF

In [16]:

def get_historic_games(team, gameId, trdf):
    df = trdf[trdf.game_id < gameId]
    hdf = df[df.home_team == team]
    adf = df[df.away_team == team]
    newdf = pd.concat([hdf, adf])
    newdf = newdf.sort_index()
    try:
        return pd.concat([hdf, adf])
    except:
        return False

In [17]:

def get_points(team, df):
    pFor = []
    pAgainst = []
    for r, v in df.iterrows():
        hTeam = v.home_team
        aTeam = v.away_team
        hScore = v.home_score
        aScore = v.away_score
        if (team == hTeam):
            pFor.append(hScore)
            pAgainst.append(aScore)
        elif (team == aTeam):
            pFor.append(aScore)
            pAgainst.append(hScore)
    return (sum(pFor)/len(df), sum(pAgainst)/len(df))

In [26]:

for i in xrange(1, 50):
    lst = []
    for r, v in trdf.iterrows():
        gameId = v.game_id
        home = v.home_team
        away = v.away_team
        homehGames = get_historic_games(home, gameId, trdf)
        awayhGames = get_historic_games(away, gameId, trdf)
        if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
            if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
                homehGames = homehGames[-i:]
                awayhGames = awayhGames[-i:]
                hFor, hAgainst = get_points(home, homehGames)
                aFor, aAgainst = get_points(away, awayhGames)
            else:
                hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
        else:
            hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
        v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
        lst.append(pd.DataFrame(v).T)
    trdfn = pd.concat(lst)
    csv_name = "baseline2/data_iter_"+str(i)+".csv"
    trdfn.to_csv(csv_name)

In [29]:

def get_diff(hFor, hAgainst, aFor, aAgainst):
    if sum([hFor, hAgainst, aFor, aAgainst]) > 0:
        return (hFor + aAgainst)/2  - (aFor + hAgainst)/2 + 3
    else:
        return 3

In [30]:

def get_predicted_winner(diff_xy, home_team, away_team):
    if diff_xy >= 0:
        return home_team
    else:
        return away_team

In [31]:

def is_prediction_correct(winner, predicted_winner):
    if winner == predicted_winner:
        return True
    else:
        return False

In [27]:

dfs = []
for i in xrange(1, 50):
    csv_name = "baseline2/data_iter_"+str(i)+".csv"
    trdfn = pd.read_csv(csv_name, index_col=0)
    years = set(trdfn.year)
    lst = []
    for year in years:
        trdfny = trdfn[trdfn.year == year]
        trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
        trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
        trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
        lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
    dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
ans = pd.concat(dfs)    

In [28]:

tmp = ans.T.mean()

In [32]:

tmp = ans.T.mean()
pd.DataFrame(tmp, columns=[1])

Out[32]:

	1
1	55.911578
2	56.487615
3	57.393433
4	58.688076
5	58.873848
6	58.654954
7	58.964574
8	59.229551
9	59.362039
10	60.037442
11	59.680300
12	59.542051
13	59.949597
14	59.998560
15	61.019585
16	60.439228
17	59.805588
18	60.033122
19	59.226671
20	59.269873
21	59.314516
22	59.135945
23	58.912730
24	59.091302
25	58.912730
26	58.689516
27	58.957373
28	58.823445
29	58.689516
30	58.510945
31	58.466302
32	58.421659
33	57.841302
34	58.153802
35	58.243088
36	58.243088
37	58.019873
38	58.198445
39	57.707373
40	57.528802
41	57.618088
42	57.618088
43	57.662730
44	57.618088
45	57.618088
46	57.841302
47	58.064516
48	58.153802
49	57.930588

49 rows × 1 columns

In [32]:

tmp.plot(label="Baseline 2")
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accurcay')
plt.ylim([0, 65])
plt.legend()
plt.show()

/Library/Python/2.7/site-packages/matplotlib-1.4.x-py2.7-macosx-10.9-intel.egg/matplotlib/font_manager.py:1240: UserWarning: findfont: Font family [u'serif'] not found. Falling back to Bitstream Vera Sans
  (prop.get_family(), self.defaultFamily[fontext]))

Test Data Operations¶

In [37]:

tsdf = testDF

In [21]:

def get_train_games(team, gameId, trdf):
    df = trdf[trdf.game_id < gameId]
    hdf = df[df.home_team == team]
    adf = df[df.away_team == team]
    newdf = pd.concat([hdf, adf])
    newdf = newdf.sort_index()
    try:
        return pd.concat([hdf, adf])
    except:
        return False

In [22]:

def get_points(team, df):
    pFor = []
    pAgainst = []
    for r, v in df.iterrows():
        hTeam = v.home_team
        aTeam = v.away_team
        hScore = v.home_score
        aScore = v.away_score
        if (team == hTeam):
            pFor.append(hScore)
            pAgainst.append(aScore)
        elif (team == aTeam):
            pFor.append(aScore)
            pAgainst.append(hScore)
    return (sum(pFor)/len(df), sum(pAgainst)/len(df))

In [43]:

i = 15

lst = []
for r, v in tsdf.iterrows():
    gameId = v.game_id
    home = v.home_team
    away = v.away_team
    homehGames = get_train_games(home, gameId, trdf)
    awayhGames = get_train_games(away, gameId, trdf)
    if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
        if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
            homehGames = homehGames[-i:]
            awayhGames = awayhGames[-i:]
            hFor, hAgainst = get_points(home, homehGames)
            aFor, aAgainst = get_points(away, awayhGames)
        else:
            hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
    else:
        hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
    v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
    lst.append(pd.DataFrame(v).T)
trdfn = pd.concat(lst)
csv_name = "baseline2/test_data_iter_"+str(i)+".csv"
trdfn.to_csv(csv_name)

In [63]:

dfs = []

csv_name = "baseline2/test_data_iter_"+str(i)+".csv"
tsdfn = pd.read_csv(csv_name, index_col=0)

years = set(tsdfn.year)
for year in years:
    tsdfny = tsdfn[tsdfn.year == year]
    tsdfny['diff_xy'] = tsdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
    tsdfny['predicted_winner'] = tsdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
    tsdfny['correct_prediction'] = tsdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
    accuracy = float(sum(tsdfny.correct_prediction))*100/len(tsdfny.index)
    dfs.append(pd.DataFrame([accuracy], index=[year], columns=["accuracy"]))
ans = pd.concat(dfs)

In [66]:

ans

Out[66]:

	accuracy
2000	52.884615
2001	64.423077
2002	63.551402
2003	60.747664
2004	60.747664
2005	65.420561
2006	59.813084
2007	64.485981
2008	58.878505
2009	68.224299
2010	55.140187
2011	61.682243
2012	61.682243
2013	62.616822

14 rows × 1 columns

In [69]:

float(mean(ans))

Out[69]:

61.44988189380713

Train Data Operations¶

In [45]:

trdf_alpha = trainDF

In [25]:

def get_historic_games(team, gameId, trdf):
    df = trdf[trdf.game_id < gameId]
    hdf = df[df.home_team == team]
    adf = df[df.away_team == team]
    newdf = pd.concat([hdf, adf])
    newdf = newdf.sort_index()
    try:
        return pd.concat([hdf, adf])
    except:
        return False

In [26]:

def get_points(team, df, alpha):
    pFor = []
    pAgainst = []
    for r, v in df.iterrows():
        hTeam = v.home_team
        aTeam = v.away_team
        hScore = v.home_score
        aScore = v.away_score
        if (team == hTeam):
            pFor.append(hScore)
            pAgainst.append(aScore)
        elif (team == aTeam):
            pFor.append(aScore)
            pAgainst.append(hScore)
    numGames = len(pFor)
    weights = []
    for i in xrange(1, numGames+1):
        weights.append(alpha**i)
    return (int(round(np.dot(pFor, weights)/float(sum(weights)))), int(round(np.dot(pAgainst, weights)/float(sum(weights)))))

In [55]:

for alpha in np.arange(1, 1.01, 0.1):
    for i in xrange(1, 50):
        lst = []
        for r, v in trdf_alpha.iterrows():
            gameId = v.game_id
            home = v.home_team
            away = v.away_team
            homehGames = get_historic_games(home, gameId, trdf_alpha)
            awayhGames = get_historic_games(away, gameId, trdf_alpha)
            if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
                if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
                    homehGames = homehGames[-i:]
                    awayhGames = awayhGames[-i:]
                    hFor, hAgainst = get_points(home, homehGames, alpha)
                    aFor, aAgainst = get_points(away, awayhGames, alpha)
                else:
                    hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
            else:
                hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
            v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
            lst.append(pd.DataFrame(v).T)
        trdf_alpha_n = pd.concat(lst)
        csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
        trdf_alpha_n.to_csv(csv_name)

In [56]:

def get_diff(hFor, hAgainst, aFor, aAgainst):
    if sum([hFor, hAgainst, aFor, aAgainst]) > 0:
        return (hFor + aAgainst)/2  - (aFor + hAgainst)/2 + 3
    else:
        return 3

In [57]:

def get_predicted_winner(diff_xy, home_team, away_team):
    if diff_xy >= 0:
        return home_team
    else:
        return away_team

In [58]:

def is_prediction_correct(winner, predicted_winner):
    if winner == predicted_winner:
        return True
    else:
        return False

In [83]:

adfs = []
for alpha in np.arange(0.5, 1.01, 0.1):
    dfs = []
    for i in xrange(1, 50):
        csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
        trdfn = pd.read_csv(csv_name, index_col=0)
        years = set(trdfn.year)
        lst = []
        for year in years:
            trdfny = trdfn[trdfn.year == year]
            trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
            trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
            trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
            lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
        dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
    tmp = pd.concat(dfs)
    tmp = tmp.T.mean()
    adfs.append(pd.DataFrame(tmp, columns=[alpha]).T)
ans = pd.concat(adfs).T

In [84]:

ans.T

Out[84]:

	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20
0.5	55.911578	56.445853	56.189516	58.660714	57.353111	55.387385	56.984447	57.260945	56.810196	57.896025	57.936348	55.697005	58.198445	56.419931	57.932028	57.347350	56.722350	55.432028	54.717742	56.368088	...
0.6	55.911578	56.267281	56.275922	58.464862	57.534562	56.281682	57.926267	57.841302	57.165899	58.794643	58.610311	57.399194	58.109159	57.226382	58.424539	57.883065	56.723790	56.680588	55.119528	56.189516	...
0.7	55.911578	56.182316	57.214862	59.095622	57.756336	57.226382	58.202765	57.930588	57.747696	59.863191	59.235311	58.028514	58.379896	58.882488	59.144585	58.286290	57.884505	57.348790	56.682028	56.546659	...
0.8	55.911578	56.408410	56.900922	59.183468	58.472062	57.682892	58.510945	58.283410	58.063076	60.262097	59.500288	58.873848	59.408122	59.194988	59.459965	58.423099	58.555588	57.707373	57.887385	57.752016	...
0.9	55.911578	56.358007	57.802419	59.000576	58.922811	58.034274	59.146025	58.696717	59.279954	60.437788	59.229551	59.052419	60.260657	59.955357	60.574597	59.411002	59.848790	59.180588	59.182028	58.957373	...
1.0	55.911578	56.538018	57.173099	58.417339	58.742800	58.568548	58.738479	59.236751	59.274194	60.030242	59.369240	59.766705	60.125288	60.220334	60.708525	60.303859	59.940956	59.675979	59.583813	59.314516	...

6 rows × 49 columns

In [85]:

ans.plot()
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accurcay')
plt.ylim([0, 63])
plt.show()

In [89]:

ans.plot()
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accurcay')
plt.ylim([52, 62])
plt.show()

In [90]:

# Plotting just the max values for all alpha values
ans.max().plot()
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Alpha values')
plt.ylabel('Prediction accurcay')
plt.ylim([0, 65])
plt.show()

Test Data Operations¶

In [14]:

tsdf_alpha = trainDF

In [15]:

def get_train_games(team, gameId, trdf):
    df = trdf[trdf.game_id < gameId]
    hdf = df[df.home_team == team]
    adf = df[df.away_team == team]
    newdf = pd.concat([hdf, adf])
    newdf = newdf.sort_index()
    try:
        return pd.concat([hdf, adf])
    except:
        return False

In [16]:

def get_points(team, df, alpha):
    pFor = []
    pAgainst = []
    for r, v in df.iterrows():
        hTeam = v.home_team
        aTeam = v.away_team
        hScore = v.home_score
        aScore = v.away_score
        if (team == hTeam):
            pFor.append(hScore)
            pAgainst.append(aScore)
        elif (team == aTeam):
            pFor.append(aScore)
            pAgainst.append(hScore)
    numGames = len(pFor)
    weights = []
    for i in xrange(1, numGames+1):
        weights.append(alpha**i)
    return (int(round(np.dot(pFor, weights)/float(sum(weights)))), int(round(np.dot(pAgainst, weights)/float(sum(weights)))))

In [27]:

i = 15
alpha = 1.0

lst = []
for r, v in tsdf_alpha.iterrows():
    gameId = v.game_id
    home = v.home_team
    away = v.away_team
    homehGames = get_historic_games(home, gameId, tsdf_alpha)
    awayhGames = get_historic_games(away, gameId, tsdf_alpha)
    if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
        if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
            homehGames = homehGames[-i:]
            awayhGames = awayhGames[-i:]
            hFor, hAgainst = get_points(home, homehGames, alpha)
            aFor, aAgainst = get_points(away, awayhGames, alpha)
        else:
            hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
    else:
        hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
    v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
    lst.append(pd.DataFrame(v).T)
tsdf_alpha_n = pd.concat(lst)
csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
tsdf_alpha_n.to_csv(csv_name)

In [32]:

dfs = []

csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
tsdfn = pd.read_csv(csv_name, index_col=0)

years = set(tsdfn.year)
for year in years:
    tsdfny = tsdfn[tsdfn.year == year]
    tsdfny['diff_xy'] = tsdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
    tsdfny['predicted_winner'] = tsdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
    tsdfny['correct_prediction'] = tsdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
    accuracy = float(sum(tsdfny.correct_prediction))*100/len(tsdfny.index)
    dfs.append(pd.DataFrame([accuracy], index=[year], columns=["accuracy"]))
ans = pd.concat(dfs)

In [33]:

ans

Out[33]:

	accuracy
2000	58.709677
2001	58.709677
2002	61.875000
2003	64.375000
2004	60.000000
2005	61.875000
2006	58.125000
2007	62.500000
2008	58.125000
2009	65.000000
2010	63.750000
2011	59.375000
2012	59.375000
2013	58.125000

14 rows × 1 columns

In [34]:

float(mean(ans))

Out[34]:

60.70852534562213