NFL PageRank Model¶

Importing necessary libraries.

In [1]:

%pylab inline

Populating the interactive namespace from numpy and matplotlib

In [2]:

from collections import defaultdict

import cPickle as pickle
import math
import networkx as nx
import numpy as np
import operator
import pandas as pd
import pylab
import seaborn

In [3]:

colors = seaborn.color_palette()

from matplotlib import rc

rc("figure", facecolor="white")
rc("axes", facecolor="white")
rc("axes", edgecolor="grey")


rc("grid", alpha=0.9)
rc("grid", linewidth=0.2)
rc("grid", linestyle=":")


rc('font',**{'family':'serif','serif':['Palatino']})

Reading games data bought from ArmchairAnalysis into a pandas dataframe named gamedDF

In [4]:

trainDF = pd.read_csv('trainNFL.csv', index_col=0)
testDF = pd.read_csv('testNFL.csv', index_col=0)

Selecting useful from the columns from the gamesDF and renaming them using a descriptive name.

In [5]:

trainDF['score_diff'] = trainDF.home_score - trainDF.away_score
testDF['score_diff'] = testDF.home_score - testDF.away_score

trainDF = trainDF.sort(column='game_id').reset_index(drop=True)
testDF = testDF.sort(column='game_id').reset_index(drop=True)

/Library/Python/2.7/site-packages/pandas-0.13.1_213_gc174c3d-py2.7-macosx-10.9-intel.egg/pandas/core/frame.py:2542: FutureWarning: column is deprecated, use columns
  warnings.warn("column is deprecated, use columns", FutureWarning)

In [6]:

cols = ['game_id','year','week','day','away_team','home_team','away_score','home_score','winner']
gamesDF = trainDF[cols]

Creating a dictionary and a function to easily access any teams's name with its abbreviation.

In [7]:

# Creating a dictionary containing team name keyed by its abbreviation.
team_dict = {'ARI': 'Arizona Cardinals',
             'ATL': 'Atlanta Falcons',
             'BAL': 'Baltimore Ravens',
             'BUF': 'Buffalo Bills',
             'CAR': 'Carolina Panthers',
             'CHI': 'Chicago Bears',
             'CIN': 'Cincinnati Bengals',
             'CLE': 'Cleveland Browns',
             'DAL': 'Dallas Cowboys',
             'DEN': 'Denver Broncos',
             'DET': 'Detroit Lions',
             'GB' : 'Green Bay Packers',
             'HOU': 'Houston Texans',
             'IND': 'Indianapolis Colts',
             'JAC': 'Jacksonville Jaguars',
             'KC' : 'Kansas City Chiefs',
             'MIA': 'Miami Dolphins',
             'MIN': 'Minnesota Vikings',
             'NE' : 'New England Patriots',
             'NO' : 'New Orleans Saints',
             'NYG': 'New York Giants',
             'NYJ': 'New York Jets',
             'OAK': 'Oakland Raiders',
             'PHI': 'Philadelphia Eagles',
             'PIT': 'Pittsburgh Steelers',
             'SD' : 'San Diego Chargers',
             'SEA': 'Seattle Seahawks',
             'SF' : 'San Francisco 49ers',
             'STL': 'St. Louis Rams',
             'TB' : 'Tampa Bay Buccaneers',
             'TEN': 'Tennessee Titans',
             'WAS': 'Washington Redskins'}

In [8]:

def get_team_name(abbr):
    """Returns team name given its abbreviation.
    
    Uses mapping in the dictionary team_dict to return the team name.
  
    Args:
        abbr: A string which is a team abbreviation.
        
    Returns:
        A string which is a mapping of team abbreviation to team name.
    """
    return team_dict[abbr]

In [9]:

gamesDF.head()

Out[9]:

	game_id	year	week	day	away_team	home_team	away_score	home_score	winner
0	1	2000	1	SUN	SF	ATL	28	36	ATL
1	2	2000	1	SUN	JAC	CLE	27	7	JAC
2	3	2000	1	SUN	PHI	DAL	41	14	PHI
3	4	2000	1	SUN	NYJ	GB	20	16	NYJ
4	9	2000	1	SUN	DET	NO	14	10	DET

5 rows × 9 columns

In [10]:

def create_team_graph(teams):
    graph = nx.DiGraph()
    for i in teams.index:
        team = teams.ix[i]        
        if team['away_score'] < team['home_score']:
            winner = team['home_team']
            loser = team['away_team']
            point_differential = float(abs(team['away_score']- team['home_score'])) /(team['away_score']+ team['home_score'])
        else:
            winner = team['away_team']
            loser = team['home_team']
            point_differential = float(abs(team['away_score']- team['home_score'])) /(team['away_score']+ team['home_score'])        
        update_graph(winner, loser, point_differential, graph)
    return graph

In [11]:

def update_graph(winner, loser, point_differential, graph):    
    graph.add_node(winner)
    graph.add_node(loser)
    
    if graph.has_edge(loser, winner):
        updated_weight = graph[loser][winner]['weight'] + point_differential
    else:
        updated_weight = point_differential
        
    graph.add_edge(loser, winner, weight = updated_weight)
    return graph

In [12]:

# Get winner for all the years
def get_winners(data):
    years = list(unique(data.year))
    result = dict()
    for year in years:
        games = data[data.year == year]
        team_network = create_team_graph(games)
        pagerank_teams = nx.pagerank_numpy(team_network)
        pagerank_teams = pd.DataFrame(pagerank_teams.items(), columns=['team', 'page_rank'])
        pagerank_teams = pd.DataFrame(pagerank_teams.ix[:, 'page_rank']).set_index(pagerank_teams.team)
        pagerank_teams = pagerank_teams.sort(columns='page_rank', ascending=False)
        result[year] = pagerank_teams
    return result

In [13]:

winners = get_winners(gamesDF)

In [14]:

gamesDF[gamesDF.year == 2011].tail()

Out[14]:

	game_id	year	week	day	away_team	home_team	away_score	home_score	winner
1905	3183	2011	19	SUN	DEN	NE	10	45	NE
1906	3184	2011	19	SUN	HOU	BAL	13	20	BAL
1907	3185	2011	19	SUN	NYG	GB	37	20	NYG
1908	3186	2011	20	SUN	BAL	NE	20	23	NE
1909	3187	2011	20	SUN	NYG	SF	20	17	NYG

5 rows × 9 columns

In [15]:

plt.rc('figure', figsize=(18,9))
winners[2013].plot(kind='bar')
plt.show()

/Library/Python/2.7/site-packages/matplotlib-1.4.x-py2.7-macosx-10.9-intel.egg/matplotlib/font_manager.py:1240: UserWarning: findfont: Font family [u'serif'] not found. Falling back to Bitstream Vera Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [16]:

plt.rc('figure', figsize=(18,9))
winners[2012].plot(kind='bar')
plt.show()

In [17]:

plt.rc('figure', figsize=(18,9))
winners[2011].plot(kind='bar')
plt.show()

In [18]:

plt.rc('figure', figsize=(18,9))
winners[2010].plot(kind='bar')
plt.show()

In [19]:

plt.rc('figure', figsize=(18,9))
winners[2010].plot(kind='bar')
plt.show()

In [20]:

plt.rc('figure', figsize=(18,9))
winners[2009].plot(kind='bar')
plt.show()

Game Simulation¶

The following modules simulate games in order and use rankings from Baseline model to predict winner. It also calculates predicted score difference which can later be compared with actual score diffrence to measure error.

In [21]:

def get_schedule(gamesDF, allGames=False):
    """Gets playoff game ordering.

    Args:
        gamesDF: A pandas dataframe containing games data with following columns.
            'game_id',
            'year',
            'week',
            'away_team',
            'home_team',
            'away_score',
            'home_score',
            'winner'

    Returns:
        A dictionary containing year as key and pandas dataframe containing playoff game
        ordering and scores as value.
    """
    years = set(gamesDF.year)
    gameSchedule = dict()
    for year in years:
        if allGames:
            yearlyPlayoff = gamesDF[gamesDF.year == year]
        else:
            yearlyPlayoff = gamesDF[gamesDF.year == year].tail(11)
        yearlyPlayoff['game_order'] = yearlyPlayoff.game_id.rank(ascending=True).order()
        cols = ['game_order', 'home_team', 'away_team', 'home_score', 'away_score', 'winner']
        yearlyPlayoff = yearlyPlayoff[cols]
        gameSchedule[year] = yearlyPlayoff
    return gameSchedule

In [22]:

def yearly_game_simulator(gameSchedule, teamRankings):
    """Simulates games played on a particular season and returns a dataframe which contains
    information about our accuracy.

    Args:
        gameSchedule: A pandas dataframe containing NFL playoff gameplay ordering.
        teamRankings: A pandas dataframe containing team rankings from Baseline model.

    Returns:
        A pandas dataframe which contains model prediction and its correctness when compared
        to actual data.
    """
    stats = gameSchedule.copy(deep=True)
    for i, game in gameSchedule.iterrows():
        game_order = game[0]
        home_team = game[1]
        away_team = game[2]
        home_score = game[3]
        away_score = game[4]
        winner = game[5]
        if home_team == winner:
            winner_score = home_score
            loser_score = away_score
        else:
            loser_score = home_score
            winner_score = away_score
        try:
            home_team_rank = int(teamRankings.ix[home_team]['page_rank'])
            away_team_rank = int(teamRankings.ix[away_team]['page_rank'])
            if home_team_rank >= away_team_rank:
                predicted_winner = home_team
                predicted_score_diff = home_team_rank - away_team_rank
            else:
                predicted_winner = away_team
                # home_team_rank - away_team_rankn reverse
                predicted_score_diff = home_team_rank - away_team_rank
        except:
            home_team_rank = np.NAN
            away_team_rank = np.NAN
            predicted_winner = home_team
            predicted_score_diff = np.NAN
        if predicted_winner == winner:
            correct_prediction = True
        else:
            correct_prediction = False
        score_diff = winner_score - loser_score
        stats.ix[i,'predicted_winner'] = predicted_winner
        stats.ix[i,'correct_prediction'] = correct_prediction
        stats.ix[i,'home_team_rank'] = home_team_rank
        stats.ix[i,'away_team_rank'] = away_team_rank
        stats.ix[i,'predicted_score_diff'] = predicted_score_diff
        stats.ix[i,'score_diff'] = score_diff
        stats.ix[i,'error'] = predicted_score_diff - score_diff
    return stats

In [23]:

def game_simulator(gameSchedule, teamRankings):
    """Simulates games played on all seasons and returns a dataframe which contains
    information about our accuracy.

    Args:
        gameSchedule: A dictionary containing year as key and a pandas dataframe
            containing NFL playoff gameplay ordering as value.
        teamRankings: A dictionary containing year as key and a pandas dataframe
            containing team rankings from Baseline model as value.

    Returns:
        A dictionary containing year as key and a pandas dataframe which contains
        model prediction and its correctness when compared to actual data as value.
    """
    allStats = dict()
    for year, schedule in gameSchedule.iteritems():
        rankings = teamRankings[year]
        yearlyStats = yearly_game_simulator(schedule, rankings)
        allStats[year] = yearlyStats
    return allStats

In [24]:

def check_accuracy(allStats):
    accuracy = dict()
    for year, df in allStats.iteritems():
        numGames = len(df.correct_prediction)
        numCorrect = sum(df.correct_prediction)
        pctCorrect = numCorrect*100/float(numGames)
        accuracy[year] = [numGames, numCorrect, pctCorrect]
    return accuracy

In [25]:

gameSchedule = get_schedule(testDF, allGames=True)
teamRankings = get_winners(testDF)
allStats = game_simulator(gameSchedule, teamRankings)

In [27]:

accuracy = check_accuracy(allStats)
accuracy = pd.DataFrame(accuracy).T
accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
accuracy = accuracy[-5:]
accuracy

Out[27]:

	num_games	num_correct_prediction	pct_correct_prediction
2009	107	60	56.074766
2010	107	56	52.336449
2011	107	66	61.682243
2012	107	68	63.551402
2013	107	67	62.616822

5 rows × 3 columns

In [33]:

plt.figure(figsize=(10, 6))
accuracy = accuracy[['pct_correct_prediction']]
accuracy.plot(kind='bar', figsize=(10, 6))
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.yticks(np.arange(0, 71, 10))
plt.xlabel('Years')
plt.ylabel('Percentage Correct')
plt.title('Predict playoffs using PageRank model')
plt.show()

<matplotlib.figure.Figure at 0x1150e8350>

2014 Winner Prediction¶

In [37]:

gamesDF = pd.read_csv('nfl_2014.csv')

In [38]:

cols = ['year', 'week', 'home_team', 'away_team', 'home_score', 'away_score', 'winner']
gamesDF = gamesDF[cols]

In [39]:

gamesDF.tail(10)

Out[39]:

	year	week	home_team	away_team	home_score	away_score	winner
246	2014	17	WAS	DAL	17	44	DAL
247	2014	17	BAL	CLE	20	10	BAL
248	2014	17	MIA	NYJ	24	37	NYJ
249	2014	17	DEN	OAK	47	14	DEN
250	2014	17	SF	ARI	20	17	SF
251	2014	17	NYG	PHI	26	34	PHI
252	2014	17	HOU	JAC	23	17	HOU
253	2014	17	MIN	CHI	13	9	MIN
254	2014	17	TB	NO	20	23	NO
255	2014	17	NE	BUF	9	17	BUF

10 rows × 7 columns

In [40]:

teamRankings = get_winners(gamesDF)

In [41]:

teamRankings[2014].plot(kind='bar')

Out[41]:

<matplotlib.axes._subplots.AxesSubplot at 0x114cbf7d0>

In [42]:

temp = teamRankings[2014]
names = []
for r, v in temp.iterrows():
    names.append(team_dict[r])
temp['name'] = pd.Series(names, index=temp.index)
temp[['name', 'page_rank']]#.to_csv('page_rank_res.csv')

Out[42]:

	name	page_rank
team
SEA	Seattle Seahawks	0.067890
KC	Kansas City Chiefs	0.064077
NE	New England Patriots	0.059916
DAL	Dallas Cowboys	0.052141
DEN	Denver Broncos	0.050597
MIA	Miami Dolphins	0.042805
SD	San Diego Chargers	0.042738
PHI	Philadelphia Eagles	0.040979
BUF	Buffalo Bills	0.040300
PIT	Pittsburgh Steelers	0.039831
BAL	Baltimore Ravens	0.038372
ARI	Arizona Cardinals	0.037293
GB	Green Bay Packers	0.037044
CIN	Cincinnati Bengals	0.036969
IND	Indianapolis Colts	0.036103
STL	St. Louis Rams	0.034990
SF	San Francisco 49ers	0.028202
CLE	Cleveland Browns	0.026800
HOU	Houston Texans	0.026737
DET	Detroit Lions	0.025509
TEN	Tennessee Titans	0.021281
CAR	Carolina Panthers	0.020868
NO	New Orleans Saints	0.018921
NYG	New York Giants	0.016243
MIN	Minnesota Vikings	0.015758
NYJ	New York Jets	0.014228
ATL	Atlanta Falcons	0.013226
OAK	Oakland Raiders	0.011857
WAS	Washington Redskins	0.011103
JAC	Jacksonville Jaguars	0.010020
CHI	Chicago Bears	0.009846
TB	Tampa Bay Buccaneers	0.007356

32 rows × 2 columns

Team standings DataFrame based on the conference and division.

In [43]:

sdf = pd.read_csv('nfl_sim_data_actual.csv')

In [44]:

sdf

Out[44]:

	conf	standing	team	prob
0	NFC	1	SEA	0.067890
1	NFC	2	GB	0.037044
2	NFC	3	DAL	0.052141
3	NFC	4	CAR	0.020868
4	NFC	5	ARI	0.037293
5	NFC	6	DET	0.025509
6	AFC	1	NE	0.059916
7	AFC	2	DEN	0.050597
8	AFC	3	PIT	0.039831
9	AFC	4	IND	0.036103
10	AFC	5	CIN	0.036969
11	AFC	6	BAL	0.038372

12 rows × 4 columns

In [45]:

nfc_data = sdf[sdf.conf == 'NFC']
afc_data = sdf[sdf.conf == 'AFC']

Monte Carlo Simulator¶

In [46]:

def mc_simulator():
    num_sim = 50
    # NFC Wildcard Round - 1
    teams = [nfc_data[nfc_data.standing == 4].team.get_values()[0], nfc_data[nfc_data.standing == 5].team.get_values()[0]]
    s1 = round(float(nfc_data[nfc_data.standing == 4].prob), 4)
    s2 = round(float(nfc_data[nfc_data.standing == 5].prob), 4)
    nwcr1 = np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])])
    # NFC Wildcard Round - 2
    teams = [nfc_data[nfc_data.standing == 3].team.get_values()[0], nfc_data[nfc_data.standing == 6].team.get_values()[0]]
    s1 = round(float(nfc_data[nfc_data.standing == 3].prob), 4)
    s2 = round(float(nfc_data[nfc_data.standing == 6].prob), 4)
    nwcr2 = np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])])
    # NFC Division Round - 1
    ndr1 = []
    for i in xrange(len(nwcr1)):
        teams = [nwcr1[i], nfc_data[nfc_data.standing == 1].team.get_values()[0]]
        s1 = round(temp[temp.index == nwcr1[i]].page_rank.get_values()[0], 4)
        s2 = round(float(nfc_data[nfc_data.standing == 1].prob), 4)
        ndr1.extend(np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])]))
    # NFC Division Round - 2
    ndr2 = []
    for i in xrange(len(nwcr2)):
        teams = [nwcr2[i], nfc_data[nfc_data.standing == 2].team.get_values()[0]]
        s1 = round(temp[temp.index == nwcr2[i]].page_rank.get_values()[0], 4)
        s2 = round(float(nfc_data[nfc_data.standing == 2].prob), 4)
        ndr2.extend(np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])]))
    # NFC Conference Champion
    ncc = []
    for i in xrange(len(ndr1)):
        teams = [ndr1[i], ndr2[i]]
        s1 = round(temp[temp.index == ndr1[i]].page_rank.get_values()[0], 4)
        s2 = round(temp[temp.index == ndr2[i]].page_rank.get_values()[0], 4)
        ncc.extend(np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])]))

    # AFC Wildcard Round - 1
    teams = [afc_data[afc_data.standing == 4].team.get_values()[0], afc_data[afc_data.standing == 5].team.get_values()[0]]
    s1 = round(float(afc_data[afc_data.standing == 4].prob), 4)
    s2 = round(float(afc_data[afc_data.standing == 5].prob), 4)
    awcr1 = np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])])
    # AFC Wildcard Round - 2
    teams = [afc_data[afc_data.standing == 3].team.get_values()[0], afc_data[afc_data.standing == 6].team.get_values()[0]]
    s1 = round(float(afc_data[afc_data.standing == 3].prob), 4)
    s2 = round(float(afc_data[afc_data.standing == 6].prob), 4)
    awcr2 = np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])])
    # AFC Division Round - 1
    adr1 = []
    for i in xrange(len(awcr1)):
        teams = [awcr1[i], afc_data[afc_data.standing == 1].team.get_values()[0]]
        s1 = round(temp[temp.index == awcr1[i]].page_rank.get_values()[0], 4)
        s2 = round(float(afc_data[afc_data.standing == 1].prob), 4)
        adr1.extend(np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])]))
    # AFC Division Round - 1
    adr2 = []
    for i in xrange(len(awcr2)):
        teams = [awcr2[i], afc_data[afc_data.standing == 2].team.get_values()[0]]
        s1 = round(temp[temp.index == awcr2[i]].page_rank.get_values()[0], 4)
        s2 = round(float(afc_data[afc_data.standing == 2].prob), 4)
        adr2.extend(np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])]))
    # AFC Conference Champion
    acc = []
    for i in xrange(len(adr1)):
        teams = [adr1[i], adr2[i]]
        s1 = round(temp[temp.index == adr1[i]].page_rank.get_values()[0], 4)
        s2 = round(temp[temp.index == adr2[i]].page_rank.get_values()[0], 4)
        acc.extend(np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])]))
    # Superbowl Champion
    num_sim = 2
    winners = []
    for i in xrange(len(ncc)):
        teams = [ncc[i], acc[i]]
        s1 = round(temp[temp.index == ncc[i]].page_rank.get_values()[0], 4)
        s2 = round(temp[temp.index == acc[i]].page_rank.get_values()[0], 4)
        winners.extend(np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])]))
    return winners

In [47]:

final_result = mc_simulator()

In [48]:

from collections import defaultdict
freq = defaultdict(int)
for winner in final_result:
    freq[winner] += 1

In [49]:

wdf = pd.DataFrame(freq.items())
wdf.index = wdf[0]
wdf.columns = ['team', 'wins']
wdf['win_prob'] = wdf.wins/sum(wdf.wins)
wdf = wdf.sort(columns='win_prob', ascending=False)
wdf = wdf[['win_prob']]
wdf

Out[49]:

	win_prob
0
SEA	0.259444
NE	0.192552
DEN	0.139564
DAL	0.101136
GB	0.086048
PIT	0.044592
ARI	0.040720
BAL	0.040340
CIN	0.036212
IND	0.036144
DET	0.013648
CAR	0.009600

12 rows × 1 columns