Importing necessary libraries.
%pylab inline
Populating the interactive namespace from numpy and matplotlib
from collections import defaultdict
import cPickle as pickle
import math
import networkx as nx
import numpy as np
import operator
import pandas as pd
import pylab
import seaborn
colors = seaborn.color_palette()
from matplotlib import rc
# Plot styling: white figure and axes backgrounds, grey axes borders and a
# faint dotted grid.
rc("figure", facecolor="white")
rc("axes", facecolor="white", edgecolor="grey")
rc("grid", alpha=0.9, linewidth=0.2, linestyle=":")
# Serif text, with Palatino as the preferred serif face.
rc("font", family="serif", serif=["Palatino"])
Reading the games data bought from ArmchairAnalysis into two pandas DataFrames: trainDF (training set) and testDF (test set).
# Load the training and test game tables; the first CSV column becomes the row index.
trainDF = pd.read_csv('trainNFL.csv', index_col=0)
testDF = pd.read_csv('testNFL.csv', index_col=0)
Selecting the useful columns from the games data into gamesDF, using descriptive column names.
# Home-minus-away scoring margin of every game.
trainDF['score_diff'] = trainDF.home_score - trainDF.away_score
testDF['score_diff'] = testDF.home_score - testDF.away_score
# Order the games by game_id and renumber the rows from 0.  `columns=` is the
# supported keyword; the singular `column=` spelling raises a FutureWarning.
trainDF = trainDF.sort(columns='game_id').reset_index(drop=True)
testDF = testDF.sort(columns='game_id').reset_index(drop=True)
/Library/Python/2.7/site-packages/pandas-0.13.1_213_gc174c3d-py2.7-macosx-10.9-intel.egg/pandas/core/frame.py:2542: FutureWarning: column is deprecated, use columns warnings.warn("column is deprecated, use columns", FutureWarning)
# Keep only the identifier, schedule, team, score and winner columns of the
# training games.
cols = ['game_id','year','week','day','away_team','home_team','away_score','home_score','winner']
gamesDF = trainDF[cols]
Creating a dictionary and a function to easily look up any team's name from its abbreviation.
# Full team names keyed by team abbreviation.
team_dict = {
    "ARI": "Arizona Cardinals",
    "ATL": "Atlanta Falcons",
    "BAL": "Baltimore Ravens",
    "BUF": "Buffalo Bills",
    "CAR": "Carolina Panthers",
    "CHI": "Chicago Bears",
    "CIN": "Cincinnati Bengals",
    "CLE": "Cleveland Browns",
    "DAL": "Dallas Cowboys",
    "DEN": "Denver Broncos",
    "DET": "Detroit Lions",
    "GB": "Green Bay Packers",
    "HOU": "Houston Texans",
    "IND": "Indianapolis Colts",
    "JAC": "Jacksonville Jaguars",
    "KC": "Kansas City Chiefs",
    "MIA": "Miami Dolphins",
    "MIN": "Minnesota Vikings",
    "NE": "New England Patriots",
    "NO": "New Orleans Saints",
    "NYG": "New York Giants",
    "NYJ": "New York Jets",
    "OAK": "Oakland Raiders",
    "PHI": "Philadelphia Eagles",
    "PIT": "Pittsburgh Steelers",
    "SD": "San Diego Chargers",
    "SEA": "Seattle Seahawks",
    "SF": "San Francisco 49ers",
    "STL": "St. Louis Rams",
    "TB": "Tampa Bay Buccaneers",
    "TEN": "Tennessee Titans",
    "WAS": "Washington Redskins",
}
def get_team_name(abbr):
    """Look up a team's full name from its abbreviation.

    Args:
        abbr: Team abbreviation string; must be a key of ``team_dict``.

    Returns:
        The full team name stored in ``team_dict`` under ``abbr``.

    Raises:
        KeyError: If ``abbr`` is not a key of ``team_dict``.
    """
    full_name = team_dict[abbr]
    return full_name
# Preview the first rows of the selected game columns.
gamesDF.head()
game_id | year | week | day | away_team | home_team | away_score | home_score | winner | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2000 | 1 | SUN | SF | ATL | 28 | 36 | ATL |
1 | 2 | 2000 | 1 | SUN | JAC | CLE | 27 | 7 | JAC |
2 | 3 | 2000 | 1 | SUN | PHI | DAL | 41 | 14 | PHI |
3 | 4 | 2000 | 1 | SUN | NYJ | GB | 20 | 16 | NYJ |
4 | 9 | 2000 | 1 | SUN | DET | NO | 14 | 10 | DET |
5 rows × 9 columns
def create_team_graph(teams):
    """Build a directed "loser -> winner" graph from a table of games.

    Every game adds, or reinforces, an edge pointing from the losing team to
    the winning team.  The weight contributed by a game is the margin of
    victory divided by the total points scored in that game; weights of
    repeated meetings are accumulated by ``update_graph``.

    Args:
        teams: A pandas DataFrame with one row per game and the columns
            'away_team', 'home_team', 'away_score' and 'home_score'.

    Returns:
        A networkx.DiGraph whose nodes are the values found in the
        'home_team' / 'away_team' columns and whose edges carry a 'weight'
        attribute.
    """
    graph = nx.DiGraph()
    for _, game in teams.iterrows():
        away_score = game['away_score']
        home_score = game['home_score']
        if away_score < home_score:
            winner, loser = game['home_team'], game['away_team']
        else:
            # Tied games also land here; their margin is 0, so the edge they
            # add carries no weight.
            winner, loser = game['away_team'], game['home_team']

        total_points = away_score + home_score
        if total_points:
            point_differential = float(abs(away_score - home_score)) / total_points
        else:
            # Scoreless game: avoid 0/0, which would put a NaN weight on the
            # edge and poison every score derived from the graph.
            point_differential = 0.0
        update_graph(winner, loser, point_differential, graph)
    return graph
def update_graph(winner, loser, point_differential, graph):
    """Record the outcome of one game on the team graph.

    Both teams are added as nodes, and the edge from ``loser`` to ``winner``
    has ``point_differential`` added to its 'weight' (the edge is created
    with that weight if it does not exist yet).

    Args:
        winner: Node label of the winning team.
        loser: Node label of the losing team.
        point_differential: Weight contributed by this game.
        graph: The graph to modify in place.

    Returns:
        The same ``graph`` object, after the update.
    """
    for team in (winner, loser):
        graph.add_node(team)

    new_weight = point_differential
    if graph.has_edge(loser, winner):
        # Repeat meeting with the same outcome: accumulate on the existing edge.
        new_weight = graph[loser][winner]['weight'] + point_differential

    graph.add_edge(loser, winner, weight=new_weight)
    return graph
def get_winners(data):
    """Rank the teams of every season by PageRank.

    For each distinct value of ``data.year`` the games of that year are
    turned into a loser -> winner graph with ``create_team_graph`` and the
    nodes are scored with PageRank.

    Args:
        data: A pandas DataFrame of games with a 'year' column plus the
            columns required by ``create_team_graph``.

    Returns:
        A dict mapping each year to a pandas DataFrame indexed by team
        (index name 'team') with a single 'page_rank' column, sorted from
        the highest score to the lowest.
    """
    result = dict()
    # np.unique is spelled out so the function does not depend on the bare
    # `unique` name that only exists after `%pylab inline`.
    for year in np.unique(data.year):
        games = data[data.year == year]
        team_network = create_team_graph(games)
        # NOTE(review): pagerank_numpy is kept so the scores stay unchanged;
        # newer NetworkX releases appear to offer only nx.pagerank -- confirm
        # against the installed version.
        scores = nx.pagerank_numpy(team_network)
        # Sorting in plain Python avoids the deprecated DataFrame.sort/.ix
        # calls and works whether dict.items() returns a list or a view.
        ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        result[year] = pd.DataFrame(
            {'page_rank': [score for _, score in ranked]},
            index=pd.Index([team for team, _ in ranked], name='team'),
        )
    return result
# Per-season PageRank tables computed from the training games.
winners = get_winners(gamesDF)
# Show the last rows of the 2011 season.
gamesDF[gamesDF.year == 2011].tail()
game_id | year | week | day | away_team | home_team | away_score | home_score | winner | |
---|---|---|---|---|---|---|---|---|---|
1905 | 3183 | 2011 | 19 | SUN | DEN | NE | 10 | 45 | NE |
1906 | 3184 | 2011 | 19 | SUN | HOU | BAL | 13 | 20 | BAL |
1907 | 3185 | 2011 | 19 | SUN | NYG | GB | 37 | 20 | NYG |
1908 | 3186 | 2011 | 20 | SUN | BAL | NE | 20 | 23 | NE |
1909 | 3187 | 2011 | 20 | SUN | NYG | SF | 20 | 17 | NYG |
5 rows × 9 columns
# Set a wide default figure size, then draw the 2013 PageRank table as a bar chart.
plt.rc('figure', figsize=(18,9))
winners[2013].plot(kind='bar')
plt.show()
/Library/Python/2.7/site-packages/matplotlib-1.4.x-py2.7-macosx-10.9-intel.egg/matplotlib/font_manager.py:1240: UserWarning: findfont: Font family [u'serif'] not found. Falling back to Bitstream Vera Sans (prop.get_family(), self.defaultFamily[fontext]))
# Bar chart of the PageRank table for each of the remaining seasons, newest
# first.  The figure size is a global rc setting, so it is set once, and each
# season is drawn exactly once.
plt.rc('figure', figsize=(18, 9))
for season in (2012, 2011, 2010, 2009):
    winners[season].plot(kind='bar')
    plt.show()
The following functions simulate games in order and use the rankings from the Baseline model to predict the winner. They also calculate a predicted score difference, which can later be compared with the actual score difference to measure the error.
def get_schedule(gamesDF, allGames=False, playoffGames=11):
    """Gets the per-season game ordering.

    Args:
        gamesDF: A pandas dataframe containing games data with following columns.
            'game_id',
            'year',
            'week',
            'away_team',
            'home_team',
            'away_score',
            'home_score',
            'winner'
        allGames: When True, every game of a season is kept.  When False
            (default) only the last ``playoffGames`` rows of each season are
            kept; this presumably relies on the rows being in chronological
            order so that those rows are the playoff games.
        playoffGames: Number of trailing rows kept per season when
            ``allGames`` is False.  Defaults to 11.

    Returns:
        A dictionary containing year as key and pandas dataframe containing
        the game ordering ('game_order' is the rank of 'game_id' among the
        kept games of that year), teams, scores and winner as value.
    """
    cols = ['game_order', 'home_team', 'away_team', 'home_score', 'away_score', 'winner']
    gameSchedule = dict()
    for year in set(gamesDF.year):
        season = gamesDF[gamesDF.year == year]
        if not allGames:
            season = season.tail(playoffGames)
        # Explicit copy: the new column is added to our own frame and never
        # to a slice of the caller's data.
        season = season.copy()
        # Column assignment aligns on the index, so the ranks need no sorting.
        season['game_order'] = season.game_id.rank(ascending=True)
        gameSchedule[year] = season[cols]
    return gameSchedule
def yearly_game_simulator(gameSchedule, teamRankings):
    """Simulates games played on a particular season and returns a dataframe which contains
    information about our accuracy.

    For every game the team with the higher 'page_rank' value is predicted to
    win (the home team wins ties).  If either team cannot be looked up in
    ``teamRankings`` the home team is predicted and the rank-based columns
    are NaN.

    Args:
        gameSchedule: A pandas dataframe of games with the columns
            'home_team', 'away_team', 'home_score', 'away_score' and
            'winner' (as produced by get_schedule).
        teamRankings: A pandas dataframe indexed by team with a 'page_rank'
            column, containing team rankings from Baseline model.

    Returns:
        A copy of ``gameSchedule`` with these columns appended:
        'predicted_winner', 'correct_prediction', 'home_team_rank',
        'away_team_rank', 'predicted_score_diff' (PageRank gap in favour of
        the predicted winner), 'score_diff' (winner's points minus loser's
        points) and 'error' (predicted_score_diff - score_diff).
    """
    stats = gameSchedule.copy(deep=True)
    new_columns = ('predicted_winner', 'correct_prediction', 'home_team_rank',
                   'away_team_rank', 'predicted_score_diff', 'score_diff', 'error')
    rows = []
    for _, game in gameSchedule.iterrows():
        home_team = game['home_team']
        away_team = game['away_team']
        winner = game['winner']
        if home_team == winner:
            score_diff = game['home_score'] - game['away_score']
        else:
            score_diff = game['away_score'] - game['home_score']

        try:
            # float(), not int(): PageRank scores are fractions below 1, so
            # truncating them to integers turns every comparison into
            # 0 >= 0 and always picks the home team.
            home_team_rank = float(teamRankings.loc[home_team, 'page_rank'])
            away_team_rank = float(teamRankings.loc[away_team, 'page_rank'])
        except (KeyError, TypeError, ValueError):
            # Team missing from the rankings (or an unusable value): fall
            # back to the home team and mark the rank columns as unknown.
            home_team_rank = np.nan
            away_team_rank = np.nan
            predicted_winner = home_team
            predicted_score_diff = np.nan
        else:
            if home_team_rank >= away_team_rank:
                predicted_winner = home_team
                predicted_score_diff = home_team_rank - away_team_rank
            else:
                predicted_winner = away_team
                # Gap measured from the predicted winner's side, so it is
                # non-negative just like score_diff.
                predicted_score_diff = away_team_rank - home_team_rank

        rows.append((predicted_winner,
                     predicted_winner == winner,
                     home_team_rank,
                     away_team_rank,
                     predicted_score_diff,
                     score_diff,
                     predicted_score_diff - score_diff))

    # Whole-column assignment is positional, so it is also correct when the
    # schedule index contains repeated labels.
    for position, column in enumerate(new_columns):
        stats[column] = [row[position] for row in rows]
    return stats
def game_simulator(gameSchedule, teamRankings):
    """Simulates games played on all seasons and returns a dataframe which contains
    information about our accuracy.

    Args:
        gameSchedule: A dictionary containing year as key and a pandas dataframe
            containing NFL playoff gameplay ordering as value.
        teamRankings: A dictionary containing year as key and a pandas dataframe
            containing team rankings from Baseline model as value.  It must
            have an entry for every year present in ``gameSchedule``.

    Returns:
        A dictionary containing year as key and a pandas dataframe which contains
        model prediction and its correctness when compared to actual data as value.

    Raises:
        KeyError: If a scheduled year has no entry in ``teamRankings``.
    """
    # dict.items() exists on both Python 2 and Python 3, unlike iteritems().
    return {
        year: yearly_game_simulator(schedule, teamRankings[year])
        for year, schedule in gameSchedule.items()
    }
def check_accuracy(allStats):
    """Summarises prediction accuracy for every season.

    Args:
        allStats: A dictionary containing year as key and a pandas dataframe
            with a boolean 'correct_prediction' column as value.

    Returns:
        A dictionary containing year as key and the list
        [number of games, number of correct predictions, percentage correct]
        as value.  The percentage is NaN for a season without games.
    """
    accuracy = dict()
    # dict.items() exists on both Python 2 and Python 3, unlike iteritems().
    for year, df in allStats.items():
        numGames = len(df.correct_prediction)
        numCorrect = int(df.correct_prediction.sum())
        if numGames:
            pctCorrect = numCorrect * 100 / float(numGames)
        else:
            # No games to score: report "undefined" instead of dividing by zero.
            pctCorrect = float('nan')
        accuracy[year] = [numGames, numCorrect, pctCorrect]
    return accuracy
# Schedule every test-set game (allGames=True) rather than only the trailing games of each season.
gameSchedule = get_schedule(testDF, allGames=True)
# NOTE(review): the rankings are computed from the same testDF games that are
# predicted below, so the predictions are not out-of-sample -- confirm this is intended.
teamRankings = get_winners(testDF)
allStats = game_simulator(gameSchedule, teamRankings)
accuracy = check_accuracy(allStats)
# After the transpose there is one row per year; the three columns follow the
# order of the list built by check_accuracy.
accuracy = pd.DataFrame(accuracy).T
accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
# Keep only the last five rows.
accuracy = accuracy[-5:]
accuracy
num_games | num_correct_prediction | pct_correct_prediction | |
---|---|---|---|
2009 | 107 | 60 | 56.074766 |
2010 | 107 | 56 | 52.336449 |
2011 | 107 | 66 | 61.682243 |
2012 | 107 | 68 | 63.551402 |
2013 | 107 | 67 | 62.616822 |
5 rows × 3 columns
# Bar chart of the yearly percentage of correct predictions.
accuracy = accuracy[['pct_correct_prediction']]
# DataFrame.plot opens its own figure using `figsize`; calling plt.figure()
# first would only leave an extra, empty figure behind.
accuracy.plot(kind='bar', figsize=(10, 6))
# The on/off flag is passed positionally so the call does not depend on the
# name of that keyword argument.
plt.grid(True, which='major', color='gray', linestyle=':')
plt.yticks(np.arange(0, 71, 10))
plt.xlabel('Years')
plt.ylabel('Percentage Correct')
plt.title('Predict playoffs using PageRank model')
plt.show()
<matplotlib.figure.Figure at 0x1150e8350>
# Load the 2014 games and keep only the season, week, team, score and winner columns.
gamesDF = pd.read_csv('nfl_2014.csv')
cols = ['year', 'week', 'home_team', 'away_team', 'home_score', 'away_score', 'winner']
gamesDF = gamesDF[cols]
# Show the last ten games in the file.
gamesDF.tail(10)
year | week | home_team | away_team | home_score | away_score | winner | |
---|---|---|---|---|---|---|---|
246 | 2014 | 17 | WAS | DAL | 17 | 44 | DAL |
247 | 2014 | 17 | BAL | CLE | 20 | 10 | BAL |
248 | 2014 | 17 | MIA | NYJ | 24 | 37 | NYJ |
249 | 2014 | 17 | DEN | OAK | 47 | 14 | DEN |
250 | 2014 | 17 | SF | ARI | 20 | 17 | SF |
251 | 2014 | 17 | NYG | PHI | 26 | 34 | PHI |
252 | 2014 | 17 | HOU | JAC | 23 | 17 | HOU |
253 | 2014 | 17 | MIN | CHI | 13 | 9 | MIN |
254 | 2014 | 17 | TB | NO | 20 | 23 | NO |
255 | 2014 | 17 | NE | BUF | 9 | 17 | BUF |
10 rows × 7 columns
# Score the 2014 teams with the same PageRank procedure and plot the result as a bar chart.
teamRankings = get_winners(gamesDF)
teamRankings[2014].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x114cbf7d0>
# Attach the full team name to every row of the 2014 ranking table.  `temp`
# refers to the frame stored in teamRankings[2014] (no copy is made).
temp = teamRankings[2014]
names = [get_team_name(abbr) for abbr in temp.index]
temp['name'] = pd.Series(names, index=temp.index)
temp[['name', 'page_rank']]#.to_csv('page_rank_res.csv')
name | page_rank | |
---|---|---|
team | ||
SEA | Seattle Seahawks | 0.067890 |
KC | Kansas City Chiefs | 0.064077 |
NE | New England Patriots | 0.059916 |
DAL | Dallas Cowboys | 0.052141 |
DEN | Denver Broncos | 0.050597 |
MIA | Miami Dolphins | 0.042805 |
SD | San Diego Chargers | 0.042738 |
PHI | Philadelphia Eagles | 0.040979 |
BUF | Buffalo Bills | 0.040300 |
PIT | Pittsburgh Steelers | 0.039831 |
BAL | Baltimore Ravens | 0.038372 |
ARI | Arizona Cardinals | 0.037293 |
GB | Green Bay Packers | 0.037044 |
CIN | Cincinnati Bengals | 0.036969 |
IND | Indianapolis Colts | 0.036103 |
STL | St. Louis Rams | 0.034990 |
SF | San Francisco 49ers | 0.028202 |
CLE | Cleveland Browns | 0.026800 |
HOU | Houston Texans | 0.026737 |
DET | Detroit Lions | 0.025509 |
TEN | Tennessee Titans | 0.021281 |
CAR | Carolina Panthers | 0.020868 |
NO | New Orleans Saints | 0.018921 |
NYG | New York Giants | 0.016243 |
MIN | Minnesota Vikings | 0.015758 |
NYJ | New York Jets | 0.014228 |
ATL | Atlanta Falcons | 0.013226 |
OAK | Oakland Raiders | 0.011857 |
WAS | Washington Redskins | 0.011103 |
JAC | Jacksonville Jaguars | 0.010020 |
CHI | Chicago Bears | 0.009846 |
TB | Tampa Bay Buccaneers | 0.007356 |
32 rows × 2 columns
Team standings DataFrame based on the conference and division.
# Load the playoff field: one row per team with its conference ('conf'),
# standing within the conference, abbreviation ('team') and strength ('prob').
sdf = pd.read_csv('nfl_sim_data_actual.csv')
sdf
conf | standing | team | prob | |
---|---|---|---|---|
0 | NFC | 1 | SEA | 0.067890 |
1 | NFC | 2 | GB | 0.037044 |
2 | NFC | 3 | DAL | 0.052141 |
3 | NFC | 4 | CAR | 0.020868 |
4 | NFC | 5 | ARI | 0.037293 |
5 | NFC | 6 | DET | 0.025509 |
6 | AFC | 1 | NE | 0.059916 |
7 | AFC | 2 | DEN | 0.050597 |
8 | AFC | 3 | PIT | 0.039831 |
9 | AFC | 4 | IND | 0.036103 |
10 | AFC | 5 | CIN | 0.036969 |
11 | AFC | 6 | BAL | 0.038372 |
12 rows × 4 columns
# Split the playoff field by conference; mc_simulator reads these two tables.
nfc_data = sdf[sdf.conf == 'NFC']
afc_data = sdf[sdf.conf == 'AFC']
def _seed_entry(conference, seed):
    """Return (team, strength) of the team whose standing is ``seed``."""
    row = conference[conference.standing == seed]
    return row.team.values[0], round(float(row.prob.values[0]), 4)


def _draw_winners(team_a, strength_a, team_b, strength_b, draws):
    """Sample ``draws`` winners of one game; win chance is proportional to strength."""
    total = strength_a + strength_b
    return np.random.choice([team_a, team_b], draws,
                            p=[strength_a / total, strength_b / total])


def _simulate_conference(conference, strength_of, num_sim):
    """Simulate the wildcard, divisional and championship rounds of one conference."""
    seeds = {seed: _seed_entry(conference, seed) for seed in range(1, 7)}

    def wildcard(seed_a, seed_b):
        (team_a, strength_a), (team_b, strength_b) = seeds[seed_a], seeds[seed_b]
        return _draw_winners(team_a, strength_a, team_b, strength_b, num_sim)

    def divisional(challengers, bye_seed):
        bye_team, bye_strength = seeds[bye_seed]
        survivors = []
        for challenger in challengers:
            survivors.extend(_draw_winners(challenger, strength_of[challenger],
                                           bye_team, bye_strength, num_sim))
        return survivors

    # The rounds are drawn in a fixed order so that a seeded random state
    # always reproduces the same simulation.
    wildcard_1 = wildcard(4, 5)                    # Wildcard Round - 1
    wildcard_2 = wildcard(3, 6)                    # Wildcard Round - 2
    divisional_1 = divisional(wildcard_1, 1)       # Division Round - 1
    divisional_2 = divisional(wildcard_2, 2)       # Division Round - 2

    # Conference Champion
    champions = []
    for team_a, team_b in zip(divisional_1, divisional_2):
        champions.extend(_draw_winners(team_a, strength_of[team_a],
                                       team_b, strength_of[team_b], num_sim))
    return champions


def mc_simulator(num_sim=50, final_sim=2, nfc=None, afc=None, rankings=None):
    """Monte Carlo simulation of the playoff bracket.

    Each game is decided by a random draw in which a team's chance of winning
    is its strength divided by the sum of both strengths (strengths are
    rounded to 4 decimals).  The bracket is fixed: standing 4 plays 5 and
    3 plays 6, standing 1 meets the 4/5 winner, standing 2 meets the 3/6
    winner, the two survivors play for the conference, and the NFC and AFC
    champions meet in the final.

    Every simulated outcome of a round is branched into ``num_sim`` outcomes
    of the next round, so a conference yields ``num_sim ** 3`` champions and
    the function returns ``num_sim ** 3 * final_sim`` champions in total.

    Args:
        num_sim: Number of draws per game in the conference rounds.
        final_sim: Number of draws per simulated final.
        nfc: NFC table with 'standing', 'team' and 'prob' columns.  Defaults
            to the module-level ``nfc_data``.
        afc: AFC table with the same columns.  Defaults to the module-level
            ``afc_data``.
        rankings: Table indexed by team with a 'page_rank' column, used as
            the strength of teams that advanced.  Defaults to the
            module-level ``temp``.

    Returns:
        A list with the champion of every simulated final.
    """
    nfc = nfc_data if nfc is None else nfc
    afc = afc_data if afc is None else afc
    rankings = temp if rankings is None else rankings

    # Looked up once; the strengths are then read tens of thousands of times.
    strength_of = {team: round(float(score), 4)
                   for team, score in zip(rankings.index, rankings.page_rank)}

    nfc_champions = _simulate_conference(nfc, strength_of, num_sim)
    afc_champions = _simulate_conference(afc, strength_of, num_sim)

    # Superbowl Champion
    winners = []
    for nfc_team, afc_team in zip(nfc_champions, afc_champions):
        winners.extend(_draw_winners(nfc_team, strength_of[nfc_team],
                                     afc_team, strength_of[afc_team], final_sim))
    return winners
# Run the simulation and turn the list of simulated champions into win probabilities.
final_result = mc_simulator()
# value_counts tallies the titles per team, most frequent first, so no manual
# counting or separate sort is needed.
freq = pd.Series(final_result).value_counts()
# float() keeps the division fractional regardless of the integer counts.
wdf = pd.DataFrame({'win_prob': freq / float(freq.sum())}, columns=['win_prob'])
wdf
win_prob | |
---|---|
0 | |
SEA | 0.259444 |
NE | 0.192552 |
DEN | 0.139564 |
DAL | 0.101136 |
GB | 0.086048 |
PIT | 0.044592 |
ARI | 0.040720 |
BAL | 0.040340 |
CIN | 0.036212 |
IND | 0.036144 |
DET | 0.013648 |
CAR | 0.009600 |
12 rows × 1 columns