Importing necessary libraries.
%pylab inline
from collections import defaultdict
from IPython.display import Image
import glob
import cPickle as pickle
import math
import networkx as nx
import numpy as np
import operator
import pandas as pd
import pylab
import seaborn
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['rc', 'f', 'pylab'] `%matplotlib` prevents importing * from pylab and numpy
from matplotlib import rc
# Plain white backgrounds for both the figure canvas and the axes area.
rc("figure", facecolor="white")
rc("axes", facecolor="white")
# rc("axes", edgecolor="grey")
# Both grid settings in a single call: slightly transparent, hair-thin lines.
rc("grid", alpha=0.9, linewidth=0.2)
# rc("grid", linestyle=":")
# Serif text, with Palatino as the serif face to try first.
rc("font", family="serif", serif=["Palatino"])
def side_by_side(*objs, **kwds):
    """Print the reprs of several objects next to each other.

    Args:
        *objs: Objects to show; each one is rendered with ``repr`` and split
            into lines.
        **kwds: ``space`` (default 4) is the number of blank columns placed
            between neighbouring objects.
    """
    # adjoin is a pandas-internal helper; try the newer module path first and
    # fall back to the one this notebook was written against.
    try:
        from pandas.io.formats.printing import adjoin
    except ImportError:
        from pandas.core.common import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    # Call form of print: same output for a single string on Python 2 and 3.
    print(adjoin(space, *reprs))
Reading the games data bought from ArmchairAnalysis into a pandas DataFrame named gamesDF.
path = r'cfblines'
# glob returns matches in arbitrary order. Sorting fixes the concatenation
# order, and with it the row numbers that later become game_id.
allfiles = sorted(glob.glob(path + "/*.csv"))
alldata = pd.DataFrame()  # not referenced again in the visible code
fileList = []
for f in allfiles:
    data1 = pd.read_csv(f)
    fileList.append(data1)
# One frame for all files, renumbered 0..n-1.
merged = pd.concat(fileList, ignore_index=True)
def get_year(col):
    """Return the text after the last '/' of a date string ('08/26/2000' -> '2000')."""
    # Only the trailing component is wanted, so one split from the right suffices.
    return col.rsplit('/', 1)[-1]
# Year (still a string at this point) taken from the end of each Date value.
merged['year'] = merged['Date'].map(get_year)
# Renumber the rows from 1 and expose that number as the game identifier.
merged.index = merged.index + 1
merged['game_id'] = merged.index
merged.head()
Date | Visitor | Visitor Score | Home Team | Home Score | Line | year | game_id | |
---|---|---|---|---|---|---|---|---|
1 | 08/26/2000 | Iowa | 7 | Kansas State | 27 | 29.0 | 2000 | 1 |
2 | 08/26/2000 | Brigham Young | 3 | Florida State | 29 | 25.0 | 2000 | 2 |
3 | 08/26/2000 | Miss Valley State | 10 | Louisiana Tech | 63 | 2000 | 3 | |
4 | 08/26/2000 | New Mexico | 3 | Texas Tech | 24 | 15.5 | 2000 | 4 |
5 | 08/27/2000 | Southern Cal | 29 | Penn State | 5 | 3.5 | 2000 | 5 |
5 rows × 8 columns
# Raw CSV header -> name used by the rest of the notebook.
naming = dict([
    ('year', 'year'),
    ('Visitor', 'away_team'),
    ('Home Team', 'home_team'),
    ('Visitor Score', 'away_score'),
    ('Home Score', 'home_score'),
])
# Renamed in place, so gamesDF below is the very same object as merged.
merged.rename(columns=naming, inplace=True)
gamesDF = merged
Selecting the useful columns from gamesDF and renaming them with descriptive names.
def get_winner(home_score, away_score, home_team, away_team):
    """Return the name of the team that won a game.

    Args:
        home_score: Points scored by the home team.
        away_score: Points scored by the away team.
        home_team: Name of the home team.
        away_team: Name of the away team.

    Returns:
        home_team when the home score is strictly greater than the away
        score; away_team in every other case, equal scores included.
    """
    home_won = home_score > away_score
    return home_team if home_won else away_team
# Work out the winner of every game, one row at a time.
gamesDF['winner'] = gamesDF.apply(
    lambda game: get_winner(
        game['home_score'],
        game['away_score'],
        game['home_team'],
        game['away_team']),
    axis=1)
gamesDF.tail()
Date | away_team | away_score | home_team | home_score | Line | year | game_id | winner | |
---|---|---|---|---|---|---|---|---|---|
10458 | 12/07/2013 | Duke | 7 | Florida State | 45 | 29.5 | 2013 | 10458 | Florida State |
10459 | 12/07/2013 | Stanford | 38 | Arizona State | 14 | 3.5 | 2013 | 10459 | Stanford |
10460 | 12/07/2013 | Ohio State | 24 | Michigan State | 34 | -5.5 | 2013 | 10460 | Michigan State |
10461 | 12/07/2013 | Utah State | 17 | Fresno State | 24 | 3.0 | 2013 | 10461 | Fresno State |
10462 | 12/14/2013 | Army | 7 | Navy | 34 | 12.0 | 2013 | 10462 | Navy |
5 rows × 9 columns
Adding a synthetic column called winner which represents winner of a given game (i.e., row of data)
# Columns kept for the ranking and simulation steps; Date and Line are dropped.
cols = ['game_id', 'year', 'home_team', 'away_team', 'home_score', 'away_score', 'winner']
gamesDF = gamesDF[cols]
# Number of games in the combined data set.
len(gamesDF.index)
10462
# year comes out of a string split, so it is text until cast here; the two
# score columns are cast alongside it.
ints = ['year','home_score', 'away_score']
for c in ints:
    gamesDF[c] = gamesDF[c].astype(int)
# gamesDF.to_csv('processed.csv')
def create_team_graph(teams):
    """Build a directed loser -> winner graph from a table of games.

    Args:
        teams: pandas DataFrame with one row per game and the columns
            'home_team', 'away_team', 'home_score' and 'away_score'.

    Returns:
        A networkx DiGraph. Every game adds to the weight of the edge that
        points from the loser to the winner; the amount added is the
        normalised margin |away - home| / (away + home).
    """
    graph = nx.DiGraph()
    for _, game in teams.iterrows():
        home_score = game['home_score']
        away_score = game['away_score']
        # Equal scores are credited to the home side, as before.
        if away_score <= home_score:
            winner, loser = game['home_team'], game['away_team']
        else:
            winner, loser = game['away_team'], game['home_team']
        total_points = away_score + home_score
        if total_points == 0:
            # No points at all: the ratio is undefined, so use the small
            # fallback weight instead of dividing by zero. Only the
            # tie/home branch can get here, and it used to be unguarded.
            point_differential = 0.001
        else:
            point_differential = float(abs(away_score - home_score)) / total_points
        update_graph(winner, loser, point_differential, graph)
    return graph
def update_graph(winner, loser, point_differential, graph):
    """Record one game result on the graph and return the graph.

    Both teams are added as nodes (winner first). The directed edge
    loser -> winner carries a 'weight' that accumulates point_differential
    over repeated games with the same outcome.
    """
    for team in (winner, loser):
        graph.add_node(team)
    weight = point_differential
    if graph.has_edge(loser, winner):
        # Same result seen before: stack the new margin on the stored weight.
        weight = graph[loser][winner]['weight'] + point_differential
    graph.add_edge(loser, winner, weight=weight)
    return graph
# Get winner for all the years
def get_winners(data):
    """Rank the teams of every season in ``data`` by PageRank.

    Args:
        data: pandas DataFrame of games with a 'year' column plus the
            columns read by create_team_graph.

    Returns:
        A dict that maps each year present in ``data`` to a DataFrame indexed
        by team name (index name 'team') with a single 'page_rank' column,
        sorted from the highest score to the lowest.
    """
    result = dict()
    # The seasons come from the argument itself; reading them from the global
    # gamesDF made the function ignore whatever frame the caller passed in.
    for year in data.year.unique():
        games = data[data.year == year]
        team_network = create_team_graph(games)
        scores = nx.pagerank_numpy(team_network)
        # Sort with plain Python so the ordering step does not depend on a
        # particular pandas sorting method.
        ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        result[year] = pd.DataFrame(
            {'page_rank': [score for _, score in ranked]},
            index=pd.Index([team for team, _ in ranked], name='team'))
    return result
# Per-season PageRank tables, keyed by year.
winners = get_winners(gamesDF)
# Peek at the last games recorded for the 2011 season.
gamesDF[gamesDF.year == 2011].tail()
game_id | year | home_team | away_team | home_score | away_score | winner | |
---|---|---|---|---|---|---|---|
8840 | 8840 | 2011 | Houston | Southern Miss | 28 | 49 | Southern Miss |
8841 | 8841 | 2011 | Louisiana State | Georgia | 42 | 10 | Louisiana State |
8842 | 8842 | 2011 | Clemson | Virginia Tech | 38 | 10 | Clemson |
8843 | 8843 | 2011 | Wisconsin | Michigan State | 42 | 39 | Wisconsin |
8844 | 8844 | 2011 | Navy | Army | 27 | 21 | Navy |
5 rows × 7 columns
# Bar chart of the 25 best-ranked teams for each of the last three seasons,
# newest first; every chart gets its own wide figure.
for season in (2013, 2012, 2011):
    plt.rc('figure', figsize=(18,9))
    winners[season][:25].plot(kind='bar')
    plt.show()
The following functions simulate games in order and use the rankings from the Baseline model to predict each winner. They also calculate a predicted score difference, which can later be compared with the actual score difference to measure the error.
def get_schedule(gamesDF, allGames=False, numPlayoffGames=11):
    """Build the per-season game ordering.

    Args:
        gamesDF: A pandas dataframe of games with the columns 'game_id',
            'year', 'home_team', 'away_team', 'home_score', 'away_score'
            and 'winner'.
        allGames: When True every game of a season is kept; otherwise only
            the last numPlayoffGames rows of each season are kept.
        numPlayoffGames: How many trailing games of a season to keep when
            allGames is False.

    Returns:
        A dictionary with the year as key and, as value, a dataframe with
        the columns 'game_order', 'home_team', 'away_team', 'home_score',
        'away_score' and 'winner'. 'game_order' is the rank of game_id
        within the selected games of that year.
    """
    cols = ['game_order', 'home_team', 'away_team', 'home_score', 'away_score', 'winner']
    gameSchedule = dict()
    for year in set(gamesDF.year):
        yearlyGames = gamesDF[gamesDF.year == year]
        if not allGames:
            yearlyGames = yearlyGames.tail(numPlayoffGames)
        # Explicit copy: the new column must never be written into a slice
        # of the caller's frame.
        yearlyGames = yearlyGames.copy()
        # Column assignment aligns on the index, so sorting the ranks first
        # (the former trailing .order()) had no effect and is dropped.
        yearlyGames['game_order'] = yearlyGames.game_id.rank(ascending=True)
        gameSchedule[year] = yearlyGames[cols]
    return gameSchedule
def yearly_game_simulator(gameSchedule, teamRankings):
    """Predict every game of one season from the rankings and score the result.

    The team with the higher page_rank is predicted to win; equal values go
    to the home team. When either team has no usable entry in the rankings,
    the home team is predicted and both ranks and the predicted difference
    are NaN.

    Args:
        gameSchedule: A pandas dataframe with the columns 'home_team',
            'away_team', 'home_score', 'away_score' and 'winner' (the layout
            produced by get_schedule).
        teamRankings: A pandas dataframe indexed by team name with a
            'page_rank' column.

    Returns:
        A deep copy of gameSchedule with these columns added:
        'predicted_winner', 'correct_prediction', 'home_team_rank',
        'away_team_rank', 'predicted_score_diff' (rank of the predicted
        winner minus rank of the other team), 'score_diff' (actual winner's
        score minus loser's score) and 'error' (predicted_score_diff minus
        score_diff). An empty schedule is returned without extra columns.
    """
    stats = gameSchedule.copy(deep=True)
    page_rank = teamRankings['page_rank']
    added_columns = ['predicted_winner', 'correct_prediction', 'home_team_rank',
                     'away_team_rank', 'predicted_score_diff', 'score_diff', 'error']
    results = []
    for _, game in gameSchedule.iterrows():
        home_team = game['home_team']
        away_team = game['away_team']
        winner = game['winner']
        if home_team == winner:
            winner_score, loser_score = game['home_score'], game['away_score']
        else:
            winner_score, loser_score = game['away_score'], game['home_score']
        try:
            # page_rank values are fractions well below 1, so they must stay
            # floats: int() truncated every one of them to 0, which made the
            # home team the predicted winner of every single game.
            home_team_rank = float(page_rank[home_team])
            away_team_rank = float(page_rank[away_team])
        except (KeyError, TypeError, ValueError):
            # Team missing from (or unusable in) the rankings: fall back to
            # the home team.
            home_team_rank = away_team_rank = np.nan
            predicted_winner = home_team
            predicted_score_diff = np.nan
        else:
            if home_team_rank >= away_team_rank:
                predicted_winner = home_team
            else:
                predicted_winner = away_team
            # Always favourite minus underdog; the away-favoured case used
            # to come out with the opposite sign.
            predicted_score_diff = abs(home_team_rank - away_team_rank)
        score_diff = winner_score - loser_score
        results.append((predicted_winner,
                        predicted_winner == winner,
                        home_team_rank,
                        away_team_rank,
                        predicted_score_diff,
                        score_diff,
                        predicted_score_diff - score_diff))
    # zip(*results) yields nothing for an empty schedule, so no columns are
    # added in that case.
    for name, values in zip(added_columns, zip(*results)):
        stats[name] = list(values)
    return stats
def game_simulator(gameSchedule, teamRankings):
    """Run yearly_game_simulator for every season.

    Args:
        gameSchedule: A dictionary containing year as key and a pandas
            dataframe with that season's game ordering as value.
        teamRankings: A dictionary containing year as key and a pandas
            dataframe with that season's team rankings from the Baseline
            model as value. It must have an entry for every year in
            gameSchedule, otherwise a KeyError is raised.

    Returns:
        A dictionary containing year as key and the dataframe returned by
        yearly_game_simulator for that season as value.
    """
    allStats = dict()
    # .items() exists on Python 2 and 3 alike; iteritems is Python 2 only.
    for year, schedule in gameSchedule.items():
        allStats[year] = yearly_game_simulator(schedule, teamRankings[year])
    return allStats
def check_accuracy(allStats):
    """Summarise prediction accuracy per season.

    Args:
        allStats: A dictionary containing year as key and, as value, a pandas
            dataframe with a boolean 'correct_prediction' column.

    Returns:
        A dictionary containing year as key and the list
        [number of games, number of correct predictions, percentage correct]
        as value. The percentage is NaN for a season without games.
    """
    accuracy = dict()
    # .items() exists on Python 2 and 3 alike; iteritems is Python 2 only.
    for year, df in allStats.items():
        numGames = len(df.correct_prediction)
        numCorrect = sum(df.correct_prediction)
        if numGames:
            pctCorrect = numCorrect*100/float(numGames)
        else:
            # No games: report NaN rather than dividing by zero.
            pctCorrect = float('nan')
        accuracy[year] = [numGames, numCorrect, pctCorrect]
    return accuracy
# End-to-end evaluation: schedule every game, rank the teams, simulate, score.
gameSchedule = get_schedule(gamesDF, allGames=True)
# NOTE(review): the rankings are built from the same gamesDF whose games are
# then predicted, so this looks like an in-sample measurement - confirm intent.
teamRankings = get_winners(gamesDF)
allStats = game_simulator(gameSchedule, teamRankings)
accuracy = check_accuracy(allStats)
# The dict holds one 3-item list per year; transposing gives one row per year.
accuracy = pd.DataFrame(accuracy).T
accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
accuracy
num_games | num_correct_prediction | pct_correct_prediction | |
---|---|---|---|
2000 | 671 | 416 | 61.997019 |
2001 | 684 | 419 | 61.257310 |
2002 | 744 | 476 | 63.978495 |
2003 | 743 | 470 | 63.257066 |
2004 | 691 | 451 | 65.267728 |
2005 | 690 | 416 | 60.289855 |
2006 | 760 | 481 | 63.289474 |
2007 | 767 | 479 | 62.451108 |
2008 | 770 | 487 | 63.246753 |
2009 | 774 | 495 | 63.953488 |
2010 | 773 | 475 | 61.448900 |
2011 | 777 | 499 | 64.221364 |
2012 | 805 | 491 | 60.993789 |
2013 | 813 | 505 | 62.115621 |
14 rows × 3 columns
# Bar chart of the yearly percentage of correctly predicted games.
plt.figure(figsize=(10, 6))
plt.bar(accuracy.index, accuracy['pct_correct_prediction'])
# One tick per year, rotated by 60 degrees.
plt.xticks(list(accuracy.index), rotation=60)
# Fixed 0-100 scale in steps of 10.
plt.yticks(np.arange(0, 101, 10))
plt.xlabel('Years')
plt.ylabel('Percentage Correct')
plt.title('Predict all games using regular season games of current year')
plt.show()
path = r'cfblines/2014'
# glob returns matches in arbitrary order. Sorting fixes the concatenation
# order, and with it the row numbers that later become game_id.
allfiles = sorted(glob.glob(path + "/wagstats2014*.csv"))
alldata = pd.DataFrame()  # not referenced again in the visible code
fileList = []
for f in allfiles:
    data1 = pd.read_csv(f)
    fileList.append(data1)
# One frame for all 2014 files, renumbered 0..n-1.
merged = pd.concat(fileList, ignore_index=True)
def get_year(col):
    """Return the text after the last '/' of a date string ('08/27/2014' -> '2014')."""
    # Only the trailing component is wanted, so one split from the right suffices.
    return col.rsplit('/', 1)[-1]
# Year (kept as a string) taken from the end of each Date value.
merged['year'] = merged['Date'].map(get_year)
# Renumber the rows from 1 and expose that number as the game identifier.
merged.index = merged.index + 1
merged['game_id'] = merged.index
merged.head()
Date | Vis Team | Rushing Yards | Rushing Attempts | Passing Yards | Passing Attempts | Passing Completions | Penalties | Penalty Yards | Fumbles Lost | Interceptions Thrown | 1st Downs | 3rd Down Attempts | 3rd Down Conversions | 4th Down Attempts | 4th Down conversions | Time of Possession | Score | Home Team | Rushing Yards.1 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 08/27/2014 | Abilene Christian | 95 | 36 | 403 | 40 | 30 | 9 | 86 | 0 | 1 | 26 | 14 | 7 | 0 | 0 | 2049 | 37 | Georgia State | 153 | ... |
2 | 08/28/2014 | Texas A&M | 169 | 39 | 511 | 60 | 44 | 8 | 95 | 0 | 0 | 39 | 17 | 12 | 2 | 2 | 2258 | 52 | South Carolina | 67 | ... |
3 | 08/28/2014 | Wake Forest | -3 | 27 | 97 | 22 | 12 | 8 | 51 | 0 | 1 | 5 | 14 | 3 | 0 | 0 | 1477 | 10 | Louisiana-Monroe | 163 | ... |
4 | 08/28/2014 | Eastern Illinois | 99 | 39 | 310 | 57 | 32 | 6 | 28 | 0 | 1 | 27 | 21 | 8 | 5 | 4 | 1791 | 20 | Minnesota | 182 | ... |
5 | 08/28/2014 | Howard | 148 | 51 | 68 | 26 | 12 | 5 | 48 | 0 | 0 | 15 | 18 | 3 | 3 | 1 | 2005 | 0 | Akron | 113 | ... |
5 rows × 37 columns
# Only the identifying columns and the two final scores are needed.
cols = ['game_id', 'Home Team', ' Score', ' Vis Team', ' Score.1', 'year']
# Raw stats-file header -> name used by the rest of the notebook.
naming = {'year':'year',
          ' Vis Team':'away_team',
          'Home Team':'home_team',
          ' Score':'away_score',
          ' Score.1':'home_score'}
# Select and rename in one non-mutating step. Renaming the selection in place
# is what raised SettingWithCopyWarning; rename() without inplace returns a
# new, independent frame.
gamesDF = merged[cols].rename(columns=naming)
/Library/Python/2.7/site-packages/pandas-0.13.1_213_gc174c3d-py2.7-macosx-10.9-intel.egg/pandas/core/frame.py:2184: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame **kwargs)
def get_winner(home_score, away_score, home_team, away_team):
    """Return the name of the team that won a game.

    Args:
        home_score: Points scored by the home team.
        away_score: Points scored by the away team.
        home_team: Name of the home team.
        away_team: Name of the away team.

    Returns:
        home_team when the home score is strictly greater than the away
        score; away_team in every other case, equal scores included.
    """
    home_won = home_score > away_score
    return home_team if home_won else away_team
gamesDF['winner'] = gamesDF.apply(lambda x: get_winner(x['home_score'], x['away_score'], x['home_team'], x['away_team']), axis=1)
# Discard games with any missing field before ranking.
ngamesDF = gamesDF.dropna(how='any')
# Rank on the cleaned frame. It used to be built and then ignored, so a game
# with a missing score would have fed NaN into the graph weights.
winners = get_winners(ngamesDF)
# year was never cast to int for this data set, hence the string key.
winners['2014']
page_rank | |
---|---|
team | |
Mississippi | 0.047657 |
Arkansas | 0.040814 |
Oregon | 0.040382 |
Arizona | 0.040329 |
Georgia | 0.034180 |
Alabama | 0.030070 |
UCLA | 0.026884 |
Georgia Tech | 0.023484 |
Baylor | 0.023422 |
Auburn | 0.022786 |
West Virginia | 0.022126 |
Florida | 0.021623 |
Missouri | 0.021409 |
Stanford | 0.020294 |
Miami (Florida) | 0.018711 |
Virginia Tech | 0.016808 |
Mississippi State | 0.015904 |
Florida State | 0.015670 |
Duke | 0.015266 |
North Carolina | 0.014291 |
Ohio State | 0.013245 |
Clemson | 0.012491 |
TCU | 0.011713 |
Arizona State | 0.011601 |
LSU | 0.010716 |
Texas | 0.010165 |
Louisville | 0.009677 |
Oklahoma | 0.009523 |
Kansas State | 0.008777 |
Boise State | 0.008503 |
USC | 0.008330 |
Wisconsin | 0.008222 |
Texas A&M | 0.008205 |
Louisiana Tech | 0.007691 |
Virginia | 0.007658 |
Western Kentucky | 0.007081 |
Pittsburgh | 0.006798 |
South Carolina | 0.006763 |
North Carolina State | 0.006387 |
Michigan State | 0.006086 |
Utah State | 0.006036 |
Air Force | 0.005989 |
Nebraska | 0.005694 |
Utah | 0.005411 |
Marshall | 0.005371 |
Tennessee | 0.005215 |
Minnesota | 0.005118 |
East Carolina | 0.005097 |
Wake Forest | 0.004979 |
Boston College | 0.004839 |
Notre Dame | 0.004822 |
BYU | 0.004780 |
Oregon State | 0.004766 |
Temple | 0.004435 |
San Diego State | 0.004381 |
Memphis | 0.004277 |
UCF | 0.004133 |
Iowa | 0.004109 |
Akron | 0.003991 |
Washington | 0.003984 |
... |
209 rows × 1 columns
# Bar chart of the 25 best-ranked teams of 2014 on a wide figure.
plt.rc('figure', figsize=(18,9))
winners['2014'][:25].plot(kind='bar')
plt.show()
# get_winners returns frames indexed by team name. Move that index into a
# regular 'team' column; the filters and the column selection below need it.
teams = winners['2014'].reset_index()
# Pick the four playoff teams out of the full ranking.
# NOTE(review): the names carry a leading space, presumably matching the raw
# 2014 values - confirm against the data.
a = []
a.append(teams[teams['team'] == ' Alabama'])
a.append(teams[teams['team'] == ' Oregon'])
a.append(teams[teams['team'] == ' Florida State'])
a.append(teams[teams['team'] == ' Ohio State'])
teams = pd.concat(a)
teams
data = teams[['team', 'page_rank']]
# Best PageRank first; the new index 1-4 then serves as the seed number.
data = data.sort(columns=['page_rank'], ascending=False)
data.index = [1, 2, 3, 4]
data
team | page_rank | |
---|---|---|
1 | Oregon | 0.040382 |
2 | Alabama | 0.030070 |
3 | Florida State | 0.015670 |
4 | Ohio State | 0.013245 |
4 rows × 2 columns
def mc_simulator(seeds=None, num_sim=10000):
    """Monte Carlo simulation of a four-team, two-round knockout bracket.

    Seed 1 plays seed 4 and seed 2 plays seed 3; the two winners meet in the
    final. In every game a team wins with probability proportional to its
    page_rank rounded to 4 decimals.

    Each simulated bracket now contributes exactly one champion. Previously
    every one of the num_sim brackets was followed by another num_sim draws
    of its final, which produced num_sim ** 2 results for the same
    probability estimate.

    Args:
        seeds: DataFrame with 'team' and 'page_rank' columns whose index
            holds the seed numbers 1 to 4. Defaults to the module-level
            ``data`` frame.
        num_sim: Number of complete brackets to simulate.

    Returns:
        A list with the champion of every simulated bracket (length num_sim).
    """
    if seeds is None:
        seeds = data

    def seeded(number):
        # (team name, rounded strength) of the team holding this seed.
        row = seeds[seeds.index == number]
        return row.team.values[0], round(float(row.page_rank.values[0]), 4)

    def semifinal(high_seed, low_seed):
        # num_sim independent outcomes of one semifinal.
        (team_a, strength_a), (team_b, strength_b) = seeded(high_seed), seeded(low_seed)
        total = strength_a + strength_b
        return np.random.choice([team_a, team_b], num_sim,
                                p=[strength_a / total, strength_b / total])

    # Semifinals: 1 v 4 and 2 v 3.
    cf1 = semifinal(1, 4)
    cf2 = semifinal(2, 3)

    # Final: one draw per bracket, all brackets at once.
    strength = dict((team, round(float(rank), 4))
                    for team, rank in zip(seeds.team.values, seeds.page_rank.values))
    s1 = np.array([strength[team] for team in cf1])
    s2 = np.array([strength[team] for team in cf2])
    first_wins = np.random.random_sample(num_sim) < s1 / (s1 + s2)
    return np.where(first_wins, cf1, cf2).tolist()
# Simulated champions, one entry per draw.
final_winners = mc_simulator()
from collections import defaultdict
# Tally how many simulated titles each team won.
freq = defaultdict(int)
for winner in final_winners:
    freq[winner] += 1
wdf = pd.DataFrame(freq.items())
# Column 0 holds the team names at this point, so it has to be copied into
# the index before the columns are renamed on the next line.
wdf.index = wdf[0]
wdf.columns = ['team', 'wins']
# Share of all simulated titles won by each team.
wdf['win_prob'] = wdf.wins/sum(wdf.wins)
wdf = wdf.sort(columns='win_prob', ascending=False)
# Keep only the probability column for display.
wdf = wdf[['win_prob']]
wdf
win_prob | |
---|---|
0 | |
Oregon | 0.468566 |
Alabama | 0.324067 |
Florida State | 0.118623 |
Ohio State | 0.088744 |
4 rows × 1 columns