College Football PageRank Model¶

Importing necessary libraries.

In [27]:

%pylab inline

from collections import defaultdict
from IPython.display import Image

import glob
import cPickle as pickle
import math
import networkx as nx
import numpy as np
import operator
import pandas as pd
import pylab
import seaborn

Populating the interactive namespace from numpy and matplotlib

WARNING: pylab import has clobbered these variables: ['rc', 'f', 'pylab']
`%matplotlib` prevents importing * from pylab and numpy

In [28]:

from matplotlib import rc
rc("figure", facecolor="white")
rc("axes", facecolor="white")
# rc("axes", edgecolor="grey")


rc("grid", alpha=0.9)
rc("grid", linewidth=0.2)
# rc("grid", linestyle=":")


rc('font',**{'family':'serif','serif':['Palatino']})

In [29]:

def side_by_side(*objs, **kwds):
    from pandas.core.common import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print adjoin(space, *reprs)

Reading games data bought from ArmchairAnalysis into a pandas dataframe named gamedDF

In [30]:

path = r'cfblines'
allfiles = glob.glob(path + "/*.csv")
alldata = pd.DataFrame()
fileList = []
for f in allfiles:
    data1 = pd.read_csv(f)
    fileList.append(data1)
merged = pd.concat(fileList, ignore_index=True)

In [31]:

# Splitting date to get the year value
def get_year(col):
    return col.split('/')[-1]
merged['year'] = merged['Date'].apply(get_year)
merged.index += 1 
merged['game_id'] = merged.index
merged.head()

Out[31]:

	Date	Visitor	Visitor Score	Home Team	Home Score	Line	year	game_id
1	08/26/2000	Iowa	7	Kansas State	27	29.0	2000	1
2	08/26/2000	Brigham Young	3	Florida State	29	25.0	2000	2
3	08/26/2000	Miss Valley State	10	Louisiana Tech	63		2000	3
4	08/26/2000	New Mexico	3	Texas Tech	24	15.5	2000	4
5	08/27/2000	Southern Cal	29	Penn State	5	3.5	2000	5

5 rows × 8 columns

In [32]:

naming = {'year':'year',
          'Visitor':'away_team',
          'Home Team':'home_team',
          'Visitor Score':'away_score',
          'Home Score':'home_score'}
merged.rename(columns=naming, inplace=True)

In [33]:

gamesDF = merged

Selecting useful from the columns from the gamesDF and renaming them using a descriptive name.

In [34]:

def get_winner(home_score, away_score, home_team, away_team):
    """Finds out the winner of a game.
    
    Based of the score it decides who the winner of a game is.
  
    Args:
        home_score: An interger value which represents home team score.
        away_score: An interger value which represents home team score.
        home_team: A string representing the home team.
        away_team: A string representing the away team.
        
    Returns:
        A string which represents the winner of a given game.
    """
    if home_score > away_score:
        return home_team
    else:
        return away_team

In [35]:

gamesDF['winner'] = gamesDF.apply(lambda x: get_winner(x['home_score'], x['away_score'], x['home_team'], x['away_team']), axis=1)

In [36]:

gamesDF.tail()

Out[36]:

	Date	away_team	away_score	home_team	home_score	Line	year	game_id	winner
10458	12/07/2013	Duke	7	Florida State	45	29.5	2013	10458	Florida State
10459	12/07/2013	Stanford	38	Arizona State	14	3.5	2013	10459	Stanford
10460	12/07/2013	Ohio State	24	Michigan State	34	-5.5	2013	10460	Michigan State
10461	12/07/2013	Utah State	17	Fresno State	24	3.0	2013	10461	Fresno State
10462	12/14/2013	Army	7	Navy	34	12.0	2013	10462	Navy

5 rows × 9 columns

Adding a synthetic column called winner which represents winner of a given game (i.e., row of data)

In [37]:

cols = ['game_id', 'year', 'home_team', 'away_team', 'home_score', 'away_score', 'winner']
gamesDF = gamesDF[cols]
len(gamesDF.index)

Out[37]:

In [38]:

ints = ['year','home_score', 'away_score']
for c in list(gamesDF[ints].columns):
    gamesDF[c] = gamesDF[c].astype(int)

In [39]:

# gamesDF.to_csv('processed.csv')

In [40]:

def create_team_graph(teams):
    graph = nx.DiGraph()
    for i in teams.index:
        team = teams.ix[i]
        if (team.away_score <= team.home_score):
            winner = team['home_team']
            loser = team['away_team']
            point_differential = float(abs(team['away_score']- team['home_score'])) /(team['away_score']+ team['home_score'])
        else:
            winner = team['away_team']
            loser = team['home_team']
            try:
                point_differential = float(abs(team['away_score']- team['home_score'])) /(team['away_score']+ team['home_score'])
            except:
                point_differential = 0.001
        update_graph(winner, loser, point_differential, graph)
    return graph

In [41]:

def update_graph(winner, loser, point_differential, graph):    
    graph.add_node(winner)
    graph.add_node(loser)
    
    if graph.has_edge(loser, winner):
        updated_weight = graph[loser][winner]['weight'] + point_differential
    else:
        updated_weight = point_differential
        
    graph.add_edge(loser, winner, weight = updated_weight)    
    return graph

In [42]:

# Get winner for all the years
def get_winners(data):
    years = list(unique(gamesDF.year))
    result = dict()
    for year in years:
        games = data[data.year == year]
        team_network = create_team_graph(games)
        pagerank_teams = nx.pagerank_numpy(team_network)
        pagerank_teams = pd.DataFrame(pagerank_teams.items(), columns=['team', 'page_rank'])
        pagerank_teams = pd.DataFrame(pagerank_teams.ix[:, 'page_rank']).set_index(pagerank_teams.team)
        pagerank_teams = pagerank_teams.sort(columns='page_rank', ascending=False)
        result[year] = pagerank_teams
    return result

In [43]:

winners = get_winners(gamesDF)

In [44]:

gamesDF[gamesDF.year == 2011].tail()

Out[44]:

	game_id	year	home_team	away_team	home_score	away_score	winner
8840	8840	2011	Houston	Southern Miss	28	49	Southern Miss
8841	8841	2011	Louisiana State	Georgia	42	10	Louisiana State
8842	8842	2011	Clemson	Virginia Tech	38	10	Clemson
8843	8843	2011	Wisconsin	Michigan State	42	39	Wisconsin
8844	8844	2011	Navy	Army	27	21	Navy

5 rows × 7 columns

In [45]:

plt.rc('figure', figsize=(18,9))
winners[2013][:25].plot(kind='bar')
plt.show()

In [46]:

plt.rc('figure', figsize=(18,9))
winners[2012][:25].plot(kind='bar')
plt.show()

In [47]:

plt.rc('figure', figsize=(18,9))
winners[2011][:25].plot(kind='bar')
plt.show()

Game Simulation¶

The following modules simulate games in order and use rankings from Baseline model to predict winner. It also calculates predicted score difference which can later be compared with actual score diffrence to measure error.

In [48]:

def get_schedule(gamesDF, allGames=False):
    """Gets playoff game ordering.

    Args:
        gamesDF: A pandas dataframe containing games data with following columns.
            'game_id',
            'year',
            'week',
            'away_team',
            'home_team',
            'away_score',
            'home_score',
            'winner'

    Returns:
        A dictionary containing year as key and pandas dataframe containing playoff game
        ordering and scores as value.
    """
    years = set(gamesDF.year)
    gameSchedule = dict()
    for year in years:
        if allGames:
            yearlyPlayoff = gamesDF[gamesDF.year == year]
        else:
            yearlyPlayoff = gamesDF[gamesDF.year == year].tail(11)
        yearlyPlayoff['game_order'] = yearlyPlayoff.game_id.rank(ascending=True).order()
        cols = ['game_order', 'home_team', 'away_team', 'home_score', 'away_score', 'winner']
        yearlyPlayoff = yearlyPlayoff[cols]
        gameSchedule[year] = yearlyPlayoff
    return gameSchedule

In [49]:

def yearly_game_simulator(gameSchedule, teamRankings):
    """Simulates games played on a particular season and returns a dataframe which contains
    information about our accuracy.

    Args:
        gameSchedule: A pandas dataframe containing NFL playoff gameplay ordering.
        teamRankings: A pandas dataframe containing team rankings from Baseline model.

    Returns:
        A pandas dataframe which contains model prediction and its correctness when compared
        to actual data.
    """
    stats = gameSchedule.copy(deep=True)
    for i, game in gameSchedule.iterrows():
        game_order = game[0]
        home_team = game[1]
        away_team = game[2]
        home_score = game[3]
        away_score = game[4]
        winner = game[5]
        if home_team == winner:
            winner_score = home_score
            loser_score = away_score
        else:
            loser_score = home_score
            winner_score = away_score
        try:
            home_team_rank = int(teamRankings.ix[home_team]['page_rank'])
            away_team_rank = int(teamRankings.ix[away_team]['page_rank'])
            if home_team_rank >= away_team_rank:
                predicted_winner = home_team
                predicted_score_diff = home_team_rank - away_team_rank
            else:
                predicted_winner = away_team
                # home_team_rank - away_team_rankn reverse
                predicted_score_diff = home_team_rank - away_team_rank
        except:
            home_team_rank = np.NAN
            away_team_rank = np.NAN
            predicted_winner = home_team
            predicted_score_diff = np.NAN
        if predicted_winner == winner:
            correct_prediction = True
        else:
            correct_prediction = False
        score_diff = winner_score - loser_score
        stats.ix[i,'predicted_winner'] = predicted_winner
        stats.ix[i,'correct_prediction'] = correct_prediction
        stats.ix[i,'home_team_rank'] = home_team_rank
        stats.ix[i,'away_team_rank'] = away_team_rank
        stats.ix[i,'predicted_score_diff'] = predicted_score_diff
        stats.ix[i,'score_diff'] = score_diff
        stats.ix[i,'error'] = predicted_score_diff - score_diff
    return stats

In [50]:

def game_simulator(gameSchedule, teamRankings):
    """Simulates games played on all seasons and returns a dataframe which contains
    information about our accuracy.

    Args:
        gameSchedule: A dictionary containing year as key and a pandas dataframe
            containing NFL playoff gameplay ordering as value.
        teamRankings: A dictionary containing year as key and a pandas dataframe
            containing team rankings from Baseline model as value.

    Returns:
        A dictionary containing year as key and a pandas dataframe which contains
        model prediction and its correctness when compared to actual data as value.
    """
    allStats = dict()
    for year, schedule in gameSchedule.iteritems():
        rankings = teamRankings[year]
        yearlyStats = yearly_game_simulator(schedule, rankings)
        allStats[year] = yearlyStats
    return allStats

In [51]:

def check_accuracy(allStats):
    accuracy = dict()
    for year, df in allStats.iteritems():
        numGames = len(df.correct_prediction)
        numCorrect = sum(df.correct_prediction)
        pctCorrect = numCorrect*100/float(numGames)
        accuracy[year] = [numGames, numCorrect, pctCorrect]
    return accuracy

In [55]:

gameSchedule = get_schedule(gamesDF, allGames=True)
teamRankings = get_winners(gamesDF)

allStats = game_simulator(gameSchedule, teamRankings)

In [56]:

accuracy = check_accuracy(allStats)
accuracy = pd.DataFrame(accuracy).T
accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
accuracy

Out[56]:

	num_games	num_correct_prediction	pct_correct_prediction
2000	671	416	61.997019
2001	684	419	61.257310
2002	744	476	63.978495
2003	743	470	63.257066
2004	691	451	65.267728
2005	690	416	60.289855
2006	760	481	63.289474
2007	767	479	62.451108
2008	770	487	63.246753
2009	774	495	63.953488
2010	773	475	61.448900
2011	777	499	64.221364
2012	805	491	60.993789
2013	813	505	62.115621

14 rows × 3 columns

In [57]:

plt.figure(figsize=(10, 6))
plt.bar(accuracy.index, accuracy['pct_correct_prediction'])
plt.xticks(list(accuracy.index), rotation=60)
plt.yticks(np.arange(0, 101, 10))
plt.xlabel('Years')
plt.ylabel('Percentage Correct')
plt.title('Predict all games using regular season games of current year')
plt.show()

2014 Winner Prediction¶

In [67]:

path = r'cfblines/2014'
allfiles = glob.glob(path + "/wagstats2014*.csv")
alldata = pd.DataFrame()
fileList = []
for f in allfiles:
    data1 = pd.read_csv(f)
    fileList.append(data1)
merged = pd.concat(fileList, ignore_index=True)

In [68]:

# Splitting date to get the year value
def get_year(col):
    return col.split('/')[-1]
merged['year'] = merged['Date'].apply(get_year)
merged.index += 1 
merged['game_id'] = merged.index
merged.head()

Out[68]:

	Date	Vis Team	Rushing Yards	Rushing Attempts	Passing Yards	Passing Attempts	Passing Completions	Penalties	Penalty Yards	Interceptions Thrown	1st Downs	3rd Down Attempts	3rd Down Conversions	4th Down Attempts	4th Down conversions	Time of Possession	Score	Home Team	Rushing Yards.1
1	08/27/2014	Abilene Christian	95	36	403	40	30	9	86	1	26	14	7	0	0	2049	37	Georgia State	153	...
2	08/28/2014	Texas A&M	169	39	511	60	44	8	95	0	39	17	12	2	2	2258	52	South Carolina	67	...
3	08/28/2014	Wake Forest	-3	27	97	22	12	8	51	1	5	14	3	0	0	1477	10	Louisiana-Monroe	163	...
4	08/28/2014	Eastern Illinois	99	39	310	57	32	6	28	1	27	21	8	5	4	1791	20	Minnesota	182	...
5	08/28/2014	Howard	148	51	68	26	12	5	48	0	15	18	3	3	1	2005	0	Akron	113	...

5 rows × 37 columns

In [69]:

cols = ['game_id', 'Home Team', ' Score', ' Vis Team', ' Score.1', 'year']
gamesDF = merged[cols]

In [70]:

naming = {'year':'year',
          ' Vis Team':'away_team',
          'Home Team':'home_team',
          ' Score':'away_score',
          ' Score.1':'home_score'}
gamesDF.rename(columns=naming, inplace=True)

/Library/Python/2.7/site-packages/pandas-0.13.1_213_gc174c3d-py2.7-macosx-10.9-intel.egg/pandas/core/frame.py:2184: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame
  **kwargs)

In [71]:

def get_winner(home_score, away_score, home_team, away_team):
    """Finds out the winner of a game.
    
    Based of the score it decides who the winner of a game is.
  
    Args:
        home_score: An interger value which represents home team score.
        away_score: An interger value which represents home team score.
        home_team: A string representing the home team.
        away_team: A string representing the away team.
        
    Returns:
        A string which represents the winner of a given game.
    """
    if home_score > away_score:
        return home_team
    else:
        return away_team

gamesDF['winner'] = gamesDF.apply(lambda x: get_winner(x['home_score'], x['away_score'], x['home_team'], x['away_team']), axis=1)

In [72]:

ngamesDF = gamesDF.dropna(how='any')

In [73]:

winners = get_winners(gamesDF)
winners['2014']

Out[73]:

	page_rank
team
Mississippi	0.047657
Arkansas	0.040814
Oregon	0.040382
Arizona	0.040329
Georgia	0.034180
Alabama	0.030070
UCLA	0.026884
Georgia Tech	0.023484
Baylor	0.023422
Auburn	0.022786
West Virginia	0.022126
Florida	0.021623
Missouri	0.021409
Stanford	0.020294
Miami (Florida)	0.018711
Virginia Tech	0.016808
Mississippi State	0.015904
Florida State	0.015670
Duke	0.015266
North Carolina	0.014291
Ohio State	0.013245
Clemson	0.012491
TCU	0.011713
Arizona State	0.011601
LSU	0.010716
Texas	0.010165
Louisville	0.009677
Oklahoma	0.009523
Kansas State	0.008777
Boise State	0.008503
USC	0.008330
Wisconsin	0.008222
Texas A&M	0.008205
Louisiana Tech	0.007691
Virginia	0.007658
Western Kentucky	0.007081
Pittsburgh	0.006798
South Carolina	0.006763
North Carolina State	0.006387
Michigan State	0.006086
Utah State	0.006036
Air Force	0.005989
Nebraska	0.005694
Utah	0.005411
Marshall	0.005371
Tennessee	0.005215
Minnesota	0.005118
East Carolina	0.005097
Wake Forest	0.004979
Boston College	0.004839
Notre Dame	0.004822
BYU	0.004780
Oregon State	0.004766
Temple	0.004435
San Diego State	0.004381
Memphis	0.004277
UCF	0.004133
Iowa	0.004109
Akron	0.003991
Washington	0.003984
	...

209 rows × 1 columns

In [74]:

plt.rc('figure', figsize=(18,9))
winners['2014'][:25].plot(kind='bar')
plt.show()

In [96]:

teams = winners['2014']
a = []
a.append(teams[teams['team'] == ' Alabama'])
a.append(teams[teams['team'] == ' Oregon'])
a.append(teams[teams['team'] == ' Florida State'])
a.append(teams[teams['team'] == ' Ohio State'])
teams = pd.concat(a)
teams
data = teams[['team', 'page_rank']]
data = data.sort(columns=['page_rank'], ascending=False)
data.index = [1, 2, 3, 4]
data

Out[96]:

	team	page_rank
1	Oregon	0.040382
2	Alabama	0.030070
3	Florida State	0.015670
4	Ohio State	0.013245

4 rows × 2 columns

In [97]:

def mc_simulator():
    num_sim = 10000
    # NFC Wildcard Round - 1
    teams = [data[data.index == 1].team.get_values()[0], data[data.index == 4].team.get_values()[0]]
    s1 = round(data[data.index == 1].page_rank.get_values()[0], 4)
    s2 = round(data[data.index == 4].page_rank.get_values()[0], 4)
    cf1 = np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])])
    
    # NFC Wildcard Round - 2
    teams = [data[data.index == 2].team.get_values()[0], data[data.index == 3].team.get_values()[0]]
    s1 = round(data[data.index == 2].page_rank.get_values()[0], 4)
    s2 = round(data[data.index == 3].page_rank.get_values()[0], 4)
    cf2 = np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])])
    
    # Superbowl Champion
    num_sim = 10000
    winners = []
    for i in xrange(len(cf1)):
        teams = [cf1[i], cf2[i]]
        s1 = round(data[data.team == cf1[i]].page_rank.get_values()[0], 4)
        s2 = round(data[data.team == cf2[i]].page_rank.get_values()[0], 4)
        winners.extend(np.random.choice(teams, num_sim, p=[s1/sum([s1, s2]), s2/sum([s1, s2])]))
    return winners

In [98]:

final_winners = mc_simulator()

In [99]:

from collections import defaultdict
freq = defaultdict(int)
for winner in final_winners:
    freq[winner] += 1

In [100]:

wdf = pd.DataFrame(freq.items())
wdf.index = wdf[0]
wdf.columns = ['team', 'wins']
wdf['win_prob'] = wdf.wins/sum(wdf.wins)
wdf = wdf.sort(columns='win_prob', ascending=False)
wdf = wdf[['win_prob']]
wdf

Out[100]:

	win_prob
0
Oregon	0.468566
Alabama	0.324067
Florida State	0.118623
Ohio State	0.088744

4 rows × 1 columns