import os
import dateutil
import json
import networkx as nx # read it in with NX
import igraph as ig # process it in Igraph
import numpy as np
import pandas as pd
import statsmodels.api as sm
import scipy.stats as sps
from sklearn import linear_model
from patsy import dmatrices, dmatrix
We're going to download our NFL data from Armchair Analysis.
!mkdir -p data
if not os.path.exists('data/NFL_Data_Historical.zip'):
!curl -o "data/#1" http://armchairanalysis.com/{NFL_Data_Historical.zip}
if not os.path.exists('data/GAMES.csv'):
!unzip -d data data/NFL_Data_Historical.zip
armchair_games = pd.read_csv('data/GAMES.csv', index_col=0)
The 2012 games are not included, so we'll need to append them. This is kind of a mess because the Pro Football Reference data are weird.
games2012 = pd.read_csv('data/nfl2012.csv')
teams = sorted(armchair_games['H'].unique())
# These just happen to line up except for Seattle/SF
team_names = {name: team for name, team in zip(sorted(games2012['Loser'].unique()), teams)}
team_names['San Francisco 49ers'] = 'SF'
team_names['Seattle Seahawks'] = 'SEA'
games2012['H'] = games2012.apply(lambda x: x['Loser'] if x['Unnamed: 5'] == '@' else x['Winner'] , axis=1)
games2012['V'] = games2012.apply(lambda x: x['Winner'] if x['Unnamed: 5'] == '@' else x['Loser'] , axis=1)
games2012['H'] = games2012['H'].apply(team_names.get)
games2012['V'] = games2012['V'].apply(team_names.get)
games2012['PTSH'] = games2012.apply(lambda x: x['PtsL'] if x['Unnamed: 5'] == '@' else x['PtsW'] , axis=1)
games2012['PTSV'] = games2012.apply(lambda x: x['PtsW'] if x['Unnamed: 5'] == '@' else x['PtsL'] , axis=1)
games2012['SEAS'] = 2012
max_gid = int(armchair_games.index.astype(int).max())
games2012.index = range(max_gid+1, max_gid+1+games2012.shape[0])
games2012.index.name = 'GID'
games2012 = games2012.rename(columns={'Week': 'WEEK'})[['SEAS', 'WEEK', 'H', 'V', 'PTSH', 'PTSV']]
games = pd.concat([armchair_games, games2012])
games.to_csv('web/all.games.csv')
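A quick, illustrative sanity check (not in the original flow) that the 2012 rows made it in:
games.groupby('SEAS').size().tail()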
These utility functions will transform a Pandas dataframe into a graph, a matrix, or a set of model matrices for use with a Statsmodels/Sklearn supervised model.
teams = sorted(games['H'].unique())
team_ids = {team: i for i, team in enumerate(teams)}
def make_graph(df):
"""Turn a set of games into a graph representation."""
# convert teams to ints for igraph
edges = []
for gid, row in df.iterrows():
if row['PTSH'] > row['PTSV']:
winner = row['H']
loser = row['V']
else:
winner = row['V']
loser = row['H']
edges.append((team_ids[loser], team_ids[winner]))
return ig.Graph(edges=edges, directed=True, vertex_attrs={'name': teams})
def make_matrix(df, elements=lambda x: (1, 0) if x['PTSH'] > x['PTSV'] else (0, 1)):
"""Turn a set of games into a matrix representation."""
k = len(teams)
mat = np.zeros((k, k))
    for ix, row in df.iterrows():
        a, b = elements(row)
        # note: a rematch overwrites the earlier meeting (assignment, not +=)
        mat[team_ids[row['H']], team_ids[row['V']]] = a
        mat[team_ids[row['V']], team_ids[row['H']]] = b
return mat
def make_model_matrix(df):
dummies = {team: (df['H'] == team).astype(np.int) - (df['V'] == team).astype(np.int)
for team in teams}
df2 = pd.DataFrame(dummies)
df2['win'] = (df['PTSH'] > df['PTSV']).astype(np.int)
y, X = dmatrices('win ~ 0 + %s' % ' + '.join(teams), df2)
return y, X
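To make the conventions concrete, here's a throwaway two-game schedule pushed through each helper. This is purely illustrative; the matchups are picked so that every vertex id up to 31 exists, which make_graph's vertex_attrs requires.
toy = pd.DataFrame({'H': [teams[0], teams[-1]], 'V': [teams[1], teams[-2]],
                    'PTSH': [21, 14], 'PTSV': [17, 28]})
g = make_graph(toy)            # directed graph; each edge points loser -> winner
m = make_matrix(toy)           # 32x32; m[i, j] = 1 if team i beat team j
y, X = make_model_matrix(toy)  # patsy response/design for the logit ranker below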
import requests
from BeautifulSoup import BeautifulSoup as Soup
def fetch_power_rankings(year, week):
"""Get the rankings from ESPN for a given year/week."""
url = 'http://espn.go.com/nfl/powerrankings/_/year/%i/week/%i' % (year, week)
page = requests.get(url)
soup = Soup(page.content)
raw = [(int(td.text), td.parent.findAll('a')[0].get('href')) for td in soup.findAll(attrs={'class': 'pr-rank'})]
ranks = {}
for rank, url in raw:
ranks[url.split('/')[-2].upper()] = rank
    # ESPN abbreviates Washington as WSH; the Armchair data use WAS
    ranks['WAS'] = ranks.pop('WSH')
return ranks
if not os.path.exists('data/espn.rankings.json'):
    espn_rankings = {}
    for year in range(2002, 2013):
        espn_rankings[year] = {}
        for week in (9, 18):
            espn_rankings[year][week] = fetch_power_rankings(year, week)
    with open('data/espn.rankings.json', 'w') as f:
        json.dump(espn_rankings, f)
# reload from disk either way, so the keys are strings as espn_rank expects
with open('data/espn.rankings.json') as f:
    espn_rankings = json.load(f)
def espn_rank(df):
week = df['WEEK'].max()
season = df['SEAS'].max()
return espn_rankings[str(season)][str(week+1)]
We'll make a dataframe for the 2012 season to use as an example.
g12 = games[(games['SEAS'] == 2012)*(games['WEEK'] < 18)]
def rank_to_vector(rank):
return np.array([rank[team] for team in teams])
def zero_one_loss(df, rank):
    """Fraction of games in which the better-ranked team won (higher is better)."""
    homebetter = df.H.map(rank.get) < df.V.map(rank.get)
    homewin = df.PTSH > df.PTSV
    return (homewin * homebetter + (1-homewin)*(1-homebetter)).sum() / float(df.shape[0])
def misrank_loss(df, rank):
    winner = df.apply(lambda x: x['H'] if x['PTSH'] > x['PTSV'] else x['V'], axis=1)
    loser = df.apply(lambda x: x['V'] if x['PTSH'] > x['PTSV'] else x['H'], axis=1)
    winner_rank = winner.map(rank.get)
    loser_rank = loser.map(rank.get)
    # total number of rank places by which winners sat below their losers
    return ((winner_rank > loser_rank)*(winner_rank - loser_rank)).sum()
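A tiny hand-made example (hypothetical scores, real team abbreviations) shows the conventions: zero_one_loss is really an accuracy, while misrank_loss totals how far winners were ranked below their losers.
toy = pd.DataFrame({'H': ['NE', 'NYJ'], 'V': ['BUF', 'MIA'],
                    'PTSH': [30, 10], 'PTSV': [20, 17]})
toy_rank = {'NE': 1, 'MIA': 2, 'NYJ': 3, 'BUF': 4}
zero_one_loss(toy, toy_rank)  # 1.0: the better-ranked team won both games
misrank_loss(toy, toy_rank)   # 0: no winner was ranked below its loser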
def eval_espn_similarity(rank_func, year, week):
    through_week = games[(games['SEAS'] == year)*(games['WEEK'] <= week)]
    ours = rank_to_vector(rank_func(through_week))
    theirs = rank_to_vector(espn_rankings[str(year)][str(week+1)])
    return sps.kendalltau(ours, theirs)[0]
def eval_description(rank_func, year):
season = games[(games['SEAS'] == year)*(games['WEEK'] < 18)]
rank = rank_func(season)
return zero_one_loss(season, rank)
def eval_in_season_prediction(rank_func, year):
first_half = games[(games['SEAS'] == year)*(games['WEEK'] < 9)]
second_half = games[(games['SEAS'] == year)*(games['WEEK'] < 18)*(games['WEEK'] >= 9)]
rank = rank_func(first_half)
return zero_one_loss(second_half, rank)
def eval_postseason_prediction(rank_func, year):
in_season = games[(games['SEAS'] == year)*(games['WEEK'] < 18)]
post_season = games[(games['SEAS'] == year)*(games['WEEK'] >= 18)]
rank = rank_func(in_season)
return zero_one_loss(post_season, rank)
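As an illustrative baseline (not part of the original analysis), feeding a fixed alphabetical ranking through the descriptive evaluation should land near 0.5, since an arbitrary ordering calls about half the games correctly.
alphabetical = {team: i for i, team in enumerate(teams)}
eval_description(lambda df: alphabetical, 2011)  # expect roughly 0.5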
def wins(df):
    winner = df.apply(lambda x: x['H'] if x['PTSH'] > x['PTSV'] else x['V'], axis=1)
    # teams with no wins default to the bottom rank
    w = {team: 31 for team in df['H'].unique()}
    # value_counts() sorts descending, so the winningest team gets rank 0
    for rank, (team, ws) in enumerate(winner.value_counts().iteritems()):
        w[team] = rank
    return w
pythagorean wins = $\frac{p^{\beta}}{p^{\beta} + q^{\beta}}$, where $p$ is points scored and $q$ is points allowed.
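For example, a team that scores 400 points and allows 300 gets $\frac{400^{2.37}}{400^{2.37} + 300^{2.37}} \approx 0.66$, i.e. about two thirds of a win expected per game.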
def pythagorean_wins(df, beta=2.37):
    home = df.groupby('H')[['PTSH', 'PTSV']].sum().rename(columns={'PTSH': 'PTSFOR', 'PTSV': 'PTSAG'})
    away = df.groupby('V')[['PTSH', 'PTSV']].sum().rename(columns={'PTSV': 'PTSFOR', 'PTSH': 'PTSAG'})
    # stack home and away totals, then sum per team
    newdf = pd.concat([home, away]).groupby(level=0).sum()
    winrate = newdf['PTSFOR']**beta / (newdf['PTSFOR']**beta + newdf['PTSAG']**beta)
    # negate so rank 1 goes to the highest expected win rate
    return dict((winrate*-1).rank().astype(int).iteritems())
Power iteration code comes from The Glowing Python's blog post.
PageRank with no damping (damping = 1) should be equivalent to taking the largest eigenvector.
def pagerank(g, damping=1):
values = g.pagerank(damping=damping)
g_pr = {vert['name']: val for vert, val in zip(g.vs, values)}
return g_pr #{key: i for i, (key, v) in enumerate(sorted(g_pr.iteritems(), key=lambda x: x[1]))}
def power_iteration(m, iters=10000, x0=None):
    if x0 is None:
x0 = np.ones(m.shape[0])
x0 /= np.linalg.norm(x0, 1)
for i in range(iters):
x0 = np.dot(m,x0)
x0 /= np.linalg.norm(x0,1)
scores = {team: x0[i] for team, i in team_ids.iteritems()}
return {team: rank for rank, (team, score) in enumerate(sorted(scores.iteritems(), key=lambda x: x[1], reverse=True))}
def eigen_rank(df):
m = make_matrix(df)
return power_iteration(m)
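As a rough check on the claim above (a sketch; damping is set to 0.9999 rather than exactly 1 to keep igraph's computation well-behaved):
pr_scores = pagerank(make_graph(g12), damping=0.9999)
ei_ranks = eigen_rank(g12)
# negate the ranks so higher = better on both sides; tau near 1 supports the claim
sps.kendalltau([pr_scores[t] for t in teams], [-ei_ranks[t] for t in teams])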
def eigen_rank_scores(df):
"""Uses points scored instead of dummy variables for wins"""
m = make_matrix(df, elements=lambda x: (x['PTSH'], x['PTSV']))
return power_iteration(m)
def score_transform(s1, s2):
x = (s1 + 1.0) / (s1 + s2 + 2.0)
return 0.5 + 0.5 * np.sign(x - 0.5) * np.sqrt(np.abs(2*x - 1))
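A few illustrative values show the diminishing returns for running up the score:
score_transform(24.0, 21.0)  # ~0.63 for a close win
score_transform(28.0, 7.0)   # ~0.88 for a comfortable one
score_transform(56.0, 7.0)   # ~0.93: doubling the margin adds very little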
def eigen_rank_scores_nonlinear(df):
"""Uses a transformed version of score with gives less credit for blowouts
See Keener (1993).
"""
m = make_matrix(df, elements=lambda x: (score_transform(x['PTSH'], x['PTSV']), score_transform(x['PTSV'], x['PTSH'])))
return power_iteration(m)
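Fitting a logistic regression on the +1/-1 team columns with no intercept is (up to sklearn's L2 regularization) the Bradley-Terry-Luce model: each coefficient is a team's log-strength, and sorting the coefficients gives the ranking.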
def logit_rank(df, c=1.0):
y, X = make_model_matrix(df)
clf = linear_model.LogisticRegression(C=c, penalty='l2', tol=1e-6, fit_intercept=False)
    clf.fit(X, np.ravel(y))
return {team: rank for rank, (team, score) in enumerate(sorted(zip(teams, clf.coef_[0]), key=lambda x: x[1], reverse=True))}
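The last ranker treats ranking as an optimization problem on the win graph: removing a minimum feedback arc set deletes the fewest "upset" edges needed to make the graph acyclic, and a topological sort of what remains orders the teams so that as few game results as possible are contradicted.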
def optimal_rank(df):
g = make_graph(df)
feedback_arcs = g.feedback_arc_set(method='exact')
g.delete_edges(feedback_arcs)
g_sort = g.topological_sorting()
return {g.vs[ix]['name']: i for i, ix in enumerate(reversed(g_sort))}
rankers = {
'wins': wins,
'pythagorean wins': pythagorean_wins,
'eigenvector': eigen_rank,
'eigenvector2': eigen_rank_scores_nonlinear,
'btl': logit_rank,
'optimal': optimal_rank,
'espn': espn_rank,
}
evaluations = {
'descriptive': eval_description,
'predict_in_season': eval_in_season_prediction,
'predict_postseason': eval_postseason_prediction,
'espn_similarity8': lambda rank, year: eval_espn_similarity(rank, year, 8),
'espn_similarity17': lambda rank, year: eval_espn_similarity(rank, year, 17)
}
results = []
for name, ranker in rankers.iteritems():
for eval_name, evaluation in evaluations.iteritems():
for year in range(2002, 2013):
metric = evaluation(ranker, year)
results.append({
'rank': name,
'eval': eval_name,
'year': year,
'value': metric,
})
resdf = pd.DataFrame(results)
resdf.set_index(['eval', 'year', 'rank'], inplace=True)
results2 = []
for name, ranker in rankers.iteritems():
    for year in range(2002, 2013):
        df = games[(games.SEAS == year)*(games.WEEK < 18)]
        ranking = ranker(df)
        for rank, (team, value) in enumerate(sorted(ranking.iteritems(), key=lambda x: x[1])):
            results2.append({
                'method': name,
                'team': team,
                'year': year,
                'rank': rank+1
            })
all_rankings = pd.DataFrame(results2)
all_rankings.to_csv('web/all.rankings.csv')
all_rankings.method.unique()
figsize(12, 8)
fig = resdf.ix['descriptive'].unstack()['value'].plot()
savefig('web/slides/img/desc.perf.png')
fig = resdf.ix['espn_similarity8'].unstack()['value'].plot()
savefig('web/slides/img/espn8.perf.png')
fig = resdf.ix['espn_similarity17'].unstack()['value'].plot()
savefig('web/slides/img/espn17.perf.png')
fig = resdf.ix['predict_in_season'].unstack()['value'].plot()
savefig('web/slides/img/season.perf.png')
fig = resdf.ix['predict_postseason'].unstack()['value'].plot()
savefig('web/slides/img/postseason.perf.png')
def emit_table(rank, fn):
ranked = [t for t, r in sorted(rank.iteritems(), key=lambda x: x[1])]
with open(fn, 'w') as f:
f.write('<table class="ranking-table" style="margin-left: auto; margin-right: auto;">')
for i in range(8):
f.write('<tr>')
for j in range(4):
ix = i + j*8
f.write('<td>%i</td><td><img src="../logos/%s.png" width=70 height=35 style="border:none;"/></td>' % (ix+1, ranked[ix]))
f.write('</tr>')
f.write('</table>')
emit_table(pythagorean_wins(g12), 'web/slides/pythagorean.html')
emit_table(eigen_rank_scores_nonlinear(g12), 'web/slides/eigenvector.html')
emit_table(logit_rank(g12), 'web/slides/btl.html')
emit_table(optimal_rank(g12), 'web/slides/optimal.html')
# Scratch check: the dominant eigenvector versus brute-force repeated multiplication.
A = np.matrix("[0 1 2 0; 1 0 1 1; 0 1 0 0; 2 1 2 0]")
n = A.shape[1]
w, v = np.linalg.eig(A)
print abs(np.real(v[:n,0]) / np.linalg.norm(v[:n,0],1))
mat = A.copy()
for i in range(100):
    mat = mat.dot(A)
# up to normalization, this should be proportional to the eigenvector above
print mat.dot(np.ones(4))