%pylab inline
Populating the interactive namespace from numpy and matplotlib
import pandas as pd
import numpy as np
import statsmodels.api as sm
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)
idx = pd.IndexSlice
data_file = "../data/input-cleaned/combined.tsv"
df = pd.read_csv(data_file, sep="\t")
df['game_id'] = range(1, len(df) + 1)
for col in ['home_scorers', 'away_scorers']:
df[col] = df[col].fillna('NULL')
print df.shape
print df.iloc[0]
(4940, 10) season 2001-2002 date 2002-06-11 home_team Blackburn Rovers home_score 3 home_scorers Andy Cole,53,g;Damien Duff,66,g;Andy Cole,81,g away_team Fulham away_score 0 away_scorers NULL venue Ewood Park game_id 1 Name: 0, dtype: object
# add column denoting who won
def get_result(row):
h, a = row.home_score, row.away_score
return 'draw' if h == a else 'home' if h > a else 'away'
df['result'] = df.apply(get_result, axis=1)
# result counts
results = pd.DataFrame()
results['games'] = df.groupby('result').size().order(ascending=False)
results['freq'] = (results.games / sum(results.games)).astype(float)
print results
games freq result home 2286 0.462753 away 1374 0.278138 draw 1280 0.259109
# transpose df so that there is only 1 team column per row
dfs = {}
for side in ['home', 'away']:
df_side = df.copy()
new_names = {}
for col in ['team', 'score', 'scorers', 'g1']:
new_names['%s_%s' % (side, col)] = col
df_side.rename(columns=new_names, inplace=True)
df_side['side'] = side
df_side['result'] = df_side['result'].apply(lambda x:
'draw' if x == 'draw' else 'win' if x == side else 'loss')
cols_to_keep = ['game_id', 'date', 'season', 'team', 'score', 'scorers', 'side', 'result']
df_side = df_side[cols_to_keep]
dfs[side] = df_side
dg = dfs['home'].append(dfs['away'])
# check that it worked
print len(dg)
print len(dg.game_id.unique())
dg[dg.game_id == 1]
9880 4940
game_id | date | season | team | score | scorers | side | result | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 2002-06-11 | 2001-2002 | Blackburn Rovers | 3 | Andy Cole,53,g;Damien Duff,66,g;Andy Cole,81,g | home | win |
0 | 1 | 2002-06-11 | 2001-2002 | Fulham | 0 | NULL | away | loss |
# goals scored in a game by team
print dg.groupby(['team', 'score']).size().unstack().fillna(0).head(10)
score 0 1 2 3 4 5 6 7 8 9 team Arsenal 63 150 128 86 43 13 7 4 0 0 Aston Villa 151 175 105 40 16 5 2 0 0 0 Birmingham 60 49 30 10 2 1 0 0 0 0 Birmingham City 32 52 23 5 2 0 0 0 0 0 Blackburn Rovers 126 143 94 39 13 2 0 1 0 0 Blackpool 8 11 15 2 2 0 0 0 0 0 Bolton Wanderers 132 147 92 27 17 3 0 0 0 0 Burnley 13 15 5 3 2 0 0 0 0 0 Cardiff City 19 10 5 4 0 0 0 0 0 0 Charlton Athletic 77 81 47 16 7 0 0 0 0 0
# new df around indiv goals
data = []
for i, r in dg.iterrows():
if r.scorers != 'NULL':
goals = r.scorers.split(';')
for goal in goals:
goal_info = goal.split(',')
other_info = [r.game_id, r.season, r.team, r.side, r.result, r.score]
combined = other_info + goal_info
data.append(combined)
cols = ['game_id', 'season', 'team', 'side', 'result', 'tot_goals', 'player', 'time', 'type']
goals = pd.DataFrame(data, columns=cols)
goals['time'] = goals['time'].astype(int)
print len(goals)
print goals.head()
13001 game_id season team side result tot_goals player time type 0 1 2001-2002 Blackburn Rovers home win 3 Andy Cole 53 g 1 1 2001-2002 Blackburn Rovers home win 3 Damien Duff 66 g 2 1 2001-2002 Blackburn Rovers home win 3 Andy Cole 81 g 3 2 2001-2002 Leicester City home win 2 Paul Dickov 60 g 4 2 2001-2002 Leicester City home win 2 Matthew Piper 71 g
goals.time.describe()
count 13001.000000 mean 50.486501 std 26.406922 min 0.000000 25% 29.000000 50% 51.000000 75% 74.000000 max 95.000000 Name: time, dtype: float64
# goals scored by 15 min time frames
# 30-45 + 75-90 includes stoppage time before half and end of game
def time_norm(t, bin=5):
t_start = max(min(t, 90) - 1, 0) / bin * bin
return '%d-%d' % (t_start, t_start + bin)
goals['time_norm'] = goals['time'].apply(lambda x: time_norm(x, 15))
goals.groupby('time_norm').size().plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x10fdc53d0>
# transpose goals df so that index is game and time
goals = goals.sort(['game_id', 'time'], ascending=[1, 1])
# 1/0 for home/away goals at each score times
score_by_time = goals.groupby(['game_id', 'time', 'side']).size().unstack().fillna(0)
# cumulative goals for home/away by game
score_by_time = score_by_time.groupby(level=[0]).cumsum()
print score_by_time.head(10)
side away home game_id time 1 53 0 1 66 0 2 81 0 3 2 54 1 0 60 1 1 71 1 2 3 13 0 1 35 0 2 46 0 3 57 0 4
# check a specific game to make sure things look good
print score_by_time.loc[idx[[10],:], :]
side away home game_id time 10 4 0 1 20 1 1 31 2 1 33 2 2 72 2 3 83 2 4 89 3 4
# densify so that all times are in index for each game
home = score_by_time.unstack()['home'].fillna(method='ffill', axis=1).fillna(0).stack()
away = score_by_time.unstack()['away'].fillna(method='ffill', axis=1).fillna(0).stack()
score_by_time_dense = pd.DataFrame()
score_by_time_dense['home'] = home
score_by_time_dense['away'] = away
# add goal diff column
score_by_time_dense['diff'] = score_by_time_dense['home'] - score_by_time_dense['away']
# check that it worked
print score_by_time_dense.loc[idx[[10], :], :].tail(10)
home away diff game_id time 10 85 4 2 2 86 4 2 2 87 4 2 2 88 4 2 2 89 4 3 1 90 4 3 1 91 4 3 1 93 4 3 1 94 4 3 1 95 4 3 1
# add final result to this df
score_by_time_dense2 = pd.merge(score_by_time_dense.reset_index(), df[['game_id', 'result']], on='game_id')
# tally results by time and goal diff
outcomes = score_by_time_dense2.groupby(['time', 'diff', 'result']).size()
outcomes = outcomes.unstack().unstack().fillna(0).stack().loc[idx[:, -3:3], :]
# turn counts in percentages
outcomes["tot_games"] = outcomes.sum(axis=1)
outcomes.loc[:, "away":"home"] = outcomes.loc[:, "away":"home"].div(outcomes["tot_games"], axis=0)
print outcomes.head(15)
result away draw home tot_games time diff 0 -3 NaN NaN NaN 0 -2 NaN NaN NaN 0 -1 0.000000 0.000000 1.000000 2 0 0.303125 0.192188 0.504687 4480 1 1.000000 0.000000 0.000000 1 2 NaN NaN NaN 0 3 NaN NaN NaN 0 1 -3 NaN NaN NaN 0 -2 NaN NaN NaN 0 -1 0.500000 0.250000 0.250000 28 0 0.303030 0.191542 0.505427 4422 1 0.151515 0.212121 0.636364 33 2 NaN NaN NaN 0 3 NaN NaN NaN 0 2 -3 NaN NaN NaN 0
# pivot data so index is only time
outcomes2 = outcomes.stack().unstack(level=1).unstack(level=1)
outcomes2.head()
diff | -3 | -2 | -1 | 0 | 1 | 2 | 3 | |||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
result | away | draw | home | tot_games | away | draw | home | tot_games | away | draw | home | tot_games | away | draw | home | tot_games | away | draw | home | tot_games | away | draw | home | tot_games | away | draw | home | tot_games |
time | ||||||||||||||||||||||||||||
0 | NaN | NaN | NaN | 0 | NaN | NaN | NaN | 0 | 0.000000 | 0.000000 | 1.000000 | 2 | 0.303125 | 0.192188 | 0.504687 | 4480 | 1.000000 | 0.000000 | 0.000000 | 1 | NaN | NaN | NaN | 0 | NaN | NaN | NaN | 0 |
1 | NaN | NaN | NaN | 0 | NaN | NaN | NaN | 0 | 0.500000 | 0.250000 | 0.250000 | 28 | 0.303030 | 0.191542 | 0.505427 | 4422 | 0.151515 | 0.212121 | 0.636364 | 33 | NaN | NaN | NaN | 0 | NaN | NaN | NaN | 0 |
2 | NaN | NaN | NaN | 0 | NaN | NaN | NaN | 0 | 0.538462 | 0.200000 | 0.261538 | 65 | 0.303023 | 0.192707 | 0.504270 | 4333 | 0.129412 | 0.152941 | 0.717647 | 85 | NaN | NaN | NaN | 0 | NaN | NaN | NaN | 0 |
3 | NaN | NaN | NaN | 0 | 0.5 | 0.5 | 0 | 2 | 0.486957 | 0.165217 | 0.347826 | 115 | 0.304595 | 0.193273 | 0.502132 | 4222 | 0.111888 | 0.174825 | 0.713287 | 143 | 0 | 0.000000 | 1.000000 | 1 | NaN | NaN | NaN | 0 |
4 | NaN | NaN | NaN | 0 | 0.5 | 0.5 | 0 | 2 | 0.469880 | 0.180723 | 0.349398 | 166 | 0.305151 | 0.194121 | 0.500729 | 4116 | 0.124352 | 0.150259 | 0.725389 | 193 | 0 | 0.333333 | 0.666667 | 6 | NaN | NaN | NaN | 0 |
outcomes2[2].ix[:, 'away':'home'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1111c8490>
# add smoothed series for each outcome (using local regression)
outcomes3 = outcomes2.copy()
for col in outcomes3.columns:
if col[1] in ['away', 'draw', 'home']:
s = outcomes3[col]
x, y = s.index.values, s.values
y_smooth = sm.nonparametric.lowess(y, x, frac=0.2, return_sorted=False)
outcomes3[col[0], '%s_smooth' % col[1]] = y_smooth
outcomes3 = outcomes3.sort_index(axis=1)
print outcomes3[0].head()
result away away_smooth draw draw_smooth home home_smooth tot_games time 0 0.303125 0.303486 0.192188 0.192624 0.504687 0.503849 4480 1 0.303030 0.303999 0.191542 0.192699 0.505427 0.503266 4422 2 0.303023 0.304492 0.192707 0.192767 0.504270 0.502711 4333 3 0.304595 0.304962 0.193273 0.192828 0.502132 0.502185 4222 4 0.305151 0.305411 0.194121 0.192886 0.500729 0.501684 4116
# check that things seem reasonable
outcomes3[0].ix[:, 'away':'home_smooth'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x117a95ed0>
# output final dataset
outcomes3.to_csv('../ipy-data/outcome-probs.csv', index=True, tupleize_cols=True)