#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('pylab', 'inline') # In[2]: import pandas as pd import numpy as np import statsmodels.api as sm pd.set_option('display.width', 500) pd.set_option('display.max_columns', 30) idx = pd.IndexSlice # In[4]: data_file = "../data/input-cleaned/combined.tsv" df = pd.read_csv(data_file, sep="\t") df['game_id'] = range(1, len(df) + 1) for col in ['home_scorers', 'away_scorers']: df[col] = df[col].fillna('NULL') print df.shape print df.iloc[0] # In[5]: # add column denoting who won def get_result(row): h, a = row.home_score, row.away_score return 'draw' if h == a else 'home' if h > a else 'away' df['result'] = df.apply(get_result, axis=1) # result counts results = pd.DataFrame() results['games'] = df.groupby('result').size().order(ascending=False) results['freq'] = (results.games / sum(results.games)).astype(float) print results # In[7]: # transpose df so that there is only 1 team column per row dfs = {} for side in ['home', 'away']: df_side = df.copy() new_names = {} for col in ['team', 'score', 'scorers', 'g1']: new_names['%s_%s' % (side, col)] = col df_side.rename(columns=new_names, inplace=True) df_side['side'] = side df_side['result'] = df_side['result'].apply(lambda x: 'draw' if x == 'draw' else 'win' if x == side else 'loss') cols_to_keep = ['game_id', 'date', 'season', 'team', 'score', 'scorers', 'side', 'result'] df_side = df_side[cols_to_keep] dfs[side] = df_side dg = dfs['home'].append(dfs['away']) # check that it worked print len(dg) print len(dg.game_id.unique()) dg[dg.game_id == 1] # In[8]: # goals scored in a game by team print dg.groupby(['team', 'score']).size().unstack().fillna(0).head(10) # In[9]: # new df around indiv goals data = [] for i, r in dg.iterrows(): if r.scorers != 'NULL': goals = r.scorers.split(';') for goal in goals: goal_info = goal.split(',') other_info = [r.game_id, r.season, r.team, r.side, r.result, r.score] combined = other_info + goal_info data.append(combined) cols = ['game_id', 'season', 'team', 'side', 'result', 'tot_goals', 'player', 'time', 'type'] goals = pd.DataFrame(data, columns=cols) goals['time'] = goals['time'].astype(int) print len(goals) print goals.head() # In[10]: goals.time.describe() # In[11]: # goals scored by 15 min time frames # 30-45 + 75-90 includes stoppage time before half and end of game def time_norm(t, bin=5): t_start = max(min(t, 90) - 1, 0) / bin * bin return '%d-%d' % (t_start, t_start + bin) goals['time_norm'] = goals['time'].apply(lambda x: time_norm(x, 15)) goals.groupby('time_norm').size().plot(kind='barh') # In[29]: # transpose goals df so that index is game and time goals = goals.sort(['game_id', 'time'], ascending=[1, 1]) # 1/0 for home/away goals at each score times score_by_time = goals.groupby(['game_id', 'time', 'side']).size().unstack().fillna(0) # cumulative goals for home/away by game score_by_time = score_by_time.groupby(level=[0]).cumsum() print score_by_time.head(10) # In[24]: # check a specific game to make sure things look good print score_by_time.loc[idx[[10],:], :] # In[14]: # densify so that all times are in index for each game home = score_by_time.unstack()['home'].fillna(method='ffill', axis=1).fillna(0).stack() away = score_by_time.unstack()['away'].fillna(method='ffill', axis=1).fillna(0).stack() score_by_time_dense = pd.DataFrame() score_by_time_dense['home'] = home score_by_time_dense['away'] = away # add goal diff column score_by_time_dense['diff'] = score_by_time_dense['home'] - score_by_time_dense['away'] # check that it worked print score_by_time_dense.loc[idx[[10], :], :].tail(10) # In[15]: # add final result to this df score_by_time_dense2 = pd.merge(score_by_time_dense.reset_index(), df[['game_id', 'result']], on='game_id') # tally results by time and goal diff outcomes = score_by_time_dense2.groupby(['time', 'diff', 'result']).size() outcomes = outcomes.unstack().unstack().fillna(0).stack().loc[idx[:, -3:3], :] # turn counts in percentages outcomes["tot_games"] = outcomes.sum(axis=1) outcomes.loc[:, "away":"home"] = outcomes.loc[:, "away":"home"].div(outcomes["tot_games"], axis=0) print outcomes.head(15) # In[16]: # pivot data so index is only time outcomes2 = outcomes.stack().unstack(level=1).unstack(level=1) outcomes2.head() # In[17]: outcomes2[2].ix[:, 'away':'home'].plot() # In[18]: # add smoothed series for each outcome (using local regression) outcomes3 = outcomes2.copy() for col in outcomes3.columns: if col[1] in ['away', 'draw', 'home']: s = outcomes3[col] x, y = s.index.values, s.values y_smooth = sm.nonparametric.lowess(y, x, frac=0.2, return_sorted=False) outcomes3[col[0], '%s_smooth' % col[1]] = y_smooth outcomes3 = outcomes3.sort_index(axis=1) print outcomes3[0].head() # In[23]: # check that things seem reasonable outcomes3[0].ix[:, 'away':'home_smooth'].plot() # In[33]: # output final dataset outcomes3.to_csv('../ipy-data/outcome-probs.csv', index=True, tupleize_cols=True)