%matplotlib inline %config InlineBackend.figure_format = 'retina' import os import math import warnings warnings.filterwarnings('ignore') from IPython.display import Image, HTML import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import pymc # I know folks are switching to "as pm" but I'm just not there yet DATA_DIR = os.path.join(os.getcwd(), 'data/') data_file = DATA_DIR + 'results_2014.csv' df = pd.read_csv(data_file, sep=',') df.head() teams = df.home_team.unique() teams = pd.DataFrame(teams, columns=['team']) teams['i'] = teams.index teams.head() teams df = pd.merge(df, teams, left_on='home_team', right_on='team', how='left') df = df.rename(columns = {'i': 'i_home'}).drop('team', 1) df = pd.merge(df, teams, left_on='away_team', right_on='team', how='left') df = df.rename(columns = {'i': 'i_away'}).drop('team', 1) df.head() observed_home_goals = df.home_score.values observed_away_goals = df.away_score.values home_team = df.i_home.values away_team = df.i_away.values num_teams = len(df.i_home.drop_duplicates()) num_games = len(home_team) num_teams df g = df.groupby('i_away') att_starting_points = np.log(g.away_score.mean()) g = df.groupby('i_home') def_starting_points = -np.log(g.away_score.mean()) #hyperpriors home = pymc.Normal('home', 0, .0001, value=0) tau_att = pymc.Gamma('tau_att', .1, .1, value=10) tau_def = pymc.Gamma('tau_def', .1, .1, value=10) intercept = pymc.Normal('intercept', 0, .0001, value=0) #team-specific parameters atts_star = pymc.Normal("atts_star", mu=0, tau=tau_att, size=num_teams, value=att_starting_points.values) defs_star = pymc.Normal("defs_star", mu=0, tau=tau_def, size=num_teams, value=def_starting_points.values) # trick to code the sum to zero contraint @pymc.deterministic def atts(atts_star=atts_star): atts = atts_star.copy() atts = atts - np.mean(atts_star) return atts @pymc.deterministic def defs(defs_star=defs_star): defs = defs_star.copy() defs = defs - np.mean(defs_star) return defs @pymc.deterministic def home_theta(home_team=home_team, away_team=away_team, home=home, atts=atts, defs=defs, intercept=intercept): return np.exp(intercept + home + atts[home_team] + defs[away_team]) @pymc.deterministic def away_theta(home_team=home_team, away_team=away_team, home=home, atts=atts, defs=defs, intercept=intercept): return np.exp(intercept + atts[away_team] + defs[home_team]) home_points = pymc.Poisson('home_points', mu=home_theta, value=observed_home_goals, observed=True) away_points = pymc.Poisson('away_points', mu=away_theta, value=observed_away_goals, observed=True) mcmc = pymc.MCMC([home, intercept, tau_att, tau_def, home_theta, away_theta, atts_star, defs_star, atts, defs, home_points, away_points]) map_ = pymc.MAP( mcmc ) map_.fit() mcmc.sample(200000, 40000, 20) pymc.Matplot.plot(home) pymc.Matplot.plot(intercept) pymc.Matplot.plot(tau_att) pymc.Matplot.plot(tau_def) observed_season = DATA_DIR + 'table_2014.csv' df_observed = pd.read_csv(observed_season) df_observed.loc[df_observed.QR.isnull(), 'QR'] = '' def simulate_season(): """ Simulate a season once, using one random draw from the mcmc chain. """ num_samples = atts.trace().shape[0] draw = np.random.randint(0, num_samples) atts_draw = pd.DataFrame({'att': atts.trace()[draw, :],}) defs_draw = pd.DataFrame({'def': defs.trace()[draw, :],}) home_draw = home.trace()[draw] intercept_draw = intercept.trace()[draw] season = df.copy() season = pd.merge(season, atts_draw, left_on='i_home', right_index=True) season = pd.merge(season, defs_draw, left_on='i_home', right_index=True) season = season.rename(columns = {'att': 'att_home', 'def': 'def_home'}) season = pd.merge(season, atts_draw, left_on='i_away', right_index=True) season = pd.merge(season, defs_draw, left_on='i_away', right_index=True) season = season.rename(columns = {'att': 'att_away', 'def': 'def_away'}) season['home'] = home_draw season['intercept'] = intercept_draw season['home_theta'] = season.apply(lambda x: math.exp(x['intercept'] + x['home'] + x['att_home'] + x['def_away']), axis=1) season['away_theta'] = season.apply(lambda x: math.exp(x['intercept'] + x['att_away'] + x['def_home']), axis=1) season['home_goals'] = season.apply(lambda x: np.random.poisson(x['home_theta']), axis=1) season['away_goals'] = season.apply(lambda x: np.random.poisson(x['away_theta']), axis=1) season['home_outcome'] = season.apply(lambda x: 'win' if x['home_goals'] > x['away_goals'] else 'loss' if x['home_goals'] < x['away_goals'] else 'draw', axis=1) season['away_outcome'] = season.apply(lambda x: 'win' if x['home_goals'] < x['away_goals'] else 'loss' if x['home_goals'] > x['away_goals'] else 'draw', axis=1) season = season.join(pd.get_dummies(season.home_outcome, prefix='home')) season = season.join(pd.get_dummies(season.away_outcome, prefix='away')) return season def create_season_table(season): """ Using a season dataframe output by simulate_season(), create a summary dataframe with wins, losses, goals for, etc. """ g = season.groupby('i_home') home = pd.DataFrame({'home_goals': g.home_goals.sum(), 'home_goals_against': g.away_goals.sum(), 'home_wins': g.home_win.sum(), 'home_losses': g.home_loss.sum() }) g = season.groupby('i_away') away = pd.DataFrame({'away_goals': g.away_goals.sum(), 'away_goals_against': g.home_goals.sum(), 'away_wins': g.away_win.sum(), 'away_losses': g.away_loss.sum() }) df = home.join(away) df['wins'] = df.home_wins + df.away_wins df['losses'] = df.home_losses + df.away_losses df['points'] = df.wins * 2 df['gf'] = df.home_goals + df.away_goals df['ga'] = df.home_goals_against + df.away_goals_against df['gd'] = df.gf - df.ga df = pd.merge(teams, df, left_on='i', right_index=True) df = df.sort_index(by='points', ascending=False) df = df.reset_index() df['position'] = df.index + 1 df['champion'] = (df.position == 1).astype(int) df['relegated'] = (df.position > 5).astype(int) return df def simulate_seasons(n=100): dfs = [] for i in range(n): s = simulate_season() t = create_season_table(s) t['iteration'] = i dfs.append(t) return pd.concat(dfs, ignore_index=True) simuls = simulate_seasons(10000) ax = simuls.points[simuls.team == 'Ireland'].hist(figsize=(7,5)) median = simuls.points[simuls.team == 'Ireland'].median() ax.set_title('Ireland: 2013-14 points, 1000 simulations') ax.plot([median, median], ax.get_ylim()) plt.annotate('Median: %s' % median, xy=(median + 1, ax.get_ylim()[1]-10)) ax = simuls.gf[simuls.team == 'Ireland'].hist(figsize=(7,5)) median = simuls.gf[simuls.team == 'Ireland'].median() ax.set_title('Ireland: 2013-14 points for, 1000 simulations') ax.plot([median, median], ax.get_ylim()) plt.annotate('Median: %s' % median, xy=(median + 1, ax.get_ylim()[1]-10)) g = simuls.groupby('team') df_champs = pd.DataFrame({'percent_champs': g.champion.mean()}) df_champs = df_champs.sort_index(by='percent_champs') df_champs = df_champs[df_champs.percent_champs > .05] df_champs = df_champs.reset_index() fig, ax = plt.subplots(figsize=(8,6)) ax.barh(df_champs.index.values, df_champs.percent_champs.values) for i, row in df_champs.iterrows(): label = "{0:.1f}%".format(100 * row['percent_champs']) ax.annotate(label, xy=(row['percent_champs'], i), xytext = (3, 10), textcoords = 'offset points') ax.set_ylabel('Team') ax.set_title('% of Simulated Seasons In Which Team Finished Winners') _= ax.set_yticks(df_champs.index + .5) _= ax.set_yticklabels(df_champs['team'].values)