#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('pylab', 'inline')


# In[2]:


import pandas as pd
import numpy as np
import statsmodels.api as sm

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

idx = pd.IndexSlice


# In[4]:


data_file = "../data/input-cleaned/combined.tsv"
df = pd.read_csv(data_file, sep="\t")

df['game_id'] = range(1, len(df) + 1)

for col in ['home_scorers', 'away_scorers']:
    df[col] = df[col].fillna('NULL')

print df.shape
print df.iloc[0]


# In[5]:


# add column denoting who won
def get_result(row):
    h, a = row.home_score, row.away_score
    return 'draw' if h == a else 'home' if h > a else 'away'

df['result'] = df.apply(get_result, axis=1)

# result counts
results = pd.DataFrame()
results['games'] = df.groupby('result').size().order(ascending=False)
results['freq'] = (results.games / sum(results.games)).astype(float)

print results


# In[7]:


# transpose df so that there is only 1 team column per row
dfs = {}
for side in ['home', 'away']:    
    df_side = df.copy()
    
    new_names = {}
    for col in ['team', 'score', 'scorers', 'g1']:
        new_names['%s_%s' % (side, col)] = col
    df_side.rename(columns=new_names, inplace=True)

    df_side['side'] = side
    df_side['result'] = df_side['result'].apply(lambda x: 
        'draw' if x == 'draw' else 'win' if x == side else 'loss')

    cols_to_keep = ['game_id', 'date', 'season', 'team', 'score', 'scorers', 'side', 'result']
    df_side = df_side[cols_to_keep]
    
    dfs[side] = df_side

dg = dfs['home'].append(dfs['away'])

# check that it worked
print len(dg)
print len(dg.game_id.unique())

dg[dg.game_id == 1]


# In[8]:


# goals scored in a game by team
print dg.groupby(['team', 'score']).size().unstack().fillna(0).head(10)


# In[9]:


# new df around indiv goals
data = []
for i, r in dg.iterrows():
    if r.scorers != 'NULL':
        goals = r.scorers.split(';')
        for goal in goals:
            goal_info = goal.split(',')
            other_info = [r.game_id, r.season, r.team, r.side, r.result, r.score]
            combined = other_info + goal_info
            data.append(combined)

cols = ['game_id', 'season', 'team', 'side', 'result', 'tot_goals', 'player', 'time', 'type']
goals = pd.DataFrame(data, columns=cols)

goals['time'] = goals['time'].astype(int)

print len(goals)
print goals.head()


# In[10]:


goals.time.describe()


# In[11]:


# goals scored by 15 min time frames
# 30-45 + 75-90 includes stoppage time before half and end of game
def time_norm(t, bin=5):
    t_start = max(min(t, 90) - 1, 0) / bin * bin
    return '%d-%d' % (t_start, t_start + bin)

goals['time_norm'] = goals['time'].apply(lambda x: time_norm(x, 15))

goals.groupby('time_norm').size().plot(kind='barh')


# In[29]:


# transpose goals df so that index is game and time
goals = goals.sort(['game_id', 'time'], ascending=[1, 1])

# 1/0 for home/away goals at each score times
score_by_time = goals.groupby(['game_id', 'time', 'side']).size().unstack().fillna(0)

# cumulative goals for home/away by game
score_by_time = score_by_time.groupby(level=[0]).cumsum()

print score_by_time.head(10)


# In[24]:


# check a specific game to make sure things look good
print score_by_time.loc[idx[[10],:], :]


# In[14]:


# densify so that all times are in index for each game
home = score_by_time.unstack()['home'].fillna(method='ffill', axis=1).fillna(0).stack()
away = score_by_time.unstack()['away'].fillna(method='ffill', axis=1).fillna(0).stack()

score_by_time_dense = pd.DataFrame()

score_by_time_dense['home'] = home
score_by_time_dense['away'] = away

# add goal diff column
score_by_time_dense['diff'] = score_by_time_dense['home'] - score_by_time_dense['away']

# check that it worked
print score_by_time_dense.loc[idx[[10], :], :].tail(10)


# In[15]:


# add final result to this df
score_by_time_dense2 = pd.merge(score_by_time_dense.reset_index(), df[['game_id', 'result']], on='game_id')

# tally results by time and goal diff
outcomes = score_by_time_dense2.groupby(['time', 'diff', 'result']).size()
outcomes = outcomes.unstack().unstack().fillna(0).stack().loc[idx[:, -3:3], :]

# turn counts in percentages
outcomes["tot_games"] = outcomes.sum(axis=1)
outcomes.loc[:, "away":"home"] = outcomes.loc[:, "away":"home"].div(outcomes["tot_games"], axis=0)

print outcomes.head(15)


# In[16]:


# pivot data so index is only time
outcomes2 = outcomes.stack().unstack(level=1).unstack(level=1)

outcomes2.head()


# In[17]:


outcomes2[2].ix[:, 'away':'home'].plot()


# In[18]:


# add smoothed series for each outcome (using local regression) 
outcomes3 = outcomes2.copy()

for col in outcomes3.columns:
    if col[1] in ['away', 'draw', 'home']:    
        s = outcomes3[col]
        x, y = s.index.values, s.values

        y_smooth = sm.nonparametric.lowess(y, x, frac=0.2, return_sorted=False)
        outcomes3[col[0], '%s_smooth' % col[1]] = y_smooth

outcomes3 = outcomes3.sort_index(axis=1)

print outcomes3[0].head()


# In[23]:


# check that things seem reasonable
outcomes3[0].ix[:, 'away':'home_smooth'].plot()


# In[33]:


# output final dataset
outcomes3.to_csv('../ipy-data/outcome-probs.csv', index=True, tupleize_cols=True)