import pandas as pd
import numpy as np
import glob
import re

%matplotlib inline
import matplotlib.pyplot as plt

# read PSVs into DataFrame
games = []
files = glob.glob('cbb-play-data/*.psv')
for f in files:
    df = pd.read_csv(f, sep='|')
    df['game_id'] = f.replace('.psv', '')
    games.append(df)

print 'Read {0} games'.format(len(games))

games_df = pd.concat(games)

# add event_id to maintain event order
# we can use the index since pandas defaults to the Nth row of the file
games_df['event_id'] = games_df.index

# melt data into one column for home/away and another for event
# maintain play order by sorting on event_id
melted = pd.melt(games_df, id_vars=['event_id', 'game_id', 'time', 'score'],
                     var_name='team', value_name='event')
melted.sort_index(by=['game_id', 'event_id'], inplace=True)

# drop rows with NaN events - an event only belongs to one team
melted = melted[melted.event.notnull()]
print melted[10:15]

def get_shot_result(x):
    """Return the first 'made' or 'missed' found in the event text ``x``."""
    outcomes = re.findall('(made|missed)', x)
    return outcomes[0]


# Tag three-point attempts with their outcome. Rows that are not three-point
# events receive no value from the assignment, so shot_result is null there
# (the later notnull() filters rely on this).
shot3 = melted.event.str.contains('Three Point')
melted['shot_result'] = melted.event[shot3].apply(get_shot_result)

def criteria(df):
    """Label three pointers that directly follow an offensive rebound.

    Adds a boolean ``after_oreb`` column to ``df`` in place. A row is True
    when its event text contains 'Three Point' and the event text on the row
    immediately above contains 'Offensive Rebound'. The first row has no
    predecessor and is therefore always False.

    Meant to be applied to one game at a time (e.g. through
    ``groupby('game_id').apply``) so that the "row above" lookup never pairs
    the last play of one game with the first play of the next.

    Args:
        df: DataFrame with an ``event`` text column, rows in play order.

    Returns:
        The same ``df`` object, with the ``after_oreb`` column added.
    """
    # na=False counts missing text (including the value shifted into the
    # first row) as "no match", so both masks are plain booleans.
    is_three = df.event.str.contains('Three Point', na=False)
    prev_is_oreb = df.event.shift(1).str.contains('Offensive Rebound',
                                                  na=False)
    # Assign the finished column in one step. The earlier
    # `df.after_oreb.fillna(False, inplace=True)` modified the result of an
    # attribute access, which pandas does not guarantee to write back to df.
    df['after_oreb'] = is_three & prev_is_oreb
    return df

melted = melted.groupby('game_id').apply(criteria)
melted[melted.shot_result.notnull()].head(3)

threes = melted[melted.shot_result.notnull()]
attempts = threes.groupby(['shot_result', 'after_oreb']).size().unstack(0)
attempts['perc'] = attempts.made.astype(float) / (attempts.made + attempts.missed)
print attempts

# Relabel the after_oreb rows for display.
# NOTE(review): assumes exactly two rows, ordered False then True — confirm
# both groups are always present in the data.
attempts.index = ['No', 'Yes']
plt.figure(figsize=[8, 6])
# one bar per group, showing that group's made / (made + missed) ratio
attempts.perc.plot(kind='bar')
plt.ylabel('3P%')
plt.xlabel('After Offensive Rebound?')
plt.grid(False);

# Split the colon-separated game clock once, then store its first two fields
# as integer minute and second columns.
clock_parts = melted.time.apply(lambda x: x.split(':'))
melted['minutes'] = clock_parts.apply(lambda parts: int(parts[0]))
melted['seconds'] = clock_parts.apply(lambda parts: int(parts[1]))

# Peek at rows that repeat an earlier (game, event_id, time, event) combination.
duped_cols = ['game_id', 'event_id', 'time', 'event']
is_repeat = melted.duplicated(cols=duped_cols)
melted[is_repeat][:3]

# Drop repeats, keeping the first occurrence. Note the key here omits 'time'.
melted = melted.drop_duplicates(cols=['game_id', 'event_id', 'event'])

# Flag the rows that close out a period (event text starts with 'End of').
melted['period_end'] = melted.event.apply(lambda x: x.startswith('End of'))
melted[melted.period_end].head(3)


def calculate_period(x):
    """Number the periods of one game from its period_end flags.

    A row's period is 1 plus the count of period-ending rows strictly before
    it: shift(1) keeps an 'End of' row inside the period it closes, and
    fillna(0) covers the first row, which has nothing before it.
    """
    return x.shift(1).cumsum().fillna(0) + 1


# Computed once per game. The original repeated this definition and
# assignment a second time with identical inputs, which changed nothing.
melted['period'] = melted.groupby('game_id').period_end.apply(calculate_period)
melted[melted.period_end].head(3)

# Index by game_id so that the per-game Series computed below is matched to
# rows by game_id label when it is assigned as a column.
melted.set_index('game_id', inplace=True)

# 40min regulation game + (# periods - 2 halves) * 5min OTs
gametime = lambda x: 40 + (x - 2) * 5
# highest period number reached in each game -> game length in minutes
melted['gametime'] = melted.groupby(level=0).period.max().apply(gametime)
# put game_id back as an ordinary column
melted.reset_index('game_id', inplace=True)
# number of distinct games at each computed length (inspection only)
melted.groupby('gametime').game_id.nunique()

# 35 is what the formula yields for a game with a single detected period;
# such games are treated as 40-minute regulation games.
# NOTE(review): presumably the first-half 'End of' row is missing from the
# source data for those games — confirm.
melted.loc[melted.gametime == 35, 'gametime'] = 40

def clock_to_secs_left(df):
    """Add a ``secs_left`` column built from the period and the game clock.

    Rows in period 1 get the clock reading (minutes * 60 + seconds) plus
    1200 seconds; rows in any later period get the clock reading alone. Rows
    whose period is neither 1 nor greater than 1 are left as NaN.

    The frame is modified in place and is also returned.
    """
    clock = (df.minutes * 60) + df.seconds
    in_first_period = df.period == 1
    in_later_period = df.period > 1

    # start from an all-NaN column, then fill the two disjoint groups
    df['secs_left'] = np.nan
    df.loc[in_later_period, 'secs_left'] = clock
    df.loc[in_first_period, 'secs_left'] = clock + 1200
    return df

clock_to_secs_left(melted)
print melted[['game_id', 'time', 'event', 'period', 'secs_left']][:5]

melted['secs_elapsed'] = melted.secs_left.shift(1) - melted.secs_left

mask = (melted.secs_elapsed >= 0) & (melted.secs_elapsed <= 7)
threes_after_orebs = melted[melted.after_oreb & mask]

grouped = threes_after_orebs.groupby(['shot_result', 'secs_elapsed']).size()
grouped = grouped.unstack(0).fillna(0)

grouped['attempts'] = grouped.made + grouped.missed
grouped['percentage'] = grouped.made / grouped.attempts.astype(float)

t = threes.shot_result.value_counts()
t = float(t['made']) / (t['made'] + t['missed'])

# 3P% by seconds since the offensive rebound, against the overall baseline.
plt.figure(figsize=(12.5, 7))

# grouped is indexed by secs_elapsed, so the x axis is seconds since the rebound
plt.plot(grouped.percentage, label='O-Reb 3P%', color='#377EB8')
# dashed horizontal line at t, the 3P% over all three-point attempts
plt.hlines(t, 0, 7, label='"Normal" 3P%', linestyles='--')
plt.xlabel('Seconds Since Offensive Rebound')
# one tick per whole second, 0 through 7
plt.xticks(np.arange(8))
plt.ylabel('3-Point Percentage', labelpad=15)
plt.grid(False)
plt.legend(loc='lower right');

convert = lambda x: True if x == 'made' else False

normal_criteria = (melted.after_oreb == False) & melted.shot_result.notnull()
normal = melted[normal_criteria].shot_result.apply(convert)

after_criteria = (melted.after_oreb) & melted.shot_result.notnull() & \
                    (melted.secs_elapsed <= 7)
after = melted[after_criteria].shot_result.apply(convert)

print "After O-Reb 3P%:", after.mean()
print "Sample Size:", len(after)
print "\n"
print "All other 3P%:", normal.mean()
print "Sample Size:", len(normal)
print "\n"
print "Absolute difference: %.4f" % (after.mean() - normal.mean())

# Hand-computed mean squared deviation (divides by N), printed next to
# pandas' own std() for comparison.
m = normal - normal.mean()
m2 = m * m
print (m2.sum() / len(m2)), normal.std()

import pymc as pm
# NOTE(review): pm.psample, trace.combined() and model.deterministics look
# like an early PyMC3 pre-release API — confirm the pinned version before
# upgrading the dependency.

with pm.Model() as model:
    # no chance 3P% is out of this range
    p_normal = pm.Uniform("p_normal", lower=0.3, upper=0.4)
    p_after = pm.Uniform("p_after", lower=0.3, upper=0.4)
    # p_delta is registered on the model here, but the delta used further
    # down is recomputed from the sampled p_after / p_normal values instead
    model.deterministics.append(pm.Deterministic("p_delta",p_after-p_normal))
    
    # scraped observations
    # the boolean made/missed Series are cast to 0/1 ints for the likelihoods
    obs_normal = pm.Bernoulli("obs_normal", p_normal, observed=normal.astype(int))
    obs_after = pm.Bernoulli("obs_after", p_after, observed=after.astype(int))  
    
    # note: this rebinds m, which earlier held the mean-centred `normal` values
    m = pm.HamiltonianMC()
    print "running..."
    trace = pm.psample(1000, m)

# presumably merges the per-chain traces into one — TODO confirm
atrace = trace.combined()
p_normal_samples = atrace['p_normal'][:]
p_after_samples=  atrace['p_after'][:]
# per-draw difference between the two sampled rates
delta_samples = atrace['p_after']-atrace['p_normal']

plt.figure(figsize=(12.5, 10))

# One entry per stacked panel: subplot slot, x-axis limits, stop value for
# the tick range, samples to histogram, fill colour, histogram label,
# x position of the dashed reference line, and that line's label.
panels = [
    (311, (0.3, 0.4), 0.401, p_normal_samples, '#E41A1C',
     '3P% "Normal"', normal.mean(), 'True "Normal" 3P%'),
    (312, (0.3, 0.4), 0.401, p_after_samples, '#4DAF4A',
     '3P% After Off. Reb.', after.mean(), 'True 3P% After Off. Reb.'),
    (313, (-0.05, 0.05), 0.051, delta_samples, '#377EB8',
     'Delta', 0, '$H_0$ (No difference)'),
]

for (slot, x_range, tick_stop, samples, colour,
     hist_label, ref_x, ref_label) in panels:
    ax = plt.subplot(slot)
    plt.xlim(x_range[0], x_range[1])
    plt.xticks(np.arange(x_range[0], tick_stop, 0.01))
    plt.ylim(0, 300)
    plt.hist(samples, histtype='stepfilled', bins=50, normed=True,
             color=colour, label=hist_label)
    # dashed vertical reference line spanning the full y range
    plt.vlines(ref_x, 0, 300, linestyles='--', label=ref_label)
    plt.legend()
    plt.grid(False)

print normal.std(), after.std(), p_normal_samples.std(), p_after_samples.std()

print ("3P% after offensive rebounds was more successful "
       "in {0:.1f}% of simulations").format((delta_trace > 0).mean() * 100)