import pandas as pd
import numpy as np
import glob
import re
%matplotlib inline
import matplotlib.pyplot as plt
# Read every pipe-separated play-by-play file into its own DataFrame.
games = []
files = glob.glob('cbb-play-data/*.psv')
for f in files:
    df = pd.read_csv(f, sep='|')
    # The file path with '.psv' removed doubles as the game identifier.
    df['game_id'] = f.replace('.psv', '')
    games.append(df)
# Single-argument print(...) parses identically on Python 2 and Python 3.
print('Read {0} games'.format(len(games)))
Read 2530 games
# Stack the per-game frames into one table.
games_df = pd.concat(games)
# add event_id to maintain event order
# we can use the index since pandas defaults to the Nth row of the file
# (concat keeps each file's own index, so it restarts for every game)
games_df['event_id'] = games_df.index
# melt data into one column for home/away and another for event:
# every column not listed in id_vars is folded into a 'team' label plus
# an 'event' value
melted = pd.melt(games_df, id_vars=['event_id', 'game_id', 'time', 'score'],
                 var_name='team', value_name='event')
# maintain play order by sorting on event_id within each game
melted.sort_index(by=['game_id', 'event_id'], inplace=True)
# drop rows with NaN events - an event only belongs to one team
melted = melted[melted.event.notnull()]
# Single-argument print(...) parses identically on Python 2 and Python 3.
print(melted[10:15])
event_id game_id time score team \ 10 10 cbb-play-data/323140038 17:56 3-4 away 11 11 cbb-play-data/323140038 17:40 3-4 away 801719 12 cbb-play-data/323140038 17:40 3-4 home 801720 13 cbb-play-data/323140038 17:35 3-4 home 14 14 cbb-play-data/323140038 17:35 3-4 away event 10 Karl Cochran Defensive Rebound. 11 Karl Cochran missed Three Point Jumper. 801719 Askia Booker Defensive Rebound. 801720 Askia Booker missed Jumper. 14 Spencer Collins Defensive Rebound.
# label whether three pointers were made or missed
def get_shot_result(x):
    """Return the first 'made' or 'missed' found in the event text ``x``.

    Raises IndexError when neither word occurs in ``x``.
    """
    return re.findall('(made|missed)', x)[0]


shot3 = melted.event.str.contains('Three Point')
# Only the three-point rows are labelled; the assignment aligns on the
# index, so every other row gets no shot_result value.
melted['shot_result'] = melted.loc[shot3, 'event'].apply(get_shot_result)
def criteria(df):
    """Labels if the three pointer was preceded by an offensive rebound.

    Adds a boolean ``after_oreb`` column to ``df`` (written onto the frame
    that was passed in) and returns ``df``. A row is flagged when its own
    ``event`` text contains 'Three Point' and the ``event`` text of the row
    directly above it contains 'Offensive Rebound'. The first row has no
    row above it and is therefore never flagged.
    """
    is_three = df.event.str.contains('Three Point', na=False)
    # shift(1) leaves a missing value in the first row. na=False turns that
    # (and any other missing event text) into False right here, instead of
    # relying on an in-place fillna applied to `df.after_oreb`, which can
    # end up modifying a temporary copy rather than the column itself.
    prev_is_oreb = df.event.shift(1).str.contains('Offensive Rebound',
                                                  na=False)
    df['after_oreb'] = is_three & prev_is_oreb
    return df
# Run the labelling one game at a time, so the "row above" used by
# criteria() always comes from the same game.
melted = melted.groupby('game_id').apply(criteria)
# Preview the first three rows that carry a shot result.
melted[melted['shot_result'].notnull()].head(3)
event_id | game_id | time | score | team | event | shot_result | after_oreb | |
---|---|---|---|---|---|---|---|---|
801707 | 0 | cbb-play-data/323140038 | 19:28 | 0-0 | home | Askia Booker missed Three Point Jumper. | missed | False |
2 | 2 | cbb-play-data/323140038 | 19:12 | 0-0 | away | Karl Cochran missed Three Point Jumper. | missed | False |
5 | 5 | cbb-play-data/323140038 | 18:43 | 3-2 | away | Spencer Collins made Three Point Jumper. Assi... | made | False |
# Keep only the rows that were labelled with a shot result.
threes = melted[melted.shot_result.notnull()]
# Count rows per (shot_result, after_oreb) pair, then pivot shot_result
# into columns so each after_oreb value becomes one row.
attempts = threes.groupby(['shot_result', 'after_oreb']).size().unstack(0)
# Share of attempts that were made; the float cast avoids integer division
# on Python 2.
attempts['perc'] = attempts.made.astype(float) / (attempts.made + attempts.missed)
# Single-argument print(...) parses identically on Python 2 and Python 3.
print(attempts)
shot_result made missed perc after_oreb False 28688 55956 0.338925 True 2505 4692 0.348062
# Relabel the after_oreb index by value rather than by position: assigning
# a fixed two-item list raises when only one of False/True is present and
# would silently mislabel rows if their order ever differed.
attempts = attempts.rename(index={False: 'No', True: 'Yes'})
# Assigning a plain list used to drop the index name; keep that behavior.
attempts.index.name = None
plt.figure(figsize=[8, 6])
attempts.perc.plot(kind='bar')
plt.ylabel('3P%')
plt.xlabel('After Offensive Rebound?')
# Trailing semicolon suppresses the notebook's echo of the return value.
plt.grid(False);
# Split the 'MM:SS' clock text into integer minute and second columns.
melted['minutes'] = melted.time.apply(lambda x: int(x.split(':')[0]))
melted['seconds'] = melted.time.apply(lambda x: int(x.split(':')[1]))
duped_cols = ['game_id', 'event_id', 'time', 'event']
# Peek at rows that repeat an earlier row on those columns.
# `subset` is the supported keyword; `cols` is its deprecated alias.
melted[melted.duplicated(subset=duped_cols)][:3]
event_id | game_id | time | score | team | event | shot_result | after_oreb | minutes | seconds | |
---|---|---|---|---|---|---|---|---|---|---|
801733 | 26 | cbb-play-data/323140038 | 15:48 | Colorado Full Timeout. | home | Colorado Full Timeout. | NaN | False | 15 | 48 |
801758 | 51 | cbb-play-data/323140038 | 11:50 | Wofford Full Timeout. | home | Wofford Full Timeout. | NaN | False | 11 | 50 |
801764 | 57 | cbb-play-data/323140038 | 11:45 | Colorado Full Timeout. | home | Colorado Full Timeout. | NaN | False | 11 | 45 |
# Keep one row per (game, event position, event text).
# `subset` is the supported keyword; `cols` is its deprecated alias.
melted.drop_duplicates(subset=['game_id', 'event_id', 'event'], inplace=True)
# Flag the rows whose event text starts with 'End of' (period markers).
melted['period_end'] = melted.event.apply(lambda x: x.startswith('End of'))
melted[melted.period_end].head(3)
event_id | game_id | time | score | team | event | shot_result | after_oreb | minutes | seconds | period_end | |
---|---|---|---|---|---|---|---|---|---|---|---|
133 | 133 | cbb-play-data/323140038 | 0:00 | End of the 1st Half. | away | End of the 1st Half. | NaN | False | 0 | 0 | True |
305 | 305 | cbb-play-data/323140038 | 0:00 | End of the 2nd Half. | away | End of the 2nd Half. | NaN | False | 0 | 0 | True |
462 | 155 | cbb-play-data/323140041 | 0:00 | End of the 1st Half. | away | End of the 1st Half. | NaN | False | 0 | 0 | True |
def calculate_period(x):
    """Turn one game's period_end flags into 1-based period numbers.

    The flags are shifted down one row first, so an 'End of' row is still
    counted in the period it closes; the running total then counts how many
    period ends came before each row, and adding 1 makes the first period 1.
    """
    return x.shift(1).cumsum().fillna(0) + 1


melted['period'] = melted.groupby('game_id').period_end.apply(calculate_period)
# Preview the period assigned to the first few period-ending rows.
melted[melted['period_end']].head(3)
event_id | game_id | time | score | team | event | shot_result | after_oreb | minutes | seconds | period_end | period | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
133 | 133 | cbb-play-data/323140038 | 0:00 | End of the 1st Half. | away | End of the 1st Half. | NaN | False | 0 | 0 | True | 1 |
305 | 305 | cbb-play-data/323140038 | 0:00 | End of the 2nd Half. | away | End of the 2nd Half. | NaN | False | 0 | 0 | True | 2 |
462 | 155 | cbb-play-data/323140041 | 0:00 | End of the 1st Half. | away | End of the 1st Half. | NaN | False | 0 | 0 | True | 1 |
def calculate_period(x):
    """Turn one game's period_end flags into 1-based period numbers.

    Flags are shifted down one row so an 'End of' row stays in the period
    it closes; the running total of earlier period ends, plus 1, is the
    period number.
    """
    return x.shift(1).cumsum().fillna(0) + 1


melted['period'] = melted.groupby('game_id').period_end.apply(calculate_period)
# Index by game_id so the per-game result below lines up with every row of
# that game when it is assigned.
melted.set_index('game_id', inplace=True)


def gametime(x):
    """Game length in minutes for a game that had ``x`` periods."""
    # 40min regulation game + (# periods - 2 halves) * 5min OTs
    return 40 + (x - 2) * 5


melted['gametime'] = melted.groupby(level=0).period.max().apply(gametime)
# Move game_id back into an ordinary column.
melted.reset_index('game_id', inplace=True)
# Number of distinct games per computed game length.
melted.groupby('gametime').game_id.nunique()
gametime 35 44 40 2062 45 373 50 41 55 8 60 2 dtype: int64
# A computed length of 35 means only one period was counted for the game
# (40 + (1 - 2) * 5); presumably an 'End of' marker is missing from the
# data. Treat those games as 40-minute regulation games.
melted.loc[melted['gametime'] == 35, 'gametime'] = 40
def clock_to_secs_left(df, half_secs=1200):
    """Convert each row's period clock into a seconds-left figure.

    Writes a float ``secs_left`` column onto ``df`` in place and returns
    ``df``. The value depends on the row's ``period``:

    * period 1: clock seconds plus ``half_secs`` (the length of the half
      still to be played), i.e. seconds left in regulation;
    * any later period: the clock seconds only, i.e. seconds left in that
      period -- overtime rows are NOT offset to a whole-game figure;
    * anything else (missing or below 1): NaN.

    Parameters
    ----------
    df : DataFrame with ``minutes``, ``seconds`` and ``period`` columns.
    half_secs : length of one half in seconds; defaults to 1200 (20 min).
    """
    clock_secs = (df.minutes * 60) + df.seconds
    # Start from NaN so rows matching neither rule below stay missing.
    df['secs_left'] = np.nan
    df.loc[df.period == 1, 'secs_left'] = clock_secs + half_secs
    df.loc[df.period > 1, 'secs_left'] = clock_secs
    return df
# The helper writes secs_left onto melted in place, so its return value is
# not needed here.
clock_to_secs_left(melted)
# Single-argument print(...) parses identically on Python 2 and Python 3.
print(melted[['game_id', 'time', 'event', 'period', 'secs_left']][:5])
game_id time \ 0 cbb-play-data/323140038 19:28 1 cbb-play-data/323140038 19:28 2 cbb-play-data/323140038 19:12 3 cbb-play-data/323140038 19:12 4 cbb-play-data/323140038 19:05 event period secs_left 0 Askia Booker missed Three Point Jumper. 1 2368 1 Wofford Defensive Rebound. 1 2368 2 Karl Cochran missed Three Point Jumper. 1 2352 3 Spencer Dinwiddie Defensive Rebound. 1 2352 4 Askia Booker made Layup. Assisted by Spencer ... 1 2345
# Seconds that ran off the clock since the row directly above. The shift
# runs over the whole table, not per game.
melted['secs_elapsed'] = melted.secs_left.shift(1) - melted.secs_left
# Only gaps of 0 through 7 seconds (inclusive) are kept.
mask = (melted.secs_elapsed <= 7) & (melted.secs_elapsed >= 0)
threes_after_orebs = melted[mask & melted.after_oreb]
# Count shots per (result, gap), pivot the result into 'made' / 'missed'
# columns, and treat gap/result combinations that never occurred as zero.
grouped = (threes_after_orebs
           .groupby(['shot_result', 'secs_elapsed'])
           .size()
           .unstack(0)
           .fillna(0))
grouped['attempts'] = grouped['made'] + grouped['missed']
grouped['percentage'] = grouped['made'] / grouped['attempts'].astype(float)
# Baseline: made share across every labelled three-point attempt.
t = threes['shot_result'].value_counts()
t = float(t['made']) / (t['made'] + t['missed'])
plt.figure(figsize=(12.5, 7))
# Made share by seconds since the rebound, with the baseline as a dashed
# horizontal line across the same 0-7 second range.
plt.plot(grouped['percentage'], label='O-Reb 3P%', color='#377EB8')
plt.hlines(t, 0, 7, linestyles='--', label='"Normal" 3P%')
plt.xticks(np.arange(8))
plt.xlabel('Seconds Since Offensive Rebound')
plt.ylabel('3-Point Percentage', labelpad=15)
plt.grid(False)
# Trailing semicolon suppresses the notebook's echo of the return value.
plt.legend(loc='lower right');
def convert(x):
    """Return True when the shot_result label ``x`` is 'made', else False."""
    return x == 'made'


# "Normal" sample: labelled threes that did not directly follow an
# offensive rebound.
normal_criteria = (melted.after_oreb == False) & melted.shot_result.notnull()
normal = melted[normal_criteria].shot_result.apply(convert)
# "After" sample: labelled threes taken at most 7 seconds after an
# offensive rebound.
# NOTE(review): unlike `mask` above there is no `>= 0` lower bound here, so
# rows with a negative secs_elapsed are included -- confirm that is intended.
after_criteria = (melted.after_oreb & melted.shot_result.notnull() &
                  (melted.secs_elapsed <= 7))
after = melted[after_criteria].shot_result.apply(convert)
# Each print below takes a single pre-formatted string so the output is the
# same text under Python 2 and Python 3 (label, one space, value).
print("%s %s" % ("After O-Reb 3P%:", after.mean()))
print("%s %s" % ("Sample Size:", len(after)))
print("\n")
print("%s %s" % ("All other 3P%:", normal.mean()))
print("%s %s" % ("Sample Size:", len(normal)))
print("\n")
print("Absolute difference: %.4f" % (after.mean() - normal.mean()))
After O-Reb 3P%: 0.349338178043 Sample Size: 3853 All other 3P%: 0.338926769318 Sample Size: 84623 Absolute difference: 0.0104
# Hand-computed spread of the "normal" sample: mean squared deviation
# (dividing by the sample size), shown next to pandas' own std().
m = normal - normal.mean()
m2 = m**2
# One pre-formatted string keeps the output "value value" on both Python 2
# and Python 3.
print("%s %s" % (m2.sum() / len(m2), normal.std()))
0.224055414358 0.473347717939
import pymc as pm
with pm.Model() as model:
    # no chance 3P% is out of this range
    p_normal = pm.Uniform("p_normal", lower=0.3, upper=0.4)
    p_after = pm.Uniform("p_after", lower=0.3, upper=0.4)
    # Track the difference between the two rates alongside the rates.
    model.deterministics.append(pm.Deterministic("p_delta",p_after-p_normal))
    # scraped observations: each shot is one 0/1 outcome
    obs_normal = pm.Bernoulli("obs_normal", p_normal, observed=normal.astype(int))
    obs_after = pm.Bernoulli("obs_after", p_after, observed=after.astype(int))
    m = pm.HamiltonianMC()
    # Single-argument print(...) parses identically on Python 2 and Python 3.
    print("running...")
    trace = pm.psample(1000, m)
running...
# Presumably merges the sampler's separate chains into one trace --
# confirm against the pymc version in use.
atrace = trace.combined()
# Sampled values for each rate, plus their per-sample difference.
p_normal_samples = atrace['p_normal'][:]
p_after_samples = atrace['p_after'][:]
delta_samples = atrace['p_after'] - atrace['p_normal']
def _posterior_panel(position, x_range, ticks, samples, color, hist_label,
                     line_x, line_label):
    """Draw one histogram panel with a dashed vertical reference line.

    ``position`` is the subplot code, ``x_range`` the (low, high) x limits,
    ``ticks`` the x tick locations, ``samples`` the values to histogram and
    ``line_x`` the x position of the reference line. Returns the axes.
    """
    panel = plt.subplot(position)
    plt.xlim(x_range[0], x_range[1])
    plt.xticks(ticks)
    plt.ylim(0, 300)
    plt.hist(samples, histtype='stepfilled', bins=50, normed=True,
             color=color, label=hist_label)
    plt.vlines(line_x, 0, 300, linestyles='--', label=line_label)
    plt.legend()
    plt.grid(False)
    return panel


plt.figure(figsize=(12.5, 10))
# Top: sampled "normal" rate against the observed "normal" rate.
ax = _posterior_panel(311, (0.3, 0.4), np.arange(0.3, 0.401, 0.01),
                      p_normal_samples, '#E41A1C', '3P% "Normal"',
                      normal.mean(), 'True "Normal" 3P%')
# Middle: sampled after-rebound rate against the observed after-rebound rate.
ax = _posterior_panel(312, (0.3, 0.4), np.arange(0.3, 0.401, 0.01),
                      p_after_samples, '#4DAF4A', '3P% After Off. Reb.',
                      after.mean(), 'True 3P% After Off. Reb.')
# Bottom: sampled difference between the two rates, with zero marked.
ax = _posterior_panel(313, (-0.05, 0.05), np.arange(-0.05, 0.051, 0.01),
                      delta_samples, '#377EB8', 'Delta',
                      0, '$H_0$ (No difference)')
# Spread of the raw 0/1 outcomes next to the spread of the sampled rates.
# Joining the str() of each value reproduces the space-separated output of
# the Python 2 print statement while also being valid Python 3.
print(' '.join(map(str, [normal.std(), after.std(),
                         p_normal_samples.std(), p_after_samples.std()])))
0.473347717939 0.476822843387 0.0016462591967 0.00762804620874
# Share of samples in which the after-rebound rate exceeded the normal
# rate. `delta_samples` is the difference array built from the trace (the
# name `delta_trace` is never defined). The whole formatted string is
# passed to print(...) so .format applies to the text on Python 2 and 3.
print(("3P% after offensive rebounds was more successful "
       "in {0:.1f}% of simulations").format((delta_samples > 0).mean() * 100))
3P% after offensive rebounds was more successful in 91.1% of simulations