# Three-point percentage right after an offensive rebound vs. every other
# three-point attempt.
#
# Notebook-style script: reads pipe-separated play-by-play files from
# ``cbb-play-data/``, labels three-point attempts as made/missed, flags the
# attempts whose previous event in the same game is an offensive rebound,
# plots the make rates, and fits a small PyMC model to both samples.
#
# The print calls are written in function form (with the __future__ import)
# so the syntax is valid on both Python 2 and Python 3.  Library calls
# (pandas ``cols=`` / ``sort_index(by=...)``, matplotlib ``normed=``,
# ``pm.psample``) are left exactly as the notebook wrote them.
from __future__ import print_function

import glob
import re

import numpy as np
import pandas as pd

# Plain-Python spelling of the ``%matplotlib inline`` notebook magic; skipped
# when the script is not running under IPython (``get_ipython`` is undefined).
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
import matplotlib.pyplot as plt

# read PSVs into DataFrame
games = []
files = glob.glob('cbb-play-data/*.psv')
for f in files:
    df = pd.read_csv(f, sep='|')
    df['game_id'] = f.replace('.psv', '')
    games.append(df)
print('Read {0} games'.format(len(games)))
games_df = pd.concat(games)

# add event_id to maintain event order
# we can use the index since pandas defaults to the Nth row of the file
games_df['event_id'] = games_df.index

# melt data into one column for home/away and another for event
# maintain play order by sorting on event_id
melted = pd.melt(games_df, id_vars=['event_id', 'game_id', 'time', 'score'],
                 var_name='team', value_name='event')
melted.sort_index(by=['game_id', 'event_id'], inplace=True)

# drop rows with NaN events - an event only belongs to one team
melted = melted[melted.event.notnull()]
print(melted[10:15])


# label whether three pointers were made or missed
def get_shot_result(x):
    """Return the first 'made' or 'missed' found in the event text *x*.

    Raises IndexError when the text contains neither word.
    """
    return re.findall(r'(made|missed)', x)[0]


shot3 = melted.event.str.contains('Three Point')
# Only three-point rows get a value; index alignment leaves the rest NaN.
melted['shot_result'] = melted[shot3].event.apply(get_shot_result)


def criteria(df):
    """Labels if the three pointer was preceded by an offensive rebound.

    Adds a boolean ``after_oreb`` column to *df* (one game's events, in play
    order) and returns *df*.  A row is True when its event is a three-point
    attempt and the previous row's event is an offensive rebound.
    """
    is_three = df.event.str.contains('Three Point')
    # shift(1) leaves the first row without a predecessor (NaN).
    follows_oreb = df.event.shift(1).str.contains('Offensive Rebound')
    # Assign the filled result directly: filling ``df.after_oreb`` in place
    # goes through an intermediate Series and may not write back to ``df``.
    df['after_oreb'] = (is_three & follows_oreb).fillna(False)
    return df


melted = melted.groupby('game_id').apply(criteria)
melted[melted.shot_result.notnull()].head(3)  # notebook display only

threes = melted[melted.shot_result.notnull()]
attempts = threes.groupby(['shot_result', 'after_oreb']).size().unstack(0)
attempts['perc'] = attempts.made.astype(float) / (attempts.made + attempts.missed)
print(attempts)

# Relabel the False/True ``after_oreb`` index for the bar chart.
attempts.index = ['No', 'Yes']
plt.figure(figsize=[8, 6])
attempts.perc.plot(kind='bar')
plt.ylabel('3P%')
plt.xlabel('After Offensive Rebound?')
plt.grid(False);

# Split the "MM:SS" game clock into integer parts.
melted['minutes'] = melted.time.apply(lambda x: int(x.split(':')[0]))
melted['seconds'] = melted.time.apply(lambda x: int(x.split(':')[1]))

duped_cols = ['game_id', 'event_id', 'time', 'event']
melted[melted.duplicated(cols=duped_cols)][:3]  # notebook display only
melted.drop_duplicates(cols=['game_id', 'event_id', 'event'], inplace=True)

melted['period_end'] = melted.event.apply(lambda x: x.startswith('End of'))
melted[melted.period_end].head(3)  # notebook display only


def calculate_period(x):
    """Number periods from 1 given a game's boolean ``period_end`` column.

    A row's period is one plus the count of 'End of' rows strictly before it,
    so the 'End of' row itself still belongs to the period it closes.
    """
    return x.shift(1).cumsum().fillna(0) + 1


# (The notebook computed this twice with identical code; once is enough.)
melted['period'] = melted.groupby('game_id').period_end.apply(calculate_period)
melted[melted.period_end].head(3)  # notebook display only

melted.set_index('game_id', inplace=True)


# 40min regulation game + (# periods - 2 halves) * 5min OTs
def gametime(x):
    """Return game length in minutes for a game with *x* periods."""
    return 40 + (x - 2) * 5


# The per-game result is indexed by game_id and aligns with the temporary
# game_id index set above.
melted['gametime'] = melted.groupby(level=0).period.max().apply(gametime)
melted.reset_index('game_id', inplace=True)
melted.groupby('gametime').game_id.nunique()  # notebook display only
# 35 is what the formula yields when only one period was detected; treat
# those games as regulation length.
melted.loc[melted.gametime == 35, 'gametime'] = 40


def clock_to_secs_left(df):
    """Calculates the total seconds left in the game.

    Adds a ``secs_left`` column to *df* in place and returns *df*.  In period
    1 the clock value is increased by 1200 seconds; in any later period the
    value is the period clock alone (minutes * 60 + seconds).
    """
    df['secs_left'] = np.nan
    clock_secs = (df.minutes * 60) + df.seconds
    df.loc[df.period == 1, 'secs_left'] = clock_secs + 1200
    df.loc[df.period > 1, 'secs_left'] = clock_secs
    return df


clock_to_secs_left(melted)
print(melted[['game_id', 'time', 'event', 'period', 'secs_left']][:5])

# Time between each event and the row before it.  The shift is not grouped by
# game, but a game's first row never has after_oreb set, so the rows selected
# below are always compared with an event from the same game.
melted['secs_elapsed'] = melted.secs_left.shift(1) - melted.secs_left
mask = (melted.secs_elapsed >= 0) & (melted.secs_elapsed <= 7)
threes_after_orebs = melted[melted.after_oreb & mask]
grouped = threes_after_orebs.groupby(['shot_result', 'secs_elapsed']).size()
grouped = grouped.unstack(0).fillna(0)
grouped['attempts'] = grouped.made + grouped.missed
grouped['percentage'] = grouped.made / grouped.attempts.astype(float)

# NOTE(review): ``threes`` was captured before drop_duplicates, so this
# baseline still counts duplicated rows, unlike ``normal`` below - confirm
# that is intended.
t = threes.shot_result.value_counts()
t = float(t['made']) / (t['made'] + t['missed'])
plt.figure(figsize=(12.5, 7))
plt.plot(grouped.percentage, label='O-Reb 3P%', color='#377EB8')
plt.hlines(t, 0, 7, label='"Normal" 3P%', linestyles='--')
plt.xlabel('Seconds Since Offensive Rebound')
plt.xticks(np.arange(8))
plt.ylabel('3-Point Percentage', labelpad=15)
plt.grid(False)
plt.legend(loc='lower right');


def convert(x):
    """Return True for a 'made' shot result, False otherwise."""
    return x == 'made'


normal_criteria = (melted.after_oreb == False) & melted.shot_result.notnull()
normal = melted[normal_criteria].shot_result.apply(convert)
after_criteria = ((melted.after_oreb) & melted.shot_result.notnull() &
                  (melted.secs_elapsed <= 7))
after = melted[after_criteria].shot_result.apply(convert)

print("After O-Reb 3P%:", after.mean())
print("Sample Size:", len(after))
print("\n")
print("All other 3P%:", normal.mean())
print("Sample Size:", len(normal))
print("\n")
print("Absolute difference: %.4f" % (after.mean() - normal.mean()))

# Mean squared deviation computed by hand, printed next to the library's std.
m = normal - normal.mean()
m2 = m**2
print(m2.sum() / len(m2), normal.std())

import pymc as pm

with pm.Model() as model:
    # no chance 3P% is out of this range
    p_normal = pm.Uniform("p_normal", lower=0.3, upper=0.4)
    p_after = pm.Uniform("p_after", lower=0.3, upper=0.4)
    model.deterministics.append(pm.Deterministic("p_delta", p_after - p_normal))
    # scraped observations
    obs_normal = pm.Bernoulli("obs_normal", p_normal, observed=normal.astype(int))
    obs_after = pm.Bernoulli("obs_after", p_after, observed=after.astype(int))
    m = pm.HamiltonianMC()
    print("running...")
    trace = pm.psample(1000, m)

atrace = trace.combined()
p_normal_samples = atrace['p_normal'][:]
p_after_samples = atrace['p_after'][:]
delta_samples = atrace['p_after'] - atrace['p_normal']

plt.figure(figsize=(12.5, 10))

ax = plt.subplot(311)
plt.xlim(0.3, 0.4)
plt.xticks(np.arange(0.3, 0.401, 0.01))
plt.ylim(0, 300)
plt.hist(p_normal_samples, histtype='stepfilled', bins=50, normed=True,
         color='#E41A1C', label='3P% "Normal"')
plt.vlines(normal.mean(), 0, 300, linestyles='--', label='True "Normal" 3P%')
plt.legend()
plt.grid(False)

ax = plt.subplot(312)
plt.xlim(0.3, 0.4)
plt.xticks(np.arange(0.3, 0.401, 0.01))
plt.ylim(0, 300)
plt.hist(p_after_samples, histtype='stepfilled', bins=50, normed=True,
         color='#4DAF4A', label='3P% After Off. Reb.')
plt.vlines(after.mean(), 0, 300, linestyles='--', label='True 3P% After Off. Reb.')
plt.legend()
plt.grid(False)

ax = plt.subplot(313)
plt.xlim(-0.05, 0.05)
plt.xticks(np.arange(-0.05, 0.051, 0.01))
plt.ylim(0, 300)
plt.hist(delta_samples, histtype='stepfilled', bins=50, normed=True,
         color='#377EB8', label='Delta')
plt.vlines(0, 0, 300, linestyles='--', label='$H_0$ (No difference)')
plt.legend()
plt.grid(False);

print(normal.std(), after.std(), p_normal_samples.std(), p_after_samples.std())
# The original read ``delta_trace`` here, a name that is never defined; the
# posterior difference is stored in ``delta_samples``.
print(("3P% after offensive rebounds was more successful "
       "in {0:.1f}% of simulations").format((delta_samples > 0).mean() * 100))