# Exploratory analysis of MLB pitch-tracking data, written as an
# IPython/Jupyter session.  Bare expressions such as ``baseball.head()`` are
# kept from the notebook, where they display their value at the end of a cell.

import numpy as np
import pandas as pd

# Plain-Python spelling of the ``%matplotlib inline`` magic; needs IPython.
get_ipython().run_line_magic('matplotlib', 'inline')

from ggplot import *

# ---------------------------------------------------------------------------
# Load and inspect the data: pitches tracked over a 2 month stretch in the
# 2013 MLB season.
# ---------------------------------------------------------------------------
baseball = pd.read_csv('./data/baseball-pitches-clean.csv')
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('{}  pitches were tracked.'.format(baseball.shape[0]))
baseball.head()
baseball.columns

# How many pitch types are there?
baseball.pitch_type.unique()
baseball.pitch_name.unique()

# How many pitchers are in the dataset?
len(baseball.pitcher_name.unique())

baseball.describe()[['start_speed', 'end_speed']]

# Who threw the slowest pitch?
slowest_pitch = baseball[
    baseball['start_speed'] == baseball['start_speed'].min()
]
slowest_pitch.pitcher_name

zach_wheeler = baseball[baseball['pitcher_name'] == 'Zack Wheeler']
less_than_70 = zach_wheeler[zach_wheeler['start_speed'] < 70]
print('Number of pitches under 70 mph = {}'.format(len(less_than_70)))
print('Mean of Zach Wheeler\'s pitch speeds {} MPH.'.format(
    round(zach_wheeler['start_speed'].mean(), 2)))
print('{} pitches are under 60 mph'.format(
    len(baseball[baseball['start_speed'] < 60])))

# R.A. Dickey is a knuckleballer, one of the only ones in the entire league.
dickey = baseball[baseball['pitcher_name'] == 'R.A. Dickey']
print('R. A. Dickey has  {} under 60 mph'.format(
    len(dickey[dickey['start_speed'] < 60])))

# ---------------------------------------------------------------------------
# Keep pitches of at least 60 mph and only the columns used below.
# ---------------------------------------------------------------------------
over_60 = baseball['start_speed'] >= 60
kept_columns = [
    'pitch_time', 'inning', 'pitcher_name', 'hitter_name', 'pitch_type',
    'px', 'pz', 'pitch_name', 'start_speed', 'end_speed', 'type_confidence',
]
# One .loc selection plus an explicit copy: the 'date' and 'pitch_count'
# columns added further down are then written to an independent frame rather
# than to a slice of the frame read from disk.
baseball = baseball.loc[over_60, kept_columns].copy()
baseball.head()

# Pitch location, colored by pitch name, then one facet per pitch name.
p = ggplot(aes(x='px', y='pz', color='pitch_name'), data=baseball) + geom_jitter()
p

p = (ggplot(aes(x='px', y='pz'), data=baseball)
     + geom_point(color='blue')
     + facet_wrap('pitch_name'))
p

baseball['pitch_name'].value_counts()

# Show in percentages
baseball['pitch_name'].value_counts() / len(baseball) * 100

from IPython.display import YouTubeVideo
YouTubeVideo('uW0V6OsxDBo', 600, 338)

# Distribution of release speed for each pitch name.
p = (ggplot(aes(x='start_speed'), data=baseball)
     + geom_histogram()
     + facet_wrap('pitch_name'))
p

# Let's see how many of these Dickey throws
knuckles = baseball[baseball['pitch_name'] == 'Knuckleball']
dickey = knuckles[knuckles['pitcher_name'] == 'R.A. Dickey']
# Multiply by 100.0 first so the ratio is computed in floating point: under
# Python 2, len(a) / len(b) is integer division and truncates to 0 unless
# every knuckleball belongs to Dickey.
print('Percentage of Knuckleballs belonging to Dickey {}'.format(
    100.0 * len(dickey) / len(knuckles)))

# ---------------------------------------------------------------------------
# Yu Darvish
# ---------------------------------------------------------------------------
# Let's get darvish data
darvish = baseball[baseball['pitcher_name'] == 'Yu Darvish']
darvish['pitch_name'].value_counts() / len(darvish) * 100

p = ggplot(aes(x='px', y='pz', color='pitch_name'), data=darvish) + geom_jitter(alpha=0.3)
p = p + ggtitle('Darvish Pitch Spread') + stat_smooth(method='lm')
p

p = ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=darvish)
p = p + stat_smooth(method='lm', size=5)
p

p = ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=darvish)
p = p + geom_jitter(alpha=0.3)
p

# ---------------------------------------------------------------------------
# Justin Verlander
# ---------------------------------------------------------------------------
baseball['pitcher_name'].value_counts()

verlander = baseball[baseball['pitcher_name'] == 'Justin Verlander']
verlander.head()
verlander['pitch_name'].value_counts() / len(verlander) * 100

p = ggplot(aes(x='px', y='pz', color='pitch_name'), data=verlander) + geom_jitter(alpha=0.3)
p = p + ggtitle('Verlander Pitch Spread') + stat_smooth(method='lm')
p

p = ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=verlander)
p = p + stat_smooth(method='lm', size=5)
p

p = ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=verlander)
p = p + geom_jitter(alpha=0.3)
p

# ---------------------------------------------------------------------------
# All pitchers: speed against inning, per pitch name and overall.
# ---------------------------------------------------------------------------
p = ggplot(aes(x='inning', y='start_speed', color='pitch_name'), data=baseball)
p = p + stat_smooth(method='lm', size=5) + ggtitle('Pitch Speed vs Innings')
p

p = ggplot(aes(x='inning', y='start_speed'), data=baseball)
p = p + stat_smooth(method='lm', size=5) + ggtitle('Pitch speed vs Innings')
p

# ---------------------------------------------------------------------------
# Running pitch count per pitcher per date.
# ---------------------------------------------------------------------------
# The first 10 characters of 'pitch_time' are used as the date key.
baseball['date'] = baseball['pitch_time'].str.slice(0, 10)
# Seed every row with 1, then take the cumulative sum within each
# (pitcher, date) group so each pitch gets its 1-based position in row order.
baseball['pitch_count'] = 1
baseball['pitch_count'] = (
    baseball.groupby(['pitcher_name', 'date'])['pitch_count'].cumsum()
)

p = ggplot(aes(x='pitch_count', y='start_speed', color='pitch_name'), data=baseball)
p = p + stat_smooth(method='lm', size=5) + ggtitle('Pitch Speed vs Pitch Count')
p

p = ggplot(aes(x='pitch_count', y='start_speed'), data=baseball)
p = p + stat_smooth(method='lm', size=5) + ggtitle('Pitch Speed vs Pitch Count')
p

# Re-select each pitcher so the subset carries the new 'pitch_count' column.
darvish = baseball[baseball['pitcher_name'] == 'Yu Darvish']
p = ggplot(aes(x='pitch_count', y='start_speed', color='pitch_name'), data=darvish)
p = p + stat_smooth(se=False, size=5) + geom_jitter(alpha=0.3)
p = p + ggtitle('Darvish: Pitch Speed vs Pitch Count')
p

verlander = baseball[baseball['pitcher_name'] == 'Justin Verlander']
p = ggplot(aes(x='pitch_count', y='start_speed', color='pitch_name'), data=verlander)
p = p + stat_smooth(se=False, size=5) + geom_jitter(alpha=0.3)
p = p + ggtitle('Verlander: Pitch Speed vs Pitch Count')
p

# Let's see if anyone else's fastball speed increases over time!
# Restrict the fastball data to the ten pitchers with the most fastballs,
# then plot speed against pitch count with one facet per pitcher.
fastballs = baseball[baseball['pitch_name'] == 'Fastball']

# value_counts() sorts by frequency, so the first ten index labels are the
# ten pitchers with the most fastballs.
top10 = set(fastballs.pitcher_name.value_counts().index[:10])
# Every pitcher with at least one fastball, captured before filtering.
pitchers = set(fastballs.pitcher_name)

# One boolean mask replaces dropping the rows pitcher by pitcher, which
# re-scanned and re-copied the frame once for every excluded pitcher.
fastballs = fastballs[fastballs.pitcher_name.isin(top10)]
set(fastballs.pitcher_name)

p = (ggplot(aes(x='pitch_count', y='start_speed'), data=fastballs)
     + stat_smooth(se=False, size=3))
p = p + facet_wrap('pitcher_name')
p

# Pitch mix for Bartolo Colon, as percentages of his tracked pitches.
colon = baseball[baseball.pitcher_name == 'Bartolo Colon']
colon.pitch_name.value_counts() / len(colon) * 100