from ggplot import *
import pandas as pd
import numpy as np
%matplotlib inline
# Read the cleaned pitch data and keep only the columns used in the rest of
# this notebook, in the order listed here.
wanted_columns = [
    'pitch_time', 'inning', 'pitcher_name', 'hitter_name', 'pitch_type',
    'sz_top', 'sz_bottom',
    'px', 'pz', 'pitch_name', 'start_speed', 'end_speed', 'type_confidence',
]
df = pd.read_csv("./baseball-pitches-clean.csv")[wanted_columns]
df.head()
pitch_time | inning | pitcher_name | hitter_name | pitch_type | sz_top | sz_bottom | px | pz | pitch_name | start_speed | end_speed | type_confidence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2013-10-01 20:07:43 -0400 | 1 | Francisco Liriano | Shin-Soo Choo | B | 3.10 | 1.53 | 0.628 | 1.547 | Fastball | 93.2 | 85.3 | 0.894 |
1 | 2013-10-01 20:07:57 -0400 | 1 | Francisco Liriano | Shin-Soo Choo | S | 3.06 | 1.56 | 0.545 | 3.069 | Fastball | 93.4 | 85.6 | 0.895 |
2 | 2013-10-01 20:08:12 -0400 | 1 | Francisco Liriano | Shin-Soo Choo | S | 3.25 | 1.53 | 0.120 | 1.826 | Slider | 89.1 | 82.8 | 0.931 |
3 | 2013-10-01 20:08:31 -0400 | 1 | Francisco Liriano | Shin-Soo Choo | S | 3.25 | 1.53 | -0.229 | 1.667 | Slider | 90.0 | 83.3 | 0.926 |
4 | 2013-10-01 20:09:09 -0400 | 1 | Francisco Liriano | Ryan Ludwick | B | 3.62 | 1.78 | -1.917 | 0.438 | Slider | 87.7 | 81.6 | 0.915 |
5 rows × 13 columns
# Jittered scatter of px against pz for every row, coloured by pitch_type.
(
    ggplot(aes(x='px', y='pz', color='pitch_type'), data=df)
    + geom_jitter()
)
<ggplot: (277212125)>
# Same jittered scatter, additionally mapping pitch_name to the point shape.
(
    ggplot(
        aes(x='px', y='pz', color='pitch_type', shape='pitch_name'),
        data=df,
    )
    + geom_jitter()
)
<ggplot: (294730357)>
# Row counts per hitter_name; show the ten most frequent.
rows_per_hitter = df['hitter_name'].value_counts()
rows_per_hitter.head(10)
Mike Trout 610 Christian Yelich 587 Carlos Santana 581 Matt Carpenter 574 Joey Votto 572 Kyle Seager 560 Shin-Soo Choo 554 Brian Dozier 552 Jayson Werth 550 Evan Longoria 550 dtype: int64
# Restrict the data to rows whose hitter_name is Mike Trout.
is_mike_trout = df['hitter_name'] == 'Mike Trout'
hitter = df[is_mike_trout]
hitter.head()
pitch_time | inning | pitcher_name | hitter_name | pitch_type | sz_top | sz_bottom | px | pz | pitch_name | start_speed | end_speed | type_confidence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3415 | 2013-09-29 15:11:23 -0400 | 1 | Yu Darvish | Mike Trout | B | 3.48 | 1.57 | 1.120 | 1.995 | Fastball | 92.1 | 85.9 | 0.911 |
3416 | 2013-09-29 15:11:39 -0400 | 1 | Yu Darvish | Mike Trout | X | 3.48 | 1.59 | 0.156 | 2.002 | Fastball | 92.3 | 87.4 | 0.908 |
3487 | 2013-09-29 15:54:36 -0400 | 4 | Yu Darvish | Mike Trout | S | 3.57 | 1.61 | 0.204 | 1.687 | Fastball | 91.4 | 85.1 | 0.909 |
3488 | 2013-09-29 15:54:56 -0400 | 4 | Yu Darvish | Mike Trout | B | 3.57 | 1.74 | -2.115 | 2.987 | Fastball | 89.7 | 84.4 | 0.872 |
3489 | 2013-09-29 15:55:21 -0400 | 4 | Yu Darvish | Mike Trout | S | 3.64 | 1.81 | 0.033 | 2.472 | Fastball (sinker|split-fingered) | 84.5 | 80.6 | 0.936 |
5 rows × 13 columns
# Plain scatter of px against pz for the selected hitter, coloured by
# pitch_type.
(
    ggplot(aes(x='px', y='pz', color='pitch_type'), data=hitter)
    + geom_point()
)
<ggplot: (291762161)>
# Same scatter, with colours taken from scale_color_brewer's defaults.
(
    ggplot(aes(x='px', y='pz', color='pitch_type'), data=hitter)
    + geom_point()
    + scale_color_brewer()
)
<ggplot: (291746041)>
# Same scatter, asking scale_color_brewer for qualitative palette number 4.
(
    ggplot(aes(x='px', y='pz', color='pitch_type'), data=hitter)
    + geom_point()
    + scale_color_brewer(type='qual', palette=4)
)
<ggplot: (290664317)>
# Horizontal red guide lines at this hitter's mean sz_bottom and mean sz_top.
hitter_zone_heights = [hitter.sz_bottom.mean(), hitter.sz_top.mean()]
(
    ggplot(aes(x='px', y='pz', color='pitch_type'), data=hitter)
    + geom_point()
    + geom_hline(yintercept=hitter_zone_heights, color='red')
)
<ggplot: (281108505)>
# Add vertical red guide lines at px = -1 and px = 1 to the horizontal lines
# at the hitter's mean sz_bottom / sz_top, boxing in a region of the plot.
hitter_zone_heights = [hitter.sz_bottom.mean(), hitter.sz_top.mean()]
hitter_zone_sides = [-1, 1]
(
    ggplot(aes(x='px', y='pz', color='pitch_type'), data=hitter)
    + geom_point()
    + geom_hline(yintercept=hitter_zone_heights, color='red')
    + geom_vline(xintercept=hitter_zone_sides, color='red')
)
<ggplot: (284822549)>
# Same boxed scatter, with coord_equal() applied to the axes.
hitter_zone_heights = [hitter.sz_bottom.mean(), hitter.sz_top.mean()]
hitter_zone_sides = [-1, 1]
(
    ggplot(aes(x='px', y='pz', color='pitch_type'), data=hitter)
    + geom_point()
    + geom_hline(yintercept=hitter_zone_heights, color='red')
    + geom_vline(xintercept=hitter_zone_sides, color='red')
    + coord_equal()
)
<ggplot: (283370949)>
# Keep only the rows whose pitch_name is exactly 'Fastball'.
is_fastball = df['pitch_name'] == 'Fastball'
fastballs = df[is_fastball]
fastballs.head()
pitch_time | inning | pitcher_name | hitter_name | pitch_type | sz_top | sz_bottom | px | pz | pitch_name | start_speed | end_speed | type_confidence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2013-10-01 20:07:43 -0400 | 1 | Francisco Liriano | Shin-Soo Choo | B | 3.10 | 1.53 | 0.628 | 1.547 | Fastball | 93.2 | 85.3 | 0.894 |
1 | 2013-10-01 20:07:57 -0400 | 1 | Francisco Liriano | Shin-Soo Choo | S | 3.06 | 1.56 | 0.545 | 3.069 | Fastball | 93.4 | 85.6 | 0.895 |
8 | 2013-10-01 20:10:57 -0400 | 1 | Francisco Liriano | Joey Votto | X | 3.58 | 1.67 | 0.614 | 3.155 | Fastball | 95.1 | 87.3 | 0.920 |
10 | 2013-10-01 20:14:30 -0400 | 1 | Johnny Cueto | Starling Marte | S | 3.51 | 1.53 | 0.366 | 3.226 | Fastball | 93.0 | 85.5 | 0.948 |
11 | 2013-10-01 20:14:45 -0400 | 1 | Johnny Cueto | Starling Marte | X | 3.51 | 1.53 | 0.371 | 3.595 | Fastball | 93.7 | 86.2 | 0.947 |
5 rows × 13 columns
# start_speed against end_speed for the first 10,000 fastball rows; the colour
# is driven by the computed expression start_speed - end_speed.
fastball_sample = fastballs.head(10000)
speed_drop_aes = aes(
    x='start_speed',
    y='end_speed',
    color='start_speed - end_speed',
)
(
    ggplot(speed_drop_aes, data=fastball_sample)
    + geom_point()
    + scale_color_gradient()
)
<ggplot: (282992077)>
# Same speed plot with a blue-to-red gradient, a reference line from
# geom_abline, and coord_equal() applied to the axes.
# NOTE(review): the sample rows show end_speed below start_speed, so (assuming
# geom_abline's default slope of 1) a line with intercept=10 would sit above
# the points -- confirm whether intercept=-10 was intended.
fastball_sample = fastballs.head(10000)
speed_drop_aes = aes(
    x='start_speed',
    y='end_speed',
    color='start_speed - end_speed',
)
(
    ggplot(speed_drop_aes, data=fastball_sample)
    + geom_point()
    + scale_color_gradient(low="blue", high="red")
    + geom_abline(intercept=10)
    + coord_equal()
)
<ggplot: (291747837)>
How does C.J. Wilson use his pitches?
# Rows thrown by C.J. Wilson. Take an explicit copy: a derived column is
# assigned onto `pitcher` further down, and doing that on a boolean-mask slice
# of `df` raises pandas' SettingWithCopyWarning.
pitcher = df[df.pitcher_name == "C.J. Wilson"].copy()

# Pitch map: px against pz, coloured from yellow to red by start_speed.
(
    ggplot(aes(x='px', y='pz', color='start_speed'), data=pitcher)
    + geom_point()
    + scale_color_gradient(low='yellow', high='red')
    + coord_equal()
    + xlab("Horizontal Position at Home Plate")
    + ylab("Vertical Position at Home Plate")
    + ggtitle("C.J. Wilson Pitch Map")
)
<ggplot: (292500581)>
# Pitch map for the selected pitcher: white-to-black gradient on start_speed,
# with pitch_name mapped to the point shape.
(
    ggplot(
        aes(x='px', y='pz', color='start_speed', shape='pitch_name'),
        data=pitcher,
    )
    + geom_point()
    + scale_color_gradient(low='white', high='black')
    + coord_equal()
)
<ggplot: (287898201)>
def normalize_pitch(speed):
    """Min-max scale a group of pitch speeds onto the range [0, 1].

    Parameters
    ----------
    speed : pandas.Series
        Numeric speeds for one group (this notebook passes the
        ``start_speed`` values of each groupby group).

    Returns
    -------
    pandas.Series
        Values rescaled so the group's minimum maps to 0 and its maximum
        to 1, carrying the same index as ``speed``. If every value in the
        group is identical there is no spread to scale by, so all entries
        are returned as 0.0 rather than the NaN that 0/0 would produce.
    """
    slowest = speed.min()
    spread = speed.max() - slowest
    if spread == 0:
        # Single-pitch or constant group: avoid dividing zero by zero.
        return (speed - slowest).astype(float)
    return (speed - slowest) / spread
# Rescale start_speed separately within each (pitcher_name, pitch_name) group.
# `transform` returns a result indexed like `pitcher` itself, so it lines up
# row for row (`apply` may prepend the group keys to the index), and `assign`
# builds a new DataFrame instead of writing into a slice of `df`, which is
# what triggered SettingWithCopyWarning.
speed_by_pitch = pitcher.groupby(["pitcher_name", "pitch_name"]).start_speed
pitcher = pitcher.assign(
    pitch_speed_norm=speed_by_pitch.transform(normalize_pitch)
)
-c:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_index,col_indexer] = value instead
# One panel per pitch_name, coloured yellow-to-red by pitch_speed_norm, with
# blue guide lines at the pitcher's mean sz_bottom / sz_top and at px = -1, 1.
pitcher_zone_heights = [pitcher.sz_bottom.mean(), pitcher.sz_top.mean()]
pitcher_zone_sides = [-1, 1]
(
    ggplot(aes(x='px', y='pz', color='pitch_speed_norm'), data=pitcher)
    + geom_point()
    + geom_hline(yintercept=pitcher_zone_heights, color='blue')
    + geom_vline(xintercept=pitcher_zone_sides, color='blue')
    + scale_color_gradient(low='yellow', high='red')
    + facet_wrap("pitch_name", scales="fixed")
    + ggtitle("C.J. Wilson Normalized Pitch Speeds")
)
<ggplot: (289434297)>
# Same view restricted to rows whose pitch_name is "Slider". The guide lines
# still use the means over all of the pitcher's rows, not just the sliders.
sliders = pitcher[pitcher.pitch_name == "Slider"]
pitcher_zone_heights = [pitcher.sz_bottom.mean(), pitcher.sz_top.mean()]
pitcher_zone_sides = [-1, 1]
(
    ggplot(aes(x='px', y='pz', color='pitch_speed_norm'), data=sliders)
    + geom_point()
    + geom_hline(yintercept=pitcher_zone_heights, color='blue')
    + geom_vline(xintercept=pitcher_zone_sides, color='blue')
    + scale_color_gradient(low='yellow', high='red')
    + facet_wrap("pitch_name", scales="fixed")
)
<ggplot: (292455361)>
`ggplot` ships with a few themes that are easy to customize. My favorite is seaborn.
# Slider plot with theme_seaborn() applied. The guide lines use the means over
# all of the pitcher's rows, not just the sliders.
sliders = pitcher[pitcher.pitch_name == "Slider"]
pitcher_zone_heights = [pitcher.sz_bottom.mean(), pitcher.sz_top.mean()]
pitcher_zone_sides = [-1, 1]
(
    ggplot(aes(x='px', y='pz', color='pitch_speed_norm'), data=sliders)
    + geom_point()
    + geom_hline(yintercept=pitcher_zone_heights, color='blue')
    + geom_vline(xintercept=pitcher_zone_sides, color='blue')
    + scale_color_gradient(low='yellow', high='red')
    + facet_wrap("pitch_name", scales="fixed")
    + theme_seaborn()
)
<ggplot: (292445261)>
# Slider plot with theme_xkcd() applied. The guide lines use the means over
# all of the pitcher's rows, not just the sliders.
sliders = pitcher[pitcher.pitch_name == "Slider"]
pitcher_zone_heights = [pitcher.sz_bottom.mean(), pitcher.sz_top.mean()]
pitcher_zone_sides = [-1, 1]
(
    ggplot(aes(x='px', y='pz', color='pitch_speed_norm'), data=sliders)
    + geom_point()
    + geom_hline(yintercept=pitcher_zone_heights, color='blue')
    + geom_vline(xintercept=pitcher_zone_sides, color='blue')
    + scale_color_gradient(low='yellow', high='red')
    + facet_wrap("pitch_name", scales="fixed")
    + theme_xkcd()
)
<ggplot: (292401777)>
# Smoothed start_speed against inning over the full data set, drawn in
# steelblue with theme_xkcd() applied.
(
    ggplot(aes(x='inning', y='start_speed'), data=df)
    + stat_smooth(color='steelblue')
    + theme_xkcd()
)
<ggplot: (294730413)>