Download and save the data

library(baseballr)
# Look up player ids matching "Sale"; the id used below (519242) is the
# one that appears as Chris Sale's `pitcher` id in the scraped data.
playerid_lookup("Sale")

# Scrape the Statcast pitch data for that pitcher over the 2017 season
# window (2017-04-01 through 2017-11-01).
data <- scrape_statcast_savant(
  start_date = "2017-04-01",
  end_date = "2017-11-01",
  playerid = 519242,
  player_type = "pitcher"
)

# Inspect the structure and the first rows of what came back
str(data)
head(data)

# Save it to disk as a CSV file with write.csv()
# NOTE(review): write.csv() writes the row names as well, which is why an
# extra `X` column shows up when the file is read back in; pass
# row.names = FALSE here if that column is not wanted.
write.csv(
  data,
  "C:/Users/sclee1/OneDrive/Documents/R/sportsData/data/data.csv"
)

Import data and select two variables

# Read the saved CSV back in with read.csv() and inspect its structure
data <- read.csv(
  file = "C:/Users/sclee1/OneDrive/Documents/R/sportsData/data/data.csv"
)
str(data)
## 'data.frame':    3428 obs. of  89 variables:
##  $ X                              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ pitch_type                     : Factor w/ 4 levels "CH","FF","FT",..: 3 1 2 1 1 3 3 4 3 4 ...
##  $ game_date                      : Factor w/ 32 levels "2017-04-05","2017-04-10",..: 32 32 32 32 32 32 32 32 32 32 ...
##  $ release_speed                  : num  94.4 91 97.1 87 87.7 97.4 98.1 80.4 92.4 82.5 ...
##  $ release_pos_x                  : num  3.08 3.07 3.05 3.17 3.07 ...
##  $ release_pos_z                  : num  5.03 4.97 5.26 4.97 5.01 ...
##  $ player_name                    : Factor w/ 1 level "Chris Sale": 1 1 1 1 1 1 1 1 1 1 ...
##  $ batter                         : int  431145 431145 431145 431145 431145 607680 607680 434778 434778 430832 ...
##  $ pitcher                        : int  519242 519242 519242 519242 519242 519242 519242 519242 519242 519242 ...
##  $ events                         : Factor w/ 16 levels "caught_stealing_2b",..: 5 NA NA NA NA 13 NA 10 NA 2 ...
##  $ description                    : Factor w/ 13 levels "ball","blocked_ball",..: 8 1 1 12 1 9 4 10 1 9 ...
##  $ spin_dir                       : logi  NA NA NA NA NA NA ...
##  $ spin_rate_deprecated           : logi  NA NA NA NA NA NA ...
##  $ break_angle_deprecated         : logi  NA NA NA NA NA NA ...
##  $ break_length_deprecated        : logi  NA NA NA NA NA NA ...
##  $ zone                           : int  5 14 1 6 8 6 12 3 12 6 ...
##  $ des                            : Factor w/ 713 levels "Aaron Altherr flies out to center fielder Jackie Bradley.  ",..: 588 NA NA NA NA 420 NA 392 NA 345 ...
##  $ game_type                      : Factor w/ 1 level "R": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stand                          : Factor w/ 2 levels "L","R": 2 2 2 2 2 2 2 2 2 2 ...
##  $ p_throws                       : Factor w/ 1 level "L": 1 1 1 1 1 1 1 1 1 1 ...
##  $ home_team                      : Factor w/ 14 levels "BAL","BOS","CLE",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ away_team                      : Factor w/ 10 levels "BAL","BOS","CLE",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ type                           : Factor w/ 3 levels "B","S","X": 3 1 1 2 1 3 2 3 1 3 ...
##  $ hit_location                   : int  5 NA NA NA NA 1 NA NA NA NA ...
##  $ bb_type                        : Factor w/ 4 levels "fly_ball","ground_ball",..: 3 NA NA NA NA 2 NA 3 NA 3 ...
##  $ balls                          : int  3 2 1 1 0 0 0 1 0 1 ...
##  $ strikes                        : int  1 1 1 0 0 1 0 0 0 1 ...
##  $ game_year                      : int  2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
##  $ pfx_x                          : num  1.56 1.7 1.34 1.56 1.71 ...
##  $ pfx_z                          : num  0.5869 0.0515 1.0221 0.4351 0.4717 ...
##  $ plate_x                        : num  0.0603 1.0183 -0.8058 0.5792 -0.103 ...
##  $ plate_z                        : num  2.51 2.01 3.11 2.38 1.79 ...
##  $ on_3b                          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ on_2b                          : int  NA NA NA NA NA NA NA 430832 430832 NA ...
##  $ on_1b                          : int  607680 607680 607680 607680 607680 NA NA NA NA NA ...
##  $ outs_when_up                   : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ inning                         : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ inning_topbot                  : Factor w/ 2 levels "Bot","Top": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hc_x                           : num  92.8 NA NA NA NA ...
##  $ hc_y                           : num  161 NA NA NA NA ...
##  $ tfs_deprecated                 : logi  NA NA NA NA NA NA ...
##  $ tfs_zulu_deprecated            : logi  NA NA NA NA NA NA ...
##  $ pos2_person_id                 : int  506702 506702 506702 506702 506702 506702 506702 506702 506702 506702 ...
##  $ umpire                         : logi  NA NA NA NA NA NA ...
##  $ sv_id                          : Factor w/ 3414 levels "_","170405_231036",..: 3414 3413 3412 3411 3410 3409 3408 3407 3406 3405 ...
##  $ vx0                            : num  -11.2 -8.7 -13.2 -9.27 -10.99 ...
##  $ vy0                            : num  -137 -132 -140 -126 -127 ...
##  $ vz0                            : num  -2.125 -1.607 -2.472 -0.927 -2.576 ...
##  $ ax                             : num  21.9 21.6 20.5 18.3 20.5 ...
##  $ ay                             : num  28.6 28.2 29.8 23.2 24.6 ...
##  $ az                             : num  -24.6 -31.5 -18.4 -27.6 -26.8 ...
##  $ sz_top                         : num  3.2 3.36 3.39 3.2 3.36 ...
##  $ sz_bot                         : num  1.27 1.48 1.54 1.27 1.6 ...
##  $ hit_distance_sc                : int  121 NA NA NA NA 10 45 356 NA 243 ...
##  $ launch_speed                   : num  101 NA NA NA NA ...
##  $ launch_angle                   : num  6.39 NA NA NA NA ...
##  $ effective_speed                : num  93.9 90.5 96.4 86.6 86.9 ...
##  $ release_spin_rate              : int  2239 2178 2478 2045 2250 2449 2364 2439 2150 2513 ...
##  $ release_extension              : num  6.09 6.16 6.07 6.01 5.9 ...
##  $ game_pk                        : int  492457 492457 492457 492457 492457 492457 492457 492457 492457 492457 ...
##  $ pos1_person_id                 : int  519242 519242 519242 519242 519242 519242 519242 519242 519242 519242 ...
##  $ pos2_person_id.1               : int  506702 506702 506702 506702 506702 506702 506702 506702 506702 506702 ...
##  $ pos3_person_id                 : int  607752 607752 607752 607752 607752 607752 607752 607752 607752 607752 ...
##  $ pos4_person_id                 : int  571918 571918 571918 571918 571918 571918 571918 571918 571918 571918 ...
##  $ pos5_person_id                 : int  646240 646240 646240 646240 646240 646240 646240 646240 646240 646240 ...
##  $ pos6_person_id                 : int  593428 593428 593428 593428 593428 593428 593428 593428 593428 593428 ...
##  $ pos7_person_id                 : int  643217 643217 643217 643217 643217 643217 643217 643217 643217 643217 ...
##  $ pos8_person_id                 : int  598265 598265 598265 598265 598265 598265 598265 598265 598265 598265 ...
##  $ pos9_person_id                 : int  455759 455759 455759 455759 455759 455759 455759 455759 455759 455759 ...
##  $ release_pos_y                  : num  54.4 54.3 54.4 54.5 54.6 ...
##  $ estimated_ba_using_speedangle  : num  0.65 NA NA NA NA 0.256 NA 0.143 NA 0.922 ...
##  $ estimated_woba_using_speedangle: num  0.651 NA NA NA NA 0.211 NA 0.216 NA 0.897 ...
##  $ woba_value                     : num  0 NA NA NA NA 0.9 NA 2 NA 1.25 ...
##  $ woba_denom                     : int  1 NA NA NA NA 1 NA 1 NA 1 ...
##  $ babip_value                    : int  0 NA NA NA NA 1 NA 0 NA 1 ...
##  $ iso_value                      : int  0 NA NA NA NA 0 NA 3 NA 1 ...
##  $ launch_speed_angle             : int  4 NA NA NA NA 2 NA 3 NA 4 ...
##  $ at_bat_number                  : int  39 39 39 39 39 38 38 37 37 36 ...
##  $ pitch_number                   : int  5 4 3 2 1 2 1 2 1 3 ...
##  $ pitch_name                     : Factor w/ 5 levels "","2-Seam Fastball",..: 2 4 3 4 4 2 2 5 2 5 ...
##  $ home_score                     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ away_score                     : int  5 5 5 5 5 5 5 3 3 3 ...
##  $ bat_score                      : int  5 5 5 5 5 5 5 3 3 3 ...
##  $ fld_score                      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ post_away_score                : int  5 5 5 5 5 5 5 3 3 3 ...
##  $ post_home_score                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ post_bat_score                 : int  5 5 5 5 5 5 5 3 3 3 ...
##  $ post_fld_score                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ barrel                         : int  0 NA NA NA NA 0 0 0 NA 0 ...

# Keep only the two columns this analysis uses: the pitch type and the
# description of what happened on each pitch
library(dplyr)
data <- select(data, pitch_type, description)

# Preview the first rows of the trimmed data
head(data)
##   pitch_type          description
## 1         FT        hit_into_play
## 2         CH                 ball
## 3         FF                 ball
## 4         CH      swinging_strike
## 5         CH                 ball
## 6         FT hit_into_play_no_out

Q1 How many pitches did he throw during the season?

Q2 What’s his primary pitch, i.e. the one he threw most often?

Q3 How many pitches did he throw that induced a swing?

Q4 How many four-seam fastballs did he throw?

Q5 How many four-seam fastballs did he throw that missed the bat?

Q6 What’s his best strikeout pitch?

# Count the number of pitches thrown for each pitch_type
# (rows with a missing pitch_type are reported as their own <NA> group)
count(data, pitch_type)
## # A tibble: 5 x 2
##   pitch_type     n
##   <fctr>     <int>
## 1 CH           644
## 2 FF          1262
## 3 FT           390
## 4 SL          1127
## 5 <NA>           5

library(ggplot2)
# Swing-and-miss rate per pitch type, drawn as a horizontal bar chart
# ordered by rate.
data %>%
  # drop pitches whose type is missing
  filter(!is.na(pitch_type)) %>%
  # exclude no swings
  # NOTE(review): any other no-swing outcome present in `description`
  # would still be counted as a swing -- verify against its levels.
  filter(
    !description %in% c(
      "ball", "blocked_ball", "called_strike", "hit_by_pitch"
    )
  ) %>%
  group_by(pitch_type) %>%
  summarize(
    # swings that made no contact
    missPerType = sum(
      description %in% c(
        "swinging_strike", "missed_bunt", "swinging_strike_blocked"
      )
    ),
    # all swings of this pitch type
    N = n(),
    # share of swings that missed, in percent
    noContactRate = missPerType / N * 100
  ) %>%
  ungroup() %>%
  ggplot(
    aes(
      x = reorder(pitch_type, noContactRate),
      y = noContactRate,
      fill = pitch_type
    )
  ) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Swing and Miss Pitch",
    x = NULL,
    y = "Swing and Miss Rate"
  ) +
  # flip the axes so the pitch types run down the side of the chart
  coord_flip()


# Show step-by-step: the per-pitch-type summary table on its own,
# without the plotting layers
data %>%
  # keep swings only: drop missing pitch types and the no-swing outcomes
  filter(
    !is.na(pitch_type),
    !description %in% c(
      "ball", "blocked_ball", "called_strike", "hit_by_pitch"
    )
  ) %>%
  group_by(pitch_type) %>%
  summarize(
    # swings that made no contact
    missPerType = sum(
      description %in% c(
        "swinging_strike", "missed_bunt", "swinging_strike_blocked"
      )
    ),
    # all swings of this pitch type
    N = n(),
    # share of swings that missed, in percent
    noContactRate = missPerType / N * 100
  )
## # A tibble: 4 x 4
##   pitch_type missPerType     N noContactRate
##   <fctr>           <int> <int>         <dbl>
## 1 CH                 117   332          35.2
## 2 FF                 179   721          24.8
## 3 FT                  32   186          17.2
## 4 SL                 183   476          38.4