Chapter 1: Exploring Pitch

Import Greinke dataset

# Read the pitch-by-pitch data. stringsAsFactors = TRUE is passed explicitly
# because read.csv() stopped converting text columns to factors by default in
# R 4.0.0, while later steps in this file (the factor columns shown by str(),
# and droplevels() on pitch_type) expect factors.
greinke <- read.csv("greinke.csv", stringsAsFactors = TRUE)

1.1 Clean the Data

# Preview the data: show the first six rows
head(greinke, n = 6L)
##         p_name pitcher_id batter_stand pitch_type    pitch_result
## 1 Zack Greinke     425844            R         FF            Ball
## 2 Zack Greinke     425844            R         FF Swinging Strike
## 3 Zack Greinke     425844            R         FF   Called Strike
## 4 Zack Greinke     425844            R         SL Swinging Strike
## 5 Zack Greinke     425844            R         FF Swinging Strike
## 6 Zack Greinke     425844            R         SL Swinging Strike
##   atbat_result start_speed    z0     x0  pfx_x  pfx_z    px    pz
## 1         Walk        94.2 5.997 -0.675 -4.457  9.760 1.714 1.925
## 2       Single        92.4 6.281 -0.760 -1.590 11.400 0.589 3.271
## 3     Home Run        92.7 6.168 -0.958 -1.884  9.245 0.399 2.918
## 4    Strikeout        86.9 6.077 -0.939  3.594  0.762 0.764 1.306
## 5    Strikeout        92.8 6.107 -0.524 -0.558 11.134 1.517 2.193
## 6    Strikeout        87.8 6.321 -0.948  4.313  0.132 0.695 3.431
##   break_angle break_length spin_rate spin_dir balls strikes outs game_date
## 1        24.8          3.5  2188.802  204.457     2       2    2 10/3/2015
## 2        10.1          2.7  2312.202  187.913     1       1    0 10/3/2015
## 3         9.2          3.5  1889.841  191.468     0       0    1 10/3/2015
## 4       -11.4          8.0   693.649  102.648     1       2    0 10/3/2015
## 5        -0.4          2.8  2242.916  182.859     1       2    0 10/3/2015
## 6       -13.6          7.8   828.693   92.330     2       2    1 10/3/2015
##   inning inning_topbot batted_ball_type batted_ball_velocity   hc_x  hc_y
## 1      4           top                                    NA   0.00  0.00
## 2      3           top                                   104 123.56 97.26
## 3      5           top                                   103  50.88 31.17
## 4      6           top                                    NA   0.00  0.00
## 5      8           top                                    NA   0.00  0.00
## 6      1           top                                    NA   0.00  0.00
##   pitch_id distance_feet
## 1      160            NA
## 2       95             0
## 3      218           425
## 4      265            NA
## 5      374            NA
## 6       14            NA
# Count the rows in the raw data frame
dim(greinke)[1L]
## [1] 3239
# Quartiles, mean and NA count of the start_speed column
summary(greinke[["start_speed"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   52.20   87.30   89.80   88.44   91.80   95.40       3
# Keep only the rows that have a recorded start_speed
greinke <- greinke[!is.na(greinke$start_speed), ]

# Count the rows that remain
NROW(greinke)
## [1] 3236
# Show the type and a short preview of every column
str(object = greinke)
## 'data.frame':    3236 obs. of  29 variables:
##  $ p_name              : Factor w/ 1 level "Zack Greinke": 1 1 1 1 1 1 1 1 1 1 ...
##  $ pitcher_id          : int  425844 425844 425844 425844 425844 425844 425844 425844 425844 425844 ...
##  $ batter_stand        : Factor w/ 2 levels "L","R": 2 2 2 2 2 2 2 2 2 2 ...
##  $ pitch_type          : Factor w/ 8 levels "","CH","CU","EP",..: 5 5 5 8 5 8 2 5 8 8 ...
##  $ pitch_result        : Factor w/ 15 levels "Ball","Ball In Dirt",..: 1 14 3 14 14 14 15 3 4 14 ...
##  $ atbat_result        : Factor w/ 24 levels "Bunt Groundout",..: 24 20 12 21 21 21 21 21 10 24 ...
##  $ start_speed         : num  94.2 92.4 92.7 86.9 92.8 87.8 90.3 92.7 85.5 87.3 ...
##  $ z0                  : num  6 6.28 6.17 6.08 6.11 ...
##  $ x0                  : num  -0.675 -0.76 -0.958 -0.939 -0.524 ...
##  $ pfx_x               : num  -4.457 -1.59 -1.884 3.594 -0.558 ...
##  $ pfx_z               : num  9.76 11.4 9.245 0.762 11.134 ...
##  $ px                  : num  1.714 0.589 0.399 0.764 1.517 ...
##  $ pz                  : num  1.93 3.27 2.92 1.31 2.19 ...
##  $ break_angle         : num  24.8 10.1 9.2 -11.4 -0.4 -13.6 22.5 25.1 -8.4 -11.3 ...
##  $ break_length        : num  3.5 2.7 3.5 8 2.8 7.8 7.4 3.8 7.5 7.4 ...
##  $ spin_rate           : num  2189 2312 1890 694 2243 ...
##  $ spin_dir            : num  204 188 191 103 183 ...
##  $ balls               : int  2 1 0 1 1 2 1 0 0 0 ...
##  $ strikes             : int  2 1 0 2 2 2 2 2 0 1 ...
##  $ outs                : int  2 0 1 0 0 1 1 2 2 2 ...
##  $ game_date           : Factor w/ 32 levels "10/3/2015","4/12/2015",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ inning              : int  4 3 5 6 8 1 6 5 8 4 ...
##  $ inning_topbot       : Factor w/ 2 levels "bot","top": 2 2 2 2 2 2 2 2 2 2 ...
##  $ batted_ball_type    : Factor w/ 5 levels "","FB","GB","LD",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ batted_ball_velocity: int  NA 104 103 NA NA NA NA NA NA NA ...
##  $ hc_x                : num  0 123.6 50.9 0 0 ...
##  $ hc_y                : num  0 97.3 31.2 0 0 ...
##  $ pitch_id            : int  160 95 218 265 374 14 279 231 386 156 ...
##  $ distance_feet       : int  NA 0 425 NA NA NA NA NA NA NA ...

1.2 Check Dates

# Inspect how game_date is currently stored
class(greinke[["game_date"]])
## [1] "factor"
# Parse the month/day/year values into Date objects
greinke[["game_date"]] <- as.Date(
  greinke[["game_date"]],
  format = "%m/%d/%Y"
)

# Confirm that game_date is now stored as a Date
class(greinke[["game_date"]])
## [1] "Date"

1.3 Delimit Dates

library(tidyr)
# Split game_date on its dashes into "year", "month" and "day" columns,
# keeping the original game_date column alongside them
greinke <- separate(
  greinke, game_date,
  into = c("year", "month", "day"),
  sep = "-", remove = FALSE
)

# Store month as a number
greinke[["month"]] <- as.numeric(greinke[["month"]])

# Label each row "july" when month is 7 and "other" otherwise
greinke[["july"]] <- ifelse(greinke[["month"]] == 7, "july", "other")

# Preview the data with the new columns
head(greinke, n = 6L)
##         p_name pitcher_id batter_stand pitch_type    pitch_result
## 1 Zack Greinke     425844            R         FF            Ball
## 2 Zack Greinke     425844            R         FF Swinging Strike
## 3 Zack Greinke     425844            R         FF   Called Strike
## 4 Zack Greinke     425844            R         SL Swinging Strike
## 5 Zack Greinke     425844            R         FF Swinging Strike
## 6 Zack Greinke     425844            R         SL Swinging Strike
##   atbat_result start_speed    z0     x0  pfx_x  pfx_z    px    pz
## 1         Walk        94.2 5.997 -0.675 -4.457  9.760 1.714 1.925
## 2       Single        92.4 6.281 -0.760 -1.590 11.400 0.589 3.271
## 3     Home Run        92.7 6.168 -0.958 -1.884  9.245 0.399 2.918
## 4    Strikeout        86.9 6.077 -0.939  3.594  0.762 0.764 1.306
## 5    Strikeout        92.8 6.107 -0.524 -0.558 11.134 1.517 2.193
## 6    Strikeout        87.8 6.321 -0.948  4.313  0.132 0.695 3.431
##   break_angle break_length spin_rate spin_dir balls strikes outs
## 1        24.8          3.5  2188.802  204.457     2       2    2
## 2        10.1          2.7  2312.202  187.913     1       1    0
## 3         9.2          3.5  1889.841  191.468     0       0    1
## 4       -11.4          8.0   693.649  102.648     1       2    0
## 5        -0.4          2.8  2242.916  182.859     1       2    0
## 6       -13.6          7.8   828.693   92.330     2       2    1
##    game_date year month day inning inning_topbot batted_ball_type
## 1 2015-10-03 2015    10  03      4           top                 
## 2 2015-10-03 2015    10  03      3           top                 
## 3 2015-10-03 2015    10  03      5           top                 
## 4 2015-10-03 2015    10  03      6           top                 
## 5 2015-10-03 2015    10  03      8           top                 
## 6 2015-10-03 2015    10  03      1           top                 
##   batted_ball_velocity   hc_x  hc_y pitch_id distance_feet  july
## 1                   NA   0.00  0.00      160            NA other
## 2                  104 123.56 97.26       95             0 other
## 3                  103  50.88 31.17      218           425 other
## 4                   NA   0.00  0.00      265            NA other
## 5                   NA   0.00  0.00      374            NA other
## 6                   NA   0.00  0.00       14            NA other
# Count how many rows carry each july label
summary(as.factor(greinke$july))
##  july other 
##   524  2712

1.4 Velocity Distribution

# Histogram of start_speed across all of Greinke's pitches
hist(greinke$start_speed)

# Create greinke_july: rows labelled "july"
greinke_july <- subset(greinke, july == "july")

# Create greinke_other: rows labelled "other"
greinke_other <- subset(greinke, july == "other")

# Switch to a 1 x 2 panel layout. par() returns the previous settings, which
# are kept so the layout can be restored after both panels are drawn.
old_par <- par(mfrow = c(1, 2))

# Left panel: start_speed histogram for July
hist(greinke_july$start_speed)

# Right panel: start_speed histogram for the other months
hist(greinke_other$start_speed)

# Restore the previous layout so later plots are not drawn into half of the
# device
par(old_par)

1.5 Fastball Velocity Distribution

# Create july_ff: July rows whose pitch_type is "FF"
july_ff <- subset(greinke_july, pitch_type == "FF")

# Create other_ff: non-July rows whose pitch_type is "FF"
other_ff <- subset(greinke_other, pitch_type == "FF")

# Switch to a 1 x 2 panel layout. par() returns the previous settings, which
# are kept so the layout can be restored after both panels are drawn.
old_par <- par(mfrow = c(1, 2))

# Left panel: histogram of July fastball speeds
hist(july_ff$start_speed)

# Right panel: histogram of other-month fastball speeds
hist(other_ff$start_speed)

# Restore the previous layout; otherwise the next plot in this file would be
# drawn into one half of a two-panel page when run as a plain script
par(old_par)

1.6 Distribution Comparisons w/ Color

# Density histogram of fastball speeds for the other months (translucent blue)
hist(other_ff$start_speed,
     probability = TRUE, col = "#00009950",
     main = "Greinke 4-Seam Fastball Velocity",
     xlab = "Velocity (mph)", ylim = c(0, 0.35))

# Overlay the July density histogram (translucent red) on the same axes
hist(july_ff$start_speed,
     probability = TRUE, col = "#99000050", add = TRUE)

# Mark both group means with vertical lines in the matching colours:
# other months first, then July
abline(v = c(mean(other_ff$start_speed), mean(july_ff$start_speed)),
       col = c("#00009950", "#99000050"), lwd = 2)

1.7 Tapply Velocity Changes

# Mean start_speed for July versus the other months, over all pitch types
with(greinke, tapply(start_speed, july, mean))
##     july    other 
## 88.86489 88.35601
# Create greinke_ff: every row whose pitch_type is "FF"
# (which() drops rows where the comparison is NA, as subset() would)
greinke_ff <- greinke[which(greinke$pitch_type == "FF"), ]

# Mean fastball start_speed for July versus the other months
ff_velo_month <- with(greinke_ff, tapply(start_speed, july, mean))

# Show the two means
print(ff_velo_month)
##     july    other 
## 92.42077 91.66474

1.8 Game-by-Game Changes

# Create ff_dt: the mean start_speed of the "FF" pitches for each game_date.
# tapply() labels each mean with its game date; wrapping the result in
# data.frame() yields a single column whose name is generated from the text of
# the tapply() call, with the dates as row names (both are visible in the
# printed output below). The call text is therefore part of the output.
ff_dt <- data.frame(tapply(greinke_ff$start_speed, greinke_ff$game_date, mean))

# Print the first 6 rows of ff_dt
head(ff_dt)
##            tapply.greinke_ff.start_speed..greinke_ff.game_date..mean.
## 2015-04-07                                                   90.82632
## 2015-04-12                                                   90.51622
## 2015-04-18                                                   90.28654
## 2015-04-24                                                   90.51277
## 2015-04-29                                                   90.40732
## 2015-05-05                                                   90.33043

1.9 Tidying the data frame

# Turn the row names (dates as text) into a proper Date column
ff_dt[["game_date"]] <- as.Date(rownames(ff_dt), format = "%Y-%m-%d")

# Give the first column a readable name
names(ff_dt)[1L] <- "start_speed"

# Replace the date row names with default row numbering
rownames(ff_dt) <- NULL

# Preview the tidied data frame
head(ff_dt, n = 6L)
##   start_speed  game_date
## 1    90.82632 2015-04-07
## 2    90.51622 2015-04-12
## 3    90.28654 2015-04-18
## 4    90.51277 2015-04-24
## 5    90.40732 2015-04-29
## 6    90.33043 2015-05-05

1.10 Game by Game

# Line chart of the per-game mean fastball velocity over the season
plot(start_speed ~ game_date, data = ff_dt,
     type = "l", lwd = 4, ylim = c(88, 95),
     xlab = "Date", ylab = "Velocity (mph)",
     main = "Greinke 4-Seam Fastball Velocity")

1.11 Jittered Points

# Same per-game line chart as in the previous section
plot(start_speed ~ game_date, data = ff_dt,
     type = "l", lwd = 4, ylim = c(88, 95),
     xlab = "Date", ylab = "Velocity (mph)",
     main = "Greinke 4-Seam Fastball Velocity")

# Overlay every individual fastball as a translucent dot; the numeric date is
# jittered so pitches from the same game do not sit on one vertical line
points(x = jitter(as.numeric(greinke_ff$game_date)),
       y = greinke_ff$start_speed,
       pch = 16, col = "#99004450")

Chapter 2: Exploring Pitch Types

2.1 Pitch Mix Tables

# Remove the rows whose pitch type is "IN" or "EP"
greinke <- subset(greinke, pitch_type != "IN" & pitch_type != "EP")

# Discard pitch_type levels that no longer occur. factor() is used instead of
# droplevels() because droplevels() has no method for character vectors, which
# is what read.csv() produces for text columns by default since R 4.0.0;
# factor() accepts both factor and character input.
greinke$pitch_type <- factor(greinke$pitch_type)

# Create type_tab: pitch counts by pitch type (rows) and july label (columns)
type_tab <- table(greinke$pitch_type, greinke$july)

# Print type_tab
type_tab
##     
##      july other
##   CH  112   487
##   CU   51   242
##   FF  207  1191
##   FT   66   255
##   SL   86   535

2.2 Pitch Mix Table

# Convert the counts to within-column proportions (each of the july and other
# columns sums to 1), rounded to three decimals
type_prop <- round(prop.table(type_tab, 2), digits = 3)

# Show the proportion table
print(type_prop)
##     
##       july other
##   CH 0.215 0.180
##   CU 0.098 0.089
##   FF 0.397 0.439
##   FT 0.126 0.094
##   SL 0.165 0.197

2.3 Pitch Mix Table (PMT): July

# Create ff_prop: the fastball row of the proportion table. The row is
# selected by its "FF" label rather than by position, so the right row is
# used even if the set or order of pitch types changes.
ff_prop <- type_prop["FF", ]

# Print ff_prop
ff_prop
##  july other 
## 0.397 0.439
# Show the mean fastball velocities again, for comparison with the usage
# proportions
print(ff_velo_month)
##     july    other 
## 92.42077 91.66474

2.4 Describe Fastball Usage

# Reshape the proportion table into a data frame with one row per pitch type
type_prop <- as.data.frame.matrix(type_prop)

# Carry the pitch-type labels as an explicit first column (the row names are
# kept as well)
Pitch <- row.names(type_prop)
type_prop <- cbind(Pitch, type_prop)

# Capitalised column names
names(type_prop) <- c("Pitch", "July", "Other")

2.5 Pitch Mix Table (PMT) Changes

# Relative change in usage: (July - Other) / Other for each pitch type
type_prop[["Difference"]] <- with(type_prop, (July - Other) / Other)

# Show the table with the new column
print(type_prop)
##    Pitch  July Other  Difference
## CH    CH 0.215 0.180  0.19444444
## CU    CU 0.098 0.089  0.10112360
## FF    FF 0.397 0.439 -0.09567198
## FT    FT 0.126 0.094  0.34042553
## SL    SL 0.165 0.197 -0.16243655
# Bar chart of the relative change in usage for each pitch type.
# The y-axis limit is derived from the data: the largest absolute finite
# change, rounded up to the next 0.1 and never below the original 0.3. A
# fixed c(-0.3, 0.3) is too small for FT (0.34), whose bar would run past
# the axis.
finite_diff <- type_prop$Difference[is.finite(type_prop$Difference)]
y_lim <- ceiling(max(0.3, abs(finite_diff)) * 10) / 10
barplot(type_prop$Difference, names.arg = type_prop$Pitch,
        main = "Pitch Usage in July vs. Other Months",
        ylab = "Percentage Change in July",
        ylim = c(-y_lim, y_lim))