Overview

For this project, I am given a dataset of batted ball data recorded on two different systems: System A and System B. Each batted ball also contains a pitcher ID, batter ID, and a hit type (e.g. ground_ball, line_drive) that was manually recorded by an observer. Some batted balls only contain data for System A and vice versa, some contain data for both, and some no data at all.

My goal is to first handle the missing data and then to project the average speed-off-bat for each batter in the next season, using the updated dataset.

Exploratory Data Analysis

First, I would like to perform an exploratory data analysis (EDA) to compare the two systems. I’ll first take a look at how many missing (NA) values the dataset contains.

df <- read.csv("C:/Users/phill/OneDrive/Desktop/Job_Search/battedBallData.csv")

# The number of missing values from system A
sum(is.na(df$speed_A))
## [1] 7572
sum(is.na(df$vangle_A))
## [1] 7572
# The number of missing values from system B
sum(is.na(df$speed_B))
## [1] 1402
sum(is.na(df$vangle_B))
## [1] 1402
df_na <- df[rowSums(is.na(df)) == 4, ]
nrow(df_na)
## [1] 542

There are a total of 7572 missing values from system A and 1402 missing values from system B. Among these, there are 542 batted balls that contain no data. We will omit these NA values from the data in our comparisons below.

Comparing System A and System B

df_clean <- na.omit(df)
hist(df_clean$speed_A, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram comparison of System A and B')
hist(df_clean$speed_B, col = rgb(1,0,0,0.2), add = TRUE)
legend('topright', c('System A', 'System B'),
       fill=c(rgb(0,0,1,0.2), rgb(1,0,0,0.2)))

summary(df_clean$speed_A)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.46   80.47   90.70   88.51   98.36  121.85
summary(df_clean$speed_B)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.967  69.092  83.709  79.860  93.100 114.403
sd(df_clean$speed_A)
## [1] 13.13249
library(Metrics)
rmse(df_clean$speed_A, df_clean$speed_B)
## [1] 13.10099
cor(df_clean[,4:7])
##             speed_A  vangle_A   speed_B   vangle_B
## speed_A  1.00000000 0.1147474 0.8167788 0.04563272
## vangle_A 0.11474739 1.0000000 0.4893087 0.96660210
## speed_B  0.81677884 0.4893087 1.0000000 0.39179743
## vangle_B 0.04563272 0.9666021 0.3917974 1.00000000

From the histogram and summary statistics we can see the data for both is right skewed, but the mean and quartiles of the data from system A are larger than system B. System B captures many values less than 25 mph, but the smallest system A captured was 26 mph. We’ll take a deeper dive into each batted ball type to try and decipher this.

Ground Balls

gb <- df[df$hittype == 'ground_ball',]
gb <- na.omit(gb)


hist(gb$speed_A, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram comparison of System A and B (Ground Balls)')
hist(gb$speed_B, col = rgb(1,0,0,0.2), add = TRUE)
legend('topright', c('System A', 'System B'),
       fill=c(rgb(0,0,1,0.2), rgb(1,0,0,0.2)))

plot(gb$speed_A, gb$speed_B)
abline(0,1, col = 'red')

summary(gb$speed_A)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.46   76.75   88.83   86.29   97.51  121.85
summary(gb$speed_B)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.967  57.399  70.846  71.389  88.367 114.403
sd(gb$speed_A)
## [1] 14.49373
library(Metrics)
rmse(gb$speed_A, gb$speed_B)
## [1] 19.31986
cor(gb[,4:7])
##            speed_A  vangle_A   speed_B  vangle_B
## speed_A  1.0000000 0.4207398 0.7730186 0.2751386
## vangle_A 0.4207398 1.0000000 0.7751005 0.7790003
## speed_B  0.7730186 0.7751005 1.0000000 0.5986332
## vangle_B 0.2751386 0.7790003 0.5986332 1.0000000

Fly Balls

fb <- df[df$hittype == 'fly_ball',]
fb <- na.omit(fb)


hist(fb$speed_A, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram comparison of System A and B (Fly Balls)')
hist(fb$speed_B, col = rgb(1,0,0,0.2), add = TRUE)
legend('topright', c('System A', 'System B'),
       fill=c(rgb(0,0,1,0.2), rgb(1,0,0,0.2)))

plot(fb$speed_A, fb$speed_B)
abline(0,1, col = 'red')

summary(fb$speed_A)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.79   84.02   90.85   90.07   96.87  115.16
summary(fb$speed_B)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   49.48   80.45   87.12   86.31   92.93  111.41
sd(fb$speed_A)
## [1] 9.322711
library(Metrics)
rmse(fb$speed_A, fb$speed_B)
## [1] 3.919709
cor(fb[,4:7])
##             speed_A   vangle_A    speed_B   vangle_B
## speed_A   1.0000000 -0.2698598  0.9927676 -0.2460814
## vangle_A -0.2698598  1.0000000 -0.2384965  0.9954537
## speed_B   0.9927676 -0.2384965  1.0000000 -0.2146139
## vangle_B -0.2460814  0.9954537 -0.2146139  1.0000000

Line Drives

ld <- df[df$hittype == 'line_drive',]
ld <- na.omit(ld)

hist(ld$speed_A, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram comparison of System A and B (Line Drives)')
hist(ld$speed_B, col = rgb(1,0,0,0.2), add = TRUE)
legend('topright', c('System A', 'System B'),
       fill=c(rgb(0,0,1,0.2), rgb(1,0,0,0.2)))

plot(ld$speed_A, ld$speed_B)
abline(0,1, col = 'red')

summary(ld$speed_A)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35.44   85.83   95.24   92.85  101.49  118.81
summary(ld$speed_B)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.96   82.27   91.04   88.79   96.99  114.40
sd(ld$speed_A)
## [1] 11.64369
library(Metrics)
rmse(ld$speed_A, ld$speed_B)
## [1] 4.258328
cor(ld[,4:7])
##             speed_A   vangle_A    speed_B   vangle_B
## speed_A   1.0000000 -0.3389399  0.9953076 -0.2959997
## vangle_A -0.3389399  1.0000000 -0.3379732  0.9925967
## speed_B   0.9953076 -0.3379732  1.0000000 -0.2912326
## vangle_B -0.2959997  0.9925967 -0.2912326  1.0000000

Popups

pu <- df[df$hittype == 'popup',]
pu <- na.omit(pu)

hist(pu$speed_A, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram comparison of System A and B (Popups)')
hist(pu$speed_B, col = rgb(1,0,0,0.2), add = TRUE)
legend('topright', c('System A', 'System B'),
       fill=c(rgb(0,0,1,0.2), rgb(1,0,0,0.2)))

plot(pu$speed_A, pu$speed_B)
abline(0,1, col = 'red')

summary(pu$speed_A)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35.06   65.87   74.24   73.70   82.19  105.91
summary(pu$speed_B)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.47   62.65   71.48   70.91   79.60  105.00
sd(pu$speed_A)
## [1] 11.29649
library(Metrics)
rmse(pu$speed_A, pu$speed_B)
## [1] 3.110551
cor(pu[,4:7])
##            speed_A  vangle_A   speed_B  vangle_B
## speed_A  1.0000000 0.5486644 0.9927913 0.5684015
## vangle_A 0.5486644 1.0000000 0.5532597 0.9954998
## speed_B  0.9927913 0.5532597 1.0000000 0.5719040
## vangle_B 0.5684015 0.9954998 0.5719040 1.0000000

We can see from the histograms that the data for fly balls and popups appear to be normally distributed with system A and system B having similar variances but system B’s mean is slightly lower than system A. For line drives and ground balls, the data is more right-skewed. From the system A vs system B scatterplot, the data for each appear to have a linear relationship but all the datapoints for B are slightly smaller than A. System B doesn’t appear to record batted ball data well for ground balls, as can be seen in the ground ball scatterplot. Using these inferences, I can impute missing data based on different circumstances.

Managing Imperfect Data Using Imputation

The data will be split into the following scenarios:

  1. Full data for system A and B
  2. Data for A, no data for B
  3. Data for B, no data for A (Excluding ground balls)
  4. Data for B, no data for A (Ground balls)
  5. No data for A or B

It is given in the prompt that system A is believed to be much more accurate than system B. For this reason, in scenarios 1 and 2 we will use system A data. If there is only data available from system B, for all hit types except ground balls we will create a regression model using system B features as predictors for our target variable in system A, and perform regression imputation (scenario 3). Because the data for ground balls from system B is less than reliable, in scenario 4 we will perform mean imputation by calculating the mean for each hit type for each batter from system A, and filling the NA values with this mean. For scenario 5 because we have no data to work off of, we will perform the same imputation as scenario 4.

library(dplyr)
library(tidyverse)

df2 <- df[is.na(df$speed_A) == FALSE,]
# For scenarios 1 and 2
df2['best_speed'] = df2['speed_A']
df2['best_angle'] = df2['vangle_A']

# For scenario 3
df_no_gb <- df_clean[df_clean$hittype != 'ground_ball',]
model_speed <- lm(speed_A ~ speed_B + vangle_B, data = df_no_gb)
model_angle <- lm(vangle_A ~ speed_B + vangle_B, data = df_no_gb)
summary(model_speed)
## 
## Call:
## lm(formula = speed_A ~ speed_B + vangle_B, data = df_no_gb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.883  -0.503   0.025   0.555  26.610 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.6172461  0.0552562   29.27   <2e-16 ***
## speed_B      1.0314941  0.0005756 1792.19   <2e-16 ***
## vangle_B    -0.0184915  0.0004232  -43.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.151 on 36678 degrees of freedom
## Multiple R-squared:  0.9903, Adjusted R-squared:  0.9903 
## F-statistic: 1.875e+06 on 2 and 36678 DF,  p-value: < 2.2e-16
summary(model_angle)
## 
## Call:
## lm(formula = vangle_A ~ speed_B + vangle_B, data = df_no_gb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -77.855  -0.217  -0.014   0.211  15.411 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.7426075  0.0390470   95.85   <2e-16 ***
## speed_B     -0.0310514  0.0004067  -76.35   <2e-16 ***
## vangle_B     0.9633392  0.0002991 3220.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8133 on 36678 degrees of freedom
## Multiple R-squared:  0.997,  Adjusted R-squared:  0.997 
## F-statistic: 6.055e+06 on 2 and 36678 DF,  p-value: < 2.2e-16
df3 <- df %>%
  filter(is.na(speed_A) == TRUE, is.na(speed_B) == FALSE, hittype != 'ground_ball')

df3 <- df3 %>% 
  mutate(best_speed = predict(model_speed, newdata = df3)) %>%
  mutate(best_angle = predict(model_angle, newdata = df3))

# For scenario 4
df4 <- df[df$hittype == 'ground_ball', ]

df4 <- df4 %>%
  group_by(batter) %>%
  mutate(best_speed = mean(speed_A, na.rm = TRUE)) %>%
  mutate(best_angle = mean(vangle_A, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(is.na(speed_A) == TRUE, is.na(speed_B) == FALSE)
  
# For scenario 5

df5 <- df %>%
  group_by(batter, hittype) %>%
  mutate(best_speed = mean(speed_A, na.rm = TRUE)) %>%
  mutate(best_angle = mean(vangle_A, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(is.na(speed_A) == TRUE, is.na(speed_B) == TRUE)

df_best <- rbind(df2, df3, df4, df5)
summary(df_best$best_speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   26.46   80.32   89.35   87.86   97.53  121.85      35
summary(df_best$best_angle)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -91.90   -7.56   10.12   11.58   28.33   89.14      35

I elected to use both speed_B and vangle_B as predictors for the regression model for speed and vangle because the RSE values were slighty better than only using the respective metric to the target from system B (1.15 < 1.18 for speed and 0.8133 < 0.8756 for angle). The summary statistics reveal there are still missing values for 35 batted balls. Because these are likely batted balls that were for players with only system B data or no data on a certain hit type, there’s no way to formulate a sound estimate for that batter, and so here I elect to remove these rows from data_best.

df_best <- df_best[is.na(df_best$best_speed) == FALSE,]

hist(df_best$best_speed, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram of Best Speed', breaks = 50)

summary(df_best$best_speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.46   80.32   89.35   87.86   97.53  121.85
sd(df_best$best_speed)
## [1] 12.94348

The following histograms visualize how the data is now visualized by hit type after data imputation

Ground Balls

gb_best <- df_best[df_best$hittype == 'ground_ball',]
hist(gb_best$best_speed, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram of Best Speed (Ground Balls)', breaks = 50)

summary(gb_best$best_speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.46   78.52   87.62   86.17   96.21  121.85
sd(gb_best$best_speed)
## [1] 13.62289

Fly Balls

fb_best <- df_best[df_best$hittype == 'fly_ball',]
hist(fb_best$best_speed, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram of Best Speed (Fly Balls)', breaks = 50)

summary(fb_best$best_speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.79   84.01   90.83   90.06   96.86  115.16
sd(fb_best$best_speed)
## [1] 9.316297

Line Drives

ld_best <- df_best[df_best$hittype == 'line_drive',]
hist(ld_best$best_speed, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram of Best Speed (Line Drives)', breaks = 50)

summary(ld_best$best_speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.08   85.70   95.22   92.80  101.50  118.81
sd(ld_best$best_speed)
## [1] 11.73245

Popups

pu_best <- df_best[df_best$hittype == 'popup',]
hist(pu_best$best_speed, col = rgb(0,0,1,0.2), xlab = 'Speed', 
     main = 'Histogram of Best Speed (Popups)', breaks = 50)

summary(pu_best$best_speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.59   67.20   75.44   74.52   82.49  105.91
sd(pu_best$best_speed)
## [1] 10.88436

Projecting Speed-Off-Bat for Next Season

Now that we have dealt with missing data, we can project next season’s speed-off-bat averages for each batter. I considered several different ways of doing this, including Hierarchical (Mixed) Modeling, Bayesian inference, or just simply using the average of each batter’s exit speed as the projection. My first approach was to use Bayesian Inference, but I changed my mind a decided to take a look at mixed modeling. Mixed modeling involves the combination of fixed effects and random effects to see how predictors influence the target within groups and between groups. In the code below, I have created a scatter plot (had to hide for pdf rendering purposes) with every batted ball colored by batter, as well as linear regression models for each batter. It’s a little busy, but the key aspect to look at is the thick blue line on the plot, which represents the average regression line for all batters. You can see there is a slightly positive relationship between exit speed and launch angle over all batters. Knowing this, I can consider using launch angle as a fixed effect for my mixed model.

library(ggplot2)

# Create a speed vs angle plot and plot regression lines for each batter,
# as well as an average regression line for all batters
spd_v_ang <- ggplot(df_best, aes(x = best_angle, y = best_speed, color = batter)) + 
  #geom_point(size = 1.2, alpha = 0.8) + # Add points
  scale_colour_gradientn(colours = rainbow(801)) +
  geom_smooth(method = lm, aes(group = batter), se=FALSE, size = 0.5, alpha = 0.8) +
  geom_smooth(method = lm, se = FALSE, size = 1, alpha = 0.8) + # Avg line
  theme_minimal() +
  labs(x="Launch Angle", y="Exit Speed", color="Batter")

print(spd_v_ang)

Now we can construct the mixed model. I will first attempt to use launch angle as only a fixed effect, with the random effect coming from the grouping of each batter.

# Load the lme4 package
library(lme4)
library(lmerTest)

model <- lmer(best_speed ~ best_angle + (1 | batter), data = df_best)

summary(model)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: best_speed ~ best_angle + (1 | batter)
##    Data: df_best
## 
## REML criterion at convergence: 581288.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.7294 -0.5841  0.1092  0.7583  2.5669 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  batter   (Intercept)   9.455   3.075  
##  Residual             159.620  12.634  
## Number of obs: 73340, groups:  batter, 801
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) 8.691e+01  1.396e-01 5.736e+02 622.431   <2e-16 ***
## best_angle  1.659e-02  1.840e-03 7.329e+04   9.013   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr)
## best_angle -0.142

The p-value for best_angle in the fixed effects is very small, indicating it is significant. I can try to add best_angle to the random slopes to see if there’s any significance.

library(lme4)
library(lmerTest)

model2 <- lmer(best_speed ~ best_angle + (best_angle | batter), data = df_best)

summary(model2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: best_speed ~ best_angle + (best_angle | batter)
##    Data: df_best
## 
## REML criterion at convergence: 580441.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.0085 -0.5791  0.0928  0.7577  2.6185 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr 
##  batter   (Intercept) 9.950e+00  3.1543       
##           best_angle  6.146e-03  0.0784  -0.23
##  Residual             1.562e+02 12.4983       
## Number of obs: 73340, groups:  batter, 801
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)  86.96981    0.14310 543.47176 607.737  < 2e-16 ***
## best_angle    0.01977    0.00396 476.09374   4.993 8.36e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr)
## best_angle -0.255
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.111865 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
ranova(model2)

From the summary statistics, the fixed regression slopes are all still significant. There are no significance tests for the random effects, so I used the ranova function in the lmerTest package to check whether dropping the random effect would make the model worse. When doing so I got a p-value of < 2.2e-16, indicating it is significant and should not be dropped from the model. We can now calculate average exit speeds for each batter using our model.

# Find the random effects for each batter
batter_effects <- ranef(model2)$batter

# Now, find the fixed effects
fixed_effect <- fixef(model2)

# The predicted average exit speed for each batter can be calculated by adding the fixed effect to their random effect.
average_launch_angle <- mean(df_best$best_angle)
predicted_exit_speeds <- fixed_effect[1] + average_launch_angle * fixed_effect[2] + 
  batter_effects[,1] + batter_effects[,2]
head(predicted_exit_speeds)
## [1] 89.46576 83.42032 89.56762 88.47500 89.07564 84.03128
df_batters <- df_best %>%
  group_by(batter) %>%
  summarise(avg_speed = mean(best_speed), sd_speed = sd(best_speed), bip = n())

df_batters$predicted_speed = predicted_exit_speeds

hist(df_batters$predicted_speed, col = rgb(0,0,1,0.2), xlab = 'Average Speed', 
     main = 'Histogram of Average Speed (All Batters)', breaks = 50)

summary(df_batters$predicted_speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   76.61   85.69   87.29   87.20   88.65   94.28

The dataframe df_batters contains a row for each of the 801 batters in the dataset. Each row has their average speed and standard deviation of the speed for their measured batted balls, as well as their predicted speed for next season using Hierarchical modeling. It also includes number of batted balls recorded. The model gives more bias towards the mean of the batter’s average exit speed the larger the sample size gets, which makes sense. The following code block limits the table to only batter and predicted speed-off-bat, and stores it in the dataframe “avg_speed_off_bat”.

avg_speed_off_bat <- cbind(batter = df_batters$batter, 
                           avg_speed = df_batters$predicted_speed)

head(avg_speed_off_bat)
##      batter avg_speed
## [1,]      1  89.46576
## [2,]      2  83.42032
## [3,]      3  89.56762
## [4,]      4  88.47500
## [5,]      5  89.07564
## [6,]      6  84.03128
#write.csv(avg_speed_off_bat, "C:/Users/phill/OneDrive/Desktop/Job_Search/avg_speed_off_bat.csv")