# Load needed libraries
library(tidyverse)
library(readr)
library(knitr)
library(sqldf)

Do other factors affect Home Run Distance besides exit velocity?

In my last discussion I am created a simple model to show if there is a direct correlation with the exit velocity of a ball hit out of the park and the distance that it travels. My conclusion was that there is. In this discussion I will add the other measures to determine if those also affect the distance of a homerun. I will limit the data to home runs hit out of Coors Field in Denver and Yankee Stadium for better clarity.
filename <- tempfile()
download.file("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA605%20Fall%202022/Week%2011/HR%20Tracker.csv",filename)
hr_stad_CFYS_2017 <- read.csv.sql(filename, "select TRUE_DISTANCE, EXIT_VELOCITY, ELEVATION_ANGLE, HORIZONTAL_ANGLE, APEX, BALLPARK  from file where GAME_DATE > '2017-04-01' and EXIT_VELOCITY > 70 and BALLPARK IN ('Coors Field','Yankee Stadium')", sep=",")

The stadium will be my dichotimus variable.

coors_bp <- ifelse(hr_stad_CFYS_2017$BALLPARK == 'Coors Field', 1, 0)

hr_stad_CFYS_2017_out <- data.frame(TRUE_DISTANCE = hr_stad_CFYS_2017$TRUE_DISTANCE, EXIT_VELOCITY = hr_stad_CFYS_2017$EXIT_VELOCITY, ELEVATION_ANGLE = hr_stad_CFYS_2017$ELEVATION_ANGLE , HORIZONTAL_ANGLE = hr_stad_CFYS_2017$HORIZONTAL_ANGLE, APEX = hr_stad_CFYS_2017$APEX)
head(hr_stad_CFYS_2017_out,10)
##    TRUE_DISTANCE EXIT_VELOCITY ELEVATION_ANGLE HORIZONTAL_ANGLE APEX
## 1            387         106.6            25.9            121.8   69
## 2            397          98.8            29.5             78.4   91
## 3            420         106.1            27.5             71.3   86
## 4            487         118.9            25.0            103.2  104
## 5            408         104.6            23.8             80.0   69
## 6            424         102.3            33.2             94.9  124
## 7            428         104.1            30.5             78.2  107
## 8            429         108.7            32.5             61.4  112
## 9            362          97.2            38.6             62.6  121
## 10           365          96.5            24.7             80.0   66
pairs(hr_stad_CFYS_2017_out, gap=0.5, panel = panel.smooth, main = "Home Run data", col = 3 + (coors_bp))

From the pairs view we can see that the elevation angle of the ball’s trajectory has a negative impact on distance. I’ll now create a multiple regression model to determine how the othr factors come into play.

(hr_stad_CFYS_2017_lm <- lm(TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + HORIZONTAL_ANGLE + APEX , data = hr_stad_CFYS_2017_out))
## 
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + 
##     HORIZONTAL_ANGLE + APEX, data = hr_stad_CFYS_2017_out)
## 
## Coefficients:
##      (Intercept)     EXIT_VELOCITY   ELEVATION_ANGLE  HORIZONTAL_ANGLE  
##        237.87211           2.07560          -6.48694           0.05274  
##             APEX  
##          1.43415
summary(hr_stad_CFYS_2017_lm)
## 
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + 
##     HORIZONTAL_ANGLE + APEX, data = hr_stad_CFYS_2017_out)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -78.049  -9.454   1.385  12.643  37.616 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      237.87211   34.88470   6.819  3.0e-11 ***
## EXIT_VELOCITY      2.07560    0.27259   7.614  1.6e-13 ***
## ELEVATION_ANGLE   -6.48694    0.55444 -11.700  < 2e-16 ***
## HORIZONTAL_ANGLE   0.05274    0.04449   1.185    0.236    
## APEX               1.43415    0.09581  14.969  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.05 on 446 degrees of freedom
## Multiple R-squared:  0.6752, Adjusted R-squared:  0.6722 
## F-statistic: 231.7 on 4 and 446 DF,  p-value: < 2.2e-16

Since horizontal angle p-value > .05, let’s remove that factor and recalculate.

(hr_stad_CFYS_2017_lm <- lm(TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + APEX , data = hr_stad_CFYS_2017_out))
## 
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + 
##     APEX, data = hr_stad_CFYS_2017_out)
## 
## Coefficients:
##     (Intercept)    EXIT_VELOCITY  ELEVATION_ANGLE             APEX  
##         234.365            2.146           -6.460            1.436
summary(hr_stad_CFYS_2017_lm)
## 
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + 
##     APEX, data = hr_stad_CFYS_2017_out)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -78.669  -9.361   1.133  12.644  37.291 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     234.36522   34.77479   6.740 4.92e-11 ***
## EXIT_VELOCITY     2.14553    0.26625   8.058 7.12e-15 ***
## ELEVATION_ANGLE  -6.46031    0.55423 -11.656  < 2e-16 ***
## APEX              1.43574    0.09584  14.980  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.06 on 447 degrees of freedom
## Multiple R-squared:  0.6741, Adjusted R-squared:  0.6719 
## F-statistic: 308.2 on 3 and 447 DF,  p-value: < 2.2e-16

Now that all p-values are below .05, I am still not confortable to call this a good model but let’s analyze the residuals.

residuals <- resid(hr_stad_CFYS_2017_lm)
plot(residuals)

hist(residuals)

qqnorm(resid(hr_stad_CFYS_2017_lm))
qqline(resid(hr_stad_CFYS_2017_lm))

After removing the horizontal angle factor and plotting the residuals, it appears to have a normal distribution but the histogram shows a skew to the right. I will select the elevation angle as my quadratic variable because of it’s negative affect on the distribution and recalculate.

elevation_sq <- hr_stad_CFYS_2017_out$ELEVATION_ANGLE ^ 2
(hr_stad_CFYS_2017_lm_2 <- lm(TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + APEX + elevation_sq, data = hr_stad_CFYS_2017_out))
## 
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + 
##     APEX + elevation_sq, data = hr_stad_CFYS_2017_out)
## 
## Coefficients:
##     (Intercept)    EXIT_VELOCITY  ELEVATION_ANGLE             APEX  
##       -153.6721           3.1230          13.7079           1.1787  
##    elevation_sq  
##         -0.3172
summary(hr_stad_CFYS_2017_lm_2)
## 
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + 
##     APEX + elevation_sq, data = hr_stad_CFYS_2017_out)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.175  -9.616  -0.269   9.544  68.166 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -153.67212   42.77902  -3.592 0.000364 ***
## EXIT_VELOCITY      3.12304    0.24127  12.944  < 2e-16 ***
## ELEVATION_ANGLE   13.70788    1.66196   8.248 1.82e-15 ***
## APEX               1.17875    0.08475  13.908  < 2e-16 ***
## elevation_sq      -0.31720    0.02504 -12.665  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.64 on 446 degrees of freedom
## Multiple R-squared:  0.7603, Adjusted R-squared:  0.7582 
## F-statistic: 353.7 on 4 and 446 DF,  p-value: < 2.2e-16

After doing so, you’ll notivce the median is now very close to 0 and the min/max range seems to have improved. Plotting the residuals does seem to show more of a normal distribution in all plots below.

residuals <- resid(hr_stad_CFYS_2017_lm_2)
plot(residuals)

hist(residuals)

plot(fitted(hr_stad_CFYS_2017_lm),resid(hr_stad_CFYS_2017_lm))

par(mfrow=c(2,2))
plot(hr_stad_CFYS_2017_lm)