# Load needed libraries
library(tidyverse)
library(readr)
library(knitr)
library(sqldf)
filename <- tempfile()
download.file("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA605%20Fall%202022/Week%2011/HR%20Tracker.csv",filename)
hr_stad_CFYS_2017 <- read.csv.sql(filename, "select TRUE_DISTANCE, EXIT_VELOCITY, ELEVATION_ANGLE, HORIZONTAL_ANGLE, APEX, BALLPARK from file where GAME_DATE > '2017-04-01' and EXIT_VELOCITY > 70 and BALLPARK IN ('Coors Field','Yankee Stadium')", sep=",")
The stadium will be my dichotimus variable.
coors_bp <- ifelse(hr_stad_CFYS_2017$BALLPARK == 'Coors Field', 1, 0)
hr_stad_CFYS_2017_out <- data.frame(TRUE_DISTANCE = hr_stad_CFYS_2017$TRUE_DISTANCE, EXIT_VELOCITY = hr_stad_CFYS_2017$EXIT_VELOCITY, ELEVATION_ANGLE = hr_stad_CFYS_2017$ELEVATION_ANGLE , HORIZONTAL_ANGLE = hr_stad_CFYS_2017$HORIZONTAL_ANGLE, APEX = hr_stad_CFYS_2017$APEX)
head(hr_stad_CFYS_2017_out,10)
## TRUE_DISTANCE EXIT_VELOCITY ELEVATION_ANGLE HORIZONTAL_ANGLE APEX
## 1 387 106.6 25.9 121.8 69
## 2 397 98.8 29.5 78.4 91
## 3 420 106.1 27.5 71.3 86
## 4 487 118.9 25.0 103.2 104
## 5 408 104.6 23.8 80.0 69
## 6 424 102.3 33.2 94.9 124
## 7 428 104.1 30.5 78.2 107
## 8 429 108.7 32.5 61.4 112
## 9 362 97.2 38.6 62.6 121
## 10 365 96.5 24.7 80.0 66
pairs(hr_stad_CFYS_2017_out, gap=0.5, panel = panel.smooth, main = "Home Run data", col = 3 + (coors_bp))
From the pairs view we can see that the elevation angle of the ball’s trajectory has a negative impact on distance. I’ll now create a multiple regression model to determine how the othr factors come into play.
(hr_stad_CFYS_2017_lm <- lm(TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + HORIZONTAL_ANGLE + APEX , data = hr_stad_CFYS_2017_out))
##
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE +
## HORIZONTAL_ANGLE + APEX, data = hr_stad_CFYS_2017_out)
##
## Coefficients:
## (Intercept) EXIT_VELOCITY ELEVATION_ANGLE HORIZONTAL_ANGLE
## 237.87211 2.07560 -6.48694 0.05274
## APEX
## 1.43415
summary(hr_stad_CFYS_2017_lm)
##
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE +
## HORIZONTAL_ANGLE + APEX, data = hr_stad_CFYS_2017_out)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.049 -9.454 1.385 12.643 37.616
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 237.87211 34.88470 6.819 3.0e-11 ***
## EXIT_VELOCITY 2.07560 0.27259 7.614 1.6e-13 ***
## ELEVATION_ANGLE -6.48694 0.55444 -11.700 < 2e-16 ***
## HORIZONTAL_ANGLE 0.05274 0.04449 1.185 0.236
## APEX 1.43415 0.09581 14.969 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.05 on 446 degrees of freedom
## Multiple R-squared: 0.6752, Adjusted R-squared: 0.6722
## F-statistic: 231.7 on 4 and 446 DF, p-value: < 2.2e-16
Since horizontal angle p-value > .05, let’s remove that factor and recalculate.
(hr_stad_CFYS_2017_lm <- lm(TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + APEX , data = hr_stad_CFYS_2017_out))
##
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE +
## APEX, data = hr_stad_CFYS_2017_out)
##
## Coefficients:
## (Intercept) EXIT_VELOCITY ELEVATION_ANGLE APEX
## 234.365 2.146 -6.460 1.436
summary(hr_stad_CFYS_2017_lm)
##
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE +
## APEX, data = hr_stad_CFYS_2017_out)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.669 -9.361 1.133 12.644 37.291
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 234.36522 34.77479 6.740 4.92e-11 ***
## EXIT_VELOCITY 2.14553 0.26625 8.058 7.12e-15 ***
## ELEVATION_ANGLE -6.46031 0.55423 -11.656 < 2e-16 ***
## APEX 1.43574 0.09584 14.980 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.06 on 447 degrees of freedom
## Multiple R-squared: 0.6741, Adjusted R-squared: 0.6719
## F-statistic: 308.2 on 3 and 447 DF, p-value: < 2.2e-16
Now that all p-values are below .05, I am still not confortable to call this a good model but let’s analyze the residuals.
residuals <- resid(hr_stad_CFYS_2017_lm)
plot(residuals)
hist(residuals)
qqnorm(resid(hr_stad_CFYS_2017_lm))
qqline(resid(hr_stad_CFYS_2017_lm))
After removing the horizontal angle factor and plotting the residuals, it appears to have a normal distribution but the histogram shows a skew to the right. I will select the elevation angle as my quadratic variable because of it’s negative affect on the distribution and recalculate.
elevation_sq <- hr_stad_CFYS_2017_out$ELEVATION_ANGLE ^ 2
(hr_stad_CFYS_2017_lm_2 <- lm(TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE + APEX + elevation_sq, data = hr_stad_CFYS_2017_out))
##
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE +
## APEX + elevation_sq, data = hr_stad_CFYS_2017_out)
##
## Coefficients:
## (Intercept) EXIT_VELOCITY ELEVATION_ANGLE APEX
## -153.6721 3.1230 13.7079 1.1787
## elevation_sq
## -0.3172
summary(hr_stad_CFYS_2017_lm_2)
##
## Call:
## lm(formula = TRUE_DISTANCE ~ EXIT_VELOCITY + ELEVATION_ANGLE +
## APEX + elevation_sq, data = hr_stad_CFYS_2017_out)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.175 -9.616 -0.269 9.544 68.166
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -153.67212 42.77902 -3.592 0.000364 ***
## EXIT_VELOCITY 3.12304 0.24127 12.944 < 2e-16 ***
## ELEVATION_ANGLE 13.70788 1.66196 8.248 1.82e-15 ***
## APEX 1.17875 0.08475 13.908 < 2e-16 ***
## elevation_sq -0.31720 0.02504 -12.665 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.64 on 446 degrees of freedom
## Multiple R-squared: 0.7603, Adjusted R-squared: 0.7582
## F-statistic: 353.7 on 4 and 446 DF, p-value: < 2.2e-16
After doing so, you’ll notivce the median is now very close to 0 and the min/max range seems to have improved. Plotting the residuals does seem to show more of a normal distribution in all plots below.
residuals <- resid(hr_stad_CFYS_2017_lm_2)
plot(residuals)
hist(residuals)
plot(fitted(hr_stad_CFYS_2017_lm),resid(hr_stad_CFYS_2017_lm))
par(mfrow=c(2,2))
plot(hr_stad_CFYS_2017_lm)