library(RODBC)
## Warning: package 'RODBC' was built under R version 4.2.3
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/modules/R_X11.so
## Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-17.0.1+12/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## tcltk DLL is linked to '/opt/X11/lib/libX11.6.dylib'
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
library(ggplot2)
library(patchwork)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(glue)
library(caret)
## Loading required package: lattice
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-40. For overview type 'help("mgcv-package")'.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(lme4)
##
## Attaching package: 'lme4'
## The following object is masked from 'package:nlme':
##
## lmList
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ nlme::collapse() masks dplyr::collapse()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ purrr::lift() masks caret::lift()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(glue)
library(caTools) #this is for test/train split the dataset
library(CatEncoders) #this is for label encoding
##
## Attaching package: 'CatEncoders'
##
## The following object is masked from 'package:base':
##
## transform
library(randomForest) #random forest package
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(xgboost) #xgboost library
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(ggThemeAssist)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.3
library(ggpubr)
library(png)
library(patchwork)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
library(Matrix)
library(readr)
library(rsample)
## Warning: package 'rsample' was built under R version 4.2.3
pitch_data <- read.csv("~/Desktop/pitch_data.csv")
set.seed(1693)
pitch_data$is_called_strike <- as.factor(pitch_data$is_called_strike)
pitch_data$is_swing <- as.factor(pitch_data$is_swing)
split_data <- initial_split(pitch_data, prop = 0.8) # 80% training, 20% testing
# Create training and testing sets
train_data <- training(split_data)
test_data <- testing(split_data)
# Verify the split
dim(train_data) # Check the dimensions of the training set
## [1] 80000 23
dim(test_data)
## [1] 20000 23
names(train_data)
## [1] "pitch_id" "batter_side" "pitcher_side"
## [4] "pitch_type" "balls" "strikes"
## [7] "umpire_id" "pitcher_id" "catcher_id"
## [10] "plate_horz_location" "plate_vert_location" "release_speed"
## [13] "horz_break" "vert_break_induced" "spin_rate"
## [16] "release_height" "release_side" "extension"
## [19] "launch_angle" "exit_velocity" "is_swing"
## [22] "is_in_play" "is_called_strike"
logreg1 <- glmer(
is_called_strike ~ plate_horz_location + plate_vert_location + release_speed + spin_rate +
pitch_type + horz_break + vert_break_induced + release_height + release_side +
extension + is_swing +
(1 | pitcher_id) + (1 | umpire_id) + (1 | catcher_id), # Random intercepts for pitchers and umpires
data = train_data,
family = binomial(link = "logit"), control = glmerControl(optimizer = "bobyqa")
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## unable to evaluate scaled gradient
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge: degenerate Hessian with 1 negative eigenvalues
summary(logreg1)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: is_called_strike ~ plate_horz_location + plate_vert_location +
## release_speed + spin_rate + pitch_type + horz_break + vert_break_induced +
## release_height + release_side + extension + is_swing + (1 |
## pitcher_id) + (1 | umpire_id) + (1 | catcher_id)
## Data: train_data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 50670.4 50874.7 -25313.2 50626.4 79848
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.33169 -0.61255 -0.00004 -0.00003 2.87278
##
## Random effects:
## Groups Name Variance Std.Dev.
## pitcher_id (Intercept) 3.408e-03 0.05838
## umpire_id (Intercept) 4.001e-08 0.00020
## catcher_id (Intercept) 3.257e-03 0.05707
## Number of obs: 79870, groups: pitcher_id, 657; umpire_id, 88; catcher_id, 77
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.976e+00 3.609e-01 5.476 4.36e-08 ***
## plate_horz_location 3.011e-02 1.112e-02 2.708 0.00677 **
## plate_vert_location 1.074e-01 1.068e-02 10.059 < 2e-16 ***
## release_speed -3.181e-02 3.629e-03 -8.765 < 2e-16 ***
## spin_rate 1.100e-04 5.121e-05 2.147 0.03179 *
## pitch_type4FB -4.295e-01 3.994e-02 -10.755 < 2e-16 ***
## pitch_typeCB -8.397e-01 9.170e-02 -9.157 < 2e-16 ***
## pitch_typeCH -1.314e+00 5.723e-02 -22.951 < 2e-16 ***
## pitch_typeCT -5.777e-01 5.135e-02 -11.250 < 2e-16 ***
## pitch_typeKN -2.019e+00 6.315e-01 -3.196 0.00139 **
## pitch_typeSF -1.260e+00 9.098e-02 -13.849 < 2e-16 ***
## pitch_typeSL -7.673e-01 5.557e-02 -13.809 < 2e-16 ***
## pitch_typeSW -8.421e-01 7.131e-02 -11.810 < 2e-16 ***
## horz_break -4.372e-04 1.198e-03 -0.365 0.71520
## vert_break_induced 2.993e-03 2.794e-03 1.071 0.28406
## release_height -2.913e-02 2.312e-02 -1.260 0.20771
## release_side 3.539e-03 6.596e-03 0.537 0.59156
## extension 4.660e-02 2.608e-02 1.787 0.07400 .
## is_swing1 -1.973e+01 1.444e+02 -0.137 0.89132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 19 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
## optimizer (bobyqa) convergence code: 0 (OK)
## unable to evaluate scaled gradient
## Model failed to converge: degenerate Hessian with 1 negative eigenvalues
logreg_preds1 <- predict(logreg1, test_data, type = "response", re.form = NA) #CalledStrikePerc
logreg_preds0 <- 1 - logreg_preds1 #NotCalledStrikePerc
logreg_df <- data.frame(NotCalledStrike = logreg_preds0,
CalledStrikePerc = logreg_preds1,
is_called_strike = as.character(test_data$is_called_strike),
x = test_data$plate_horz_location,
y = test_data$plate_vert_location)
logreg_df$is_called_strike <- as.numeric(logreg_df$is_called_strike)
#Model performance
print(paste("Max Strike%:", max(logreg_df$CalledStrikePerc)))
## [1] "Max Strike%: NA"
print(paste("Mean Strike%:", mean(logreg_df$CalledStrikePerc, na.rm = TRUE)))
## [1] "Mean Strike%: 0.163328613182822"
print(paste("Sum of Strike%:", sum(logreg_df$CalledStrikePerc)))
## [1] "Sum of Strike%: NA"
print(paste("Sum of IsStrike:", sum(logreg_df$is_called_strike)))
## [1] "Sum of IsStrike: 3237"
print(paste("Difference:", abs(sum(logreg_df$CalledStrikePerc)
- sum(logreg_df$is_called_strike))))
## [1] "Difference: NA"
#Accuracy Test
Strike_Perc_Change <- ifelse(logreg_df$CalledStrikePerc > .3, 1, 0) #.3 is an arbitary cutoff, about twice the mean to try and improve accuracy
cm <- confusionMatrix(factor(logreg_df$is_called_strike), factor(Strike_Perc_Change)); cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 12764 3970
## 1 831 2397
##
## Accuracy : 0.7595
## 95% CI : (0.7535, 0.7654)
## No Information Rate : 0.681
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3629
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9389
## Specificity : 0.3765
## Pos Pred Value : 0.7628
## Neg Pred Value : 0.7426
## Prevalence : 0.6810
## Detection Rate : 0.6394
## Detection Prevalence : 0.8383
## Balanced Accuracy : 0.6577
##
## 'Positive' Class : 0
##
accuracy <- cm$overall["Accuracy"]
print(accuracy)
## Accuracy
## 0.759493
logreg_df$bucket <- cut(logreg_df$CalledStrikePerc, breaks = seq(0, .6, by = .05), include.lowest = TRUE, labels = FALSE)
t = ggplot(logreg_df, aes(x = x, y = y, color = as.factor(bucket))) +
geom_point() + xlim(-2,2) + ylim(0,5) +
scale_color_brewer(palette="Spectral", name = "StrikePerc Bucket",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
labels = c("0-0.05", "0.05-0.1", "0.1-0.15", "0.15-0.2", "0.2-0.25",
"0.25-0.3", "0.3-0.35", "0.35-0.4", "0.4-0.45", "0.5-0.55", "0.55-.6")) +
theme_minimal() + ggtitle('Model') + # lines after this create the plot
geom_rect(xmin = -.833, xmax = .833, ymax = 3.47,
ymin = 1.6, alpha = 0, color = "black", size = 1) +
geom_rect(xmin = -1.1083, xmax = 1.1083,
ymin = 1.28, ymax = 3.78, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -.5583, xmax = .5583,
ymin = 1.9, ymax = 3.1, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -1.667, xmax = 1.667,
ymin = .642, ymax = 4.334, alpha = 0, color = "black", linetype = 3, linewidth = .5) +
coord_fixed() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(colour = "black")
) +
theme(axis.line = element_line(linetype = "solid")) + theme(
legend.text = element_text(size = 6.5),
legend.title = element_text(size = 8)
) + theme(plot.title = element_text(size = 11)) +
theme(panel.background = element_rect(fill = "gray"),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank()) + ylab(" ") + xlab(" ") +
theme(panel.background = element_rect(fill = "white")) ; print(t)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 651 rows containing missing values (`geom_point()`).
Overall, this is a decent model, but in future work with a wider sample I think it can get much better
This second model I took out some of the low p-value predictors
logreg2 <- glmer(
is_called_strike ~ plate_horz_location + plate_vert_location + release_speed + spin_rate +
pitch_type +
(1 | pitcher_id) + (1 | umpire_id) + (1 | catcher_id), # Random intercepts for pitchers and umpires
data = train_data,
family = binomial(link = "logit"), control = glmerControl(optimizer = "bobyqa")
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 5.91641 (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
summary(logreg2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: is_called_strike ~ plate_horz_location + plate_vert_location +
## release_speed + spin_rate + pitch_type + (1 | pitcher_id) +
## (1 | umpire_id) + (1 | catcher_id)
## Data: train_data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 69959.0 70107.6 -34963.5 69927.0 79862
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.0032 -0.4653 -0.4287 -0.3223 4.0663
##
## Random effects:
## Groups Name Variance Std.Dev.
## pitcher_id (Intercept) 3.982e-03 0.0631026
## umpire_id (Intercept) 4.020e-08 0.0002005
## catcher_id (Intercept) 2.623e-03 0.0512160
## Number of obs: 79878, groups: pitcher_id, 657; umpire_id, 88; catcher_id, 77
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.792e+00 2.802e-01 6.396 1.60e-10 ***
## plate_horz_location 2.203e-02 1.189e-02 1.852 0.06396 .
## plate_vert_location 7.009e-02 1.140e-02 6.148 7.83e-10 ***
## release_speed -3.609e-02 3.639e-03 -9.919 < 2e-16 ***
## spin_rate 7.408e-05 8.621e-05 0.859 0.39013
## pitch_type4FB -3.560e-01 3.052e-02 -11.662 < 2e-16 ***
## pitch_typeCB -7.688e-01 8.650e-02 -8.887 < 2e-16 ***
## pitch_typeCH -1.277e+00 5.276e-02 -24.211 < 2e-16 ***
## pitch_typeCT -5.626e-01 5.010e-02 -11.228 < 2e-16 ***
## pitch_typeKN -1.924e+00 6.102e-01 -3.154 0.00161 **
## pitch_typeSF -1.310e+00 9.363e-02 -13.990 < 2e-16 ***
## pitch_typeSL -7.913e-01 5.786e-02 -13.676 < 2e-16 ***
## pitch_typeSW -7.895e-01 8.121e-02 -9.722 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 13 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
## optimizer (bobyqa) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 5.91641 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
logreg_preds1 <- predict(logreg2, test_data, type = "response", re.form = NA) #CalledStrikePerc
logreg_preds0 <- 1 - logreg_preds1 #NotCalledStrikePerc
logreg_df <- data.frame(NotCalledStrike = logreg_preds0,
CalledStrikePerc = logreg_preds1,
is_called_strike = as.character(test_data$is_called_strike),
x = test_data$plate_horz_location,
y = test_data$plate_vert_location)
logreg_df$is_called_strike <- as.numeric(logreg_df$is_called_strike)
#Model performance
print(paste("Max Strike%:", max(logreg_df$CalledStrikePerc)))
## [1] "Max Strike%: NA"
print(paste("Mean Strike%:", mean(logreg_df$CalledStrikePerc, na.rm = TRUE)))
## [1] "Mean Strike%: 0.162732652211725"
print(paste("Sum of Strike%:", sum(logreg_df$CalledStrikePerc)))
## [1] "Sum of Strike%: NA"
print(paste("Sum of IsStrike:", sum(logreg_df$is_called_strike)))
## [1] "Sum of IsStrike: 3237"
print(paste("Difference:", abs(sum(logreg_df$CalledStrikePerc)
- sum(logreg_df$is_called_strike))))
## [1] "Difference: NA"
#Accuracy Test
Strike_Perc_Change <- ifelse(logreg_df$CalledStrikePerc > .3, 1, 0) #.3 is an arbitary cutoff, about twice the mean to try and improve accuracy
cm <- confusionMatrix(factor(logreg_df$is_called_strike), factor(Strike_Perc_Change)); cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16699 36
## 1 3225 4
##
## Accuracy : 0.8367
## 95% CI : (0.8315, 0.8418)
## No Information Rate : 0.998
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0015
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.838135
## Specificity : 0.100000
## Pos Pred Value : 0.997849
## Neg Pred Value : 0.001239
## Prevalence : 0.997996
## Detection Rate : 0.836456
## Detection Prevalence : 0.838259
## Balanced Accuracy : 0.469067
##
## 'Positive' Class : 0
##
accuracy <- cm$overall["Accuracy"]
print(accuracy)
## Accuracy
## 0.836656
logreg_df$bucket <- cut(logreg_df$CalledStrikePerc, breaks = seq(0, .6, by = .05), include.lowest = TRUE, labels = FALSE)
t = ggplot(logreg_df, aes(x = x, y = y, color = as.factor(bucket))) +
geom_point() + xlim(-2,2) + ylim(0,5) +
scale_color_brewer(palette="Spectral", name = "StrikePerc Bucket",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
labels = c("0-0.05", "0.05-0.1", "0.1-0.15", "0.15-0.2", "0.2-0.25",
"0.25-0.3", "0.3-0.35", "0.35-0.4", "0.4-0.45", "0.5-0.55", "0.55-.6")) +
theme_minimal() + ggtitle('Model') + # lines after this create the plot
geom_rect(xmin = -.833, xmax = .833, ymax = 3.47,
ymin = 1.6, alpha = 0, color = "black", size = 1) +
geom_rect(xmin = -1.1083, xmax = 1.1083,
ymin = 1.28, ymax = 3.78, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -.5583, xmax = .5583,
ymin = 1.9, ymax = 3.1, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -1.667, xmax = 1.667,
ymin = .642, ymax = 4.334, alpha = 0, color = "black", linetype = 3, linewidth = .5) +
coord_fixed() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(colour = "black")
) +
theme(axis.line = element_line(linetype = "solid")) + theme(
legend.text = element_text(size = 6.5),
legend.title = element_text(size = 8)
) + theme(plot.title = element_text(size = 11)) +
theme(panel.background = element_rect(fill = "gray"),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank()) + ylab(" ") + xlab(" ") +
theme(panel.background = element_rect(fill = "white")) ; print(t)
## Warning: Removed 648 rows containing missing values (`geom_point()`).
#This is just a thing I wanted to try on all the pitches with no swings, wanted to see if I could make it better
filter_pitch_data <- pitch_data %>%
filter(is_swing == 0)
filter_split_data <- initial_split(filter_pitch_data, prop = 0.8) # 80% training, 20% testing
# Create training and testing sets
train_data <- training(filter_split_data)
test_data <- testing(filter_split_data)
# Verify the split
dim(train_data) # Check the dimensions of the training set
## [1] 41715 23
dim(test_data)
## [1] 10429 23
logreg3 <- glmer(
is_called_strike ~ plate_horz_location + plate_vert_location + release_speed + spin_rate +
pitch_type + horz_break + vert_break_induced + release_height + release_side +
extension +
(1 | pitcher_id) + (1 | umpire_id) + (1 | catcher_id), # Random intercepts for pitchers and umpires
data = train_data,
family = binomial(link = "logit"), control = glmerControl(optimizer = "bobyqa")
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 3.8837 (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
summary(logreg3)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: is_called_strike ~ plate_horz_location + plate_vert_location +
## release_speed + spin_rate + pitch_type + horz_break + vert_break_induced +
## release_height + release_side + extension + (1 | pitcher_id) +
## (1 | umpire_id) + (1 | catcher_id)
## Data: train_data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 50675.7 50857.0 -25316.8 50633.7 41616
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.2995 -0.7102 -0.6022 1.2755 2.4327
##
## Random effects:
## Groups Name Variance Std.Dev.
## pitcher_id (Intercept) 6.787e-03 0.0823840
## umpire_id (Intercept) 4.815e-08 0.0002194
## catcher_id (Intercept) 1.810e-03 0.0425462
## Number of obs: 41637, groups: pitcher_id, 654; umpire_id, 88; catcher_id, 77
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.766e+00 3.685e-01 4.793 1.64e-06 ***
## plate_horz_location 2.408e-02 1.112e-02 2.165 0.0304 *
## plate_vert_location 1.105e-01 1.071e-02 10.320 < 2e-16 ***
## release_speed -2.777e-02 3.788e-03 -7.332 2.26e-13 ***
## spin_rate 8.397e-05 5.669e-05 1.481 0.1386
## pitch_type4FB -4.154e-01 4.025e-02 -10.320 < 2e-16 ***
## pitch_typeCB -7.675e-01 9.391e-02 -8.173 3.01e-16 ***
## pitch_typeCH -1.283e+00 5.833e-02 -22.002 < 2e-16 ***
## pitch_typeCT -5.551e-01 5.220e-02 -10.633 < 2e-16 ***
## pitch_typeKN -1.341e+00 5.799e-01 -2.313 0.0207 *
## pitch_typeSF -1.236e+00 9.188e-02 -13.450 < 2e-16 ***
## pitch_typeSL -7.280e-01 5.713e-02 -12.742 < 2e-16 ***
## pitch_typeSW -7.751e-01 7.416e-02 -10.452 < 2e-16 ***
## horz_break 4.436e-04 1.204e-03 0.368 0.7126
## vert_break_induced 2.777e-03 2.800e-03 0.992 0.3213
## release_height -2.714e-02 2.358e-02 -1.151 0.2497
## release_side 6.392e-03 6.791e-03 0.941 0.3466
## extension 2.447e-02 2.659e-02 0.920 0.3573
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 18 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
## optimizer (bobyqa) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 3.8837 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
logreg4 <- glmer(
is_called_strike ~ plate_horz_location + plate_vert_location + release_speed + pitch_type +
(1 | pitcher_id) + (1 | umpire_id) + (1 | catcher_id), # Random intercepts for pitchers and umpires
data = train_data,
family = binomial(link = "logit"), control = glmerControl(optimizer = "bobyqa")
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0935542 (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
summary(logreg4)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: is_called_strike ~ plate_horz_location + plate_vert_location +
## release_speed + pitch_type + (1 | pitcher_id) + (1 | umpire_id) +
## (1 | catcher_id)
## Data: train_data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 50778.8 50908.4 -25374.4 50748.8 41700
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.2903 -0.7110 -0.6034 1.2718 2.4192
##
## Random effects:
## Groups Name Variance Std.Dev.
## pitcher_id (Intercept) 7.017e-03 0.0837673
## umpire_id (Intercept) 3.165e-08 0.0001779
## catcher_id (Intercept) 2.106e-03 0.0458886
## Number of obs: 41715, groups: pitcher_id, 654; umpire_id, 88; catcher_id, 77
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.724061 0.321272 5.366 8.03e-08 ***
## plate_horz_location 0.023256 0.010951 2.124 0.03370 *
## plate_vert_location 0.108106 0.010556 10.241 < 2e-16 ***
## release_speed -0.024980 0.003399 -7.350 1.99e-13 ***
## pitch_type4FB -0.384135 0.032436 -11.843 < 2e-16 ***
## pitch_typeCB -0.760673 0.066736 -11.398 < 2e-16 ***
## pitch_typeCH -1.301028 0.056201 -23.149 < 2e-16 ***
## pitch_typeCT -0.532083 0.048288 -11.019 < 2e-16 ***
## pitch_typeKN -1.487646 0.573402 -2.594 0.00948 **
## pitch_typeSF -1.295435 0.082485 -15.705 < 2e-16 ***
## pitch_typeSL -0.710568 0.047822 -14.859 < 2e-16 ***
## pitch_typeSW -0.728409 0.060392 -12.061 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) plt_h_ plt_v_ rls_sp pt_4FB ptc_CB ptc_CH ptc_CT ptc_KN
## plt_hrz_lct -0.009
## plt_vrt_lct -0.134 -0.173
## release_spd -0.994 0.022 0.058
## ptch_typ4FB 0.013 0.017 -0.191 -0.062
## pitch_typCB -0.776 0.010 0.111 0.746 0.246
## pitch_typCH -0.537 -0.012 0.132 0.497 0.313 0.553
## pitch_typCT -0.325 0.052 -0.011 0.288 0.399 0.413 0.376
## pitch_typKN -0.105 -0.002 0.007 0.102 0.028 0.092 0.070 0.051
## pitch_typSF -0.349 -0.031 0.099 0.320 0.212 0.363 0.306 0.250 0.047
## pitch_typSL -0.613 0.054 0.131 0.567 0.373 0.634 0.536 0.438 0.081
## pitch_typSW -0.698 0.069 0.091 0.665 0.279 0.659 0.526 0.412 0.087
## ptc_SF ptc_SL
## plt_hrz_lct
## plt_vrt_lct
## release_spd
## ptch_typ4FB
## pitch_typCB
## pitch_typCH
## pitch_typCT
## pitch_typKN
## pitch_typSF
## pitch_typSL 0.354
## pitch_typSW 0.345 0.607
## optimizer (bobyqa) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.0935542 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
logreg_preds1 <- predict(logreg3, test_data, type = "response", re.form = NA) #CalledStrikePerc
logreg_preds0 <- 1 - logreg_preds1 #NotCalledStrikePerc
logreg_df <- data.frame(NotCalledStrike = logreg_preds0,
CalledStrikePerc = logreg_preds1,
is_called_strike = as.character(test_data$is_called_strike),
x = test_data$plate_horz_location,
y = test_data$plate_vert_location)
logreg_df$is_called_strike <- as.numeric(logreg_df$is_called_strike)
#Model performance
print(paste("Max Strike%:", max(logreg_df$CalledStrikePerc)))
## [1] "Max Strike%: NA"
print(paste("Mean Strike%:", mean(logreg_df$CalledStrikePerc, na.rm = TRUE)))
## [1] "Mean Strike%: 0.311786704792465"
print(paste("Sum of Strike%:", sum(logreg_df$CalledStrikePerc)))
## [1] "Sum of Strike%: NA"
print(paste("Sum of IsStrike:", sum(logreg_df$is_called_strike)))
## [1] "Sum of IsStrike: 3251"
print(paste("Difference:", abs(sum(logreg_df$CalledStrikePerc)
- sum(logreg_df$is_called_strike))))
## [1] "Difference: NA"
#Accuracy Test
Strike_Perc_Change <- ifelse(logreg_df$CalledStrikePerc > .4, 1, 0) #.4 is an arbitary
cm <- confusionMatrix(factor(logreg_df$is_called_strike), factor(Strike_Perc_Change)); cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6527 642
## 1 2739 505
##
## Accuracy : 0.6753
## 95% CI : (0.6662, 0.6843)
## No Information Rate : 0.8898
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0803
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7044
## Specificity : 0.4403
## Pos Pred Value : 0.9104
## Neg Pred Value : 0.1557
## Prevalence : 0.8898
## Detection Rate : 0.6268
## Detection Prevalence : 0.6885
## Balanced Accuracy : 0.5723
##
## 'Positive' Class : 0
##
accuracy <- cm$overall["Accuracy"]
print(accuracy)
## Accuracy
## 0.6753097
logreg_df$bucket <- cut(logreg_df$CalledStrikePerc, breaks = seq(0, .6, by = .05), include.lowest = TRUE, labels = FALSE)
t = ggplot(logreg_df, aes(x = x, y = y, color = as.factor(bucket))) +
geom_point() + xlim(-2,2) + ylim(0,5) +
scale_color_brewer(palette="Spectral", name = "StrikePerc Bucket",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
labels = c("0-0.1", "0.1-0.2", "0.2-0.3", "0.3-0.4", "0.4-0.5",
"0.5-0.6", "0.6-0.7", "0.7-0.8", "0.8-0.9", "0.9-1.0")) +
theme_minimal() + ggtitle('Model') + # lines after this create the plot
geom_rect(xmin = -.833, xmax = .833, ymax = 3.47,
ymin = 1.6, alpha = 0, color = "black", size = 1) +
geom_rect(xmin = -1.1083, xmax = 1.1083,
ymin = 1.28, ymax = 3.78, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -.5583, xmax = .5583,
ymin = 1.9, ymax = 3.1, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -1.667, xmax = 1.667,
ymin = .642, ymax = 4.334, alpha = 0, color = "black", linetype = 3, linewidth = .5) +
coord_fixed() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(colour = "black")
) +
theme(axis.line = element_line(linetype = "solid")) + theme(
legend.text = element_text(size = 6.5),
legend.title = element_text(size = 8)
) + theme(plot.title = element_text(size = 11)) +
theme(panel.background = element_rect(fill = "gray"),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank()) + ylab(" ") + xlab(" ") +
theme(panel.background = element_rect(fill = "white")) ; print(t)
## Warning: Removed 594 rows containing missing values (`geom_point()`).