library(RODBC)
## Warning: package 'RODBC' was built under R version 4.2.3
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/modules/R_X11.so
##   Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-17.0.1+12/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## tcltk DLL is linked to '/opt/X11/lib/libX11.6.dylib'
## Could not load tcltk.  Will use slower R code instead.
## Loading required package: RSQLite
library(ggplot2)
library(patchwork)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(glue)
library(caret)
## Loading required package: lattice
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-40. For overview type 'help("mgcv-package")'.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(lme4)
## 
## Attaching package: 'lme4'
## The following object is masked from 'package:nlme':
## 
##     lmList
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ nlme::collapse()      masks dplyr::collapse()
## ✖ tidyr::expand()       masks Matrix::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ purrr::lift()         masks caret::lift()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ tidyr::pack()         masks Matrix::pack()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ tidyr::unpack()       masks Matrix::unpack()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(glue)
library(caTools) #this is for test/train split the dataset
library(CatEncoders) #this is for label encoding
## 
## Attaching package: 'CatEncoders'
## 
## The following object is masked from 'package:base':
## 
##     transform
library(randomForest) #random forest package
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(xgboost) #xgboost library
## 
## Attaching package: 'xgboost'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice
library(ggThemeAssist)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.3
library(ggpubr)
library(png)
library(patchwork)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
library(Matrix)
library(readr)
library(rsample)
## Warning: package 'rsample' was built under R version 4.2.3
pitch_data <- read.csv("~/Desktop/pitch_data.csv")
set.seed(1693)
pitch_data$is_called_strike <- as.factor(pitch_data$is_called_strike)
pitch_data$is_swing <- as.factor(pitch_data$is_swing)

split_data <- initial_split(pitch_data, prop = 0.8)  # 80% training, 20% testing

# Create training and testing sets
train_data <- training(split_data)
test_data  <- testing(split_data)

# Verify the split
dim(train_data)  # Check the dimensions of the training set
## [1] 80000    23
dim(test_data) 
## [1] 20000    23
names(train_data)
##  [1] "pitch_id"            "batter_side"         "pitcher_side"       
##  [4] "pitch_type"          "balls"               "strikes"            
##  [7] "umpire_id"           "pitcher_id"          "catcher_id"         
## [10] "plate_horz_location" "plate_vert_location" "release_speed"      
## [13] "horz_break"          "vert_break_induced"  "spin_rate"          
## [16] "release_height"      "release_side"        "extension"          
## [19] "launch_angle"        "exit_velocity"       "is_swing"           
## [22] "is_in_play"          "is_called_strike"
logreg1 <- glmer(
  is_called_strike ~ plate_horz_location + plate_vert_location + release_speed + spin_rate + 
    pitch_type + horz_break + vert_break_induced + release_height + release_side +
    extension  + is_swing +
    (1 | pitcher_id) + (1 | umpire_id) + (1 | catcher_id),  # Random intercepts for pitchers and umpires
  data = train_data,
  family = binomial(link = "logit"), control = glmerControl(optimizer = "bobyqa")
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## unable to evaluate scaled gradient
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge: degenerate Hessian with 1 negative eigenvalues
summary(logreg1)
## Warning in vcov.merMod(object, use.hessian = use.hessian): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Warning in vcov.merMod(object, correlation = correlation, sigm = sig): variance-covariance matrix computed from finite-difference Hessian is
## not positive definite or contains NA values: falling back to var-cov estimated from RX
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: is_called_strike ~ plate_horz_location + plate_vert_location +  
##     release_speed + spin_rate + pitch_type + horz_break + vert_break_induced +  
##     release_height + release_side + extension + is_swing + (1 |  
##     pitcher_id) + (1 | umpire_id) + (1 | catcher_id)
##    Data: train_data
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  50670.4  50874.7 -25313.2  50626.4    79848 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.33169 -0.61255 -0.00004 -0.00003  2.87278 
## 
## Random effects:
##  Groups     Name        Variance  Std.Dev.
##  pitcher_id (Intercept) 3.408e-03 0.05838 
##  umpire_id  (Intercept) 4.001e-08 0.00020 
##  catcher_id (Intercept) 3.257e-03 0.05707 
## Number of obs: 79870, groups:  pitcher_id, 657; umpire_id, 88; catcher_id, 77
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.976e+00  3.609e-01   5.476 4.36e-08 ***
## plate_horz_location  3.011e-02  1.112e-02   2.708  0.00677 ** 
## plate_vert_location  1.074e-01  1.068e-02  10.059  < 2e-16 ***
## release_speed       -3.181e-02  3.629e-03  -8.765  < 2e-16 ***
## spin_rate            1.100e-04  5.121e-05   2.147  0.03179 *  
## pitch_type4FB       -4.295e-01  3.994e-02 -10.755  < 2e-16 ***
## pitch_typeCB        -8.397e-01  9.170e-02  -9.157  < 2e-16 ***
## pitch_typeCH        -1.314e+00  5.723e-02 -22.951  < 2e-16 ***
## pitch_typeCT        -5.777e-01  5.135e-02 -11.250  < 2e-16 ***
## pitch_typeKN        -2.019e+00  6.315e-01  -3.196  0.00139 ** 
## pitch_typeSF        -1.260e+00  9.098e-02 -13.849  < 2e-16 ***
## pitch_typeSL        -7.673e-01  5.557e-02 -13.809  < 2e-16 ***
## pitch_typeSW        -8.421e-01  7.131e-02 -11.810  < 2e-16 ***
## horz_break          -4.372e-04  1.198e-03  -0.365  0.71520    
## vert_break_induced   2.993e-03  2.794e-03   1.071  0.28406    
## release_height      -2.913e-02  2.312e-02  -1.260  0.20771    
## release_side         3.539e-03  6.596e-03   0.537  0.59156    
## extension            4.660e-02  2.608e-02   1.787  0.07400 .  
## is_swing1           -1.973e+01  1.444e+02  -0.137  0.89132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 19 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
## optimizer (bobyqa) convergence code: 0 (OK)
## unable to evaluate scaled gradient
## Model failed to converge: degenerate  Hessian with 1 negative eigenvalues
logreg_preds1 <- predict(logreg1, test_data, type = "response", re.form = NA) #CalledStrikePerc
logreg_preds0 <- 1 - logreg_preds1 #NotCalledStrikePerc
logreg_df <- data.frame(NotCalledStrike = logreg_preds0,
                        CalledStrikePerc = logreg_preds1,
                        is_called_strike = as.character(test_data$is_called_strike),
                        x = test_data$plate_horz_location,
                        y = test_data$plate_vert_location)

logreg_df$is_called_strike <- as.numeric(logreg_df$is_called_strike)

#Model performance 
print(paste("Max Strike%:", max(logreg_df$CalledStrikePerc)))
## [1] "Max Strike%: NA"
print(paste("Mean Strike%:", mean(logreg_df$CalledStrikePerc, na.rm = TRUE)))
## [1] "Mean Strike%: 0.163328613182822"
print(paste("Sum of Strike%:", sum(logreg_df$CalledStrikePerc)))
## [1] "Sum of Strike%: NA"
print(paste("Sum of IsStrike:", sum(logreg_df$is_called_strike)))
## [1] "Sum of IsStrike: 3237"
print(paste("Difference:", abs(sum(logreg_df$CalledStrikePerc)
                    - sum(logreg_df$is_called_strike))))
## [1] "Difference: NA"
#Accuracy Test
Strike_Perc_Change <- ifelse(logreg_df$CalledStrikePerc > .3, 1, 0) #.3 is an arbitary cutoff, about twice the mean to try and improve accuracy
cm <- confusionMatrix(factor(logreg_df$is_called_strike), factor(Strike_Perc_Change)); cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 12764  3970
##          1   831  2397
##                                           
##                Accuracy : 0.7595          
##                  95% CI : (0.7535, 0.7654)
##     No Information Rate : 0.681           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3629          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9389          
##             Specificity : 0.3765          
##          Pos Pred Value : 0.7628          
##          Neg Pred Value : 0.7426          
##              Prevalence : 0.6810          
##          Detection Rate : 0.6394          
##    Detection Prevalence : 0.8383          
##       Balanced Accuracy : 0.6577          
##                                           
##        'Positive' Class : 0               
## 
accuracy <- cm$overall["Accuracy"]
print(accuracy)
## Accuracy 
## 0.759493
logreg_df$bucket <- cut(logreg_df$CalledStrikePerc, breaks = seq(0, .6, by = .05), include.lowest = TRUE, labels = FALSE)
t = ggplot(logreg_df, aes(x = x, y = y, color = as.factor(bucket))) +
geom_point() + xlim(-2,2) + ylim(0,5) +
scale_color_brewer(palette="Spectral", name = "StrikePerc Bucket",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
labels = c("0-0.05", "0.05-0.1", "0.1-0.15", "0.15-0.2", "0.2-0.25", 
"0.25-0.3", "0.3-0.35", "0.35-0.4", "0.4-0.45", "0.5-0.55", "0.55-.6")) + 
theme_minimal()  + ggtitle('Model') + # lines after this create the plot
geom_rect(xmin = -.833, xmax = .833, ymax = 3.47, 
ymin = 1.6, alpha = 0, color = "black", size = 1) +
geom_rect(xmin = -1.1083, xmax = 1.1083, 
ymin = 1.28, ymax = 3.78, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -.5583, xmax = .5583, 
ymin = 1.9, ymax = 3.1, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -1.667, xmax = 1.667, 
ymin = .642, ymax = 4.334, alpha = 0, color = "black", linetype = 3, linewidth = .5) +
coord_fixed() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(colour = "black")
) +
theme(axis.line = element_line(linetype = "solid")) + theme(
legend.text = element_text(size = 6.5),
legend.title = element_text(size = 8)
) + theme(plot.title = element_text(size = 11)) +
theme(panel.background = element_rect(fill = "gray"),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank()) + ylab(" ") + xlab(" ") +
theme(panel.background = element_rect(fill = "white")) ; print(t)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 651 rows containing missing values (`geom_point()`).

Overall, this is a decent model, but in future work with a wider sample I think it can get much better

This second model I took out some of the low p-value predictors

logreg2 <- glmer(
  is_called_strike ~ plate_horz_location + plate_vert_location + release_speed + spin_rate + 
    pitch_type + 
    (1 | pitcher_id) + (1 | umpire_id) + (1 | catcher_id),  # Random intercepts for pitchers and umpires
  data = train_data,
  family = binomial(link = "logit"), control = glmerControl(optimizer = "bobyqa")
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 5.91641 (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
##  - Rescale variables?
summary(logreg2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: is_called_strike ~ plate_horz_location + plate_vert_location +  
##     release_speed + spin_rate + pitch_type + (1 | pitcher_id) +  
##     (1 | umpire_id) + (1 | catcher_id)
##    Data: train_data
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  69959.0  70107.6 -34963.5  69927.0    79862 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.0032 -0.4653 -0.4287 -0.3223  4.0663 
## 
## Random effects:
##  Groups     Name        Variance  Std.Dev. 
##  pitcher_id (Intercept) 3.982e-03 0.0631026
##  umpire_id  (Intercept) 4.020e-08 0.0002005
##  catcher_id (Intercept) 2.623e-03 0.0512160
## Number of obs: 79878, groups:  pitcher_id, 657; umpire_id, 88; catcher_id, 77
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.792e+00  2.802e-01   6.396 1.60e-10 ***
## plate_horz_location  2.203e-02  1.189e-02   1.852  0.06396 .  
## plate_vert_location  7.009e-02  1.140e-02   6.148 7.83e-10 ***
## release_speed       -3.609e-02  3.639e-03  -9.919  < 2e-16 ***
## spin_rate            7.408e-05  8.621e-05   0.859  0.39013    
## pitch_type4FB       -3.560e-01  3.052e-02 -11.662  < 2e-16 ***
## pitch_typeCB        -7.688e-01  8.650e-02  -8.887  < 2e-16 ***
## pitch_typeCH        -1.277e+00  5.276e-02 -24.211  < 2e-16 ***
## pitch_typeCT        -5.626e-01  5.010e-02 -11.228  < 2e-16 ***
## pitch_typeKN        -1.924e+00  6.102e-01  -3.154  0.00161 ** 
## pitch_typeSF        -1.310e+00  9.363e-02 -13.990  < 2e-16 ***
## pitch_typeSL        -7.913e-01  5.786e-02 -13.676  < 2e-16 ***
## pitch_typeSW        -7.895e-01  8.121e-02  -9.722  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 13 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
## optimizer (bobyqa) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 5.91641 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
##  - Rescale variables?
logreg_preds1 <- predict(logreg2, test_data, type = "response", re.form = NA) #CalledStrikePerc
logreg_preds0 <- 1 - logreg_preds1 #NotCalledStrikePerc
logreg_df <- data.frame(NotCalledStrike = logreg_preds0,
                        CalledStrikePerc = logreg_preds1,
                        is_called_strike = as.character(test_data$is_called_strike),
                        x = test_data$plate_horz_location,
                        y = test_data$plate_vert_location)

logreg_df$is_called_strike <- as.numeric(logreg_df$is_called_strike)

#Model performance 
print(paste("Max Strike%:", max(logreg_df$CalledStrikePerc)))
## [1] "Max Strike%: NA"
print(paste("Mean Strike%:", mean(logreg_df$CalledStrikePerc, na.rm = TRUE)))
## [1] "Mean Strike%: 0.162732652211725"
print(paste("Sum of Strike%:", sum(logreg_df$CalledStrikePerc)))
## [1] "Sum of Strike%: NA"
print(paste("Sum of IsStrike:", sum(logreg_df$is_called_strike)))
## [1] "Sum of IsStrike: 3237"
print(paste("Difference:", abs(sum(logreg_df$CalledStrikePerc)
                    - sum(logreg_df$is_called_strike))))
## [1] "Difference: NA"
#Accuracy Test
Strike_Perc_Change <- ifelse(logreg_df$CalledStrikePerc > .3, 1, 0) #.3 is an arbitary cutoff, about twice the mean to try and improve accuracy
cm <- confusionMatrix(factor(logreg_df$is_called_strike), factor(Strike_Perc_Change)); cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 16699    36
##          1  3225     4
##                                           
##                Accuracy : 0.8367          
##                  95% CI : (0.8315, 0.8418)
##     No Information Rate : 0.998           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0015         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.838135        
##             Specificity : 0.100000        
##          Pos Pred Value : 0.997849        
##          Neg Pred Value : 0.001239        
##              Prevalence : 0.997996        
##          Detection Rate : 0.836456        
##    Detection Prevalence : 0.838259        
##       Balanced Accuracy : 0.469067        
##                                           
##        'Positive' Class : 0               
## 
accuracy <- cm$overall["Accuracy"]
print(accuracy)
## Accuracy 
## 0.836656
logreg_df$bucket <- cut(logreg_df$CalledStrikePerc, breaks = seq(0, .6, by = .05), include.lowest = TRUE, labels = FALSE)
t = ggplot(logreg_df, aes(x = x, y = y, color = as.factor(bucket))) +
geom_point() + xlim(-2,2) + ylim(0,5) +
scale_color_brewer(palette="Spectral", name = "StrikePerc Bucket",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
labels = c("0-0.05", "0.05-0.1", "0.1-0.15", "0.15-0.2", "0.2-0.25", 
"0.25-0.3", "0.3-0.35", "0.35-0.4", "0.4-0.45", "0.5-0.55", "0.55-.6")) + 
theme_minimal()  + ggtitle('Model') + # lines after this create the plot
geom_rect(xmin = -.833, xmax = .833, ymax = 3.47, 
ymin = 1.6, alpha = 0, color = "black", size = 1) +
geom_rect(xmin = -1.1083, xmax = 1.1083, 
ymin = 1.28, ymax = 3.78, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -.5583, xmax = .5583, 
ymin = 1.9, ymax = 3.1, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -1.667, xmax = 1.667, 
ymin = .642, ymax = 4.334, alpha = 0, color = "black", linetype = 3, linewidth = .5) +
coord_fixed() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(colour = "black")
) +
theme(axis.line = element_line(linetype = "solid")) + theme(
legend.text = element_text(size = 6.5),
legend.title = element_text(size = 8)
) + theme(plot.title = element_text(size = 11)) +
theme(panel.background = element_rect(fill = "gray"),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank()) + ylab(" ") + xlab(" ") +
theme(panel.background = element_rect(fill = "white")) ; print(t)
## Warning: Removed 648 rows containing missing values (`geom_point()`).

#This is just a thing I wanted to try on all the pitches with no swings, wanted to see if I could make it better

filter_pitch_data <- pitch_data %>%
  filter(is_swing == 0)

filter_split_data <- initial_split(filter_pitch_data, prop = 0.8)  # 80% training, 20% testing

# Create training and testing sets
train_data <- training(filter_split_data)
test_data  <- testing(filter_split_data)

# Verify the split
dim(train_data)  # Check the dimensions of the training set
## [1] 41715    23
dim(test_data) 
## [1] 10429    23
logreg3 <- glmer(
  is_called_strike ~ plate_horz_location + plate_vert_location + release_speed + spin_rate + 
    pitch_type + horz_break + vert_break_induced + release_height + release_side +
    extension   +
    (1 | pitcher_id) + (1 | umpire_id) + (1 | catcher_id),  # Random intercepts for pitchers and umpires
  data = train_data,
  family = binomial(link = "logit"), control = glmerControl(optimizer = "bobyqa")
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 3.8837 (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
##  - Rescale variables?
summary(logreg3)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: is_called_strike ~ plate_horz_location + plate_vert_location +  
##     release_speed + spin_rate + pitch_type + horz_break + vert_break_induced +  
##     release_height + release_side + extension + (1 | pitcher_id) +  
##     (1 | umpire_id) + (1 | catcher_id)
##    Data: train_data
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  50675.7  50857.0 -25316.8  50633.7    41616 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2995 -0.7102 -0.6022  1.2755  2.4327 
## 
## Random effects:
##  Groups     Name        Variance  Std.Dev. 
##  pitcher_id (Intercept) 6.787e-03 0.0823840
##  umpire_id  (Intercept) 4.815e-08 0.0002194
##  catcher_id (Intercept) 1.810e-03 0.0425462
## Number of obs: 41637, groups:  pitcher_id, 654; umpire_id, 88; catcher_id, 77
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.766e+00  3.685e-01   4.793 1.64e-06 ***
## plate_horz_location  2.408e-02  1.112e-02   2.165   0.0304 *  
## plate_vert_location  1.105e-01  1.071e-02  10.320  < 2e-16 ***
## release_speed       -2.777e-02  3.788e-03  -7.332 2.26e-13 ***
## spin_rate            8.397e-05  5.669e-05   1.481   0.1386    
## pitch_type4FB       -4.154e-01  4.025e-02 -10.320  < 2e-16 ***
## pitch_typeCB        -7.675e-01  9.391e-02  -8.173 3.01e-16 ***
## pitch_typeCH        -1.283e+00  5.833e-02 -22.002  < 2e-16 ***
## pitch_typeCT        -5.551e-01  5.220e-02 -10.633  < 2e-16 ***
## pitch_typeKN        -1.341e+00  5.799e-01  -2.313   0.0207 *  
## pitch_typeSF        -1.236e+00  9.188e-02 -13.450  < 2e-16 ***
## pitch_typeSL        -7.280e-01  5.713e-02 -12.742  < 2e-16 ***
## pitch_typeSW        -7.751e-01  7.416e-02 -10.452  < 2e-16 ***
## horz_break           4.436e-04  1.204e-03   0.368   0.7126    
## vert_break_induced   2.777e-03  2.800e-03   0.992   0.3213    
## release_height      -2.714e-02  2.358e-02  -1.151   0.2497    
## release_side         6.392e-03  6.791e-03   0.941   0.3466    
## extension            2.447e-02  2.659e-02   0.920   0.3573    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 18 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
## optimizer (bobyqa) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 3.8837 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
##  - Rescale variables?
logreg4 <- glmer(
  is_called_strike ~ plate_horz_location + plate_vert_location + release_speed + pitch_type +
    (1 | pitcher_id) + (1 | umpire_id) + (1 | catcher_id),  # Random intercepts for pitchers and umpires
  data = train_data,
  family = binomial(link = "logit"), control = glmerControl(optimizer = "bobyqa")
)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0935542 (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
##  - Rescale variables?
summary(logreg4)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: is_called_strike ~ plate_horz_location + plate_vert_location +  
##     release_speed + pitch_type + (1 | pitcher_id) + (1 | umpire_id) +  
##     (1 | catcher_id)
##    Data: train_data
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  50778.8  50908.4 -25374.4  50748.8    41700 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2903 -0.7110 -0.6034  1.2718  2.4192 
## 
## Random effects:
##  Groups     Name        Variance  Std.Dev. 
##  pitcher_id (Intercept) 7.017e-03 0.0837673
##  umpire_id  (Intercept) 3.165e-08 0.0001779
##  catcher_id (Intercept) 2.106e-03 0.0458886
## Number of obs: 41715, groups:  pitcher_id, 654; umpire_id, 88; catcher_id, 77
## 
## Fixed effects:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.724061   0.321272   5.366 8.03e-08 ***
## plate_horz_location  0.023256   0.010951   2.124  0.03370 *  
## plate_vert_location  0.108106   0.010556  10.241  < 2e-16 ***
## release_speed       -0.024980   0.003399  -7.350 1.99e-13 ***
## pitch_type4FB       -0.384135   0.032436 -11.843  < 2e-16 ***
## pitch_typeCB        -0.760673   0.066736 -11.398  < 2e-16 ***
## pitch_typeCH        -1.301028   0.056201 -23.149  < 2e-16 ***
## pitch_typeCT        -0.532083   0.048288 -11.019  < 2e-16 ***
## pitch_typeKN        -1.487646   0.573402  -2.594  0.00948 ** 
## pitch_typeSF        -1.295435   0.082485 -15.705  < 2e-16 ***
## pitch_typeSL        -0.710568   0.047822 -14.859  < 2e-16 ***
## pitch_typeSW        -0.728409   0.060392 -12.061  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) plt_h_ plt_v_ rls_sp pt_4FB ptc_CB ptc_CH ptc_CT ptc_KN
## plt_hrz_lct -0.009                                                        
## plt_vrt_lct -0.134 -0.173                                                 
## release_spd -0.994  0.022  0.058                                          
## ptch_typ4FB  0.013  0.017 -0.191 -0.062                                   
## pitch_typCB -0.776  0.010  0.111  0.746  0.246                            
## pitch_typCH -0.537 -0.012  0.132  0.497  0.313  0.553                     
## pitch_typCT -0.325  0.052 -0.011  0.288  0.399  0.413  0.376              
## pitch_typKN -0.105 -0.002  0.007  0.102  0.028  0.092  0.070  0.051       
## pitch_typSF -0.349 -0.031  0.099  0.320  0.212  0.363  0.306  0.250  0.047
## pitch_typSL -0.613  0.054  0.131  0.567  0.373  0.634  0.536  0.438  0.081
## pitch_typSW -0.698  0.069  0.091  0.665  0.279  0.659  0.526  0.412  0.087
##             ptc_SF ptc_SL
## plt_hrz_lct              
## plt_vrt_lct              
## release_spd              
## ptch_typ4FB              
## pitch_typCB              
## pitch_typCH              
## pitch_typCT              
## pitch_typKN              
## pitch_typSF              
## pitch_typSL  0.354       
## pitch_typSW  0.345  0.607
## optimizer (bobyqa) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.0935542 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
##  - Rescale variables?
logreg_preds1 <- predict(logreg3, test_data, type = "response", re.form = NA) #CalledStrikePerc
logreg_preds0 <- 1 - logreg_preds1 #NotCalledStrikePerc
logreg_df <- data.frame(NotCalledStrike = logreg_preds0,
                        CalledStrikePerc = logreg_preds1,
                        is_called_strike = as.character(test_data$is_called_strike),
                        x = test_data$plate_horz_location,
                        y = test_data$plate_vert_location)

logreg_df$is_called_strike <- as.numeric(logreg_df$is_called_strike)

#Model performance 
print(paste("Max Strike%:", max(logreg_df$CalledStrikePerc)))
## [1] "Max Strike%: NA"
print(paste("Mean Strike%:", mean(logreg_df$CalledStrikePerc, na.rm = TRUE)))
## [1] "Mean Strike%: 0.311786704792465"
print(paste("Sum of Strike%:", sum(logreg_df$CalledStrikePerc)))
## [1] "Sum of Strike%: NA"
print(paste("Sum of IsStrike:", sum(logreg_df$is_called_strike)))
## [1] "Sum of IsStrike: 3251"
print(paste("Difference:", abs(sum(logreg_df$CalledStrikePerc)
                    - sum(logreg_df$is_called_strike))))
## [1] "Difference: NA"
#Accuracy Test
Strike_Perc_Change <- ifelse(logreg_df$CalledStrikePerc > .4, 1, 0) #.4 is an arbitary
cm <- confusionMatrix(factor(logreg_df$is_called_strike), factor(Strike_Perc_Change)); cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6527  642
##          1 2739  505
##                                           
##                Accuracy : 0.6753          
##                  95% CI : (0.6662, 0.6843)
##     No Information Rate : 0.8898          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0803          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7044          
##             Specificity : 0.4403          
##          Pos Pred Value : 0.9104          
##          Neg Pred Value : 0.1557          
##              Prevalence : 0.8898          
##          Detection Rate : 0.6268          
##    Detection Prevalence : 0.6885          
##       Balanced Accuracy : 0.5723          
##                                           
##        'Positive' Class : 0               
## 
accuracy <- cm$overall["Accuracy"]
print(accuracy)
##  Accuracy 
## 0.6753097
logreg_df$bucket <- cut(logreg_df$CalledStrikePerc, breaks = seq(0, .6, by = .05), include.lowest = TRUE, labels = FALSE)
t = ggplot(logreg_df, aes(x = x, y = y, color = as.factor(bucket))) +
geom_point() + xlim(-2,2) + ylim(0,5) +
scale_color_brewer(palette="Spectral", name = "StrikePerc Bucket",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
labels = c("0-0.1", "0.1-0.2", "0.2-0.3", "0.3-0.4", "0.4-0.5", 
"0.5-0.6", "0.6-0.7", "0.7-0.8", "0.8-0.9", "0.9-1.0")) + 
theme_minimal()  + ggtitle('Model') + # lines after this create the plot
geom_rect(xmin = -.833, xmax = .833, ymax = 3.47, 
ymin = 1.6, alpha = 0, color = "black", size = 1) +
geom_rect(xmin = -1.1083, xmax = 1.1083, 
ymin = 1.28, ymax = 3.78, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -.5583, xmax = .5583, 
ymin = 1.9, ymax = 3.1, alpha = 0, color = "black", linetype = 2, linewidth = .5) +
geom_rect(xmin = -1.667, xmax = 1.667, 
ymin = .642, ymax = 4.334, alpha = 0, color = "black", linetype = 3, linewidth = .5) +
coord_fixed() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(colour = "black")
) +
theme(axis.line = element_line(linetype = "solid")) + theme(
legend.text = element_text(size = 6.5),
legend.title = element_text(size = 8)
) + theme(plot.title = element_text(size = 11)) +
theme(panel.background = element_rect(fill = "gray"),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank()) + ylab(" ") + xlab(" ") +
theme(panel.background = element_rect(fill = "white")) ; print(t)
## Warning: Removed 594 rows containing missing values (`geom_point()`).