load game data

game<- read.csv("https://raw.githubusercontent.com/HuangPJ2022/Baseball_db_HPJ/main/unstandardized_data.csv")

data pre-process

check na

colSums(is.na(game))
##                  X             result      release_speed      release_pos_x 
##                  0                  0                  0                  0 
##      release_pos_z        description              stand           p_throws 
##                  0                  0                  0                  0 
##        same_handed       outs_when_up             inning      inning_topbot 
##                  0                  0                  0                  0 
##    effective_speed  release_spin_rate  release_extension      release_pos_y 
##                  0                  9                  5                  0 
##      at_bat_number         home_score         away_score score_differential 
##                  0                  0                  0                  0 
##          spin_axis delta_home_win_exp      delta_run_exp               Name 
##                  9                  0                  0                  0 
##                Age            Pitches       Batted_Balls            Barrels 
##                  0                  0                  0                  0 
##            Barrel.          Barrel.PA      Exit_Velocity             Max_EV 
##                  0                  0                  0                  0 
##       Launch_Angle        Sweet_Spot.                XBA               XSLG 
##                  0                  0                  0                  0 
##               WOBA              XWOBA           XWOBACON           HardHit. 
##                  0                  0                  0                  0 
##                 K.                BB.            Pitcher              p_Age 
##                  0                431                  0                  0 
##          p_Pitches     p_Batted_Balls          p_Barrels          p_Barrel. 
##                  0                  0                  0                  0 
##        p_Barrel.PA    p_Exit_Velocity           p_Max_EV     p_Launch_Angle 
##                  0                  0                  0                  0 
##      p_Sweet_Spot.              p_XBA             p_XSLG             p_WOBA 
##                  0                  0                  0                  0 
##            p_XWOBA         p_XWOBACON         p_HardHit.               p_K. 
##                  0                  0                  0                  0 
##              p_BB.                ERA              x_ERA      pitch_type_CH 
##                  0                  0                  0                  0 
##      pitch_type_CU      pitch_type_EP      pitch_type_FA      pitch_type_FC 
##                  0                  0                  0                  0 
##      pitch_type_FF      pitch_type_FS      pitch_type_KC      pitch_type_SI 
##                  0                  0                  0                  0 
##      pitch_type_SL      pitch_type_ST      pitch_type_SV           zone_1.0 
##                  0                  0                  0                  0 
##           zone_2.0           zone_3.0           zone_4.0           zone_5.0 
##                  0                  0                  0                  0 
##           zone_6.0           zone_7.0           zone_8.0           zone_9.0 
##                  0                  0                  0                  0 
##          zone_11.0          zone_12.0          zone_13.0          zone_14.0 
##                  0                  0                  0                  0

replace na with mean value

game$release_spin_rate[is.na(game$release_spin_rate)] <- mean(game$release_spin_rate, na.rm = TRUE)
game$release_extension[is.na(game$release_extension)] <- mean(game$release_extension, na.rm = TRUE)
game$spin_axis[is.na(game$spin_axis)] <- mean(game$spin_axis, na.rm = TRUE)
game$BB.[is.na(game$BB.)] <- mean(game$BB., na.rm = TRUE)

check na again

colSums(is.na(game))
##                  X             result      release_speed      release_pos_x 
##                  0                  0                  0                  0 
##      release_pos_z        description              stand           p_throws 
##                  0                  0                  0                  0 
##        same_handed       outs_when_up             inning      inning_topbot 
##                  0                  0                  0                  0 
##    effective_speed  release_spin_rate  release_extension      release_pos_y 
##                  0                  0                  0                  0 
##      at_bat_number         home_score         away_score score_differential 
##                  0                  0                  0                  0 
##          spin_axis delta_home_win_exp      delta_run_exp               Name 
##                  0                  0                  0                  0 
##                Age            Pitches       Batted_Balls            Barrels 
##                  0                  0                  0                  0 
##            Barrel.          Barrel.PA      Exit_Velocity             Max_EV 
##                  0                  0                  0                  0 
##       Launch_Angle        Sweet_Spot.                XBA               XSLG 
##                  0                  0                  0                  0 
##               WOBA              XWOBA           XWOBACON           HardHit. 
##                  0                  0                  0                  0 
##                 K.                BB.            Pitcher              p_Age 
##                  0                  0                  0                  0 
##          p_Pitches     p_Batted_Balls          p_Barrels          p_Barrel. 
##                  0                  0                  0                  0 
##        p_Barrel.PA    p_Exit_Velocity           p_Max_EV     p_Launch_Angle 
##                  0                  0                  0                  0 
##      p_Sweet_Spot.              p_XBA             p_XSLG             p_WOBA 
##                  0                  0                  0                  0 
##            p_XWOBA         p_XWOBACON         p_HardHit.               p_K. 
##                  0                  0                  0                  0 
##              p_BB.                ERA              x_ERA      pitch_type_CH 
##                  0                  0                  0                  0 
##      pitch_type_CU      pitch_type_EP      pitch_type_FA      pitch_type_FC 
##                  0                  0                  0                  0 
##      pitch_type_FF      pitch_type_FS      pitch_type_KC      pitch_type_SI 
##                  0                  0                  0                  0 
##      pitch_type_SL      pitch_type_ST      pitch_type_SV           zone_1.0 
##                  0                  0                  0                  0 
##           zone_2.0           zone_3.0           zone_4.0           zone_5.0 
##                  0                  0                  0                  0 
##           zone_6.0           zone_7.0           zone_8.0           zone_9.0 
##                  0                  0                  0                  0 
##          zone_11.0          zone_12.0          zone_13.0          zone_14.0 
##                  0                  0                  0                  0

check column type

str(game)
## 'data.frame':    3415 obs. of  88 variables:
##  $ X                 : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ result            : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ release_speed     : num  90 95.8 90.9 93.8 92.8 80 91.7 93.7 91.7 94.8 ...
##  $ release_pos_x     : num  2 -1.35 -1.91 -1.53 -2.63 2.69 -0.68 -1.58 -1.62 -1.03 ...
##  $ release_pos_z     : num  5.78 6.17 5.6 6.38 5.87 5.9 6.39 5.81 5.4 6.13 ...
##  $ description       : chr  "called_strike" "called_strike" "called_strike" "called_strike" ...
##  $ stand             : chr  "R" "R" "L" "R" ...
##  $ p_throws          : chr  "L" "R" "R" "R" ...
##  $ same_handed       : logi  FALSE TRUE FALSE TRUE TRUE FALSE ...
##  $ outs_when_up      : int  2 2 0 2 1 2 2 1 0 0 ...
##  $ inning            : int  6 4 4 4 2 7 2 9 7 2 ...
##  $ inning_topbot     : chr  "Bot" "Top" "Top" "Top" ...
##  $ effective_speed   : num  90.1 94.9 91.6 94.2 92.8 79.5 90.9 92.9 91.3 93.1 ...
##  $ release_spin_rate : num  2221 2583 2143 2210 2069 ...
##  $ release_extension : num  6.3 5.7 6.7 6.5 6.4 6.1 5.8 6.1 6.1 5.4 ...
##  $ release_pos_y     : num  54.2 54.8 53.8 54 54.1 ...
##  $ at_bat_number     : int  50 32 28 28 17 57 16 73 66 11 ...
##  $ home_score        : int  4 2 1 0 0 1 4 1 10 0 ...
##  $ away_score        : int  3 3 3 1 3 6 1 5 8 0 ...
##  $ score_differential: int  1 1 2 1 3 5 3 4 2 0 ...
##  $ spin_axis         : num  147 180 220 199 226 313 195 197 213 216 ...
##  $ delta_home_win_exp: num  0 0 0 0 0 0.004 -0.011 0.01 0 0 ...
##  $ delta_run_exp     : num  -0.026 -0.026 -0.076 -0.026 -0.052 0.039 0.039 0.111 -0.076 -0.076 ...
##  $ Name              : chr  "Abraham Toro" "P.J. Higgins" "Ford Proctor" "Otto Lopez" ...
##  $ Age               : int  25 29 25 23 24 31 25 28 26 24 ...
##  $ Pitches           : int  1340 916 81 36 679 1160 1074 1761 699 1391 ...
##  $ Batted_Balls      : int  263 147 17 8 89 178 184 321 103 222 ...
##  $ Barrels           : int  18 5 1 0 2 13 9 21 9 22 ...
##  $ Barrel.           : num  6.8 3.4 5.9 0 2.2 7.3 4.9 6.5 8.7 9.9 ...
##  $ Barrel.PA         : num  5.1 2.2 4.5 0 1.1 4.5 3.2 4.4 5.3 6.4 ...
##  $ Exit_Velocity     : num  87 86 89.4 85.5 87.5 88.5 89.7 87.8 87.3 89.5 ...
##  $ Max_EV            : num  108 104 104 103 109 ...
##  $ Launch_Angle      : num  16.7 8.7 -10.5 4.2 17.7 13 10.6 21.5 16.9 11 ...
##  $ Sweet_Spot.       : num  31.2 34.7 29.4 37.5 33.7 37.6 30.4 30.5 36.9 25.7 ...
##  $ XBA               : num  0.226 0.205 0.22 0.283 0.14 0.225 0.22 0.207 0.218 0.222 ...
##  $ XSLG              : num  0.372 0.287 0.302 0.308 0.211 0.373 0.323 0.339 0.368 0.379 ...
##  $ WOBA              : num  0.246 0.306 0.197 0.599 0.183 0.311 0.265 0.278 0.318 0.298 ...
##  $ XWOBA             : num  0.284 0.264 0.296 0.302 0.18 0.288 0.266 0.277 0.285 0.295 ...
##  $ XWOBACON          : num  0.318 0.3 0.302 0.292 0.284 0.382 0.33 0.318 0.381 0.365 ...
##  $ HardHit.          : num  29.7 25.5 29.4 37.5 35.2 37.6 44.6 33.3 32 40.5 ...
##  $ K.                : num  18.5 25.3 13.6 10 43.7 30.1 27 23.1 31.4 26.8 ...
##  $ BB.               : num  6.3 9.6 9.1 10 5.2 ...
##  $ Pitcher           : chr  "Alexander, Tyler" "Ashcraft, Graham" "Stammen, Craig" "Baumann, Mike" ...
##  $ p_Age             : int  27 24 38 26 30 28 27 28 29 28 ...
##  $ p_Pitches         : int  1591 1756 701 580 1694 153 1588 664 65 944 ...
##  $ p_Batted_Balls    : int  340 355 129 115 326 22 280 134 12 162 ...
##  $ p_Barrels         : int  38 17 9 8 22 1 21 9 2 8 ...
##  $ p_Barrel.         : num  11.2 4.8 7 7 6.7 4.5 7.5 6.7 16.7 4.9 ...
##  $ p_Barrel.PA       : num  8.9 3.7 5.1 5.4 5.1 2.6 5.2 4.8 11.1 3.4 ...
##  $ p_Exit_Velocity   : num  88.9 86.9 88.7 91.8 87.9 89.4 88.6 90.1 91.2 87 ...
##  $ p_Max_EV          : num  113 116 111 113 115 ...
##  $ p_Launch_Angle    : num  16.9 3.8 6.9 9.5 13.6 6.2 13.5 19.2 14.1 8.8 ...
##  $ p_Sweet_Spot.     : num  37.6 31 27.1 29.6 36.2 45.5 35 37.3 33.3 33.3 ...
##  $ p_XBA             : num  0.281 0.263 0.258 0.266 0.266 0.225 0.246 0.267 0.258 0.248 ...
##  $ p_XSLG            : num  0.497 0.376 0.412 0.416 0.422 0.371 0.393 0.46 0.596 0.361 ...
##  $ p_WOBA            : num  0.337 0.326 0.343 0.353 0.302 0.292 0.313 0.309 0.358 0.286 ...
##  $ p_XWOBA           : num  0.355 0.311 0.313 0.325 0.324 0.335 0.303 0.348 0.4 0.305 ...
##  $ p_XWOBACON        : num  0.395 0.333 0.362 0.355 0.365 0.371 0.37 0.398 0.482 0.341 ...
##  $ p_HardHit.        : num  37.9 36.3 36.4 46.1 35.9 40.9 40.4 44.8 33.3 32.1 ...
##  $ p_K.              : num  14.3 15.3 19.8 15.4 17.1 25.6 24.1 19.6 22.2 20 ...
##  $ p_BB.             : num  5.9 6.5 5.6 6 6.3 10.3 5.4 9 5.6 9.8 ...
##  $ ERA               : num  4.81 4.89 4.43 4.72 3.1 6.23 4.92 4.11 6.23 3.69 ...
##  $ x_ERA             : num  5.42 4.02 4.08 4.43 4.4 4.74 3.8 5.17 7.22 3.86 ...
##  $ pitch_type_CH     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pitch_type_CU     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pitch_type_EP     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pitch_type_FA     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pitch_type_FC     : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ pitch_type_FF     : int  0 0 0 1 1 0 1 1 1 1 ...
##  $ pitch_type_FS     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pitch_type_KC     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pitch_type_SI     : int  1 0 1 0 0 0 0 0 0 0 ...
##  $ pitch_type_SL     : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ pitch_type_ST     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pitch_type_SV     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ zone_1.0          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ zone_2.0          : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ zone_3.0          : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ zone_4.0          : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ zone_5.0          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ zone_6.0          : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ zone_7.0          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ zone_8.0          : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ zone_9.0          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ zone_11.0         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ zone_12.0         : int  0 0 0 1 0 0 0 1 0 0 ...
##  $ zone_13.0         : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ zone_14.0         : int  0 0 0 0 0 0 1 0 1 0 ...

Feature Selection

select X and Y

library(dplyr)

df = select(game, -c('X', 'description', 'stand', 'p_throws', 'inning_topbot', 'Name', 'Pitcher'))

check dimension

dim(df)
## [1] 3415   81

LASSO feature selection

library(glmnet)
library(caret)
library(plotmo)

library(Matrix)
library(ggplot2)
library(lattice)
library(Formula)
library(plotrix)
library(TeachingDemos)

control = trainControl(method = "cv", number = 10) 

set.seed(123)

LASSO_fit <- train(result ~ .-1, data = df, method = "glmnet",    
                trControl = control,
                tuneGrid = expand.grid(alpha = 1, lambda = 0))
coef(LASSO_fit$finalModel, LASSO_fit$bestTune$lambda)
## 82 x 1 sparse Matrix of class "dgCMatrix"
##                               s1
## (Intercept)        -3.409446e+00
## release_speed       2.563942e-04
## release_pos_x       9.956762e-04
## release_pos_z      -2.134578e-03
## same_handedFALSE   -1.532209e-02
## same_handedTRUE     1.077933e-12
## outs_when_up       -1.521679e-02
## inning              6.482335e-03
## effective_speed     5.273825e-04
## release_spin_rate  -2.228266e-06
## release_extension   6.221663e-02
## release_pos_y       5.613523e-02
## at_bat_number      -9.573441e-04
## home_score          2.343068e-03
## away_score          2.863108e-04
## score_differential -9.618602e-04
## spin_axis           8.273631e-05
## delta_home_win_exp  1.890464e-02
## delta_run_exp      -2.636405e+00
## Age                -1.068024e-03
## Pitches            -1.431469e-06
## Batted_Balls       -3.709130e-05
## Barrels             1.779337e-04
## Barrel.             1.690006e-03
## Barrel.PA          -1.088430e-02
## Exit_Velocity      -2.478066e-03
## Max_EV              2.627209e-03
## Launch_Angle       -8.632783e-04
## Sweet_Spot.         4.854044e-04
## XBA                -3.043001e-01
## XSLG                8.525216e-01
## WOBA                2.019821e-01
## XWOBA              -8.736729e-01
## XWOBACON           -3.635971e-01
## HardHit.            5.156385e-04
## K.                  4.897196e-04
## BB.                 3.602360e-03
## p_Age              -4.084478e-04
## p_Pitches           2.079491e-05
## p_Batted_Balls     -7.151384e-05
## p_Barrels          -1.445318e-04
## p_Barrel.          -3.517417e-03
## p_Barrel.PA         7.972540e-03
## p_Exit_Velocity    -3.838632e-04
## p_Max_EV           -8.497196e-04
## p_Launch_Angle      3.129951e-04
## p_Sweet_Spot.      -1.870361e-03
## p_XBA              -1.650011e-01
## p_XSLG             -1.223420e-01
## p_WOBA              8.679412e-03
## p_XWOBA             2.284598e-01
## p_XWOBACON          2.397653e-01
## p_HardHit.         -9.752821e-04
## p_K.               -6.038245e-04
## p_BB.              -4.776449e-04
## ERA                 1.076397e-03
## x_ERA              -9.105396e-05
## pitch_type_CH       1.608773e-02
## pitch_type_CU       3.170010e-01
## pitch_type_EP       1.231133e-01
## pitch_type_FA      -1.761872e-02
## pitch_type_FC      -6.338956e-03
## pitch_type_FF      -2.159525e-03
## pitch_type_FS       1.344429e-01
## pitch_type_KC       7.520017e-02
## pitch_type_SI       .           
## pitch_type_SL       4.986275e-02
## pitch_type_ST      -2.357313e-02
## pitch_type_SV       7.763057e-03
## zone_1.0           -1.301390e-02
## zone_2.0           -2.152428e-02
## zone_3.0            2.033441e-02
## zone_4.0           -3.561772e-02
## zone_5.0           -3.583687e-02
## zone_6.0           -3.522273e-02
## zone_7.0            .           
## zone_8.0           -4.499269e-02
## zone_9.0            5.511785e-03
## zone_11.0           4.991723e-01
## zone_12.0           5.247672e-01
## zone_13.0           5.025026e-01
## zone_14.0           4.803533e-01
X_lasso = model.matrix(result ~ .-1, data=df)
Y_lasso = df$result

df_LASSO = glmnet(X_lasso, Y_lasso, alpha = 1)

print(df_LASSO)
## 
## Call:  glmnet(x = X_lasso, y = Y_lasso, alpha = 1) 
## 
##    Df  %Dev   Lambda
## 1   0  0.00 0.041560
## 2   2  1.07 0.037870
## 3   4  3.12 0.034500
## 4   5 10.99 0.031440
## 5   5 17.74 0.028650
## 6   5 23.34 0.026100
## 7   5 27.99 0.023780
## 8   5 31.85 0.021670
## 9   5 35.06 0.019740
## 10  5 37.72 0.017990
## 11  5 39.93 0.016390
## 12  5 41.77 0.014940
## 13  5 43.29 0.013610
## 14  5 44.55 0.012400
## 15  6 45.63 0.011300
## 16  7 46.53 0.010290
## 17  7 47.28 0.009380
## 18  9 47.94 0.008547
## 19  9 48.51 0.007787
## 20 11 49.00 0.007096
## 21 11 49.41 0.006465
## 22 13 49.77 0.005891
## 23 13 50.07 0.005368
## 24 14 50.32 0.004891
## 25 14 50.53 0.004456
## 26 17 50.72 0.004060
## 27 19 50.88 0.003700
## 28 22 51.03 0.003371
## 29 23 51.16 0.003072
## 30 25 51.28 0.002799
## 31 25 51.38 0.002550
## 32 30 51.46 0.002323
## 33 30 51.54 0.002117
## 34 33 51.61 0.001929
## 35 36 51.67 0.001758
## 36 40 51.72 0.001601
## 37 40 51.77 0.001459
## 38 43 51.82 0.001330
## 39 46 51.86 0.001211
## 40 47 51.89 0.001104
## 41 51 51.93 0.001006
## 42 54 51.96 0.000916
## 43 55 51.98 0.000835
## 44 56 52.00 0.000761
## 45 57 52.02 0.000693
## 46 57 52.03 0.000632
## 47 58 52.05 0.000575
## 48 61 52.06 0.000524
## 49 63 52.08 0.000478
## 50 63 52.10 0.000435
## 51 63 52.12 0.000397
## 52 62 52.14 0.000362
## 53 63 52.15 0.000329
## 54 63 52.17 0.000300
## 55 64 52.18 0.000273
## 56 64 52.19 0.000249
## 57 66 52.19 0.000227
## 58 68 52.20 0.000207
## 59 70 52.21 0.000188
## 60 72 52.22 0.000172
## 61 72 52.22 0.000157
## 62 72 52.23 0.000143
## 63 72 52.23 0.000130
## 64 70 52.24 0.000118
## 65 70 52.24 0.000108
## 66 70 52.24 0.000098
## 67 72 52.25 0.000090
## 68 75 52.25 0.000082
## 69 76 52.25 0.000074
## 70 76 52.25 0.000068
## 71 76 52.26 0.000062
## 72 75 52.26 0.000056
## 73 76 52.26 0.000051
## 74 77 52.27 0.000047
## 75 78 52.27 0.000043
## 76 78 52.27 0.000039
## 77 78 52.28 0.000035
## 78 78 52.28 0.000032
## 79 78 52.28 0.000029
## 80 78 52.28 0.000027
## 81 78 52.28 0.000024
## 82 79 52.28 0.000022
## 83 79 52.28 0.000020
## 84 79 52.29 0.000018
## 85 79 52.29 0.000017
## 86 79 52.29 0.000015
## 87 79 52.29 0.000014
## 88 79 52.29 0.000013
## 89 79 52.29 0.000012
## 90 79 52.29 0.000011
## 91 79 52.29 0.000010
coef(df_LASSO, s = cv.glmnet(X_lasso, Y_lasso)$lambda.1se)
## 82 x 1 sparse Matrix of class "dgCMatrix"
##                              s1
## (Intercept)        -0.085208680
## release_speed       .          
## release_pos_x       .          
## release_pos_z       .          
## same_handedFALSE    .          
## same_handedTRUE     .          
## outs_when_up       -0.004144210
## inning              .          
## effective_speed     .          
## release_spin_rate   .          
## release_extension   .          
## release_pos_y       .          
## at_bat_number       .          
## home_score          .          
## away_score          .          
## score_differential  .          
## spin_axis           .          
## delta_home_win_exp  .          
## delta_run_exp      -2.142785769
## Age                 .          
## Pitches             .          
## Batted_Balls        .          
## Barrels             .          
## Barrel.             .          
## Barrel.PA           .          
## Exit_Velocity       .          
## Max_EV              .          
## Launch_Angle        .          
## Sweet_Spot.         .          
## XBA                 .          
## XSLG                .          
## WOBA                .          
## XWOBA               .          
## XWOBACON            .          
## HardHit.            .          
## K.                  .          
## BB.                 .          
## p_Age               .          
## p_Pitches           .          
## p_Batted_Balls      .          
## p_Barrels           .          
## p_Barrel.           .          
## p_Barrel.PA         .          
## p_Exit_Velocity     .          
## p_Max_EV            .          
## p_Launch_Angle      .          
## p_Sweet_Spot.       .          
## p_XBA               .          
## p_XSLG              .          
## p_WOBA              .          
## p_XWOBA             .          
## p_XWOBACON          .          
## p_HardHit.          .          
## p_K.                .          
## p_BB.               .          
## ERA                 .          
## x_ERA               .          
## pitch_type_CH       .          
## pitch_type_CU       0.138911130
## pitch_type_EP       .          
## pitch_type_FA       .          
## pitch_type_FC       .          
## pitch_type_FF       .          
## pitch_type_FS       .          
## pitch_type_KC       .          
## pitch_type_SI       .          
## pitch_type_SL       .          
## pitch_type_ST       .          
## pitch_type_SV       .          
## zone_1.0            .          
## zone_2.0            .          
## zone_3.0            .          
## zone_4.0           -0.003590574
## zone_5.0           -0.009671238
## zone_6.0           -0.004523019
## zone_7.0            .          
## zone_8.0           -0.009330144
## zone_9.0            .          
## zone_11.0           0.418600911
## zone_12.0           0.441588841
## zone_13.0           0.419897914
## zone_14.0           0.403122251

finding best lambda

cv.glmnet(X_lasso, Y_lasso)$lambda.1se
## [1] 0.005367583
plot(df_LASSO, label = TRUE)

as lambda increasing, the coefficient of feature mostly goes to 0

par(mfrow=c(1,2))

plot_glmnet(df_LASSO, xvar = "lambda", 
            label = 5,   # the "label" means the top-n variable you want the graph to show (we want it to display the top-5 variables)
             xlab = expression(paste("log(", lambda, ")")),   # use expression() to ask R to enable Greek letters typesetting
             ylab = expression(beta))  
abline(v = log(cv.glmnet(X_lasso, Y_lasso)$lambda.1se), lty = "dashed") 

plot_glmnet(df_LASSO, label = 5, xlab = expression(paste("log(", lambda, ")")), ylab = expression(beta))
abline(v = log(cv.glmnet(X_lasso, Y_lasso)$lambda.1se), lty = "dashed")

## as lambda increasing, the coefficient of feature mostly goes to 0

important features of LASSO:

outs_when_up -0.004144210
delta_run_exp -2.142785769
pitch_type_CU 0.138911130
zone_4.0 -0.003590574
zone_5.0 -0.009671238
zone_6.0 -0.004523019
zone_8.0 -0.009330144
zone_11.0 0.418600911
zone_12.0 0.441588841
zone_13.0 0.419897914
zone_14.0 0.403122251

using feature seleted df to fit model

create feature selected df

lasso_selected_df = select(game, c("result", "outs_when_up", "delta_run_exp", "pitch_type_CU", "zone_4.0", "zone_5.0", "zone_6.0", "zone_8.0", "zone_11.0", "zone_12.0", "zone_13.0", "zone_14.0"))
library(e1071)
library(caret)
library(gbm)
library(pROC)
library(plotROC)
library(ISLR)

train test split

set.seed(123)

s_train_row_number <- createDataPartition(lasso_selected_df$result, p=0.75, list=FALSE)

s_train <- lasso_selected_df[s_train_row_number, ]
s_test <- lasso_selected_df[-s_train_row_number, ]

logit

s_logit_model <- glm(result ~ ., family = binomial, data = s_train)

probit

s_probit_model <- glm(result ~ ., family = binomial(link = "probit"), data = s_train)

GBM

library(rsample) 
library(gbm)
library(xgboost)
library(pdp)
library(ggplot2)
library(purrr)
hyper_grid <- expand.grid(
 shrinkage = c(.01, .1, .2),
 interaction.depth = c(1, 3, 5),
 n.minobsinnode = c(5, 10, 15),
 bag.fraction = c(.5, .75, 1),
 optimal_trees = 0, # you will fill in values from loop
 min_RMSE = 0) 
for(i in 1:nrow(hyper_grid)) {
  
  # reproducibility
  set.seed(123)
  
  # train model
  gbm.tune <- gbm(
    result ~ .,
    data = s_train,
    distribution = "bernoulli",
    cv.folds = 10,
    n.trees = 500,
    interaction.depth = hyper_grid$interaction.depth[i],
    shrinkage = hyper_grid$shrinkage[i],
    n.minobsinnode = hyper_grid$n.minobsinnode[i],
    bag.fraction = hyper_grid$bag.fraction[i],
    train.fraction = .75,
    n.cores = NULL, # will use all cores by default
    verbose = FALSE
  )
  
  # locate minimum training error from the n-th tree, add it to grid
  hyper_grid$optimal_trees[i] <- which.min(gbm.tune$valid.error)
  hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}
optimal_para <- hyper_grid %>% dplyr::arrange(min_RMSE) %>% head(1)

# define a list of optimal hyperparameters
shrink <- optimal_para$shrinkage
ntrees <- optimal_para$optimal_trees
minobs <- optimal_para$n.minobsinnode
bagfrac <- optimal_para$bag.fraction
depth <- optimal_para$interaction.depth

# train final GBM model
s_gbm_model <- gbm(
  result ~ .,
  data = s_train,
  distribution = "bernoulli",
  cv.folds = 10,
  n.trees = ntrees,
  interaction.depth = depth,
  shrinkage = shrink,
  n.minobsinnode = minobs,
  bag.fraction = bagfrac, 
  n.cores = NULL, # will use all cores by default
  verbose = FALSE
  )  
print(shrink) 
## [1] 0.1
print(ntrees) 
## [1] 373
print(minobs) 
## [1] 5
print(bagfrac) 
## [1] 0.5
print(depth)
## [1] 5

svm

lasso_selected_df_for_svm = lasso_selected_df

svm_train <- lasso_selected_df_for_svm[s_train_row_number, ]
svm_test <- lasso_selected_df_for_svm[-s_train_row_number, ]

svm_train$result <- as.factor(svm_train$result)
svm_test$result <- as.factor(svm_test$result)

s_svm_model <- svm(result ~ ., 
                 scale = TRUE, # the default, x and y are scaled to zero mean and unit variance
               kernel = "radial", # other options are available
               degree = 3, # if kernel is of type = "polynomial"
                # gamma = if (is.vector(x)) 1 else 1 / ncol(x), # you can provide one for variable x
               cost = 1, # the cost function (C)
               probability = FALSE, # whether to output probability predictions
               na.action = na.omit,
                 data = svm_train)

tree

library(rsample)
library(dplyr)       # data wrangling
library(rpart)       # performing regression trees
library(rpart.plot)  # plotting regression trees
library(ipred)       # bagging
library(caret)
lasso_selected_df_for_tree = lasso_selected_df

lasso_selected_df_for_tree$result <- as.factor(lasso_selected_df_for_tree$result)

s_train_tree <- lasso_selected_df_for_tree[s_train_row_number, ]
s_test_tree <- lasso_selected_df_for_tree[-s_train_row_number, ]



hyper_grid <- expand.grid(
  minsplit = seq(5, 20, 1),  # from 5 to 20, with an increment of 1
  maxdepth = seq(8, 15, 1)
)

trees <- list()

for (i in 1:nrow(hyper_grid)) {
  
  # get minsplit, maxdepth values at row i
  minsplit <- hyper_grid$minsplit[i]
  maxdepth <- hyper_grid$maxdepth[i]
  # train a model and store in the list
  # R will feed i-th minsplit and maxdepth values from hyper_grid into rpart()'s control function as hyperparameters
  trees[[i]] <- rpart(
    result ~ .,
    data = s_train_tree,
    method  = "anova",
    control = list(minsplit = minsplit, maxdepth = maxdepth)
    )
}

get_cp <- function(x) {
  min <- which.min(x$cptable[, "CP"]) # to select from the column named "CP" and return a row index
  cp <- x$cptable[min, "CP"]  # using the index to subset the desired value
}

get_min_error <- function(x) {
  min    <- which.min(x$cptable[, "xerror"]) # to select from the column named "xerror" and return an index
  xerror <- x$cptable[min, "xerror"]  # using the index to subset the desired value
}

hyper_grid <- hyper_grid %>%
  mutate(
    cp    = purrr::map_dbl(trees, get_cp),  # purrr is a useful functional programming command from the tidyverse package, map_dbl() outputs double vectors (i.e, numeric values that have decimals) from an element of an object; mutate() adds new variables while preserving existing ones (from a data.frame or data.table object)
    error = purrr::map_dbl(trees, get_min_error)
    ) %>%
  arrange(error) %>%
  top_n(-5, wt = error)

min_split <- hyper_grid[1, 1]
max_depth <- hyper_grid[1, 2]
complexity_para <- hyper_grid[1, 3]

optimal_tree <- rpart(
    result ~ .,
    data = s_train_tree,
    # method  = "anova", in order to get predicted class, we need to suppress the "anove" method
    control = list(minsplit = min_split, maxdepth = max_depth, cp = complexity_para)
    )
print(min_split)
## [1] 5
print(max_depth)
## [1] 8
print(complexity_para)
## [1] 0.01
head(trees, 1)
## [[1]]
## n= 2562 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 2562 149.13230 1.062061  
##    2) delta_run_exp>=0.0065 883   0.00000 1.000000 *
##    3) delta_run_exp< 0.0065 1679 143.94280 1.094699  
##      6) zone_11.0< 0.5 1634 106.04650 1.069767  
##       12) zone_12.0< 0.5 1589  66.00378 1.043424  
##         24) zone_14.0< 0.5 1550  29.41935 1.019355  
##           48) zone_13.0< 0.5 1520   0.00000 1.000000 *
##           49) zone_13.0>=0.5 30   0.00000 2.000000 *
##         25) zone_14.0>=0.5 39   0.00000 2.000000 *
##       13) zone_12.0>=0.5 45   0.00000 2.000000 *
##      7) zone_11.0>=0.5 45   0.00000 2.000000 *

RF

library(randomForest)
library(rsample)     # data splitting 
library(dplyr)       # data wrangling
library(rpart)       # build tree models
library(rpart.plot)  # plotting regression trees
library(ipred)       # bagging
library(caret)  
library(ranger)
library(ggplot2)
set.seed(123)

hyper_grid_2 <- expand.grid(
  mtry = seq(1, 4, by = 1),
  node_size = seq(3, 9, by = 2),
  sampe_size = c(.5, .6, .75, .80),
  OOB_RMSE = 0
)

for(i in 1:nrow(hyper_grid_2)) {
  
  # train model
  model <- ranger(
    result ~ ., 
    data = s_train_tree, 
    num.trees = 100,
    mtry = hyper_grid_2$mtry[i],
    min.node.size = hyper_grid_2$node_size[i],
    sample.fraction = hyper_grid_2$sampe_size[i],
    seed = 123
  )
  
  # add OOB error to grid. note how we append the list of values to the vector and index them (fill in and replace the 0 inside the OOB_RMSE column)
  hyper_grid_2$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

best_para <- hyper_grid_2 %>% 
  dplyr::arrange(OOB_RMSE) %>%
  head(1)

m <- best_para$mtry
node_size <- best_para$node_size
p <- best_para$sampe_size


optimal_ranger <- ranger(
    result ~ ., 
    data = s_train_tree, 
    num.trees = 100,
    mtry = m, # number of features
    min.node.size = node_size, # node size
    sample.fraction = p, # fraction of sample used in each tree
    importance = 'impurity'
  )
print(m)
## [1] 4
print(node_size)
## [1] 3
print(p)
## [1] 0.5
print(best_para)
##   mtry node_size sampe_size OOB_RMSE
## 1    4         3        0.5        0

prediction

logit model

pred_logit <- predict(s_logit_model, s_test, type = "response")

# classify outcomes by threshold = 0.5
convert_logit <- as.numeric(pred_logit > 0.5)

# generate confusion matrix and calculate precision score
confmat_logit <- table(Predicted = convert_logit, Actual = s_test$result)

confmat_logit
##          Actual
## Predicted   0   1
##         0 799   0
##         1   0  54
s_logit_model$coefficients
##   (Intercept)  outs_when_up delta_run_exp pitch_type_CU      zone_4.0 
##   -78.6176091    15.1152013  -699.2750438     4.9158852    -0.1700082 
##      zone_5.0      zone_6.0      zone_8.0     zone_11.0     zone_12.0 
##    -0.1973520    -0.1785726    -0.3838084    52.1836586    52.3811138 
##     zone_13.0     zone_14.0 
##    52.2513377    52.2682433
accuracy_logit <- (confmat_logit[1, 1] + confmat_logit[2, 2]) / sum(confmat_logit)
print(paste0("Accuracy of Logit Model: ",accuracy_logit))
## [1] "Accuracy of Logit Model: 1"

probit model

pred_probit <- predict(s_probit_model, s_test, type = "response")

convert_probit <- as.numeric(pred_probit > 0.5)

confmat_probit <- table(Predicted = convert_probit, Actual = s_test$result)

confmat_probit
##          Actual
## Predicted   0   1
##         0 799   0
##         1   0  54
s_probit_model$coefficients
##   (Intercept)  outs_when_up delta_run_exp pitch_type_CU      zone_4.0 
##  -21.69832394    4.52154487 -195.80117917    0.82798299   -0.02635914 
##      zone_5.0      zone_6.0      zone_8.0     zone_11.0     zone_12.0 
##   -0.02928888   -0.02632436   -0.05967798   13.80566474   13.83842804 
##     zone_13.0     zone_14.0 
##   13.81631418   13.81970489
accuracy_probit <- (confmat_probit[1, 1] + confmat_probit[2, 2]) / sum(confmat_probit)
print(paste0("Accuracy of Probit Model: ", accuracy_probit))
## [1] "Accuracy of Probit Model: 1"

svm

pred_svm <- predict(s_svm_model, s_test)

confmat_svm <- table(Predicted = pred_svm, Actual = s_test$result)

confmat_svm
##          Actual
## Predicted   0   1
##         0 799   0
##         1   0  54
accuracy_svm <- (confmat_svm[1, 1] + confmat_svm[2, 2]) / sum(confmat_svm)
print(paste0("Accuracy of SVM Model: ",accuracy_svm))
## [1] "Accuracy of SVM Model: 1"

GBM

pred_gbm <- predict(s_gbm_model, s_test, type = "response")

convert_gbm <- as.numeric(pred_gbm > 0.5)

confmat_gbm <- table(Predicted = convert_gbm, Actual = s_test$result)

confmat_gbm
##          Actual
## Predicted   0   1
##         0 799   0
##         1   0  54
accuracy_gbm <- (confmat_gbm[1, 1] + confmat_gbm[2, 2]) / sum(confmat_gbm)
print(paste0("Accuracy of GBM Model: ",accuracy_gbm))
## [1] "Accuracy of GBM Model: 1"
relative influence of GBM model
par(mar = c(5, 8, 1, 1)) 

summary(
  s_gbm_model, 
  cBars = 5,  # since we have 5 variables to display
  method = relative.influence, # you can also use permutation.test.gbm
  las = 2
  )

##                         var  rel.inf
## delta_run_exp delta_run_exp 24.32038
## zone_11.0         zone_11.0 23.46796
## zone_14.0         zone_14.0 18.07495
## zone_12.0         zone_12.0 17.36242
## zone_13.0         zone_13.0 16.77429
## outs_when_up   outs_when_up  0.00000
## pitch_type_CU pitch_type_CU  0.00000
## zone_4.0           zone_4.0  0.00000
## zone_5.0           zone_5.0  0.00000
## zone_6.0           zone_6.0  0.00000
## zone_8.0           zone_8.0  0.00000

decision tree

# predict on testing data
pred_tree <- predict(optimal_tree, newdata = s_test_tree, type = "class") # set type to "class"

# create confusion matrix and generate accuracy score
confmat_tree <- table(Predicted = pred_tree, Actual = s_test_tree$result)

# view the confusion table (lots of Type II error)
confmat_tree
##          Actual
## Predicted   0   1
##         0 799   0
##         1   0  54
accuracy_tree <- (confmat_tree[1, 1] + confmat_tree[2, 2]) / sum(confmat_tree)
print(paste0("Accuracy of Tree Model: ",accuracy_tree))
## [1] "Accuracy of Tree Model: 1"
plot tree model
rpart.plot(optimal_tree)

RF

# predict on testing data
pred_rf <- predict(optimal_ranger, s_test_tree, type = "response") # set type to "class"

# create confusion matrix and generate accuracy score
confmat_rf <- table(Predicted = pred_rf$predictions, Actual = s_test$result)

# view the confusion table (lots of Type II error)
confmat_rf
##          Actual
## Predicted   0   1
##         0 799   0
##         1   0  54
accuracy_rf <- (confmat_rf[1, 1] + confmat_rf[2, 2]) / sum(confmat_rf)
print(paste0("Accuracy of Random Forest Model: ",accuracy_rf))
## [1] "Accuracy of Random Forest Model: 1"
feature importanct of random forest
rf_feature_importance <- data.frame(vname = names(lasso_selected_df[-1]), importance = optimal_ranger$variable.importance)  

ggplot(rf_feature_importance, aes(x = reorder(vname, -importance), y = importance, fill = vname, label = round(importance, digits = 2))) +  # fill means label with variable names
  geom_bar(stat = "Identity") +
  ggtitle("Most important variables (in the order of 1 to 4)") +
  xlab("Variable") + 
  ylab("Importance") +
  geom_text(size = 3, position = position_stack(vjust = 0.5), col = "white")

comparison: svm model without lasso feature selection

compare_df_for_svm = df

compare_svm_train <- compare_df_for_svm[s_train_row_number, ]
compare_svm_test <- compare_df_for_svm[-s_train_row_number, ]

compare_svm_train$result <- as.factor(compare_svm_train$result)
compare_svm_test$result <- as.factor(compare_svm_test$result)

compare_svm_model <- svm(result ~ ., 
                 scale = TRUE, # the default, x and y are scaled to zero mean and unit variance
               kernel = "radial", # other options are available
               degree = 3, # if kernel is of type = "polynomial"
                # gamma = if (is.vector(x)) 1 else 1 / ncol(x), # you can provide one for variable x
               cost = 1, # the cost function (C)
               probability = FALSE, # whether to output probability predictions
               na.action = na.omit,
                 data = compare_svm_train)

pred_svm_compare <- predict(compare_svm_model, compare_svm_test)

confmat_svm_compare <- table(Predicted = pred_svm_compare, Actual = compare_svm_test$result)

confmat_svm_compare
##          Actual
## Predicted   0   1
##         0 799  54
##         1   0   0
accuracy_compare_svm <- (confmat_svm_compare[1,1] + confmat_svm_compare[2,2])/sum(confmat_svm_compare)
print(paste0("Accuracy of Comparision SVM Model:", accuracy_compare_svm))
## [1] "Accuracy of Comparision SVM Model:0.936694021101993"
svm.pred2 <- as.numeric(pred_svm_compare)
ROC_svm_compare <- roc(s_test$result, svm.pred2, percent = TRUE, main = "Smoothing")
auc(ROC_svm_compare)
## Area under the curve: 50%

ROC & AOU

ROC_logit <- roc(s_test$result, pred_logit, percent = TRUE, main = "Smoothing")
ROC_probit <- roc(s_test$result, pred_probit, percent = TRUE, main = "Smoothing")

svm.pred1 <- as.numeric(pred_svm)
ROC_svm <- roc(s_test$result, svm.pred1, percent = TRUE, main = "Smoothing")
ROC_gbm <- roc(s_test$result, pred_gbm, percent = TRUE, main = "Smoothing")

tree.pred1 <- as.numeric(pred_tree)
ROC_tree <- roc(s_test$result, tree.pred1, percent = TRUE, main = "Smoothing")

rf.pred1 <- as.numeric(pred_rf$predictions)
ROC_rf <- roc(s_test$result, rf.pred1, percent = TRUE, main = "Smoothing")

Logit AUC

auc(ROC_logit)
## Area under the curve: 100%

Probit AUC

auc(ROC_probit)
## Area under the curve: 100%

SVM AUC

auc(ROC_svm)
## Area under the curve: 100%

GBM AUC

auc(ROC_gbm)
## Area under the curve: 100%

Tree AUC

auc(ROC_tree)
## Area under the curve: 100%

Random Forest AUC

auc(ROC_rf)
## Area under the curve: 100%

plot ROC & AUC

par(mar=c(1,1,1,1))  # create plotting environment
plot.roc(ROC_logit, s_test$result, percent = TRUE, main = "ROC curves", add =  FALSE, asp = NA)
lines(ROC_probit, col = "blue")
lines(ROC_svm, col = "red")
lines(ROC_gbm, col = "green")
lines(ROC_tree, col = "cyan")
lines(ROC_rf, col = "orange")
lines(ROC_svm_compare, col = "yellow")
axis(1, at = seq(0, 1, by=0.2), labels = paste(100*seq(1,0, by=-.2)), tick = TRUE)
legend(40, 20, 
       legend = c("model   AUC", "logit: 100", "probit: 100", "SVM: 100", "GBM: 100", "Tree: 100", "RF: 100", "Compare_SVM: 50"),
       col = c("white", "black", "blue", "red", "green", "cyan", "orange", "yellow"),
       lty = c(1, 1, 2, 3, 4, 5, 6, 6),
       pch = c(NA, NA, NA, NA, NA, NA, NA, NA),
       cex = 0.7)