game <- read.csv("https://raw.githubusercontent.com/HuangPJ2022/Baseball_db_HPJ/main/unstandardized_data.csv")
colSums(is.na(game))
## X result release_speed release_pos_x
## 0 0 0 0
## release_pos_z description stand p_throws
## 0 0 0 0
## same_handed outs_when_up inning inning_topbot
## 0 0 0 0
## effective_speed release_spin_rate release_extension release_pos_y
## 0 9 5 0
## at_bat_number home_score away_score score_differential
## 0 0 0 0
## spin_axis delta_home_win_exp delta_run_exp Name
## 9 0 0 0
## Age Pitches Batted_Balls Barrels
## 0 0 0 0
## Barrel. Barrel.PA Exit_Velocity Max_EV
## 0 0 0 0
## Launch_Angle Sweet_Spot. XBA XSLG
## 0 0 0 0
## WOBA XWOBA XWOBACON HardHit.
## 0 0 0 0
## K. BB. Pitcher p_Age
## 0 431 0 0
## p_Pitches p_Batted_Balls p_Barrels p_Barrel.
## 0 0 0 0
## p_Barrel.PA p_Exit_Velocity p_Max_EV p_Launch_Angle
## 0 0 0 0
## p_Sweet_Spot. p_XBA p_XSLG p_WOBA
## 0 0 0 0
## p_XWOBA p_XWOBACON p_HardHit. p_K.
## 0 0 0 0
## p_BB. ERA x_ERA pitch_type_CH
## 0 0 0 0
## pitch_type_CU pitch_type_EP pitch_type_FA pitch_type_FC
## 0 0 0 0
## pitch_type_FF pitch_type_FS pitch_type_KC pitch_type_SI
## 0 0 0 0
## pitch_type_SL pitch_type_ST pitch_type_SV zone_1.0
## 0 0 0 0
## zone_2.0 zone_3.0 zone_4.0 zone_5.0
## 0 0 0 0
## zone_6.0 zone_7.0 zone_8.0 zone_9.0
## 0 0 0 0
## zone_11.0 zone_12.0 zone_13.0 zone_14.0
## 0 0 0 0
game$release_spin_rate[is.na(game$release_spin_rate)] <- mean(game$release_spin_rate, na.rm = TRUE)
game$release_extension[is.na(game$release_extension)] <- mean(game$release_extension, na.rm = TRUE)
game$spin_axis[is.na(game$spin_axis)] <- mean(game$spin_axis, na.rm = TRUE)
game$BB.[is.na(game$BB.)] <- mean(game$BB., na.rm = TRUE)
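The four columns above are imputed one at a time; a compact alternative (a sketch using dplyr, which this document loads later anyway) mean-imputes every numeric column in one pass:
library(dplyr)
# mean-impute every numeric column that still contains NAs (illustrative sketch)
game <- game %>%
mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE))))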
colSums(is.na(game))
## X result release_speed release_pos_x
## 0 0 0 0
## release_pos_z description stand p_throws
## 0 0 0 0
## same_handed outs_when_up inning inning_topbot
## 0 0 0 0
## effective_speed release_spin_rate release_extension release_pos_y
## 0 0 0 0
## at_bat_number home_score away_score score_differential
## 0 0 0 0
## spin_axis delta_home_win_exp delta_run_exp Name
## 0 0 0 0
## Age Pitches Batted_Balls Barrels
## 0 0 0 0
## Barrel. Barrel.PA Exit_Velocity Max_EV
## 0 0 0 0
## Launch_Angle Sweet_Spot. XBA XSLG
## 0 0 0 0
## WOBA XWOBA XWOBACON HardHit.
## 0 0 0 0
## K. BB. Pitcher p_Age
## 0 0 0 0
## p_Pitches p_Batted_Balls p_Barrels p_Barrel.
## 0 0 0 0
## p_Barrel.PA p_Exit_Velocity p_Max_EV p_Launch_Angle
## 0 0 0 0
## p_Sweet_Spot. p_XBA p_XSLG p_WOBA
## 0 0 0 0
## p_XWOBA p_XWOBACON p_HardHit. p_K.
## 0 0 0 0
## p_BB. ERA x_ERA pitch_type_CH
## 0 0 0 0
## pitch_type_CU pitch_type_EP pitch_type_FA pitch_type_FC
## 0 0 0 0
## pitch_type_FF pitch_type_FS pitch_type_KC pitch_type_SI
## 0 0 0 0
## pitch_type_SL pitch_type_ST pitch_type_SV zone_1.0
## 0 0 0 0
## zone_2.0 zone_3.0 zone_4.0 zone_5.0
## 0 0 0 0
## zone_6.0 zone_7.0 zone_8.0 zone_9.0
## 0 0 0 0
## zone_11.0 zone_12.0 zone_13.0 zone_14.0
## 0 0 0 0
str(game)
## 'data.frame': 3415 obs. of 88 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ result : int 0 0 0 1 0 0 0 0 1 0 ...
## $ release_speed : num 90 95.8 90.9 93.8 92.8 80 91.7 93.7 91.7 94.8 ...
## $ release_pos_x : num 2 -1.35 -1.91 -1.53 -2.63 2.69 -0.68 -1.58 -1.62 -1.03 ...
## $ release_pos_z : num 5.78 6.17 5.6 6.38 5.87 5.9 6.39 5.81 5.4 6.13 ...
## $ description : chr "called_strike" "called_strike" "called_strike" "called_strike" ...
## $ stand : chr "R" "R" "L" "R" ...
## $ p_throws : chr "L" "R" "R" "R" ...
## $ same_handed : logi FALSE TRUE FALSE TRUE TRUE FALSE ...
## $ outs_when_up : int 2 2 0 2 1 2 2 1 0 0 ...
## $ inning : int 6 4 4 4 2 7 2 9 7 2 ...
## $ inning_topbot : chr "Bot" "Top" "Top" "Top" ...
## $ effective_speed : num 90.1 94.9 91.6 94.2 92.8 79.5 90.9 92.9 91.3 93.1 ...
## $ release_spin_rate : num 2221 2583 2143 2210 2069 ...
## $ release_extension : num 6.3 5.7 6.7 6.5 6.4 6.1 5.8 6.1 6.1 5.4 ...
## $ release_pos_y : num 54.2 54.8 53.8 54 54.1 ...
## $ at_bat_number : int 50 32 28 28 17 57 16 73 66 11 ...
## $ home_score : int 4 2 1 0 0 1 4 1 10 0 ...
## $ away_score : int 3 3 3 1 3 6 1 5 8 0 ...
## $ score_differential: int 1 1 2 1 3 5 3 4 2 0 ...
## $ spin_axis : num 147 180 220 199 226 313 195 197 213 216 ...
## $ delta_home_win_exp: num 0 0 0 0 0 0.004 -0.011 0.01 0 0 ...
## $ delta_run_exp : num -0.026 -0.026 -0.076 -0.026 -0.052 0.039 0.039 0.111 -0.076 -0.076 ...
## $ Name : chr "Abraham Toro" "P.J. Higgins" "Ford Proctor" "Otto Lopez" ...
## $ Age : int 25 29 25 23 24 31 25 28 26 24 ...
## $ Pitches : int 1340 916 81 36 679 1160 1074 1761 699 1391 ...
## $ Batted_Balls : int 263 147 17 8 89 178 184 321 103 222 ...
## $ Barrels : int 18 5 1 0 2 13 9 21 9 22 ...
## $ Barrel. : num 6.8 3.4 5.9 0 2.2 7.3 4.9 6.5 8.7 9.9 ...
## $ Barrel.PA : num 5.1 2.2 4.5 0 1.1 4.5 3.2 4.4 5.3 6.4 ...
## $ Exit_Velocity : num 87 86 89.4 85.5 87.5 88.5 89.7 87.8 87.3 89.5 ...
## $ Max_EV : num 108 104 104 103 109 ...
## $ Launch_Angle : num 16.7 8.7 -10.5 4.2 17.7 13 10.6 21.5 16.9 11 ...
## $ Sweet_Spot. : num 31.2 34.7 29.4 37.5 33.7 37.6 30.4 30.5 36.9 25.7 ...
## $ XBA : num 0.226 0.205 0.22 0.283 0.14 0.225 0.22 0.207 0.218 0.222 ...
## $ XSLG : num 0.372 0.287 0.302 0.308 0.211 0.373 0.323 0.339 0.368 0.379 ...
## $ WOBA : num 0.246 0.306 0.197 0.599 0.183 0.311 0.265 0.278 0.318 0.298 ...
## $ XWOBA : num 0.284 0.264 0.296 0.302 0.18 0.288 0.266 0.277 0.285 0.295 ...
## $ XWOBACON : num 0.318 0.3 0.302 0.292 0.284 0.382 0.33 0.318 0.381 0.365 ...
## $ HardHit. : num 29.7 25.5 29.4 37.5 35.2 37.6 44.6 33.3 32 40.5 ...
## $ K. : num 18.5 25.3 13.6 10 43.7 30.1 27 23.1 31.4 26.8 ...
## $ BB. : num 6.3 9.6 9.1 10 5.2 ...
## $ Pitcher : chr "Alexander, Tyler" "Ashcraft, Graham" "Stammen, Craig" "Baumann, Mike" ...
## $ p_Age : int 27 24 38 26 30 28 27 28 29 28 ...
## $ p_Pitches : int 1591 1756 701 580 1694 153 1588 664 65 944 ...
## $ p_Batted_Balls : int 340 355 129 115 326 22 280 134 12 162 ...
## $ p_Barrels : int 38 17 9 8 22 1 21 9 2 8 ...
## $ p_Barrel. : num 11.2 4.8 7 7 6.7 4.5 7.5 6.7 16.7 4.9 ...
## $ p_Barrel.PA : num 8.9 3.7 5.1 5.4 5.1 2.6 5.2 4.8 11.1 3.4 ...
## $ p_Exit_Velocity : num 88.9 86.9 88.7 91.8 87.9 89.4 88.6 90.1 91.2 87 ...
## $ p_Max_EV : num 113 116 111 113 115 ...
## $ p_Launch_Angle : num 16.9 3.8 6.9 9.5 13.6 6.2 13.5 19.2 14.1 8.8 ...
## $ p_Sweet_Spot. : num 37.6 31 27.1 29.6 36.2 45.5 35 37.3 33.3 33.3 ...
## $ p_XBA : num 0.281 0.263 0.258 0.266 0.266 0.225 0.246 0.267 0.258 0.248 ...
## $ p_XSLG : num 0.497 0.376 0.412 0.416 0.422 0.371 0.393 0.46 0.596 0.361 ...
## $ p_WOBA : num 0.337 0.326 0.343 0.353 0.302 0.292 0.313 0.309 0.358 0.286 ...
## $ p_XWOBA : num 0.355 0.311 0.313 0.325 0.324 0.335 0.303 0.348 0.4 0.305 ...
## $ p_XWOBACON : num 0.395 0.333 0.362 0.355 0.365 0.371 0.37 0.398 0.482 0.341 ...
## $ p_HardHit. : num 37.9 36.3 36.4 46.1 35.9 40.9 40.4 44.8 33.3 32.1 ...
## $ p_K. : num 14.3 15.3 19.8 15.4 17.1 25.6 24.1 19.6 22.2 20 ...
## $ p_BB. : num 5.9 6.5 5.6 6 6.3 10.3 5.4 9 5.6 9.8 ...
## $ ERA : num 4.81 4.89 4.43 4.72 3.1 6.23 4.92 4.11 6.23 3.69 ...
## $ x_ERA : num 5.42 4.02 4.08 4.43 4.4 4.74 3.8 5.17 7.22 3.86 ...
## $ pitch_type_CH : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pitch_type_CU : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pitch_type_EP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pitch_type_FA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pitch_type_FC : int 0 1 0 0 0 0 0 0 0 0 ...
## $ pitch_type_FF : int 0 0 0 1 1 0 1 1 1 1 ...
## $ pitch_type_FS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pitch_type_KC : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pitch_type_SI : int 1 0 1 0 0 0 0 0 0 0 ...
## $ pitch_type_SL : int 0 0 0 0 0 1 0 0 0 0 ...
## $ pitch_type_ST : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pitch_type_SV : int 0 0 0 0 0 0 0 0 0 0 ...
## $ zone_1.0 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ zone_2.0 : int 1 0 0 0 0 0 0 0 0 0 ...
## $ zone_3.0 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ zone_4.0 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ zone_5.0 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ zone_6.0 : int 0 0 0 0 1 0 0 0 0 0 ...
## $ zone_7.0 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ zone_8.0 : int 0 0 1 0 0 0 0 0 0 0 ...
## $ zone_9.0 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ zone_11.0 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ zone_12.0 : int 0 0 0 1 0 0 0 1 0 0 ...
## $ zone_13.0 : int 0 0 0 0 0 1 0 0 0 0 ...
## $ zone_14.0 : int 0 0 0 0 0 0 1 0 1 0 ...
library(dplyr)
df = select(game, -c('X', 'description', 'stand', 'p_throws', 'inning_topbot', 'Name', 'Pitcher'))
dim(df)
## [1] 3415 81
library(glmnet)
library(caret)
library(plotmo)
library(Matrix)
library(ggplot2)
library(lattice)
library(Formula)
library(plotrix)
library(TeachingDemos)
control = trainControl(method = "cv", number = 10)
set.seed(123)
LASSO_fit <- train(result ~ .-1, data = df, method = "glmnet",
trControl = control,
tuneGrid = expand.grid(alpha = 1, lambda = 0))
coef(LASSO_fit$finalModel, LASSO_fit$bestTune$lambda)
## 82 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -3.409446e+00
## release_speed 2.563942e-04
## release_pos_x 9.956762e-04
## release_pos_z -2.134578e-03
## same_handedFALSE -1.532209e-02
## same_handedTRUE 1.077933e-12
## outs_when_up -1.521679e-02
## inning 6.482335e-03
## effective_speed 5.273825e-04
## release_spin_rate -2.228266e-06
## release_extension 6.221663e-02
## release_pos_y 5.613523e-02
## at_bat_number -9.573441e-04
## home_score 2.343068e-03
## away_score 2.863108e-04
## score_differential -9.618602e-04
## spin_axis 8.273631e-05
## delta_home_win_exp 1.890464e-02
## delta_run_exp -2.636405e+00
## Age -1.068024e-03
## Pitches -1.431469e-06
## Batted_Balls -3.709130e-05
## Barrels 1.779337e-04
## Barrel. 1.690006e-03
## Barrel.PA -1.088430e-02
## Exit_Velocity -2.478066e-03
## Max_EV 2.627209e-03
## Launch_Angle -8.632783e-04
## Sweet_Spot. 4.854044e-04
## XBA -3.043001e-01
## XSLG 8.525216e-01
## WOBA 2.019821e-01
## XWOBA -8.736729e-01
## XWOBACON -3.635971e-01
## HardHit. 5.156385e-04
## K. 4.897196e-04
## BB. 3.602360e-03
## p_Age -4.084478e-04
## p_Pitches 2.079491e-05
## p_Batted_Balls -7.151384e-05
## p_Barrels -1.445318e-04
## p_Barrel. -3.517417e-03
## p_Barrel.PA 7.972540e-03
## p_Exit_Velocity -3.838632e-04
## p_Max_EV -8.497196e-04
## p_Launch_Angle 3.129951e-04
## p_Sweet_Spot. -1.870361e-03
## p_XBA -1.650011e-01
## p_XSLG -1.223420e-01
## p_WOBA 8.679412e-03
## p_XWOBA 2.284598e-01
## p_XWOBACON 2.397653e-01
## p_HardHit. -9.752821e-04
## p_K. -6.038245e-04
## p_BB. -4.776449e-04
## ERA 1.076397e-03
## x_ERA -9.105396e-05
## pitch_type_CH 1.608773e-02
## pitch_type_CU 3.170010e-01
## pitch_type_EP 1.231133e-01
## pitch_type_FA -1.761872e-02
## pitch_type_FC -6.338956e-03
## pitch_type_FF -2.159525e-03
## pitch_type_FS 1.344429e-01
## pitch_type_KC 7.520017e-02
## pitch_type_SI .
## pitch_type_SL 4.986275e-02
## pitch_type_ST -2.357313e-02
## pitch_type_SV 7.763057e-03
## zone_1.0 -1.301390e-02
## zone_2.0 -2.152428e-02
## zone_3.0 2.033441e-02
## zone_4.0 -3.561772e-02
## zone_5.0 -3.583687e-02
## zone_6.0 -3.522273e-02
## zone_7.0 .
## zone_8.0 -4.499269e-02
## zone_9.0 5.511785e-03
## zone_11.0 4.991723e-01
## zone_12.0 5.247672e-01
## zone_13.0 5.025026e-01
## zone_14.0 4.803533e-01
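Note that tuneGrid above pins lambda = 0, so this caret fit effectively applies no shrinkage; a sketch of letting caret search over a lambda grid instead (the grid values are illustrative assumptions, not from the original analysis):
lambda_grid <- expand.grid(alpha = 1, lambda = 10^seq(-4, -1, length.out = 20))
set.seed(123)
LASSO_cv <- train(result ~ .-1, data = df, method = "glmnet",
trControl = control,
tuneGrid = lambda_grid)
LASSO_cv$bestTune # the cross-validated choice of lambda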
X_lasso = model.matrix(result ~ .-1, data=df)
Y_lasso = df$result
df_LASSO = glmnet(X_lasso, Y_lasso, alpha = 1)
print(df_LASSO)
##
## Call: glmnet(x = X_lasso, y = Y_lasso, alpha = 1)
##
## Df %Dev Lambda
## 1 0 0.00 0.041560
## 2 2 1.07 0.037870
## 3 4 3.12 0.034500
## 4 5 10.99 0.031440
## 5 5 17.74 0.028650
## 6 5 23.34 0.026100
## 7 5 27.99 0.023780
## 8 5 31.85 0.021670
## 9 5 35.06 0.019740
## 10 5 37.72 0.017990
## 11 5 39.93 0.016390
## 12 5 41.77 0.014940
## 13 5 43.29 0.013610
## 14 5 44.55 0.012400
## 15 6 45.63 0.011300
## 16 7 46.53 0.010290
## 17 7 47.28 0.009380
## 18 9 47.94 0.008547
## 19 9 48.51 0.007787
## 20 11 49.00 0.007096
## 21 11 49.41 0.006465
## 22 13 49.77 0.005891
## 23 13 50.07 0.005368
## 24 14 50.32 0.004891
## 25 14 50.53 0.004456
## 26 17 50.72 0.004060
## 27 19 50.88 0.003700
## 28 22 51.03 0.003371
## 29 23 51.16 0.003072
## 30 25 51.28 0.002799
## 31 25 51.38 0.002550
## 32 30 51.46 0.002323
## 33 30 51.54 0.002117
## 34 33 51.61 0.001929
## 35 36 51.67 0.001758
## 36 40 51.72 0.001601
## 37 40 51.77 0.001459
## 38 43 51.82 0.001330
## 39 46 51.86 0.001211
## 40 47 51.89 0.001104
## 41 51 51.93 0.001006
## 42 54 51.96 0.000916
## 43 55 51.98 0.000835
## 44 56 52.00 0.000761
## 45 57 52.02 0.000693
## 46 57 52.03 0.000632
## 47 58 52.05 0.000575
## 48 61 52.06 0.000524
## 49 63 52.08 0.000478
## 50 63 52.10 0.000435
## 51 63 52.12 0.000397
## 52 62 52.14 0.000362
## 53 63 52.15 0.000329
## 54 63 52.17 0.000300
## 55 64 52.18 0.000273
## 56 64 52.19 0.000249
## 57 66 52.19 0.000227
## 58 68 52.20 0.000207
## 59 70 52.21 0.000188
## 60 72 52.22 0.000172
## 61 72 52.22 0.000157
## 62 72 52.23 0.000143
## 63 72 52.23 0.000130
## 64 70 52.24 0.000118
## 65 70 52.24 0.000108
## 66 70 52.24 0.000098
## 67 72 52.25 0.000090
## 68 75 52.25 0.000082
## 69 76 52.25 0.000074
## 70 76 52.25 0.000068
## 71 76 52.26 0.000062
## 72 75 52.26 0.000056
## 73 76 52.26 0.000051
## 74 77 52.27 0.000047
## 75 78 52.27 0.000043
## 76 78 52.27 0.000039
## 77 78 52.28 0.000035
## 78 78 52.28 0.000032
## 79 78 52.28 0.000029
## 80 78 52.28 0.000027
## 81 78 52.28 0.000024
## 82 79 52.28 0.000022
## 83 79 52.28 0.000020
## 84 79 52.29 0.000018
## 85 79 52.29 0.000017
## 86 79 52.29 0.000015
## 87 79 52.29 0.000014
## 88 79 52.29 0.000013
## 89 79 52.29 0.000012
## 90 79 52.29 0.000011
## 91 79 52.29 0.000010
coef(df_LASSO, s = cv.glmnet(X_lasso, Y_lasso)$lambda.1se)
## 82 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -0.085208680
## release_speed .
## release_pos_x .
## release_pos_z .
## same_handedFALSE .
## same_handedTRUE .
## outs_when_up -0.004144210
## inning .
## effective_speed .
## release_spin_rate .
## release_extension .
## release_pos_y .
## at_bat_number .
## home_score .
## away_score .
## score_differential .
## spin_axis .
## delta_home_win_exp .
## delta_run_exp -2.142785769
## Age .
## Pitches .
## Batted_Balls .
## Barrels .
## Barrel. .
## Barrel.PA .
## Exit_Velocity .
## Max_EV .
## Launch_Angle .
## Sweet_Spot. .
## XBA .
## XSLG .
## WOBA .
## XWOBA .
## XWOBACON .
## HardHit. .
## K. .
## BB. .
## p_Age .
## p_Pitches .
## p_Batted_Balls .
## p_Barrels .
## p_Barrel. .
## p_Barrel.PA .
## p_Exit_Velocity .
## p_Max_EV .
## p_Launch_Angle .
## p_Sweet_Spot. .
## p_XBA .
## p_XSLG .
## p_WOBA .
## p_XWOBA .
## p_XWOBACON .
## p_HardHit. .
## p_K. .
## p_BB. .
## ERA .
## x_ERA .
## pitch_type_CH .
## pitch_type_CU 0.138911130
## pitch_type_EP .
## pitch_type_FA .
## pitch_type_FC .
## pitch_type_FF .
## pitch_type_FS .
## pitch_type_KC .
## pitch_type_SI .
## pitch_type_SL .
## pitch_type_ST .
## pitch_type_SV .
## zone_1.0 .
## zone_2.0 .
## zone_3.0 .
## zone_4.0 -0.003590574
## zone_5.0 -0.009671238
## zone_6.0 -0.004523019
## zone_7.0 .
## zone_8.0 -0.009330144
## zone_9.0 .
## zone_11.0 0.418600911
## zone_12.0 0.441588841
## zone_13.0 0.419897914
## zone_14.0 0.403122251
cv.glmnet(X_lasso, Y_lasso)$lambda.1se
## [1] 0.005367583
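cv.glmnet() re-randomizes its folds on every call, so lambda.1se can drift between the coef(), print, and plot steps; a sketch that fits the cross-validation once and reuses it:
set.seed(123)
cv_fit <- cv.glmnet(X_lasso, Y_lasso)
coef(df_LASSO, s = cv_fit$lambda.1se) # reuse the same lambda everywhere below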
plot(df_LASSO, label = TRUE)
As lambda increases, most of the coefficients are shrunk toward 0.
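To see the shrinkage numerically, a single coefficient can be tracked across a few lambda values (the values below are arbitrary illustrative picks):
coef(df_LASSO, s = c(0.03, 0.005, 0.0005))["delta_run_exp", ]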
par(mfrow=c(1,2))
plot_glmnet(df_LASSO, xvar = "lambda",
label = 5, # label the five variables with the largest coefficients
xlab = expression(paste("log(", lambda, ")")), # expression() enables Greek-letter typesetting
ylab = expression(beta))
abline(v = log(cv.glmnet(X_lasso, Y_lasso)$lambda.1se), lty = "dashed")
plot_glmnet(df_LASSO, label = 5, xlab = expression(paste("log(", lambda, ")")), ylab = expression(beta))
abline(v = log(cv.glmnet(X_lasso, Y_lasso)$lambda.1se), lty = "dashed")
Variables selected at lambda.1se, with their coefficients:
outs_when_up   -0.004144210
delta_run_exp  -2.142785769
pitch_type_CU   0.138911130
zone_4.0       -0.003590574
zone_5.0       -0.009671238
zone_6.0       -0.004523019
zone_8.0       -0.009330144
zone_11.0       0.418600911
zone_12.0       0.441588841
zone_13.0       0.419897914
zone_14.0       0.403122251
lasso_selected_df = select(game, c("result", "outs_when_up", "delta_run_exp", "pitch_type_CU", "zone_4.0", "zone_5.0", "zone_6.0", "zone_8.0", "zone_11.0", "zone_12.0", "zone_13.0", "zone_14.0"))
library(e1071)
library(caret)
library(gbm)
library(pROC)
library(plotROC)
library(ISLR)
set.seed(123)
s_train_row_number <- createDataPartition(lasso_selected_df$result, p=0.75, list=FALSE)
s_train <- lasso_selected_df[s_train_row_number, ]
s_test <- lasso_selected_df[-s_train_row_number, ]
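createDataPartition() stratifies on result, so the positive rate should be similar in both splits; a quick illustrative check:
c(train = mean(s_train$result), test = mean(s_test$result))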
Logit
s_logit_model <- glm(result ~ ., family = binomial, data = s_train)
Probit
s_probit_model <- glm(result ~ ., family = binomial(link = "probit"), data = s_train)
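Logit and probit differ only in the link function, so their fitted probabilities should be nearly identical here; a quick illustrative comparison:
head(cbind(logit = fitted(s_logit_model), probit = fitted(s_probit_model)))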
GBM
library(rsample)
library(gbm)
library(xgboost)
library(pdp)
library(ggplot2)
library(purrr)
hyper_grid <- expand.grid(
shrinkage = c(.01, .1, .2),
interaction.depth = c(1, 3, 5),
n.minobsinnode = c(5, 10, 15),
bag.fraction = c(.5, .75, 1),
optimal_trees = 0, # placeholder, filled in by the loop below
min_RMSE = 0) # placeholder, filled in by the loop below
for(i in 1:nrow(hyper_grid)) {
# reproducibility
set.seed(123)
# train model
gbm.tune <- gbm(
result ~ .,
data = s_train,
distribution = "bernoulli",
cv.folds = 10,
n.trees = 500,
interaction.depth = hyper_grid$interaction.depth[i],
shrinkage = hyper_grid$shrinkage[i],
n.minobsinnode = hyper_grid$n.minobsinnode[i],
bag.fraction = hyper_grid$bag.fraction[i],
train.fraction = .75,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
# record the iteration with the lowest validation error and that error's square root
# (for distribution = "bernoulli", valid.error is a deviance, so "min_RMSE" is
# sqrt(deviance) rather than a literal RMSE)
hyper_grid$optimal_trees[i] <- which.min(gbm.tune$valid.error)
hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}
optimal_para <- hyper_grid %>% dplyr::arrange(min_RMSE) %>% head(1)
# define a list of optimal hyperparameters
shrink <- optimal_para$shrinkage
ntrees <- optimal_para$optimal_trees
minobs <- optimal_para$n.minobsinnode
bagfrac <- optimal_para$bag.fraction
depth <- optimal_para$interaction.depth
# train final GBM model
s_gbm_model <- gbm(
result ~ .,
data = s_train,
distribution = "bernoulli",
cv.folds = 10,
n.trees = ntrees,
interaction.depth = depth,
shrinkage = shrink,
n.minobsinnode = minobs,
bag.fraction = bagfrac,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
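As a cross-check on the tuned tree count, gbm.perf() picks the iteration from the cross-validation folds directly (an illustrative alternative to reading which.min(valid.error) off the validation split):
best_iter <- gbm.perf(s_gbm_model, method = "cv", plot.it = FALSE)
best_iter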
print(shrink)
## [1] 0.1
print(ntrees)
## [1] 373
print(minobs)
## [1] 5
print(bagfrac)
## [1] 0.5
print(depth)
## [1] 5
SVM
lasso_selected_df_for_svm = lasso_selected_df
svm_train <- lasso_selected_df_for_svm[s_train_row_number, ]
svm_test <- lasso_selected_df_for_svm[-s_train_row_number, ]
svm_train$result <- as.factor(svm_train$result)
svm_test$result <- as.factor(svm_test$result)
s_svm_model <- svm(result ~ .,
scale = TRUE, # the default, x and y are scaled to zero mean and unit variance
kernel = "radial", # other options are available
degree = 3, # only used when kernel = "polynomial"
# gamma = if (is.vector(x)) 1 else 1 / ncol(x), # the default; can be overridden
cost = 1, # the cost parameter C
probability = FALSE, # whether to output probability predictions
na.action = na.omit,
data = svm_train)
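The SVM above uses the default cost and gamma; e1071 also ships tune.svm(), which grid-searches them by cross-validation (the grid below is an illustrative assumption):
set.seed(123)
svm_tuned <- tune.svm(result ~ ., data = svm_train,
gamma = 10^(-2:1), cost = 10^(-1:2))
svm_tuned$best.parameters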
Tree
library(rsample)
library(dplyr) # data wrangling
library(rpart) # performing regression trees
library(rpart.plot) # plotting regression trees
library(ipred) # bagging
library(caret)
lasso_selected_df_for_tree = lasso_selected_df
lasso_selected_df_for_tree$result <- as.factor(lasso_selected_df_for_tree$result)
s_train_tree <- lasso_selected_df_for_tree[s_train_row_number, ]
s_test_tree <- lasso_selected_df_for_tree[-s_train_row_number, ]
hyper_grid <- expand.grid(
minsplit = seq(5, 20, 1), # from 5 to 20, with an increment of 1
maxdepth = seq(8, 15, 1)
)
trees <- list()
for (i in 1:nrow(hyper_grid)) {
# get minsplit, maxdepth values at row i
minsplit <- hyper_grid$minsplit[i]
maxdepth <- hyper_grid$maxdepth[i]
# train a model and store in the list
# R will feed i-th minsplit and maxdepth values from hyper_grid into rpart()'s control function as hyperparameters
trees[[i]] <- rpart(
result ~ .,
data = s_train_tree,
method = "anova",
control = list(minsplit = minsplit, maxdepth = maxdepth)
)
}
get_cp <- function(x) {
min <- which.min(x$cptable[, "CP"]) # to select from the column named "CP" and return a row index
cp <- x$cptable[min, "CP"] # using the index to subset the desired value
}
get_min_error <- function(x) {
min <- which.min(x$cptable[, "xerror"]) # to select from the column named "xerror" and return an index
xerror <- x$cptable[min, "xerror"] # using the index to subset the desired value
}
hyper_grid <- hyper_grid %>%
mutate(
cp = purrr::map_dbl(trees, get_cp), # map_dbl() returns one numeric value per list element; mutate() adds the new columns while keeping the existing ones
error = purrr::map_dbl(trees, get_min_error)
) %>%
arrange(error) %>%
top_n(-5, wt = error)
min_split <- hyper_grid[1, 1]
max_depth <- hyper_grid[1, 2]
complexity_para <- hyper_grid[1, 3]
optimal_tree <- rpart(
result ~ .,
data = s_train_tree,
# method = "anova", in order to get predicted class, we need to suppress the "anove" method
control = list(minsplit = min_split, maxdepth = max_depth, cp = complexity_para)
)
print(min_split)
## [1] 5
print(max_depth)
## [1] 8
print(complexity_para)
## [1] 0.01
head(trees, 1)
## [[1]]
## n= 2562
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 2562 149.13230 1.062061
## 2) delta_run_exp>=0.0065 883 0.00000 1.000000 *
## 3) delta_run_exp< 0.0065 1679 143.94280 1.094699
## 6) zone_11.0< 0.5 1634 106.04650 1.069767
## 12) zone_12.0< 0.5 1589 66.00378 1.043424
## 24) zone_14.0< 0.5 1550 29.41935 1.019355
## 48) zone_13.0< 0.5 1520 0.00000 1.000000 *
## 49) zone_13.0>=0.5 30 0.00000 2.000000 *
## 25) zone_14.0>=0.5 39 0.00000 2.000000 *
## 13) zone_12.0>=0.5 45 0.00000 2.000000 *
## 7) zone_11.0>=0.5 45 0.00000 2.000000 *
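The get_cp() and get_min_error() helpers above read from rpart's complexity table, which can also be inspected directly:
printcp(optimal_tree) # the full CP table of the chosen tree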
RF
library(randomForest)
library(rsample) # data splitting
library(dplyr) # data wrangling
library(rpart) # build tree models
library(rpart.plot) # plotting regression trees
library(ipred) # bagging
library(caret)
library(ranger)
library(ggplot2)
set.seed(123)
hyper_grid_2 <- expand.grid(
mtry = seq(1, 4, by = 1),
node_size = seq(3, 9, by = 2),
sample_size = c(.5, .6, .75, .80),
OOB_RMSE = 0
)
for(i in 1:nrow(hyper_grid_2)) {
# train model
model <- ranger(
result ~ .,
data = s_train_tree,
num.trees = 100,
mtry = hyper_grid_2$mtry[i],
min.node.size = hyper_grid_2$node_size[i],
sample.fraction = hyper_grid_2$sample_size[i],
seed = 123
)
# record the model's OOB error, replacing the 0 placeholder in the OOB_RMSE column
hyper_grid_2$OOB_RMSE[i] <- sqrt(model$prediction.error)
}
best_para <- hyper_grid_2 %>%
dplyr::arrange(OOB_RMSE) %>%
head(1)
m <- best_para$mtry
node_size <- best_para$node_size
p <- best_para$sample_size
optimal_ranger <- ranger(
result ~ .,
data = s_train_tree,
num.trees = 100,
mtry = m, # number of features
min.node.size = node_size, # node size
sample.fraction = p, # fraction of sample used in each tree
importance = 'impurity'
)
print(m)
## [1] 4
print(node_size)
## [1] 3
print(p)
## [1] 0.5
print(best_para)
## mtry node_size sample_size OOB_RMSE
## 1 4 3 0.5 0
Logit model
pred_logit <- predict(s_logit_model, s_test, type = "response")
# classify outcomes by threshold = 0.5
convert_logit <- as.numeric(pred_logit > 0.5)
# generate the confusion matrix and compute accuracy
confmat_logit <- table(Predicted = convert_logit, Actual = s_test$result)
confmat_logit
## Actual
## Predicted 0 1
## 0 799 0
## 1 0 54
s_logit_model$coefficients
## (Intercept) outs_when_up delta_run_exp pitch_type_CU zone_4.0
## -78.6176091 15.1152013 -699.2750438 4.9158852 -0.1700082
## zone_5.0 zone_6.0 zone_8.0 zone_11.0 zone_12.0
## -0.1973520 -0.1785726 -0.3838084 52.1836586 52.3811138
## zone_13.0 zone_14.0
## 52.2513377 52.2682433
accuracy_logit <- (confmat_logit[1, 1] + confmat_logit[2, 2]) / sum(confmat_logit)
print(paste0("Accuracy of Logit Model: ",accuracy_logit))
## [1] "Accuracy of Logit Model: 1"
Probit model
pred_probit <- predict(s_probit_model, s_test, type = "response")
convert_probit <- as.numeric(pred_probit > 0.5)
confmat_probit <- table(Predicted = convert_probit, Actual = s_test$result)
confmat_probit
## Actual
## Predicted 0 1
## 0 799 0
## 1 0 54
s_probit_model$coefficients
## (Intercept) outs_when_up delta_run_exp pitch_type_CU zone_4.0
## -21.69832394 4.52154487 -195.80117917 0.82798299 -0.02635914
## zone_5.0 zone_6.0 zone_8.0 zone_11.0 zone_12.0
## -0.02928888 -0.02632436 -0.05967798 13.80566474 13.83842804
## zone_13.0 zone_14.0
## 13.81631418 13.81970489
accuracy_probit <- (confmat_probit[1, 1] + confmat_probit[2, 2]) / sum(confmat_probit)
print(paste0("Accuracy of Probit Model: ", accuracy_probit))
## [1] "Accuracy of Probit Model: 1"
SVM
pred_svm <- predict(s_svm_model, s_test)
confmat_svm <- table(Predicted = pred_svm, Actual = s_test$result)
confmat_svm
## Actual
## Predicted 0 1
## 0 799 0
## 1 0 54
accuracy_svm <- (confmat_svm[1, 1] + confmat_svm[2, 2]) / sum(confmat_svm)
print(paste0("Accuracy of SVM Model: ",accuracy_svm))
## [1] "Accuracy of SVM Model: 1"
GBM
pred_gbm <- predict(s_gbm_model, s_test, n.trees = ntrees, type = "response")
convert_gbm <- as.numeric(pred_gbm > 0.5)
confmat_gbm <- table(Predicted = convert_gbm, Actual = s_test$result)
confmat_gbm
## Actual
## Predicted 0 1
## 0 799 0
## 1 0 54
accuracy_gbm <- (confmat_gbm[1, 1] + confmat_gbm[2, 2]) / sum(confmat_gbm)
print(paste0("Accuracy of GBM Model: ",accuracy_gbm))
## [1] "Accuracy of GBM Model: 1"
par(mar = c(5, 8, 1, 1))
summary(
s_gbm_model,
cBars = 5, # show only the five most influential variables
method = relative.influence, # you can also use permutation.test.gbm
las = 2
)
## var rel.inf
## delta_run_exp delta_run_exp 24.32038
## zone_11.0 zone_11.0 23.46796
## zone_14.0 zone_14.0 18.07495
## zone_12.0 zone_12.0 17.36242
## zone_13.0 zone_13.0 16.77429
## outs_when_up outs_when_up 0.00000
## pitch_type_CU pitch_type_CU 0.00000
## zone_4.0 zone_4.0 0.00000
## zone_5.0 zone_5.0 0.00000
## zone_6.0 zone_6.0 0.00000
## zone_8.0 zone_8.0 0.00000
Decision tree
# predict on testing data
pred_tree <- predict(optimal_tree, newdata = s_test_tree, type = "class") # set type to "class"
# create confusion matrix and generate accuracy score
confmat_tree <- table(Predicted = pred_tree, Actual = s_test_tree$result)
# view the confusion table (no misclassifications)
confmat_tree
## Actual
## Predicted 0 1
## 0 799 0
## 1 0 54
accuracy_tree <- (confmat_tree[1, 1] + confmat_tree[2, 2]) / sum(confmat_tree)
print(paste0("Accuracy of Tree Model: ",accuracy_tree))
## [1] "Accuracy of Tree Model: 1"
rpart.plot(optimal_tree)
RF
# predict on testing data
pred_rf <- predict(optimal_ranger, s_test_tree, type = "response") # ranger stores the predicted classes in pred_rf$predictions
# create confusion matrix and generate accuracy score
confmat_rf <- table(Predicted = pred_rf$predictions, Actual = s_test_tree$result)
# view the confusion table (no misclassifications)
confmat_rf
## Actual
## Predicted 0 1
## 0 799 0
## 1 0 54
accuracy_rf <- (confmat_rf[1, 1] + confmat_rf[2, 2]) / sum(confmat_rf)
print(paste0("Accuracy of Random Forest Model: ",accuracy_rf))
## [1] "Accuracy of Random Forest Model: 1"
rf_feature_importance <- data.frame(vname = names(lasso_selected_df[-1]), importance = optimal_ranger$variable.importance)
ggplot(rf_feature_importance, aes(x = reorder(vname, -importance), y = importance, fill = vname, label = round(importance, digits = 2))) + # fill colors the bars by variable name
geom_bar(stat = "identity") +
ggtitle("Random forest variable importance") +
xlab("Variable") +
ylab("Importance") +
geom_text(size = 3, position = position_stack(vjust = 0.5), col = "white")
compare_df_for_svm = df
compare_svm_train <- compare_df_for_svm[s_train_row_number, ]
compare_svm_test <- compare_df_for_svm[-s_train_row_number, ]
compare_svm_train$result <- as.factor(compare_svm_train$result)
compare_svm_test$result <- as.factor(compare_svm_test$result)
compare_svm_model <- svm(result ~ .,
scale = TRUE, # the default, x and y are scaled to zero mean and unit variance
kernel = "radial", # other options are available
degree = 3, # only used when kernel = "polynomial"
# gamma = if (is.vector(x)) 1 else 1 / ncol(x), # the default; can be overridden
cost = 1, # the cost parameter C
probability = FALSE, # whether to output probability predictions
na.action = na.omit,
data = compare_svm_train)
pred_svm_compare <- predict(compare_svm_model, compare_svm_test)
confmat_svm_compare <- table(Predicted = pred_svm_compare, Actual = compare_svm_test$result)
confmat_svm_compare
## Actual
## Predicted 0 1
## 0 799 54
## 1 0 0
accuracy_compare_svm <- (confmat_svm_compare[1,1] + confmat_svm_compare[2,2])/sum(confmat_svm_compare)
print(paste0("Accuracy of Comparison SVM Model: ", accuracy_compare_svm))
## [1] "Accuracy of Comparison SVM Model: 0.936694021101993"
svm.pred2 <- as.numeric(pred_svm_compare)
ROC_svm_compare <- roc(compare_svm_test$result, svm.pred2, percent = TRUE)
auc(ROC_svm_compare)
## Area under the curve: 50%
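The comparison SVM labels every test case as class 0 (see its confusion matrix above), so the hard predictions carry no ranking information and the AUC collapses to 50%. A sketch of computing a more informative ROC from class probabilities instead (refitting with probability = TRUE is an assumption, not part of the original analysis):
compare_svm_prob <- svm(result ~ ., data = compare_svm_train,
kernel = "radial", cost = 1, probability = TRUE)
prob_svm <- attr(predict(compare_svm_prob, compare_svm_test, probability = TRUE),
"probabilities")[, "1"]
auc(roc(compare_svm_test$result, prob_svm, percent = TRUE))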
ROC_logit <- roc(s_test$result, pred_logit, percent = TRUE)
ROC_probit <- roc(s_test$result, pred_probit, percent = TRUE)
svm.pred1 <- as.numeric(pred_svm)
ROC_svm <- roc(s_test$result, svm.pred1, percent = TRUE)
ROC_gbm <- roc(s_test$result, pred_gbm, percent = TRUE)
tree.pred1 <- as.numeric(pred_tree)
ROC_tree <- roc(s_test$result, tree.pred1, percent = TRUE)
rf.pred1 <- as.numeric(pred_rf$predictions)
ROC_rf <- roc(s_test$result, rf.pred1, percent = TRUE)
Logit AUC
auc(ROC_logit)
## Area under the curve: 100%
Probit AUC
auc(ROC_probit)
## Area under the curve: 100%
SVM AUC
auc(ROC_svm)
## Area under the curve: 100%
GBM AUC
auc(ROC_gbm)
## Area under the curve: 100%
Tree AUC
auc(ROC_tree)
## Area under the curve: 100%
Random Forest AUC
auc(ROC_rf)
## Area under the curve: 100%
par(mar = c(1, 1, 1, 1)) # shrink margins for the combined ROC plot
plot.roc(ROC_logit, percent = TRUE, main = "ROC curves", add = FALSE, asp = NA)
lines(ROC_probit, col = "blue")
lines(ROC_svm, col = "red")
lines(ROC_gbm, col = "green")
lines(ROC_tree, col = "cyan")
lines(ROC_rf, col = "orange")
lines(ROC_svm_compare, col = "yellow")
axis(1, at = seq(0, 1, by=0.2), labels = paste(100*seq(1,0, by=-.2)), tick = TRUE)
legend(40, 20,
legend = c("model AUC", "logit: 100", "probit: 100", "SVM: 100", "GBM: 100", "Tree: 100", "RF: 100", "Compare_SVM: 50"),
col = c("white", "black", "blue", "red", "green", "cyan", "orange", "yellow"),
lty = c(NA, rep(1, 7)), # all curves are drawn as solid lines
cex = 0.7)