Extreme Gradient Boost

EIX

library("EIX")
library(iBreakDown)
library(data.table)
setwd("C:/Users/subas/Syncplicity/MyProjects_IMP/MY_Papers_V2/TRB 2021/EScotter_BayesianRule/")
it01 <- fread("EScotter_Fin3.csv")
names(it01)
##  [1] "CRASH_NUM1"     "LIGHTING_CD"    "LOC_TYPE_CD"    "MAN_COLL_CD"   
##  [5] "WEATHER_CD"     "DAY_OF_WK"      "INTERSECTION"   "NUM_VEH"       
##  [9] "DR_COND_CD"     "DR_DISTRACT_CD" "VIOLATIONS_CD"  "POSTED_SPEED"  
## [13] "SEVERITY_CD1"   "SEVERITY_CD"
table(it01$SEVERITY_CD1)
## 
##   0   1 
## 195 379
mn01 <- it01[, c("SEVERITY_CD1", "DAY_OF_WK", "LIGHTING_CD", "VIOLATIONS_CD", "WEATHER_CD", "DR_COND_CD", 
                 "NUM_VEH")]

mn02= na.omit(mn01)


library("Matrix")
sparse_matrix <- sparse.model.matrix(SEVERITY_CD1 ~ . ,  data = mn02)
head(sparse_matrix)
## 6 x 46 sparse Matrix of class "dgCMatrix"
##                                                                                
## 1 1 . 1 . . . . . 1 . . . . . . . . . . . . . . . . 1 . . . . 1 . . . . . . . .
## 2 1 . 1 . . . . . 1 . . . . . . . . . . . . . . . . . . 1 . . 1 . . . . 1 . . .
## 3 1 . . . . 1 . . . . . . . . 1 . . . . . . . . . . . . . . . 1 . . . . . 1 . .
## 4 1 . . . . 1 . . . . . . . . . . . . . . . . . . . . . 1 . . 1 . . . . 1 . . .
## 5 1 . 1 . . . . . . . . . . . . . . . . . . . . . . . . 1 . . 1 . . . . 1 . . .
## 6 1 . . . . . 1 . . . . . . . . . . . . . . . . . . 1 . . . . 1 . . . . 1 . . .
##                
## 1 1 . . . . . .
## 2 . . . . . . .
## 3 . . . . . . .
## 4 . . . . . . .
## 5 . . . . . . 1
## 6 . . . . . . .
library("xgboost")
param <- list(objective = "binary:logistic", max_depth = 2)
xgb_model <- xgboost(sparse_matrix, params = param, label = mn02[, SEVERITY_CD1] == 1, nrounds = 50, verbose = FALSE)
head(xgboost::xgb.model.dt.tree(colnames(sparse_matrix),xgb_model))
##    Tree Node  ID        Feature         Split  Yes   No Missing     Quality
## 1:    0    0 0-0       NUM_VEHS -9.536743e-07  0-1  0-2     0-1 17.49819180
## 2:    0    1 0-1    DR_COND_CDA -9.536743e-07  0-3  0-4     0-3  8.00745392
## 3:    0    2 0-2 VIOLATIONS_CDC -9.536743e-07  0-5  0-6     0-5  3.86367798
## 4:    0    3 0-3           Leaf            NA <NA> <NA>    <NA>  0.04834124
## 5:    0    4 0-4           Leaf            NA <NA> <NA>    <NA>  0.20769233
## 6:    0    5 0-5           Leaf            NA <NA> <NA>    <NA>  0.42545456
##     Cover
## 1: 143.50
## 2: 115.75
## 3:  27.75
## 4:  51.75
## 5:  64.00
## 6:  26.50
head(xgboost::xgb.importance(colnames(sparse_matrix),xgb_model))
##           Feature       Gain      Cover Frequency
## 1:       NUM_VEHS 0.18410144 0.10475233 0.0781250
## 2:    DR_COND_CDA 0.09577575 0.05538603 0.0625000
## 3: VIOLATIONS_CDY 0.06925720 0.04880643 0.0546875
## 4: VIOLATIONS_CDC 0.06570486 0.02756556 0.0546875
## 5: VIOLATIONS_CDT 0.06206052 0.08826627 0.0703125
## 6:    DR_COND_CDZ 0.06063317 0.03920121 0.0312500
lolli<-lollipop(xgb_model,sparse_matrix)
plot(lolli)

pairs<-interactions(xgb_model, sparse_matrix, option = "pairs")
head(pairs)
##         Parent          Child   sumGain frequency
## 1:    NUM_VEHS VIOLATIONS_CDC 15.636900         7
## 2: DR_COND_CDB    DR_COND_CDA  9.397680         3
## 3:    NUM_VEHS    DAY_OF_WKTU  9.051174         3
## 4: DAY_OF_WKMO VIOLATIONS_CDY  8.065340         2
## 5:    NUM_VEHS    DR_COND_CDA  8.007454         1
## 6:    NUM_VEHS VIOLATIONS_CDT  6.176212         1
interactions<-interactions(xgb_model, sparse_matrix, option = "interactions")
head(interactions)
##         Parent          Child  sumGain frequency
## 1: DR_COND_CDB    DR_COND_CDA 9.397680         3
## 2: DAY_OF_WKMO VIOLATIONS_CDY 8.065340         2
## 3:    NUM_VEHS VIOLATIONS_CDC 6.423502         2
## 4: WEATHER_CDC    DAY_OF_WKSA 5.086847         1
## 5: DAY_OF_WKWE    WEATHER_CDB 4.615500         1
## 6: DR_COND_CDY VIOLATIONS_CDZ 4.592610         1
importance<-importance(xgb_model, sparse_matrix, option = "both")
head(importance)
##                    Feature sumGain sumCover meanGain meanCover frequency
## 1:                NUM_VEHS  43.810   1261.0    4.381    126.10        10
## 2:          VIOLATIONS_CDT  14.240    947.6    1.779    118.50         8
## 3:             DR_COND_CDZ  10.530    266.7    5.265    133.30         2
## 4:             DR_COND_CDA   9.403    351.5    2.351     87.87         4
## 5: DR_COND_CDB:DR_COND_CDA   9.398    295.2    3.133     98.41         3
## 6:          VIOLATIONS_CDC   9.213    115.9    1.843     23.18         5
##    mean5Gain
## 1:     8.051
## 2:     2.581
## 3:     5.265
## 4:     2.351
## 5:     3.133
## 6:     1.843
plot(importance, radar=FALSE)

plot(importance)

data <- mn02[9,]
new_observation <- sparse_matrix[9,]
wf<-waterfall(xgb_model, new_observation, data, option = "interactions")
wf
##                                         contribution
## xgboost: intercept                             0.690
## xgboost: VIOLATIONS_CD = U                    -2.937
## xgboost: NUM_VEH = M                           0.988
## xgboost: WEATHER_CD = A                       -0.787
## xgboost: DAY_OF_WK:VIOLATIONS_CD = NA:U        0.699
## xgboost: LIGHTING_CD = A                      -0.684
## xgboost: DR_COND_CD:VIOLATIONS_CD = A:U       -0.440
## xgboost: WEATHER_CD:DAY_OF_WK = A:NA           0.375
## xgboost: DAY_OF_WK:WEATHER_CD = NA:A           0.298
## xgboost: NUM_VEH:LIGHTING_CD = M:A             0.159
## xgboost: LIGHTING_CD:DR_COND_CD = A:A          0.139
## xgboost: DR_COND_CD:DAY_OF_WK = A:NA          -0.104
## xgboost: DR_COND_CD = A                       -0.097
## xgboost: NUM_VEH:VIOLATIONS_CD = M:U          -0.066
## xgboost: VIOLATIONS_CD:DR_COND_CD = U:A       -0.061
## xgboost: LIGHTING_CD:WEATHER_CD = A:A         -0.035
## xgboost: NUM_VEH:WEATHER_CD = M:A             -0.023
## xgboost: prediction                           -1.887
plot(wf)