SHAP 01

2021-05-12

setwd("J:/AIiHS/Chap03")
dat= read.csv("TAHIR_rwd1.csv")
table(dat$HwyClass)
## 
##        Rural Interstate   Rural Multi-lane Div. Rural Multi-lane Undiv. 
##                     356                     519                     158 
##          Rural Two-Lane        Urban Interstate   Urban Multi-lane Div. 
##                    8461                     529                     789 
## Urban Multi-lane Undiv.          Urban Two-lane 
##                     739                    2482
head(dat)
##            NewSegID Urban_Rur       HwyClass Length AADT Lanes LaneWidth
## 1        001-01_0_2     Rural Rural Two-Lane  2.000 2920     2        12
## 2 001-01_2.53_4.349     Rural Rural Two-Lane  1.819 2920     2        12
## 3     001-01_2_2.53     Rural Rural Two-Lane  0.530 2920     2        12
## 4  001-01_4.349_6.3     Urban Urban Two-lane  1.951 4320     2        12
## 5    001-01_6.3_8.3     Urban Urban Two-lane  2.000 4320     2        12
## 6  001-01_8.3_9.297     Urban Urban Two-lane  0.997 4320     2        12
##   ShWidth Curve MinPSL Total_Crash KABC_Crash
## 1       6     8     55          12          4
## 2       6     4     45           6          3
## 3       6     4     55           1          1
## 4       6     7     35           9          4
## 5       6     5     45           3          1
## 6       6     2     45           2          2
dat= subset(dat, HwyClass=="Rural Two-Lane")
dim(dat)
## [1] 8461   12
## 75% of the sample size
smp_size <- floor(0.75 * nrow(dat))

## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(dat)), size = smp_size)

train <- dat[train_ind, ]
test <- dat[-train_ind, ]

dim(train)
## [1] 6345   12
dim(test)
## [1] 2116   12
train_df= train[, c(4, 5, 7:11)]
test_df= test[, c(4, 5, 7:11)]

library(keras)
library(tfdatasets)
library(tensorflow)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
require(xgboost)
## Loading required package: xgboost
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
require(Matrix)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
require(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
require(vcd)
## Loading required package: vcd
## Loading required package: grid
germanvar<-train_df[,1:6]
label <- as.numeric(train_df$Total_Crash) ## make it a numeric NOT integer
data <-  as.matrix(germanvar)  # to matrix
mode(data) <- 'double'  # to numeric i.e double precision

param_dart <- list(objective = "reg:linear",  # For regression
                   nrounds = 366,
                   eta = 0.018,
                   max_depth = 10,
                   gamma = 0.009,
                   subsample = 0.98,
                   colsample_bytree = 0.86)

mod <- xgboost(data = data, label = label,
                        xgb_param = param_dart, nrounds = param_dart$nrounds,
                        verbose = FALSE, nthread = parallel::detectCores() - 2,
                        early_stopping_rounds = 8)
## [10:48:58] WARNING: amalgamation/../src/learner.cc:541: 
## Parameters: { xgb_param } might not be used.
## 
##   This may not be accurate due to some parameters are only used in language bindings but
##   passed down to XGBoost core.  Or some parameters are not used but slip through this
##   verification. Please open an issue if you find above cases.
library("SHAPforxgboost")
# To return the SHAP values and ranked features by mean|SHAP|
shap_values <- shap.values(xgb_model = mod, X_train = data)
# The ranked features by mean |SHAP|
shap_values$mean_shap_score
##      AADT    Length     Curve   ShWidth    MinPSL LaneWidth 
## 1.6528671 1.3239341 0.5810324 0.4764935 0.2331637 0.2283748
# To prepare the long-format data:
shap_long <- shap.prep(xgb_model = mod, X_train = data)
# is the same as: using given shap_contrib
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = data)
# (Notice that there will be a data.table warning from `melt.data.table` due to `dayint` coerced from integer to double)

# **SHAP summary plot**
shap.plot.summary(shap_long)

# sometimes for a preview, you want to plot less data to make it faster using `dilute`
shap.plot.summary(shap_long, x_bound  = 1.2, dilute = 10)

shap.plot.summary.wrap1(mod, X = as.matrix(data))

shap.plot.summary.wrap2(shap_values$shap_score, as.matrix(data))

shap.plot.dependence(data_long = shap_long, x = "AADT")
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long, x= "AADT",
                     color_feature = "MinPSL")
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long, x= "AADT",
                     y = "MinPSL", color_feature = "MinPSL")  
## `geom_smooth()` using formula 'y ~ x'

fig_list = lapply(names(shap_values$mean_shap_score)[1:6], shap.plot.dependence, 
                  data_long = shap_long, dilute = 5)
gridExtra::grid.arrange(grobs = fig_list, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 11
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0

# prepare the data using either: 
# (this step is slow since it calculates all the combinations of features. This example spends 10s.)
data_int <- shap.prep.interaction(xgb_mod = mod, X_train = as.matrix(data))
# or:
shap_int <- predict(mod, as.matrix(data), predinteraction = TRUE)

# **SHAP interaction effect plot **
shap.plot.dependence(data_long = shap_long,
                     data_int = shap_int,
                     x= "AADT",
                     y = "MinPSL", 
                     color_feature = "MinPSL")
## `geom_smooth()` using formula 'y ~ x'

# choose to show top 4 features by setting `top_n = 4`, set 6 clustering groups.  
plot_data <- shap.prep.stack.data(shap_contrib = shap_values$shap_score, 
                                  top_n = 4, n_groups = 6)
## The SHAP values of the Rest 2 features were summed into variable 'rest_variables'.
# choose to zoom in at location 500, set y-axis limit using `y_parent_limit`  
# it is also possible to set y-axis limit for zoom-in part alone using `y_zoomin_limit`  
shap.plot.force_plot(plot_data, zoom_in_location = 500, y_parent_limit = c(-1,1))
## Data has N = 6345 | zoom in length is 150 at location 500.

shap.plot.force_plot_bygroup(plot_data)