setwd("J:/AIiHS/Chap03")
dat= read.csv("TAHIR_rwd1.csv")
table(dat$HwyClass)
##
## Rural Interstate Rural Multi-lane Div. Rural Multi-lane Undiv.
## 356 519 158
## Rural Two-Lane Urban Interstate Urban Multi-lane Div.
## 8461 529 789
## Urban Multi-lane Undiv. Urban Two-lane
## 739 2482
## NewSegID Urban_Rur HwyClass Length AADT Lanes LaneWidth
## 1 001-01_0_2 Rural Rural Two-Lane 2.000 2920 2 12
## 2 001-01_2.53_4.349 Rural Rural Two-Lane 1.819 2920 2 12
## 3 001-01_2_2.53 Rural Rural Two-Lane 0.530 2920 2 12
## 4 001-01_4.349_6.3 Urban Urban Two-lane 1.951 4320 2 12
## 5 001-01_6.3_8.3 Urban Urban Two-lane 2.000 4320 2 12
## 6 001-01_8.3_9.297 Urban Urban Two-lane 0.997 4320 2 12
## ShWidth Curve MinPSL Total_Crash KABC_Crash
## 1 6 8 55 12 4
## 2 6 4 45 6 3
## 3 6 4 55 1 1
## 4 6 7 35 9 4
## 5 6 5 45 3 1
## 6 6 2 45 2 2
dat= subset(dat, HwyClass=="Rural Two-Lane")
dim(dat)
## [1] 8461 12
## 75% of the sample size
smp_size <- floor(0.75 * nrow(dat))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(dat)), size = smp_size)
train <- dat[train_ind, ]
test <- dat[-train_ind, ]
dim(train)
## [1] 6345 12
## [1] 2116 12
train_df= train[, c(4, 5, 7:11)]
test_df= test[, c(4, 5, 7:11)]
library(keras)
library(tfdatasets)
library(tensorflow)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
require(xgboost)
## Loading required package: xgboost
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## Loading required package: vcd
## Loading required package: grid
germanvar<-train_df[,1:6]
label <- as.numeric(train_df$Total_Crash) ## make it a numeric NOT integer
data <- as.matrix(germanvar) # to matrix
mode(data) <- 'double' # to numeric i.e double precision
param_dart <- list(objective = "reg:linear", # For regression
nrounds = 366,
eta = 0.018,
max_depth = 10,
gamma = 0.009,
subsample = 0.98,
colsample_bytree = 0.86)
mod <- xgboost(data = data, label = label,
xgb_param = param_dart, nrounds = param_dart$nrounds,
verbose = FALSE, nthread = parallel::detectCores() - 2,
early_stopping_rounds = 8)
## [10:48:58] WARNING: amalgamation/../src/learner.cc:541:
## Parameters: { xgb_param } might not be used.
##
## This may not be accurate due to some parameters are only used in language bindings but
## passed down to XGBoost core. Or some parameters are not used but slip through this
## verification. Please open an issue if you find above cases.
library("SHAPforxgboost")
# To return the SHAP values and ranked features by mean|SHAP|
shap_values <- shap.values(xgb_model = mod, X_train = data)
# The ranked features by mean |SHAP|
shap_values$mean_shap_score
## AADT Length Curve ShWidth MinPSL LaneWidth
## 1.6528671 1.3239341 0.5810324 0.4764935 0.2331637 0.2283748
# To prepare the long-format data:
shap_long <- shap.prep(xgb_model = mod, X_train = data)
# is the same as: using given shap_contrib
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = data)
# (Notice that there will be a data.table warning from `melt.data.table` due to `dayint` coerced from integer to double)
# **SHAP summary plot**
shap.plot.summary(shap_long)

# sometimes for a preview, you want to plot less data to make it faster using `dilute`
shap.plot.summary(shap_long, x_bound = 1.2, dilute = 10)

shap.plot.summary.wrap1(mod, X = as.matrix(data))

shap.plot.summary.wrap2(shap_values$shap_score, as.matrix(data))

shap.plot.dependence(data_long = shap_long, x = "AADT")
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long, x= "AADT",
color_feature = "MinPSL")
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long, x= "AADT",
y = "MinPSL", color_feature = "MinPSL")
## `geom_smooth()` using formula 'y ~ x'

fig_list = lapply(names(shap_values$mean_shap_score)[1:6], shap.plot.dependence,
data_long = shap_long, dilute = 5)
gridExtra::grid.arrange(grobs = fig_list, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 11
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0

# prepare the data using either:
# (this step is slow since it calculates all the combinations of features. This example spends 10s.)
data_int <- shap.prep.interaction(xgb_mod = mod, X_train = as.matrix(data))
# or:
shap_int <- predict(mod, as.matrix(data), predinteraction = TRUE)
# **SHAP interaction effect plot **
shap.plot.dependence(data_long = shap_long,
data_int = shap_int,
x= "AADT",
y = "MinPSL",
color_feature = "MinPSL")
## `geom_smooth()` using formula 'y ~ x'

# choose to show top 4 features by setting `top_n = 4`, set 6 clustering groups.
plot_data <- shap.prep.stack.data(shap_contrib = shap_values$shap_score,
top_n = 4, n_groups = 6)
## The SHAP values of the Rest 2 features were summed into variable 'rest_variables'.
# choose to zoom in at location 500, set y-axis limit using `y_parent_limit`
# it is also possible to set y-axis limit for zoom-in part alone using `y_zoomin_limit`
shap.plot.force_plot(plot_data, zoom_in_location = 500, y_parent_limit = c(-1,1))
## Data has N = 6345 | zoom in length is 150 at location 500.

shap.plot.force_plot_bygroup(plot_data)
