xgboost
require(xgboost)
require(Matrix)
require(data.table)
require(vcd)
data(Arthritis)
df <- data.table(Arthritis, keep.rownames = F)
head(df)
## ID Treatment Sex Age Improved
## 1: 57 Treated Male 27 Some
## 2: 46 Treated Male 29 None
## 3: 77 Treated Male 30 None
## 4: 17 Treated Male 32 Marked
## 5: 36 Treated Male 46 Marked
## 6: 23 Treated Male 58 Marked
head(df[,AgeDiscret := as.factor(round(Age/10,0))])
## ID Treatment Sex Age Improved AgeDiscret
## 1: 57 Treated Male 27 Some 3
## 2: 46 Treated Male 29 None 3
## 3: 77 Treated Male 30 None 3
## 4: 17 Treated Male 32 Marked 3
## 5: 36 Treated Male 46 Marked 5
## 6: 23 Treated Male 58 Marked 6
head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
## ID Treatment Sex Age Improved AgeDiscret AgeCat
## 1: 57 Treated Male 27 Some 3 Young
## 2: 46 Treated Male 29 None 3 Young
## 3: 77 Treated Male 30 None 3 Young
## 4: 17 Treated Male 32 Marked 3 Old
## 5: 36 Treated Male 46 Marked 5 Old
## 6: 23 Treated Male 58 Marked 6 Old
df[,ID:=NULL]
levels(df[,Treatment])
## [1] "Placebo" "Treated"
sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1]
head(sparse_matrix)
## 6 x 9 sparse Matrix of class "dgCMatrix"
## TreatmentTreated SexMale Age AgeDiscret3 AgeDiscret4 AgeDiscret5 AgeDiscret6
## 1 1 1 27 1 . . .
## 2 1 1 29 1 . . .
## 3 1 1 30 1 . . .
## 4 1 1 32 1 . . .
## 5 1 1 46 . . 1 .
## 6 1 1 58 . . . 1
## AgeDiscret7 AgeCatYoung
## 1 . 1
## 2 . 1
## 3 . 1
## 4 . .
## 5 . .
## 6 . .
output_vector = df[,Improved] == "Marked"
bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
eta = 1, nthread = 2, nrounds = 10,objective = "binary:logistic")
## [1] train-error:0.202381
## [2] train-error:0.166667
## [3] train-error:0.166667
## [4] train-error:0.166667
## [5] train-error:0.154762
## [6] train-error:0.154762
## [7] train-error:0.154762
## [8] train-error:0.166667
## [9] train-error:0.166667
## [10] train-error:0.166667
importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix),
model = bst, data = sparse_matrix, label = output_vector)
# Cleaning for better display
importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
xgb.plot.importance(importance_matrix = importance)

# }
# NOT RUN {
# Below is an example of how to save this plot to a file.
# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
library(DiagrammeR)
xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
xgb.plot.tree(model = bst)
library("SHAPforxgboost")
dataXY_df
## dayint Column_WV AOT_Uncertainty elev aod RelAZ DevAll_P1km
## 1: 11012 1.822 0.0130 60.34783 0.333 124.38 0.665166517
## 2: 11012 1.801 0.0131 17.16000 0.604 123.85 0.917266187
## 3: 11012 1.887 0.0161 11.16667 0.342 124.41 0.140287770
## 4: 11014 1.752 0.0110 11.73913 0.343 126.55 0.002697842
## 5: 11015 0.620 0.0201 60.34783 0.225 62.02 0.665166517
## ---
## 10144: 16789 0.673 0.0094 67.00000 0.069 65.85 0.626570916
## 10145: 16789 0.654 0.0073 11.16667 0.068 66.82 0.140287770
## 10146: 16794 1.331 0.0025 25.65385 0.065 68.32 0.032403240
## 10147: 16795 0.499 0.0103 25.65385 0.099 116.05 0.032403240
## 10148: 16796 2.616 0.0058 11.16667 0.155 65.94 0.140287770
## dist_water_km forestProp_1km diffcwv
## 1: 20.897764 0.1323132 -0.237886
## 2: 2.766632 0.0000000 -0.398595
## 3: 4.091601 0.2266187 -0.437449
## 4: 2.332293 0.1142086 -0.507795
## 5: 20.897764 0.1323132 0.009901
## ---
## 10144: 7.458141 0.1373429 0.069668
## 10145: 4.091601 0.2266187 0.035273
## 10146: 16.273121 0.5328533 -0.384584
## 10147: 16.273121 0.5328533 -0.006044
## 10148: 4.091601 0.2266187 0.158404
y_var <- "diffcwv"
dataX <- dataXY_df[,-..y_var]
# hyperparameter tuning results
param_dart <- list(objective = "reg:linear", # For regression
nrounds = 366,
eta = 0.018,
max_depth = 10,
gamma = 0.009,
subsample = 0.98,
colsample_bytree = 0.86)
mod <- xgboost::xgboost(data = as.matrix(dataX), label = as.matrix(dataXY_df[[y_var]]),
xgb_param = param_dart, nrounds = param_dart$nrounds,
verbose = FALSE, nthread = parallel::detectCores() - 2,
early_stopping_rounds = 8)
# To return the SHAP values and ranked features by mean|SHAP|
shap_values <- shap.values(xgb_model = mod, X_train = dataX)
# The ranked features by mean |SHAP|
shap_values$mean_shap_score
## dayint Column_WV AOT_Uncertainty aod elev
## 0.08452563 0.06163608 0.05989677 0.03729633 0.02585212
## dist_water_km RelAZ DevAll_P1km forestProp_1km
## 0.02408034 0.01749243 0.01728937 0.01226319
# To prepare the long-format data:
shap_long <- shap.prep(xgb_model = mod, X_train = dataX)
# is the same as: using given shap_contrib
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = dataX)
# (Notice that there will be a data.table warning from `melt.data.table` due to `dayint` coerced from integer to double)
# **SHAP summary plot**
shap.plot.summary(shap_long)

# sometimes for a preview, you want to plot less data to make it faster using `dilute`
shap.plot.summary(shap_long, x_bound = 1.2, dilute = 10)

# Alternatives options to make the same plot:
# option 1: start with the xgboost model
shap.plot.summary.wrap1(mod, X = as.matrix(dataX))

# option 2: supply a self-made SHAP values dataset (e.g. sometimes as output from cross-validation)
shap.plot.summary.wrap2(shap_values$shap_score, as.matrix(dataX))

# **SHAP dependence plot**
# if without y, will just plot SHAP values of x vs. x
shap.plot.dependence(data_long = shap_long, x = "dayint")

# optional to color the plot by assigning `color_feature` (Fig.A)
shap.plot.dependence(data_long = shap_long, x= "dayint",
color_feature = "Column_WV")

# optional to put a different SHAP values on the y axis to view some interaction (Fig.B)
shap.plot.dependence(data_long = shap_long, x= "dayint",
y = "Column_WV", color_feature = "Column_WV")

# To make plots for a group of features:
fig_list = lapply(names(shap_values$mean_shap_score)[1:6], shap.plot.dependence,
data_long = shap_long, dilute = 5)
gridExtra::grid.arrange(grobs = fig_list, ncol = 2)

# prepare the data using either:
# (this step is slow since it calculates all the combinations of features. This example spends 10s.)
data_int <- shap.prep.interaction(xgb_mod = mod, X_train = as.matrix(dataX))
# or:
shap_int <- predict(mod, as.matrix(dataX), predinteraction = TRUE)
# **SHAP interaction effect plot **
shap.plot.dependence(data_long = shap_long,
data_int = shap_int,
x= "Column_WV",
y = "AOT_Uncertainty",
color_feature = "AOT_Uncertainty")

# choose to show top 4 features by setting `top_n = 4`, set 6 clustering groups.
plot_data <- shap.prep.stack.data(shap_contrib = shap_values$shap_score, top_n = 4, n_groups = 6)
# choose to zoom in at location 500, set y-axis limit using `y_parent_limit`
# it is also possible to set y-axis limit for zoom-in part alone using `y_zoomin_limit`
shap.plot.force_plot(plot_data, zoom_in_location = 500, y_parent_limit = c(-1,1))

# plot by each cluster
shap.plot.force_plot_bygroup(plot_data)
