Extreme Gradient Boost

xgboost

require(xgboost)
require(Matrix)
require(data.table)
require(vcd)

data(Arthritis)
df <- data.table(Arthritis, keep.rownames = F)
head(df)
##    ID Treatment  Sex Age Improved
## 1: 57   Treated Male  27     Some
## 2: 46   Treated Male  29     None
## 3: 77   Treated Male  30     None
## 4: 17   Treated Male  32   Marked
## 5: 36   Treated Male  46   Marked
## 6: 23   Treated Male  58   Marked
head(df[,AgeDiscret := as.factor(round(Age/10,0))])
##    ID Treatment  Sex Age Improved AgeDiscret
## 1: 57   Treated Male  27     Some          3
## 2: 46   Treated Male  29     None          3
## 3: 77   Treated Male  30     None          3
## 4: 17   Treated Male  32   Marked          3
## 5: 36   Treated Male  46   Marked          5
## 6: 23   Treated Male  58   Marked          6
head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
##    ID Treatment  Sex Age Improved AgeDiscret AgeCat
## 1: 57   Treated Male  27     Some          3  Young
## 2: 46   Treated Male  29     None          3  Young
## 3: 77   Treated Male  30     None          3  Young
## 4: 17   Treated Male  32   Marked          3    Old
## 5: 36   Treated Male  46   Marked          5    Old
## 6: 23   Treated Male  58   Marked          6    Old
df[,ID:=NULL]
levels(df[,Treatment])
## [1] "Placebo" "Treated"
sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1]
head(sparse_matrix)
## 6 x 9 sparse Matrix of class "dgCMatrix"
##   TreatmentTreated SexMale Age AgeDiscret3 AgeDiscret4 AgeDiscret5 AgeDiscret6
## 1                1       1  27           1           .           .           .
## 2                1       1  29           1           .           .           .
## 3                1       1  30           1           .           .           .
## 4                1       1  32           1           .           .           .
## 5                1       1  46           .           .           1           .
## 6                1       1  58           .           .           .           1
##   AgeDiscret7 AgeCatYoung
## 1           .           1
## 2           .           1
## 3           .           1
## 4           .           .
## 5           .           .
## 6           .           .
output_vector = df[,Improved] == "Marked"

bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
               eta = 1, nthread = 2, nrounds = 10,objective = "binary:logistic")
## [1]  train-error:0.202381 
## [2]  train-error:0.166667 
## [3]  train-error:0.166667 
## [4]  train-error:0.166667 
## [5]  train-error:0.154762 
## [6]  train-error:0.154762 
## [7]  train-error:0.154762 
## [8]  train-error:0.166667 
## [9]  train-error:0.166667 
## [10] train-error:0.166667
importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix), 
                                model = bst, data = sparse_matrix, label = output_vector)
# Cleaning for better display
importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
xgb.plot.importance(importance_matrix = importance)

# }
# NOT RUN {
# Below is an example of how to save this plot to a file. 
# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
library(DiagrammeR)
xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
xgb.plot.tree(model = bst)
library("SHAPforxgboost")
dataXY_df
##        dayint Column_WV AOT_Uncertainty     elev   aod  RelAZ DevAll_P1km
##     1:  11012     1.822          0.0130 60.34783 0.333 124.38 0.665166517
##     2:  11012     1.801          0.0131 17.16000 0.604 123.85 0.917266187
##     3:  11012     1.887          0.0161 11.16667 0.342 124.41 0.140287770
##     4:  11014     1.752          0.0110 11.73913 0.343 126.55 0.002697842
##     5:  11015     0.620          0.0201 60.34783 0.225  62.02 0.665166517
##    ---                                                                   
## 10144:  16789     0.673          0.0094 67.00000 0.069  65.85 0.626570916
## 10145:  16789     0.654          0.0073 11.16667 0.068  66.82 0.140287770
## 10146:  16794     1.331          0.0025 25.65385 0.065  68.32 0.032403240
## 10147:  16795     0.499          0.0103 25.65385 0.099 116.05 0.032403240
## 10148:  16796     2.616          0.0058 11.16667 0.155  65.94 0.140287770
##        dist_water_km forestProp_1km   diffcwv
##     1:     20.897764      0.1323132 -0.237886
##     2:      2.766632      0.0000000 -0.398595
##     3:      4.091601      0.2266187 -0.437449
##     4:      2.332293      0.1142086 -0.507795
##     5:     20.897764      0.1323132  0.009901
##    ---                                       
## 10144:      7.458141      0.1373429  0.069668
## 10145:      4.091601      0.2266187  0.035273
## 10146:     16.273121      0.5328533 -0.384584
## 10147:     16.273121      0.5328533 -0.006044
## 10148:      4.091601      0.2266187  0.158404
y_var <-  "diffcwv"
dataX <- dataXY_df[,-..y_var]
# hyperparameter tuning results
param_dart <- list(objective = "reg:linear",  # For regression
                   nrounds = 366,
                   eta = 0.018,
                   max_depth = 10,
                   gamma = 0.009,
                   subsample = 0.98,
                   colsample_bytree = 0.86)

mod <- xgboost::xgboost(data = as.matrix(dataX), label = as.matrix(dataXY_df[[y_var]]), 
                        xgb_param = param_dart, nrounds = param_dart$nrounds,
                        verbose = FALSE, nthread = parallel::detectCores() - 2,
                        early_stopping_rounds = 8)

# To return the SHAP values and ranked features by mean|SHAP|
shap_values <- shap.values(xgb_model = mod, X_train = dataX)
# The ranked features by mean |SHAP|
shap_values$mean_shap_score
##          dayint       Column_WV AOT_Uncertainty             aod            elev 
##      0.08452563      0.06163608      0.05989677      0.03729633      0.02585212 
##   dist_water_km           RelAZ     DevAll_P1km  forestProp_1km 
##      0.02408034      0.01749243      0.01728937      0.01226319
# To prepare the long-format data:
shap_long <- shap.prep(xgb_model = mod, X_train = dataX)
# is the same as: using given shap_contrib
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = dataX)
# (Notice that there will be a data.table warning from `melt.data.table` due to `dayint` coerced from integer to double)

# **SHAP summary plot**
shap.plot.summary(shap_long)

# sometimes for a preview, you want to plot less data to make it faster using `dilute`
shap.plot.summary(shap_long, x_bound  = 1.2, dilute = 10)

# Alternatives options to make the same plot:
# option 1: start with the xgboost model
shap.plot.summary.wrap1(mod, X = as.matrix(dataX))

# option 2: supply a self-made SHAP values dataset (e.g. sometimes as output from cross-validation)
shap.plot.summary.wrap2(shap_values$shap_score, as.matrix(dataX))

# **SHAP dependence plot**
# if without y, will just plot SHAP values of x vs. x
shap.plot.dependence(data_long = shap_long, x = "dayint")

# optional to color the plot by assigning `color_feature` (Fig.A)
shap.plot.dependence(data_long = shap_long, x= "dayint",
                     color_feature = "Column_WV")

# optional to put a different SHAP values on the y axis to view some interaction (Fig.B)      
shap.plot.dependence(data_long = shap_long, x= "dayint",
                     y = "Column_WV", color_feature = "Column_WV")     

# To make plots for a group of features:
fig_list = lapply(names(shap_values$mean_shap_score)[1:6], shap.plot.dependence, 
                  data_long = shap_long, dilute = 5)
gridExtra::grid.arrange(grobs = fig_list, ncol = 2)

# prepare the data using either: 
# (this step is slow since it calculates all the combinations of features. This example spends 10s.)
data_int <- shap.prep.interaction(xgb_mod = mod, X_train = as.matrix(dataX))
# or:
shap_int <- predict(mod, as.matrix(dataX), predinteraction = TRUE)

# **SHAP interaction effect plot **
shap.plot.dependence(data_long = shap_long,
                     data_int = shap_int,
                     x= "Column_WV",
                     y = "AOT_Uncertainty", 
                     color_feature = "AOT_Uncertainty")

# choose to show top 4 features by setting `top_n = 4`, set 6 clustering groups.  
plot_data <- shap.prep.stack.data(shap_contrib = shap_values$shap_score, top_n = 4, n_groups = 6)

# choose to zoom in at location 500, set y-axis limit using `y_parent_limit`  
# it is also possible to set y-axis limit for zoom-in part alone using `y_zoomin_limit`  
shap.plot.force_plot(plot_data, zoom_in_location = 500, y_parent_limit = c(-1,1))

# plot by each cluster
shap.plot.force_plot_bygroup(plot_data)