## Application of SHAPforxgboost for Diabetes Prediction: Reading the Libraries and Data

suppressPackageStartupMessages({
library(SHAPforxgboost)
library(xgboost)
library(data.table)
library(ggplot2)
library(readr)
})
## Warning: package 'xgboost' was built under R version 4.0.3
## Warning: package 'data.table' was built under R version 4.0.3
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3

Reading the data set and converting the response variable into a factor

# Import the diabetes CSV; readr prints the column types it guessed.
# NOTE(review): this is an absolute, machine-specific path -- adjust it for
# the environment the script runs in.
diabetes <- readr::read_csv(file = "C:/Users/Nikhil/Desktop/diabetes.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Pregnancies = col_double(),
##   Glucose = col_double(),
##   BloodPressure = col_double(),
##   SkinThickness = col_double(),
##   Insulin = col_double(),
##   BMI = col_double(),
##   DiabetesPedigreeFunction = col_double(),
##   Age = col_double(),
##   Outcome = col_double()
## )
# Keep the raw import in `diabetes` and work on a copy; the response column
# `Outcome` is stored as a factor in the working copy.
data <- diabetes
data[["Outcome"]] <- as.factor(data[["Outcome"]])

Building XGBoost model for Diabetes Prediction

# Feature matrix: every column except the response. `Outcome` is dropped by
# name rather than by position so that a reordered CSV cannot leak the
# response into the predictors.
X1 <- as.matrix(data[, setdiff(names(data), "Outcome")])

# Fit a single-round boosted model (one tree, no shrinkage, no L2 penalty).
# `data$Outcome` is a factor at this point; a factor is stored as integer
# level codes (1, 2), not as the original 0/1 values, so it is converted via
# character to hand xgboost the real 0/1 labels.
# NOTE(review): no `objective` is passed, so xgboost's default objective is
# used -- confirm that this is intended for a binary outcome.
mod1 <- xgboost::xgboost(
  data = X1,
  label = as.numeric(as.character(data$Outcome)),
  gamma = 0, eta = 1, lambda = 0,
  nrounds = 1, verbose = FALSE
)

# SHAP values for the fitted model, followed by the per-feature
# `mean_shap_score` summary, which is printed.
shap_values <- shap.values(xgb_model = mod1, X_train = X1)
shap_values$mean_shap_score
##                  Glucose                      BMI                      Age 
##               0.17838708               0.10814442               0.09400389 
## DiabetesPedigreeFunction            BloodPressure              Pregnancies 
##               0.06113414               0.03001822               0.02416222 
##                  Insulin            SkinThickness 
##               0.01081403               0.00000000

Shap Plots

# SHAP score table extracted from the `shap.values()` result.
shap_values_Diabetes <- shap_values$shap_score

# Long-format SHAP data consumed by the plotting helpers below.
# `shap.prep(xgb_model = mod1, X_train = X1)` is an alternative call form; it
# is not run here because its result would be overwritten straight away by
# the assignment below, which only doubled the work.
shap_long_Diabetes <- shap.prep(
  shap_contrib = shap_values_Diabetes,
  X_train = X1
)

Shap Summary Plots

# Summary plot of the long-format SHAP data with default settings.
shap.plot.summary(data_long = shap_long_Diabetes)

# Same summary plot, called with x_bound = 1.5 and dilute = 10.
shap.plot.summary(
  data_long = shap_long_Diabetes,
  x_bound = 1.5,
  dilute = 10
)

# Summary plot built directly from the model and the feature matrix,
# restricted to top_n = 7.
shap.plot.summary.wrap1(model = mod1, X = X1, top_n = 7)

# Summary plots built from the SHAP score table, for top_n = 3 and top_n = 5.
shap.plot.summary.wrap2(
  shap_score = shap_values$shap_score,
  X = X1,
  top_n = 3
)

shap.plot.summary.wrap2(
  shap_score = shap_values$shap_score,
  X = X1,
  top_n = 5
)

Shap Dependence Plots

# Dependence plots with x = "Glucose" and y = "Insulin", repeated with three
# different colour features: BloodPressure, BMI and Age.
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  x = "Glucose",
  y = "Insulin",
  color_feature = "BloodPressure"
)
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(
  data_long = shap_long_Diabetes,
  x = "Glucose",
  y = "Insulin",
  color_feature = "BMI"
)
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(
  data_long = shap_long_Diabetes,
  x = "Glucose",
  y = "Insulin",
  color_feature = "Age"
)
## `geom_smooth()` using formula 'y ~ x'

## Additional Shap Dependence Plots

# One dependence plot per feature; only `x` is given, so `y` and
# `color_feature` are left at the function's defaults.
shap.plot.dependence(data_long = shap_long_Diabetes, x = "Insulin")
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "BloodPressure")
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "BMI")
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "Glucose")
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "Age")
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "Pregnancies")
## `geom_smooth()` using formula 'y ~ x'

## Building the Model and Extracting the Shap Values for Visualization

# Build the feature matrix once (all columns except the response, dropped by
# name) instead of re-converting the data frame for every call below.
X_diabetes <- as.matrix(data[, setdiff(names(data), "Outcome")])

# Refit the single-round model. `data$Outcome` is a factor, which is stored
# as integer level codes (1, 2); converting via character passes the real
# 0/1 labels to xgboost instead.
# NOTE(review): no `objective` is passed, so xgboost's default objective is
# used -- confirm that this is intended for a binary outcome.
mod1 <- xgboost::xgboost(
  data = X_diabetes,
  label = as.numeric(as.character(data$Outcome)),
  gamma = 0, eta = 1, lambda = 0,
  nrounds = 1, verbose = FALSE
)

# SHAP interaction values prepared by SHAPforxgboost for plotting.
data_int <- shap.prep.interaction(
  xgb_mod = mod1,
  X_train = X_diabetes
)

# The same kind of interaction values requested directly from the model via
# `predinteraction = TRUE`.
shap_int <- predict(mod1, X_diabetes, predinteraction = TRUE)
# **SHAP interaction effect plot **
# `data_int` (the SHAP interaction values computed above) must be supplied
# for `shap.plot.dependence()` to draw interaction effects; without it the
# calls below only produce ordinary dependence plots and `data_int` is never
# used. With `data_int` given, the y-axis shows the interaction values of
# `y` against the feature `x`.
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  data_int = data_int,
  x = "Age",
  y = "BMI",
  color_feature = "Insulin"
)
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(
  data_long = shap_long_Diabetes,
  data_int = data_int,
  x = "Glucose",
  y = "BMI",
  color_feature = "BloodPressure"
)
## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(
  data_long = shap_long_Diabetes,
  data_int = data_int,
  x = "Insulin",
  y = "BloodPressure",
  color_feature = "DiabetesPedigreeFunction"
)
## `geom_smooth()` using formula 'y ~ x'

## Shap Force Plots

# **SHAP force plot**
# Prepare the stacked force-plot input from the SHAP score table four times,
# differing only in the requested number of groups (4, 5, 3 and 7).
plot_data <- shap.prep.stack.data(
  shap_contrib = shap_values_Diabetes,
  n_groups = 4
)
## All the features will be used.
plot_data1 <- shap.prep.stack.data(
  shap_contrib = shap_values_Diabetes,
  n_groups = 5
)
## All the features will be used.
plot_data2 <- shap.prep.stack.data(
  shap_contrib = shap_values_Diabetes,
  n_groups = 3
)
## All the features will be used.
plot_data3 <- shap.prep.stack.data(
  shap_contrib = shap_values_Diabetes,
  n_groups = 7
)
## All the features will be used.
# Stacked force plot for each of the four prepared data sets
# (4, 5, 3 and 7 groups respectively).
shap.plot.force_plot(shapobs = plot_data)
## Data has N = 768 | zoom in length is 76 at location 460.8.

shap.plot.force_plot(shapobs = plot_data1)
## Data has N = 768 | zoom in length is 76 at location 460.8.

shap.plot.force_plot(shapobs = plot_data2)
## Data has N = 768 | zoom in length is 76 at location 460.8.

shap.plot.force_plot(shapobs = plot_data3)
## Data has N = 768 | zoom in length is 76 at location 460.8.

# Force plots split by group, one call per prepared data set.
shap.plot.force_plot_bygroup(shapobs = plot_data)

shap.plot.force_plot_bygroup(shapobs = plot_data1)

shap.plot.force_plot_bygroup(shapobs = plot_data2)

shap.plot.force_plot_bygroup(shapobs = plot_data3)

References

https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/