Interpretable Machine Learning For Diabetes Prediction using ShapXGBoost

## Application of SHAP for XGBoost to Diabetes Prediction: Loading the Libraries and Data

suppressPackageStartupMessages({
library(SHAPforxgboost)
library(xgboost)
library(data.table)
library(ggplot2)
library(readr)
})

## Warning: package 'xgboost' was built under R version 4.0.3

## Warning: package 'data.table' was built under R version 4.0.3

## Warning: package 'ggplot2' was built under R version 4.0.3

## Warning: package 'readr' was built under R version 4.0.3

Reading the dataset and converting the response variable into a factor

# Load the diabetes dataset from a local CSV file.
# NOTE(review): the absolute path is machine-specific; adjust it before
# running this script on another machine.
diabetes <- readr::read_csv(file = "C:/Users/Nikhil/Desktop/diabetes.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Pregnancies = col_double(),
##   Glucose = col_double(),
##   BloodPressure = col_double(),
##   SkinThickness = col_double(),
##   Insulin = col_double(),
##   BMI = col_double(),
##   DiabetesPedigreeFunction = col_double(),
##   Age = col_double(),
##   Outcome = col_double()
## )

# Keep the raw import in `diabetes` and work on a second binding, then recode
# the response column as a factor.
data <- diabetes
data[["Outcome"]] <- as.factor(data[["Outcome"]])

Building XGBoost model for Diabetes Prediction

# Feature matrix: every column except the response. Selecting by name avoids
# depending on `Outcome` being the ninth column.
X1 <- as.matrix(data[, setdiff(names(data), "Outcome")])

# Fit a single boosting round. The label must be numeric: `Outcome` is a
# factor at this point, and coercing a factor to numeric yields its integer
# codes (1/2) rather than the original 0/1 values, so the 0/1 values are
# recovered explicitly via as.character().
mod1 <- xgboost::xgboost(
  data = X1,
  label = as.numeric(as.character(data$Outcome)),
  gamma = 0, eta = 1, lambda = 0,
  nrounds = 1, verbose = FALSE
)

# SHAP values for the fitted model, followed by the per-feature
# mean_shap_score summary.
shap_values <- shap.values(xgb_model = mod1, X_train = X1)
shap_values$mean_shap_score

##                  Glucose                      BMI                      Age 
##               0.17838708               0.10814442               0.09400389 
## DiabetesPedigreeFunction            BloodPressure              Pregnancies 
##               0.06113414               0.03001822               0.02416222 
##                  Insulin            SkinThickness 
##               0.01081403               0.00000000

Shap Plots

# SHAP contribution scores extracted from the shap.values() result.
shap_values_Diabetes <- shap_values$shap_score

# Long-format SHAP data consumed by the plotting functions. It is built from
# the contributions already computed; the alternative call
# `shap.prep(xgb_model = mod1, X_train = X1)` was removed because its result
# was overwritten by this assignment before ever being used.
shap_long_Diabetes <- shap.prep(
  shap_contrib = shap_values_Diabetes,
  X_train = X1
)

Shap Summary Plots

# Summary plot from the prepared long-format data.
shap.plot.summary(data_long = shap_long_Diabetes)

# Same plot with an explicit x-axis bound and dilution factor.
shap.plot.summary(
  data_long = shap_long_Diabetes,
  x_bound = 1.5,
  dilute = 10
)

# Wrapper that takes the model and feature matrix directly.
shap.plot.summary.wrap1(
  model = mod1,
  X = X1,
  top_n = 7
)

# Wrapper that takes precomputed SHAP scores; shown for two `top_n` settings.
shap.plot.summary.wrap2(
  shap_score = shap_values$shap_score,
  X = X1,
  top_n = 3
)

shap.plot.summary.wrap2(
  shap_score = shap_values$shap_score,
  X = X1,
  top_n = 5
)

Shap Dependence Plots

# Dependence plots with x = "Glucose" and y = "Insulin", repeated with three
# different coloring features.

# Colored by BloodPressure.
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  x = "Glucose",
  y = "Insulin",
  color_feature = "BloodPressure"
)

## `geom_smooth()` using formula 'y ~ x'

# Colored by BMI.
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  x = "Glucose",
  y = "Insulin",
  color_feature = "BMI"
)

## `geom_smooth()` using formula 'y ~ x'

# Colored by Age.
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  x = "Glucose",
  y = "Insulin",
  color_feature = "Age"
)

## `geom_smooth()` using formula 'y ~ x'

## Additional Shap Dependence Plots

# One dependence plot per feature; only `x` is given, so `y` and
# `color_feature` are left at the function's defaults.
shap.plot.dependence(data_long = shap_long_Diabetes, x = "Insulin")

## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "BloodPressure")

## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "BMI")

## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "Glucose")

## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "Age")

## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(data_long = shap_long_Diabetes, x = "Pregnancies")

## `geom_smooth()` using formula 'y ~ x'

## Building the Model and Extracting the Shap Values for Visualization

# Feature matrix (all columns except the response), converted once and reused
# for the fit and for both interaction computations below.
X_all <- as.matrix(data[, setdiff(names(data), "Outcome")])

# Refit the single-round model. The label must be numeric: `Outcome` is a
# factor, and coercing a factor to numeric yields its integer codes (1/2)
# rather than the original 0/1 values, so the 0/1 values are recovered
# explicitly via as.character().
mod1 <- xgboost::xgboost(
  data = X_all,
  label = as.numeric(as.character(data$Outcome)),
  gamma = 0, eta = 1, lambda = 0,
  nrounds = 1, verbose = FALSE
)

# SHAP interaction values, obtained two ways from the same model and data:
# through the SHAPforxgboost helper and through predict() directly.
data_int <- shap.prep.interaction(xgb_mod = mod1, X_train = X_all)

shap_int <- predict(mod1, X_all, predinteraction = TRUE)

# **SHAP interaction effect plot **
# `data_int` (the SHAP interaction values computed above) is passed so the
# y-axis shows the interaction values of `y` against `x`. Without it,
# shap.plot.dependence() draws an ordinary dependence plot from `data_long`
# alone.
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  data_int = data_int,
  x = "Age",
  y = "BMI",
  color_feature = "Insulin"
)

## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(
  data_long = shap_long_Diabetes,
  data_int = data_int,
  x = "Glucose",
  y = "BMI",
  color_feature = "BloodPressure"
)

## `geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(
  data_long = shap_long_Diabetes,
  data_int = data_int,
  x = "Insulin",
  y = "BloodPressure",
  color_feature = "DiabetesPedigreeFunction"
)

## `geom_smooth()` using formula 'y ~ x'

## Shap Force Plots

# **SHAP force plot**
# Prepare stacked SHAP data for the force plots; the four variants differ only
# in the requested number of groups.
stack_shap_groups <- function(k) {
  shap.prep.stack.data(shap_contrib = shap_values_Diabetes, n_groups = k)
}

plot_data <- stack_shap_groups(4)

## All the features will be used.

plot_data1 <- stack_shap_groups(5)

## All the features will be used.

plot_data2 <- stack_shap_groups(3)

## All the features will be used.

plot_data3 <- stack_shap_groups(7)

## All the features will be used.

## Shap force plot 
# Force plot for each of the prepared stack data sets (4, 5, 3 and 7 groups).
shap.plot.force_plot(shapobj = plot_data)

## Data has N = 768 | zoom in length is 76 at location 460.8.

shap.plot.force_plot(shapobj = plot_data1)

## Data has N = 768 | zoom in length is 76 at location 460.8.

shap.plot.force_plot(shapobj = plot_data2)

## Data has N = 768 | zoom in length is 76 at location 460.8.

shap.plot.force_plot(shapobj = plot_data3)

## Data has N = 768 | zoom in length is 76 at location 460.8.

# Force plots split by cluster, one call per prepared stack data set.
shap.plot.force_plot_bygroup(shapobj = plot_data)

shap.plot.force_plot_bygroup(shapobj = plot_data1)

shap.plot.force_plot_bygroup(shapobj = plot_data2)

shap.plot.force_plot_bygroup(shapobj = plot_data3)

Interpretable Machine Learning For Diabetes Prediction using ShapXGBoost

Akshata Kishore Moharir

Reading the dataset and converting the response variable into a factor

Building XGBoost model for Diabetes Prediction

Shap Plots

Shap Summary Plots

Shap Dependence Plots

References

https://liuyanguu.github.io/post/2019/07/18/visualization-of-shap-for-xgboost/