## Application of SHAPforxgboost for Diabetes Prediction: Reading the Libraries and Data
suppressPackageStartupMessages({
library(SHAPforxgboost)
library(xgboost)
library(data.table)
library(ggplot2)
library(readr)
})
## Warning: package 'xgboost' was built under R version 4.0.3
## Warning: package 'data.table' was built under R version 4.0.3
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3
# Load the diabetes data set.
# NOTE: this is a machine-specific absolute path; point `diabetes_path` at
# your local copy of diabetes.csv before running the script.
diabetes_path <- "C:/Users/Nikhil/Desktop/diabetes.csv"
diabetes <- read_csv(diabetes_path)
##
## -- Column specification --------------------------------------------------------
## cols(
## Pregnancies = col_double(),
## Glucose = col_double(),
## BloodPressure = col_double(),
## SkinThickness = col_double(),
## Insulin = col_double(),
## BMI = col_double(),
## DiabetesPedigreeFunction = col_double(),
## Age = col_double(),
## Outcome = col_double()
## )
data <- diabetes
data$Outcome <- as.factor(data$Outcome)

# Feature matrix: every column except the target. Selecting by name instead
# of by position (-9) keeps this correct if the column order ever changes.
feature_cols <- setdiff(names(data), "Outcome")
X1 <- as.matrix(data[, feature_cols])

# The label must be numeric 0/1. Passing the factor directly would hand the
# model its internal level codes (1/2) once it is coerced to numeric, so go
# through as.character() to recover the original 0/1 values.
y1 <- as.numeric(as.character(data$Outcome))

# Single boosting round, no shrinkage (eta = 1) and no regularisation
# (gamma = 0, lambda = 0).
# NOTE(review): no `objective` is set, so the package default is used; for a
# 0/1 outcome "binary:logistic" may be what is wanted -- confirm.
mod1 <- xgboost::xgboost(
  data = X1, label = y1, gamma = 0, eta = 1,
  lambda = 0, nrounds = 1, verbose = FALSE
)

# SHAP values for the training data; print the mean |SHAP| per feature.
shap_values <- shap.values(xgb_model = mod1, X_train = X1)
shap_values$mean_shap_score
## Glucose BMI Age
## 0.17838708 0.10814442 0.09400389
## DiabetesPedigreeFunction BloodPressure Pregnancies
## 0.06113414 0.03001822 0.02416222
## Insulin SkinThickness
## 0.01081403 0.00000000
# SHAP contribution scores taken from the shap.values() result above.
shap_values_Diabetes <- shap_values$shap_score

# Long-format data for plotting, built once from the contributions that were
# already computed. (A second shap.prep() call from the model itself used to
# run first and was immediately overwritten, recomputing the SHAP values for
# nothing.)
shap_long_Diabetes <- shap.prep(
  shap_contrib = shap_values_Diabetes,
  X_train = X1
)

# Summary plots: default settings, then with a fixed x-axis bound and diluted
# points.
shap.plot.summary(shap_long_Diabetes)
shap.plot.summary(shap_long_Diabetes, x_bound = 1.5, dilute = 10)

# Wrapper variants: straight from the model, and from the SHAP score table,
# restricted to the top_n features.
shap.plot.summary.wrap1(mod1, X1, top_n = 7)
shap.plot.summary.wrap2(shap_score = shap_values$shap_score, X1, top_n = 3)
shap.plot.summary.wrap2(shap_score = shap_values$shap_score, X1, top_n = 5)
# SHAP dependence plots: Glucose on the x-axis, SHAP value of Insulin on the
# y-axis, coloured in turn by BloodPressure, BMI and Age.
# (Every shap.plot.dependence() call below emits the message
#  `geom_smooth()` using formula 'y ~ x'.)
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  x = "Glucose", y = "Insulin", color_feature = "BloodPressure"
)
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  x = "Glucose", y = "Insulin", color_feature = "BMI"
)
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  x = "Glucose", y = "Insulin", color_feature = "Age"
)

## Additional Shap Dependence Plots
# One plot per feature: the feature value against its own SHAP value.
shap.plot.dependence(data_long = shap_long_Diabetes, x = "Insulin")
shap.plot.dependence(data_long = shap_long_Diabetes, x = "BloodPressure")
shap.plot.dependence(data_long = shap_long_Diabetes, x = "BMI")
shap.plot.dependence(data_long = shap_long_Diabetes, x = "Glucose")
shap.plot.dependence(data_long = shap_long_Diabetes, x = "Age")
shap.plot.dependence(data_long = shap_long_Diabetes, x = "Pregnancies")
## Extracting the SHAP Interaction Values for Visualization
# `mod1` and `X1` from the model-fitting step above are reused here: the
# refit that used to sit in this section had identical data and parameters,
# so it only repeated work.
# The argument is spelled out as `xgb_model`; `xgb_mod` only worked through
# partial argument matching.
data_int <- shap.prep.interaction(xgb_model = mod1, X_train = X1)

# Kept for backward compatibility with code that refers to `shap_int`.
# shap.prep.interaction() wraps predict(..., predinteraction = TRUE), so the
# two objects hold the same values and there is no need to predict twice.
shap_int <- data_int

# **SHAP interaction effect plot **
# `data_int` must be supplied for the y-axis to show the SHAP *interaction*
# values of (x, y); without it the plot falls back to the plain SHAP values
# of `y`.
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  data_int = data_int,
  x = "Age",
  y = "BMI",
  color_feature = "Insulin"
)
## `geom_smooth()` using formula 'y ~ x'
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  data_int = data_int,
  x = "Glucose",
  y = "BMI",
  color_feature = "BloodPressure"
)
## `geom_smooth()` using formula 'y ~ x'
shap.plot.dependence(
  data_long = shap_long_Diabetes,
  data_int = data_int,
  x = "Insulin",
  y = "BloodPressure",
  color_feature = "DiabetesPedigreeFunction"
)
## `geom_smooth()` using formula 'y ~ x'
## Shap Force Plots
# **SHAP force plot**
# Prepare the stacked force-plot data from the SHAP contributions four times,
# varying only the number of groups (4, 5, 3 and 7).
# Each call reports: "All the features will be used."
plot_data <- shap.prep.stack.data(
  shap_contrib = shap_values_Diabetes, n_groups = 4
)
plot_data1 <- shap.prep.stack.data(
  shap_contrib = shap_values_Diabetes, n_groups = 5
)
plot_data2 <- shap.prep.stack.data(
  shap_contrib = shap_values_Diabetes, n_groups = 3
)
plot_data3 <- shap.prep.stack.data(
  shap_contrib = shap_values_Diabetes, n_groups = 7
)

## Shap force plot
# Stacked force plot for each grouping. Each call reports:
# "Data has N = 768 | zoom in length is 76 at location 460.8."
shap.plot.force_plot(plot_data)
shap.plot.force_plot(plot_data1)
shap.plot.force_plot(plot_data2)
shap.plot.force_plot(plot_data3)

# plot by each cluster
shap.plot.force_plot_bygroup(plot_data)
shap.plot.force_plot_bygroup(plot_data1)
shap.plot.force_plot_bygroup(plot_data2)
shap.plot.force_plot_bygroup(plot_data3)