Load Required Libraries

library(synthpop)
library(tidyverse)
library(missRanger)
library(FactoMineR)
# Fix R's random-number seed once, up front, so the steps below that draw on
# R's RNG start from a known state.
# NOTE(review): whether every downstream package honours this global seed
# (rather than its own `seed` argument) should be confirmed per package.
set.seed(54321)

Load Data

# Restore the saved workspace objects; the code below expects this file to
# provide a data frame named `data_for_rf`.
load("data_for_rf.RData")

Impute Missing Values

# Keep the first 32 columns, discard rows where 'pri' or 'min_Alb' is missing,
# and coerce 'age' to numeric.
data <- drop_na(select(data_for_rf, 1:32), pri, min_Alb)
data <- mutate(data, age = as.numeric(age))

# Set 'pri' and 'min_Alb' aside. Both are complete after the filter above, so
# they are kept out of the imputation and re-attached afterwards.
pri <- data[["pri"]]
min_Alb <- data[["min_Alb"]]

# Table handed to the imputer: every column except 'pri' and 'min_Alb'.
data.0 <- select(data, -pri, -min_Alb)

# Fill the remaining missing values with missRanger, using its defaults.
data.impute.0 <- missRanger(data.0)
## 
## Missing value imputation by random forests
## 
##   Variables to impute:       lab_count, rx_count, BMI, min_pH, min_Hgb, max_Lactate, min_dbp, min_hgb_A1C, max_HR, min_HR, min_O2_sat, min_sbp, min_mbp, min_Abpm, min_Abps, min_Abpd, max_WBC, max_temp, max_PEEP, min_pO2, min_BE, max_pCO2, min_Braden_Activity, min_Braden_Friction_Shear, min_Braden_Mobility, min_Braden_Moisture, min_Braden_Nutrition, min_Braden_Sensory_Perception
##   Variables used to impute:  lab_count, rx_count, BMI, min_pH, age, min_Hgb, max_Lactate, min_dbp, previous_visits, min_hgb_A1C, max_HR, min_HR, min_O2_sat, min_sbp, min_mbp, min_Abpm, min_Abps, min_Abpd, max_WBC, max_temp, max_PEEP, min_pO2, min_BE, max_pCO2, min_Braden_Activity, min_Braden_Friction_Shear, min_Braden_Mobility, min_Braden_Moisture, min_Braden_Nutrition, min_Braden_Sensory_Perception
## iter 1:  ............................
## iter 2:  ............................
## iter 3:  ............................
## iter 4:  ............................
# Re-attach the 'pri' and 'min_Alb' vectors to the imputed table, then store
# 'pri' as a factor.
data.impute <- cbind(data.impute.0, pri, min_Alb)
data.impute <- mutate(data.impute, pri = as.factor(pri))

Generate Synthetic Data

# Synthesise a data set from the imputed table using synthpop's default
# settings; the result is a synthesis object, not a plain data frame.
synth.MIMIC <- syn(data = data.impute)
## 
## Synthesis
## -----------
##  lab_count rx_count BMI min_pH age min_Hgb max_Lactate min_dbp previous_visits min_hgb_A1C
##  max_HR min_HR min_O2_sat min_sbp min_mbp min_Abpm min_Abps min_Abpd max_WBC max_temp
##  max_PEEP min_pO2 min_BE max_pCO2 min_Braden_Activity min_Braden_Friction_Shear min_Braden_Mobility min_Braden_Moisture min_Braden_Nutrition min_Braden_Sensory_Perception
##  pri min_Alb
# Pull the synthetic data itself (the `syn` element) out of the synthesis
# object.
synth_MIMIC <- synth.MIMIC[["syn"]]

Plot Original and Synthetic Data

# Imputed original data: min_Alb against min_HR, with a dashed least-squares
# line overlaid.
plot(data.impute$min_HR, data.impute$min_Alb)
abline(lm(min_Alb ~ min_HR, data = data.impute), lty = 2)

# Same view for the synthetic data, for visual comparison with the plot above.
plot(synth_MIMIC$min_HR, synth_MIMIC$min_Alb)
abline(lm(min_Alb ~ min_HR, data = synth_MIMIC), lty = 2)

Principal Component Analysis

# PCA() is run on everything except the factor column 'pri', so drop it from
# both the imputed original and the synthetic table.
pca.data.impute <- select(data.impute, -pri)
pca.synth_MIMIC <- select(synth_MIMIC, -pri)

# Run the PCA on each table; graph = TRUE also draws FactoMineR's plots.
pca_result.mimic <- PCA(pca.data.impute, graph = TRUE)

pca_result.synth <- PCA(pca.synth_MIMIC, graph = TRUE)

Export datasets

# Persist the imputed original and the synthetic data set, one object per
# .RData file (each is restored under its current name by load()).
save(list = "data.impute", file = "data_for_ML.impute.RData")
save(list = "synth_MIMIC", file = "data_for_ML.synth.RData")

XAI with Synth RF

library(ranger)          # for random forest models
library(DALEX)           # for model explainers
library(modelStudio)     # for interactive model explainers studio

# Recode the factor 'pri' as a numeric 0/1 vector. as.numeric() on a factor
# returns the integer level codes (1, 2, ...), hence the "- 1".
# NOTE(review): this relies on 'pri' having exactly two levels with the
# negative class first -- confirm with levels(synth_MIMIC$pri).
synth_MIMIC_num <- synth_MIMIC %>%
  mutate(pri = as.numeric(pri) - 1)

# Train a random forest with 'pri' as the response and every other column as
# a predictor. Because 'pri' is numeric here, ranger fits a regression forest.
model_rf <- ranger(pri ~ ., data = synth_MIMIC_num)

# Create a DALEX explainer for the random forest model.
# `data` is passed WITHOUT the target column: DALEX expects predictors only,
# and leaving 'pri' in would make the explainer treat the outcome as one of
# the explanatory variables. The logical mask with drop = FALSE always yields
# a data frame, even in edge cases.
# NOTE: any recorded console output that reports 32 data columns predates
# this change; the explainer now sees one column fewer.
explainer_rf <- DALEX::explain(
  model_rf,
  data = synth_MIMIC_num[, names(synth_MIMIC_num) != "pri", drop = FALSE],
  y = synth_MIMIC_num$pri,
  label = "MIMIC --> Synth"
)
## Preparation of a new explainer is initiated
##   -> model label       :  MIMIC --> Synth 
##   -> data              :  3473  rows  32  cols 
##   -> target variable   :  3473  values 
##   -> predict function  :  yhat.ranger  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package ranger , ver. 0.14.1 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  0 , mean =  0.09058859 , max =  0.787  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -0.267 , mean =  -0.004495873 , max =  0.5463667  
##   A new explainer has been created!
# Use modelStudio to create an interactive studio with model explainers
# modelStudio(explainer_rf)

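Published version of this analysis on RPubs (presumably including the interactive modelStudio output): https://rpubs.com/andystats/XAI_mimic4_synth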

XAI with Synth XGBoost

library(xgboost)         # for XGBoost models
library(DALEX)           # for model explainers
library(modelStudio)     # for interactive model explainers studio

# Recode the factor 'pri' as a numeric 0/1 vector. as.numeric() on a factor
# returns the integer level codes (1, 2, ...), hence the "- 1".
# (Repeated here so this section can be run on its own.)
synth_MIMIC_num <- synth_MIMIC %>%
  mutate(pri = as.numeric(pri) - 1)

# Prepare data for XGBoost: a numeric predictor matrix plus the 0/1 label.
# The predictors are selected with a logical mask rather than
# `-which(names(...) == "pri")`: if nothing matched, `-which(...)` would be
# `-integer(0)` and silently select ZERO columns instead of all of them.
is_predictor <- names(synth_MIMIC_num) != "pri"
data_matrix <- xgb.DMatrix(
  data.matrix(synth_MIMIC_num[, is_predictor, drop = FALSE]),
  label = synth_MIMIC_num$pri
)

# Set XGBoost parameters: logistic objective for the binary outcome, log-loss
# as the evaluation metric, trees of depth at most 6, learning rate 0.3.
xgb_params <- list(
  objective = "binary:logistic",
  eval_metric = "logloss",
  max_depth = 6,
  eta = 0.3
)

# Train an XGBoost model for 100 boosting rounds.
model_xgb <- xgb.train(params = xgb_params,
                       data = data_matrix,
                       nrounds = 100)

# Prediction wrapper handed to DALEX for the XGBoost model.
#
# model   : a fitted xgboost booster.
# newdata : predictor columns only (data frame or matrix); it is converted
#           with data.matrix() and wrapped in an xgb.DMatrix before predicting.
#
# Returns whatever predict() yields for the booster on that matrix.
xgb_predict <- function(model, newdata) {
  features <- xgb.DMatrix(data.matrix(newdata))
  predict(model, newdata = features)
}

# Create a DALEX explainer for the XGBoost model.
# `data` holds the predictors only (target 'pri' removed). The columns are
# chosen with a logical mask rather than `-which(names(...) == "pri")`: if
# nothing matched, `-which(...)` would silently select zero columns.
# drop = FALSE guarantees a data frame is passed on.
# NOTE(review): DALEX does not recognise the booster's task on its own;
# consider passing `type = "classification"` if classification-specific
# defaults are wanted downstream -- confirm against the intended analysis.
explainer_xgb <- DALEX::explain(
  model = model_xgb,
  data = synth_MIMIC_num[, names(synth_MIMIC_num) != "pri", drop = FALSE],
  y = synth_MIMIC_num$pri,
  predict_function = xgb_predict,
  label = "MIMIC --> Synth (XGBoost)"
)
## Preparation of a new explainer is initiated
##   -> model label       :  MIMIC --> Synth (XGBoost) 
##   -> data              :  3473  rows  31  cols 
##   -> target variable   :  3473  values 
##   -> predict function  :  xgb_predict 
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  2.325129e-05 , mean =  0.08610184 , max =  0.9923863  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -0.09597715 , mean =  -9.125908e-06 , max =  0.1123768  
##   A new explainer has been created!
# Use modelStudio to create an interactive studio with model explainers
# modelStudio(explainer_xgb)

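Published version of the XGBoost analysis on RPubs (presumably including the interactive modelStudio output): https://rpubs.com/andystats/XAI_mimic4_synth_xgboost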