Packages

library(pacman)
p_load(tidyverse, tidymodels, janitor, vip, themis, doParallel, parallel, ranger, DALEX, DALEXtra)

Read file

hfea_2017_2018 <- read_csv("2017-2018-xlsb.csv", col_types = cols(),  na = "NA")
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)

EDA

# Clean names
hfea_2017_2018 <- hfea_2017_2018 |> 
  clean_names()

# Select required variables
hfea <- hfea_2017_2018 |> 
  select(
    patient_age_at_treatment,
    total_number_of_previous_ivf_cycles,
    total_number_of_previous_di_cycles,
    egg_donor_age_at_registration,
    sperm_donor_age_at_registration,
    type_of_treatment_ivf_or_di,
    specific_treatment_type,
    egg_source,
    sperm_source,
    fresh_eggs_collected,
    total_eggs_mixed,
    total_embryos_created,
    total_embryos_thawed,
    embryos_stored_for_use_by_patient,
    early_outcome,
    heart_one_weeks_gestation,
    heart_one_birth_outcome,
    heart_one_birth_weight,
    heart_one_sex,
    heart_two_birth_outcome,
    heart_two_birth_weight,
    heart_two_sex,
    patient_ethnicity,
    partner_ethnicity,
    partner_type,
    partner_age,
    causes_of_infertility_tubal_disease,
    causes_of_infertility_ovulatory_disorder,
    causes_of_infertility_male_factor,
    causes_of_infertility_patient_unexplained,
    causes_of_infertility_endometriosis,
    stimulation_used,
    pgt_m_treatment,
    pgt_a_treatment,
    eggs_thawed_0_1,
    fresh_eggs_stored_0_1,
    year_of_treatment,
    live_birth_occurrence,
    number_of_live_births
  )

# Variables conversion
hfea_names <- c("patient_age_at_treatment",
                "total_number_of_previous_ivf_cycles",
                "total_number_of_previous_di_cycles",
                "egg_donor_age_at_registration",
                "sperm_donor_age_at_registration",
                "type_of_treatment_ivf_or_di",
                "specific_treatment_type",
                "egg_source",
                "sperm_source",
                "fresh_eggs_collected",
                "total_eggs_mixed",
                "total_embryos_created",
                "total_embryos_thawed",
                "embryos_stored_for_use_by_patient",
                "early_outcome",
                "heart_one_weeks_gestation",
                "heart_one_birth_outcome",
                "heart_one_birth_weight",
                "heart_one_sex",
                "heart_two_birth_outcome",
                "heart_two_birth_weight",
                "heart_two_sex",
                "patient_ethnicity",
                "partner_ethnicity",
                "partner_type",
                "partner_age",
                "causes_of_infertility_tubal_disease",
                "causes_of_infertility_ovulatory_disorder",
                "causes_of_infertility_male_factor",
                "causes_of_infertility_patient_unexplained",
                "causes_of_infertility_endometriosis",
                "stimulation_used",
                "pgt_m_treatment",
                "pgt_a_treatment",
                "eggs_thawed_0_1",                
                "fresh_eggs_stored_0_1",
                "year_of_treatment")
hfea <- hfea |> 
  mutate(across(all_of(hfea_names), factor))

# Create categorical outcome variable
hfea <- hfea |> 
  mutate(
    clinical_outcome = factor(
      case_when(
        live_birth_occurrence == 1 ~ "Success",
        TRUE ~ "Failure"
      )
    )
  )
lapply(hfea, function(x) {
  
  if (is.numeric(x)) return(summary(x))
  
  if (is.factor(x)) return(table(x))
  
})
$patient_age_at_treatment
x
18-34 35-37 38-39 40-42 43-44 45-50   999 
68365 38567 24365 23813  7292  4545  2669 

$total_number_of_previous_ivf_cycles
x
   >5     0     1     2     3     4     5 
 6498 72758 40631 24011 13623  7695  4400 

$total_number_of_previous_di_cycles
x
    >5      0      1      2      3      4      5 
  1115 157604   3763   2985   2323   1091    735 

$egg_donor_age_at_registration
x
                              <= 20               >35 Between 21 and 25 
           160975               324               867              1558 
Between 26 and 30 Between 31 and 35 
             2660              3232 

$sperm_donor_age_at_registration
x
                              <= 20               >45 Between 21 and 25 
           144738              1069               481              5806 
Between 26 and 30 Between 31 and 35 Between 36 and 40 Between 41 and 45 
             6241              5041              4475              1765 

$type_of_treatment_ivf_or_di
x
    DI    IVF 
 11282 158334 

$specific_treatment_type
x
          DI         ICSI     ICSI:IVF ICSI:Unknown          IVF  IVF:Unknown 
       11282        60955          430          133        58647           47 
     Unknown 
       38122 

$egg_source
x
          Donor Patient 
  11282    8677  149657 

$sperm_source
x
          Donor Partner 
    195   24782  144639 

$fresh_eggs_collected
x
        >40     0   1-5 11-15 16-20 21-25 26-30 31-35 36-40  6-10 
11282   274 60060 25857 21799 10986  4750  1983   753   317 31555 

$total_eggs_mixed
x
        >40     0   1-5 11-15 16-20 21-25 26-30 31-35 36-40  6-10 
11282   122 62337 31160 18623  7982  3021  1088   403   146 33452 

$total_embryos_created
x
        >30     0   1-5 11-15 16-20 21-25 26-30  6-10 
11282    84 66796 48335  9882  2728   689   206 29614 

$total_embryos_thawed
x
   >10      0    1-5   6-10 
   213 119661  48930    812 

$embryos_stored_for_use_by_patient
x
          >20      0    1-5  11-15  16-20   6-10 
 11282    101 109022  41130   1116    245   6720 

$early_outcome
x
                                         Biochemical Pregnancy Only 
                            37096                              8966 
               Ectopic/Hetrotopic Intrauterine Fetal Pulsation Seen 
                              538                             46571 
                      Miscarriage                              None 
                             4937                             71508 

$heart_one_weeks_gestation
x
                                         30                    31 
               129263                   160                   221 
                   32                    33                    34 
                  290                   406                   676 
                   35                    36                    37 
                 1074                  2163                  4512 
                   38                    39                    40 
                 7211                 10456                  8615 
Greater than 40 weeks    Less than 30 weeks 
                 4024                   545 

$heart_one_birth_outcome
x
                               Ectotopic/Hetrotopic Pregnancy 
                        122806                             46 
              Embryo Reduction                    Live  Birth 
                            39                          40525 
             Lost to Follow Up                    Miscarriage 
                           431                           5261 
                   Still Birth                    Termination 
                           148                            360 

$heart_one_birth_weight
x
                                 5.5kg or greater Between 1.5kg and 1.99Kg 
                  129357                       27                     1048 
  Between 1kg and 1.49Kg Between 2.0kg and 2.49Kg Between 2.5kg and 2.99Kg 
                     527                     3067                     7241 
Between 3.0kg and 3.49Kg Between 3.5kg and 3.99Kg Between 4.0kg and 4.49Kg 
                   14630                    10179                     2779 
Between 4.5kg and 4.99Kg Between 5.0kg and 5.49Kg            Less than 1kg 
                     420                       56                      285 

$heart_one_sex
x
            F      M 
129123  19860  20633 

$heart_two_birth_outcome
x
                 Embryo Reduction      Live  Birth      Miscarriage 
          164891               12             4167              506 
     Still Birth 
              40 

$heart_two_birth_weight
x
                         Between 1.5kg and 1.99Kg   Between 1kg and 1.49Kg 
                  165515                      606                      220 
Between 2.0kg and 2.49Kg Between 2.5kg and 2.99Kg Between 3.0kg and 3.49Kg 
                    1438                     1082                      455 
Between 3.5kg and 3.99Kg Between 4.0kg and 4.49Kg Between 4.5kg and 4.99Kg 
                     131                       37                        8 
           Less than 1kg 
                     124 

$heart_two_sex
x
            F      M 
165466   2191   1959 

$patient_ethnicity
x
 Asian  Black  Mixed  Other  White 
 17119   4243   2573  30236 115445 

$partner_ethnicity
x
Any other ethnicity               Asian               Black               Mixed 
               3663               15030                3796                1843 
              Other               White 
              37889              107395 

$partner_type
x
   Female      Male       N/A      None Surrogate 
     9827    148986      3464      6712       627 

$partner_age
x
        >60 18-34 35-37 38-39 40-42 43-44 45-50 51-55 56-60   999 
16933   772 48438 31328 19597 21118 10352 15595  4145  1332     6 

$causes_of_infertility_tubal_disease
x
     0      1 
154648  14968 

$causes_of_infertility_ovulatory_disorder
x
     0      1 
151465  18151 

$causes_of_infertility_male_factor
x
     0      1 
117841  51775 

$causes_of_infertility_patient_unexplained
x
     0      1 
122981  46635 

$causes_of_infertility_endometriosis
x
     0      1 
160751   8865 

$stimulation_used
x
     0      1 
 64471 105145 

$pgt_m_treatment
x
     0      1 
168202   1414 

$pgt_a_treatment
x
     0      1 
167625   1991 

$eggs_thawed_0_1
x
     0      1 
168127   1489 

$fresh_eggs_stored_0_1
x
     0      1 
165770   3846 

$year_of_treatment
x
 2017  2018 
84398 85218 

$live_birth_occurrence
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.2419  0.0000  1.0000 

$number_of_live_births
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   0.000   0.000   0.264   0.000   3.000 

$clinical_outcome
x
Failure Success 
 128593   41023 
# Replace missing values with "0"

hfea$fresh_eggs_collected[hfea$fresh_eggs_collected == ""] <- "0"

hfea$total_eggs_mixed[hfea$total_eggs_mixed == ""] <- "0"

hfea$total_embryos_created[hfea$total_embryos_created == ""] <- "0"

hfea$embryos_stored_for_use_by_patient[hfea$embryos_stored_for_use_by_patient == ""] <- "0"

hfea$early_outcome[hfea$early_outcome == ""] <- "None"


# Remove variables with so much missing values 
hfea <- hfea |> 
  select(-c(live_birth_occurrence,
            number_of_live_births,
            year_of_treatment,
            heart_one_birth_outcome,
            heart_one_weeks_gestation,
            heart_one_birth_weight,
            heart_two_birth_weight,
            heart_two_birth_outcome,
            egg_donor_age_at_registration,
            
            sperm_donor_age_at_registration,
            heart_one_sex,
            heart_two_sex,
            egg_source,
            sperm_source
  ))

# Drops the row if "" or 999 appears anywhere especially in the cloumn  partner_age
clean_hfea <- hfea |> 
  filter(!if_any(everything(), ~ .x %in% c("", "999")))

Machine Learning with Ensemble

Random Forest

Preprocess

# Data splitting 
data_split <- initial_split(clean_hfea, prop = 0.80, strata = clinical_outcome)

train_split <- data_split |> training()
test_split <- data_split |> testing()

# Preprocessing
hfea_recipe <- recipe(clinical_outcome ~ ., data = train_split) |> 
  step_novel(all_nominal_predictors()) |> 
  #step_dummy(all_nominal_predictors(), -all_outcomes(), one_hot = FALSE) |>
  step_zv(all_nominal_predictors()) |>
  step_normalize(all_numeric_predictors()) |> 
  step_downsample(clinical_outcome, under_ratio = 1) # Solve the 3:1 outcome variable mild imbalance

Model and Workflow

# Create model design
ivf_rf <- rand_forest(min_n = 5, mtry = 2, trees = 2000) |> 
  set_engine("ranger", num.threads = detectCores(), importance = 'permutation') |> 
  set_mode("classification")

# Add recipe and model design to workflow
set.seed(14)
ivf_workflow <- workflow() |> 
  add_recipe(hfea_recipe) |> 
  add_model(ivf_rf) |> 
  fit(train_split)

# Predict
ivf_test <- augment(ivf_workflow, test_split)

# Metrics
ivf_metrics <- metric_set(accuracy, sensitivity, specificity, precision, npv)

ivf_test |> 
  ivf_metrics(truth = clinical_outcome,
             estimate = .pred_class, 
             .pred_Success)
# A tibble: 5 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary         0.943
2 sensitivity binary         0.925
3 specificity binary         0.996
4 precision   binary         0.999
5 npv         binary         0.820

Variable Importance Plot (VIP)

# Feature Importance Vip Plot
# 1. Extract the underlying engine fit cleanly
extracted_fit <- extract_fit_engine(ivf_workflow)

# 2. Generate the plot with your specific fill and theme styles
vip(extracted_fit, aesthetics = list(fill = "turquoise1", color = "black")) + 
  theme_minimal(base_size = 14) +
  labs(
    title = "Ensemble Feature Importance Profile",
    subtitle = "Random Forest VIP",
    y = "Importance Score",
    x = "Clinical Predictor"
  )

Shapley values

set.seed(16)

# 1. Take a representative sample of your background training features
background_sample <- train_split |> 
  select(-clinical_outcome) |> 
  slice_sample(n = 200) # Fast & highly accurate baseline

# 2. Extract the matching target outcomes for those specific 200 rows
background_y <- train_split$clinical_outcome[sample(1:nrow(train_split), 200)]

# 3. Build the explainer with the lightweight sample
ivf_explainer <- explain_tidymodels(
  ivf_workflow,
  data = background_sample, # Swapped out the massive 150k row data frame
  y = background_y,
  verbose = FALSE
)
## Warning in Ops.factor(y, predict_function(model, data)): '-' not meaningful for
## factors
# 4. Run the SHAP prediction (This will now finish in seconds!)
single_var <- clean_hfea[1000, -26]

ivf_shap <- predict_parts(
  explainer = ivf_explainer,
  new_observation = single_var,
  type = "shap",
  B = 25 # 25 bootstrap iterations
)

ivf_shap |> plot()