Packages

library(pacman)
p_load(tidyverse, tidymodels, skimr, janitor, tabnet, torch, vip, themis, doParallel, parallel, ComplexUpset)

Load file

# 1st file
hfea_2017_2018 <- read_csv("2017-2018-xlsb.csv", col_types = cols(),  na = "NA")
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
hfea_2017_2018 |> skim()
Data summary
Name hfea_2017_2018
Number of rows 169616
Number of columns 61
_______________________
Column type frequency:
character 27
logical 7
numeric 27
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Patient age at treatment 0 1 3 5 0 7 0
Total number of previous IVF cycles 0 1 1 2 0 7 0
Total number of previous DI cycles 0 1 1 2 0 7 0
Main reason for producing embroys storing eggs 0 1 8 15 0 7 0
Egg donor age at registration 0 1 0 17 160975 6 0
Sperm donor age at registration 0 1 0 17 144738 8 0
Type of treatment - IVF or DI 0 1 2 3 0 2 0
Specific treatment type 0 1 2 12 0 7 0
Egg source 0 1 0 7 11282 3 0
Sperm source 0 1 0 7 195 3 0
Fresh eggs collected 0 1 0 5 11282 11 0
Total eggs mixed 0 1 0 5 11282 11 0
Total embryos created 0 1 0 5 11282 9 0
Total embryos thawed 0 1 1 4 0 4 0
Embryos stored for use by patient 0 1 0 5 11282 7 0
Early outcome 0 1 0 33 37096 6 0
Heart one weeks gestation 0 1 0 21 129263 14 0
Heart one birth outcome 0 1 0 30 122806 8 0
Heart one birth weight 0 1 0 24 129357 12 0
Heart one sex 0 1 0 1 129123 3 0
Heart two birth outcome 0 1 0 16 164891 5 0
Heart two birth weight 0 1 0 24 165515 10 0
Heart two sex 0 1 0 1 165466 3 0
Patient ethnicity 0 1 5 5 0 5 0
Partner ethnicity 0 1 5 19 0 6 0
Partner Type 0 1 3 9 0 5 0
Partner age 0 1 0 5 16933 11 0

Variable type: logical

skim_variable n_missing complete_rate mean count
Heart two birth congenital abnormalities 169579 0 1 TRU: 37
Heart three weeks gestation 169616 0 NaN :
Heart three birth outcome 169616 0 NaN :
Heart three birth weight 169616 0 NaN :
Heart three sex 169564 0 0 FAL: 52
Heart three birth congenital abnormalities 169579 0 1 TRU: 37
Heart three delivery date 169616 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Total number of previous pregnancies - IVF and DI 126928 0.25 0.81 NaN 0 0 1 1 5 ▇▁▁▁▁
Total number of previous live births - IVF or DI 64546 0.62 0.23 0.45 0 0 0 0 3 ▇▂▁▁▁
Causes of infertility - tubal disease 0 1.00 0.09 0.28 0 0 0 0 1 ▇▁▁▁▁
Causes of infertility - ovulatory disorder 0 1.00 0.11 0.31 0 0 0 0 1 ▇▁▁▁▁
Causes of infertility - male factor 0 1.00 0.31 0.46 0 0 0 1 1 ▇▁▁▁▃
Causes of infertility - patient unexplained 0 1.00 0.27 0.45 0 0 0 1 1 ▇▁▁▁▃
Causes of infertility - endometriosis 0 1.00 0.05 0.22 0 0 0 0 1 ▇▁▁▁▁
Stimulation used 0 1.00 0.62 0.49 0 0 1 1 1 ▅▁▁▁▇
Donated embryo 11282 0.93 0.01 0.10 0 0 0 0 1 ▇▁▁▁▁
PGT-M treatment 0 1.00 0.01 0.09 0 0 0 0 1 ▇▁▁▁▁
PGT-A treatment 0 1.00 0.01 0.11 0 0 0 0 1 ▇▁▁▁▁
Elective single embryo transfer 11282 0.93 0.34 0.47 0 0 0 1 1 ▇▁▁▁▅
Fresh cycle 11282 0.93 0.69 0.46 0 0 1 1 1 ▃▁▁▁▇
Frozen cycle 11282 0.93 0.31 0.46 0 0 0 1 1 ▇▁▁▁▃
Eggs thawed (0/1) 0 1.00 0.01 0.09 0 0 0 0 1 ▇▁▁▁▁
Fresh eggs stored (0/1) 0 1.00 0.02 0.15 0 0 0 0 1 ▇▁▁▁▁
Embryos transferred 11282 0.93 1.02 0.72 0 1 1 1 3 ▃▇▁▃▁
Embryos transferred from eggs micro-injected 84734 0.50 0.78 0.80 0 0 1 1 3 ▇▆▁▃▁
Date of embryo transfer 48782 0.71 5.39 52.12 0 0 3 5 999 ▇▁▁▁▁
Year of treatment 0 1.00 2017.50 0.50 2017 2017 2018 2018 2018 ▇▁▁▁▇
Live birth occurrence 0 1.00 0.24 0.43 0 0 0 0 1 ▇▁▁▁▂
Number of live births 0 1.00 0.26 0.49 0 0 0 0 3 ▇▂▁▁▁
Number of foetal sacs with fetal pulsation 109775 0.35 0.87 0.53 0 1 1 1 3 ▂▇▁▁▁
Heart one delivery date 129223 0.24 2018.18 0.68 2017 2018 2018 2019 2019 ▂▁▇▁▆
Heart one birth congenital abnormalities 169173 0.00 1.00 0.00 1 1 1 1 1 ▁▁▇▁▁
Heart two weeks gestation 165679 0.02 35.92 NaN 30 35 36 37 40 ▂▂▇▇▂
Heart two delivery date 165473 0.02 2018.05 NaN 2017 2018 2018 2019 2019 ▃▁▇▁▃

Wrangling and cleaning

# Clean names
hfea_2017_2018 <- hfea_2017_2018 |> 
  clean_names()

# Select required variables
hfea <- hfea_2017_2018 |> 
  select(
    patient_age_at_treatment,
    total_number_of_previous_ivf_cycles,
    total_number_of_previous_di_cycles,
    egg_donor_age_at_registration,
    sperm_donor_age_at_registration,
    type_of_treatment_ivf_or_di,
    specific_treatment_type,
    egg_source,
    sperm_source,
    fresh_eggs_collected,
    total_eggs_mixed,
    total_embryos_created,
    total_embryos_thawed,
    embryos_stored_for_use_by_patient,
    early_outcome,
    heart_one_weeks_gestation,
    heart_one_birth_outcome,
    heart_one_birth_weight,
    heart_one_sex,
    heart_two_birth_outcome,
    heart_two_birth_weight,
    heart_two_sex,
    patient_ethnicity,
    partner_ethnicity,
    partner_type,
    partner_age,
    causes_of_infertility_tubal_disease,
    causes_of_infertility_ovulatory_disorder,
    causes_of_infertility_male_factor,
    causes_of_infertility_patient_unexplained,
    causes_of_infertility_endometriosis,
    stimulation_used,
    pgt_m_treatment,
    pgt_a_treatment,
    eggs_thawed_0_1,
    fresh_eggs_stored_0_1,
    year_of_treatment,
    live_birth_occurrence,
    number_of_live_births
  )


hfea |> str()
tibble [169,616 × 39] (S3: tbl_df/tbl/data.frame)
 $ patient_age_at_treatment                 : chr [1:169616] "45-50" "18-34" "999" "18-34" ...
 $ total_number_of_previous_ivf_cycles      : chr [1:169616] "3" "0" "0" "0" ...
 $ total_number_of_previous_di_cycles       : chr [1:169616] "0" "0" "0" "0" ...
 $ egg_donor_age_at_registration            : chr [1:169616] "" "" "" "" ...
 $ sperm_donor_age_at_registration          : chr [1:169616] "Between 36 and 40" "" "" "" ...
 $ type_of_treatment_ivf_or_di              : chr [1:169616] "IVF" "IVF" "IVF" "IVF" ...
 $ specific_treatment_type                  : chr [1:169616] "ICSI" "IVF" "IVF" "ICSI" ...
 $ egg_source                               : chr [1:169616] "Patient" "Patient" "Patient" "Patient" ...
 $ sperm_source                             : chr [1:169616] "Donor" "Partner" "Partner" "Partner" ...
 $ fresh_eggs_collected                     : chr [1:169616] "1-5" "21-25" "21-25" "1-5" ...
 $ total_eggs_mixed                         : chr [1:169616] "1-5" "0" "0" "1-5" ...
 $ total_embryos_created                    : chr [1:169616] "0" "0" "0" "1-5" ...
 $ total_embryos_thawed                     : chr [1:169616] "0" "0" "0" "0" ...
 $ embryos_stored_for_use_by_patient        : chr [1:169616] "0" "0" "0" "1-5" ...
 $ early_outcome                            : chr [1:169616] "" "" "" "None" ...
 $ heart_one_weeks_gestation                : chr [1:169616] "" "" "" "" ...
 $ heart_one_birth_outcome                  : chr [1:169616] "" "" "" "" ...
 $ heart_one_birth_weight                   : chr [1:169616] "" "" "" "" ...
 $ heart_one_sex                            : chr [1:169616] "" "" "" "" ...
 $ heart_two_birth_outcome                  : chr [1:169616] "" "" "" "" ...
 $ heart_two_birth_weight                   : chr [1:169616] "" "" "" "" ...
 $ heart_two_sex                            : chr [1:169616] "" "" "" "" ...
 $ patient_ethnicity                        : chr [1:169616] "Black" "White" "Other" "White" ...
 $ partner_ethnicity                        : chr [1:169616] "Other" "Other" "Other" "White" ...
 $ partner_type                             : chr [1:169616] "Male" "N/A" "N/A" "Male" ...
 $ partner_age                              : chr [1:169616] "" "" "" "18-34" ...
 $ causes_of_infertility_tubal_disease      : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
 $ causes_of_infertility_ovulatory_disorder : num [1:169616] 0 0 0 0 0 0 0 1 0 0 ...
 $ causes_of_infertility_male_factor        : num [1:169616] 0 0 0 1 1 0 1 1 0 0 ...
 $ causes_of_infertility_patient_unexplained: num [1:169616] 0 0 0 0 0 1 0 0 1 1 ...
 $ causes_of_infertility_endometriosis      : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
 $ stimulation_used                         : num [1:169616] 0 1 1 1 0 1 1 1 1 1 ...
 $ pgt_m_treatment                          : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
 $ pgt_a_treatment                          : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
 $ eggs_thawed_0_1                          : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
 $ fresh_eggs_stored_0_1                    : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
 $ year_of_treatment                        : num [1:169616] 2017 2018 2018 2018 2018 ...
 $ live_birth_occurrence                    : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
 $ number_of_live_births                    : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
hfea |> glimpse()
Rows: 169,616
Columns: 39
$ patient_age_at_treatment                  <chr> "45-50", "18-34", "999", "18…
$ total_number_of_previous_ivf_cycles       <chr> "3", "0", "0", "0", "1", "0"…
$ total_number_of_previous_di_cycles        <chr> "0", "0", "0", "0", "0", "0"…
$ egg_donor_age_at_registration             <chr> "", "", "", "", "", "", "", …
$ sperm_donor_age_at_registration           <chr> "Between 36 and 40", "", "",…
$ type_of_treatment_ivf_or_di               <chr> "IVF", "IVF", "IVF", "IVF", …
$ specific_treatment_type                   <chr> "ICSI", "IVF", "IVF", "ICSI"…
$ egg_source                                <chr> "Patient", "Patient", "Patie…
$ sperm_source                              <chr> "Donor", "Partner", "Partner…
$ fresh_eggs_collected                      <chr> "1-5", "21-25", "21-25", "1-…
$ total_eggs_mixed                          <chr> "1-5", "0", "0", "1-5", "0",…
$ total_embryos_created                     <chr> "0", "0", "0", "1-5", "0", "…
$ total_embryos_thawed                      <chr> "0", "0", "0", "0", "1-5", "…
$ embryos_stored_for_use_by_patient         <chr> "0", "0", "0", "1-5", "0", "…
$ early_outcome                             <chr> "", "", "", "None", "None", …
$ heart_one_weeks_gestation                 <chr> "", "", "", "", "", "", "", …
$ heart_one_birth_outcome                   <chr> "", "", "", "", "", "", "", …
$ heart_one_birth_weight                    <chr> "", "", "", "", "", "", "", …
$ heart_one_sex                             <chr> "", "", "", "", "", "", "", …
$ heart_two_birth_outcome                   <chr> "", "", "", "", "", "", "", …
$ heart_two_birth_weight                    <chr> "", "", "", "", "", "", "", …
$ heart_two_sex                             <chr> "", "", "", "", "", "", "", …
$ patient_ethnicity                         <chr> "Black", "White", "Other", "…
$ partner_ethnicity                         <chr> "Other", "Other", "Other", "…
$ partner_type                              <chr> "Male", "N/A", "N/A", "Male"…
$ partner_age                               <chr> "", "", "", "18-34", "18-34"…
$ causes_of_infertility_tubal_disease       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ causes_of_infertility_ovulatory_disorder  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
$ causes_of_infertility_male_factor         <dbl> 0, 0, 0, 1, 1, 0, 1, 1, 0, 0…
$ causes_of_infertility_patient_unexplained <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1…
$ causes_of_infertility_endometriosis       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ stimulation_used                          <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 1, 1…
$ pgt_m_treatment                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ pgt_a_treatment                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ eggs_thawed_0_1                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ fresh_eggs_stored_0_1                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ year_of_treatment                         <dbl> 2017, 2018, 2018, 2018, 2018…
$ live_birth_occurrence                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ number_of_live_births                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Variables conversion
hfea_names <- c("patient_age_at_treatment",
                "total_number_of_previous_ivf_cycles",
                "total_number_of_previous_di_cycles",
                "egg_donor_age_at_registration",
                "sperm_donor_age_at_registration",
                "type_of_treatment_ivf_or_di",
                "specific_treatment_type",
                "egg_source",
                "sperm_source",
                "fresh_eggs_collected",
                "total_eggs_mixed",
                "total_embryos_created",
                "total_embryos_thawed",
                "embryos_stored_for_use_by_patient",
                "early_outcome",
                "heart_one_weeks_gestation",
                "heart_one_birth_outcome",
                "heart_one_birth_weight",
                "heart_one_sex",
                "heart_two_birth_outcome",
                "heart_two_birth_weight",
                "heart_two_sex",
                "patient_ethnicity",
                "partner_ethnicity",
                "partner_type",
                "partner_age",
                "causes_of_infertility_tubal_disease",
                "causes_of_infertility_ovulatory_disorder",
                "causes_of_infertility_male_factor",
                "causes_of_infertility_patient_unexplained",
                "causes_of_infertility_endometriosis",
                "stimulation_used",
                "pgt_m_treatment",
                "pgt_a_treatment",
                "eggs_thawed_0_1",                
                "fresh_eggs_stored_0_1",
                "year_of_treatment")
hfea <- hfea |> 
  mutate(across(all_of(hfea_names), factor))

# Create categorical outcome variable
hfea <- hfea |> 
  mutate(
    clinical_outcome = factor(
      case_when(
        live_birth_occurrence == 1 ~ "Success",
        TRUE ~ "Failure"
      )
    )
  )

hfea |> str()
tibble [169,616 × 40] (S3: tbl_df/tbl/data.frame)
 $ patient_age_at_treatment                 : Factor w/ 7 levels "18-34","35-37",..: 6 1 7 1 1 4 4 1 3 3 ...
 $ total_number_of_previous_ivf_cycles      : Factor w/ 7 levels ">5","0","1","2",..: 5 2 2 2 3 2 2 2 2 3 ...
 $ total_number_of_previous_di_cycles       : Factor w/ 7 levels ">5","0","1","2",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ egg_donor_age_at_registration            : Factor w/ 6 levels "","<= 20",">35",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ sperm_donor_age_at_registration          : Factor w/ 8 levels "","<= 20",">45",..: 7 1 1 1 1 1 1 1 1 1 ...
 $ type_of_treatment_ivf_or_di              : Factor w/ 2 levels "DI","IVF": 2 2 2 2 2 2 2 2 2 2 ...
 $ specific_treatment_type                  : Factor w/ 7 levels "DI","ICSI","ICSI:IVF",..: 2 5 5 2 7 5 2 2 5 5 ...
 $ egg_source                               : Factor w/ 3 levels "","Donor","Patient": 3 3 3 3 3 3 3 3 3 3 ...
 $ sperm_source                             : Factor w/ 3 levels "","Donor","Partner": 2 3 3 3 3 3 3 3 3 3 ...
 $ fresh_eggs_collected                     : Factor w/ 11 levels "",">40","0","1-5",..: 4 7 7 4 3 4 4 5 11 11 ...
 $ total_eggs_mixed                         : Factor w/ 11 levels "",">40","0","1-5",..: 4 3 3 4 3 4 4 5 11 11 ...
 $ total_embryos_created                    : Factor w/ 9 levels "",">30","0","1-5",..: 3 3 3 4 3 4 4 9 4 4 ...
 $ total_embryos_thawed                     : Factor w/ 4 levels ">10","0","1-5",..: 2 2 2 2 3 2 2 2 2 2 ...
 $ embryos_stored_for_use_by_patient        : Factor w/ 7 levels "",">20","0","1-5",..: 3 3 3 4 3 3 3 3 3 3 ...
 $ early_outcome                            : Factor w/ 6 levels "","Biochemical Pregnancy Only",..: 1 1 1 6 6 6 6 6 6 6 ...
 $ heart_one_weeks_gestation                : Factor w/ 14 levels "","30","31","32",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ heart_one_birth_outcome                  : Factor w/ 8 levels "","Ectotopic/Hetrotopic Pregnancy",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ heart_one_birth_weight                   : Factor w/ 12 levels "","5.5kg or greater",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ heart_one_sex                            : Factor w/ 3 levels "","F","M": 1 1 1 1 1 1 1 1 1 1 ...
 $ heart_two_birth_outcome                  : Factor w/ 5 levels "","Embryo Reduction",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ heart_two_birth_weight                   : Factor w/ 10 levels "","Between 1.5kg and 1.99Kg",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ heart_two_sex                            : Factor w/ 3 levels "","F","M": 1 1 1 1 1 1 1 1 1 1 ...
 $ patient_ethnicity                        : Factor w/ 5 levels "Asian","Black",..: 2 5 4 5 5 5 5 5 5 5 ...
 $ partner_ethnicity                        : Factor w/ 6 levels "Any other ethnicity",..: 5 5 5 6 6 6 6 6 6 6 ...
 $ partner_type                             : Factor w/ 5 levels "Female","Male",..: 2 3 3 2 2 2 2 2 2 2 ...
 $ partner_age                              : Factor w/ 11 levels "",">60","18-34",..: 1 1 1 3 3 6 5 3 6 6 ...
 $ causes_of_infertility_tubal_disease      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ causes_of_infertility_ovulatory_disorder : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
 $ causes_of_infertility_male_factor        : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 2 1 1 ...
 $ causes_of_infertility_patient_unexplained: Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 2 ...
 $ causes_of_infertility_endometriosis      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ stimulation_used                         : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 2 2 2 2 ...
 $ pgt_m_treatment                          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ pgt_a_treatment                          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ eggs_thawed_0_1                          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ fresh_eggs_stored_0_1                    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ year_of_treatment                        : Factor w/ 2 levels "2017","2018": 1 2 2 2 2 1 1 2 1 1 ...
 $ live_birth_occurrence                    : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
 $ number_of_live_births                    : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
 $ clinical_outcome                         : Factor w/ 2 levels "Failure","Success": 1 1 1 1 1 1 1 1 1 1 ...

Exploratory analysis

Variable distribution

lapply(hfea, function(x) {
  
  if (is.numeric(x)) return(summary(x))
  
  if (is.factor(x)) return(table(x))
  
})
$patient_age_at_treatment
x
18-34 35-37 38-39 40-42 43-44 45-50   999 
68365 38567 24365 23813  7292  4545  2669 

$total_number_of_previous_ivf_cycles
x
   >5     0     1     2     3     4     5 
 6498 72758 40631 24011 13623  7695  4400 

$total_number_of_previous_di_cycles
x
    >5      0      1      2      3      4      5 
  1115 157604   3763   2985   2323   1091    735 

$egg_donor_age_at_registration
x
                              <= 20               >35 Between 21 and 25 
           160975               324               867              1558 
Between 26 and 30 Between 31 and 35 
             2660              3232 

$sperm_donor_age_at_registration
x
                              <= 20               >45 Between 21 and 25 
           144738              1069               481              5806 
Between 26 and 30 Between 31 and 35 Between 36 and 40 Between 41 and 45 
             6241              5041              4475              1765 

$type_of_treatment_ivf_or_di
x
    DI    IVF 
 11282 158334 

$specific_treatment_type
x
          DI         ICSI     ICSI:IVF ICSI:Unknown          IVF  IVF:Unknown 
       11282        60955          430          133        58647           47 
     Unknown 
       38122 

$egg_source
x
          Donor Patient 
  11282    8677  149657 

$sperm_source
x
          Donor Partner 
    195   24782  144639 

$fresh_eggs_collected
x
        >40     0   1-5 11-15 16-20 21-25 26-30 31-35 36-40  6-10 
11282   274 60060 25857 21799 10986  4750  1983   753   317 31555 

$total_eggs_mixed
x
        >40     0   1-5 11-15 16-20 21-25 26-30 31-35 36-40  6-10 
11282   122 62337 31160 18623  7982  3021  1088   403   146 33452 

$total_embryos_created
x
        >30     0   1-5 11-15 16-20 21-25 26-30  6-10 
11282    84 66796 48335  9882  2728   689   206 29614 

$total_embryos_thawed
x
   >10      0    1-5   6-10 
   213 119661  48930    812 

$embryos_stored_for_use_by_patient
x
          >20      0    1-5  11-15  16-20   6-10 
 11282    101 109022  41130   1116    245   6720 

$early_outcome
x
                                         Biochemical Pregnancy Only 
                            37096                              8966 
               Ectopic/Hetrotopic Intrauterine Fetal Pulsation Seen 
                              538                             46571 
                      Miscarriage                              None 
                             4937                             71508 

$heart_one_weeks_gestation
x
                                         30                    31 
               129263                   160                   221 
                   32                    33                    34 
                  290                   406                   676 
                   35                    36                    37 
                 1074                  2163                  4512 
                   38                    39                    40 
                 7211                 10456                  8615 
Greater than 40 weeks    Less than 30 weeks 
                 4024                   545 

$heart_one_birth_outcome
x
                               Ectotopic/Hetrotopic Pregnancy 
                        122806                             46 
              Embryo Reduction                    Live  Birth 
                            39                          40525 
             Lost to Follow Up                    Miscarriage 
                           431                           5261 
                   Still Birth                    Termination 
                           148                            360 

$heart_one_birth_weight
x
                                 5.5kg or greater Between 1.5kg and 1.99Kg 
                  129357                       27                     1048 
  Between 1kg and 1.49Kg Between 2.0kg and 2.49Kg Between 2.5kg and 2.99Kg 
                     527                     3067                     7241 
Between 3.0kg and 3.49Kg Between 3.5kg and 3.99Kg Between 4.0kg and 4.49Kg 
                   14630                    10179                     2779 
Between 4.5kg and 4.99Kg Between 5.0kg and 5.49Kg            Less than 1kg 
                     420                       56                      285 

$heart_one_sex
x
            F      M 
129123  19860  20633 

$heart_two_birth_outcome
x
                 Embryo Reduction      Live  Birth      Miscarriage 
          164891               12             4167              506 
     Still Birth 
              40 

$heart_two_birth_weight
x
                         Between 1.5kg and 1.99Kg   Between 1kg and 1.49Kg 
                  165515                      606                      220 
Between 2.0kg and 2.49Kg Between 2.5kg and 2.99Kg Between 3.0kg and 3.49Kg 
                    1438                     1082                      455 
Between 3.5kg and 3.99Kg Between 4.0kg and 4.49Kg Between 4.5kg and 4.99Kg 
                     131                       37                        8 
           Less than 1kg 
                     124 

$heart_two_sex
x
            F      M 
165466   2191   1959 

$patient_ethnicity
x
 Asian  Black  Mixed  Other  White 
 17119   4243   2573  30236 115445 

$partner_ethnicity
x
Any other ethnicity               Asian               Black               Mixed 
               3663               15030                3796                1843 
              Other               White 
              37889              107395 

$partner_type
x
   Female      Male       N/A      None Surrogate 
     9827    148986      3464      6712       627 

$partner_age
x
        >60 18-34 35-37 38-39 40-42 43-44 45-50 51-55 56-60   999 
16933   772 48438 31328 19597 21118 10352 15595  4145  1332     6 

$causes_of_infertility_tubal_disease
x
     0      1 
154648  14968 

$causes_of_infertility_ovulatory_disorder
x
     0      1 
151465  18151 

$causes_of_infertility_male_factor
x
     0      1 
117841  51775 

$causes_of_infertility_patient_unexplained
x
     0      1 
122981  46635 

$causes_of_infertility_endometriosis
x
     0      1 
160751   8865 

$stimulation_used
x
     0      1 
 64471 105145 

$pgt_m_treatment
x
     0      1 
168202   1414 

$pgt_a_treatment
x
     0      1 
167625   1991 

$eggs_thawed_0_1
x
     0      1 
168127   1489 

$fresh_eggs_stored_0_1
x
     0      1 
165770   3846 

$year_of_treatment
x
 2017  2018 
84398 85218 

$live_birth_occurrence
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.2419  0.0000  1.0000 

$number_of_live_births
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   0.000   0.000   0.264   0.000   3.000 

$clinical_outcome
x
Failure Success 
 128593   41023 
hfea |> skim()
Warning: There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names),
  mangled_skimmers$funs)`.
ℹ In group 0: .
Caused by warning:
! There were 17 warnings in `dplyr::summarize()`.
The first warning was:
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names),
  mangled_skimmers$funs)`.
Caused by warning in `sorted_count()`:
! Variable contains value(s) of "" that have been converted to "empty".
ℹ Run `dplyr::last_dplyr_warnings()` to see the 16 remaining warnings.
Data summary
Name hfea
Number of rows 169616
Number of columns 40
_______________________
Column type frequency:
factor 38
numeric 2
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
patient_age_at_treatment 0 1 FALSE 7 18-: 68365, 35-: 38567, 38-: 24365, 40-: 23813
total_number_of_previous_ivf_cycles 0 1 FALSE 7 0: 72758, 1: 40631, 2: 24011, 3: 13623
total_number_of_previous_di_cycles 0 1 FALSE 7 0: 157604, 1: 3763, 2: 2985, 3: 2323
egg_donor_age_at_registration 0 1 FALSE 6 emp: 160975, Bet: 3232, Bet: 2660, Bet: 1558
sperm_donor_age_at_registration 0 1 FALSE 8 emp: 144738, Bet: 6241, Bet: 5806, Bet: 5041
type_of_treatment_ivf_or_di 0 1 FALSE 2 IVF: 158334, DI: 11282
specific_treatment_type 0 1 FALSE 7 ICS: 60955, IVF: 58647, Unk: 38122, DI: 11282
egg_source 0 1 FALSE 3 Pat: 149657, emp: 11282, Don: 8677
sperm_source 0 1 FALSE 3 Par: 144639, Don: 24782, emp: 195
fresh_eggs_collected 0 1 FALSE 11 0: 60060, 6-1: 31555, 1-5: 25857, 11-: 21799
total_eggs_mixed 0 1 FALSE 11 0: 62337, 6-1: 33452, 1-5: 31160, 11-: 18623
total_embryos_created 0 1 FALSE 9 0: 66796, 1-5: 48335, 6-1: 29614, emp: 11282
total_embryos_thawed 0 1 FALSE 4 0: 119661, 1-5: 48930, 6-1: 812, >10: 213
embryos_stored_for_use_by_patient 0 1 FALSE 7 0: 109022, 1-5: 41130, emp: 11282, 6-1: 6720
early_outcome 0 1 FALSE 6 Non: 71508, Int: 46571, emp: 37096, Bio: 8966
heart_one_weeks_gestation 0 1 FALSE 14 emp: 129263, 39: 10456, 40: 8615, 38: 7211
heart_one_birth_outcome 0 1 FALSE 8 emp: 122806, Liv: 40525, Mis: 5261, Los: 431
heart_one_birth_weight 0 1 FALSE 12 emp: 129357, Bet: 14630, Bet: 10179, Bet: 7241
heart_one_sex 0 1 FALSE 3 emp: 129123, M: 20633, F: 19860
heart_two_birth_outcome 0 1 FALSE 5 emp: 164891, Liv: 4167, Mis: 506, Sti: 40
heart_two_birth_weight 0 1 FALSE 10 emp: 165515, Bet: 1438, Bet: 1082, Bet: 606
heart_two_sex 0 1 FALSE 3 emp: 165466, F: 2191, M: 1959
patient_ethnicity 0 1 FALSE 5 Whi: 115445, Oth: 30236, Asi: 17119, Bla: 4243
partner_ethnicity 0 1 FALSE 6 Whi: 107395, Oth: 37889, Asi: 15030, Bla: 3796
partner_type 0 1 FALSE 5 Mal: 148986, Fem: 9827, Non: 6712, N/A: 3464
partner_age 0 1 FALSE 11 18-: 48438, 35-: 31328, 40-: 21118, 38-: 19597
causes_of_infertility_tubal_disease 0 1 FALSE 2 0: 154648, 1: 14968
causes_of_infertility_ovulatory_disorder 0 1 FALSE 2 0: 151465, 1: 18151
causes_of_infertility_male_factor 0 1 FALSE 2 0: 117841, 1: 51775
causes_of_infertility_patient_unexplained 0 1 FALSE 2 0: 122981, 1: 46635
causes_of_infertility_endometriosis 0 1 FALSE 2 0: 160751, 1: 8865
stimulation_used 0 1 FALSE 2 1: 105145, 0: 64471
pgt_m_treatment 0 1 FALSE 2 0: 168202, 1: 1414
pgt_a_treatment 0 1 FALSE 2 0: 167625, 1: 1991
eggs_thawed_0_1 0 1 FALSE 2 0: 168127, 1: 1489
fresh_eggs_stored_0_1 0 1 FALSE 2 0: 165770, 1: 3846
year_of_treatment 0 1 FALSE 2 201: 85218, 201: 84398
clinical_outcome 0 1 FALSE 2 Fai: 128593, Suc: 41023

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
live_birth_occurrence 0 1 0.24 0.43 0 0 0 0 1 ▇▁▁▁▂
number_of_live_births 0 1 0.26 0.49 0 0 0 0 3 ▇▂▁▁▁
# Replace missing values with "0"

hfea$fresh_eggs_collected[hfea$fresh_eggs_collected == ""] <- "0"

hfea$total_eggs_mixed[hfea$total_eggs_mixed == ""] <- "0"

hfea$total_embryos_created[hfea$total_embryos_created == ""] <- "0"

hfea$embryos_stored_for_use_by_patient[hfea$embryos_stored_for_use_by_patient == ""] <- "0"

hfea$early_outcome[hfea$early_outcome == ""] <- "None"


# Remove variables with so much missing values 
hfea <- hfea |> 
  select(-c(live_birth_occurrence,
            number_of_live_births,
            year_of_treatment,
            heart_one_birth_outcome,
            heart_one_weeks_gestation,
            heart_one_birth_weight,
            heart_two_birth_weight,
            heart_two_birth_outcome,
            egg_donor_age_at_registration,
            
            sperm_donor_age_at_registration,
            heart_one_sex,
            heart_two_sex,
            egg_source,
            sperm_source
  ))

Remove blanks (““) and”999” from rows

hfea |> dim()
[1] 169616     26
# Drops the row if "" or 999 appears anywhere especially in the cloumn  partner_age
clean_hfea <- hfea |> 
  filter(!if_any(everything(), ~ .x %in% c("", "999")))

clean_hfea |> dim()
[1] 152677     26

Visualization

Charts

# Patient age
clean_hfea |> 
  select(patient_age_at_treatment) |> 
  count(patient_age_at_treatment) |> 
  ggplot(aes(x = reorder(patient_age_at_treatment, n), y = n, fill = patient_age_at_treatment)) + geom_col(show.legend = FALSE) + geom_text(aes(label = n, vjust = -0.5)) + theme_minimal() + theme(axis.title.y = element_text(angle = 0)) + labs(title = "Patients age group")

# Number of previous cycle
clean_hfea |> 
  select(total_number_of_previous_ivf_cycles) |> 
  count(total_number_of_previous_ivf_cycles) |> 
  ggplot(aes(x = reorder(total_number_of_previous_ivf_cycles, n), y = n, fill = total_number_of_previous_ivf_cycles)) + geom_col(show.legend = FALSE) + geom_text(aes(label = n, vjust = -0.5)) + theme_minimal() + theme(axis.title.y = element_text(angle = 0)) + labs(title = "Number of previous cycle")

# Eggs collected
clean_hfea |> 
  select(fresh_eggs_collected) |> 
  count(fresh_eggs_collected) |> 
  ggplot(aes(x = reorder(fresh_eggs_collected, n), y = n, fill = fresh_eggs_collected)) + geom_col(show.legend = FALSE) + geom_text(aes(label = n, vjust = -0.5)) + theme_minimal() + theme(axis.title.y = element_text(angle = 0)) + labs(title = "Fresh eggs collected")

# Total embryo created
clean_hfea |> 
  select(total_embryos_created) |> 
  count(total_embryos_created) |> 
  ggplot(aes(x = reorder(total_embryos_created, n), y = n, fill = total_embryos_created)) + geom_col(show.legend = FALSE) + geom_text(aes(label = n, vjust = -0.5)) + theme_minimal() + theme(axis.title.y = element_text(angle = 0)) + labs(title = "Total embryo created")

Horizontal Prevalence Bar Chart and Co-Occurrence Matrix Heatmap

# 1. Clean up and reshape the data for visualization
hfea_long <- clean_hfea |> 
  select(
    causes_of_infertility_tubal_disease,
    causes_of_infertility_ovulatory_disorder, 
    causes_of_infertility_male_factor,
    causes_of_infertility_patient_unexplained,
    causes_of_infertility_endometriosis
  ) |> 
  # Convert everything to standard 1 (Yes) and 0 (No) if they are factors/text strings
  mutate(across(everything(), ~ if_else(. == "Yes" | . == 1, 1, 0))) 

# Observe
hfea_long |> 
  select(
    causes_of_infertility_tubal_disease,
    causes_of_infertility_ovulatory_disorder, 
    causes_of_infertility_male_factor,
    causes_of_infertility_patient_unexplained,
    causes_of_infertility_endometriosis
  ) |>
  head()
# A tibble: 6 × 5
  causes_of_infertility_tubal_di…¹ causes_of_infertilit…² causes_of_infertilit…³
                             <dbl>                  <dbl>                  <dbl>
1                                0                      0                      1
2                                0                      0                      1
3                                0                      0                      0
4                                0                      0                      1
5                                0                      1                      1
6                                0                      0                      0
# ℹ abbreviated names: ¹​causes_of_infertility_tubal_disease,
#   ²​causes_of_infertility_ovulatory_disorder,
#   ³​causes_of_infertility_male_factor
# ℹ 2 more variables: causes_of_infertility_patient_unexplained <dbl>,
#   causes_of_infertility_endometriosis <dbl>

Horizontal Prevalence Bar Chart

hfea_long |> 
  summarise(across(everything(), sum)) |> 
  pivot_longer(cols = everything(), names_to = "Infertility_Cause", values_to = "Count") |> 
  mutate(
    Percentage = (Count / nrow(clean_hfea)) * 100,
    Infertility_Cause = str_remove(Infertility_Cause, "causes_of_infertility_") # Clean labels
  ) |> 
  ggplot(aes(x = reorder(Infertility_Cause, Percentage), y = Percentage)) +
  geom_col(fill = "steelblue", width = 0.6) +
  coord_flip() +
  labs(
    title = "Prevalence Matrix of Infertility Etiologies",
    subtitle = paste("Analysis of baseline cohort distribution (N =", nrow(clean_hfea), ")"),
    x = "Diagnostic Category", y = "Percentage of Total Cohort (%)"
  ) +
  theme_minimal()

Co-Occurrence Matrix Heatmap

# Calculate cross-product matrix to discover diagnostic intersections
co_matrix <- t(as.matrix(hfea_long)) %*% as.matrix(hfea_long)

as_tibble(co_matrix, rownames = "Var1") |> 
  pivot_longer(-Var1, names_to = "Var2", values_to = "Overlap_Count") |> 
  mutate(
    Var1 = str_remove(Var1, "causes_of_infertility_"),
    Var2 = str_remove(Var2, "causes_of_infertility_")
  ) |> 
  ggplot(aes(x = Var1, y = Var2, fill = Overlap_Count)) +
  geom_tile(color = "white") +
  geom_text(aes(label = scales::comma(Overlap_Count)), color = "black", size = 3.5) +
  scale_fill_gradient(low = "#e8f0fe", high = "#1a73e8") +
  labs(
    title = "Etiology Intersections & Patient Co-Occurrences",
    subtitle = "Heatmap showing overlapping diagnoses across the clinical registry",
    x = NULL, y = NULL, fill = "Patient Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

UpSet Plot

# 1. Isolate and clean the multi-label categorical features
upset_data <- clean_hfea |> 
  select(
    causes_of_infertility_tubal_disease,
    causes_of_infertility_ovulatory_disorder, 
    causes_of_infertility_male_factor,
    causes_of_infertility_patient_unexplained,
    causes_of_infertility_endometriosis
  ) |> 
  # ComplexUpset requires logical (TRUE/FALSE) or binary (1/0) matrices
  mutate(across(everything(), ~ if_else(. == "Yes" | . == 1, TRUE, FALSE))) |> 
  # Rename columns cleanly so the chart axis labels look ready for publication
  rename(
    `Tubal Disease` = causes_of_infertility_tubal_disease,
    `Ovulatory Disorder` = causes_of_infertility_ovulatory_disorder,
    `Male Factor` = causes_of_infertility_male_factor,
    `Unexplained` = causes_of_infertility_patient_unexplained,
    `Endometriosis` = causes_of_infertility_endometriosis
  )

# 2. Extract the clean column names to serve as our target sets
infertility_sets <- colnames(upset_data)

# 3. Generate the ComplexUpSet Visualization
suppressWarnings(
  upset(
    upset_data,
    infertility_sets,
    name = "Diagnostic Intersections / Combinations",
    width_ratio = 0.3, # Adjusts balance between left set sizes and main matrix
    height_ratio = 0.6, # Adjusts balance between top bar chart and main matrix
    stripes = upset_stripes(
      mapping = aes(color = "grey95"),
      colors = c("white", "grey95")
    ),
    base_annotations = list(
      'Intersection Size' = intersection_size(
        counts = TRUE, # Shows numerical patient totals on top of the bars
        mapping = aes(fill = "steelblue")
      ) + 
        scale_fill_identity() +
        theme_minimal() +
        labs(y = "Patient Cohort Size")
    ),
    set_sizes = upset_set_size() + 
      theme_minimal() +
      labs(x = "Total Prevalence per Cause")
  ) +
    labs(
      title = "Multi-Label Clinical Cohort Mapping via UpSet Intersection Design",
      subtitle = paste("Comprehensive mapping of standalone and overlapping etiologies (N =", scales::comma(nrow(clean_hfea)), ")")
    )
)

Deep learning with Tabnet

Preprocess

# Set seeds for R and torch
torch::torch_manual_seed(13)
set.seed(13)

# Data splitting 
data_split <- initial_split(clean_hfea, prop = 0.80, strata = clinical_outcome)

train_split <- data_split |> training()
test_split <- data_split |> testing()

# Preprocessing
hfea_recipe <- recipe(clinical_outcome ~ ., data = train_split) |> 
  step_novel(all_nominal_predictors()) |> 
  #step_dummy(all_nominal_predictors(), -all_outcomes(), one_hot = FALSE) |>
  step_zv(all_nominal_predictors()) |>
  step_normalize(all_numeric_predictors()) |> 
  step_downsample(clinical_outcome, under_ratio = 1) # Solve the 3:1 outcome variable mild imbalance

Transformer with tabnet

# Detect how many logical cores your computer has
all_cores <- parallel::detectCores(logical = TRUE)

# Register a cluster using all available cores except 1 (leaves 1 core free so my PC doesn't freeze)
cl <- makePSOCKcluster(all_cores - 1)
registerDoParallel(cl)

cat("Parallel backend registered with", all_cores - 1, "cores.\n")
Parallel backend registered with 3 cores.
# deep learning with tabnet
tabnet_tune_spec <- tabnet(
  epochs = 40,               # Number of training epochs over dataset
  batch_size = 256,          # Sub-cohort sizing optimized for local RAM
  decision_width = 16,       # Width of the prediction layer
  attention_width = 16,      # Width of the attention embedding mask
  num_steps = 4,             # Sequential attention steps (depth of processing)
  learn_rate = 0.01          # Gradient descent step velocity
) |> 
  set_engine("torch") |> 
  set_mode("classification")

# Combine into a unified tuning workflow
transformer_workflow <- workflow() |> 
  add_recipe(hfea_recipe) |> 
  add_model(tabnet_tune_spec)

# Execute Attention-Based Model Optimization Loop
cat("--- Commencing Transformer Optimization Pipeline ---\n")
--- Commencing Transformer Optimization Pipeline ---
fitted_transformer <- fit(transformer_workflow, data = train_split)

# Predict
predictions <- augment(fitted_transformer, test_split)
ivf_metrics <- metric_set(accuracy, sensitivity, specificity, precision, npv)

predictions |> 
  ivf_metrics(truth = clinical_outcome, 
              estimate = .pred_class,          
              .pred_Success)
# A tibble: 5 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary         0.963
2 sensitivity binary         0.952
3 specificity binary         0.996
4 precision   binary         0.999
5 npv         binary         0.878

Attention map (Clinical interpretability)

#Extracting raw TabNet feature importances driven by internal cross-attention masks
extracted_fit <- extract_fit_engine(fitted_transformer)
explain_data  <- bake(prep(hfea_recipe), test_split)

# Extract attention masks
attention_explain <- tabnet_explain(extracted_fit, explain_data)

# Render Global Interpretability Plot for Clinical Support Decisions
autoplot(attention_explain) +
  labs(title = "Clinical Decision Support: Global Transformer Attention Weights",
       subtitle = "Feature importance maps computed natively via multi-step attention layer interactions",
       x = "Clinical Predictor", y = "Aggregated Attention Score") +
  theme_minimal(base_size = 14)

# Feature Importance Vip Plot
# 1. Extract the underlying engine fit cleanly
extracted_fit <- extract_fit_engine(fitted_transformer)

# 2. Generate the plot with your specific fill and theme styles
vip(extracted_fit, aesthetics = list(fill = "turquoise1", color = "black")) + 
  theme_minimal(base_size = 14) +
  labs(
    title = "Transformer Feature Importance Profile",
    subtitle = "Attribution weights derived natively from cross-attention masks",
    y = "Importance Score",
    x = "Clinical Predictor"
  )

# Close multicore
stopCluster(cl)
registerDoSEQ() # Returns R to standard single-threaded behavior