library(pacman)
p_load(tidyverse, tidymodels, skimr, janitor, tabnet, torch, vip, themis, doParallel, parallel, ComplexUpset)
# 1st file
hfea_2017_2018 <- read_csv("2017-2018-xlsb.csv", col_types = cols(), na = "NA")
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
dat <- vroom(...)
problems(dat)
hfea_2017_2018 |> skim()
| Name | hfea_2017_2018 |
| Number of rows | 169616 |
| Number of columns | 61 |
| _______________________ | |
| Column type frequency: | |
| character | 27 |
| logical | 7 |
| numeric | 27 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Patient age at treatment | 0 | 1 | 3 | 5 | 0 | 7 | 0 |
| Total number of previous IVF cycles | 0 | 1 | 1 | 2 | 0 | 7 | 0 |
| Total number of previous DI cycles | 0 | 1 | 1 | 2 | 0 | 7 | 0 |
| Main reason for producing embroys storing eggs | 0 | 1 | 8 | 15 | 0 | 7 | 0 |
| Egg donor age at registration | 0 | 1 | 0 | 17 | 160975 | 6 | 0 |
| Sperm donor age at registration | 0 | 1 | 0 | 17 | 144738 | 8 | 0 |
| Type of treatment - IVF or DI | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| Specific treatment type | 0 | 1 | 2 | 12 | 0 | 7 | 0 |
| Egg source | 0 | 1 | 0 | 7 | 11282 | 3 | 0 |
| Sperm source | 0 | 1 | 0 | 7 | 195 | 3 | 0 |
| Fresh eggs collected | 0 | 1 | 0 | 5 | 11282 | 11 | 0 |
| Total eggs mixed | 0 | 1 | 0 | 5 | 11282 | 11 | 0 |
| Total embryos created | 0 | 1 | 0 | 5 | 11282 | 9 | 0 |
| Total embryos thawed | 0 | 1 | 1 | 4 | 0 | 4 | 0 |
| Embryos stored for use by patient | 0 | 1 | 0 | 5 | 11282 | 7 | 0 |
| Early outcome | 0 | 1 | 0 | 33 | 37096 | 6 | 0 |
| Heart one weeks gestation | 0 | 1 | 0 | 21 | 129263 | 14 | 0 |
| Heart one birth outcome | 0 | 1 | 0 | 30 | 122806 | 8 | 0 |
| Heart one birth weight | 0 | 1 | 0 | 24 | 129357 | 12 | 0 |
| Heart one sex | 0 | 1 | 0 | 1 | 129123 | 3 | 0 |
| Heart two birth outcome | 0 | 1 | 0 | 16 | 164891 | 5 | 0 |
| Heart two birth weight | 0 | 1 | 0 | 24 | 165515 | 10 | 0 |
| Heart two sex | 0 | 1 | 0 | 1 | 165466 | 3 | 0 |
| Patient ethnicity | 0 | 1 | 5 | 5 | 0 | 5 | 0 |
| Partner ethnicity | 0 | 1 | 5 | 19 | 0 | 6 | 0 |
| Partner Type | 0 | 1 | 3 | 9 | 0 | 5 | 0 |
| Partner age | 0 | 1 | 0 | 5 | 16933 | 11 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| Heart two birth congenital abnormalities | 169579 | 0 | 1 | TRU: 37 |
| Heart three weeks gestation | 169616 | 0 | NaN | : |
| Heart three birth outcome | 169616 | 0 | NaN | : |
| Heart three birth weight | 169616 | 0 | NaN | : |
| Heart three sex | 169564 | 0 | 0 | FAL: 52 |
| Heart three birth congenital abnormalities | 169579 | 0 | 1 | TRU: 37 |
| Heart three delivery date | 169616 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Total number of previous pregnancies - IVF and DI | 126928 | 0.25 | 0.81 | NaN | 0 | 0 | 1 | 1 | 5 | ▇▁▁▁▁ |
| Total number of previous live births - IVF or DI | 64546 | 0.62 | 0.23 | 0.45 | 0 | 0 | 0 | 0 | 3 | ▇▂▁▁▁ |
| Causes of infertility - tubal disease | 0 | 1.00 | 0.09 | 0.28 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| Causes of infertility - ovulatory disorder | 0 | 1.00 | 0.11 | 0.31 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| Causes of infertility - male factor | 0 | 1.00 | 0.31 | 0.46 | 0 | 0 | 0 | 1 | 1 | ▇▁▁▁▃ |
| Causes of infertility - patient unexplained | 0 | 1.00 | 0.27 | 0.45 | 0 | 0 | 0 | 1 | 1 | ▇▁▁▁▃ |
| Causes of infertility - endometriosis | 0 | 1.00 | 0.05 | 0.22 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| Stimulation used | 0 | 1.00 | 0.62 | 0.49 | 0 | 0 | 1 | 1 | 1 | ▅▁▁▁▇ |
| Donated embryo | 11282 | 0.93 | 0.01 | 0.10 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| PGT-M treatment | 0 | 1.00 | 0.01 | 0.09 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| PGT-A treatment | 0 | 1.00 | 0.01 | 0.11 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| Elective single embryo transfer | 11282 | 0.93 | 0.34 | 0.47 | 0 | 0 | 0 | 1 | 1 | ▇▁▁▁▅ |
| Fresh cycle | 11282 | 0.93 | 0.69 | 0.46 | 0 | 0 | 1 | 1 | 1 | ▃▁▁▁▇ |
| Frozen cycle | 11282 | 0.93 | 0.31 | 0.46 | 0 | 0 | 0 | 1 | 1 | ▇▁▁▁▃ |
| Eggs thawed (0/1) | 0 | 1.00 | 0.01 | 0.09 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| Fresh eggs stored (0/1) | 0 | 1.00 | 0.02 | 0.15 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| Embryos transferred | 11282 | 0.93 | 1.02 | 0.72 | 0 | 1 | 1 | 1 | 3 | ▃▇▁▃▁ |
| Embryos transferred from eggs micro-injected | 84734 | 0.50 | 0.78 | 0.80 | 0 | 0 | 1 | 1 | 3 | ▇▆▁▃▁ |
| Date of embryo transfer | 48782 | 0.71 | 5.39 | 52.12 | 0 | 0 | 3 | 5 | 999 | ▇▁▁▁▁ |
| Year of treatment | 0 | 1.00 | 2017.50 | 0.50 | 2017 | 2017 | 2018 | 2018 | 2018 | ▇▁▁▁▇ |
| Live birth occurrence | 0 | 1.00 | 0.24 | 0.43 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▂ |
| Number of live births | 0 | 1.00 | 0.26 | 0.49 | 0 | 0 | 0 | 0 | 3 | ▇▂▁▁▁ |
| Number of foetal sacs with fetal pulsation | 109775 | 0.35 | 0.87 | 0.53 | 0 | 1 | 1 | 1 | 3 | ▂▇▁▁▁ |
| Heart one delivery date | 129223 | 0.24 | 2018.18 | 0.68 | 2017 | 2018 | 2018 | 2019 | 2019 | ▂▁▇▁▆ |
| Heart one birth congenital abnormalities | 169173 | 0.00 | 1.00 | 0.00 | 1 | 1 | 1 | 1 | 1 | ▁▁▇▁▁ |
| Heart two weeks gestation | 165679 | 0.02 | 35.92 | NaN | 30 | 35 | 36 | 37 | 40 | ▂▂▇▇▂ |
| Heart two delivery date | 165473 | 0.02 | 2018.05 | NaN | 2017 | 2018 | 2018 | 2019 | 2019 | ▃▁▇▁▃ |
# Clean names
hfea_2017_2018 <- hfea_2017_2018 |>
clean_names()
# Select required variables
hfea <- hfea_2017_2018 |>
select(
patient_age_at_treatment,
total_number_of_previous_ivf_cycles,
total_number_of_previous_di_cycles,
egg_donor_age_at_registration,
sperm_donor_age_at_registration,
type_of_treatment_ivf_or_di,
specific_treatment_type,
egg_source,
sperm_source,
fresh_eggs_collected,
total_eggs_mixed,
total_embryos_created,
total_embryos_thawed,
embryos_stored_for_use_by_patient,
early_outcome,
heart_one_weeks_gestation,
heart_one_birth_outcome,
heart_one_birth_weight,
heart_one_sex,
heart_two_birth_outcome,
heart_two_birth_weight,
heart_two_sex,
patient_ethnicity,
partner_ethnicity,
partner_type,
partner_age,
causes_of_infertility_tubal_disease,
causes_of_infertility_ovulatory_disorder,
causes_of_infertility_male_factor,
causes_of_infertility_patient_unexplained,
causes_of_infertility_endometriosis,
stimulation_used,
pgt_m_treatment,
pgt_a_treatment,
eggs_thawed_0_1,
fresh_eggs_stored_0_1,
year_of_treatment,
live_birth_occurrence,
number_of_live_births
)
hfea |> str()
tibble [169,616 × 39] (S3: tbl_df/tbl/data.frame)
$ patient_age_at_treatment : chr [1:169616] "45-50" "18-34" "999" "18-34" ...
$ total_number_of_previous_ivf_cycles : chr [1:169616] "3" "0" "0" "0" ...
$ total_number_of_previous_di_cycles : chr [1:169616] "0" "0" "0" "0" ...
$ egg_donor_age_at_registration : chr [1:169616] "" "" "" "" ...
$ sperm_donor_age_at_registration : chr [1:169616] "Between 36 and 40" "" "" "" ...
$ type_of_treatment_ivf_or_di : chr [1:169616] "IVF" "IVF" "IVF" "IVF" ...
$ specific_treatment_type : chr [1:169616] "ICSI" "IVF" "IVF" "ICSI" ...
$ egg_source : chr [1:169616] "Patient" "Patient" "Patient" "Patient" ...
$ sperm_source : chr [1:169616] "Donor" "Partner" "Partner" "Partner" ...
$ fresh_eggs_collected : chr [1:169616] "1-5" "21-25" "21-25" "1-5" ...
$ total_eggs_mixed : chr [1:169616] "1-5" "0" "0" "1-5" ...
$ total_embryos_created : chr [1:169616] "0" "0" "0" "1-5" ...
$ total_embryos_thawed : chr [1:169616] "0" "0" "0" "0" ...
$ embryos_stored_for_use_by_patient : chr [1:169616] "0" "0" "0" "1-5" ...
$ early_outcome : chr [1:169616] "" "" "" "None" ...
$ heart_one_weeks_gestation : chr [1:169616] "" "" "" "" ...
$ heart_one_birth_outcome : chr [1:169616] "" "" "" "" ...
$ heart_one_birth_weight : chr [1:169616] "" "" "" "" ...
$ heart_one_sex : chr [1:169616] "" "" "" "" ...
$ heart_two_birth_outcome : chr [1:169616] "" "" "" "" ...
$ heart_two_birth_weight : chr [1:169616] "" "" "" "" ...
$ heart_two_sex : chr [1:169616] "" "" "" "" ...
$ patient_ethnicity : chr [1:169616] "Black" "White" "Other" "White" ...
$ partner_ethnicity : chr [1:169616] "Other" "Other" "Other" "White" ...
$ partner_type : chr [1:169616] "Male" "N/A" "N/A" "Male" ...
$ partner_age : chr [1:169616] "" "" "" "18-34" ...
$ causes_of_infertility_tubal_disease : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
$ causes_of_infertility_ovulatory_disorder : num [1:169616] 0 0 0 0 0 0 0 1 0 0 ...
$ causes_of_infertility_male_factor : num [1:169616] 0 0 0 1 1 0 1 1 0 0 ...
$ causes_of_infertility_patient_unexplained: num [1:169616] 0 0 0 0 0 1 0 0 1 1 ...
$ causes_of_infertility_endometriosis : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
$ stimulation_used : num [1:169616] 0 1 1 1 0 1 1 1 1 1 ...
$ pgt_m_treatment : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
$ pgt_a_treatment : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
$ eggs_thawed_0_1 : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
$ fresh_eggs_stored_0_1 : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
$ year_of_treatment : num [1:169616] 2017 2018 2018 2018 2018 ...
$ live_birth_occurrence : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
$ number_of_live_births : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
hfea |> glimpse()
Rows: 169,616
Columns: 39
$ patient_age_at_treatment <chr> "45-50", "18-34", "999", "18…
$ total_number_of_previous_ivf_cycles <chr> "3", "0", "0", "0", "1", "0"…
$ total_number_of_previous_di_cycles <chr> "0", "0", "0", "0", "0", "0"…
$ egg_donor_age_at_registration <chr> "", "", "", "", "", "", "", …
$ sperm_donor_age_at_registration <chr> "Between 36 and 40", "", "",…
$ type_of_treatment_ivf_or_di <chr> "IVF", "IVF", "IVF", "IVF", …
$ specific_treatment_type <chr> "ICSI", "IVF", "IVF", "ICSI"…
$ egg_source <chr> "Patient", "Patient", "Patie…
$ sperm_source <chr> "Donor", "Partner", "Partner…
$ fresh_eggs_collected <chr> "1-5", "21-25", "21-25", "1-…
$ total_eggs_mixed <chr> "1-5", "0", "0", "1-5", "0",…
$ total_embryos_created <chr> "0", "0", "0", "1-5", "0", "…
$ total_embryos_thawed <chr> "0", "0", "0", "0", "1-5", "…
$ embryos_stored_for_use_by_patient <chr> "0", "0", "0", "1-5", "0", "…
$ early_outcome <chr> "", "", "", "None", "None", …
$ heart_one_weeks_gestation <chr> "", "", "", "", "", "", "", …
$ heart_one_birth_outcome <chr> "", "", "", "", "", "", "", …
$ heart_one_birth_weight <chr> "", "", "", "", "", "", "", …
$ heart_one_sex <chr> "", "", "", "", "", "", "", …
$ heart_two_birth_outcome <chr> "", "", "", "", "", "", "", …
$ heart_two_birth_weight <chr> "", "", "", "", "", "", "", …
$ heart_two_sex <chr> "", "", "", "", "", "", "", …
$ patient_ethnicity <chr> "Black", "White", "Other", "…
$ partner_ethnicity <chr> "Other", "Other", "Other", "…
$ partner_type <chr> "Male", "N/A", "N/A", "Male"…
$ partner_age <chr> "", "", "", "18-34", "18-34"…
$ causes_of_infertility_tubal_disease <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ causes_of_infertility_ovulatory_disorder <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
$ causes_of_infertility_male_factor <dbl> 0, 0, 0, 1, 1, 0, 1, 1, 0, 0…
$ causes_of_infertility_patient_unexplained <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1…
$ causes_of_infertility_endometriosis <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ stimulation_used <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 1, 1…
$ pgt_m_treatment <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ pgt_a_treatment <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ eggs_thawed_0_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ fresh_eggs_stored_0_1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ year_of_treatment <dbl> 2017, 2018, 2018, 2018, 2018…
$ live_birth_occurrence <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ number_of_live_births <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Variables conversion
hfea_names <- c("patient_age_at_treatment",
"total_number_of_previous_ivf_cycles",
"total_number_of_previous_di_cycles",
"egg_donor_age_at_registration",
"sperm_donor_age_at_registration",
"type_of_treatment_ivf_or_di",
"specific_treatment_type",
"egg_source",
"sperm_source",
"fresh_eggs_collected",
"total_eggs_mixed",
"total_embryos_created",
"total_embryos_thawed",
"embryos_stored_for_use_by_patient",
"early_outcome",
"heart_one_weeks_gestation",
"heart_one_birth_outcome",
"heart_one_birth_weight",
"heart_one_sex",
"heart_two_birth_outcome",
"heart_two_birth_weight",
"heart_two_sex",
"patient_ethnicity",
"partner_ethnicity",
"partner_type",
"partner_age",
"causes_of_infertility_tubal_disease",
"causes_of_infertility_ovulatory_disorder",
"causes_of_infertility_male_factor",
"causes_of_infertility_patient_unexplained",
"causes_of_infertility_endometriosis",
"stimulation_used",
"pgt_m_treatment",
"pgt_a_treatment",
"eggs_thawed_0_1",
"fresh_eggs_stored_0_1",
"year_of_treatment")
hfea <- hfea |>
mutate(across(all_of(hfea_names), factor))
# Create categorical outcome variable
hfea <- hfea |>
mutate(
clinical_outcome = factor(
case_when(
live_birth_occurrence == 1 ~ "Success",
TRUE ~ "Failure"
)
)
)
hfea |> str()
tibble [169,616 × 40] (S3: tbl_df/tbl/data.frame)
$ patient_age_at_treatment : Factor w/ 7 levels "18-34","35-37",..: 6 1 7 1 1 4 4 1 3 3 ...
$ total_number_of_previous_ivf_cycles : Factor w/ 7 levels ">5","0","1","2",..: 5 2 2 2 3 2 2 2 2 3 ...
$ total_number_of_previous_di_cycles : Factor w/ 7 levels ">5","0","1","2",..: 2 2 2 2 2 2 2 2 2 2 ...
$ egg_donor_age_at_registration : Factor w/ 6 levels "","<= 20",">35",..: 1 1 1 1 1 1 1 1 1 1 ...
$ sperm_donor_age_at_registration : Factor w/ 8 levels "","<= 20",">45",..: 7 1 1 1 1 1 1 1 1 1 ...
$ type_of_treatment_ivf_or_di : Factor w/ 2 levels "DI","IVF": 2 2 2 2 2 2 2 2 2 2 ...
$ specific_treatment_type : Factor w/ 7 levels "DI","ICSI","ICSI:IVF",..: 2 5 5 2 7 5 2 2 5 5 ...
$ egg_source : Factor w/ 3 levels "","Donor","Patient": 3 3 3 3 3 3 3 3 3 3 ...
$ sperm_source : Factor w/ 3 levels "","Donor","Partner": 2 3 3 3 3 3 3 3 3 3 ...
$ fresh_eggs_collected : Factor w/ 11 levels "",">40","0","1-5",..: 4 7 7 4 3 4 4 5 11 11 ...
$ total_eggs_mixed : Factor w/ 11 levels "",">40","0","1-5",..: 4 3 3 4 3 4 4 5 11 11 ...
$ total_embryos_created : Factor w/ 9 levels "",">30","0","1-5",..: 3 3 3 4 3 4 4 9 4 4 ...
$ total_embryos_thawed : Factor w/ 4 levels ">10","0","1-5",..: 2 2 2 2 3 2 2 2 2 2 ...
$ embryos_stored_for_use_by_patient : Factor w/ 7 levels "",">20","0","1-5",..: 3 3 3 4 3 3 3 3 3 3 ...
$ early_outcome : Factor w/ 6 levels "","Biochemical Pregnancy Only",..: 1 1 1 6 6 6 6 6 6 6 ...
$ heart_one_weeks_gestation : Factor w/ 14 levels "","30","31","32",..: 1 1 1 1 1 1 1 1 1 1 ...
$ heart_one_birth_outcome : Factor w/ 8 levels "","Ectotopic/Hetrotopic Pregnancy",..: 1 1 1 1 1 1 1 1 1 1 ...
$ heart_one_birth_weight : Factor w/ 12 levels "","5.5kg or greater",..: 1 1 1 1 1 1 1 1 1 1 ...
$ heart_one_sex : Factor w/ 3 levels "","F","M": 1 1 1 1 1 1 1 1 1 1 ...
$ heart_two_birth_outcome : Factor w/ 5 levels "","Embryo Reduction",..: 1 1 1 1 1 1 1 1 1 1 ...
$ heart_two_birth_weight : Factor w/ 10 levels "","Between 1.5kg and 1.99Kg",..: 1 1 1 1 1 1 1 1 1 1 ...
$ heart_two_sex : Factor w/ 3 levels "","F","M": 1 1 1 1 1 1 1 1 1 1 ...
$ patient_ethnicity : Factor w/ 5 levels "Asian","Black",..: 2 5 4 5 5 5 5 5 5 5 ...
$ partner_ethnicity : Factor w/ 6 levels "Any other ethnicity",..: 5 5 5 6 6 6 6 6 6 6 ...
$ partner_type : Factor w/ 5 levels "Female","Male",..: 2 3 3 2 2 2 2 2 2 2 ...
$ partner_age : Factor w/ 11 levels "",">60","18-34",..: 1 1 1 3 3 6 5 3 6 6 ...
$ causes_of_infertility_tubal_disease : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ causes_of_infertility_ovulatory_disorder : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
$ causes_of_infertility_male_factor : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 2 1 1 ...
$ causes_of_infertility_patient_unexplained: Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 2 ...
$ causes_of_infertility_endometriosis : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ stimulation_used : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 2 2 2 2 ...
$ pgt_m_treatment : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ pgt_a_treatment : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ eggs_thawed_0_1 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ fresh_eggs_stored_0_1 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ year_of_treatment : Factor w/ 2 levels "2017","2018": 1 2 2 2 2 1 1 2 1 1 ...
$ live_birth_occurrence : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
$ number_of_live_births : num [1:169616] 0 0 0 0 0 0 0 0 0 0 ...
$ clinical_outcome : Factor w/ 2 levels "Failure","Success": 1 1 1 1 1 1 1 1 1 1 ...
lapply(hfea, function(x) {
if (is.numeric(x)) return(summary(x))
if (is.factor(x)) return(table(x))
})
$patient_age_at_treatment
x
18-34 35-37 38-39 40-42 43-44 45-50 999
68365 38567 24365 23813 7292 4545 2669
$total_number_of_previous_ivf_cycles
x
>5 0 1 2 3 4 5
6498 72758 40631 24011 13623 7695 4400
$total_number_of_previous_di_cycles
x
>5 0 1 2 3 4 5
1115 157604 3763 2985 2323 1091 735
$egg_donor_age_at_registration
x
<= 20 >35 Between 21 and 25
160975 324 867 1558
Between 26 and 30 Between 31 and 35
2660 3232
$sperm_donor_age_at_registration
x
<= 20 >45 Between 21 and 25
144738 1069 481 5806
Between 26 and 30 Between 31 and 35 Between 36 and 40 Between 41 and 45
6241 5041 4475 1765
$type_of_treatment_ivf_or_di
x
DI IVF
11282 158334
$specific_treatment_type
x
DI ICSI ICSI:IVF ICSI:Unknown IVF IVF:Unknown
11282 60955 430 133 58647 47
Unknown
38122
$egg_source
x
Donor Patient
11282 8677 149657
$sperm_source
x
Donor Partner
195 24782 144639
$fresh_eggs_collected
x
>40 0 1-5 11-15 16-20 21-25 26-30 31-35 36-40 6-10
11282 274 60060 25857 21799 10986 4750 1983 753 317 31555
$total_eggs_mixed
x
>40 0 1-5 11-15 16-20 21-25 26-30 31-35 36-40 6-10
11282 122 62337 31160 18623 7982 3021 1088 403 146 33452
$total_embryos_created
x
>30 0 1-5 11-15 16-20 21-25 26-30 6-10
11282 84 66796 48335 9882 2728 689 206 29614
$total_embryos_thawed
x
>10 0 1-5 6-10
213 119661 48930 812
$embryos_stored_for_use_by_patient
x
>20 0 1-5 11-15 16-20 6-10
11282 101 109022 41130 1116 245 6720
$early_outcome
x
Biochemical Pregnancy Only
37096 8966
Ectopic/Hetrotopic Intrauterine Fetal Pulsation Seen
538 46571
Miscarriage None
4937 71508
$heart_one_weeks_gestation
x
30 31
129263 160 221
32 33 34
290 406 676
35 36 37
1074 2163 4512
38 39 40
7211 10456 8615
Greater than 40 weeks Less than 30 weeks
4024 545
$heart_one_birth_outcome
x
Ectotopic/Hetrotopic Pregnancy
122806 46
Embryo Reduction Live Birth
39 40525
Lost to Follow Up Miscarriage
431 5261
Still Birth Termination
148 360
$heart_one_birth_weight
x
5.5kg or greater Between 1.5kg and 1.99Kg
129357 27 1048
Between 1kg and 1.49Kg Between 2.0kg and 2.49Kg Between 2.5kg and 2.99Kg
527 3067 7241
Between 3.0kg and 3.49Kg Between 3.5kg and 3.99Kg Between 4.0kg and 4.49Kg
14630 10179 2779
Between 4.5kg and 4.99Kg Between 5.0kg and 5.49Kg Less than 1kg
420 56 285
$heart_one_sex
x
F M
129123 19860 20633
$heart_two_birth_outcome
x
Embryo Reduction Live Birth Miscarriage
164891 12 4167 506
Still Birth
40
$heart_two_birth_weight
x
Between 1.5kg and 1.99Kg Between 1kg and 1.49Kg
165515 606 220
Between 2.0kg and 2.49Kg Between 2.5kg and 2.99Kg Between 3.0kg and 3.49Kg
1438 1082 455
Between 3.5kg and 3.99Kg Between 4.0kg and 4.49Kg Between 4.5kg and 4.99Kg
131 37 8
Less than 1kg
124
$heart_two_sex
x
F M
165466 2191 1959
$patient_ethnicity
x
Asian Black Mixed Other White
17119 4243 2573 30236 115445
$partner_ethnicity
x
Any other ethnicity Asian Black Mixed
3663 15030 3796 1843
Other White
37889 107395
$partner_type
x
Female Male N/A None Surrogate
9827 148986 3464 6712 627
$partner_age
x
>60 18-34 35-37 38-39 40-42 43-44 45-50 51-55 56-60 999
16933 772 48438 31328 19597 21118 10352 15595 4145 1332 6
$causes_of_infertility_tubal_disease
x
0 1
154648 14968
$causes_of_infertility_ovulatory_disorder
x
0 1
151465 18151
$causes_of_infertility_male_factor
x
0 1
117841 51775
$causes_of_infertility_patient_unexplained
x
0 1
122981 46635
$causes_of_infertility_endometriosis
x
0 1
160751 8865
$stimulation_used
x
0 1
64471 105145
$pgt_m_treatment
x
0 1
168202 1414
$pgt_a_treatment
x
0 1
167625 1991
$eggs_thawed_0_1
x
0 1
168127 1489
$fresh_eggs_stored_0_1
x
0 1
165770 3846
$year_of_treatment
x
2017 2018
84398 85218
$live_birth_occurrence
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.2419 0.0000 1.0000
$number_of_live_births
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 0.264 0.000 3.000
$clinical_outcome
x
Failure Success
128593 41023
hfea |> skim()
Warning: There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names),
mangled_skimmers$funs)`.
ℹ In group 0: .
Caused by warning:
! There were 17 warnings in `dplyr::summarize()`.
The first warning was:
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names),
mangled_skimmers$funs)`.
Caused by warning in `sorted_count()`:
! Variable contains value(s) of "" that have been converted to "empty".
ℹ Run `dplyr::last_dplyr_warnings()` to see the 16 remaining warnings.
| Name | hfea |
| Number of rows | 169616 |
| Number of columns | 40 |
| _______________________ | |
| Column type frequency: | |
| factor | 38 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| patient_age_at_treatment | 0 | 1 | FALSE | 7 | 18-: 68365, 35-: 38567, 38-: 24365, 40-: 23813 |
| total_number_of_previous_ivf_cycles | 0 | 1 | FALSE | 7 | 0: 72758, 1: 40631, 2: 24011, 3: 13623 |
| total_number_of_previous_di_cycles | 0 | 1 | FALSE | 7 | 0: 157604, 1: 3763, 2: 2985, 3: 2323 |
| egg_donor_age_at_registration | 0 | 1 | FALSE | 6 | emp: 160975, Bet: 3232, Bet: 2660, Bet: 1558 |
| sperm_donor_age_at_registration | 0 | 1 | FALSE | 8 | emp: 144738, Bet: 6241, Bet: 5806, Bet: 5041 |
| type_of_treatment_ivf_or_di | 0 | 1 | FALSE | 2 | IVF: 158334, DI: 11282 |
| specific_treatment_type | 0 | 1 | FALSE | 7 | ICS: 60955, IVF: 58647, Unk: 38122, DI: 11282 |
| egg_source | 0 | 1 | FALSE | 3 | Pat: 149657, emp: 11282, Don: 8677 |
| sperm_source | 0 | 1 | FALSE | 3 | Par: 144639, Don: 24782, emp: 195 |
| fresh_eggs_collected | 0 | 1 | FALSE | 11 | 0: 60060, 6-1: 31555, 1-5: 25857, 11-: 21799 |
| total_eggs_mixed | 0 | 1 | FALSE | 11 | 0: 62337, 6-1: 33452, 1-5: 31160, 11-: 18623 |
| total_embryos_created | 0 | 1 | FALSE | 9 | 0: 66796, 1-5: 48335, 6-1: 29614, emp: 11282 |
| total_embryos_thawed | 0 | 1 | FALSE | 4 | 0: 119661, 1-5: 48930, 6-1: 812, >10: 213 |
| embryos_stored_for_use_by_patient | 0 | 1 | FALSE | 7 | 0: 109022, 1-5: 41130, emp: 11282, 6-1: 6720 |
| early_outcome | 0 | 1 | FALSE | 6 | Non: 71508, Int: 46571, emp: 37096, Bio: 8966 |
| heart_one_weeks_gestation | 0 | 1 | FALSE | 14 | emp: 129263, 39: 10456, 40: 8615, 38: 7211 |
| heart_one_birth_outcome | 0 | 1 | FALSE | 8 | emp: 122806, Liv: 40525, Mis: 5261, Los: 431 |
| heart_one_birth_weight | 0 | 1 | FALSE | 12 | emp: 129357, Bet: 14630, Bet: 10179, Bet: 7241 |
| heart_one_sex | 0 | 1 | FALSE | 3 | emp: 129123, M: 20633, F: 19860 |
| heart_two_birth_outcome | 0 | 1 | FALSE | 5 | emp: 164891, Liv: 4167, Mis: 506, Sti: 40 |
| heart_two_birth_weight | 0 | 1 | FALSE | 10 | emp: 165515, Bet: 1438, Bet: 1082, Bet: 606 |
| heart_two_sex | 0 | 1 | FALSE | 3 | emp: 165466, F: 2191, M: 1959 |
| patient_ethnicity | 0 | 1 | FALSE | 5 | Whi: 115445, Oth: 30236, Asi: 17119, Bla: 4243 |
| partner_ethnicity | 0 | 1 | FALSE | 6 | Whi: 107395, Oth: 37889, Asi: 15030, Bla: 3796 |
| partner_type | 0 | 1 | FALSE | 5 | Mal: 148986, Fem: 9827, Non: 6712, N/A: 3464 |
| partner_age | 0 | 1 | FALSE | 11 | 18-: 48438, 35-: 31328, 40-: 21118, 38-: 19597 |
| causes_of_infertility_tubal_disease | 0 | 1 | FALSE | 2 | 0: 154648, 1: 14968 |
| causes_of_infertility_ovulatory_disorder | 0 | 1 | FALSE | 2 | 0: 151465, 1: 18151 |
| causes_of_infertility_male_factor | 0 | 1 | FALSE | 2 | 0: 117841, 1: 51775 |
| causes_of_infertility_patient_unexplained | 0 | 1 | FALSE | 2 | 0: 122981, 1: 46635 |
| causes_of_infertility_endometriosis | 0 | 1 | FALSE | 2 | 0: 160751, 1: 8865 |
| stimulation_used | 0 | 1 | FALSE | 2 | 1: 105145, 0: 64471 |
| pgt_m_treatment | 0 | 1 | FALSE | 2 | 0: 168202, 1: 1414 |
| pgt_a_treatment | 0 | 1 | FALSE | 2 | 0: 167625, 1: 1991 |
| eggs_thawed_0_1 | 0 | 1 | FALSE | 2 | 0: 168127, 1: 1489 |
| fresh_eggs_stored_0_1 | 0 | 1 | FALSE | 2 | 0: 165770, 1: 3846 |
| year_of_treatment | 0 | 1 | FALSE | 2 | 201: 85218, 201: 84398 |
| clinical_outcome | 0 | 1 | FALSE | 2 | Fai: 128593, Suc: 41023 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| live_birth_occurrence | 0 | 1 | 0.24 | 0.43 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▂ |
| number_of_live_births | 0 | 1 | 0.26 | 0.49 | 0 | 0 | 0 | 0 | 3 | ▇▂▁▁▁ |
# Replace missing values with "0"
hfea$fresh_eggs_collected[hfea$fresh_eggs_collected == ""] <- "0"
hfea$total_eggs_mixed[hfea$total_eggs_mixed == ""] <- "0"
hfea$total_embryos_created[hfea$total_embryos_created == ""] <- "0"
hfea$embryos_stored_for_use_by_patient[hfea$embryos_stored_for_use_by_patient == ""] <- "0"
hfea$early_outcome[hfea$early_outcome == ""] <- "None"
# Remove variables with so much missing values
hfea <- hfea |>
select(-c(live_birth_occurrence,
number_of_live_births,
year_of_treatment,
heart_one_birth_outcome,
heart_one_weeks_gestation,
heart_one_birth_weight,
heart_two_birth_weight,
heart_two_birth_outcome,
egg_donor_age_at_registration,
sperm_donor_age_at_registration,
heart_one_sex,
heart_two_sex,
egg_source,
sperm_source
))
hfea |> dim()
[1] 169616 26
# Drops the row if "" or 999 appears anywhere especially in the cloumn partner_age
clean_hfea <- hfea |>
filter(!if_any(everything(), ~ .x %in% c("", "999")))
clean_hfea |> dim()
[1] 152677 26
# Patient age
clean_hfea |>
select(patient_age_at_treatment) |>
count(patient_age_at_treatment) |>
ggplot(aes(x = reorder(patient_age_at_treatment, n), y = n, fill = patient_age_at_treatment)) + geom_col(show.legend = FALSE) + geom_text(aes(label = n, vjust = -0.5)) + theme_minimal() + theme(axis.title.y = element_text(angle = 0)) + labs(title = "Patients age group")
# Number of previous cycle
clean_hfea |>
select(total_number_of_previous_ivf_cycles) |>
count(total_number_of_previous_ivf_cycles) |>
ggplot(aes(x = reorder(total_number_of_previous_ivf_cycles, n), y = n, fill = total_number_of_previous_ivf_cycles)) + geom_col(show.legend = FALSE) + geom_text(aes(label = n, vjust = -0.5)) + theme_minimal() + theme(axis.title.y = element_text(angle = 0)) + labs(title = "Number of previous cycle")
# Eggs collected
clean_hfea |>
select(fresh_eggs_collected) |>
count(fresh_eggs_collected) |>
ggplot(aes(x = reorder(fresh_eggs_collected, n), y = n, fill = fresh_eggs_collected)) + geom_col(show.legend = FALSE) + geom_text(aes(label = n, vjust = -0.5)) + theme_minimal() + theme(axis.title.y = element_text(angle = 0)) + labs(title = "Fresh eggs collected")
# Total embryo created
clean_hfea |>
select(total_embryos_created) |>
count(total_embryos_created) |>
ggplot(aes(x = reorder(total_embryos_created, n), y = n, fill = total_embryos_created)) + geom_col(show.legend = FALSE) + geom_text(aes(label = n, vjust = -0.5)) + theme_minimal() + theme(axis.title.y = element_text(angle = 0)) + labs(title = "Total embryo created")
# 1. Clean up and reshape the data for visualization
hfea_long <- clean_hfea |>
select(
causes_of_infertility_tubal_disease,
causes_of_infertility_ovulatory_disorder,
causes_of_infertility_male_factor,
causes_of_infertility_patient_unexplained,
causes_of_infertility_endometriosis
) |>
# Convert everything to standard 1 (Yes) and 0 (No) if they are factors/text strings
mutate(across(everything(), ~ if_else(. == "Yes" | . == 1, 1, 0)))
# Observe
hfea_long |>
select(
causes_of_infertility_tubal_disease,
causes_of_infertility_ovulatory_disorder,
causes_of_infertility_male_factor,
causes_of_infertility_patient_unexplained,
causes_of_infertility_endometriosis
) |>
head()
# A tibble: 6 × 5
causes_of_infertility_tubal_di…¹ causes_of_infertilit…² causes_of_infertilit…³
<dbl> <dbl> <dbl>
1 0 0 1
2 0 0 1
3 0 0 0
4 0 0 1
5 0 1 1
6 0 0 0
# ℹ abbreviated names: ¹causes_of_infertility_tubal_disease,
# ²causes_of_infertility_ovulatory_disorder,
# ³causes_of_infertility_male_factor
# ℹ 2 more variables: causes_of_infertility_patient_unexplained <dbl>,
# causes_of_infertility_endometriosis <dbl>
hfea_long |>
summarise(across(everything(), sum)) |>
pivot_longer(cols = everything(), names_to = "Infertility_Cause", values_to = "Count") |>
mutate(
Percentage = (Count / nrow(clean_hfea)) * 100,
Infertility_Cause = str_remove(Infertility_Cause, "causes_of_infertility_") # Clean labels
) |>
ggplot(aes(x = reorder(Infertility_Cause, Percentage), y = Percentage)) +
geom_col(fill = "steelblue", width = 0.6) +
coord_flip() +
labs(
title = "Prevalence Matrix of Infertility Etiologies",
subtitle = paste("Analysis of baseline cohort distribution (N =", nrow(clean_hfea), ")"),
x = "Diagnostic Category", y = "Percentage of Total Cohort (%)"
) +
theme_minimal()
# Calculate cross-product matrix to discover diagnostic intersections
co_matrix <- t(as.matrix(hfea_long)) %*% as.matrix(hfea_long)
as_tibble(co_matrix, rownames = "Var1") |>
pivot_longer(-Var1, names_to = "Var2", values_to = "Overlap_Count") |>
mutate(
Var1 = str_remove(Var1, "causes_of_infertility_"),
Var2 = str_remove(Var2, "causes_of_infertility_")
) |>
ggplot(aes(x = Var1, y = Var2, fill = Overlap_Count)) +
geom_tile(color = "white") +
geom_text(aes(label = scales::comma(Overlap_Count)), color = "black", size = 3.5) +
scale_fill_gradient(low = "#e8f0fe", high = "#1a73e8") +
labs(
title = "Etiology Intersections & Patient Co-Occurrences",
subtitle = "Heatmap showing overlapping diagnoses across the clinical registry",
x = NULL, y = NULL, fill = "Patient Count"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# 1. Isolate and clean the multi-label categorical features
upset_data <- clean_hfea |>
select(
causes_of_infertility_tubal_disease,
causes_of_infertility_ovulatory_disorder,
causes_of_infertility_male_factor,
causes_of_infertility_patient_unexplained,
causes_of_infertility_endometriosis
) |>
# ComplexUpset requires logical (TRUE/FALSE) or binary (1/0) matrices
mutate(across(everything(), ~ if_else(. == "Yes" | . == 1, TRUE, FALSE))) |>
# Rename columns cleanly so the chart axis labels look ready for publication
rename(
`Tubal Disease` = causes_of_infertility_tubal_disease,
`Ovulatory Disorder` = causes_of_infertility_ovulatory_disorder,
`Male Factor` = causes_of_infertility_male_factor,
`Unexplained` = causes_of_infertility_patient_unexplained,
`Endometriosis` = causes_of_infertility_endometriosis
)
# 2. Extract the clean column names to serve as our target sets
infertility_sets <- colnames(upset_data)
# 3. Generate the ComplexUpSet Visualization
suppressWarnings(
upset(
upset_data,
infertility_sets,
name = "Diagnostic Intersections / Combinations",
width_ratio = 0.3, # Adjusts balance between left set sizes and main matrix
height_ratio = 0.6, # Adjusts balance between top bar chart and main matrix
stripes = upset_stripes(
mapping = aes(color = "grey95"),
colors = c("white", "grey95")
),
base_annotations = list(
'Intersection Size' = intersection_size(
counts = TRUE, # Shows numerical patient totals on top of the bars
mapping = aes(fill = "steelblue")
) +
scale_fill_identity() +
theme_minimal() +
labs(y = "Patient Cohort Size")
),
set_sizes = upset_set_size() +
theme_minimal() +
labs(x = "Total Prevalence per Cause")
) +
labs(
title = "Multi-Label Clinical Cohort Mapping via UpSet Intersection Design",
subtitle = paste("Comprehensive mapping of standalone and overlapping etiologies (N =", scales::comma(nrow(clean_hfea)), ")")
)
)
# Set seeds for R and torch
torch::torch_manual_seed(13)
set.seed(13)
# Data splitting
data_split <- initial_split(clean_hfea, prop = 0.80, strata = clinical_outcome)
train_split <- data_split |> training()
test_split <- data_split |> testing()
# Preprocessing
hfea_recipe <- recipe(clinical_outcome ~ ., data = train_split) |>
step_novel(all_nominal_predictors()) |>
#step_dummy(all_nominal_predictors(), -all_outcomes(), one_hot = FALSE) |>
step_zv(all_nominal_predictors()) |>
step_normalize(all_numeric_predictors()) |>
step_downsample(clinical_outcome, under_ratio = 1) # Solve the 3:1 outcome variable mild imbalance
# Detect how many logical cores your computer has
all_cores <- parallel::detectCores(logical = TRUE)
# Register a cluster using all available cores except 1 (leaves 1 core free so my PC doesn't freeze)
cl <- makePSOCKcluster(all_cores - 1)
registerDoParallel(cl)
cat("Parallel backend registered with", all_cores - 1, "cores.\n")
Parallel backend registered with 3 cores.
# deep learning with tabnet
tabnet_tune_spec <- tabnet(
epochs = 40, # Number of training epochs over dataset
batch_size = 256, # Sub-cohort sizing optimized for local RAM
decision_width = 16, # Width of the prediction layer
attention_width = 16, # Width of the attention embedding mask
num_steps = 4, # Sequential attention steps (depth of processing)
learn_rate = 0.01 # Gradient descent step velocity
) |>
set_engine("torch") |>
set_mode("classification")
# Combine into a unified tuning workflow
transformer_workflow <- workflow() |>
add_recipe(hfea_recipe) |>
add_model(tabnet_tune_spec)
# Execute Attention-Based Model Optimization Loop
cat("--- Commencing Transformer Optimization Pipeline ---\n")
--- Commencing Transformer Optimization Pipeline ---
fitted_transformer <- fit(transformer_workflow, data = train_split)
# Predict
predictions <- augment(fitted_transformer, test_split)
ivf_metrics <- metric_set(accuracy, sensitivity, specificity, precision, npv)
predictions |>
ivf_metrics(truth = clinical_outcome,
estimate = .pred_class,
.pred_Success)
# A tibble: 5 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.963
2 sensitivity binary 0.952
3 specificity binary 0.996
4 precision binary 0.999
5 npv binary 0.878
#Extracting raw TabNet feature importances driven by internal cross-attention masks
extracted_fit <- extract_fit_engine(fitted_transformer)
explain_data <- bake(prep(hfea_recipe), test_split)
# Extract attention masks
attention_explain <- tabnet_explain(extracted_fit, explain_data)
# Render Global Interpretability Plot for Clinical Support Decisions
autoplot(attention_explain) +
labs(title = "Clinical Decision Support: Global Transformer Attention Weights",
subtitle = "Feature importance maps computed natively via multi-step attention layer interactions",
x = "Clinical Predictor", y = "Aggregated Attention Score") +
theme_minimal(base_size = 14)
# Feature Importance Vip Plot
# 1. Extract the underlying engine fit cleanly
extracted_fit <- extract_fit_engine(fitted_transformer)
# 2. Generate the plot with your specific fill and theme styles
vip(extracted_fit, aesthetics = list(fill = "turquoise1", color = "black")) +
theme_minimal(base_size = 14) +
labs(
title = "Transformer Feature Importance Profile",
subtitle = "Attribution weights derived natively from cross-attention masks",
y = "Importance Score",
x = "Clinical Predictor"
)
# Close multicore
stopCluster(cl)
registerDoSEQ() # Returns R to standard single-threaded behavior