EDA
# Clean names
hfea_2017_2018 <- hfea_2017_2018 |>
clean_names()
# Select required variables
hfea <- hfea_2017_2018 |>
select(
patient_age_at_treatment,
total_number_of_previous_ivf_cycles,
total_number_of_previous_di_cycles,
egg_donor_age_at_registration,
sperm_donor_age_at_registration,
type_of_treatment_ivf_or_di,
specific_treatment_type,
egg_source,
sperm_source,
fresh_eggs_collected,
total_eggs_mixed,
total_embryos_created,
total_embryos_thawed,
embryos_stored_for_use_by_patient,
early_outcome,
heart_one_weeks_gestation,
heart_one_birth_outcome,
heart_one_birth_weight,
heart_one_sex,
heart_two_birth_outcome,
heart_two_birth_weight,
heart_two_sex,
patient_ethnicity,
partner_ethnicity,
partner_type,
partner_age,
causes_of_infertility_tubal_disease,
causes_of_infertility_ovulatory_disorder,
causes_of_infertility_male_factor,
causes_of_infertility_patient_unexplained,
causes_of_infertility_endometriosis,
stimulation_used,
pgt_m_treatment,
pgt_a_treatment,
eggs_thawed_0_1,
fresh_eggs_stored_0_1,
year_of_treatment,
live_birth_occurrence,
number_of_live_births
)
# Variables conversion
hfea_names <- c("patient_age_at_treatment",
"total_number_of_previous_ivf_cycles",
"total_number_of_previous_di_cycles",
"egg_donor_age_at_registration",
"sperm_donor_age_at_registration",
"type_of_treatment_ivf_or_di",
"specific_treatment_type",
"egg_source",
"sperm_source",
"fresh_eggs_collected",
"total_eggs_mixed",
"total_embryos_created",
"total_embryos_thawed",
"embryos_stored_for_use_by_patient",
"early_outcome",
"heart_one_weeks_gestation",
"heart_one_birth_outcome",
"heart_one_birth_weight",
"heart_one_sex",
"heart_two_birth_outcome",
"heart_two_birth_weight",
"heart_two_sex",
"patient_ethnicity",
"partner_ethnicity",
"partner_type",
"partner_age",
"causes_of_infertility_tubal_disease",
"causes_of_infertility_ovulatory_disorder",
"causes_of_infertility_male_factor",
"causes_of_infertility_patient_unexplained",
"causes_of_infertility_endometriosis",
"stimulation_used",
"pgt_m_treatment",
"pgt_a_treatment",
"eggs_thawed_0_1",
"fresh_eggs_stored_0_1",
"year_of_treatment")
hfea <- hfea |>
mutate(across(all_of(hfea_names), factor))
# Create categorical outcome variable
hfea <- hfea |>
mutate(
clinical_outcome = factor(
case_when(
live_birth_occurrence == 1 ~ "Success",
TRUE ~ "Failure"
)
)
)
lapply(hfea, function(x) {
if (is.numeric(x)) return(summary(x))
if (is.factor(x)) return(table(x))
})
$patient_age_at_treatment
x
18-34 35-37 38-39 40-42 43-44 45-50 999
68365 38567 24365 23813 7292 4545 2669
$total_number_of_previous_ivf_cycles
x
>5 0 1 2 3 4 5
6498 72758 40631 24011 13623 7695 4400
$total_number_of_previous_di_cycles
x
>5 0 1 2 3 4 5
1115 157604 3763 2985 2323 1091 735
$egg_donor_age_at_registration
x
<= 20 >35 Between 21 and 25
160975 324 867 1558
Between 26 and 30 Between 31 and 35
2660 3232
$sperm_donor_age_at_registration
x
<= 20 >45 Between 21 and 25
144738 1069 481 5806
Between 26 and 30 Between 31 and 35 Between 36 and 40 Between 41 and 45
6241 5041 4475 1765
$type_of_treatment_ivf_or_di
x
DI IVF
11282 158334
$specific_treatment_type
x
DI ICSI ICSI:IVF ICSI:Unknown IVF IVF:Unknown
11282 60955 430 133 58647 47
Unknown
38122
$egg_source
x
Donor Patient
11282 8677 149657
$sperm_source
x
Donor Partner
195 24782 144639
$fresh_eggs_collected
x
>40 0 1-5 11-15 16-20 21-25 26-30 31-35 36-40 6-10
11282 274 60060 25857 21799 10986 4750 1983 753 317 31555
$total_eggs_mixed
x
>40 0 1-5 11-15 16-20 21-25 26-30 31-35 36-40 6-10
11282 122 62337 31160 18623 7982 3021 1088 403 146 33452
$total_embryos_created
x
>30 0 1-5 11-15 16-20 21-25 26-30 6-10
11282 84 66796 48335 9882 2728 689 206 29614
$total_embryos_thawed
x
>10 0 1-5 6-10
213 119661 48930 812
$embryos_stored_for_use_by_patient
x
>20 0 1-5 11-15 16-20 6-10
11282 101 109022 41130 1116 245 6720
$early_outcome
x
Biochemical Pregnancy Only
37096 8966
Ectopic/Hetrotopic Intrauterine Fetal Pulsation Seen
538 46571
Miscarriage None
4937 71508
$heart_one_weeks_gestation
x
30 31
129263 160 221
32 33 34
290 406 676
35 36 37
1074 2163 4512
38 39 40
7211 10456 8615
Greater than 40 weeks Less than 30 weeks
4024 545
$heart_one_birth_outcome
x
Ectotopic/Hetrotopic Pregnancy
122806 46
Embryo Reduction Live Birth
39 40525
Lost to Follow Up Miscarriage
431 5261
Still Birth Termination
148 360
$heart_one_birth_weight
x
5.5kg or greater Between 1.5kg and 1.99Kg
129357 27 1048
Between 1kg and 1.49Kg Between 2.0kg and 2.49Kg Between 2.5kg and 2.99Kg
527 3067 7241
Between 3.0kg and 3.49Kg Between 3.5kg and 3.99Kg Between 4.0kg and 4.49Kg
14630 10179 2779
Between 4.5kg and 4.99Kg Between 5.0kg and 5.49Kg Less than 1kg
420 56 285
$heart_one_sex
x
F M
129123 19860 20633
$heart_two_birth_outcome
x
Embryo Reduction Live Birth Miscarriage
164891 12 4167 506
Still Birth
40
$heart_two_birth_weight
x
Between 1.5kg and 1.99Kg Between 1kg and 1.49Kg
165515 606 220
Between 2.0kg and 2.49Kg Between 2.5kg and 2.99Kg Between 3.0kg and 3.49Kg
1438 1082 455
Between 3.5kg and 3.99Kg Between 4.0kg and 4.49Kg Between 4.5kg and 4.99Kg
131 37 8
Less than 1kg
124
$heart_two_sex
x
F M
165466 2191 1959
$patient_ethnicity
x
Asian Black Mixed Other White
17119 4243 2573 30236 115445
$partner_ethnicity
x
Any other ethnicity Asian Black Mixed
3663 15030 3796 1843
Other White
37889 107395
$partner_type
x
Female Male N/A None Surrogate
9827 148986 3464 6712 627
$partner_age
x
>60 18-34 35-37 38-39 40-42 43-44 45-50 51-55 56-60 999
16933 772 48438 31328 19597 21118 10352 15595 4145 1332 6
$causes_of_infertility_tubal_disease
x
0 1
154648 14968
$causes_of_infertility_ovulatory_disorder
x
0 1
151465 18151
$causes_of_infertility_male_factor
x
0 1
117841 51775
$causes_of_infertility_patient_unexplained
x
0 1
122981 46635
$causes_of_infertility_endometriosis
x
0 1
160751 8865
$stimulation_used
x
0 1
64471 105145
$pgt_m_treatment
x
0 1
168202 1414
$pgt_a_treatment
x
0 1
167625 1991
$eggs_thawed_0_1
x
0 1
168127 1489
$fresh_eggs_stored_0_1
x
0 1
165770 3846
$year_of_treatment
x
2017 2018
84398 85218
$live_birth_occurrence
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.2419 0.0000 1.0000
$number_of_live_births
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 0.000 0.264 0.000 3.000
$clinical_outcome
x
Failure Success
128593 41023
# Replace missing values with "0"
hfea$fresh_eggs_collected[hfea$fresh_eggs_collected == ""] <- "0"
hfea$total_eggs_mixed[hfea$total_eggs_mixed == ""] <- "0"
hfea$total_embryos_created[hfea$total_embryos_created == ""] <- "0"
hfea$embryos_stored_for_use_by_patient[hfea$embryos_stored_for_use_by_patient == ""] <- "0"
hfea$early_outcome[hfea$early_outcome == ""] <- "None"
# Remove variables with so much missing values
hfea <- hfea |>
select(-c(live_birth_occurrence,
number_of_live_births,
year_of_treatment,
heart_one_birth_outcome,
heart_one_weeks_gestation,
heart_one_birth_weight,
heart_two_birth_weight,
heart_two_birth_outcome,
egg_donor_age_at_registration,
sperm_donor_age_at_registration,
heart_one_sex,
heart_two_sex,
egg_source,
sperm_source
))
# Drops the row if "" or 999 appears anywhere especially in the cloumn partner_age
clean_hfea <- hfea |>
filter(!if_any(everything(), ~ .x %in% c("", "999")))
Shapley values
set.seed(16)
# 1. Take a representative sample of your background training features
background_sample <- train_split |>
select(-clinical_outcome) |>
slice_sample(n = 200) # Fast & highly accurate baseline
# 2. Extract the matching target outcomes for those specific 200 rows
background_y <- train_split$clinical_outcome[sample(1:nrow(train_split), 200)]
# 3. Build the explainer with the lightweight sample
ivf_explainer <- explain_tidymodels(
ivf_workflow,
data = background_sample, # Swapped out the massive 150k row data frame
y = background_y,
verbose = FALSE
)
## Warning in Ops.factor(y, predict_function(model, data)): '-' not meaningful for
## factors
# 4. Run the SHAP prediction (This will now finish in seconds!)
single_var <- clean_hfea[1000, -26]
ivf_shap <- predict_parts(
explainer = ivf_explainer,
new_observation = single_var,
type = "shap",
B = 25 # 25 bootstrap iterations
)
ivf_shap |> plot()
