Packages

library(pacman)
p_load(tidymodels, tidyverse, plotly, skimr, themis, readxl, janitor, xgboost, doParallel, probably)

Read file

msc <- read_csv("mscph.csv", col_types = cols(),  na = "NA")

names(msc)
 [1] "pep_id_sex"                      "pep_id_deliv_type"              
 [3] "pep_id_gest_age_birth_wks"       "pep_id_hiv_serostatus_mo"       
 [5] "pep_id_birth_weight_grams"       "mothers_recent_viral_load"      
 [7] "first_pcr_test_result"           "second_pcr_test"                
 [9] "final_outcome_result"            "mother_received_intervetion_y_n"
[11] "place_site_of_delivery"          "brst_feeding_method"            
[13] "type_of_avr_prophy_for_baby"     "cotrimoxazole_administered_y_n" 
[15] "age_mo"                          "education"                      
[17] "marital_st"                     

Change variable names for readability

# Rename columns
msc <- msc |> rename(infant_sex = pep_id_sex,
                     delivery_type = pep_id_deliv_type,
                     infant_age_at_birth = pep_id_gest_age_birth_wks,
                     mother_hiv_serostatus = pep_id_hiv_serostatus_mo,
                     infant_weight = pep_id_birth_weight_grams,
                     mother_viral_load = mothers_recent_viral_load,
                     first_pcr = first_pcr_test_result,
                     second_pcr = second_pcr_test,
                     final_outcome = final_outcome_result,
                     mother_received_intervetion = mother_received_intervetion_y_n,
                     delivery_place = place_site_of_delivery,
                     breast_feeding_method = brst_feeding_method,
                     baby_prophylaxis = type_of_avr_prophy_for_baby,
                     cotrimaxazole_for_mmother = cotrimoxazole_administered_y_n,
                     mother_age = age_mo,
                     education_level = education,
                     marital_status = marital_st)

Convert character variables to factors

# To factors
msc_names <- c("infant_sex", "delivery_type", "mother_hiv_serostatus", "first_pcr", "second_pcr", "final_outcome", "mother_received_intervetion", "delivery_place", "breast_feeding_method",  "baby_prophylaxis", "cotrimaxazole_for_mmother", "education_level", "marital_status")


msc <- msc |> 
  mutate(across(all_of(msc_names), factor))

str(msc)
tibble [600 × 17] (S3: tbl_df/tbl/data.frame)
 $ infant_sex                 : Factor w/ 2 levels "Female","Male": 1 1 1 2 1 2 2 1 1 1 ...
 $ delivery_type              : Factor w/ 4 levels "Assisted vaginal",..: 2 4 4 4 4 4 4 2 4 4 ...
 $ infant_age_at_birth        : num [1:600] 37 38 38 38 38 37 38 39 38 38 ...
 $ mother_hiv_serostatus      : Factor w/ 3 levels "HIV-1 & HIV-2 positive",..: 1 2 2 2 2 2 2 1 2 2 ...
 $ infant_weight              : num [1:600] 3000 2600 2600 2000 3400 2400 3200 2900 2100 3300 ...
 $ mother_viral_load          : num [1:600] 20 20 20 20 20 20.3 20 876 20 20 ...
 $ first_pcr                  : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
 $ second_pcr                 : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
 $ final_outcome              : Factor w/ 2 levels "HIV Infected",..: 2 2 2 2 1 2 2 2 2 1 ...
 $ mother_received_intervetion: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ delivery_place             : Factor w/ 2 levels "Inside Facility",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ breast_feeding_method      : Factor w/ 2 levels "Exclusive","Mixed Feeding": 2 2 2 1 1 2 2 1 1 1 ...
 $ baby_prophylaxis           : Factor w/ 2 levels "AZT + NVP","NVP": 2 2 2 2 2 2 2 2 2 2 ...
 $ cotrimaxazole_for_mmother  : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ mother_age                 : num [1:600] 42 46 22 44 38 41 36 33 36 33 ...
 $ education_level            : Factor w/ 3 levels "Primary","Secondary",..: 3 3 2 2 2 2 2 2 3 2 ...
 $ marital_status             : Factor w/ 5 levels "Divorced","Married",..: 5 4 2 2 2 2 2 NA 4 2 ...

Models

Preprocessing

# Remove columns with NAs
new_msc <- msc  |>  select(-marital_status)

# Split data
set.seed(111)
data_split <- initial_split(new_msc, prop = .8, strata = final_outcome)
data_split
<Training/Testing/Total>
<480/120/600>
# Training and test data
m_train <- data_split |>  training()
m_test <- data_split  |>  testing()
m_train |> glimpse()
Rows: 480
Columns: 16
$ infant_sex                  <fct> Female, Female, Male, Female, Female, Fema…
$ delivery_type               <fct> Standard vaginal, Standard vaginal, Standa…
$ infant_age_at_birth         <dbl> 38, 38, 40, 38, 38, 38, 38, 38, 38, 38, 38…
$ mother_hiv_serostatus       <fct> HIV-1 positive, HIV-1 positive, HIV-1 posi…
$ infant_weight               <dbl> 3400, 3300, 4000, 2200, 2200, 2500, 3500, …
$ mother_viral_load           <dbl> 20, 20, 52, 20, 20, 20, 20, 20, 243, 20, 2…
$ first_pcr                   <fct> Negative, Negative, Negative, Positive, Po…
$ second_pcr                  <fct> Negative, Negative, Negative, Positive, Po…
$ final_outcome               <fct> HIV Infected, HIV Infected, HIV Infected, …
$ mother_received_intervetion <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ delivery_place              <fct> Inside Facility, Inside Facility, Inside F…
$ breast_feeding_method       <fct> Exclusive, Exclusive, Mixed Feeding, Exclu…
$ baby_prophylaxis            <fct> NVP, NVP, NVP, NVP, NVP, NVP, NVP, NVP, NV…
$ cotrimaxazole_for_mmother   <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ mother_age                  <dbl> 38, 33, 23, 47, 44, 33, 34, 34, 40, 32, 40…
$ education_level             <fct> Secondary, Secondary, Secondary, Tertiary,…
# Outcome levels
levels(m_train$final_outcome)
[1] "HIV Infected"   "HIV Uninfected"
# Recipe
reci_m <- recipe(final_outcome ~ ., data = m_train)  |>  
  step_log(mother_viral_load, base = 10, offset = 1) |> 
  step_dummy(all_nominal_predictors()) |> 
  step_interact(terms = ~ infant_weight:infant_age_at_birth) |> 
  step_normalize(all_numeric_predictors())

XG Boost

# Create model design
hiv_xg <- boost_tree(trees =2000, tree_depth = 3) |> 
  set_engine("xgboost") |> 
  set_mode("classification")

# Add recipe and model design to workflow
set.seed(222)
xg_workflow <- workflow() |> 
  add_recipe(reci_m) |> 
  add_model(hiv_xg) |> 
  fit(m_train)

# Predict
xg_test <- augment(xg_workflow, m_test)

# Metrics
xg_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)

xg_test |> 
  xg_metrics(truth = final_outcome, 
              estimate = .pred_class,          
              `.pred_HIV Infected`)
# A tibble: 6 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary         0.775
2 sensitivity binary         0.690
3 specificity binary         0.821
4 precision   binary         0.674
5 npv         binary         0.831
6 roc_auc     binary         0.817

Model with hyperparameter tuning

# Create model design
hiv_xg <- boost_tree(trees = tune(), tree_depth = tune()) |> 
  set_engine("xgboost") |> 
  set_mode("classification")

# Add recipe and model design to workflow
set.seed(222)
xg_workflow <- workflow() |> 
  add_recipe(reci_m) |> 
  add_model(hiv_xg) 


# Create a Hyper-Parameter Grid
set.seed(333)
xg_grid <- expand.grid(tree_depth=c(1, 2, 5, 10, 15),
                       trees = c(5, 10, 15, 50, 100))


# Step 6 - Create Resamples for Cross-Validation:
xg_fold <- m_train |> 
  vfold_cv(v = 10, strata = final_outcome)

# Do Parallel
doParallel::registerDoParallel()

# Tune and train
set.seed(333)
StartTime=Sys.time()
xg_tune <- tune_grid(xg_workflow, resamples = xg_fold, 
                     grid = xg_grid, metrics = metric_set(accuracy))

RunTime=Sys.time()- StartTime

# Visualize tuning results
print("TUNING RESULTS:")
[1] "TUNING RESULTS:"
autoplot(xg_tune)

# Extract the Best Hyper-Parameters:
xg_best <- select_best(xg_tune, metric = "accuracy")

# Finalize and Train the Best Workflow Model:
set.seed(333)
xg_result <- finalize_workflow(xg_workflow, xg_best) |> 
  fit(m_train)

# Assess Prediction Quality Based on the Testing Data:
xg_aug <- augment(xg_result, new_data = m_test)

xg_aug |> 
  xg_metrics(truth = final_outcome, 
              estimate = .pred_class,         
              `.pred_HIV Infected`)
# A tibble: 6 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary         0.783
2 sensitivity binary         0.714
3 specificity binary         0.821
4 precision   binary         0.682
5 npv         binary         0.842
6 roc_auc     binary         0.864

Youden Index

# From predicted data
threshold_data <- xg_aug |> 
  threshold_perf(truth = final_outcome,
                 estimate = `.pred_HIV Infected`,
                 thresholds = seq(0, 1, by = 0.01))

# 2. Manual Calculation (Bypassing the pivot error)
sens_values <- threshold_data |> filter(.metric == "sensitivity") |> pull(.estimate)
spec_values <- threshold_data |> filter(.metric == "specificity") |> pull(.estimate)
thresholds  <- threshold_data |> filter(.metric == "sensitivity") |> pull(.threshold)

# 3. Create a clean results table
youden_results <- tibble(
  threshold    = thresholds,
  sensitivity  = sens_values,
  specificity  = spec_values,
  youden_index = sensitivity + specificity - 1
) |> 
  arrange(desc(youden_index))

# 4. Show the winner
best_row <- youden_results |> slice(1)
print(best_row)
# A tibble: 1 × 4
  threshold sensitivity specificity youden_index
      <dbl>       <dbl>       <dbl>        <dbl>
1      0.21       0.905       0.705        0.610
# Map the winning probability back to the Viral Load units
optimal_cutoff <- xg_aug |>
  mutate(diff = abs(`.pred_HIV Infected` - best_row$threshold)) |>
  slice_min(diff, n = 1) |>
  pull(mother_viral_load) # Ensure this matches your column name

cat("The optimal Maternal Viral Load threshold is:", optimal_cutoff, "ml/copies", sep = "")
The optimal Maternal Viral Load threshold is:20ml/copies