Packages

library(pacman)
p_load(tidymodels, tidyverse, plotly, skimr, themis, readxl, janitor, vip, ranger, parallel, partykit, ggparty, rpart, rpart.plot, probably)

Read file

msc <- read_csv("mscph.csv", col_types = cols(),  na = "NA")

names(msc)
 [1] "pep_id_sex"                      "pep_id_deliv_type"              
 [3] "pep_id_gest_age_birth_wks"       "pep_id_hiv_serostatus_mo"       
 [5] "pep_id_birth_weight_grams"       "mothers_recent_viral_load"      
 [7] "first_pcr_test_result"           "second_pcr_test"                
 [9] "final_outcome_result"            "mother_received_intervetion_y_n"
[11] "place_site_of_delivery"          "brst_feeding_method"            
[13] "type_of_avr_prophy_for_baby"     "cotrimoxazole_administered_y_n" 
[15] "age_mo"                          "education"                      
[17] "marital_st"                     

Change variable names for readability

# Rename columns
msc <- msc |> rename(infant_sex = pep_id_sex,
                     delivery_type = pep_id_deliv_type,
                     infant_age_at_birth = pep_id_gest_age_birth_wks,
                     mother_hiv_serostatus = pep_id_hiv_serostatus_mo,
                     infant_weight = pep_id_birth_weight_grams,
                     mother_viral_load = mothers_recent_viral_load,
                     first_pcr = first_pcr_test_result,
                     second_pcr = second_pcr_test,
                     final_outcome = final_outcome_result,
                     mother_received_intervetion = mother_received_intervetion_y_n,
                     delivery_place = place_site_of_delivery,
                     breast_feeding_method = brst_feeding_method,
                     baby_prophylaxis = type_of_avr_prophy_for_baby,
                     cotrimaxazole_for_mmother = cotrimoxazole_administered_y_n,
                     mother_age = age_mo,
                     education_level = education,
                     marital_status = marital_st)

Convert character variables to factors

# To factors
msc_names <- c("infant_sex", "delivery_type", "mother_hiv_serostatus", "first_pcr", "second_pcr", "final_outcome", "mother_received_intervetion", "delivery_place", "breast_feeding_method",  "baby_prophylaxis", "cotrimaxazole_for_mmother", "education_level", "marital_status")


msc <- msc |> 
  mutate(across(all_of(msc_names), factor))

str(msc)
tibble [600 × 17] (S3: tbl_df/tbl/data.frame)
 $ infant_sex                 : Factor w/ 2 levels "Female","Male": 1 1 1 2 1 2 2 1 1 1 ...
 $ delivery_type              : Factor w/ 4 levels "Assisted vaginal",..: 2 4 4 4 4 4 4 2 4 4 ...
 $ infant_age_at_birth        : num [1:600] 37 38 38 38 38 37 38 39 38 38 ...
 $ mother_hiv_serostatus      : Factor w/ 3 levels "HIV-1 & HIV-2 positive",..: 1 2 2 2 2 2 2 1 2 2 ...
 $ infant_weight              : num [1:600] 3000 2600 2600 2000 3400 2400 3200 2900 2100 3300 ...
 $ mother_viral_load          : num [1:600] 20 20 20 20 20 20.3 20 876 20 20 ...
 $ first_pcr                  : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
 $ second_pcr                 : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
 $ final_outcome              : Factor w/ 2 levels "HIV Infected",..: 2 2 2 2 1 2 2 2 2 1 ...
 $ mother_received_intervetion: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ delivery_place             : Factor w/ 2 levels "Inside Facility",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ breast_feeding_method      : Factor w/ 2 levels "Exclusive","Mixed Feeding": 2 2 2 1 1 2 2 1 1 1 ...
 $ baby_prophylaxis           : Factor w/ 2 levels "AZT + NVP","NVP": 2 2 2 2 2 2 2 2 2 2 ...
 $ cotrimaxazole_for_mmother  : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ mother_age                 : num [1:600] 42 46 22 44 38 41 36 33 36 33 ...
 $ education_level            : Factor w/ 3 levels "Primary","Secondary",..: 3 3 2 2 2 2 2 2 3 2 ...
 $ marital_status             : Factor w/ 5 levels "Divorced","Married",..: 5 4 2 2 2 2 2 NA 4 2 ...

Models

Preprocessing

# Remove columns with NAs
new_msc <- msc  |>  select(-marital_status)

# Split data
set.seed(111)
data_split <- initial_split(new_msc, prop = .8, strata = final_outcome)
data_split
<Training/Testing/Total>
<480/120/600>
# Training and test data
m_train <- data_split |>  training()
m_test <- data_split  |>  testing()
m_train |> glimpse()
Rows: 480
Columns: 16
$ infant_sex                  <fct> Female, Female, Male, Female, Female, Fema…
$ delivery_type               <fct> Standard vaginal, Standard vaginal, Standa…
$ infant_age_at_birth         <dbl> 38, 38, 40, 38, 38, 38, 38, 38, 38, 38, 38…
$ mother_hiv_serostatus       <fct> HIV-1 positive, HIV-1 positive, HIV-1 posi…
$ infant_weight               <dbl> 3400, 3300, 4000, 2200, 2200, 2500, 3500, …
$ mother_viral_load           <dbl> 20, 20, 52, 20, 20, 20, 20, 20, 243, 20, 2…
$ first_pcr                   <fct> Negative, Negative, Negative, Positive, Po…
$ second_pcr                  <fct> Negative, Negative, Negative, Positive, Po…
$ final_outcome               <fct> HIV Infected, HIV Infected, HIV Infected, …
$ mother_received_intervetion <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ delivery_place              <fct> Inside Facility, Inside Facility, Inside F…
$ breast_feeding_method       <fct> Exclusive, Exclusive, Mixed Feeding, Exclu…
$ baby_prophylaxis            <fct> NVP, NVP, NVP, NVP, NVP, NVP, NVP, NVP, NV…
$ cotrimaxazole_for_mmother   <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ mother_age                  <dbl> 38, 33, 23, 47, 44, 33, 34, 34, 40, 32, 40…
$ education_level             <fct> Secondary, Secondary, Secondary, Tertiary,…
# Outcome levels
levels(m_train$final_outcome)
[1] "HIV Infected"   "HIV Uninfected"
# Recipe
reci_m <- recipe(final_outcome ~ ., data = m_train)  |>  
  step_log(mother_viral_load, base = 10, offset = 1) |> 
  step_dummy(all_nominal_predictors()) |> 
  step_interact(terms = ~ infant_weight:infant_age_at_birth) |> 
  step_normalize(all_numeric_predictors())

Random Forest

# Create model design
hiv_rf <- rand_forest(min_n = 5, mtry = 2, trees = 2000) |> 
  set_engine("ranger", num.threads = detectCores(), importance = 'permutation') |> 
  set_mode("classification")

# Add recipe and model design to workflow
set.seed(222)
rf_workflow <- workflow() |> 
  add_recipe(reci_m) |> 
  add_model(hiv_rf) |> 
  fit(m_train)
  
# Predict
rf_test <- augment(rf_workflow, m_test)

# Metrics
rf_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)

rf_test |> 
  rf_metrics(truth = final_outcome, 
              estimate = .pred_class,          
              `.pred_HIV Infected`)
# A tibble: 6 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary         0.725
2 sensitivity binary         0.310
3 specificity binary         0.949
4 precision   binary         0.765
5 npv         binary         0.718
6 roc_auc     binary         0.856

Plot trees

# Create conditional inference tree model (CIT)
cit_tree <- ctree(final_outcome ~ ., data = new_msc)
plot(cit_tree, gp = gpar(fontsize = 8))

# Classification and regression tree model (CART)
hiv_rpart <- decision_tree(tree_depth = 3) |> 
  set_engine("rpart") |> 
  set_mode("classification")

rpart_workflow <- workflow() |> 
  add_recipe(reci_m) |> 
  add_model(hiv_rpart) |> 
  fit(m_train)

rpart.plot(extract_fit_engine(rpart_workflow),
           yes.text="YES", no.text="NO",roundint=FALSE)

cit_tree2 <- rpart(final_outcome ~ ., data = m_train, method = "class", cp = 0.008)
cit_tree2$variable.importance
##               infant_weight           mother_viral_load 
##                  34.3613128                  32.1026831 
##            baby_prophylaxis              delivery_place 
##                  19.0399667                  12.4613799 
##                   first_pcr       breast_feeding_method 
##                  11.7996047                  11.0783154 
## mother_received_intervetion                  second_pcr 
##                  10.1100478                   9.2695726 
##               delivery_type       mother_hiv_serostatus 
##                   6.2243593                   4.4414128 
##   cotrimaxazole_for_mmother                  mother_age 
##                   4.4038575                   4.2028636 
##         infant_age_at_birth                  infant_sex 
##                   2.4055076                   1.6067407 
##             education_level 
##                   0.2685652
cit_tree2
## n= 480 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##    1) root 480 168 HIV Uninfected (0.35000000 0.65000000)  
##      2) mother_viral_load>=344 63  16 HIV Infected (0.74603175 0.25396825)  
##        4) infant_weight>=3400 29   1 HIV Infected (0.96551724 0.03448276) *
##        5) infant_weight< 3400 34  15 HIV Infected (0.55882353 0.44117647)  
##         10) mother_viral_load>=4438.5 15   1 HIV Infected (0.93333333 0.06666667) *
##         11) mother_viral_load< 4438.5 19   5 HIV Uninfected (0.26315789 0.73684211) *
##      3) mother_viral_load< 344 417 121 HIV Uninfected (0.29016787 0.70983213)  
##        6) second_pcr=Positive 9   0 HIV Infected (1.00000000 0.00000000) *
##        7) second_pcr=Negative 408 112 HIV Uninfected (0.27450980 0.72549020)  
##         14) infant_weight>=2450 349 112 HIV Uninfected (0.32091691 0.67908309)  
##           28) breast_feeding_method=Exclusive 184  81 HIV Uninfected (0.44021739 0.55978261)  
##             56) infant_weight>=3250 57  17 HIV Infected (0.70175439 0.29824561)  
##              112) cotrimaxazole_for_mmother=Yes 50  12 HIV Infected (0.76000000 0.24000000) *
##              113) cotrimaxazole_for_mmother=No 7   2 HIV Uninfected (0.28571429 0.71428571) *
##             57) infant_weight< 3250 127  41 HIV Uninfected (0.32283465 0.67716535)  
##              114) delivery_type=Standard vaginal 75  31 HIV Uninfected (0.41333333 0.58666667)  
##                228) mother_hiv_serostatus=HIV-1 positive 68  31 HIV Uninfected (0.45588235 0.54411765)  
##                  456) infant_weight< 3050 57  28 HIV Infected (0.50877193 0.49122807)  
##                    912) infant_age_at_birth>=37.5 50  22 HIV Infected (0.56000000 0.44000000)  
##                     1824) mother_age< 42.5 41  15 HIV Infected (0.63414634 0.36585366) *
##                     1825) mother_age>=42.5 9   2 HIV Uninfected (0.22222222 0.77777778) *
##                    913) infant_age_at_birth< 37.5 7   1 HIV Uninfected (0.14285714 0.85714286) *
##                  457) infant_weight>=3050 11   2 HIV Uninfected (0.18181818 0.81818182) *
##                229) mother_hiv_serostatus=HIV-1 & HIV-2 positive,HIV-2 positive 7   0 HIV Uninfected (0.00000000 1.00000000) *
##              115) delivery_type=Assisted vaginal,Elective cesarean,Emergency cesarean 52  10 HIV Uninfected (0.19230769 0.80769231) *
##           29) breast_feeding_method=Mixed Feeding 165  31 HIV Uninfected (0.18787879 0.81212121) *
##         15) infant_weight< 2450 59   0 HIV Uninfected (0.00000000 1.00000000) *

Variable Importance Plot

vip(extract_fit_engine(rf_workflow), metric="specificity", aesthetics = list(fill = "green", color = "black"))

Youden Index

# From predicted data
threshold_data <- rf_test |> 
  threshold_perf(truth = final_outcome,
                 estimate = `.pred_HIV Infected`,
                 thresholds = seq(0, 1, by = 0.01))

# 2. Manual Calculation (Bypassing the pivot error)
sens_values <- threshold_data |> filter(.metric == "sensitivity") |> pull(.estimate)
spec_values <- threshold_data |> filter(.metric == "specificity") |> pull(.estimate)
thresholds  <- threshold_data |> filter(.metric == "sensitivity") |> pull(.threshold)

# 3. Create a clean results table
youden_results <- tibble(
  threshold    = thresholds,
  sensitivity  = sens_values,
  specificity  = spec_values,
  youden_index = sensitivity + specificity - 1
) |> 
  arrange(desc(youden_index))

# 4. Show the winner
best_row <- youden_results |> slice(1)
print(best_row)
# A tibble: 1 × 4
  threshold sensitivity specificity youden_index
      <dbl>       <dbl>       <dbl>        <dbl>
1      0.36       0.762       0.782        0.544
# Map the winning probability back to the Viral Load units
optimal_cutoff <- rf_test |>
  mutate(diff = abs(`.pred_HIV Infected` - best_row$threshold)) |>
  slice_min(diff, n = 1) |>
  pull(mother_viral_load) # Ensure this matches your column name

cat("The optimal Maternal Viral Load threshold is:", optimal_cutoff, "ml/copies", sep = "")
The optimal Maternal Viral Load threshold is:123ml/copies

A Youden Index of 0.544 is a solid result in clinical diagnostic research, though its interpretation depends heavily on the context of the screening.

The index ranges from 0 to 1, where 0 means the test is as useful as a coin flip, and 1 represents a perfect test with no false positives or false negatives.

1. The Statistical Interpretation

A value of 0.544 means that your Maternal Viral Load threshold is performing 54.4% better at correctly classifying infants than random guessing.

Mathematically, it tells you the vertical distance between your ROC curve and the 45-degree “chance” line. A value above 0.5 is generally considered a moderately strong discriminator in complex biological datasets like viral transmission.


2. The Clinical Interpretation

In the context of HIV transmission from mother to infant, this value suggests:

  • Balance: Your chosen threshold is “fair” to both sides—it is not overly biased toward catching every case (at the cost of many false alarms) nor overly biased toward specificity.
  • Effectiveness: The maternal viral load is a meaningful predictor, but it isn’t the only factor. Since the index isn’t closer to 0.8 or 0.9, there are likely other variables (like duration of ARVs, mode of delivery, or breastfeeding status) that also influence whether an infant becomes HIV-positive.

3. Benchmarking the Value

While there are no “universal” grades for the Youden Index, researchers often use these informal benchmarks:

Youden Index (\(J\)) Quality of Discrimination
\(J < 0.2\) Negligible / Poor
\(0.2 \le J < 0.4\) Fair
\(0.4 \le J < 0.6\) Good / Moderate (Your value: 0.544)
\(0.6 \le J < 0.8\) Very Good
\(J \ge 0.8\) Excellent / Near Perfect

4. Important Caveat for HIV Research

In maternal-to-child transmission (PMTCT) studies, we often prioritize Sensitivity over the Youden Index.

If your “optimal” threshold of 0.544 comes from a point where sensitivity is only 70%, you might actually prefer a less “optimal” Youden Index (e.g., 0.45) if that lower threshold gives you 95% Sensitivity. In public health, missing an infected infant is usually considered much worse than a false positive.

Check the sensitivity and specificity values that produced this 0.544. Are you comfortable with that specific level of sensitivity for your study?