Profile of expectant women living with HIV and prevention of vertical transmission in Sub-Saharan Africa: A machine learning approach (K Nearest Neighbour and Logistic Regression)

Packages

library(pacman)
p_load( tidymodels, tidyverse, plotly, skimr, themis, readxl, janitor, vip)

Read file

msc <- read_csv("mscph.csv", col_types = cols(),  na = "NA")

names(msc)

 [1] "pep_id_sex"                      "pep_id_deliv_type"              
 [3] "pep_id_gest_age_birth_wks"       "pep_id_hiv_serostatus_mo"       
 [5] "pep_id_birth_weight_grams"       "mothers_recent_viral_load"      
 [7] "first_pcr_test_result"           "second_pcr_test"                
 [9] "final_outcome_result"            "mother_received_intervetion_y_n"
[11] "place_site_of_delivery"          "brst_feeding_method"            
[13] "type_of_avr_prophy_for_baby"     "cotrimoxazole_administered_y_n" 
[15] "age_mo"                          "education"                      
[17] "marital_st"

Change variable names for readability

# Rename columns
msc <- msc |> rename(infant_sex = pep_id_sex,
                     delivery_type = pep_id_deliv_type,
                     infant_age_at_birth = pep_id_gest_age_birth_wks,
                     mother_hiv_serostatus = pep_id_hiv_serostatus_mo,
                     infant_weight = pep_id_birth_weight_grams,
                     mother_viral_load = mothers_recent_viral_load,
                     first_pcr = first_pcr_test_result,
                     second_pcr = second_pcr_test,
                     final_outcome = final_outcome_result,
                     mother_received_intervetion = mother_received_intervetion_y_n,
                     delivery_place = place_site_of_delivery,
                     breast_feeding_method = brst_feeding_method,
                     baby_prophylaxis = type_of_avr_prophy_for_baby,
                     cotrimaxazole_for_mmother = cotrimoxazole_administered_y_n,
                     mother_age = age_mo,
                     education_level = education,
                     marital_status = marital_st)

Convert character variables to factors

# To factors
msc_names <- c("infant_sex", "delivery_type", "mother_hiv_serostatus", "first_pcr", "second_pcr", "final_outcome", "mother_received_intervetion", "delivery_place", "breast_feeding_method",  "baby_prophylaxis", "cotrimaxazole_for_mmother", "education_level", "marital_status")


msc <- msc |> 
  mutate(across(all_of(msc_names), factor))

str(msc)

tibble [600 × 17] (S3: tbl_df/tbl/data.frame)
 $ infant_sex                 : Factor w/ 2 levels "Female","Male": 1 1 1 2 1 2 2 1 1 1 ...
 $ delivery_type              : Factor w/ 4 levels "Assisted vaginal",..: 2 4 4 4 4 4 4 2 4 4 ...
 $ infant_age_at_birth        : num [1:600] 37 38 38 38 38 37 38 39 38 38 ...
 $ mother_hiv_serostatus      : Factor w/ 3 levels "HIV-1 & HIV-2 positive",..: 1 2 2 2 2 2 2 1 2 2 ...
 $ infant_weight              : num [1:600] 3000 2600 2600 2000 3400 2400 3200 2900 2100 3300 ...
 $ mother_viral_load          : num [1:600] 20 20 20 20 20 20.3 20 876 20 20 ...
 $ first_pcr                  : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
 $ second_pcr                 : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
 $ final_outcome              : Factor w/ 2 levels "HIV Infected",..: 2 2 2 2 1 2 2 2 2 1 ...
 $ mother_received_intervetion: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ delivery_place             : Factor w/ 2 levels "Inside Facility",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ breast_feeding_method      : Factor w/ 2 levels "Exclusive","Mixed Feeding": 2 2 2 1 1 2 2 1 1 1 ...
 $ baby_prophylaxis           : Factor w/ 2 levels "AZT + NVP","NVP": 2 2 2 2 2 2 2 2 2 2 ...
 $ cotrimaxazole_for_mmother  : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ mother_age                 : num [1:600] 42 46 22 44 38 41 36 33 36 33 ...
 $ education_level            : Factor w/ 3 levels "Primary","Secondary",..: 3 3 2 2 2 2 2 2 3 2 ...
 $ marital_status             : Factor w/ 5 levels "Divorced","Married",..: 5 4 2 2 2 2 2 NA 4 2 ...

Models

Preprocessing

# Remove columns with NAs
new_msc <- msc  |>  select(-marital_status)

# Split data
set.seed(111)
data_split <- initial_split(new_msc, prop = .8, strata = final_outcome)
data_split

<Training/Testing/Total>
<480/120/600>

# Training and test data
m_train <- data_split |>  training()
m_test <- data_split  |>  testing()
m_train |> glimpse()

Rows: 480
Columns: 16
$ infant_sex                  <fct> Female, Female, Male, Female, Female, Fema…
$ delivery_type               <fct> Standard vaginal, Standard vaginal, Standa…
$ infant_age_at_birth         <dbl> 38, 38, 40, 38, 38, 38, 38, 38, 38, 38, 38…
$ mother_hiv_serostatus       <fct> HIV-1 positive, HIV-1 positive, HIV-1 posi…
$ infant_weight               <dbl> 3400, 3300, 4000, 2200, 2200, 2500, 3500, …
$ mother_viral_load           <dbl> 20, 20, 52, 20, 20, 20, 20, 20, 243, 20, 2…
$ first_pcr                   <fct> Negative, Negative, Negative, Positive, Po…
$ second_pcr                  <fct> Negative, Negative, Negative, Positive, Po…
$ final_outcome               <fct> HIV Infected, HIV Infected, HIV Infected, …
$ mother_received_intervetion <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ delivery_place              <fct> Inside Facility, Inside Facility, Inside F…
$ breast_feeding_method       <fct> Exclusive, Exclusive, Mixed Feeding, Exclu…
$ baby_prophylaxis            <fct> NVP, NVP, NVP, NVP, NVP, NVP, NVP, NVP, NV…
$ cotrimaxazole_for_mmother   <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ mother_age                  <dbl> 38, 33, 23, 47, 44, 33, 34, 34, 40, 32, 40…
$ education_level             <fct> Secondary, Secondary, Secondary, Tertiary,…

levels(m_train$final_outcome)

[1] "HIV Infected"   "HIV Uninfected"

# Recipe
reci_m <- recipe(final_outcome ~ ., data = m_train)  |>  
  step_log(mother_viral_load, base = 10, offset = 1) |> 
  step_dummy(all_nominal_predictors()) |> 
  step_interact(terms = ~ infant_weight:infant_age_at_birth) |> 
  step_normalize(all_numeric_predictors())

K Nearest Neighbour

# Create model design
hiv_knn <- nearest_neighbor(neighbors = tune(), weight_func = "optimal") |> 
  set_engine("kknn") |> 
  set_mode("classification")
hiv_knn |> head()

$args
$args$neighbors
<quosure>
expr: ^tune()
env:  global

$args$weight_func
<quosure>
expr: ^"optimal"
env:  empty

$args$dist_power
<quosure>
expr: ^NULL
env:  empty


$eng_args
<list_of<quosure>>

named list()

$mode
[1] "classification"

$user_specified_mode
[1] TRUE

$method
NULL

$engine
[1] "kknn"

# Add recipe and model design to workflow
workflow_knn <- workflow() |> 
  add_recipe(reci_m) |> 
  add_model(hiv_knn)
workflow_knn |> head()

$pre
$actions
$actions$recipe
$recipe

── Recipe ──────────────────────────────────────────────────────────────────────

── Inputs

Number of variables by role

outcome:    1
predictor: 15

── Operations

• Log transformation on: mother_viral_load

• Dummy variables from: all_nominal_predictors()

• Interactions with: infant_weight:infant_age_at_birth

• Centering and scaling for: all_numeric_predictors()


$blueprint
NULL

attr(,"class")
[1] "action_recipe" "action_pre"    "action"       


$mold
NULL

$case_weights
NULL

attr(,"class")
[1] "stage_pre" "stage"    

$fit
$actions
$actions$model
$spec
K-Nearest Neighbor Model Specification (classification)

Main Arguments:
  neighbors = tune()
  weight_func = optimal

Computational engine: kknn 


$formula
NULL

attr(,"class")
[1] "action_model" "action_fit"   "action"      


$fit
NULL

attr(,"class")
[1] "stage_fit" "stage"    

$post
$actions
named list()

$fit
NULL

attr(,"class")
[1] "stage_post" "stage"     

$trained
[1] FALSE

# Create hyper-parameter grid
knn_hp_grid <- tibble(neighbors = c(1:10))
knn_hp_grid |> head()

# A tibble: 6 × 1
  neighbors
      <int>
1         1
2         2
3         3
4         4
5         5
6         6

# Resamples for cross validation
set.seed(222)
knn_fold <- vfold_cv(m_train, v = 4, strata = final_outcome)
knn_fold |> head()

#  4-fold cross-validation using stratification 
# A tibble: 4 × 2
  splits            id   
  <list>            <chr>
1 <split [360/120]> Fold1
2 <split [360/120]> Fold2
3 <split [360/120]> Fold3
4 <split [360/120]> Fold4

# Tune workflow and train models
knn_tune <- tune_grid(workflow_knn, resamples = knn_fold, grid = knn_hp_grid, metrics = 
                        metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv))
autoplot(knn_tune)

# Extract the best hyper-parameter
knn_best_hp <- select_best(knn_tune, metric = "sensitivity")
knn_best_hp |> print()

# A tibble: 1 × 2
  neighbors .config         
      <int> <chr>           
1         1 pre0_mod01_post0

knn_best_hp <- select_best(knn_tune, metric = "accuracy")
knn_best_hp |> print()

# A tibble: 1 × 2
  neighbors .config         
      <int> <chr>           
1         1 pre0_mod01_post0

show_best(knn_tune, metric = "specificity")

# A tibble: 5 × 7
  neighbors .metric     .estimator  mean     n std_err .config         
      <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>           
1         6 specificity binary     0.833     4  0.0216 pre0_mod06_post0
2         7 specificity binary     0.833     4  0.0216 pre0_mod07_post0
3         5 specificity binary     0.830     4  0.0236 pre0_mod05_post0
4         8 specificity binary     0.827     4  0.0264 pre0_mod08_post0
5         1 specificity binary     0.817     4  0.0274 pre0_mod01_post0

show_best(knn_tune, metric = "sensitivity")

# A tibble: 5 × 7
  neighbors .metric     .estimator  mean     n std_err .config         
      <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>           
1         1 sensitivity binary     0.613     4  0.0355 pre0_mod01_post0
2         3 sensitivity binary     0.613     4  0.0355 pre0_mod03_post0
3         4 sensitivity binary     0.613     4  0.0355 pre0_mod04_post0
4         2 sensitivity binary     0.607     4  0.0395 pre0_mod02_post0
5         5 sensitivity binary     0.583     4  0.0461 pre0_mod05_post0

show_best(knn_tune, metric = "accuracy")

# A tibble: 5 × 7
  neighbors .metric  .estimator  mean     n std_err .config         
      <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>           
1         1 accuracy binary     0.746     4  0.0188 pre0_mod01_post0
2         3 accuracy binary     0.746     4  0.0188 pre0_mod03_post0
3         4 accuracy binary     0.746     4  0.0188 pre0_mod04_post0
4         2 accuracy binary     0.744     4  0.0188 pre0_mod02_post0
5         5 accuracy binary     0.744     4  0.0184 pre0_mod05_post0

show_best(knn_tune, metric = "precision")

# A tibble: 5 × 7
  neighbors .metric   .estimator  mean     n std_err .config         
      <int> <chr>     <chr>      <dbl> <int>   <dbl> <chr>           
1         6 precision binary     0.652     4  0.0351 pre0_mod06_post0
2         7 precision binary     0.652     4  0.0351 pre0_mod07_post0
3         5 precision binary     0.652     4  0.0313 pre0_mod05_post0
4         1 precision binary     0.648     4  0.0314 pre0_mod01_post0
5         3 precision binary     0.648     4  0.0314 pre0_mod03_post0

show_best(knn_tune, metric = "roc_auc")

# A tibble: 5 × 7
  neighbors .metric .estimator  mean     n std_err .config         
      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
1        10 roc_auc binary     0.790     4  0.0120 pre0_mod10_post0
2         9 roc_auc binary     0.788     4  0.0112 pre0_mod09_post0
3         8 roc_auc binary     0.786     4  0.0117 pre0_mod08_post0
4         7 roc_auc binary     0.785     4  0.0108 pre0_mod07_post0
5         6 roc_auc binary     0.782     4  0.0102 pre0_mod06_post0

show_best(knn_tune, metric = "npv")

# A tibble: 5 × 7
  neighbors .metric .estimator  mean     n std_err .config         
      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
1         1 npv     binary     0.798     4  0.0151 pre0_mod01_post0
2         3 npv     binary     0.798     4  0.0151 pre0_mod03_post0
3         4 npv     binary     0.798     4  0.0151 pre0_mod04_post0
4         2 npv     binary     0.795     4  0.0161 pre0_mod02_post0
5         5 npv     binary     0.789     4  0.0177 pre0_mod05_post0

# Finalize and train the best workflow model
knn_best_model <- workflow_knn |> 
  finalize_workflow(knn_best_hp) |> 
  fit(m_train)
knn_best_model |> print()

══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps

• step_log()
• step_dummy()
• step_interact()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────

Call:
kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(1L,     data, 5), kernel = ~"optimal")

Type of response variable: nominal
Minimal misclassification: 0.2458333
Best kernel: optimal
Best k: 1

knn_test <- augment(knn_best_model, m_test)
knn_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)
knn_test |> 
  knn_metrics(truth = final_outcome, 
              estimate = .pred_class,          # For accuracy, sens, spec
              `.pred_HIV Infected`)

# A tibble: 6 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary         0.733
2 sensitivity binary         0.667
3 specificity binary         0.769
4 precision   binary         0.609
5 npv         binary         0.811
6 roc_auc     binary         0.718

#knn_metrics(knn_test, truth=final_outcome, estimate = .pred_class, starts_with(".pred_"))
#metrics(knn_test, truth = final_outcome, estimate = .pred)
#collect_metrics(knn_metrics)

Logistic Regression with glmnet

# Logistic regression does not require normalization
# Recipe
reci_m <- recipe(final_outcome ~ ., data = m_train)  |>  
  step_log(mother_viral_load, base = 10, offset = 1) |> 
  step_dummy(all_nominal_predictors()) |> 
  step_interact(terms = ~ infant_weight:infant_age_at_birth) |> 
  step_normalize(all_numeric_predictors()) #|> # Required for glmnet!
  #step_smote(final_outcome, over_ratio = 1)

# Create model design
hiv_logreg <- logistic_reg(penalty = tune(), mixture = tune()) |> 
  set_engine("glmnet") |> 
  set_mode("classification")

# Create a grid of values to try
logreg_grid <- grid_regular(penalty(), mixture(), levels = 5)

# Add recipe and model design to workflow
workflow_logreg <- workflow() |> 
  add_recipe(reci_m) |> 
  add_model(hiv_logreg)
workflow_logreg |> head()

$pre
$actions
$actions$recipe
$recipe

── Recipe ──────────────────────────────────────────────────────────────────────

── Inputs

Number of variables by role

outcome:    1
predictor: 15

── Operations

• Log transformation on: mother_viral_load

• Dummy variables from: all_nominal_predictors()

• Interactions with: infant_weight:infant_age_at_birth

• Centering and scaling for: all_numeric_predictors()


$blueprint
NULL

attr(,"class")
[1] "action_recipe" "action_pre"    "action"       


$mold
NULL

$case_weights
NULL

attr(,"class")
[1] "stage_pre" "stage"    

$fit
$actions
$actions$model
$spec
Logistic Regression Model Specification (classification)

Main Arguments:
  penalty = tune()
  mixture = tune()

Computational engine: glmnet 


$formula
NULL

attr(,"class")
[1] "action_model" "action_fit"   "action"      


$fit
NULL

attr(,"class")
[1] "stage_fit" "stage"    

$post
$actions
named list()

$fit
NULL

attr(,"class")
[1] "stage_post" "stage"     

$trained
[1] FALSE

# Resamples for cross validation
set.seed(333)
logreg_fold <- vfold_cv(m_train, v = 4, strata = final_outcome)
logreg_fold |> head()

#  4-fold cross-validation using stratification 
# A tibble: 4 × 2
  splits            id   
  <list>            <chr>
1 <split [360/120]> Fold1
2 <split [360/120]> Fold2
3 <split [360/120]> Fold3
4 <split [360/120]> Fold4

# Tune workflow and train models
logreg_tune <- tune_grid(workflow_logreg, resamples = logreg_fold, grid = logreg_grid, metrics = metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv))

→ A | warning: While computing binary `precision()`, no predicted events were detected (i.e.
               `true_positive + false_positive = 0`).
               Precision is undefined in this case, and `NA` will be returned.
               Note that 42 true event(s) actually occurred for the problematic event level,
               HIV Infected

There were issues with some computations   A: x4There were issues with some computations   A: x8There were issues with some computations   A: x12There were issues with some computations   A: x16There were issues with some computations   A: x16

autoplot(logreg_tune)

# Extract the best hyper-parameter
lr_best_hp <- select_best(logreg_tune, metric = "sensitivity")
lr_best_hp |> print()

# A tibble: 1 × 3
       penalty mixture .config         
         <dbl>   <dbl> <chr>           
1 0.0000000001    0.25 pre0_mod02_post0

lr_best_hp <- select_best(logreg_tune, metric = "accuracy")
lr_best_hp |> print()

# A tibble: 1 × 3
       penalty mixture .config         
         <dbl>   <dbl> <chr>           
1 0.0000000001    0.25 pre0_mod02_post0

show_best(logreg_tune, metric = "specificity")

# A tibble: 5 × 8
  penalty mixture .metric     .estimator  mean     n std_err .config         
    <dbl>   <dbl> <chr>       <chr>      <dbl> <int>   <dbl> <chr>           
1       1    0.25 specificity binary     1         4 0       pre0_mod22_post0
2       1    0.5  specificity binary     1         4 0       pre0_mod23_post0
3       1    0.75 specificity binary     1         4 0       pre0_mod24_post0
4       1    1    specificity binary     1         4 0       pre0_mod25_post0
5       1    0    specificity binary     0.994     4 0.00370 pre0_mod21_post0

show_best(logreg_tune, metric = "sensitivity")

# A tibble: 5 × 8
       penalty mixture .metric     .estimator  mean     n std_err .config       
         <dbl>   <dbl> <chr>       <chr>      <dbl> <int>   <dbl> <chr>         
1 0.0000000001    0.25 sensitivity binary     0.440     4  0.0648 pre0_mod02_po…
2 0.0000000001    0.5  sensitivity binary     0.440     4  0.0648 pre0_mod03_po…
3 0.0000000001    0.75 sensitivity binary     0.440     4  0.0648 pre0_mod04_po…
4 0.0000000001    1    sensitivity binary     0.440     4  0.0648 pre0_mod05_po…
5 0.0000000316    0.25 sensitivity binary     0.440     4  0.0648 pre0_mod07_po…

show_best(logreg_tune, metric = "accuracy")

# A tibble: 5 × 8
       penalty mixture .metric  .estimator  mean     n std_err .config         
         <dbl>   <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>           
1 0.0000000001    0.25 accuracy binary     0.729     4  0.0206 pre0_mod02_post0
2 0.0000000316    0.25 accuracy binary     0.729     4  0.0206 pre0_mod07_post0
3 0.00001         0.25 accuracy binary     0.729     4  0.0206 pre0_mod12_post0
4 0.0000000001    1    accuracy binary     0.727     4  0.0246 pre0_mod05_post0
5 0.0000000316    1    accuracy binary     0.727     4  0.0246 pre0_mod10_post0

show_best(logreg_tune, metric = "precision")

# A tibble: 5 × 8
       penalty mixture .metric   .estimator  mean     n std_err .config         
         <dbl>   <dbl> <chr>     <chr>      <dbl> <int>   <dbl> <chr>           
1 1               0    precision binary     0.958     4  0.0241 pre0_mod21_post0
2 0.00316         0.25 precision binary     0.668     4  0.0250 pre0_mod17_post0
3 0.00316         0.5  precision binary     0.668     4  0.0250 pre0_mod18_post0
4 0.0000000001    0.25 precision binary     0.665     4  0.0301 pre0_mod02_post0
5 0.0000000316    0.25 precision binary     0.665     4  0.0301 pre0_mod07_post0

show_best(logreg_tune, metric = "roc_auc")

# A tibble: 5 × 8
       penalty mixture .metric .estimator  mean     n std_err .config         
         <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
1 0.00316            1 roc_auc binary     0.774     4 0.00825 pre0_mod20_post0
2 0.0000000001       0 roc_auc binary     0.774     4 0.0105  pre0_mod01_post0
3 0.0000000316       0 roc_auc binary     0.774     4 0.0105  pre0_mod06_post0
4 0.00001            0 roc_auc binary     0.774     4 0.0105  pre0_mod11_post0
5 0.00316            0 roc_auc binary     0.774     4 0.0105  pre0_mod16_post0

show_best(logreg_tune, metric = "npv")

# A tibble: 5 × 8
       penalty mixture .metric .estimator  mean     n std_err .config         
         <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
1 0.0000000001    0.25 npv     binary     0.748     4  0.0209 pre0_mod02_post0
2 0.0000000316    0.25 npv     binary     0.748     4  0.0209 pre0_mod07_post0
3 0.00001         0.25 npv     binary     0.748     4  0.0209 pre0_mod12_post0
4 0.0000000001    1    npv     binary     0.747     4  0.0224 pre0_mod05_post0
5 0.0000000316    1    npv     binary     0.747     4  0.0224 pre0_mod10_post0

# Finalize and train the best workflow model
lr_best_model <- workflow_logreg |> 
  finalize_workflow(lr_best_hp) |> 
  fit(m_train)
lr_best_model |> print()

══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps

• step_log()
• step_dummy()
• step_interact()
• step_normalize()

── Model ───────────────────────────────────────────────────────────────────────

Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~0.25) 

    Df  %Dev  Lambda
1    0  0.00 0.59490
2    3  0.66 0.54200
3    3  1.58 0.49390
4    6  2.86 0.45000
5    6  4.07 0.41000
6    7  5.37 0.37360
7    7  6.65 0.34040
8    8  7.97 0.31020
9    8  9.28 0.28260
10   8 10.51 0.25750
11   8 11.65 0.23460
12   8 12.72 0.21380
13   8 13.70 0.19480
14   9 14.62 0.17750
15  10 15.51 0.16170
16  10 16.37 0.14740
17  10 17.16 0.13430
18  10 17.89 0.12230
19  10 18.57 0.11150
20  11 19.19 0.10160
21  12 19.81 0.09254
22  13 20.38 0.08432
23  13 20.91 0.07683
24  13 21.40 0.07001
25  13 21.84 0.06379
26  14 22.25 0.05812
27  14 22.62 0.05296
28  14 22.96 0.04825
29  14 23.27 0.04397
30  15 23.56 0.04006
31  16 23.82 0.03650
32  17 24.07 0.03326
33  17 24.34 0.03030
34  17 24.58 0.02761
35  17 24.80 0.02516
36  17 25.01 0.02292
37  17 25.19 0.02089
38  17 25.36 0.01903
39  17 25.51 0.01734
40  17 25.65 0.01580
41  17 25.78 0.01440
42  17 25.89 0.01312
43  17 26.00 0.01195
44  17 26.09 0.01089
45  17 26.18 0.00992
46  17 26.26 0.00904

...
and 54 more lines.

lr_test <- augment(lr_best_model, m_test)
lr_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)
lr_test |> 
  lr_metrics(truth = final_outcome, 
              estimate = .pred_class,          # For accuracy, sens, spec
              `.pred_HIV Infected`)

# A tibble: 6 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary         0.7  
2 sensitivity binary         0.333
3 specificity binary         0.897
4 precision   binary         0.636
5 npv         binary         0.714
6 roc_auc     binary         0.777

Logistic Regression with glm

# Logistic regression using glm does not require normalization
# Recipe
reci_m <- recipe(final_outcome ~ ., data = m_train)  |>  
  step_log(mother_viral_load, base = 10, offset = 1) |> 
  step_dummy(all_nominal_predictors()) |> 
  step_interact(terms = ~ infant_weight:infant_age_at_birth) |> 
  step_smote(final_outcome, over_ratio = 1)
  

# Create model design
hiv_lr <- logistic_reg() |> 
  set_engine("glm") |> 
  set_mode("classification")

# Add recipe and model design to workflow
workflow_lr <- workflow() |> 
  add_recipe(reci_m) |> 
  add_model(hiv_lr) |> 
  fit(m_train)

Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

workflow_lr |> head()

$pre
$actions
$actions$recipe
$recipe

── Recipe ──────────────────────────────────────────────────────────────────────

── Inputs

Number of variables by role

outcome:    1
predictor: 15

── Operations

• Log transformation on: mother_viral_load

• Dummy variables from: all_nominal_predictors()

• Interactions with: infant_weight:infant_age_at_birth

• SMOTE based on: final_outcome


$blueprint

Recipe blueprint:

# Predictors: 0

# Outcomes: 0

Intercept: FALSE

Novel Levels: FALSE

Composition: tibble


attr(,"class")
[1] "action_recipe" "action_pre"    "action"       


$mold
$mold$predictors
# A tibble: 624 × 20
   infant_age_at_birth infant_weight mother_viral_load mother_age
                 <dbl>         <dbl>             <dbl>      <dbl>
 1                  38          3400              1.32         38
 2                  38          3300              1.32         33
 3                  40          4000              1.72         23
 4                  38          2200              1.32         47
 5                  38          2200              1.32         44
 6                  38          2500              1.32         33
 7                  38          3500              1.32         34
 8                  38          3400              1.32         34
 9                  38          3300              2.39         40
10                  38          3000              1.32         32
# ℹ 614 more rows
# ℹ 16 more variables: infant_sex_Male <dbl>,
#   delivery_type_Elective.cesarean <dbl>,
#   delivery_type_Emergency.cesarean <dbl>,
#   delivery_type_Standard.vaginal <dbl>,
#   mother_hiv_serostatus_HIV.1.positive <dbl>,
#   mother_hiv_serostatus_HIV.2.positive <dbl>, first_pcr_Positive <dbl>, …

$mold$outcomes
# A tibble: 624 × 1
   final_outcome
   <fct>        
 1 HIV Infected 
 2 HIV Infected 
 3 HIV Infected 
 4 HIV Infected 
 5 HIV Infected 
 6 HIV Infected 
 7 HIV Infected 
 8 HIV Infected 
 9 HIV Infected 
10 HIV Infected 
# ℹ 614 more rows

$mold$blueprint

Recipe blueprint:

# Predictors: 15

# Outcomes: 1

Intercept: FALSE

Novel Levels: FALSE

Composition: tibble


$mold$extras
$mold$extras$roles
NULL



$case_weights
NULL

attr(,"class")
[1] "stage_pre" "stage"    

$fit
$actions
$actions$model
$spec
Logistic Regression Model Specification (classification)

Computational engine: glm 


$formula
NULL

attr(,"class")
[1] "action_model" "action_fit"   "action"      


$fit
parsnip model object


Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)

Coefficients:
                         (Intercept)                   infant_age_at_birth  
                           5.188e+01                            -1.159e+00  
                       infant_weight                     mother_viral_load  
                          -1.528e-02                            -2.911e-01  
                          mother_age                       infant_sex_Male  
                           3.853e-03                             2.432e-01  
     delivery_type_Elective.cesarean      delivery_type_Emergency.cesarean  
                           2.513e-01                             1.080e+00  
      delivery_type_Standard.vaginal  mother_hiv_serostatus_HIV.1.positive  
                           3.387e-02                            -3.069e+00  
mother_hiv_serostatus_HIV.2.positive                    first_pcr_Positive  
                           2.001e+01                            -7.393e+01  
                 second_pcr_Positive       mother_received_intervetion_Yes  
                          -6.518e+02                            -3.563e+00  
     delivery_place_Outside.Facility   breast_feeding_method_Mixed.Feeding  
                          -1.016e+00                             1.006e+00  
                baby_prophylaxis_NVP         cotrimaxazole_for_mmother_Yes  
                           2.752e+00                             2.192e-01  
           education_level_Secondary              education_level_Tertiary  
                          -2.853e-01                            -2.590e-01  
 infant_weight_x_infant_age_at_birth  
                           3.668e-04  

Degrees of Freedom: 623 Total (i.e. Null);  603 Residual
Null Deviance:      865 
Residual Deviance: 613.1    AIC: 655.1

attr(,"class")
[1] "stage_fit" "stage"    

$post
$actions
named list()

$fit
NULL

attr(,"class")
[1] "stage_post" "stage"     

$trained
[1] TRUE

hiv_lr_test <- augment(workflow_lr, m_test)

lr_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)
hiv_lr_test |> 
  lr_metrics(truth = final_outcome, 
              estimate = .pred_class,          
              `.pred_HIV Infected`)

# A tibble: 6 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary         0.717
2 sensitivity binary         0.667
3 specificity binary         0.744
4 precision   binary         0.583
5 npv         binary         0.806
6 roc_auc     binary         0.772

Profile of expectant women living with HIV and prevention of vertical transmission in Sub-Saharan Africa: A machine learning approach (K Nearest Neighbour and Logistic Regression)

2026-03-13

Packages

Read file

Change variable names for readability

Convert character variables to factors

Models

Preprocessing

K Nearest Neighbour

Logistic Regression with glmnet

Logistic Regression with glm