library(pacman)
p_load(tidymodels, tidyverse, plotly, skimr, themis, readxl, janitor, xgboost, doParallel, probably)
msc <- read_csv("mscph.csv", col_types = cols(), na = "NA")
names(msc)
[1] "pep_id_sex" "pep_id_deliv_type"
[3] "pep_id_gest_age_birth_wks" "pep_id_hiv_serostatus_mo"
[5] "pep_id_birth_weight_grams" "mothers_recent_viral_load"
[7] "first_pcr_test_result" "second_pcr_test"
[9] "final_outcome_result" "mother_received_intervetion_y_n"
[11] "place_site_of_delivery" "brst_feeding_method"
[13] "type_of_avr_prophy_for_baby" "cotrimoxazole_administered_y_n"
[15] "age_mo" "education"
[17] "marital_st"
# Rename columns
msc <- msc |> rename(infant_sex = pep_id_sex,
delivery_type = pep_id_deliv_type,
infant_age_at_birth = pep_id_gest_age_birth_wks,
mother_hiv_serostatus = pep_id_hiv_serostatus_mo,
infant_weight = pep_id_birth_weight_grams,
mother_viral_load = mothers_recent_viral_load,
first_pcr = first_pcr_test_result,
second_pcr = second_pcr_test,
final_outcome = final_outcome_result,
mother_received_intervetion = mother_received_intervetion_y_n,
delivery_place = place_site_of_delivery,
breast_feeding_method = brst_feeding_method,
baby_prophylaxis = type_of_avr_prophy_for_baby,
cotrimaxazole_for_mmother = cotrimoxazole_administered_y_n,
mother_age = age_mo,
education_level = education,
marital_status = marital_st)
# To factors
msc_names <- c("infant_sex", "delivery_type", "mother_hiv_serostatus", "first_pcr", "second_pcr", "final_outcome", "mother_received_intervetion", "delivery_place", "breast_feeding_method", "baby_prophylaxis", "cotrimaxazole_for_mmother", "education_level", "marital_status")
msc <- msc |>
mutate(across(all_of(msc_names), factor))
str(msc)
tibble [600 × 17] (S3: tbl_df/tbl/data.frame)
$ infant_sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 1 2 2 1 1 1 ...
$ delivery_type : Factor w/ 4 levels "Assisted vaginal",..: 2 4 4 4 4 4 4 2 4 4 ...
$ infant_age_at_birth : num [1:600] 37 38 38 38 38 37 38 39 38 38 ...
$ mother_hiv_serostatus : Factor w/ 3 levels "HIV-1 & HIV-2 positive",..: 1 2 2 2 2 2 2 1 2 2 ...
$ infant_weight : num [1:600] 3000 2600 2600 2000 3400 2400 3200 2900 2100 3300 ...
$ mother_viral_load : num [1:600] 20 20 20 20 20 20.3 20 876 20 20 ...
$ first_pcr : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
$ second_pcr : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
$ final_outcome : Factor w/ 2 levels "HIV Infected",..: 2 2 2 2 1 2 2 2 2 1 ...
$ mother_received_intervetion: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
$ delivery_place : Factor w/ 2 levels "Inside Facility",..: 1 1 1 1 1 1 1 1 1 1 ...
$ breast_feeding_method : Factor w/ 2 levels "Exclusive","Mixed Feeding": 2 2 2 1 1 2 2 1 1 1 ...
$ baby_prophylaxis : Factor w/ 2 levels "AZT + NVP","NVP": 2 2 2 2 2 2 2 2 2 2 ...
$ cotrimaxazole_for_mmother : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
$ mother_age : num [1:600] 42 46 22 44 38 41 36 33 36 33 ...
$ education_level : Factor w/ 3 levels "Primary","Secondary",..: 3 3 2 2 2 2 2 2 3 2 ...
$ marital_status : Factor w/ 5 levels "Divorced","Married",..: 5 4 2 2 2 2 2 NA 4 2 ...
# Remove columns with NAs
new_msc <- msc |> select(-marital_status)
# Split data
set.seed(111)
data_split <- initial_split(new_msc, prop = .8, strata = final_outcome)
data_split
<Training/Testing/Total>
<480/120/600>
# Training and test data
m_train <- data_split |> training()
m_test <- data_split |> testing()
m_train |> glimpse()
Rows: 480
Columns: 16
$ infant_sex <fct> Female, Female, Male, Female, Female, Fema…
$ delivery_type <fct> Standard vaginal, Standard vaginal, Standa…
$ infant_age_at_birth <dbl> 38, 38, 40, 38, 38, 38, 38, 38, 38, 38, 38…
$ mother_hiv_serostatus <fct> HIV-1 positive, HIV-1 positive, HIV-1 posi…
$ infant_weight <dbl> 3400, 3300, 4000, 2200, 2200, 2500, 3500, …
$ mother_viral_load <dbl> 20, 20, 52, 20, 20, 20, 20, 20, 243, 20, 2…
$ first_pcr <fct> Negative, Negative, Negative, Positive, Po…
$ second_pcr <fct> Negative, Negative, Negative, Positive, Po…
$ final_outcome <fct> HIV Infected, HIV Infected, HIV Infected, …
$ mother_received_intervetion <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ delivery_place <fct> Inside Facility, Inside Facility, Inside F…
$ breast_feeding_method <fct> Exclusive, Exclusive, Mixed Feeding, Exclu…
$ baby_prophylaxis <fct> NVP, NVP, NVP, NVP, NVP, NVP, NVP, NVP, NV…
$ cotrimaxazole_for_mmother <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ mother_age <dbl> 38, 33, 23, 47, 44, 33, 34, 34, 40, 32, 40…
$ education_level <fct> Secondary, Secondary, Secondary, Tertiary,…
# Outcome levels
levels(m_train$final_outcome)
[1] "HIV Infected" "HIV Uninfected"
# Recipe
reci_m <- recipe(final_outcome ~ ., data = m_train) |>
step_log(mother_viral_load, base = 10, offset = 1) |>
step_dummy(all_nominal_predictors()) |>
step_interact(terms = ~ infant_weight:infant_age_at_birth) |>
step_normalize(all_numeric_predictors())
# Create model design
hiv_xg <- boost_tree(trees =2000, tree_depth = 3) |>
set_engine("xgboost") |>
set_mode("classification")
# Add recipe and model design to workflow
set.seed(222)
xg_workflow <- workflow() |>
add_recipe(reci_m) |>
add_model(hiv_xg) |>
fit(m_train)
# Predict
xg_test <- augment(xg_workflow, m_test)
# Metrics
xg_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)
xg_test |>
xg_metrics(truth = final_outcome,
estimate = .pred_class,
`.pred_HIV Infected`)
# A tibble: 6 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.775
2 sensitivity binary 0.690
3 specificity binary 0.821
4 precision binary 0.674
5 npv binary 0.831
6 roc_auc binary 0.817
# Create model design
hiv_xg <- boost_tree(trees = tune(), tree_depth = tune()) |>
set_engine("xgboost") |>
set_mode("classification")
# Add recipe and model design to workflow
set.seed(222)
xg_workflow <- workflow() |>
add_recipe(reci_m) |>
add_model(hiv_xg)
# Create a Hyper-Parameter Grid
set.seed(333)
xg_grid <- expand.grid(tree_depth=c(1, 2, 5, 10, 15),
trees = c(5, 10, 15, 50, 100))
# Step 6 - Create Resamples for Cross-Validation:
xg_fold <- m_train |>
vfold_cv(v = 10, strata = final_outcome)
# Do Parallel
doParallel::registerDoParallel()
# Tune and train
set.seed(333)
StartTime=Sys.time()
xg_tune <- tune_grid(xg_workflow, resamples = xg_fold,
grid = xg_grid, metrics = metric_set(accuracy))
RunTime=Sys.time()- StartTime
# Visualize tuning results
print("TUNING RESULTS:")
[1] "TUNING RESULTS:"
autoplot(xg_tune)
# Extract the Best Hyper-Parameters:
xg_best <- select_best(xg_tune, metric = "accuracy")
# Finalize and Train the Best Workflow Model:
set.seed(333)
xg_result <- finalize_workflow(xg_workflow, xg_best) |>
fit(m_train)
# Assess Prediction Quality Based on the Testing Data:
xg_aug <- augment(xg_result, new_data = m_test)
xg_aug |>
xg_metrics(truth = final_outcome,
estimate = .pred_class,
`.pred_HIV Infected`)
# A tibble: 6 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.783
2 sensitivity binary 0.714
3 specificity binary 0.821
4 precision binary 0.682
5 npv binary 0.842
6 roc_auc binary 0.864
# From predicted data
threshold_data <- xg_aug |>
threshold_perf(truth = final_outcome,
estimate = `.pred_HIV Infected`,
thresholds = seq(0, 1, by = 0.01))
# 2. Manual Calculation (Bypassing the pivot error)
sens_values <- threshold_data |> filter(.metric == "sensitivity") |> pull(.estimate)
spec_values <- threshold_data |> filter(.metric == "specificity") |> pull(.estimate)
thresholds <- threshold_data |> filter(.metric == "sensitivity") |> pull(.threshold)
# 3. Create a clean results table
youden_results <- tibble(
threshold = thresholds,
sensitivity = sens_values,
specificity = spec_values,
youden_index = sensitivity + specificity - 1
) |>
arrange(desc(youden_index))
# 4. Show the winner
best_row <- youden_results |> slice(1)
print(best_row)
# A tibble: 1 × 4
threshold sensitivity specificity youden_index
<dbl> <dbl> <dbl> <dbl>
1 0.21 0.905 0.705 0.610
# Map the winning probability back to the Viral Load units
optimal_cutoff <- xg_aug |>
mutate(diff = abs(`.pred_HIV Infected` - best_row$threshold)) |>
slice_min(diff, n = 1) |>
pull(mother_viral_load) # Ensure this matches your column name
cat("The optimal Maternal Viral Load threshold is:", optimal_cutoff, "ml/copies", sep = "")
The optimal Maternal Viral Load threshold is:20ml/copies