Packages
library(pacman)
p_load( tidymodels, tidyverse, plotly, skimr, themis, readxl, janitor, vip)
Read file
msc <- read_csv("mscph.csv", col_types = cols(), na = "NA")
names(msc)
[1] "pep_id_sex" "pep_id_deliv_type"
[3] "pep_id_gest_age_birth_wks" "pep_id_hiv_serostatus_mo"
[5] "pep_id_birth_weight_grams" "mothers_recent_viral_load"
[7] "first_pcr_test_result" "second_pcr_test"
[9] "final_outcome_result" "mother_received_intervetion_y_n"
[11] "place_site_of_delivery" "brst_feeding_method"
[13] "type_of_avr_prophy_for_baby" "cotrimoxazole_administered_y_n"
[15] "age_mo" "education"
[17] "marital_st"
Change variable names for readability
# Rename columns
msc <- msc |> rename(infant_sex = pep_id_sex,
delivery_type = pep_id_deliv_type,
infant_age_at_birth = pep_id_gest_age_birth_wks,
mother_hiv_serostatus = pep_id_hiv_serostatus_mo,
infant_weight = pep_id_birth_weight_grams,
mother_viral_load = mothers_recent_viral_load,
first_pcr = first_pcr_test_result,
second_pcr = second_pcr_test,
final_outcome = final_outcome_result,
mother_received_intervetion = mother_received_intervetion_y_n,
delivery_place = place_site_of_delivery,
breast_feeding_method = brst_feeding_method,
baby_prophylaxis = type_of_avr_prophy_for_baby,
cotrimaxazole_for_mmother = cotrimoxazole_administered_y_n,
mother_age = age_mo,
education_level = education,
marital_status = marital_st)
Convert character variables to factors
# To factors
msc_names <- c("infant_sex", "delivery_type", "mother_hiv_serostatus", "first_pcr", "second_pcr", "final_outcome", "mother_received_intervetion", "delivery_place", "breast_feeding_method", "baby_prophylaxis", "cotrimaxazole_for_mmother", "education_level", "marital_status")
msc <- msc |>
mutate(across(all_of(msc_names), factor))
str(msc)
tibble [600 × 17] (S3: tbl_df/tbl/data.frame)
$ infant_sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 1 2 2 1 1 1 ...
$ delivery_type : Factor w/ 4 levels "Assisted vaginal",..: 2 4 4 4 4 4 4 2 4 4 ...
$ infant_age_at_birth : num [1:600] 37 38 38 38 38 37 38 39 38 38 ...
$ mother_hiv_serostatus : Factor w/ 3 levels "HIV-1 & HIV-2 positive",..: 1 2 2 2 2 2 2 1 2 2 ...
$ infant_weight : num [1:600] 3000 2600 2600 2000 3400 2400 3200 2900 2100 3300 ...
$ mother_viral_load : num [1:600] 20 20 20 20 20 20.3 20 876 20 20 ...
$ first_pcr : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
$ second_pcr : Factor w/ 2 levels "Negative","Positive": 1 1 1 1 1 1 1 1 1 1 ...
$ final_outcome : Factor w/ 2 levels "HIV Infected",..: 2 2 2 2 1 2 2 2 2 1 ...
$ mother_received_intervetion: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
$ delivery_place : Factor w/ 2 levels "Inside Facility",..: 1 1 1 1 1 1 1 1 1 1 ...
$ breast_feeding_method : Factor w/ 2 levels "Exclusive","Mixed Feeding": 2 2 2 1 1 2 2 1 1 1 ...
$ baby_prophylaxis : Factor w/ 2 levels "AZT + NVP","NVP": 2 2 2 2 2 2 2 2 2 2 ...
$ cotrimaxazole_for_mmother : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
$ mother_age : num [1:600] 42 46 22 44 38 41 36 33 36 33 ...
$ education_level : Factor w/ 3 levels "Primary","Secondary",..: 3 3 2 2 2 2 2 2 3 2 ...
$ marital_status : Factor w/ 5 levels "Divorced","Married",..: 5 4 2 2 2 2 2 NA 4 2 ...
Models
Preprocessing
# Remove columns with NAs
new_msc <- msc |> select(-marital_status)
# Split data
set.seed(111)
data_split <- initial_split(new_msc, prop = .8, strata = final_outcome)
data_split
<Training/Testing/Total>
<480/120/600>
# Training and test data
m_train <- data_split |> training()
m_test <- data_split |> testing()
m_train |> glimpse()
Rows: 480
Columns: 16
$ infant_sex <fct> Female, Female, Male, Female, Female, Fema…
$ delivery_type <fct> Standard vaginal, Standard vaginal, Standa…
$ infant_age_at_birth <dbl> 38, 38, 40, 38, 38, 38, 38, 38, 38, 38, 38…
$ mother_hiv_serostatus <fct> HIV-1 positive, HIV-1 positive, HIV-1 posi…
$ infant_weight <dbl> 3400, 3300, 4000, 2200, 2200, 2500, 3500, …
$ mother_viral_load <dbl> 20, 20, 52, 20, 20, 20, 20, 20, 243, 20, 2…
$ first_pcr <fct> Negative, Negative, Negative, Positive, Po…
$ second_pcr <fct> Negative, Negative, Negative, Positive, Po…
$ final_outcome <fct> HIV Infected, HIV Infected, HIV Infected, …
$ mother_received_intervetion <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ delivery_place <fct> Inside Facility, Inside Facility, Inside F…
$ breast_feeding_method <fct> Exclusive, Exclusive, Mixed Feeding, Exclu…
$ baby_prophylaxis <fct> NVP, NVP, NVP, NVP, NVP, NVP, NVP, NVP, NV…
$ cotrimaxazole_for_mmother <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ mother_age <dbl> 38, 33, 23, 47, 44, 33, 34, 34, 40, 32, 40…
$ education_level <fct> Secondary, Secondary, Secondary, Tertiary,…
levels(m_train$final_outcome)
[1] "HIV Infected" "HIV Uninfected"
# Recipe
reci_m <- recipe(final_outcome ~ ., data = m_train) |>
step_log(mother_viral_load, base = 10, offset = 1) |>
step_dummy(all_nominal_predictors()) |>
step_interact(terms = ~ infant_weight:infant_age_at_birth) |>
step_normalize(all_numeric_predictors())
K Nearest Neighbour
# Create model design
hiv_knn <- nearest_neighbor(neighbors = tune(), weight_func = "optimal") |>
set_engine("kknn") |>
set_mode("classification")
hiv_knn |> head()
$args
$args$neighbors
<quosure>
expr: ^tune()
env: global
$args$weight_func
<quosure>
expr: ^"optimal"
env: empty
$args$dist_power
<quosure>
expr: ^NULL
env: empty
$eng_args
<list_of<quosure>>
named list()
$mode
[1] "classification"
$user_specified_mode
[1] TRUE
$method
NULL
$engine
[1] "kknn"
# Add recipe and model design to workflow
workflow_knn <- workflow() |>
add_recipe(reci_m) |>
add_model(hiv_knn)
workflow_knn |> head()
$pre
$actions
$actions$recipe
$recipe
── Recipe ──────────────────────────────────────────────────────────────────────
── Inputs
Number of variables by role
outcome: 1
predictor: 15
── Operations
• Log transformation on: mother_viral_load
• Dummy variables from: all_nominal_predictors()
• Interactions with: infant_weight:infant_age_at_birth
• Centering and scaling for: all_numeric_predictors()
$blueprint
NULL
attr(,"class")
[1] "action_recipe" "action_pre" "action"
$mold
NULL
$case_weights
NULL
attr(,"class")
[1] "stage_pre" "stage"
$fit
$actions
$actions$model
$spec
K-Nearest Neighbor Model Specification (classification)
Main Arguments:
neighbors = tune()
weight_func = optimal
Computational engine: kknn
$formula
NULL
attr(,"class")
[1] "action_model" "action_fit" "action"
$fit
NULL
attr(,"class")
[1] "stage_fit" "stage"
$post
$actions
named list()
$fit
NULL
attr(,"class")
[1] "stage_post" "stage"
$trained
[1] FALSE
# Create hyper-parameter grid
knn_hp_grid <- tibble(neighbors = c(1:10))
knn_hp_grid |> head()
# A tibble: 6 × 1
neighbors
<int>
1 1
2 2
3 3
4 4
5 5
6 6
# Resamples for cross validation
set.seed(222)
knn_fold <- vfold_cv(m_train, v = 4, strata = final_outcome)
knn_fold |> head()
# 4-fold cross-validation using stratification
# A tibble: 4 × 2
splits id
<list> <chr>
1 <split [360/120]> Fold1
2 <split [360/120]> Fold2
3 <split [360/120]> Fold3
4 <split [360/120]> Fold4
# Tune workflow and train models
knn_tune <- tune_grid(workflow_knn, resamples = knn_fold, grid = knn_hp_grid, metrics =
metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv))
autoplot(knn_tune)

# Extract the best hyper-parameter
knn_best_hp <- select_best(knn_tune, metric = "sensitivity")
knn_best_hp |> print()
# A tibble: 1 × 2
neighbors .config
<int> <chr>
1 1 pre0_mod01_post0
knn_best_hp <- select_best(knn_tune, metric = "accuracy")
knn_best_hp |> print()
# A tibble: 1 × 2
neighbors .config
<int> <chr>
1 1 pre0_mod01_post0
show_best(knn_tune, metric = "specificity")
# A tibble: 5 × 7
neighbors .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 6 specificity binary 0.833 4 0.0216 pre0_mod06_post0
2 7 specificity binary 0.833 4 0.0216 pre0_mod07_post0
3 5 specificity binary 0.830 4 0.0236 pre0_mod05_post0
4 8 specificity binary 0.827 4 0.0264 pre0_mod08_post0
5 1 specificity binary 0.817 4 0.0274 pre0_mod01_post0
show_best(knn_tune, metric = "sensitivity")
# A tibble: 5 × 7
neighbors .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 sensitivity binary 0.613 4 0.0355 pre0_mod01_post0
2 3 sensitivity binary 0.613 4 0.0355 pre0_mod03_post0
3 4 sensitivity binary 0.613 4 0.0355 pre0_mod04_post0
4 2 sensitivity binary 0.607 4 0.0395 pre0_mod02_post0
5 5 sensitivity binary 0.583 4 0.0461 pre0_mod05_post0
show_best(knn_tune, metric = "accuracy")
# A tibble: 5 × 7
neighbors .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 accuracy binary 0.746 4 0.0188 pre0_mod01_post0
2 3 accuracy binary 0.746 4 0.0188 pre0_mod03_post0
3 4 accuracy binary 0.746 4 0.0188 pre0_mod04_post0
4 2 accuracy binary 0.744 4 0.0188 pre0_mod02_post0
5 5 accuracy binary 0.744 4 0.0184 pre0_mod05_post0
show_best(knn_tune, metric = "precision")
# A tibble: 5 × 7
neighbors .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 6 precision binary 0.652 4 0.0351 pre0_mod06_post0
2 7 precision binary 0.652 4 0.0351 pre0_mod07_post0
3 5 precision binary 0.652 4 0.0313 pre0_mod05_post0
4 1 precision binary 0.648 4 0.0314 pre0_mod01_post0
5 3 precision binary 0.648 4 0.0314 pre0_mod03_post0
show_best(knn_tune, metric = "roc_auc")
# A tibble: 5 × 7
neighbors .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 10 roc_auc binary 0.790 4 0.0120 pre0_mod10_post0
2 9 roc_auc binary 0.788 4 0.0112 pre0_mod09_post0
3 8 roc_auc binary 0.786 4 0.0117 pre0_mod08_post0
4 7 roc_auc binary 0.785 4 0.0108 pre0_mod07_post0
5 6 roc_auc binary 0.782 4 0.0102 pre0_mod06_post0
show_best(knn_tune, metric = "npv")
# A tibble: 5 × 7
neighbors .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 npv binary 0.798 4 0.0151 pre0_mod01_post0
2 3 npv binary 0.798 4 0.0151 pre0_mod03_post0
3 4 npv binary 0.798 4 0.0151 pre0_mod04_post0
4 2 npv binary 0.795 4 0.0161 pre0_mod02_post0
5 5 npv binary 0.789 4 0.0177 pre0_mod05_post0
# Finalize and train the best workflow model
knn_best_model <- workflow_knn |>
finalize_workflow(knn_best_hp) |>
fit(m_train)
knn_best_model |> print()
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()
── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps
• step_log()
• step_dummy()
• step_interact()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Call:
kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(1L, data, 5), kernel = ~"optimal")
Type of response variable: nominal
Minimal misclassification: 0.2458333
Best kernel: optimal
Best k: 1
knn_test <- augment(knn_best_model, m_test)
knn_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)
knn_test |>
knn_metrics(truth = final_outcome,
estimate = .pred_class, # For accuracy, sens, spec
`.pred_HIV Infected`)
# A tibble: 6 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.733
2 sensitivity binary 0.667
3 specificity binary 0.769
4 precision binary 0.609
5 npv binary 0.811
6 roc_auc binary 0.718
#knn_metrics(knn_test, truth=final_outcome, estimate = .pred_class, starts_with(".pred_"))
#metrics(knn_test, truth = final_outcome, estimate = .pred)
#collect_metrics(knn_metrics)
Logistic Regression with glmnet
# Logistic regression does not require normalization
# Recipe
reci_m <- recipe(final_outcome ~ ., data = m_train) |>
step_log(mother_viral_load, base = 10, offset = 1) |>
step_dummy(all_nominal_predictors()) |>
step_interact(terms = ~ infant_weight:infant_age_at_birth) |>
step_normalize(all_numeric_predictors()) #|> # Required for glmnet!
#step_smote(final_outcome, over_ratio = 1)
# Create model design
hiv_logreg <- logistic_reg(penalty = tune(), mixture = tune()) |>
set_engine("glmnet") |>
set_mode("classification")
# Create a grid of values to try
logreg_grid <- grid_regular(penalty(), mixture(), levels = 5)
# Add recipe and model design to workflow
workflow_logreg <- workflow() |>
add_recipe(reci_m) |>
add_model(hiv_logreg)
workflow_logreg |> head()
$pre
$actions
$actions$recipe
$recipe
── Recipe ──────────────────────────────────────────────────────────────────────
── Inputs
Number of variables by role
outcome: 1
predictor: 15
── Operations
• Log transformation on: mother_viral_load
• Dummy variables from: all_nominal_predictors()
• Interactions with: infant_weight:infant_age_at_birth
• Centering and scaling for: all_numeric_predictors()
$blueprint
NULL
attr(,"class")
[1] "action_recipe" "action_pre" "action"
$mold
NULL
$case_weights
NULL
attr(,"class")
[1] "stage_pre" "stage"
$fit
$actions
$actions$model
$spec
Logistic Regression Model Specification (classification)
Main Arguments:
penalty = tune()
mixture = tune()
Computational engine: glmnet
$formula
NULL
attr(,"class")
[1] "action_model" "action_fit" "action"
$fit
NULL
attr(,"class")
[1] "stage_fit" "stage"
$post
$actions
named list()
$fit
NULL
attr(,"class")
[1] "stage_post" "stage"
$trained
[1] FALSE
# Resamples for cross validation
set.seed(333)
logreg_fold <- vfold_cv(m_train, v = 4, strata = final_outcome)
logreg_fold |> head()
# 4-fold cross-validation using stratification
# A tibble: 4 × 2
splits id
<list> <chr>
1 <split [360/120]> Fold1
2 <split [360/120]> Fold2
3 <split [360/120]> Fold3
4 <split [360/120]> Fold4
# Tune workflow and train models
logreg_tune <- tune_grid(workflow_logreg, resamples = logreg_fold, grid = logreg_grid, metrics = metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv))
→ A | warning: While computing binary `precision()`, no predicted events were detected (i.e.
`true_positive + false_positive = 0`).
Precision is undefined in this case, and `NA` will be returned.
Note that 42 true event(s) actually occurred for the problematic event level,
HIV Infected
There were issues with some computations A: x4There were issues with some computations A: x8There were issues with some computations A: x12There were issues with some computations A: x16There were issues with some computations A: x16
autoplot(logreg_tune)

# Extract the best hyper-parameter
lr_best_hp <- select_best(logreg_tune, metric = "sensitivity")
lr_best_hp |> print()
# A tibble: 1 × 3
penalty mixture .config
<dbl> <dbl> <chr>
1 0.0000000001 0.25 pre0_mod02_post0
lr_best_hp <- select_best(logreg_tune, metric = "accuracy")
lr_best_hp |> print()
# A tibble: 1 × 3
penalty mixture .config
<dbl> <dbl> <chr>
1 0.0000000001 0.25 pre0_mod02_post0
show_best(logreg_tune, metric = "specificity")
# A tibble: 5 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 0.25 specificity binary 1 4 0 pre0_mod22_post0
2 1 0.5 specificity binary 1 4 0 pre0_mod23_post0
3 1 0.75 specificity binary 1 4 0 pre0_mod24_post0
4 1 1 specificity binary 1 4 0 pre0_mod25_post0
5 1 0 specificity binary 0.994 4 0.00370 pre0_mod21_post0
show_best(logreg_tune, metric = "sensitivity")
# A tibble: 5 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 0.25 sensitivity binary 0.440 4 0.0648 pre0_mod02_po…
2 0.0000000001 0.5 sensitivity binary 0.440 4 0.0648 pre0_mod03_po…
3 0.0000000001 0.75 sensitivity binary 0.440 4 0.0648 pre0_mod04_po…
4 0.0000000001 1 sensitivity binary 0.440 4 0.0648 pre0_mod05_po…
5 0.0000000316 0.25 sensitivity binary 0.440 4 0.0648 pre0_mod07_po…
show_best(logreg_tune, metric = "accuracy")
# A tibble: 5 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 0.25 accuracy binary 0.729 4 0.0206 pre0_mod02_post0
2 0.0000000316 0.25 accuracy binary 0.729 4 0.0206 pre0_mod07_post0
3 0.00001 0.25 accuracy binary 0.729 4 0.0206 pre0_mod12_post0
4 0.0000000001 1 accuracy binary 0.727 4 0.0246 pre0_mod05_post0
5 0.0000000316 1 accuracy binary 0.727 4 0.0246 pre0_mod10_post0
show_best(logreg_tune, metric = "precision")
# A tibble: 5 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 0 precision binary 0.958 4 0.0241 pre0_mod21_post0
2 0.00316 0.25 precision binary 0.668 4 0.0250 pre0_mod17_post0
3 0.00316 0.5 precision binary 0.668 4 0.0250 pre0_mod18_post0
4 0.0000000001 0.25 precision binary 0.665 4 0.0301 pre0_mod02_post0
5 0.0000000316 0.25 precision binary 0.665 4 0.0301 pre0_mod07_post0
show_best(logreg_tune, metric = "roc_auc")
# A tibble: 5 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00316 1 roc_auc binary 0.774 4 0.00825 pre0_mod20_post0
2 0.0000000001 0 roc_auc binary 0.774 4 0.0105 pre0_mod01_post0
3 0.0000000316 0 roc_auc binary 0.774 4 0.0105 pre0_mod06_post0
4 0.00001 0 roc_auc binary 0.774 4 0.0105 pre0_mod11_post0
5 0.00316 0 roc_auc binary 0.774 4 0.0105 pre0_mod16_post0
show_best(logreg_tune, metric = "npv")
# A tibble: 5 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 0.25 npv binary 0.748 4 0.0209 pre0_mod02_post0
2 0.0000000316 0.25 npv binary 0.748 4 0.0209 pre0_mod07_post0
3 0.00001 0.25 npv binary 0.748 4 0.0209 pre0_mod12_post0
4 0.0000000001 1 npv binary 0.747 4 0.0224 pre0_mod05_post0
5 0.0000000316 1 npv binary 0.747 4 0.0224 pre0_mod10_post0
# Finalize and train the best workflow model
lr_best_model <- workflow_logreg |>
finalize_workflow(lr_best_hp) |>
fit(m_train)
lr_best_model |> print()
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps
• step_log()
• step_dummy()
• step_interact()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~0.25)
Df %Dev Lambda
1 0 0.00 0.59490
2 3 0.66 0.54200
3 3 1.58 0.49390
4 6 2.86 0.45000
5 6 4.07 0.41000
6 7 5.37 0.37360
7 7 6.65 0.34040
8 8 7.97 0.31020
9 8 9.28 0.28260
10 8 10.51 0.25750
11 8 11.65 0.23460
12 8 12.72 0.21380
13 8 13.70 0.19480
14 9 14.62 0.17750
15 10 15.51 0.16170
16 10 16.37 0.14740
17 10 17.16 0.13430
18 10 17.89 0.12230
19 10 18.57 0.11150
20 11 19.19 0.10160
21 12 19.81 0.09254
22 13 20.38 0.08432
23 13 20.91 0.07683
24 13 21.40 0.07001
25 13 21.84 0.06379
26 14 22.25 0.05812
27 14 22.62 0.05296
28 14 22.96 0.04825
29 14 23.27 0.04397
30 15 23.56 0.04006
31 16 23.82 0.03650
32 17 24.07 0.03326
33 17 24.34 0.03030
34 17 24.58 0.02761
35 17 24.80 0.02516
36 17 25.01 0.02292
37 17 25.19 0.02089
38 17 25.36 0.01903
39 17 25.51 0.01734
40 17 25.65 0.01580
41 17 25.78 0.01440
42 17 25.89 0.01312
43 17 26.00 0.01195
44 17 26.09 0.01089
45 17 26.18 0.00992
46 17 26.26 0.00904
...
and 54 more lines.
lr_test <- augment(lr_best_model, m_test)
lr_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)
lr_test |>
lr_metrics(truth = final_outcome,
estimate = .pred_class, # For accuracy, sens, spec
`.pred_HIV Infected`)
# A tibble: 6 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.7
2 sensitivity binary 0.333
3 specificity binary 0.897
4 precision binary 0.636
5 npv binary 0.714
6 roc_auc binary 0.777
Logistic Regression with glm
# Logistic regression using glm does not require normalization
# Recipe
reci_m <- recipe(final_outcome ~ ., data = m_train) |>
step_log(mother_viral_load, base = 10, offset = 1) |>
step_dummy(all_nominal_predictors()) |>
step_interact(terms = ~ infant_weight:infant_age_at_birth) |>
step_smote(final_outcome, over_ratio = 1)
# Create model design
hiv_lr <- logistic_reg() |>
set_engine("glm") |>
set_mode("classification")
# Add recipe and model design to workflow
workflow_lr <- workflow() |>
add_recipe(reci_m) |>
add_model(hiv_lr) |>
fit(m_train)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
workflow_lr |> head()
$pre
$actions
$actions$recipe
$recipe
── Recipe ──────────────────────────────────────────────────────────────────────
── Inputs
Number of variables by role
outcome: 1
predictor: 15
── Operations
• Log transformation on: mother_viral_load
• Dummy variables from: all_nominal_predictors()
• Interactions with: infant_weight:infant_age_at_birth
• SMOTE based on: final_outcome
$blueprint
Recipe blueprint:
# Predictors: 0
# Outcomes: 0
Intercept: FALSE
Novel Levels: FALSE
Composition: tibble
attr(,"class")
[1] "action_recipe" "action_pre" "action"
$mold
$mold$predictors
# A tibble: 624 × 20
infant_age_at_birth infant_weight mother_viral_load mother_age
<dbl> <dbl> <dbl> <dbl>
1 38 3400 1.32 38
2 38 3300 1.32 33
3 40 4000 1.72 23
4 38 2200 1.32 47
5 38 2200 1.32 44
6 38 2500 1.32 33
7 38 3500 1.32 34
8 38 3400 1.32 34
9 38 3300 2.39 40
10 38 3000 1.32 32
# ℹ 614 more rows
# ℹ 16 more variables: infant_sex_Male <dbl>,
# delivery_type_Elective.cesarean <dbl>,
# delivery_type_Emergency.cesarean <dbl>,
# delivery_type_Standard.vaginal <dbl>,
# mother_hiv_serostatus_HIV.1.positive <dbl>,
# mother_hiv_serostatus_HIV.2.positive <dbl>, first_pcr_Positive <dbl>, …
$mold$outcomes
# A tibble: 624 × 1
final_outcome
<fct>
1 HIV Infected
2 HIV Infected
3 HIV Infected
4 HIV Infected
5 HIV Infected
6 HIV Infected
7 HIV Infected
8 HIV Infected
9 HIV Infected
10 HIV Infected
# ℹ 614 more rows
$mold$blueprint
Recipe blueprint:
# Predictors: 15
# Outcomes: 1
Intercept: FALSE
Novel Levels: FALSE
Composition: tibble
$mold$extras
$mold$extras$roles
NULL
$case_weights
NULL
attr(,"class")
[1] "stage_pre" "stage"
$fit
$actions
$actions$model
$spec
Logistic Regression Model Specification (classification)
Computational engine: glm
$formula
NULL
attr(,"class")
[1] "action_model" "action_fit" "action"
$fit
parsnip model object
Call: stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
Coefficients:
(Intercept) infant_age_at_birth
5.188e+01 -1.159e+00
infant_weight mother_viral_load
-1.528e-02 -2.911e-01
mother_age infant_sex_Male
3.853e-03 2.432e-01
delivery_type_Elective.cesarean delivery_type_Emergency.cesarean
2.513e-01 1.080e+00
delivery_type_Standard.vaginal mother_hiv_serostatus_HIV.1.positive
3.387e-02 -3.069e+00
mother_hiv_serostatus_HIV.2.positive first_pcr_Positive
2.001e+01 -7.393e+01
second_pcr_Positive mother_received_intervetion_Yes
-6.518e+02 -3.563e+00
delivery_place_Outside.Facility breast_feeding_method_Mixed.Feeding
-1.016e+00 1.006e+00
baby_prophylaxis_NVP cotrimaxazole_for_mmother_Yes
2.752e+00 2.192e-01
education_level_Secondary education_level_Tertiary
-2.853e-01 -2.590e-01
infant_weight_x_infant_age_at_birth
3.668e-04
Degrees of Freedom: 623 Total (i.e. Null); 603 Residual
Null Deviance: 865
Residual Deviance: 613.1 AIC: 655.1
attr(,"class")
[1] "stage_fit" "stage"
$post
$actions
named list()
$fit
NULL
attr(,"class")
[1] "stage_post" "stage"
$trained
[1] TRUE
hiv_lr_test <- augment(workflow_lr, m_test)
lr_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)
hiv_lr_test |>
lr_metrics(truth = final_outcome,
estimate = .pred_class,
`.pred_HIV Infected`)
# A tibble: 6 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.717
2 sensitivity binary 0.667
3 specificity binary 0.744
4 precision binary 0.583
5 npv binary 0.806
6 roc_auc binary 0.772