Load packages

library(pacman)
p_load(readxl, skimr, janitor, tidyverse, AMR, plotly, themis, tidymodels)
hpv <- read_xlsx("Modelling_HPV-1.xlsx", sheet = "Sheet2", col_names = TRUE, na = "NA")
hpv |> skim()
Data summary
Name hpv
Number of rows 372
Number of columns 24
_______________________
Column type frequency:
character 17
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
UI 0 1.00 4 4 0 372 0
Visit_type 0 1.00 17 17 0 2 0
Marital_status 5 0.99 6 18 0 4 0
Employment_status 2 0.99 8 22 0 3 0
Education_level 2 0.99 7 18 0 3 0
Contraceptive_use 2 0.99 2 3 0 2 0
Contraceptive_type 107 0.71 4 19 0 7 0
First_sex_age 5 0.99 11 14 0 4 0
Morethan_one_sexual_patner 4 0.99 2 3 0 2 0
Had_STI 11 0.97 2 3 0 2 0
Family_cervical_cancer 6 0.98 2 3 0 2 0
Smoke_cigarretes 3 0.99 2 3 0 2 0
Alcohol_use 0 1.00 5 12 0 3 0
HIV_serostatus 6 0.98 8 12 0 2 0
Hpv_detected 2 0.99 2 3 0 2 0
HPV 0 1.00 3 6 0 20 0
Predisposition to Risk 208 0.44 4 19 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1.00 31.01 6.84 16.00 25.00 31.00 36.00 44.00 ▂▇▇▇▅
No_of_children 0 1.00 2.35 1.58 0.00 1.00 2.00 3.00 8.00 ▆▇▂▁▁
BG concentration 1 1.00 38.37 58.19 0.02 4.98 16.30 45.80 421.48 ▇▁▁▁▁
Cells 1 1.00 2906.84 4408.58 1.80 377.18 1234.68 3469.94 31930.63 ▇▁▁▁▁
Cq-HPV 207 0.44 23.72 5.70 10.65 19.77 23.61 27.80 38.52 ▂▆▇▅▁
HPV concentration 207 0.44 102143.24 561427.32 0.01 12.65 260.67 4200.99 5624444.87 ▇▁▁▁▁
HPV viral load (copies per 10^3 cells) 207 0.44 58109.56 300671.88 0.00 11.46 225.25 5685.23 3311231.27 ▇▁▁▁▁

Missing values

You can also embed plots, for example:

# Check column names before cleaning
hpv |> 
  colnames()
 [1] "UI"                                    
 [2] "Visit_type"                            
 [3] "Age"                                   
 [4] "Marital_status"                        
 [5] "No_of_children"                        
 [6] "Employment_status"                     
 [7] "Education_level"                       
 [8] "Contraceptive_use"                     
 [9] "Contraceptive_type"                    
[10] "First_sex_age"                         
[11] "Morethan_one_sexual_patner"            
[12] "Had_STI"                               
[13] "Family_cervical_cancer"                
[14] "Smoke_cigarretes"                      
[15] "Alcohol_use"                           
[16] "HIV_serostatus"                        
[17] "Hpv_detected"                          
[18] "BG concentration"                      
[19] "Cells"                                 
[20] "HPV"                                   
[21] "Cq-HPV"                                
[22] "HPV concentration"                     
[23] "HPV viral load (copies per 10^3 cells)"
[24] "Predisposition to Risk"                
# Clean names
hpv <- hpv |> 
  clean_names()

# Check column names after
hpv |> 
  colnames()
 [1] "ui"                                  
 [2] "visit_type"                          
 [3] "age"                                 
 [4] "marital_status"                      
 [5] "no_of_children"                      
 [6] "employment_status"                   
 [7] "education_level"                     
 [8] "contraceptive_use"                   
 [9] "contraceptive_type"                  
[10] "first_sex_age"                       
[11] "morethan_one_sexual_patner"          
[12] "had_sti"                             
[13] "family_cervical_cancer"              
[14] "smoke_cigarretes"                    
[15] "alcohol_use"                         
[16] "hiv_serostatus"                      
[17] "hpv_detected"                        
[18] "bg_concentration"                    
[19] "cells"                               
[20] "hpv"                                 
[21] "cq_hpv"                              
[22] "hpv_concentration"                   
[23] "hpv_viral_load_copies_per_10_3_cells"
[24] "predisposition_to_risk"              
# Check the class of each column
hpv |> 
  lapply(class)
$ui
[1] "character"

$visit_type
[1] "character"

$age
[1] "numeric"

$marital_status
[1] "character"

$no_of_children
[1] "numeric"

$employment_status
[1] "character"

$education_level
[1] "character"

$contraceptive_use
[1] "character"

$contraceptive_type
[1] "character"

$first_sex_age
[1] "character"

$morethan_one_sexual_patner
[1] "character"

$had_sti
[1] "character"

$family_cervical_cancer
[1] "character"

$smoke_cigarretes
[1] "character"

$alcohol_use
[1] "character"

$hiv_serostatus
[1] "character"

$hpv_detected
[1] "character"

$bg_concentration
[1] "numeric"

$cells
[1] "numeric"

$hpv
[1] "character"

$cq_hpv
[1] "numeric"

$hpv_concentration
[1] "numeric"

$hpv_viral_load_copies_per_10_3_cells
[1] "numeric"

$predisposition_to_risk
[1] "character"
#unlist(lapply(RheumArth, class))


# Turn blank to NAs in contraceptive_type
hpv <- hpv |> mutate(contraceptive_type = na_if(contraceptive_type, ""))

# Replace NAs
hpv <- hpv |> mutate(contraceptive_type = replace_na(contraceptive_type, "none"))

#For contraceptive_use
hpv <- hpv |> mutate(contraceptive_use = na_if(contraceptive_use, ""))

# Replace NAs
hpv <- hpv |> mutate(contraceptive_use = replace_na(contraceptive_use, "No"))

#For education_level
hpv <- hpv |> mutate(education_level = na_if(education_level, ""))

# Replace NAs
hpv <- hpv |> mutate(education_level = replace_na(education_level, "Unable to read/Write"))

# Replace NAs in viral load with zero
hpv <- hpv |> 
  mutate(hpv_viral_load_copies_per_10_3_cells = ifelse(is.na(hpv_viral_load_copies_per_10_3_cells), 0, hpv_viral_load_copies_per_10_3_cells))
head(hpv)
# A tibble: 6 × 24
  ui    visit_type          age marital_status no_of_children employment_status 
  <chr> <chr>             <dbl> <chr>                   <dbl> <chr>             
1 K001  Routine screening    37 Married                     5 Self-employed/Bus…
2 K002  Routine screening    41 Married                     4 Self-employed/Bus…
3 K003  Routine screening    33 Married                     2 Employed          
4 K004  Routine screening    33 Married                     4 Employed          
5 K005  Routine screening    28 Single                      2 Employed          
6 K006  Routine screening    44 Widowed                     1 Self-employed/Bus…
# ℹ 18 more variables: education_level <chr>, contraceptive_use <chr>,
#   contraceptive_type <chr>, first_sex_age <chr>,
#   morethan_one_sexual_patner <chr>, had_sti <chr>,
#   family_cervical_cancer <chr>, smoke_cigarretes <chr>, alcohol_use <chr>,
#   hiv_serostatus <chr>, hpv_detected <chr>, bg_concentration <dbl>,
#   cells <dbl>, hpv <chr>, cq_hpv <dbl>, hpv_concentration <dbl>,
#   hpv_viral_load_copies_per_10_3_cells <dbl>, predisposition_to_risk <chr>
dim(hpv)
[1] 372  24
# Remove unwanted columns
hpv <- hpv |> select(-c(ui, cells, hpv, cq_hpv, hpv_concentration, predisposition_to_risk))

hpv |> colnames()
 [1] "visit_type"                          
 [2] "age"                                 
 [3] "marital_status"                      
 [4] "no_of_children"                      
 [5] "employment_status"                   
 [6] "education_level"                     
 [7] "contraceptive_use"                   
 [8] "contraceptive_type"                  
 [9] "first_sex_age"                       
[10] "morethan_one_sexual_patner"          
[11] "had_sti"                             
[12] "family_cervical_cancer"              
[13] "smoke_cigarretes"                    
[14] "alcohol_use"                         
[15] "hiv_serostatus"                      
[16] "hpv_detected"                        
[17] "bg_concentration"                    
[18] "hpv_viral_load_copies_per_10_3_cells"

Cleansing

skim(hpv)
Data summary
Name hpv
Number of rows 372
Number of columns 18
_______________________
Column type frequency:
character 14
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
visit_type 0 1.00 17 17 0 2 0
marital_status 5 0.99 6 18 0 4 0
employment_status 2 0.99 8 22 0 3 0
education_level 0 1.00 7 20 0 4 0
contraceptive_use 0 1.00 2 3 0 2 0
contraceptive_type 0 1.00 4 19 0 8 0
first_sex_age 5 0.99 11 14 0 4 0
morethan_one_sexual_patner 4 0.99 2 3 0 2 0
had_sti 11 0.97 2 3 0 2 0
family_cervical_cancer 6 0.98 2 3 0 2 0
smoke_cigarretes 3 0.99 2 3 0 2 0
alcohol_use 0 1.00 5 12 0 3 0
hiv_serostatus 6 0.98 8 12 0 2 0
hpv_detected 2 0.99 2 3 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 31.01 6.84 16.00 25.00 31.0 36.00 44.00 ▂▇▇▇▅
no_of_children 0 1 2.35 1.58 0.00 1.00 2.0 3.00 8.00 ▆▇▂▁▁
bg_concentration 1 1 38.37 58.19 0.02 4.98 16.3 45.80 421.48 ▇▁▁▁▁
hpv_viral_load_copies_per_10_3_cells 0 1 25774.40 201986.22 0.00 0.00 0.0 132.82 3311231.27 ▇▁▁▁▁
hpv |> select(marital_status) |> unique()
# A tibble: 5 × 1
  marital_status    
  <chr>             
1 Married           
2 Single            
3 Widowed           
4 Divorced/Separated
5 <NA>              
hpv |> select(employment_status) |> unique()
# A tibble: 4 × 1
  employment_status     
  <chr>                 
1 Self-employed/Business
2 Employed              
3 Unemployed            
4 <NA>                  
hpv |> select(first_sex_age) |> unique()
# A tibble: 5 × 1
  first_sex_age 
  <chr>         
1 Below 15 years
2 15-19 years   
3 20-25 years   
4 Above 25 years
5 <NA>          
hpv |> select(morethan_one_sexual_patner) |> unique()
# A tibble: 3 × 1
  morethan_one_sexual_patner
  <chr>                     
1 Yes                       
2 No                        
3 <NA>                      
hpv |> select(had_sti) |> unique()
# A tibble: 3 × 1
  had_sti
  <chr>  
1 No     
2 Yes    
3 <NA>   
hpv |> select(family_cervical_cancer) |> unique()
# A tibble: 3 × 1
  family_cervical_cancer
  <chr>                 
1 Yes                   
2 No                    
3 <NA>                  
hpv |> select(smoke_cigarretes) |> unique()
# A tibble: 3 × 1
  smoke_cigarretes
  <chr>           
1 No              
2 Yes             
3 <NA>            
hpv |> select(hiv_serostatus) |> unique()
# A tibble: 3 × 1
  hiv_serostatus
  <chr>         
1 Reactive      
2 Non-Reactive  
3 <NA>          
hpv |> select(hpv_detected) |> unique()
# A tibble: 3 × 1
  hpv_detected
  <chr>       
1 Yes         
2 No          
3 <NA>        
# Repalce NAs

hpv <- hpv |> mutate(marital_status = replace_na(marital_status, "Married"))
hpv <- hpv |> mutate(employment_status = replace_na(employment_status, "Unemployed"))
hpv <- hpv |> mutate(first_sex_age = replace_na(first_sex_age, "Below 15 years"))
hpv <- hpv |> mutate(morethan_one_sexual_patner = replace_na(morethan_one_sexual_patner, "Yes"))
hpv <- hpv |> mutate(had_sti = replace_na(had_sti, "No"))
hpv <- hpv |> mutate(family_cervical_cancer = replace_na(family_cervical_cancer, "No"))
hpv <- hpv |> mutate(smoke_cigarretes = replace_na(smoke_cigarretes, "No"))
hpv <- hpv |> mutate(hiv_serostatus = replace_na(hiv_serostatus, "Non-Reactive"))
hpv <- hpv |> mutate(hpv_detected = replace_na(hpv_detected, "No"))

skim(hpv)
Data summary
Name hpv
Number of rows 372
Number of columns 18
_______________________
Column type frequency:
character 14
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
visit_type 0 1 17 17 0 2 0
marital_status 0 1 6 18 0 4 0
employment_status 0 1 8 22 0 3 0
education_level 0 1 7 20 0 4 0
contraceptive_use 0 1 2 3 0 2 0
contraceptive_type 0 1 4 19 0 8 0
first_sex_age 0 1 11 14 0 4 0
morethan_one_sexual_patner 0 1 2 3 0 2 0
had_sti 0 1 2 3 0 2 0
family_cervical_cancer 0 1 2 3 0 2 0
smoke_cigarretes 0 1 2 3 0 2 0
alcohol_use 0 1 5 12 0 3 0
hiv_serostatus 0 1 8 12 0 2 0
hpv_detected 0 1 2 3 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 31.01 6.84 16.00 25.00 31.0 36.00 44.00 ▂▇▇▇▅
no_of_children 0 1 2.35 1.58 0.00 1.00 2.0 3.00 8.00 ▆▇▂▁▁
bg_concentration 1 1 38.37 58.19 0.02 4.98 16.3 45.80 421.48 ▇▁▁▁▁
hpv_viral_load_copies_per_10_3_cells 0 1 25774.40 201986.22 0.00 0.00 0.0 132.82 3311231.27 ▇▁▁▁▁
# To factors

hpv_names <- c("visit_type", "marital_status", "employment_status", "education_level", "contraceptive_use", "contraceptive_type", "first_sex_age", "morethan_one_sexual_patner", "had_sti", "family_cervical_cancer", "smoke_cigarretes", "alcohol_use", "hiv_serostatus", "hpv_detected")

hpv <- hpv |> 
  mutate(across(all_of(hpv_names), factor))


str(hpv)
tibble [372 × 18] (S3: tbl_df/tbl/data.frame)
 $ visit_type                          : Factor w/ 2 levels "Initial screening",..: 2 2 2 2 2 2 2 2 2 1 ...
 $ age                                 : num [1:372] 37 41 33 33 28 44 25 43 36 43 ...
 $ marital_status                      : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 2 3 4 2 4 2 4 ...
 $ no_of_children                      : num [1:372] 5 4 2 4 2 1 2 3 1 2 ...
 $ employment_status                   : Factor w/ 3 levels "Employed","Self-employed/Business",..: 2 2 1 1 1 2 3 2 2 2 ...
 $ education_level                     : Factor w/ 4 levels "College/University",..: 2 3 2 1 3 2 1 1 2 2 ...
 $ contraceptive_use                   : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 1 2 2 2 ...
 $ contraceptive_type                  : Factor w/ 8 levels "Condom","Depo",..: 2 3 2 3 2 6 6 1 3 2 ...
 $ first_sex_age                       : Factor w/ 4 levels "15-19 years",..: 4 1 1 2 4 1 4 2 3 1 ...
 $ morethan_one_sexual_patner          : Factor w/ 2 levels "No","Yes": 2 1 2 1 2 1 2 2 2 2 ...
 $ had_sti                             : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 1 ...
 $ family_cervical_cancer              : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
 $ smoke_cigarretes                    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ alcohol_use                         : Factor w/ 3 levels "Never","Occasionally",..: 2 1 1 1 1 1 1 1 1 2 ...
 $ hiv_serostatus                      : Factor w/ 2 levels "Non-Reactive",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ hpv_detected                        : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
 $ bg_concentration                    : num [1:372] 66.08 126.15 7.49 198.47 79.43 ...
 $ hpv_viral_load_copies_per_10_3_cells: num [1:372] 1.9 65.2 0 0 3403.7 ...

Exploration

lapply(hpv, function(x) {

if (is.numeric(x)) return(summary(x))

if (is.factor(x)) return(table(x))

})
$visit_type
x
Initial screening Routine screening 
              195               177 

$age
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  16.00   25.00   31.00   31.01   36.00   44.00 

$marital_status
x
Divorced/Separated            Married             Single            Widowed 
                23                259                 72                 18 

$no_of_children
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   1.000   2.000   2.352   3.000   8.000 

$employment_status
x
              Employed Self-employed/Business             Unemployed 
                    70                    198                    104 

$education_level
x
  College/University           Highschool              Primary 
                  99                  136                  135 
Unable to read/Write 
                   2 

$contraceptive_use
x
 No Yes 
108 264 

$contraceptive_type
x
             Condom                Depo             Implant      Implant Condom 
                 32                 101                  85                   1 
               IUCD                none Oral Contraceptives    Tubal Litigation 
                 21                 107                  23                   2 

$first_sex_age
x
   15-19 years    20-25 years Above 25 years Below 15 years 
           241             71              5             55 

$morethan_one_sexual_patner
x
 No Yes 
 85 287 

$had_sti
x
 No Yes 
294  78 

$family_cervical_cancer
x
 No Yes 
354  18 

$smoke_cigarretes
x
 No Yes 
365   7 

$alcohol_use
x
       Never Occasionally      Usually 
         325           45            2 

$hiv_serostatus
x
Non-Reactive     Reactive 
         186          186 

$hpv_detected
x
 No Yes 
210 162 

$bg_concentration
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
  0.02378   4.97883  16.29782  38.37031  45.80327 421.48427         1 

$hpv_viral_load_copies_per_10_3_cells
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
      0.0       0.0       0.0   25774.4     132.8 3311231.3 

Graphics

# First sex
hpv |> 
  select(first_sex_age) |> 
  count(first_sex_age) |> 
  ggplot(aes(x = reorder(first_sex_age, n), y = n, fill = first_sex_age)) + geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, vjust = -0.5)) +
  theme_minimal() +
  theme(axis.title.y = element_text(angle = 0)) +
  labs(title = "Age at first intercourse")

# Contrceptives
hpv |> 
  select(contraceptive_type) |> 
  count(contraceptive_type) |> 
  ggplot(aes(x = reorder(contraceptive_type, n), y = n, fill = contraceptive_type)) + geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, vjust = -0.5)) +
  theme_minimal() +
  theme(axis.title.y = element_text(angle = 0)) +
  labs(title = "Contraceptive types")

# Education
hpv |> 
  select(education_level) |> 
  count(education_level) |> 
  ggplot(aes(x = reorder(education_level, n), y = n, fill = education_level)) + geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, vjust = -0.5)) +
  theme_minimal() +
  theme(axis.title.y = element_text(angle = 0)) +
  labs(title = "Levels of education")

# Marital status
hpv |> 
  select(marital_status) |> 
  count(marital_status) |> 
  ggplot(aes(x = reorder(marital_status, n), y = n, fill = marital_status)) + geom_col(show.legend = FALSE) +
  geom_text(aes(label = n, vjust = -0.5)) +
  theme_minimal() +
  theme(axis.title.y = element_text(angle = 0)) +
  labs(title = "Marital status")

Modeling with Logistic Regression

# Preprocess
set.seed(123)
hpv_split <- initial_split(hpv, prop = .8, strata = hpv_detected)
hpv_split
## <Training/Testing/Total>
## <297/75/372>

Training and Test Splits

# Training and test data
hpv_train <- hpv_split |>  training()
hpv_test <- hpv_split |>  testing()

# Recipe
hpv_m <- recipe(hpv_detected ~ ., data = hpv_train) |>  
  step_rose(hpv_detected)  
  
  

# Prep and juice for test
hpv_prep <- hpv_m |>  prep()
hpv_juice <- hpv_prep |>  juice()

Logistic Regression

# Set engine
lr <- logistic_reg() |>  
  set_engine(engine = "glm") |> 
  set_mode("classification")

# Workflow
lr_wf <- workflow() |> 
  add_recipe(hpv_m) |>  
  add_model(lr)

# Cross validation
set.seed(456)
m_folds <- vfold_cv(hpv_train)

# Set metrics
ev_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)

# Evaluate model
doParallel::registerDoParallel()
set.seed(789)

lr_rs <- fit_resamples(
  lr_wf,
  resamples = m_folds,
  metrics = ev_metrics
)
## → A | error:   factor contraceptive_type has new levels Tubal Litigation
## There were issues with some computations   A: x1                                                 → B | error:   contrasts can be applied only to factors with 2 or more levels
## There were issues with some computations   A: x1There were issues with some computations   A: x1   B: x1                                                         → C | error:   factor education_level has new levels Unable to read/Write
## There were issues with some computations   A: x1   B: x1There were issues with some computations   A: x1   B: x1   C: x1There were issues with some computations   A: x1   B: x1   C: x1

Metrics

# Get metrics
collect_metrics(lr_rs)
## # A tibble: 6 × 6
##   .metric     .estimator  mean     n std_err .config        
##   <chr>       <chr>      <dbl> <int>   <dbl> <chr>          
## 1 accuracy    binary     0.486     7  0.0220 pre0_mod0_post0
## 2 npv         binary     0.420     7  0.0424 pre0_mod0_post0
## 3 precision   binary     0.554     7  0.0369 pre0_mod0_post0
## 4 roc_auc     binary     0.506     7  0.0307 pre0_mod0_post0
## 5 sensitivity binary     0.452     7  0.0546 pre0_mod0_post0
## 6 specificity binary     0.522     7  0.0707 pre0_mod0_post0
# Get summary
summary(lr_rs)
##  splits.Length  splits.Class  splits.Mode      id           
##  4            vfold_split  list           Length:10         
##  4            vfold_split  list           Class :character  
##  4            vfold_split  list           Mode  :character  
##  4            vfold_split  list                             
##  4            vfold_split  list                             
##  4            vfold_split  list                             
##  4            vfold_split  list                             
##  4            vfold_split  list                             
##  4            vfold_split  list                             
##  4            vfold_split  list                             
##  .metrics.Length  .metrics.Class  .metrics.Mode
##  4       tbl_df  list                          
##  4       tbl_df  list                          
##  0       -none-  NULL                          
##  4       tbl_df  list                          
##  4       tbl_df  list                          
##  0       -none-  NULL                          
##  4       tbl_df  list                          
##  4       tbl_df  list                          
##  4       tbl_df  list                          
##  0       -none-  NULL                          
##  .notes.Length  .notes.Class  .notes.Mode
##  4       tbl_df  list                    
##  4       tbl_df  list                    
##  4       tbl_df  list                    
##  4       tbl_df  list                    
##  4       tbl_df  list                    
##  4       tbl_df  list                    
##  4       tbl_df  list                    
##  4       tbl_df  list                    
##  4       tbl_df  list                    
##  4       tbl_df  list

Other Metrics

Logistic_m <- glm(
  hpv_detected ~ ., 
  data = hpv_train,
  family = "binomial"
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(Logistic_m)
## 
## Call:
## glm(formula = hpv_detected ~ ., family = "binomial", data = hpv_train)
## 
## Coefficients: (1 not defined because of singularities)
##                                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                              1.119e+00  1.543e+00   0.726  0.46811
## visit_typeRoutine screening             -3.796e-01  6.190e-01  -0.613  0.53975
## age                                     -7.053e-02  4.046e-02  -1.743  0.08126
## marital_statusMarried                   -1.718e+00  7.945e-01  -2.162  0.03061
## marital_statusSingle                    -8.326e-01  8.316e-01  -1.001  0.31676
## marital_statusWidowed                   -6.275e-01  1.082e+00  -0.580  0.56179
## no_of_children                          -2.802e-02  1.845e-01  -0.152  0.87925
## employment_statusSelf-employed/Business  2.427e-01  6.094e-01   0.398  0.69045
## employment_statusUnemployed              1.238e-01  6.623e-01   0.187  0.85173
## education_levelHighschool                1.298e-02  5.718e-01   0.023  0.98190
## education_levelPrimary                  -4.822e-01  6.428e-01  -0.750  0.45314
## education_levelUnable to read/Write     -1.722e+01  1.773e+04  -0.001  0.99922
## contraceptive_useYes                     7.316e-01  7.842e-01   0.933  0.35086
## contraceptive_typeDepo                   4.873e-01  6.978e-01   0.698  0.48495
## contraceptive_typeImplant               -1.035e+00  8.047e-01  -1.286  0.19838
## contraceptive_typeImplant Condom         1.733e+01  1.773e+04   0.001  0.99922
## contraceptive_typeIUCD                   3.925e-01  1.050e+00   0.374  0.70843
## contraceptive_typenone                          NA         NA      NA       NA
## contraceptive_typeOral Contraceptives    9.156e-02  1.035e+00   0.088  0.92952
## contraceptive_typeTubal Litigation      -1.715e+01  1.773e+04  -0.001  0.99923
## first_sex_age20-25 years                -4.028e-01  5.235e-01  -0.769  0.44163
## first_sex_ageAbove 25 years              1.874e+00  1.433e+00   1.308  0.19081
## first_sex_ageBelow 15 years              4.244e-01  6.156e-01   0.689  0.49053
## morethan_one_sexual_patnerYes           -1.297e-01  5.033e-01  -0.258  0.79668
## had_stiYes                               4.253e-01  4.751e-01   0.895  0.37066
## family_cervical_cancerYes                8.151e-01  9.001e-01   0.906  0.36515
## smoke_cigarretesYes                      8.688e-01  1.484e+00   0.585  0.55828
## alcohol_useOccasionally                 -7.296e-01  6.598e-01  -1.106  0.26881
## alcohol_useUsually                       1.426e+01  1.155e+04   0.001  0.99902
## hiv_serostatusReactive                   6.389e-01  6.785e-01   0.942  0.34632
## bg_concentration                         1.093e-02  3.532e-03   3.093  0.00198
## hpv_viral_load_copies_per_10_3_cells     3.472e-02  1.030e-02   3.371  0.00075
##                                            
## (Intercept)                                
## visit_typeRoutine screening                
## age                                     .  
## marital_statusMarried                   *  
## marital_statusSingle                       
## marital_statusWidowed                      
## no_of_children                             
## employment_statusSelf-employed/Business    
## employment_statusUnemployed                
## education_levelHighschool                  
## education_levelPrimary                     
## education_levelUnable to read/Write        
## contraceptive_useYes                       
## contraceptive_typeDepo                     
## contraceptive_typeImplant                  
## contraceptive_typeImplant Condom           
## contraceptive_typeIUCD                     
## contraceptive_typenone                     
## contraceptive_typeOral Contraceptives      
## contraceptive_typeTubal Litigation         
## first_sex_age20-25 years                   
## first_sex_ageAbove 25 years                
## first_sex_ageBelow 15 years                
## morethan_one_sexual_patnerYes              
## had_stiYes                                 
## family_cervical_cancerYes                  
## smoke_cigarretesYes                        
## alcohol_useOccasionally                    
## alcohol_useUsually                         
## hiv_serostatusReactive                     
## bg_concentration                        ** 
## hpv_viral_load_copies_per_10_3_cells    ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 405.45  on 295  degrees of freedom
## Residual deviance: 190.83  on 265  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 252.83
## 
## Number of Fisher Scoring iterations: 19
suppressWarnings(exp(cbind(Odds_ratio = coef(Logistic_m), confint(Logistic_m))))
## Waiting for profiling to be done...
##                                           Odds_ratio         2.5 %      97.5 %
## (Intercept)                             3.062983e+00  1.487053e-01  65.6276330
## visit_typeRoutine screening             6.841391e-01  2.001539e-01   2.3103482
## age                                     9.318973e-01  8.586981e-01   1.0072338
## marital_statusMarried                   1.794440e-01  3.711114e-02   0.8723728
## marital_statusSingle                    4.349323e-01  8.440324e-02   2.2809956
## marital_statusWidowed                   5.339104e-01  5.324712e-02   4.1612406
## no_of_children                          9.723655e-01  6.756950e-01   1.3988944
## employment_statusSelf-employed/Business 1.274691e+00  3.947499e-01   4.4048907
## employment_statusUnemployed             1.131786e+00  3.149256e-01   4.3160555
## education_levelHighschool               1.013059e+00  3.329960e-01   3.1899092
## education_levelPrimary                  6.174085e-01  1.758937e-01   2.2305915
## education_levelUnable to read/Write     3.307008e-08            NA         Inf
## contraceptive_useYes                    2.078383e+00  4.371859e-01   9.8169572
## contraceptive_typeDepo                  1.627931e+00  4.317220e-01   6.8590228
## contraceptive_typeImplant               3.552272e-01  7.080944e-02   1.7359335
## contraceptive_typeImplant Condom        3.355238e+07  0.000000e+00          NA
## contraceptive_typeIUCD                  1.480695e+00  1.769290e-01  11.5439977
## contraceptive_typenone                            NA            NA          NA
## contraceptive_typeOral Contraceptives   1.095885e+00  1.337278e-01   8.2056882
## contraceptive_typeTubal Litigation      3.559450e-08            NA         Inf
## first_sex_age20-25 years                6.684639e-01  2.281398e-01   1.8104253
## first_sex_ageAbove 25 years             6.516958e+00  2.365648e-01 103.7302085
## first_sex_ageBelow 15 years             1.528720e+00  4.497373e-01   5.1274001
## morethan_one_sexual_patnerYes           8.783741e-01  3.321797e-01   2.4316862
## had_stiYes                              1.530061e+00  5.891806e-01   3.8513646
## family_cervical_cancerYes               2.259418e+00  3.474946e-01  12.7118750
## smoke_cigarretesYes                     2.383935e+00  7.878702e-02  40.5628259
## alcohol_useOccasionally                 4.821148e-01  1.223119e-01   1.6734329
## alcohol_useUsually                      1.558789e+06 5.423684e-158         Inf
## hiv_serostatusReactive                  1.894476e+00  5.146945e-01   7.5052775
## bg_concentration                        1.010985e+00  1.004424e+00   1.0184661
## hpv_viral_load_copies_per_10_3_cells    1.035325e+00  1.018810e+00   1.0602343

ROC Curve

# Fit logistic regression
model <- glm(
  hpv_detected ~ ., 
  data = hpv_train, 
  family = "binomial"
)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predicted probabilities on test set
hpv_probs <- predict(model, newdata = hpv_test, type = "response")
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# ROC curve
library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'
The following objects are masked from 'package:stats':

    cov, smooth, var
roc_obj <- roc(
  hpv_test$hpv_detected,  # true outcome
  hpv_probs,              # predicted probabilities
  levels = c("No", "Yes"),# make sure first level is negative class
  direction = "<"
)

# Plot ROC
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve for HPV Detection")

auc(roc_obj)  # AUC value
Area under the curve: 0.7922