Load packages
library(pacman)
p_load(readxl, skimr, janitor, tidyverse, AMR, plotly, themis, tidymodels)
hpv <- read_xlsx("Modelling_HPV-1.xlsx", sheet = "Sheet2", col_names = TRUE, na = "NA")
hpv |> skim()
Data summary
| Name |
hpv |
| Number of rows |
372 |
| Number of columns |
24 |
| _______________________ |
|
| Column type frequency: |
|
| character |
17 |
| numeric |
7 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| UI |
0 |
1.00 |
4 |
4 |
0 |
372 |
0 |
| Visit_type |
0 |
1.00 |
17 |
17 |
0 |
2 |
0 |
| Marital_status |
5 |
0.99 |
6 |
18 |
0 |
4 |
0 |
| Employment_status |
2 |
0.99 |
8 |
22 |
0 |
3 |
0 |
| Education_level |
2 |
0.99 |
7 |
18 |
0 |
3 |
0 |
| Contraceptive_use |
2 |
0.99 |
2 |
3 |
0 |
2 |
0 |
| Contraceptive_type |
107 |
0.71 |
4 |
19 |
0 |
7 |
0 |
| First_sex_age |
5 |
0.99 |
11 |
14 |
0 |
4 |
0 |
| Morethan_one_sexual_patner |
4 |
0.99 |
2 |
3 |
0 |
2 |
0 |
| Had_STI |
11 |
0.97 |
2 |
3 |
0 |
2 |
0 |
| Family_cervical_cancer |
6 |
0.98 |
2 |
3 |
0 |
2 |
0 |
| Smoke_cigarretes |
3 |
0.99 |
2 |
3 |
0 |
2 |
0 |
| Alcohol_use |
0 |
1.00 |
5 |
12 |
0 |
3 |
0 |
| HIV_serostatus |
6 |
0.98 |
8 |
12 |
0 |
2 |
0 |
| Hpv_detected |
2 |
0.99 |
2 |
3 |
0 |
2 |
0 |
| HPV |
0 |
1.00 |
3 |
6 |
0 |
20 |
0 |
| Predisposition to Risk |
208 |
0.44 |
4 |
19 |
0 |
7 |
0 |
Variable type: numeric
| Age |
0 |
1.00 |
31.01 |
6.84 |
16.00 |
25.00 |
31.00 |
36.00 |
44.00 |
▂▇▇▇▅ |
| No_of_children |
0 |
1.00 |
2.35 |
1.58 |
0.00 |
1.00 |
2.00 |
3.00 |
8.00 |
▆▇▂▁▁ |
| BG concentration |
1 |
1.00 |
38.37 |
58.19 |
0.02 |
4.98 |
16.30 |
45.80 |
421.48 |
▇▁▁▁▁ |
| Cells |
1 |
1.00 |
2906.84 |
4408.58 |
1.80 |
377.18 |
1234.68 |
3469.94 |
31930.63 |
▇▁▁▁▁ |
| Cq-HPV |
207 |
0.44 |
23.72 |
5.70 |
10.65 |
19.77 |
23.61 |
27.80 |
38.52 |
▂▆▇▅▁ |
| HPV concentration |
207 |
0.44 |
102143.24 |
561427.32 |
0.01 |
12.65 |
260.67 |
4200.99 |
5624444.87 |
▇▁▁▁▁ |
| HPV viral load (copies per 10^3 cells) |
207 |
0.44 |
58109.56 |
300671.88 |
0.00 |
11.46 |
225.25 |
5685.23 |
3311231.27 |
▇▁▁▁▁ |
Missing values
You can also embed plots, for example:
# Check column names before cleaning
hpv |>
colnames()
[1] "UI"
[2] "Visit_type"
[3] "Age"
[4] "Marital_status"
[5] "No_of_children"
[6] "Employment_status"
[7] "Education_level"
[8] "Contraceptive_use"
[9] "Contraceptive_type"
[10] "First_sex_age"
[11] "Morethan_one_sexual_patner"
[12] "Had_STI"
[13] "Family_cervical_cancer"
[14] "Smoke_cigarretes"
[15] "Alcohol_use"
[16] "HIV_serostatus"
[17] "Hpv_detected"
[18] "BG concentration"
[19] "Cells"
[20] "HPV"
[21] "Cq-HPV"
[22] "HPV concentration"
[23] "HPV viral load (copies per 10^3 cells)"
[24] "Predisposition to Risk"
# Clean names
hpv <- hpv |>
clean_names()
# Check column names after
hpv |>
colnames()
[1] "ui"
[2] "visit_type"
[3] "age"
[4] "marital_status"
[5] "no_of_children"
[6] "employment_status"
[7] "education_level"
[8] "contraceptive_use"
[9] "contraceptive_type"
[10] "first_sex_age"
[11] "morethan_one_sexual_patner"
[12] "had_sti"
[13] "family_cervical_cancer"
[14] "smoke_cigarretes"
[15] "alcohol_use"
[16] "hiv_serostatus"
[17] "hpv_detected"
[18] "bg_concentration"
[19] "cells"
[20] "hpv"
[21] "cq_hpv"
[22] "hpv_concentration"
[23] "hpv_viral_load_copies_per_10_3_cells"
[24] "predisposition_to_risk"
# Check the class of each column
hpv |>
lapply(class)
$ui
[1] "character"
$visit_type
[1] "character"
$age
[1] "numeric"
$marital_status
[1] "character"
$no_of_children
[1] "numeric"
$employment_status
[1] "character"
$education_level
[1] "character"
$contraceptive_use
[1] "character"
$contraceptive_type
[1] "character"
$first_sex_age
[1] "character"
$morethan_one_sexual_patner
[1] "character"
$had_sti
[1] "character"
$family_cervical_cancer
[1] "character"
$smoke_cigarretes
[1] "character"
$alcohol_use
[1] "character"
$hiv_serostatus
[1] "character"
$hpv_detected
[1] "character"
$bg_concentration
[1] "numeric"
$cells
[1] "numeric"
$hpv
[1] "character"
$cq_hpv
[1] "numeric"
$hpv_concentration
[1] "numeric"
$hpv_viral_load_copies_per_10_3_cells
[1] "numeric"
$predisposition_to_risk
[1] "character"
#unlist(lapply(RheumArth, class))
# Turn blank to NAs in contraceptive_type
hpv <- hpv |> mutate(contraceptive_type = na_if(contraceptive_type, ""))
# Replace NAs
hpv <- hpv |> mutate(contraceptive_type = replace_na(contraceptive_type, "none"))
#For contraceptive_use
hpv <- hpv |> mutate(contraceptive_use = na_if(contraceptive_use, ""))
# Replace NAs
hpv <- hpv |> mutate(contraceptive_use = replace_na(contraceptive_use, "No"))
#For education_level
hpv <- hpv |> mutate(education_level = na_if(education_level, ""))
# Replace NAs
hpv <- hpv |> mutate(education_level = replace_na(education_level, "Unable to read/Write"))
# Replace NAs in viral load with zero
hpv <- hpv |>
mutate(hpv_viral_load_copies_per_10_3_cells = ifelse(is.na(hpv_viral_load_copies_per_10_3_cells), 0, hpv_viral_load_copies_per_10_3_cells))
head(hpv)
# A tibble: 6 × 24
ui visit_type age marital_status no_of_children employment_status
<chr> <chr> <dbl> <chr> <dbl> <chr>
1 K001 Routine screening 37 Married 5 Self-employed/Bus…
2 K002 Routine screening 41 Married 4 Self-employed/Bus…
3 K003 Routine screening 33 Married 2 Employed
4 K004 Routine screening 33 Married 4 Employed
5 K005 Routine screening 28 Single 2 Employed
6 K006 Routine screening 44 Widowed 1 Self-employed/Bus…
# ℹ 18 more variables: education_level <chr>, contraceptive_use <chr>,
# contraceptive_type <chr>, first_sex_age <chr>,
# morethan_one_sexual_patner <chr>, had_sti <chr>,
# family_cervical_cancer <chr>, smoke_cigarretes <chr>, alcohol_use <chr>,
# hiv_serostatus <chr>, hpv_detected <chr>, bg_concentration <dbl>,
# cells <dbl>, hpv <chr>, cq_hpv <dbl>, hpv_concentration <dbl>,
# hpv_viral_load_copies_per_10_3_cells <dbl>, predisposition_to_risk <chr>
dim(hpv)
[1] 372 24
# Remove unwanted columns
hpv <- hpv |> select(-c(ui, cells, hpv, cq_hpv, hpv_concentration, predisposition_to_risk))
hpv |> colnames()
[1] "visit_type"
[2] "age"
[3] "marital_status"
[4] "no_of_children"
[5] "employment_status"
[6] "education_level"
[7] "contraceptive_use"
[8] "contraceptive_type"
[9] "first_sex_age"
[10] "morethan_one_sexual_patner"
[11] "had_sti"
[12] "family_cervical_cancer"
[13] "smoke_cigarretes"
[14] "alcohol_use"
[15] "hiv_serostatus"
[16] "hpv_detected"
[17] "bg_concentration"
[18] "hpv_viral_load_copies_per_10_3_cells"
Cleansing
skim(hpv)
Data summary
| Name |
hpv |
| Number of rows |
372 |
| Number of columns |
18 |
| _______________________ |
|
| Column type frequency: |
|
| character |
14 |
| numeric |
4 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| visit_type |
0 |
1.00 |
17 |
17 |
0 |
2 |
0 |
| marital_status |
5 |
0.99 |
6 |
18 |
0 |
4 |
0 |
| employment_status |
2 |
0.99 |
8 |
22 |
0 |
3 |
0 |
| education_level |
0 |
1.00 |
7 |
20 |
0 |
4 |
0 |
| contraceptive_use |
0 |
1.00 |
2 |
3 |
0 |
2 |
0 |
| contraceptive_type |
0 |
1.00 |
4 |
19 |
0 |
8 |
0 |
| first_sex_age |
5 |
0.99 |
11 |
14 |
0 |
4 |
0 |
| morethan_one_sexual_patner |
4 |
0.99 |
2 |
3 |
0 |
2 |
0 |
| had_sti |
11 |
0.97 |
2 |
3 |
0 |
2 |
0 |
| family_cervical_cancer |
6 |
0.98 |
2 |
3 |
0 |
2 |
0 |
| smoke_cigarretes |
3 |
0.99 |
2 |
3 |
0 |
2 |
0 |
| alcohol_use |
0 |
1.00 |
5 |
12 |
0 |
3 |
0 |
| hiv_serostatus |
6 |
0.98 |
8 |
12 |
0 |
2 |
0 |
| hpv_detected |
2 |
0.99 |
2 |
3 |
0 |
2 |
0 |
Variable type: numeric
| age |
0 |
1 |
31.01 |
6.84 |
16.00 |
25.00 |
31.0 |
36.00 |
44.00 |
▂▇▇▇▅ |
| no_of_children |
0 |
1 |
2.35 |
1.58 |
0.00 |
1.00 |
2.0 |
3.00 |
8.00 |
▆▇▂▁▁ |
| bg_concentration |
1 |
1 |
38.37 |
58.19 |
0.02 |
4.98 |
16.3 |
45.80 |
421.48 |
▇▁▁▁▁ |
| hpv_viral_load_copies_per_10_3_cells |
0 |
1 |
25774.40 |
201986.22 |
0.00 |
0.00 |
0.0 |
132.82 |
3311231.27 |
▇▁▁▁▁ |
hpv |> select(marital_status) |> unique()
# A tibble: 5 × 1
marital_status
<chr>
1 Married
2 Single
3 Widowed
4 Divorced/Separated
5 <NA>
hpv |> select(employment_status) |> unique()
# A tibble: 4 × 1
employment_status
<chr>
1 Self-employed/Business
2 Employed
3 Unemployed
4 <NA>
hpv |> select(first_sex_age) |> unique()
# A tibble: 5 × 1
first_sex_age
<chr>
1 Below 15 years
2 15-19 years
3 20-25 years
4 Above 25 years
5 <NA>
hpv |> select(morethan_one_sexual_patner) |> unique()
# A tibble: 3 × 1
morethan_one_sexual_patner
<chr>
1 Yes
2 No
3 <NA>
hpv |> select(had_sti) |> unique()
# A tibble: 3 × 1
had_sti
<chr>
1 No
2 Yes
3 <NA>
hpv |> select(family_cervical_cancer) |> unique()
# A tibble: 3 × 1
family_cervical_cancer
<chr>
1 Yes
2 No
3 <NA>
hpv |> select(smoke_cigarretes) |> unique()
# A tibble: 3 × 1
smoke_cigarretes
<chr>
1 No
2 Yes
3 <NA>
hpv |> select(hiv_serostatus) |> unique()
# A tibble: 3 × 1
hiv_serostatus
<chr>
1 Reactive
2 Non-Reactive
3 <NA>
hpv |> select(hpv_detected) |> unique()
# A tibble: 3 × 1
hpv_detected
<chr>
1 Yes
2 No
3 <NA>
# Repalce NAs
hpv <- hpv |> mutate(marital_status = replace_na(marital_status, "Married"))
hpv <- hpv |> mutate(employment_status = replace_na(employment_status, "Unemployed"))
hpv <- hpv |> mutate(first_sex_age = replace_na(first_sex_age, "Below 15 years"))
hpv <- hpv |> mutate(morethan_one_sexual_patner = replace_na(morethan_one_sexual_patner, "Yes"))
hpv <- hpv |> mutate(had_sti = replace_na(had_sti, "No"))
hpv <- hpv |> mutate(family_cervical_cancer = replace_na(family_cervical_cancer, "No"))
hpv <- hpv |> mutate(smoke_cigarretes = replace_na(smoke_cigarretes, "No"))
hpv <- hpv |> mutate(hiv_serostatus = replace_na(hiv_serostatus, "Non-Reactive"))
hpv <- hpv |> mutate(hpv_detected = replace_na(hpv_detected, "No"))
skim(hpv)
Data summary
| Name |
hpv |
| Number of rows |
372 |
| Number of columns |
18 |
| _______________________ |
|
| Column type frequency: |
|
| character |
14 |
| numeric |
4 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| visit_type |
0 |
1 |
17 |
17 |
0 |
2 |
0 |
| marital_status |
0 |
1 |
6 |
18 |
0 |
4 |
0 |
| employment_status |
0 |
1 |
8 |
22 |
0 |
3 |
0 |
| education_level |
0 |
1 |
7 |
20 |
0 |
4 |
0 |
| contraceptive_use |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| contraceptive_type |
0 |
1 |
4 |
19 |
0 |
8 |
0 |
| first_sex_age |
0 |
1 |
11 |
14 |
0 |
4 |
0 |
| morethan_one_sexual_patner |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| had_sti |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| family_cervical_cancer |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| smoke_cigarretes |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
| alcohol_use |
0 |
1 |
5 |
12 |
0 |
3 |
0 |
| hiv_serostatus |
0 |
1 |
8 |
12 |
0 |
2 |
0 |
| hpv_detected |
0 |
1 |
2 |
3 |
0 |
2 |
0 |
Variable type: numeric
| age |
0 |
1 |
31.01 |
6.84 |
16.00 |
25.00 |
31.0 |
36.00 |
44.00 |
▂▇▇▇▅ |
| no_of_children |
0 |
1 |
2.35 |
1.58 |
0.00 |
1.00 |
2.0 |
3.00 |
8.00 |
▆▇▂▁▁ |
| bg_concentration |
1 |
1 |
38.37 |
58.19 |
0.02 |
4.98 |
16.3 |
45.80 |
421.48 |
▇▁▁▁▁ |
| hpv_viral_load_copies_per_10_3_cells |
0 |
1 |
25774.40 |
201986.22 |
0.00 |
0.00 |
0.0 |
132.82 |
3311231.27 |
▇▁▁▁▁ |
# To factors
hpv_names <- c("visit_type", "marital_status", "employment_status", "education_level", "contraceptive_use", "contraceptive_type", "first_sex_age", "morethan_one_sexual_patner", "had_sti", "family_cervical_cancer", "smoke_cigarretes", "alcohol_use", "hiv_serostatus", "hpv_detected")
hpv <- hpv |>
mutate(across(all_of(hpv_names), factor))
str(hpv)
tibble [372 × 18] (S3: tbl_df/tbl/data.frame)
$ visit_type : Factor w/ 2 levels "Initial screening",..: 2 2 2 2 2 2 2 2 2 1 ...
$ age : num [1:372] 37 41 33 33 28 44 25 43 36 43 ...
$ marital_status : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 2 3 4 2 4 2 4 ...
$ no_of_children : num [1:372] 5 4 2 4 2 1 2 3 1 2 ...
$ employment_status : Factor w/ 3 levels "Employed","Self-employed/Business",..: 2 2 1 1 1 2 3 2 2 2 ...
$ education_level : Factor w/ 4 levels "College/University",..: 2 3 2 1 3 2 1 1 2 2 ...
$ contraceptive_use : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 1 2 2 2 ...
$ contraceptive_type : Factor w/ 8 levels "Condom","Depo",..: 2 3 2 3 2 6 6 1 3 2 ...
$ first_sex_age : Factor w/ 4 levels "15-19 years",..: 4 1 1 2 4 1 4 2 3 1 ...
$ morethan_one_sexual_patner : Factor w/ 2 levels "No","Yes": 2 1 2 1 2 1 2 2 2 2 ...
$ had_sti : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 1 ...
$ family_cervical_cancer : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...
$ smoke_cigarretes : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ alcohol_use : Factor w/ 3 levels "Never","Occasionally",..: 2 1 1 1 1 1 1 1 1 2 ...
$ hiv_serostatus : Factor w/ 2 levels "Non-Reactive",..: 2 2 2 2 2 2 2 2 2 2 ...
$ hpv_detected : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
$ bg_concentration : num [1:372] 66.08 126.15 7.49 198.47 79.43 ...
$ hpv_viral_load_copies_per_10_3_cells: num [1:372] 1.9 65.2 0 0 3403.7 ...
Exploration
lapply(hpv, function(x) {
if (is.numeric(x)) return(summary(x))
if (is.factor(x)) return(table(x))
})
$visit_type
x
Initial screening Routine screening
195 177
$age
Min. 1st Qu. Median Mean 3rd Qu. Max.
16.00 25.00 31.00 31.01 36.00 44.00
$marital_status
x
Divorced/Separated Married Single Widowed
23 259 72 18
$no_of_children
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 1.000 2.000 2.352 3.000 8.000
$employment_status
x
Employed Self-employed/Business Unemployed
70 198 104
$education_level
x
College/University Highschool Primary
99 136 135
Unable to read/Write
2
$contraceptive_use
x
No Yes
108 264
$contraceptive_type
x
Condom Depo Implant Implant Condom
32 101 85 1
IUCD none Oral Contraceptives Tubal Litigation
21 107 23 2
$first_sex_age
x
15-19 years 20-25 years Above 25 years Below 15 years
241 71 5 55
$morethan_one_sexual_patner
x
No Yes
85 287
$had_sti
x
No Yes
294 78
$family_cervical_cancer
x
No Yes
354 18
$smoke_cigarretes
x
No Yes
365 7
$alcohol_use
x
Never Occasionally Usually
325 45 2
$hiv_serostatus
x
Non-Reactive Reactive
186 186
$hpv_detected
x
No Yes
210 162
$bg_concentration
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.02378 4.97883 16.29782 38.37031 45.80327 421.48427 1
$hpv_viral_load_copies_per_10_3_cells
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 0.0 0.0 25774.4 132.8 3311231.3
Graphics
# First sex
hpv |>
select(first_sex_age) |>
count(first_sex_age) |>
ggplot(aes(x = reorder(first_sex_age, n), y = n, fill = first_sex_age)) + geom_col(show.legend = FALSE) +
geom_text(aes(label = n, vjust = -0.5)) +
theme_minimal() +
theme(axis.title.y = element_text(angle = 0)) +
labs(title = "Age at first intercourse")

# Contrceptives
hpv |>
select(contraceptive_type) |>
count(contraceptive_type) |>
ggplot(aes(x = reorder(contraceptive_type, n), y = n, fill = contraceptive_type)) + geom_col(show.legend = FALSE) +
geom_text(aes(label = n, vjust = -0.5)) +
theme_minimal() +
theme(axis.title.y = element_text(angle = 0)) +
labs(title = "Contraceptive types")

# Education
hpv |>
select(education_level) |>
count(education_level) |>
ggplot(aes(x = reorder(education_level, n), y = n, fill = education_level)) + geom_col(show.legend = FALSE) +
geom_text(aes(label = n, vjust = -0.5)) +
theme_minimal() +
theme(axis.title.y = element_text(angle = 0)) +
labs(title = "Levels of education")

# Marital status
hpv |>
select(marital_status) |>
count(marital_status) |>
ggplot(aes(x = reorder(marital_status, n), y = n, fill = marital_status)) + geom_col(show.legend = FALSE) +
geom_text(aes(label = n, vjust = -0.5)) +
theme_minimal() +
theme(axis.title.y = element_text(angle = 0)) +
labs(title = "Marital status")

Modeling with Logistic Regression
# Preprocess
set.seed(123)
hpv_split <- initial_split(hpv, prop = .8, strata = hpv_detected)
hpv_split
## <Training/Testing/Total>
## <297/75/372>
Training and Test Splits
# Training and test data
hpv_train <- hpv_split |> training()
hpv_test <- hpv_split |> testing()
# Recipe
hpv_m <- recipe(hpv_detected ~ ., data = hpv_train) |>
step_rose(hpv_detected)
# Prep and juice for test
hpv_prep <- hpv_m |> prep()
hpv_juice <- hpv_prep |> juice()
Logistic Regression
# Set engine
lr <- logistic_reg() |>
set_engine(engine = "glm") |>
set_mode("classification")
# Workflow
lr_wf <- workflow() |>
add_recipe(hpv_m) |>
add_model(lr)
# Cross validation
set.seed(456)
m_folds <- vfold_cv(hpv_train)
# Set metrics
ev_metrics <- metric_set(accuracy, sensitivity, specificity, precision, roc_auc, npv)
# Evaluate model
doParallel::registerDoParallel()
set.seed(789)
lr_rs <- fit_resamples(
lr_wf,
resamples = m_folds,
metrics = ev_metrics
)
## → A | error: factor contraceptive_type has new levels Tubal Litigation
## There were issues with some computations A: x1 → B | error: contrasts can be applied only to factors with 2 or more levels
## There were issues with some computations A: x1There were issues with some computations A: x1 B: x1 → C | error: factor education_level has new levels Unable to read/Write
## There were issues with some computations A: x1 B: x1There were issues with some computations A: x1 B: x1 C: x1There were issues with some computations A: x1 B: x1 C: x1
Metrics
# Get metrics
collect_metrics(lr_rs)
## # A tibble: 6 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.486 7 0.0220 pre0_mod0_post0
## 2 npv binary 0.420 7 0.0424 pre0_mod0_post0
## 3 precision binary 0.554 7 0.0369 pre0_mod0_post0
## 4 roc_auc binary 0.506 7 0.0307 pre0_mod0_post0
## 5 sensitivity binary 0.452 7 0.0546 pre0_mod0_post0
## 6 specificity binary 0.522 7 0.0707 pre0_mod0_post0
# Get summary
summary(lr_rs)
## splits.Length splits.Class splits.Mode id
## 4 vfold_split list Length:10
## 4 vfold_split list Class :character
## 4 vfold_split list Mode :character
## 4 vfold_split list
## 4 vfold_split list
## 4 vfold_split list
## 4 vfold_split list
## 4 vfold_split list
## 4 vfold_split list
## 4 vfold_split list
## .metrics.Length .metrics.Class .metrics.Mode
## 4 tbl_df list
## 4 tbl_df list
## 0 -none- NULL
## 4 tbl_df list
## 4 tbl_df list
## 0 -none- NULL
## 4 tbl_df list
## 4 tbl_df list
## 4 tbl_df list
## 0 -none- NULL
## .notes.Length .notes.Class .notes.Mode
## 4 tbl_df list
## 4 tbl_df list
## 4 tbl_df list
## 4 tbl_df list
## 4 tbl_df list
## 4 tbl_df list
## 4 tbl_df list
## 4 tbl_df list
## 4 tbl_df list
## 4 tbl_df list
Other Metrics
Logistic_m <- glm(
hpv_detected ~ .,
data = hpv_train,
family = "binomial"
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(Logistic_m)
##
## Call:
## glm(formula = hpv_detected ~ ., family = "binomial", data = hpv_train)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.119e+00 1.543e+00 0.726 0.46811
## visit_typeRoutine screening -3.796e-01 6.190e-01 -0.613 0.53975
## age -7.053e-02 4.046e-02 -1.743 0.08126
## marital_statusMarried -1.718e+00 7.945e-01 -2.162 0.03061
## marital_statusSingle -8.326e-01 8.316e-01 -1.001 0.31676
## marital_statusWidowed -6.275e-01 1.082e+00 -0.580 0.56179
## no_of_children -2.802e-02 1.845e-01 -0.152 0.87925
## employment_statusSelf-employed/Business 2.427e-01 6.094e-01 0.398 0.69045
## employment_statusUnemployed 1.238e-01 6.623e-01 0.187 0.85173
## education_levelHighschool 1.298e-02 5.718e-01 0.023 0.98190
## education_levelPrimary -4.822e-01 6.428e-01 -0.750 0.45314
## education_levelUnable to read/Write -1.722e+01 1.773e+04 -0.001 0.99922
## contraceptive_useYes 7.316e-01 7.842e-01 0.933 0.35086
## contraceptive_typeDepo 4.873e-01 6.978e-01 0.698 0.48495
## contraceptive_typeImplant -1.035e+00 8.047e-01 -1.286 0.19838
## contraceptive_typeImplant Condom 1.733e+01 1.773e+04 0.001 0.99922
## contraceptive_typeIUCD 3.925e-01 1.050e+00 0.374 0.70843
## contraceptive_typenone NA NA NA NA
## contraceptive_typeOral Contraceptives 9.156e-02 1.035e+00 0.088 0.92952
## contraceptive_typeTubal Litigation -1.715e+01 1.773e+04 -0.001 0.99923
## first_sex_age20-25 years -4.028e-01 5.235e-01 -0.769 0.44163
## first_sex_ageAbove 25 years 1.874e+00 1.433e+00 1.308 0.19081
## first_sex_ageBelow 15 years 4.244e-01 6.156e-01 0.689 0.49053
## morethan_one_sexual_patnerYes -1.297e-01 5.033e-01 -0.258 0.79668
## had_stiYes 4.253e-01 4.751e-01 0.895 0.37066
## family_cervical_cancerYes 8.151e-01 9.001e-01 0.906 0.36515
## smoke_cigarretesYes 8.688e-01 1.484e+00 0.585 0.55828
## alcohol_useOccasionally -7.296e-01 6.598e-01 -1.106 0.26881
## alcohol_useUsually 1.426e+01 1.155e+04 0.001 0.99902
## hiv_serostatusReactive 6.389e-01 6.785e-01 0.942 0.34632
## bg_concentration 1.093e-02 3.532e-03 3.093 0.00198
## hpv_viral_load_copies_per_10_3_cells 3.472e-02 1.030e-02 3.371 0.00075
##
## (Intercept)
## visit_typeRoutine screening
## age .
## marital_statusMarried *
## marital_statusSingle
## marital_statusWidowed
## no_of_children
## employment_statusSelf-employed/Business
## employment_statusUnemployed
## education_levelHighschool
## education_levelPrimary
## education_levelUnable to read/Write
## contraceptive_useYes
## contraceptive_typeDepo
## contraceptive_typeImplant
## contraceptive_typeImplant Condom
## contraceptive_typeIUCD
## contraceptive_typenone
## contraceptive_typeOral Contraceptives
## contraceptive_typeTubal Litigation
## first_sex_age20-25 years
## first_sex_ageAbove 25 years
## first_sex_ageBelow 15 years
## morethan_one_sexual_patnerYes
## had_stiYes
## family_cervical_cancerYes
## smoke_cigarretesYes
## alcohol_useOccasionally
## alcohol_useUsually
## hiv_serostatusReactive
## bg_concentration **
## hpv_viral_load_copies_per_10_3_cells ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 405.45 on 295 degrees of freedom
## Residual deviance: 190.83 on 265 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 252.83
##
## Number of Fisher Scoring iterations: 19
suppressWarnings(exp(cbind(Odds_ratio = coef(Logistic_m), confint(Logistic_m))))
## Waiting for profiling to be done...
## Odds_ratio 2.5 % 97.5 %
## (Intercept) 3.062983e+00 1.487053e-01 65.6276330
## visit_typeRoutine screening 6.841391e-01 2.001539e-01 2.3103482
## age 9.318973e-01 8.586981e-01 1.0072338
## marital_statusMarried 1.794440e-01 3.711114e-02 0.8723728
## marital_statusSingle 4.349323e-01 8.440324e-02 2.2809956
## marital_statusWidowed 5.339104e-01 5.324712e-02 4.1612406
## no_of_children 9.723655e-01 6.756950e-01 1.3988944
## employment_statusSelf-employed/Business 1.274691e+00 3.947499e-01 4.4048907
## employment_statusUnemployed 1.131786e+00 3.149256e-01 4.3160555
## education_levelHighschool 1.013059e+00 3.329960e-01 3.1899092
## education_levelPrimary 6.174085e-01 1.758937e-01 2.2305915
## education_levelUnable to read/Write 3.307008e-08 NA Inf
## contraceptive_useYes 2.078383e+00 4.371859e-01 9.8169572
## contraceptive_typeDepo 1.627931e+00 4.317220e-01 6.8590228
## contraceptive_typeImplant 3.552272e-01 7.080944e-02 1.7359335
## contraceptive_typeImplant Condom 3.355238e+07 0.000000e+00 NA
## contraceptive_typeIUCD 1.480695e+00 1.769290e-01 11.5439977
## contraceptive_typenone NA NA NA
## contraceptive_typeOral Contraceptives 1.095885e+00 1.337278e-01 8.2056882
## contraceptive_typeTubal Litigation 3.559450e-08 NA Inf
## first_sex_age20-25 years 6.684639e-01 2.281398e-01 1.8104253
## first_sex_ageAbove 25 years 6.516958e+00 2.365648e-01 103.7302085
## first_sex_ageBelow 15 years 1.528720e+00 4.497373e-01 5.1274001
## morethan_one_sexual_patnerYes 8.783741e-01 3.321797e-01 2.4316862
## had_stiYes 1.530061e+00 5.891806e-01 3.8513646
## family_cervical_cancerYes 2.259418e+00 3.474946e-01 12.7118750
## smoke_cigarretesYes 2.383935e+00 7.878702e-02 40.5628259
## alcohol_useOccasionally 4.821148e-01 1.223119e-01 1.6734329
## alcohol_useUsually 1.558789e+06 5.423684e-158 Inf
## hiv_serostatusReactive 1.894476e+00 5.146945e-01 7.5052775
## bg_concentration 1.010985e+00 1.004424e+00 1.0184661
## hpv_viral_load_copies_per_10_3_cells 1.035325e+00 1.018810e+00 1.0602343
ROC Curve
# Fit logistic regression
model <- glm(
hpv_detected ~ .,
data = hpv_train,
family = "binomial"
)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predicted probabilities on test set
hpv_probs <- predict(model, newdata = hpv_test, type = "response")
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# ROC curve
library(pROC)
Type 'citation("pROC")' for a citation.
Attaching package: 'pROC'
The following objects are masked from 'package:stats':
cov, smooth, var
roc_obj <- roc(
hpv_test$hpv_detected, # true outcome
hpv_probs, # predicted probabilities
levels = c("No", "Yes"),# make sure first level is negative class
direction = "<"
)
# Plot ROC
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve for HPV Detection")

auc(roc_obj) # AUC value
Area under the curve: 0.7922