Description

Objective

Introduction

Fields Used

library(ggplot2)
library(cowplot)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ lubridate::stamp() masks cowplot::stamp()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
setwd("~/Downloads/25_Semesters/Fall/DATA101")

# Load CDC COVID-19 data (example structure)

covid_df <- read_csv("COVID-19_Case_Surveillance.csv")
## Rows: 4028318 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): current_status, sex, age_group, race_ethnicity_combined, hosp_yn, ...
## date (4): cdc_case_earliest_dt, cdc_report_dt, pos_spec_dt, onset_dt
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview dataset
head(covid_df )
## # A tibble: 6 × 12
##   cdc_case_earliest_dt cdc_report_dt pos_spec_dt onset_dt   current_status sex  
##   <date>               <date>        <date>      <date>     <chr>          <chr>
## 1 2020-01-02           2020-01-02    2021-01-03  2020-01-02 Laboratory-co… Fema…
## 2 2020-12-20           2020-01-02    2021-01-12  2020-12-20 Laboratory-co… Fema…
## 3 2020-12-21           2020-01-02    2020-12-21  2020-12-21 Laboratory-co… Fema…
## 4 2020-01-02           2020-01-02    2021-01-04  2020-01-02 Laboratory-co… Fema…
## 5 2020-12-24           2020-01-02    2020-12-27  2020-12-24 Laboratory-co… Fema…
## 6 2020-11-02           2020-01-02    2020-01-02  2020-11-02 Laboratory-co… Fema…
## # ℹ 6 more variables: age_group <chr>, race_ethnicity_combined <chr>,
## #   hosp_yn <chr>, icu_yn <chr>, death_yn <chr>, medcond_yn <chr>
str(covid_df )
## spc_tbl_ [4,028,318 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ cdc_case_earliest_dt   : Date[1:4028318], format: "2020-01-02" "2020-12-20" ...
##  $ cdc_report_dt          : Date[1:4028318], format: "2020-01-02" "2020-01-02" ...
##  $ pos_spec_dt            : Date[1:4028318], format: "2021-01-03" "2021-01-12" ...
##  $ onset_dt               : Date[1:4028318], format: "2020-01-02" "2020-12-20" ...
##  $ current_status         : chr [1:4028318] "Laboratory-confirmed case" "Laboratory-confirmed case" "Laboratory-confirmed case" "Laboratory-confirmed case" ...
##  $ sex                    : chr [1:4028318] "Female" "Female" "Female" "Female" ...
##  $ age_group              : chr [1:4028318] "10 - 19 Years" "60 - 69 Years" "60 - 69 Years" "60 - 69 Years" ...
##  $ race_ethnicity_combined: chr [1:4028318] "White, Non-Hispanic" "White, Non-Hispanic" "Hispanic/Latino" "Multiple/Other, Non-Hispanic" ...
##  $ hosp_yn                : chr [1:4028318] "No" "No" "No" "No" ...
##  $ icu_yn                 : chr [1:4028318] "Missing" "Missing" "Missing" "Missing" ...
##  $ death_yn               : chr [1:4028318] "No" "Missing" "Missing" "No" ...
##  $ medcond_yn             : chr [1:4028318] "No" "Missing" "Missing" "No" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   cdc_case_earliest_dt = col_date(format = ""),
##   ..   cdc_report_dt = col_date(format = ""),
##   ..   pos_spec_dt = col_date(format = ""),
##   ..   onset_dt = col_date(format = ""),
##   ..   current_status = col_character(),
##   ..   sex = col_character(),
##   ..   age_group = col_character(),
##   ..   race_ethnicity_combined = col_character(),
##   ..   hosp_yn = col_character(),
##   ..   icu_yn = col_character(),
##   ..   death_yn = col_character(),
##   ..   medcond_yn = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
df_clean <- covid_df |>
  select(age_group, sex, medcond_yn, hosp_yn) |>
  mutate(across(
    c(age_group, sex, medcond_yn, hosp_yn),
    ~ na_if(na_if(as.character(.x), "Missing"), "Unknown")
  )) |>
  drop_na() |>
  mutate(across(
    c(age_group, sex, medcond_yn, hosp_yn),
    as.factor
  ))


str(df_clean)
## tibble [1,030,327 × 4] (S3: tbl_df/tbl/data.frame)
##  $ age_group : Factor w/ 9 levels "0 - 9 Years",..: 2 7 6 8 5 5 6 4 7 8 ...
##  $ sex       : Factor w/ 3 levels "Female","Male",..: 1 1 2 1 1 1 1 2 2 2 ...
##  $ medcond_yn: Factor w/ 2 levels "No","Yes": 1 1 2 2 1 2 2 1 1 2 ...
##  $ hosp_yn   : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 2 ...

Levels of each factor

levels(df_clean$age_group)
## [1] "0 - 9 Years"   "10 - 19 Years" "20 - 29 Years" "30 - 39 Years"
## [5] "40 - 49 Years" "50 - 59 Years" "60 - 69 Years" "70 - 79 Years"
## [9] "80+ Years"
levels(df_clean$sex)
## [1] "Female" "Male"   "Other"
levels(df_clean$medcond_yn)
## [1] "No"  "Yes"
levels(df_clean$hosp_yn)
## [1] "No"  "Yes"
xtabs(~ hosp_yn + sex, data = df_clean)
##        sex
## hosp_yn Female   Male  Other
##     No  484909 428068     27
##     Yes  56469  60854      0
xtabs(~ hosp_yn + age_group, data = df_clean)
##        age_group
## hosp_yn 0 - 9 Years 10 - 19 Years 20 - 29 Years 30 - 39 Years 40 - 49 Years
##     No        38674        113012        178239        150048        145306
##     Yes         881          1448          4893          7939         11818
##        age_group
## hosp_yn 50 - 59 Years 60 - 69 Years 70 - 79 Years 80+ Years
##     No         133534         89962         41953     22276
##     Yes         19881         25202         23307     21954
xtabs(~ hosp_yn + medcond_yn, data = df_clean)
##        medcond_yn
## hosp_yn     No    Yes
##     No  548280 364724
##     Yes  19566  97757
logistic <- glm(
  hosp_yn ~ age_group + sex + medcond_yn,data = df_clean, family = "binomial")

summary(logistic)
## 
## Call:
## glm(formula = hosp_yn ~ age_group + sex + medcond_yn, family = "binomial", 
##     data = df_clean)
## 
## Coefficients:
##                         Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)            -4.234062   0.034629 -122.271   <2e-16 ***
## age_group10 - 19 Years -0.672428   0.043334  -15.517   <2e-16 ***
## age_group20 - 29 Years -0.004936   0.037263   -0.132    0.895    
## age_group30 - 39 Years  0.527492   0.036257   14.549   <2e-16 ***
## age_group40 - 49 Years  0.816849   0.035747   22.851   <2e-16 ***
## age_group50 - 59 Years  1.279404   0.035335   36.208   <2e-16 ***
## age_group60 - 69 Years  1.803887   0.035292   51.113   <2e-16 ***
## age_group70 - 79 Years  2.409035   0.035571   67.724   <2e-16 ***
## age_group80+ Years      2.939019   0.035959   81.732   <2e-16 ***
## sexMale                 0.282340   0.006867   41.116   <2e-16 ***
## sexOther               -7.406809  21.666266   -0.342    0.732    
## medcond_ynYes           1.302220   0.008752  148.794   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 730564  on 1030326  degrees of freedom
## Residual deviance: 574727  on 1030315  degrees of freedom
## AIC: 574751
## 
## Number of Fisher Scoring iterations: 9

Intercept

Age Group Effects

Hospitalization risk is shown to increase with age. Compared to children:

This confirms that age is a strong risk factor for hospitalization.

Sex Effect

Medical Condition Effect

The chance of those with previous medical condition begin hospitalization for every person without increases by 130%

This is the strongest predictor in the model and is highly significant with a p values less than 0.005.

Model Fit and Deviance

exp(coef(logistic))
##            (Intercept) age_group10 - 19 Years age_group20 - 29 Years 
##           1.449340e-02           5.104676e-01           9.950763e-01 
## age_group30 - 39 Years age_group40 - 49 Years age_group50 - 59 Years 
##           1.694677e+00           2.263356e+00           3.594496e+00 
## age_group60 - 69 Years age_group70 - 79 Years     age_group80+ Years 
##           6.073205e+00           1.112323e+01           1.889730e+01 
##                sexMale               sexOther          medcond_ynYes 
##           1.326229e+00           6.071048e-04           3.677450e+00

Interpretation

Age Group Effects

Sex Effects

Medical Condition Effect

r_square <- 1 - (logistic$deviance / logistic$null.deviance)
r_square
## [1] 0.2133098

Interpretation

1 - pchisq(
  logistic$null.deviance - logistic$deviance,
  df = length(logistic$coefficients) - 1
)
## [1] 0
predicted_data <- data.frame(
  probability_hosp = logistic$fitted.values,
  hosp_yn = df_clean$hosp_yn
)

head(predicted_data)
##   probability_hosp hosp_yn
## 1      0.007344077      No
## 2      0.080900435      No
## 3      0.202603940      No
## 4      0.372196166      No
## 5      0.031761822      No
## 6      0.107648058      No
# Convert outcome to numeric
df_clean$hosp_num <- ifelse(df_clean$hosp_yn == "Yes", 1, 0)

predicted_class <- ifelse(logistic$fitted.values > 0.5, 1, 0)

confusion <- table(
  Predicted = predicted_class,
  Actual = df_clean$hosp_num
)

confusion
##          Actual
## Predicted      0      1
##         0 894200  97058
##         1  18804  20265
TN <- confusion[1,1]
FP <- confusion[2,1]
FN <- confusion[1,2]
TP <- confusion[2,2]

accuracy <- (TP + TN) / (TP + TN + FP + FN)
sensitivity <- TP / (TP + FN)
specificity <- TN / (TN + FP)
precision <- TP / (TP + FP)     # positive predictive value
f1_score <- 2 * (precision * sensitivity) / (precision + sensitivity)

# Print results
cat("Accuracy:    ", round(accuracy, 4), "\n")
## Accuracy:     0.8875
cat("Sensitivity: ", round(sensitivity, 4), "\n")
## Sensitivity:  0.1727
cat("Specificity: ", round(specificity, 4), "\n")
## Specificity:  0.9794
cat("Precision:   ", round(precision, 4), "\n")
## Precision:    0.5187
cat("F1 Score:    ", round(f1_score, 4), "\n")
## F1 Score:     0.2592

ROC and AUC

roc_obj <- roc(
  response = df_clean$hosp_yn,
  predictor = logistic$fitted.values,
  levels = c("No", "Yes")
)
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.8254
plot.roc(roc_obj, print.auc = TRUE)

Summary