Setup and Data

# Load packages
library(dplyr)
library(tidyverse)
library(haven)
library(here)
library(knitr)
library(kableExtra)
library(broom)
library(plotly)
library (gtsummary)
library (plotly)
library(ggplot2)
library(stringr)
library(gt)

#this data was taken from the cdcwonder surveillance report for 1981-2002
# Load CDC Wonder 1981 - 2002 datasets
cdc03 <- read.csv("/Users/nataliasmall/Downloads/EPI 553/third.csv") %>%
  janitor::clean_names()

# Display dataset dimensions
names(cdc03)

##  [1] "notes"                           "year_diagnosed"                 
##  [3] "year_diagnosed_code"             "sex_and_sexual_orientation"     
##  [5] "sex_and_sexual_orientation_code" "hiv_exposure_category"          
##  [7] "hiv_exposure_category_code"      "case_definition"                
##  [9] "case_definition_code"            "vital_status"                   
## [11] "vital_status_code"               "cases"

#Recoding and cleaning
cdc3_clean <- cdc03 %>%
  filter(notes != "Total") %>%
  drop_na(`vital_status_code`)

# Select variables of interest
cdc3_full <- cdc3_clean %>%
  select(year_diagnosed_code, sex_and_sexual_orientation, hiv_exposure_category, case_definition, vital_status_code, cases)

# Recode variables
cdc3_full <- cdc3_full %>%
  mutate(
    yeardiagnosed     = factor(case_when(
      year_diagnosed_code == 1981 ~ "Before 1982",
      year_diagnosed_code == 1982 ~ "1982",
      year_diagnosed_code == 1983 ~ "1983",
      year_diagnosed_code == 1984 ~ "1984",
      year_diagnosed_code == 1985 ~ "1985",
      year_diagnosed_code == 1986 ~ "1986",
      year_diagnosed_code == 1987 ~ "1987",
      year_diagnosed_code == 1988 ~ "1988",
      year_diagnosed_code == 1995 ~ "1995",
      year_diagnosed_code == 1996 ~ "1996",
      year_diagnosed_code == 1997 ~ "1997",
      year_diagnosed_code == 1998 ~ "1998",
      year_diagnosed_code == 1999 ~ "1999",
      year_diagnosed_code == 2000 ~ "2000",
      year_diagnosed_code == 2001 ~ "2001",
      year_diagnosed_code == 2002 ~ "2002"), levels = c("Before 1982", "1982", "1983", "1984", "1985", 
                                                        "1986", "1987", "1988", "1995", "1996", "1997", 
                                                        "1998", "1999", "2000", "2001", "2002")),
    sexandorientation = sex_and_sexual_orientation,
    hivexposure       = hiv_exposure_category,
    CaseCriteria      = case_definition,
    vitalstatus       = factor(ifelse(vital_status_code == 0, "Alive", "Dead")),
    cases             = cases
  )

# Select recoded variables, apply filters, drop missing
set.seed(553)
cdc3_full <- cdc3_full %>%
  select(yeardiagnosed, sexandorientation, hivexposure, CaseCriteria, vitalstatus, cases) %>%
  filter(cases > 0) %>%
  drop_na()

#cdcwonder data is pre-aggregated, so data points are not representative of only 1 person, could be 60, 40, 200, etc.
#total obs. in dataset is 685, but there are more individuals accounted for than that based on identified cases. weight is very important here. 
#total number of counts of cases: 11022
sum(cdc3_full$cases)

## [1] 11022

#total number of counts of cases reported as alive: 5306
sum(cdc3_full$cases[cdc3_full$vitalstatus == "Alive"], na.rm = TRUE)

## [1] 5306

#total number of counts of cases reported as dead: 5716
sum(cdc3_full$cases[cdc3_full$vitalstatus == "Dead"], na.rm = TRUE)

## [1] 5716

# Save analytic datasets
saveRDS(cdc3_full, here::here(
  "/Users/nataliasmall/Downloads/EPI 553/cdcwonderdata3.rds"
))

Descriptive Statistics

cdc3_full %>%
  select(yeardiagnosed, vitalstatus, cases) %>%
  summary() %>%
  kable(caption = "Descriptive Statistics: Key Continuous Variables") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Descriptive Statistics: Key Continuous Variables
yeardiagnosed	vitalstatus	cases
1995 :105	Alive:388	Min. : 1.00
1996 : 77	Dead :297	1st Qu.: 1.00
1997 : 76	NA	Median : 3.00
1998 : 74	NA	Mean : 16.09
1999 : 59	NA	3rd Qu.: 10.00
1988 : 53	NA	Max. :976.00
(Other):241	NA	NA

library(gtsummary)
# library that handles math for weighting
library(survey)

# Create the weighted design object
# ids = ~1: independent sampling
# weights = ~cases:use the cases variable to "inflate" each row to its representative value in the population
cdc_weighted <- svydesign(ids = ~1, weights = ~cases, data = cdc3_full)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

#tbl_svysummary: The "weighted version" of tbl_summary
ft0 <- cdc_weighted %>%
  tbl_svysummary(
    by = hivexposure,
    include = c(yeardiagnosed, sexandorientation, CaseCriteria, vitalstatus),
    label = list(
      yeardiagnosed ~ "Year Diagnosed",
      sexandorientation ~ "Sex and Sexual Orientation",
      CaseCriteria ~ "Case Criteria",
      vitalstatus ~ "Vital Status"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  bold_labels() %>%
  modify_caption("**Table 1: Survey-Weighted Descriptives Stratified by HIV Exposure (N = 11,022)**") %>%
  modify_footnote(everything() ~ "Total analytic sample size N=11,022. Percentages reflect survey-weighted distributions. Missing values handled via complete case analysis.")%>%
  as_gt() %>% 
  gtsave("/Users/nataliasmall/Downloads/EPI 553/outputs/Table1_WeightDesc.png")

print (ft0)

## [1] "/Users/nataliasmall/Downloads/EPI 553/outputs/Table1_WeightDesc.png"

p_bar <- ggplot(cdc3_full, aes(x = vitalstatus, weight = cases)) +
  geom_bar(fill = "darkblue", color = "white") +
  scale_y_continuous(labels = scales::comma) + #Note: this makes large numbers readable
  labs(
    title = "Figure 1: Actual Distribution of Mortality (Vital Status)",
    subtitle = "Weighted by case counts (Total N = 11,022)",
    x = "Vital Status",
    y = "Total Number of People"
  ) +
  theme_minimal(base_size = 13)
  
ggplotly(p_bar)

ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure1_barchart.png", 
       plot = p_bar, width = 8, height = 6, dpi = 300)

# Proportional Stacked Bar Chart: percentage of "Alive" vs "Dead" within each exposure group, adjusted by cases weight

# x = reorder(hivexposure, cases): reorder categories by the size of the case counts
# fill = vitalstatus: color the bars by "Alive" vs "Dead"
# weight = cases: count 11,022 people, not 685 rows
p_bivariate <- ggplot(cdc3_full, 
                      aes(x = reorder(hivexposure, cases), 
                          fill = vitalstatus,              
                          weight = cases)) + 
                          geom_bar(position = "fill") + 

# Flip the coordinates for readability
# Rotates the plot so long category names are easy to read
                          coord_flip() +                                            

# Format the y-axis (now the horizontal axis)
# Convert 0.25, 0.50, etc. into 25%, 50%
                          scale_y_continuous(labels = scales::percent_format()) +   

# Colors for the binary outcome
                          scale_fill_manual(values = c("Alive" = "darkblue",       
                                                       "Dead" = "grey")) +  
  labs(
    title = "Figure 2: Bivariate Relationship - Vital Status and HIV Exposure Group", 
    x = "HIV Exposure Category",                            
    y = "Percentage",                                       
    fill = "Status"                                         
  ) +
  theme_minimal()

ggplotly(p_bivariate)

ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure2_bivariate.png", 
       plot = p_bivariate, width = 8, height = 6, dpi = 300)

# Percentage of 'Dead' for each combination
heatmap_data <- cdc3_full %>%
  group_by(hivexposure, yeardiagnosed) %>%
  summarise(
    total_cases = sum(cases),
    percent_dead = sum(cases[vitalstatus == "Dead"]) / sum(cases) * 100,
    .groups = 'drop'
  )

# Create the Heatmap
p_heatmap <- ggplot(heatmap_data, aes(x = yeardiagnosed, y = hivexposure, fill = percent_dead)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "lightblue", high = "darkblue", mid = "blue", 
                       midpoint = 50, limit = c(0,100), name = "% Dead") +
  labs(
    title = "Figure 3.Exploratory Analysis: Mortality Heatmap by Exposure and Year",
    subtitle = "dark blue indicates higher mortality (N = 11,022)",
    x = "Year of Diagnosis",
    y = "HIV Exposure Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 3. Interactive Plot
ggplotly(p_heatmap)

ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure3_heatmap.png", 
       plot = p_heatmap, width = 8, height = 6, dpi = 300)

Future Logistic Regression and Modeling

#Future regression 
#simple logistic regression
model_logistic_simple <- glm(vitalstatus ~ hivexposure, 
             data = cdc3_full, 
             family = binomial(link = "logit"), 
             weights = cases) # This row is actually the amount of people, since data is agreggated 

summary(model_logistic_simple)

## 
## Call:
## glm(formula = vitalstatus ~ hivexposure, family = binomial(link = "logit"), 
##     data = cdc3_full, weights = cases)
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                             -0.86164    0.07299
## hivexposureIV drug use (female and hetero male)          1.63390    0.08544
## hivexposureMale homo/bisexual and IV drug use            0.93220    0.10184
## hivexposureMale homosexual/bisexual contact              0.82841    0.07685
## hivexposureReceipt of blood, blood components or tissue  0.82989    0.26236
##                                                         z value Pr(>|z|)    
## (Intercept)                                             -11.805  < 2e-16 ***
## hivexposureIV drug use (female and hetero male)          19.123  < 2e-16 ***
## hivexposureMale homo/bisexual and IV drug use             9.154  < 2e-16 ***
## hivexposureMale homosexual/bisexual contact              10.780  < 2e-16 ***
## hivexposureReceipt of blood, blood components or tissue   3.163  0.00156 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15264  on 684  degrees of freedom
## Residual deviance: 14799  on 680  degrees of freedom
## AIC: 14809
## 
## Number of Fisher Scoring iterations: 4

# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Simple Logistic Regression: vitalstatus ~ hivexposure (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Simple Logistic Regression: vitalstatus ~ hivexposure (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.422	0.073	-11.805	0.000	0.366	0.487
hivexposureIV drug use (female and hetero male)	5.124	0.085	19.123	0.000	4.338	6.065
hivexposureMale homo/bisexual and IV drug use	2.540	0.102	9.154	0.000	2.082	3.104
hivexposureMale homosexual/bisexual contact	2.290	0.077	10.780	0.000	1.972	2.665
hivexposureReceipt of blood, blood components or tissue	2.293	0.262	3.163	0.002	1.367	3.842

#Future regression relationship model
exposure_coefs <- tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "hivexposure")) %>%
  mutate(
    # Cleans the prefix "hivexposure" off the category names
    exposure_level = str_remove(term, "hivexposure")
  )

ref_row <- data.frame(
  term = "Reference",
  estimate = 1.0,
  std.error = 0,
  statistic = NA,
  p.value = NA,
  conf.low = 1.0,
  conf.high = 1.0,
  exposure_level = "Hemophilia/coagulation disorder (Ref)"
)

exposure_plot_data <- bind_rows(ref_row, exposure_coefs)

p3 <- ggplot(exposure_plot_data, aes(x = reorder(exposure_level, estimate), y = estimate)) +
  # Line at 1.0: Estimates above this mean higher risk of "Dead" vs "Alive"
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
                  size = 0.8, color = "darkblue") +
  coord_flip() +
  labs(
    title = "Figure 4: Association Between HIV Exposure and Mortality",
    subtitle = "Weighted Odds Ratios (N = 11,022 total cases)",
    x = "HIV Exposure Category",
    y = "Odds Ratio (95% CI)"
  ) +
  theme_minimal(base_size = 11)

ggplotly(p3)

ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure4_pred_assoc.png", 
       plot = p3, width = 8, height = 6, dpi = 300)

#Future regression models
#compare models
# Model A: vitalstatus ~ hivexposure
m_A <- svyglm(vitalstatus ~ hivexposure,
  design = cdc_weighted,
  family  = quasibinomial()
)

# Model B: vitalstatus ~ hivexposure + casecriteria
m_B <- svyglm(vitalstatus ~ hivexposure + CaseCriteria,
  design = cdc_weighted,
  family  = quasibinomial()
)

# Model C: Full model
m_C <- svyglm(vitalstatus ~ hivexposure + CaseCriteria + yeardiagnosed + sexandorientation,
  design = cdc_weighted,
  family  = quasibinomial()
)

#Future regression modeling
# formatted model table
make_tbl <- function(model) {
  tbl_regression(
    model,
    exponentiate = TRUE,
    conf.int     = FALSE
  ) %>%
    add_significance_stars(
      hide_ci  = TRUE,
      hide_se  = FALSE,
      pattern  = "{estimate}{stars}"
    ) %>%
    remove_row_type(type = "reference") %>%
    bold_labels() %>%
    italicize_levels() %>%
    modify_header(estimate ~ "**OR**", std.error ~ "**SE**")
}

t_A <- make_tbl(m_A)
t_B <- make_tbl(m_B)
t_C <- make_tbl(m_C)

# Remove orphaned CI footnotes
for (tbl_obj in list(t_A, t_B, t_C)) {
  tbl_obj$table_styling$abbreviation <-
    tbl_obj$table_styling$abbreviation %>%
    dplyr::filter(column != "conf.low")
}

ft1 <- tbl_merge(
  tbls = list(t_A, t_B, t_C),
  tab_spanner = c(
    "**Model A: vitalstatus ~ hivexposure**",
    "**Model B: vitalstatus ~ hivexposure + casecriteria**",
    "**Model C: Full model**"
  )
) %>%
  bold_labels() %>%
  modify_caption("**Table 2. Logistic Regression: Predictors of Mortality (Vital Status)**") %>%
  as_gt() %>%
  tab_footnote(
    footnote = "OR = odds ratio; SE = standard error. Survey-weighted quasibinomial logistic regression. ***p < 0.001; **p < 0.01; *p < 0.05.",
    locations = cells_title()
  ) %>% 
  gtsave("/Users/nataliasmall/Downloads/EPI 553/outputs/Table2_logRegression.png")

print (ft1)

## [1] "/Users/nataliasmall/Downloads/EPI 553/outputs/Table2_logRegression.png"

Progress Check-in 1: Data Cleaning and Exploratory Analysis

EPI 553 — Principles of Statistical Inference II (Spring 2026)

Natalia Small

March 10, 2026

Setup and Data

Descriptive Statistics

Future Logistic Regression and Modeling