Setup and Data

# Load packages
library(dplyr)
library(tidyverse)
library(haven)
library(here)
library(knitr)
library(kableExtra)
library(broom)
library(plotly)
library (gtsummary)
library(janitor)
library (plotly)
library(ggplot2)
library(stringr)
library(gt)
library(ggeffects)

# Load CDC Wonder 1981 - 2002 datasets
cdc_new <- read_rds("/Users/nataliasmall/Downloads/EPI 553/cdcwonderdata3.rds")|>
  clean_names()

# Display dataset dimensions
names(cdc_new)
## [1] "yeardiagnosed"     "sexandorientation" "hivexposure"      
## [4] "case_criteria"     "vitalstatus"       "cases"

##Re-Recoding And Cleaning

# recoding and cleaning
# recoding character variables into factors: this will be beneficial for future regression modeling and dummy coding
cdc_analytic <- cdc_new |>
  mutate(
    # outcome
    mortality = vitalstatus,
    # sex and sexual orientation
    sexandorientation = factor(case_when(
      sexandorientation == "Female (any)"                                 ~ "Female (any)",
      sexandorientation == "Male (bisexual)"                              ~ "Male (bisexual)",
      sexandorientation == "Male (heterosexual or pediatric)"             ~ "Male (heterosexual or pediatric)",
      sexandorientation == "Male (homosexual)  or Unknown Classification" ~ "Male (homosexual) or Unknown Classification",
      TRUE                                                                ~ NA
    ), levels = c("Female (any)", "Male (bisexual)", "Male (heterosexual or pediatric)", "Male (homosexual) or Unknown Classification")),
    # HIV exposure
    hivexposure = factor(case_when(
      hivexposure == "Heterosexual contact with HIV"                ~ "Heterosexual contact with HIV",
      hivexposure == "IV drug use (female and hetero male)"         ~ "IV drug use (female and hetero male)",
      hivexposure == "Male homo/bisexual and IV drug use"           ~ "Male homo/bisexual and IV drug use",
      hivexposure == "Male homosexual/bisexual contact"             ~ "Male homosexual/bisexual contact",
      hivexposure == "Receipt of blood, blood components or tissue" ~ "Receipt of blood, blood components or tissue",
      TRUE                                                          ~ NA
    ), levels = c("Male homosexual/bisexual contact", "Heterosexual contact with HIV", "IV drug use (female and hetero male)", 
                  "Male homo/bisexual and IV drug use", "Receipt of blood, blood components or tissue")),
    # case criteria: surveillance method
    case_criteria = factor(case_when(
      case_criteria == "1985 surveillance definition"                  ~ "1985 surveillance definition",
      case_criteria == "1987 definition and diagnosed definitively"    ~ "1987 definition and diagnosed definitively",
      case_criteria == "1987 definition and diagnosed presumptively"   ~ "1987 definition and diagnosed presumptively",
      case_criteria == "1993 definition and diagnosed definitively"    ~ "1993 definition and diagnosed definitively",
      case_criteria == "1993 definition and diagnosed immunologically" ~ "1993 definition and diagnosed immunologically",
      case_criteria == "1993 definition and diagnosed presumptively"   ~ "1993 definition and diagnosed presumptively",
      case_criteria == "Pre-1985 surveillance definition"              ~ "Pre-1985 surveillance definition",
      TRUE                                                             ~ NA
    ), levels = c("Pre-1985 surveillance definition", "1985 surveillance definition", "1987 definition and diagnosed definitively", 
                  "1987 definition and diagnosed presumptively", "1993 definition and diagnosed definitively", 
                  "1993 definition and diagnosed immunologically", "1993 definition and diagnosed presumptively")),
    # year diagnosed
    yeardiagnosed = yeardiagnosed,
    # cases (our weight variable: paired to mortality, accounting for aggregated data structure)
    cases = cases,
    # new variable accounting for two distinct periods (1981-1986 vs. 1995-2000)
    year_period = factor(case_when(
      yeardiagnosed %in% c("Before 1982", "1982", "1983", "1984", "1985", "1986", "1987", "1988") ~ "1981-1988",
      yeardiagnosed %in% c("1995", "1996", "1997", "1998", "1999", "2000", "2001", "2002")        ~ "1995-2002",
      TRUE                                                                                        ~ NA
    ), levels = c("1981-1988", "1995-2002"))
  )|>
  filter(
    !is.na(yeardiagnosed), !is.na(sexandorientation), !is.na(hivexposure),
    !is.na(case_criteria), !is.na(mortality), !is.na(year_period), !is.na(cases)
  )

# set seed for reproducibility
set.seed(2124)
cdc_analytic <- cdc_analytic |>
  dplyr::select(yeardiagnosed, sexandorientation, hivexposure, 
                case_criteria, mortality, year_period, cases)

# Save re-recoded dataset
saveRDS(cdc_analytic, here::here("/Users/nataliasmall/Downloads/EPI 553/cdc_analytic.rds"))

##Confirmatory Calculations

#cdcwonder data is pre-aggregated, so data points are not representative of only 1 person, could be 60, 40, 200, etc.
#total obs. in dataset is 685, but there are more individuals accounted for than that based on identified cases. weight is very important here. 

#total number of counts of cases: 11022
sum(cdc_analytic$cases)
## [1] 11022
#total number of counts of cases reported as alive: 5306
sum(cdc_analytic$cases[cdc_analytic$mortality == "Alive"], na.rm = TRUE)
## [1] 5306
#total number of counts of cases reported as dead: 5716
sum(cdc_analytic$cases[cdc_analytic$mortality == "Dead"], na.rm = TRUE)
## [1] 5716

Re-Run Descriptive Statistics

# run table(cdc_new$sexandorientation, useNA = "always") if final observation do not match expectations: will reveal spelling of categories
cdc_analytic %>%
  select(yeardiagnosed, sexandorientation, hivexposure, case_criteria, mortality, year_period, cases) %>%
  summary() %>%
  kable(caption = "Descriptive Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Descriptive Statistics
yeardiagnosed sexandorientation hivexposure case_criteria mortality year_period cases
1995 :105 Female (any) :167 Male homosexual/bisexual contact :202 Pre-1985 surveillance definition :205 Alive:388 1981-1988:182 Min. : 1.00
1996 : 77 Male (bisexual) :140 Heterosexual contact with HIV :115 1985 surveillance definition : 69 Dead :297 1995-2002:503 1st Qu.: 1.00
1997 : 76 Male (heterosexual or pediatric) :189 IV drug use (female and hetero male) :180 1987 definition and diagnosed definitively :110 NA NA Median : 3.00
1998 : 74 Male (homosexual) or Unknown Classification:189 Male homo/bisexual and IV drug use :149 1987 definition and diagnosed presumptively : 98 NA NA Mean : 16.09
1999 : 59 NA Receipt of blood, blood components or tissue: 39 1993 definition and diagnosed definitively : 65 NA NA 3rd Qu.: 10.00
1988 : 53 NA NA 1993 definition and diagnosed immunologically:122 NA NA Max. :976.00
(Other):241 NA NA 1993 definition and diagnosed presumptively : 16 NA NA NA

Re-Confirm Table 1

library(gtsummary)
# library that handles math for weighting
library(survey)

# Create the weighted design object
# ids = ~1: independent sampling
# weights = ~cases:use the cases variable to "inflate" each row to its representative value in the population
cdc_weighted <- svydesign(ids = ~1, weights = ~cases, data = cdc_analytic)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

#tbl_svysummary: The "weighted version" of tbl_summary
cdc_weighted %>%
  tbl_svysummary(
    by = hivexposure,
    include = c(yeardiagnosed, year_period, sexandorientation, hivexposure, case_criteria, mortality),
    label = list(
      yeardiagnosed     ~ "Year Diagnosed",
      year_period       ~ "Year Period",
      sexandorientation ~ "Sex and Sexual Orientation",
      case_criteria     ~ "Case Criteria",
      mortality         ~ "Mortality"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  bold_labels() %>%
  modify_caption("**Table 1. Survey-Weighted Descriptive Statistics (Total N = 11,022)**") %>%
  as_gt()
Table 1. Survey-Weighted Descriptive Statistics (Total N = 11,022)
Characteristic Male homosexual/bisexual contact
N = 6,921
1
Heterosexual contact with HIV
N = 899
1
IV drug use (female and hetero male)
N = 2,345
1
Male homo/bisexual and IV drug use
N = 794
1
Receipt of blood, blood components or tissue
N = 63
1
Year Diagnosed




    Before 1982 12 (0.2%) 1 (0.1%) 1 (<0.1%) 2 (0.3%) 0 (0%)
    1982 9 (0.1%) 0 (0%) 8 (0.3%) 2 (0.3%) 0 (0%)
    1983 15 (0.2%) 0 (0%) 3 (0.1%) 3 (0.4%) 1 (1.6%)
    1984 35 (0.5%) 1 (0.1%) 9 (0.4%) 5 (0.6%) 2 (3.2%)
    1985 64 (0.9%) 0 (0%) 22 (0.9%) 13 (1.6%) 0 (0%)
    1986 255 (3.7%) 7 (0.8%) 81 (3.5%) 36 (4.5%) 1 (1.6%)
    1987 1,006 (15%) 66 (7.3%) 520 (22%) 126 (16%) 9 (14%)
    1988 1,361 (20%) 104 (12%) 795 (34%) 135 (17%) 13 (21%)
    1995 1,089 (16%) 147 (16%) 246 (10%) 125 (16%) 9 (14%)
    1996 973 (14%) 107 (12%) 168 (7.2%) 88 (11%) 6 (9.5%)
    1997 500 (7.2%) 108 (12%) 123 (5.2%) 63 (7.9%) 5 (7.9%)
    1998 338 (4.9%) 78 (8.7%) 74 (3.2%) 59 (7.4%) 3 (4.8%)
    1999 390 (5.6%) 74 (8.2%) 86 (3.7%) 44 (5.5%) 6 (9.5%)
    2000 374 (5.4%) 60 (6.7%) 77 (3.3%) 45 (5.7%) 0 (0%)
    2001 299 (4.3%) 92 (10%) 84 (3.6%) 32 (4.0%) 6 (9.5%)
    2002 201 (2.9%) 54 (6.0%) 48 (2.0%) 16 (2.0%) 2 (3.2%)
Year Period




    1981-1988 2,757 (40%) 179 (20%) 1,439 (61%) 322 (41%) 26 (41%)
    1995-2002 4,164 (60%) 720 (80%) 906 (39%) 472 (59%) 37 (59%)
Sex and Sexual Orientation




    Female (any) 0 (0%) 670 (75%) 677 (29%) 0 (0%) 32 (51%)
    Male (bisexual) 820 (12%) 0 (0%) 0 (0%) 213 (27%) 0 (0%)
    Male (heterosexual or pediatric) 9 (0.1%) 229 (25%) 1,668 (71%) 21 (2.6%) 31 (49%)
    Male (homosexual) or Unknown Classification 6,092 (88%) 0 (0%) 0 (0%) 560 (71%) 0 (0%)
Case Criteria




    Pre-1985 surveillance definition 3,603 (52%) 282 (31%) 1,225 (52%) 418 (53%) 33 (52%)
    1985 surveillance definition 181 (2.6%) 10 (1.1%) 48 (2.0%) 14 (1.8%) 1 (1.6%)
    1987 definition and diagnosed definitively 374 (5.4%) 50 (5.6%) 288 (12%) 63 (7.9%) 2 (3.2%)
    1987 definition and diagnosed presumptively 334 (4.8%) 52 (5.8%) 239 (10%) 48 (6.0%) 2 (3.2%)
    1993 definition and diagnosed definitively 53 (0.8%) 20 (2.2%) 44 (1.9%) 18 (2.3%) 2 (3.2%)
    1993 definition and diagnosed immunologically 2,371 (34%) 485 (54%) 489 (21%) 230 (29%) 22 (35%)
    1993 definition and diagnosed presumptively 5 (<0.1%) 0 (0%) 12 (0.5%) 3 (0.4%) 1 (1.6%)
Mortality




    Alive 3,518 (51%) 632 (70%) 741 (32%) 383 (48%) 32 (51%)
    Dead 3,403 (49%) 267 (30%) 1,604 (68%) 411 (52%) 31 (49%)
1 n (%)

Re-Confirm figure 1: Distribution Of Outcome Variable

p_bar <- ggplot(cdc_analytic, aes(x = mortality, weight = cases)) +
  geom_bar(fill = "darkblue", color = "white") +
  scale_y_continuous(labels = scales::comma) + # Gemini suggested: Makes numbers like 5,000 readable
  labs(
    title = "Figure 1: Distribution of Mortality",
    subtitle = "Weighted by case counts (Total N = 11,022)",
    x = "Mortality",
    y = "Total Number of People"
  ) +
  theme_minimal(base_size = 13)
  
ggplotly(p_bar)

##Logistic Regression

# Compare Models
# Unadjusted - Model A: mortality ~ hivexposure
m_A <- svyglm(mortality ~ hivexposure,
  design  = cdc_weighted,
  family  = quasibinomial())

# Adjusted - Model B: mortality ~ hivexposure + year_period
m_B <- svyglm(mortality ~ hivexposure + year_period,
  design = cdc_weighted,
  family  = quasibinomial())

# Adjusted - Model C: mortality ~ hivexposure + yeardiagnosed
m_C <- svyglm(mortality ~ hivexposure + yeardiagnosed,
  design = cdc_weighted,
  family  = quasibinomial())

# Adjusted - Model D: Full model
m_D <- svyglm(mortality ~ hivexposure + year_period + case_criteria + sexandorientation,
  design = cdc_weighted,
  family  = quasibinomial())

Formatted Regresion table

# formatted model table
make_tbl <- function(model) {
  tbl_regression(
    model,
    exponentiate = TRUE,
    conf.int     = TRUE
  ) %>%
    add_significance_stars(
      hide_ci  = TRUE,
      hide_se  = FALSE,
      pattern  = "{estimate}{stars}"
    ) %>%
    remove_row_type(type = "reference") %>%
    bold_labels() %>%
    italicize_levels() %>%
    modify_header(estimate ~ "**OR**", std.error ~ "**SE**", conf.low ~ "**95% CI**", p.value  ~ "**p-value**")
}

t_A <- make_tbl(m_A)
t_B <- make_tbl(m_B)
t_C <- make_tbl(m_C)
t_D <- make_tbl(m_D)

# Remove orphaned CI footnotes
for (tbl_obj in list(t_A, t_B, t_C)) {
  tbl_obj$table_styling$abbreviation <-
    tbl_obj$table_styling$abbreviation %>%
    dplyr::filter(column != "conf.low")
}

tbl_merge(
  tbls      = list(t_A, t_B, t_C, t_D),
  tab_spanner = c(
    "**Model 1 Unadjusted: mortality ~ hivexposure**",
    "**Model 2 Adjusted: mortality ~ hivexposure + year_period**",
    "**Model 3 Adjusted: mortality ~ hivexposure + yeardiagnosed**",
    "**Model 4 Adjusted: Full model**"
  )
) %>%
  bold_labels() %>%
  modify_caption("**Table 2. Logistic Regression: Predictors of Mortality**") %>%
  as_gt() %>%
  tab_footnote(
    footnote = "OR = odds ratio; SE = standard error. Survey-weighted quasibinomial logistic regression. ***p < 0.001; **p < 0.01; *p < 0.05.",
    locations = cells_title()
  )
Table 2. Logistic Regression: Predictors of Mortality
Characteristic
Model 1 Unadjusted: mortality ~ hivexposure
Model 2 Adjusted: mortality ~ hivexposure + year_period
Model 3 Adjusted: mortality ~ hivexposure + yeardiagnosed
Model 4 Adjusted: Full model
OR1 SE 95% CI p-value OR1 SE 95% CI p-value OR1 SE 95% CI p-value OR1 SE 95% CI p-value
hivexposure















    Heterosexual contact with HIV 0.44 0.584 0.14, 1.37 0.2 0.69 0.466 0.27, 1.71 0.4 0.80 0.511 0.29, 2.18 0.7 0.33 0.731 0.08, 1.40 0.13
    IV drug use (female and hetero male) 2.24 0.565 0.74, 6.78 0.2 1.40 0.425 0.61, 3.23 0.4 1.50 0.469 0.60, 3.77 0.4 0.66 0.694 0.17, 2.57 0.5
    Male homo/bisexual and IV drug use 1.11 0.550 0.38, 3.27 0.9 1.18 0.448 0.49, 2.85 0.7 1.20 0.484 0.47, 3.12 0.7 1.12 0.408 0.50, 2.50 0.8
    Receipt of blood, blood components or tissue 1.00 0.600 0.31, 3.26 >0.9 0.92 0.549 0.31, 2.69 0.9 0.96 0.568 0.32, 2.94 >0.9 0.44 0.774 0.10, 2.00 0.3
year_period















    1995-2002



0.02*** 0.512 0.01, 0.05 <0.001



0.03*** 0.555 0.01, 0.09 <0.001
yeardiagnosed















    1982







2.22 1.56 0.10, 47.3 0.6



    1983







2.88 1.50 0.15, 54.9 0.5



    1984







3.42 1.51 0.18, 65.8 0.4



    1985







1.33 1.24 0.12, 15.0 0.8



    1986







2.88 1.27 0.24, 34.6 0.4



    1987







2.12 1.20 0.20, 22.4 0.5



    1988







1.87 1.21 0.17, 20.2 0.6



    1995







0.09* 1.13 0.01, 0.87 0.037



    1996







0.04** 1.15 0.00, 0.41 0.006



    1997







0.03** 1.09 0.00, 0.27 0.002



    1998







0.03** 1.09 0.00, 0.23 0.001



    1999







0.02*** 1.11 0.00, 0.18 <0.001



    2000







0.00*** 1.05 0.00, 0.00 <0.001



    2001







0.00*** 1.05 0.00, 0.00 <0.001



    2002







0.00*** 1.07 0.00, 0.00 <0.001



case_criteria















    1985 surveillance definition











1.13 0.588 0.35, 3.57 0.8
    1987 definition and diagnosed definitively











1.11 0.475 0.44, 2.82 0.8
    1987 definition and diagnosed presumptively











0.94 0.485 0.36, 2.43 0.9
    1993 definition and diagnosed definitively











0.81 0.526 0.29, 2.26 0.7
    1993 definition and diagnosed immunologically











0.30* 0.591 0.09, 0.96 0.043
    1993 definition and diagnosed presumptively











0.34 0.800 0.07, 1.63 0.2
sexandorientation















    Male (bisexual)











0.37 0.759 0.08, 1.65 0.2
    Male (heterosexual or pediatric)











0.85 0.404 0.39, 1.89 0.7
    Male (homosexual) or Unknown Classification











0.43 0.749 0.10, 1.85 0.3
1 *p<0.05; **p<0.01; ***p<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio, SE = Standard Error

Model Diagnostics

# Calibration Check: Predicted vs Observed by Decile
cdc_analytic |>
  mutate(pred_prob = fitted(m_D),
         obs_outcome = as.numeric(mortality == "Dead"), 
         decile = ntile(pred_prob, 10)) |>
  group_by(decile) |>
  summarise(
    mean_pred = weighted.mean(pred_prob, w = cases),
    mean_obs  = weighted.mean(obs_outcome, w = cases),
    .groups = "drop"
  ) |>
  ggplot(aes(x = mean_pred, y = mean_obs)) +
  geom_abline(slope = 1, intercept = 0, color = "grey", linetype = "dashed") +
  geom_point(size = 3, color = "darkblue") +
  geom_line(color = "darkblue") +
  labs(title = "Calibration Plot: Observed vs. Predicted Probability of Mortality",
       subtitle = "Points should fall on the dashed line for perfect calibration",
       x = "Mean Predicted Probability (within decile)",
       y = "Observed Proportion (within decile)") +
  theme_minimal()

# Cook’s distance plot
# Residuals vs Fitted
# Residuals vs Leverage
par(mfrow = c(2, 2))
plot(m_D)

Regression Visualizations

# Calculate predicted probabilities for hivexposure
pred_prob <- ggpredict(m_D, terms = "hivexposure")

plot(pred_prob) +
  labs(
    title = "Figure 1: Predicted Probability of Mortality by HIV Exposure",
    subtitle = "Adjusted for Year Period, Sex/Orientation, and Case Criteria",
    x = "HIV Exposure Category",
    y = "Predicted Probability of Death"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Forest Plot
tidy(m_D, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(x = estimate, y = reorder(term, estimate))) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
  geom_point(size = 3, color = "darkblue") +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2, color = "darkblue") +
  scale_x_log10() +
  labs(title = "Figure 2: Adjusted Odds Ratios for HIV Mortality",
       subtitle = "Reference line at OR = 1; log-scale x-axis",
       x = "Adjusted Odds Ratio (95% CI)", 
       y = NULL) +
  theme_minimal()

End of Progress Check-In 2