Setup and Data

# Load packages
library(dplyr)
library(tidyverse)
library(haven)
library(here)
library(knitr)
library(kableExtra)
library(broom)
library(plotly)
library (gtsummary)
library(janitor)
library (plotly)
library(ggplot2)
library(stringr)
library(gt)
library(ggeffects)

# Load CDC Wonder 1981 - 2002 datasets
cdc_new <- read_rds("/Users/nataliasmall/Downloads/EPI 553/cdcwonderdata3.rds")|>
  clean_names()

# Display dataset dimensions
names(cdc_new)

## [1] "yeardiagnosed"     "sexandorientation" "hivexposure"      
## [4] "case_criteria"     "vitalstatus"       "cases"

##Re-Recoding And Cleaning

# recoding and cleaning
# recoding character variables into factors: this will be beneficial for future regression modeling and dummy coding
cdc_analytic <- cdc_new |>
  mutate(
    # outcome
    mortality = vitalstatus,
    # sex and sexual orientation
    sexandorientation = factor(case_when(
      sexandorientation == "Female (any)"                                 ~ "Female (any)",
      sexandorientation == "Male (bisexual)"                              ~ "Male (bisexual)",
      sexandorientation == "Male (heterosexual or pediatric)"             ~ "Male (heterosexual or pediatric)",
      sexandorientation == "Male (homosexual)  or Unknown Classification" ~ "Male (homosexual) or Unknown Classification",
      TRUE                                                                ~ NA
    ), levels = c("Female (any)", "Male (bisexual)", "Male (heterosexual or pediatric)", "Male (homosexual) or Unknown Classification")),
    # HIV exposure
    hivexposure = factor(case_when(
      hivexposure == "Heterosexual contact with HIV"                ~ "Heterosexual contact with HIV",
      hivexposure == "IV drug use (female and hetero male)"         ~ "IV drug use (female and hetero male)",
      hivexposure == "Male homo/bisexual and IV drug use"           ~ "Male homo/bisexual and IV drug use",
      hivexposure == "Male homosexual/bisexual contact"             ~ "Male homosexual/bisexual contact",
      hivexposure == "Receipt of blood, blood components or tissue" ~ "Receipt of blood, blood components or tissue",
      TRUE                                                          ~ NA
    ), levels = c("Male homosexual/bisexual contact", "Heterosexual contact with HIV", "IV drug use (female and hetero male)", 
                  "Male homo/bisexual and IV drug use", "Receipt of blood, blood components or tissue")),
    # case criteria: surveillance method
    case_criteria = factor(case_when(
      case_criteria == "1985 surveillance definition"                  ~ "1985 surveillance definition",
      case_criteria == "1987 definition and diagnosed definitively"    ~ "1987 definition and diagnosed definitively",
      case_criteria == "1987 definition and diagnosed presumptively"   ~ "1987 definition and diagnosed presumptively",
      case_criteria == "1993 definition and diagnosed definitively"    ~ "1993 definition and diagnosed definitively",
      case_criteria == "1993 definition and diagnosed immunologically" ~ "1993 definition and diagnosed immunologically",
      case_criteria == "1993 definition and diagnosed presumptively"   ~ "1993 definition and diagnosed presumptively",
      case_criteria == "Pre-1985 surveillance definition"              ~ "Pre-1985 surveillance definition",
      TRUE                                                             ~ NA
    ), levels = c("Pre-1985 surveillance definition", "1985 surveillance definition", "1987 definition and diagnosed definitively", 
                  "1987 definition and diagnosed presumptively", "1993 definition and diagnosed definitively", 
                  "1993 definition and diagnosed immunologically", "1993 definition and diagnosed presumptively")),
    # year diagnosed
    yeardiagnosed = yeardiagnosed,
    # cases (our weight variable: paired to mortality, accounting for aggregated data structure)
    cases = cases,
    # new variable accounting for two distinct periods (1981-1986 vs. 1995-2000)
    year_period = factor(case_when(
      yeardiagnosed %in% c("Before 1982", "1982", "1983", "1984", "1985", "1986", "1987", "1988") ~ "1981-1988",
      yeardiagnosed %in% c("1995", "1996", "1997", "1998", "1999", "2000", "2001", "2002")        ~ "1995-2002",
      TRUE                                                                                        ~ NA
    ), levels = c("1981-1988", "1995-2002"))
  )|>
  filter(
    !is.na(yeardiagnosed), !is.na(sexandorientation), !is.na(hivexposure),
    !is.na(case_criteria), !is.na(mortality), !is.na(year_period), !is.na(cases)
  )

# set seed for reproducibility
set.seed(2124)
cdc_analytic <- cdc_analytic |>
  dplyr::select(yeardiagnosed, sexandorientation, hivexposure, 
                case_criteria, mortality, year_period, cases)

# Save re-recoded dataset
saveRDS(cdc_analytic, here::here("/Users/nataliasmall/Downloads/EPI 553/cdc_analytic.rds"))

##Confirmatory Calculations

#cdcwonder data is pre-aggregated, so data points are not representative of only 1 person, could be 60, 40, 200, etc.
#total obs. in dataset is 685, but there are more individuals accounted for than that based on identified cases. weight is very important here. 

#total number of counts of cases: 11022
sum(cdc_analytic$cases)

## [1] 11022

#total number of counts of cases reported as alive: 5306
sum(cdc_analytic$cases[cdc_analytic$mortality == "Alive"], na.rm = TRUE)

## [1] 5306

#total number of counts of cases reported as dead: 5716
sum(cdc_analytic$cases[cdc_analytic$mortality == "Dead"], na.rm = TRUE)

## [1] 5716

Re-Run Descriptive Statistics

# run table(cdc_new$sexandorientation, useNA = "always") if final observation do not match expectations: will reveal spelling of categories
cdc_analytic %>%
  select(yeardiagnosed, sexandorientation, hivexposure, case_criteria, mortality, year_period, cases) %>%
  summary() %>%
  kable(caption = "Descriptive Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Descriptive Statistics
yeardiagnosed	sexandorientation	hivexposure	case_criteria	mortality	year_period	cases
1995 :105	Female (any) :167	Male homosexual/bisexual contact :202	Pre-1985 surveillance definition :205	Alive:388	1981-1988:182	Min. : 1.00
1996 : 77	Male (bisexual) :140	Heterosexual contact with HIV :115	1985 surveillance definition : 69	Dead :297	1995-2002:503	1st Qu.: 1.00
1997 : 76	Male (heterosexual or pediatric) :189	IV drug use (female and hetero male) :180	1987 definition and diagnosed definitively :110	NA	NA	Median : 3.00
1998 : 74	Male (homosexual) or Unknown Classification:189	Male homo/bisexual and IV drug use :149	1987 definition and diagnosed presumptively : 98	NA	NA	Mean : 16.09
1999 : 59	NA	Receipt of blood, blood components or tissue: 39	1993 definition and diagnosed definitively : 65	NA	NA	3rd Qu.: 10.00
1988 : 53	NA	NA	1993 definition and diagnosed immunologically:122	NA	NA	Max. :976.00
(Other):241	NA	NA	1993 definition and diagnosed presumptively : 16	NA	NA	NA

Re-Confirm Table 1

library(gtsummary)
# library that handles math for weighting
library(survey)

# Create the weighted design object
# ids = ~1: independent sampling
# weights = ~cases:use the cases variable to "inflate" each row to its representative value in the population
cdc_weighted <- svydesign(ids = ~1, weights = ~cases, data = cdc_analytic)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

#tbl_svysummary: The "weighted version" of tbl_summary
cdc_weighted %>%
  tbl_svysummary(
    by = hivexposure,
    include = c(yeardiagnosed, year_period, sexandorientation, hivexposure, case_criteria, mortality),
    label = list(
      yeardiagnosed     ~ "Year Diagnosed",
      year_period       ~ "Year Period",
      sexandorientation ~ "Sex and Sexual Orientation",
      case_criteria     ~ "Case Criteria",
      mortality         ~ "Mortality"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  bold_labels() %>%
  modify_caption("**Table 1. Survey-Weighted Descriptive Statistics (Total N = 11,022)**") %>%
  as_gt()

**Table 1. Survey-Weighted Descriptive Statistics (Total N = 11,022)**
Characteristic	Male homosexual/bisexual contact N = 6,921¹	Heterosexual contact with HIV N = 899¹	IV drug use (female and hetero male) N = 2,345¹	Male homo/bisexual and IV drug use N = 794¹	Receipt of blood, blood components or tissue N = 63¹
Year Diagnosed
Before 1982	12 (0.2%)	1 (0.1%)	1 (<0.1%)	2 (0.3%)	0 (0%)
1982	9 (0.1%)	0 (0%)	8 (0.3%)	2 (0.3%)	0 (0%)
1983	15 (0.2%)	0 (0%)	3 (0.1%)	3 (0.4%)	1 (1.6%)
1984	35 (0.5%)	1 (0.1%)	9 (0.4%)	5 (0.6%)	2 (3.2%)
1985	64 (0.9%)	0 (0%)	22 (0.9%)	13 (1.6%)	0 (0%)
1986	255 (3.7%)	7 (0.8%)	81 (3.5%)	36 (4.5%)	1 (1.6%)
1987	1,006 (15%)	66 (7.3%)	520 (22%)	126 (16%)	9 (14%)
1988	1,361 (20%)	104 (12%)	795 (34%)	135 (17%)	13 (21%)
1995	1,089 (16%)	147 (16%)	246 (10%)	125 (16%)	9 (14%)
1996	973 (14%)	107 (12%)	168 (7.2%)	88 (11%)	6 (9.5%)
1997	500 (7.2%)	108 (12%)	123 (5.2%)	63 (7.9%)	5 (7.9%)
1998	338 (4.9%)	78 (8.7%)	74 (3.2%)	59 (7.4%)	3 (4.8%)
1999	390 (5.6%)	74 (8.2%)	86 (3.7%)	44 (5.5%)	6 (9.5%)
2000	374 (5.4%)	60 (6.7%)	77 (3.3%)	45 (5.7%)	0 (0%)
2001	299 (4.3%)	92 (10%)	84 (3.6%)	32 (4.0%)	6 (9.5%)
2002	201 (2.9%)	54 (6.0%)	48 (2.0%)	16 (2.0%)	2 (3.2%)
Year Period
1981-1988	2,757 (40%)	179 (20%)	1,439 (61%)	322 (41%)	26 (41%)
1995-2002	4,164 (60%)	720 (80%)	906 (39%)	472 (59%)	37 (59%)
Sex and Sexual Orientation
Female (any)	0 (0%)	670 (75%)	677 (29%)	0 (0%)	32 (51%)
Male (bisexual)	820 (12%)	0 (0%)	0 (0%)	213 (27%)	0 (0%)
Male (heterosexual or pediatric)	9 (0.1%)	229 (25%)	1,668 (71%)	21 (2.6%)	31 (49%)
Male (homosexual) or Unknown Classification	6,092 (88%)	0 (0%)	0 (0%)	560 (71%)	0 (0%)
Case Criteria
Pre-1985 surveillance definition	3,603 (52%)	282 (31%)	1,225 (52%)	418 (53%)	33 (52%)
1985 surveillance definition	181 (2.6%)	10 (1.1%)	48 (2.0%)	14 (1.8%)	1 (1.6%)
1987 definition and diagnosed definitively	374 (5.4%)	50 (5.6%)	288 (12%)	63 (7.9%)	2 (3.2%)
1987 definition and diagnosed presumptively	334 (4.8%)	52 (5.8%)	239 (10%)	48 (6.0%)	2 (3.2%)
1993 definition and diagnosed definitively	53 (0.8%)	20 (2.2%)	44 (1.9%)	18 (2.3%)	2 (3.2%)
1993 definition and diagnosed immunologically	2,371 (34%)	485 (54%)	489 (21%)	230 (29%)	22 (35%)
1993 definition and diagnosed presumptively	5 (<0.1%)	0 (0%)	12 (0.5%)	3 (0.4%)	1 (1.6%)
Mortality
Alive	3,518 (51%)	632 (70%)	741 (32%)	383 (48%)	32 (51%)
Dead	3,403 (49%)	267 (30%)	1,604 (68%)	411 (52%)	31 (49%)
¹ n (%)

Re-Confirm figure 1: Distribution Of Outcome Variable

p_bar <- ggplot(cdc_analytic, aes(x = mortality, weight = cases)) +
  geom_bar(fill = "darkblue", color = "white") +
  scale_y_continuous(labels = scales::comma) + # Gemini suggested: Makes numbers like 5,000 readable
  labs(
    title = "Figure 1: Distribution of Mortality",
    subtitle = "Weighted by case counts (Total N = 11,022)",
    x = "Mortality",
    y = "Total Number of People"
  ) +
  theme_minimal(base_size = 13)
  
ggplotly(p_bar)

##Logistic Regression

# Compare Models
# Unadjusted - Model A: mortality ~ hivexposure
m_A <- svyglm(mortality ~ hivexposure,
  design  = cdc_weighted,
  family  = quasibinomial())

# Adjusted - Model B: mortality ~ hivexposure + year_period
m_B <- svyglm(mortality ~ hivexposure + year_period,
  design = cdc_weighted,
  family  = quasibinomial())

# Adjusted - Model C: mortality ~ hivexposure + yeardiagnosed
m_C <- svyglm(mortality ~ hivexposure + yeardiagnosed,
  design = cdc_weighted,
  family  = quasibinomial())

# Adjusted - Model D: Full model
m_D <- svyglm(mortality ~ hivexposure + year_period + case_criteria + sexandorientation,
  design = cdc_weighted,
  family  = quasibinomial())

Formatted Regresion table

# formatted model table
make_tbl <- function(model) {
  tbl_regression(
    model,
    exponentiate = TRUE,
    conf.int     = TRUE
  ) %>%
    add_significance_stars(
      hide_ci  = TRUE,
      hide_se  = FALSE,
      pattern  = "{estimate}{stars}"
    ) %>%
    remove_row_type(type = "reference") %>%
    bold_labels() %>%
    italicize_levels() %>%
    modify_header(estimate ~ "**OR**", std.error ~ "**SE**", conf.low ~ "**95% CI**", p.value  ~ "**p-value**")
}

t_A <- make_tbl(m_A)
t_B <- make_tbl(m_B)
t_C <- make_tbl(m_C)
t_D <- make_tbl(m_D)

# Remove orphaned CI footnotes
for (tbl_obj in list(t_A, t_B, t_C)) {
  tbl_obj$table_styling$abbreviation <-
    tbl_obj$table_styling$abbreviation %>%
    dplyr::filter(column != "conf.low")
}

tbl_merge(
  tbls      = list(t_A, t_B, t_C, t_D),
  tab_spanner = c(
    "**Model 1 Unadjusted: mortality ~ hivexposure**",
    "**Model 2 Adjusted: mortality ~ hivexposure + year_period**",
    "**Model 3 Adjusted: mortality ~ hivexposure + yeardiagnosed**",
    "**Model 4 Adjusted: Full model**"
  )
) %>%
  bold_labels() %>%
  modify_caption("**Table 2. Logistic Regression: Predictors of Mortality**") %>%
  as_gt() %>%
  tab_footnote(
    footnote = "OR = odds ratio; SE = standard error. Survey-weighted quasibinomial logistic regression. ***p < 0.001; **p < 0.01; *p < 0.05.",
    locations = cells_title()
  )

**Table 2. Logistic Regression: Predictors of Mortality**
Characteristic	Model 1 Unadjusted: mortality ~ hivexposure				Model 2 Adjusted: mortality ~ hivexposure + year_period				Model 3 Adjusted: mortality ~ hivexposure + yeardiagnosed				Model 4 Adjusted: Full model
Characteristic	OR¹	SE	95% CI	p-value	OR¹	SE	95% CI	p-value	OR¹	SE	95% CI	p-value	OR¹	SE	95% CI	p-value
hivexposure
Heterosexual contact with HIV	0.44	0.584	0.14, 1.37	0.2	0.69	0.466	0.27, 1.71	0.4	0.80	0.511	0.29, 2.18	0.7	0.33	0.731	0.08, 1.40	0.13
IV drug use (female and hetero male)	2.24	0.565	0.74, 6.78	0.2	1.40	0.425	0.61, 3.23	0.4	1.50	0.469	0.60, 3.77	0.4	0.66	0.694	0.17, 2.57	0.5
Male homo/bisexual and IV drug use	1.11	0.550	0.38, 3.27	0.9	1.18	0.448	0.49, 2.85	0.7	1.20	0.484	0.47, 3.12	0.7	1.12	0.408	0.50, 2.50	0.8
Receipt of blood, blood components or tissue	1.00	0.600	0.31, 3.26	>0.9	0.92	0.549	0.31, 2.69	0.9	0.96	0.568	0.32, 2.94	>0.9	0.44	0.774	0.10, 2.00	0.3
year_period
1995-2002					0.02***	0.512	0.01, 0.05	<0.001					0.03***	0.555	0.01, 0.09	<0.001
yeardiagnosed
1982									2.22	1.56	0.10, 47.3	0.6
1983									2.88	1.50	0.15, 54.9	0.5
1984									3.42	1.51	0.18, 65.8	0.4
1985									1.33	1.24	0.12, 15.0	0.8
1986									2.88	1.27	0.24, 34.6	0.4
1987									2.12	1.20	0.20, 22.4	0.5
1988									1.87	1.21	0.17, 20.2	0.6
1995									0.09*	1.13	0.01, 0.87	0.037
1996									0.04**	1.15	0.00, 0.41	0.006
1997									0.03**	1.09	0.00, 0.27	0.002
1998									0.03**	1.09	0.00, 0.23	0.001
1999									0.02***	1.11	0.00, 0.18	<0.001
2000									0.00***	1.05	0.00, 0.00	<0.001
2001									0.00***	1.05	0.00, 0.00	<0.001
2002									0.00***	1.07	0.00, 0.00	<0.001
case_criteria
1985 surveillance definition													1.13	0.588	0.35, 3.57	0.8
1987 definition and diagnosed definitively													1.11	0.475	0.44, 2.82	0.8
1987 definition and diagnosed presumptively													0.94	0.485	0.36, 2.43	0.9
1993 definition and diagnosed definitively													0.81	0.526	0.29, 2.26	0.7
1993 definition and diagnosed immunologically													0.30*	0.591	0.09, 0.96	0.043
1993 definition and diagnosed presumptively													0.34	0.800	0.07, 1.63	0.2
sexandorientation
Male (bisexual)													0.37	0.759	0.08, 1.65	0.2
Male (heterosexual or pediatric)													0.85	0.404	0.39, 1.89	0.7
Male (homosexual) or Unknown Classification													0.43	0.749	0.10, 1.85	0.3
¹ p<0.05; p<0.01; **p<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio, SE = Standard Error

Model Diagnostics

# Calibration Check: Predicted vs Observed by Decile
cdc_analytic |>
  mutate(pred_prob = fitted(m_D),
         obs_outcome = as.numeric(mortality == "Dead"), 
         decile = ntile(pred_prob, 10)) |>
  group_by(decile) |>
  summarise(
    mean_pred = weighted.mean(pred_prob, w = cases),
    mean_obs  = weighted.mean(obs_outcome, w = cases),
    .groups = "drop"
  ) |>
  ggplot(aes(x = mean_pred, y = mean_obs)) +
  geom_abline(slope = 1, intercept = 0, color = "grey", linetype = "dashed") +
  geom_point(size = 3, color = "darkblue") +
  geom_line(color = "darkblue") +
  labs(title = "Calibration Plot: Observed vs. Predicted Probability of Mortality",
       subtitle = "Points should fall on the dashed line for perfect calibration",
       x = "Mean Predicted Probability (within decile)",
       y = "Observed Proportion (within decile)") +
  theme_minimal()

# Cook’s distance plot
# Residuals vs Fitted
# Residuals vs Leverage
par(mfrow = c(2, 2))
plot(m_D)

Regression Visualizations

# Calculate predicted probabilities for hivexposure
pred_prob <- ggpredict(m_D, terms = "hivexposure")

plot(pred_prob) +
  labs(
    title = "Figure 1: Predicted Probability of Mortality by HIV Exposure",
    subtitle = "Adjusted for Year Period, Sex/Orientation, and Case Criteria",
    x = "HIV Exposure Category",
    y = "Predicted Probability of Death"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Forest Plot
tidy(m_D, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(x = estimate, y = reorder(term, estimate))) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
  geom_point(size = 3, color = "darkblue") +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2, color = "darkblue") +
  scale_x_log10() +
  labs(title = "Figure 2: Adjusted Odds Ratios for HIV Mortality",
       subtitle = "Reference line at OR = 1; log-scale x-axis",
       x = "Adjusted Odds Ratio (95% CI)", 
       y = NULL) +
  theme_minimal()

End of Progress Check-In 2

Progress Check-in 2: Regression Results and Visualizations

EPI 553 — Principles of Statistical Inference II (Spring 2026)

Natalia Small

April 7, 2026

Setup and Data

Re-Run Descriptive Statistics

Re-Confirm Table 1

Re-Confirm figure 1: Distribution Of Outcome Variable

Formatted Regresion table

Model Diagnostics

Regression Visualizations