# Load packages
library(dplyr)
library(tidyverse)
library(haven)
library(here)
library(knitr)
library(kableExtra)
library(broom)
library(plotly)
library (gtsummary)
library(janitor)
library (plotly)
library(ggplot2)
library(stringr)
library(gt)
library(ggeffects)
# Load CDC Wonder 1981 - 2002 datasets
cdc_new <- read_rds("/Users/nataliasmall/Downloads/EPI 553/cdcwonderdata3.rds")|>
clean_names()
# Display dataset dimensions
names(cdc_new)## [1] "yeardiagnosed" "sexandorientation" "hivexposure"
## [4] "case_criteria" "vitalstatus" "cases"
##Re-Recoding And Cleaning
# recoding and cleaning
# recoding character variables into factors: this will be beneficial for future regression modeling and dummy coding
cdc_analytic <- cdc_new |>
mutate(
# outcome
mortality = vitalstatus,
# sex and sexual orientation
sexandorientation = factor(case_when(
sexandorientation == "Female (any)" ~ "Female (any)",
sexandorientation == "Male (bisexual)" ~ "Male (bisexual)",
sexandorientation == "Male (heterosexual or pediatric)" ~ "Male (heterosexual or pediatric)",
sexandorientation == "Male (homosexual) or Unknown Classification" ~ "Male (homosexual) or Unknown Classification",
TRUE ~ NA
), levels = c("Female (any)", "Male (bisexual)", "Male (heterosexual or pediatric)", "Male (homosexual) or Unknown Classification")),
# HIV exposure
hivexposure = factor(case_when(
hivexposure == "Heterosexual contact with HIV" ~ "Heterosexual contact with HIV",
hivexposure == "IV drug use (female and hetero male)" ~ "IV drug use (female and hetero male)",
hivexposure == "Male homo/bisexual and IV drug use" ~ "Male homo/bisexual and IV drug use",
hivexposure == "Male homosexual/bisexual contact" ~ "Male homosexual/bisexual contact",
hivexposure == "Receipt of blood, blood components or tissue" ~ "Receipt of blood, blood components or tissue",
TRUE ~ NA
), levels = c("Male homosexual/bisexual contact", "Heterosexual contact with HIV", "IV drug use (female and hetero male)",
"Male homo/bisexual and IV drug use", "Receipt of blood, blood components or tissue")),
# case criteria: surveillance method
case_criteria = factor(case_when(
case_criteria == "1985 surveillance definition" ~ "1985 surveillance definition",
case_criteria == "1987 definition and diagnosed definitively" ~ "1987 definition and diagnosed definitively",
case_criteria == "1987 definition and diagnosed presumptively" ~ "1987 definition and diagnosed presumptively",
case_criteria == "1993 definition and diagnosed definitively" ~ "1993 definition and diagnosed definitively",
case_criteria == "1993 definition and diagnosed immunologically" ~ "1993 definition and diagnosed immunologically",
case_criteria == "1993 definition and diagnosed presumptively" ~ "1993 definition and diagnosed presumptively",
case_criteria == "Pre-1985 surveillance definition" ~ "Pre-1985 surveillance definition",
TRUE ~ NA
), levels = c("Pre-1985 surveillance definition", "1985 surveillance definition", "1987 definition and diagnosed definitively",
"1987 definition and diagnosed presumptively", "1993 definition and diagnosed definitively",
"1993 definition and diagnosed immunologically", "1993 definition and diagnosed presumptively")),
# year diagnosed
yeardiagnosed = yeardiagnosed,
# cases (our weight variable: paired to mortality, accounting for aggregated data structure)
cases = cases,
# new variable accounting for two distinct periods (1981-1986 vs. 1995-2000)
year_period = factor(case_when(
yeardiagnosed %in% c("Before 1982", "1982", "1983", "1984", "1985", "1986", "1987", "1988") ~ "1981-1988",
yeardiagnosed %in% c("1995", "1996", "1997", "1998", "1999", "2000", "2001", "2002") ~ "1995-2002",
TRUE ~ NA
), levels = c("1981-1988", "1995-2002"))
)|>
filter(
!is.na(yeardiagnosed), !is.na(sexandorientation), !is.na(hivexposure),
!is.na(case_criteria), !is.na(mortality), !is.na(year_period), !is.na(cases)
)
# set seed for reproducibility
set.seed(2124)
cdc_analytic <- cdc_analytic |>
dplyr::select(yeardiagnosed, sexandorientation, hivexposure,
case_criteria, mortality, year_period, cases)
# Save re-recoded dataset
saveRDS(cdc_analytic, here::here("/Users/nataliasmall/Downloads/EPI 553/cdc_analytic.rds"))##Confirmatory Calculations
#cdcwonder data is pre-aggregated, so data points are not representative of only 1 person, could be 60, 40, 200, etc.
#total obs. in dataset is 685, but there are more individuals accounted for than that based on identified cases. weight is very important here.
#total number of counts of cases: 11022
sum(cdc_analytic$cases)## [1] 11022
#total number of counts of cases reported as alive: 5306
sum(cdc_analytic$cases[cdc_analytic$mortality == "Alive"], na.rm = TRUE)## [1] 5306
#total number of counts of cases reported as dead: 5716
sum(cdc_analytic$cases[cdc_analytic$mortality == "Dead"], na.rm = TRUE)## [1] 5716
# run table(cdc_new$sexandorientation, useNA = "always") if final observation do not match expectations: will reveal spelling of categories
cdc_analytic %>%
select(yeardiagnosed, sexandorientation, hivexposure, case_criteria, mortality, year_period, cases) %>%
summary() %>%
kable(caption = "Descriptive Statistics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| yeardiagnosed | sexandorientation | hivexposure | case_criteria | mortality | year_period | cases | |
|---|---|---|---|---|---|---|---|
| 1995 :105 | Female (any) :167 | Male homosexual/bisexual contact :202 | Pre-1985 surveillance definition :205 | Alive:388 | 1981-1988:182 | Min. : 1.00 | |
| 1996 : 77 | Male (bisexual) :140 | Heterosexual contact with HIV :115 | 1985 surveillance definition : 69 | Dead :297 | 1995-2002:503 | 1st Qu.: 1.00 | |
| 1997 : 76 | Male (heterosexual or pediatric) :189 | IV drug use (female and hetero male) :180 | 1987 definition and diagnosed definitively :110 | NA | NA | Median : 3.00 | |
| 1998 : 74 | Male (homosexual) or Unknown Classification:189 | Male homo/bisexual and IV drug use :149 | 1987 definition and diagnosed presumptively : 98 | NA | NA | Mean : 16.09 | |
| 1999 : 59 | NA | Receipt of blood, blood components or tissue: 39 | 1993 definition and diagnosed definitively : 65 | NA | NA | 3rd Qu.: 10.00 | |
| 1988 : 53 | NA | NA | 1993 definition and diagnosed immunologically:122 | NA | NA | Max. :976.00 | |
| (Other):241 | NA | NA | 1993 definition and diagnosed presumptively : 16 | NA | NA | NA |
library(gtsummary)
# library that handles math for weighting
library(survey)
# Create the weighted design object
# ids = ~1: independent sampling
# weights = ~cases:use the cases variable to "inflate" each row to its representative value in the population
cdc_weighted <- svydesign(ids = ~1, weights = ~cases, data = cdc_analytic)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
#tbl_svysummary: The "weighted version" of tbl_summary
cdc_weighted %>%
tbl_svysummary(
by = hivexposure,
include = c(yeardiagnosed, year_period, sexandorientation, hivexposure, case_criteria, mortality),
label = list(
yeardiagnosed ~ "Year Diagnosed",
year_period ~ "Year Period",
sexandorientation ~ "Sex and Sexual Orientation",
case_criteria ~ "Case Criteria",
mortality ~ "Mortality"
),
statistic = list(
all_categorical() ~ "{n} ({p}%)"
)
) %>%
bold_labels() %>%
modify_caption("**Table 1. Survey-Weighted Descriptive Statistics (Total N = 11,022)**") %>%
as_gt()| Characteristic | Male homosexual/bisexual contact N = 6,9211 |
Heterosexual contact with HIV N = 8991 |
IV drug use (female and hetero male) N = 2,3451 |
Male homo/bisexual and IV drug use N = 7941 |
Receipt of blood, blood components or tissue N = 631 |
|---|---|---|---|---|---|
| Year Diagnosed | |||||
| Before 1982 | 12 (0.2%) | 1 (0.1%) | 1 (<0.1%) | 2 (0.3%) | 0 (0%) |
| 1982 | 9 (0.1%) | 0 (0%) | 8 (0.3%) | 2 (0.3%) | 0 (0%) |
| 1983 | 15 (0.2%) | 0 (0%) | 3 (0.1%) | 3 (0.4%) | 1 (1.6%) |
| 1984 | 35 (0.5%) | 1 (0.1%) | 9 (0.4%) | 5 (0.6%) | 2 (3.2%) |
| 1985 | 64 (0.9%) | 0 (0%) | 22 (0.9%) | 13 (1.6%) | 0 (0%) |
| 1986 | 255 (3.7%) | 7 (0.8%) | 81 (3.5%) | 36 (4.5%) | 1 (1.6%) |
| 1987 | 1,006 (15%) | 66 (7.3%) | 520 (22%) | 126 (16%) | 9 (14%) |
| 1988 | 1,361 (20%) | 104 (12%) | 795 (34%) | 135 (17%) | 13 (21%) |
| 1995 | 1,089 (16%) | 147 (16%) | 246 (10%) | 125 (16%) | 9 (14%) |
| 1996 | 973 (14%) | 107 (12%) | 168 (7.2%) | 88 (11%) | 6 (9.5%) |
| 1997 | 500 (7.2%) | 108 (12%) | 123 (5.2%) | 63 (7.9%) | 5 (7.9%) |
| 1998 | 338 (4.9%) | 78 (8.7%) | 74 (3.2%) | 59 (7.4%) | 3 (4.8%) |
| 1999 | 390 (5.6%) | 74 (8.2%) | 86 (3.7%) | 44 (5.5%) | 6 (9.5%) |
| 2000 | 374 (5.4%) | 60 (6.7%) | 77 (3.3%) | 45 (5.7%) | 0 (0%) |
| 2001 | 299 (4.3%) | 92 (10%) | 84 (3.6%) | 32 (4.0%) | 6 (9.5%) |
| 2002 | 201 (2.9%) | 54 (6.0%) | 48 (2.0%) | 16 (2.0%) | 2 (3.2%) |
| Year Period | |||||
| 1981-1988 | 2,757 (40%) | 179 (20%) | 1,439 (61%) | 322 (41%) | 26 (41%) |
| 1995-2002 | 4,164 (60%) | 720 (80%) | 906 (39%) | 472 (59%) | 37 (59%) |
| Sex and Sexual Orientation | |||||
| Female (any) | 0 (0%) | 670 (75%) | 677 (29%) | 0 (0%) | 32 (51%) |
| Male (bisexual) | 820 (12%) | 0 (0%) | 0 (0%) | 213 (27%) | 0 (0%) |
| Male (heterosexual or pediatric) | 9 (0.1%) | 229 (25%) | 1,668 (71%) | 21 (2.6%) | 31 (49%) |
| Male (homosexual) or Unknown Classification | 6,092 (88%) | 0 (0%) | 0 (0%) | 560 (71%) | 0 (0%) |
| Case Criteria | |||||
| Pre-1985 surveillance definition | 3,603 (52%) | 282 (31%) | 1,225 (52%) | 418 (53%) | 33 (52%) |
| 1985 surveillance definition | 181 (2.6%) | 10 (1.1%) | 48 (2.0%) | 14 (1.8%) | 1 (1.6%) |
| 1987 definition and diagnosed definitively | 374 (5.4%) | 50 (5.6%) | 288 (12%) | 63 (7.9%) | 2 (3.2%) |
| 1987 definition and diagnosed presumptively | 334 (4.8%) | 52 (5.8%) | 239 (10%) | 48 (6.0%) | 2 (3.2%) |
| 1993 definition and diagnosed definitively | 53 (0.8%) | 20 (2.2%) | 44 (1.9%) | 18 (2.3%) | 2 (3.2%) |
| 1993 definition and diagnosed immunologically | 2,371 (34%) | 485 (54%) | 489 (21%) | 230 (29%) | 22 (35%) |
| 1993 definition and diagnosed presumptively | 5 (<0.1%) | 0 (0%) | 12 (0.5%) | 3 (0.4%) | 1 (1.6%) |
| Mortality | |||||
| Alive | 3,518 (51%) | 632 (70%) | 741 (32%) | 383 (48%) | 32 (51%) |
| Dead | 3,403 (49%) | 267 (30%) | 1,604 (68%) | 411 (52%) | 31 (49%) |
| 1 n (%) | |||||
p_bar <- ggplot(cdc_analytic, aes(x = mortality, weight = cases)) +
geom_bar(fill = "darkblue", color = "white") +
scale_y_continuous(labels = scales::comma) + # Gemini suggested: Makes numbers like 5,000 readable
labs(
title = "Figure 1: Distribution of Mortality",
subtitle = "Weighted by case counts (Total N = 11,022)",
x = "Mortality",
y = "Total Number of People"
) +
theme_minimal(base_size = 13)
ggplotly(p_bar)##Logistic Regression
# Compare Models
# Unadjusted - Model A: mortality ~ hivexposure
m_A <- svyglm(mortality ~ hivexposure,
design = cdc_weighted,
family = quasibinomial())
# Adjusted - Model B: mortality ~ hivexposure + year_period
m_B <- svyglm(mortality ~ hivexposure + year_period,
design = cdc_weighted,
family = quasibinomial())
# Adjusted - Model C: mortality ~ hivexposure + yeardiagnosed
m_C <- svyglm(mortality ~ hivexposure + yeardiagnosed,
design = cdc_weighted,
family = quasibinomial())
# Adjusted - Model D: Full model
m_D <- svyglm(mortality ~ hivexposure + year_period + case_criteria + sexandorientation,
design = cdc_weighted,
family = quasibinomial())# formatted model table
make_tbl <- function(model) {
tbl_regression(
model,
exponentiate = TRUE,
conf.int = TRUE
) %>%
add_significance_stars(
hide_ci = TRUE,
hide_se = FALSE,
pattern = "{estimate}{stars}"
) %>%
remove_row_type(type = "reference") %>%
bold_labels() %>%
italicize_levels() %>%
modify_header(estimate ~ "**OR**", std.error ~ "**SE**", conf.low ~ "**95% CI**", p.value ~ "**p-value**")
}
t_A <- make_tbl(m_A)
t_B <- make_tbl(m_B)
t_C <- make_tbl(m_C)
t_D <- make_tbl(m_D)
# Remove orphaned CI footnotes
for (tbl_obj in list(t_A, t_B, t_C)) {
tbl_obj$table_styling$abbreviation <-
tbl_obj$table_styling$abbreviation %>%
dplyr::filter(column != "conf.low")
}
tbl_merge(
tbls = list(t_A, t_B, t_C, t_D),
tab_spanner = c(
"**Model 1 Unadjusted: mortality ~ hivexposure**",
"**Model 2 Adjusted: mortality ~ hivexposure + year_period**",
"**Model 3 Adjusted: mortality ~ hivexposure + yeardiagnosed**",
"**Model 4 Adjusted: Full model**"
)
) %>%
bold_labels() %>%
modify_caption("**Table 2. Logistic Regression: Predictors of Mortality**") %>%
as_gt() %>%
tab_footnote(
footnote = "OR = odds ratio; SE = standard error. Survey-weighted quasibinomial logistic regression. ***p < 0.001; **p < 0.01; *p < 0.05.",
locations = cells_title()
)| Characteristic |
Model 1 Unadjusted: mortality ~ hivexposure
|
Model 2 Adjusted: mortality ~ hivexposure + year_period
|
Model 3 Adjusted: mortality ~ hivexposure + yeardiagnosed
|
Model 4 Adjusted: Full model
|
||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| OR1 | SE | 95% CI | p-value | OR1 | SE | 95% CI | p-value | OR1 | SE | 95% CI | p-value | OR1 | SE | 95% CI | p-value | |
| hivexposure | ||||||||||||||||
| Heterosexual contact with HIV | 0.44 | 0.584 | 0.14, 1.37 | 0.2 | 0.69 | 0.466 | 0.27, 1.71 | 0.4 | 0.80 | 0.511 | 0.29, 2.18 | 0.7 | 0.33 | 0.731 | 0.08, 1.40 | 0.13 |
| IV drug use (female and hetero male) | 2.24 | 0.565 | 0.74, 6.78 | 0.2 | 1.40 | 0.425 | 0.61, 3.23 | 0.4 | 1.50 | 0.469 | 0.60, 3.77 | 0.4 | 0.66 | 0.694 | 0.17, 2.57 | 0.5 |
| Male homo/bisexual and IV drug use | 1.11 | 0.550 | 0.38, 3.27 | 0.9 | 1.18 | 0.448 | 0.49, 2.85 | 0.7 | 1.20 | 0.484 | 0.47, 3.12 | 0.7 | 1.12 | 0.408 | 0.50, 2.50 | 0.8 |
| Receipt of blood, blood components or tissue | 1.00 | 0.600 | 0.31, 3.26 | >0.9 | 0.92 | 0.549 | 0.31, 2.69 | 0.9 | 0.96 | 0.568 | 0.32, 2.94 | >0.9 | 0.44 | 0.774 | 0.10, 2.00 | 0.3 |
| year_period | ||||||||||||||||
| 1995-2002 | 0.02*** | 0.512 | 0.01, 0.05 | <0.001 | 0.03*** | 0.555 | 0.01, 0.09 | <0.001 | ||||||||
| yeardiagnosed | ||||||||||||||||
| 1982 | 2.22 | 1.56 | 0.10, 47.3 | 0.6 | ||||||||||||
| 1983 | 2.88 | 1.50 | 0.15, 54.9 | 0.5 | ||||||||||||
| 1984 | 3.42 | 1.51 | 0.18, 65.8 | 0.4 | ||||||||||||
| 1985 | 1.33 | 1.24 | 0.12, 15.0 | 0.8 | ||||||||||||
| 1986 | 2.88 | 1.27 | 0.24, 34.6 | 0.4 | ||||||||||||
| 1987 | 2.12 | 1.20 | 0.20, 22.4 | 0.5 | ||||||||||||
| 1988 | 1.87 | 1.21 | 0.17, 20.2 | 0.6 | ||||||||||||
| 1995 | 0.09* | 1.13 | 0.01, 0.87 | 0.037 | ||||||||||||
| 1996 | 0.04** | 1.15 | 0.00, 0.41 | 0.006 | ||||||||||||
| 1997 | 0.03** | 1.09 | 0.00, 0.27 | 0.002 | ||||||||||||
| 1998 | 0.03** | 1.09 | 0.00, 0.23 | 0.001 | ||||||||||||
| 1999 | 0.02*** | 1.11 | 0.00, 0.18 | <0.001 | ||||||||||||
| 2000 | 0.00*** | 1.05 | 0.00, 0.00 | <0.001 | ||||||||||||
| 2001 | 0.00*** | 1.05 | 0.00, 0.00 | <0.001 | ||||||||||||
| 2002 | 0.00*** | 1.07 | 0.00, 0.00 | <0.001 | ||||||||||||
| case_criteria | ||||||||||||||||
| 1985 surveillance definition | 1.13 | 0.588 | 0.35, 3.57 | 0.8 | ||||||||||||
| 1987 definition and diagnosed definitively | 1.11 | 0.475 | 0.44, 2.82 | 0.8 | ||||||||||||
| 1987 definition and diagnosed presumptively | 0.94 | 0.485 | 0.36, 2.43 | 0.9 | ||||||||||||
| 1993 definition and diagnosed definitively | 0.81 | 0.526 | 0.29, 2.26 | 0.7 | ||||||||||||
| 1993 definition and diagnosed immunologically | 0.30* | 0.591 | 0.09, 0.96 | 0.043 | ||||||||||||
| 1993 definition and diagnosed presumptively | 0.34 | 0.800 | 0.07, 1.63 | 0.2 | ||||||||||||
| sexandorientation | ||||||||||||||||
| Male (bisexual) | 0.37 | 0.759 | 0.08, 1.65 | 0.2 | ||||||||||||
| Male (heterosexual or pediatric) | 0.85 | 0.404 | 0.39, 1.89 | 0.7 | ||||||||||||
| Male (homosexual) or Unknown Classification | 0.43 | 0.749 | 0.10, 1.85 | 0.3 | ||||||||||||
| 1 *p<0.05; **p<0.01; ***p<0.001 | ||||||||||||||||
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio, SE = Standard Error | ||||||||||||||||
# Calibration Check: Predicted vs Observed by Decile
cdc_analytic |>
mutate(pred_prob = fitted(m_D),
obs_outcome = as.numeric(mortality == "Dead"),
decile = ntile(pred_prob, 10)) |>
group_by(decile) |>
summarise(
mean_pred = weighted.mean(pred_prob, w = cases),
mean_obs = weighted.mean(obs_outcome, w = cases),
.groups = "drop"
) |>
ggplot(aes(x = mean_pred, y = mean_obs)) +
geom_abline(slope = 1, intercept = 0, color = "grey", linetype = "dashed") +
geom_point(size = 3, color = "darkblue") +
geom_line(color = "darkblue") +
labs(title = "Calibration Plot: Observed vs. Predicted Probability of Mortality",
subtitle = "Points should fall on the dashed line for perfect calibration",
x = "Mean Predicted Probability (within decile)",
y = "Observed Proportion (within decile)") +
theme_minimal()# Calculate predicted probabilities for hivexposure
pred_prob <- ggpredict(m_D, terms = "hivexposure")
plot(pred_prob) +
labs(
title = "Figure 1: Predicted Probability of Mortality by HIV Exposure",
subtitle = "Adjusted for Year Period, Sex/Orientation, and Case Criteria",
x = "HIV Exposure Category",
y = "Predicted Probability of Death"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))# Forest Plot
tidy(m_D, exponentiate = TRUE, conf.int = TRUE) %>%
filter(term != "(Intercept)") %>%
ggplot(aes(x = estimate, y = reorder(term, estimate))) +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_point(size = 3, color = "darkblue") +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2, color = "darkblue") +
scale_x_log10() +
labs(title = "Figure 2: Adjusted Odds Ratios for HIV Mortality",
subtitle = "Reference line at OR = 1; log-scale x-axis",
x = "Adjusted Odds Ratio (95% CI)",
y = NULL) +
theme_minimal()End of Progress Check-In 2