Setup and Data

# Load packages
library(dplyr)
library(tidyverse)
library(haven)
library(here)
library(knitr)
library(kableExtra)
library(broom)
library(plotly)
library (gtsummary)
library (plotly)
library(ggplot2)
library(stringr)
library(gt)

#this data was taken from the cdcwonder surveillance report for 1981-2002
# Load CDC Wonder 1981 - 2002 datasets
cdc03 <- read.csv("/Users/nataliasmall/Downloads/EPI 553/third.csv") %>%
  janitor::clean_names()

# Display dataset dimensions
names(cdc03)
##  [1] "notes"                           "year_diagnosed"                 
##  [3] "year_diagnosed_code"             "sex_and_sexual_orientation"     
##  [5] "sex_and_sexual_orientation_code" "hiv_exposure_category"          
##  [7] "hiv_exposure_category_code"      "case_definition"                
##  [9] "case_definition_code"            "vital_status"                   
## [11] "vital_status_code"               "cases"
#Recoding and cleaning
cdc3_clean <- cdc03 %>%
  filter(notes != "Total") %>%
  drop_na(`vital_status_code`)

# Select variables of interest
cdc3_full <- cdc3_clean %>%
  select(year_diagnosed_code, sex_and_sexual_orientation, hiv_exposure_category, case_definition, vital_status_code, cases)

# Recode variables
cdc3_full <- cdc3_full %>%
  mutate(
    yeardiagnosed     = factor(case_when(
      year_diagnosed_code == 1981 ~ "Before 1982",
      year_diagnosed_code == 1982 ~ "1982",
      year_diagnosed_code == 1983 ~ "1983",
      year_diagnosed_code == 1984 ~ "1984",
      year_diagnosed_code == 1985 ~ "1985",
      year_diagnosed_code == 1986 ~ "1986",
      year_diagnosed_code == 1987 ~ "1987",
      year_diagnosed_code == 1988 ~ "1988",
      year_diagnosed_code == 1995 ~ "1995",
      year_diagnosed_code == 1996 ~ "1996",
      year_diagnosed_code == 1997 ~ "1997",
      year_diagnosed_code == 1998 ~ "1998",
      year_diagnosed_code == 1999 ~ "1999",
      year_diagnosed_code == 2000 ~ "2000",
      year_diagnosed_code == 2001 ~ "2001",
      year_diagnosed_code == 2002 ~ "2002"), levels = c("Before 1982", "1982", "1983", "1984", "1985", 
                                                        "1986", "1987", "1988", "1995", "1996", "1997", 
                                                        "1998", "1999", "2000", "2001", "2002")),
    sexandorientation = sex_and_sexual_orientation,
    hivexposure       = hiv_exposure_category,
    CaseCriteria      = case_definition,
    vitalstatus       = factor(ifelse(vital_status_code == 0, "Alive", "Dead")),
    cases             = cases
  )

# Select recoded variables, apply filters, drop missing
set.seed(553)
cdc3_full <- cdc3_full %>%
  select(yeardiagnosed, sexandorientation, hivexposure, CaseCriteria, vitalstatus, cases) %>%
  filter(cases > 0) %>%
  drop_na()

#cdcwonder data is pre-aggregated, so data points are not representative of only 1 person, could be 60, 40, 200, etc.
#total obs. in dataset is 685, but there are more individuals accounted for than that based on identified cases. weight is very important here. 
#total number of counts of cases: 11022
sum(cdc3_full$cases)
## [1] 11022
#total number of counts of cases reported as alive: 5306
sum(cdc3_full$cases[cdc3_full$vitalstatus == "Alive"], na.rm = TRUE)
## [1] 5306
#total number of counts of cases reported as dead: 5716
sum(cdc3_full$cases[cdc3_full$vitalstatus == "Dead"], na.rm = TRUE)
## [1] 5716
# Save analytic datasets
saveRDS(cdc3_full, here::here(
  "/Users/nataliasmall/Downloads/EPI 553/cdcwonderdata3.rds"
))

Descriptive Statistics

cdc3_full %>%
  select(yeardiagnosed, vitalstatus, cases) %>%
  summary() %>%
  kable(caption = "Descriptive Statistics: Key Continuous Variables") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Descriptive Statistics: Key Continuous Variables
yeardiagnosed vitalstatus cases
1995 :105 Alive:388 Min. : 1.00
1996 : 77 Dead :297 1st Qu.: 1.00
1997 : 76 NA Median : 3.00
1998 : 74 NA Mean : 16.09
1999 : 59 NA 3rd Qu.: 10.00
1988 : 53 NA Max. :976.00
(Other):241 NA NA
library(gtsummary)
# library that handles math for weighting
library(survey)

# Create the weighted design object
# ids = ~1: independent sampling
# weights = ~cases:use the cases variable to "inflate" each row to its representative value in the population
cdc_weighted <- svydesign(ids = ~1, weights = ~cases, data = cdc3_full)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

#tbl_svysummary: The "weighted version" of tbl_summary
ft0 <- cdc_weighted %>%
  tbl_svysummary(
    by = hivexposure,
    include = c(yeardiagnosed, sexandorientation, CaseCriteria, vitalstatus),
    label = list(
      yeardiagnosed ~ "Year Diagnosed",
      sexandorientation ~ "Sex and Sexual Orientation",
      CaseCriteria ~ "Case Criteria",
      vitalstatus ~ "Vital Status"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  bold_labels() %>%
  modify_caption("**Table 1: Survey-Weighted Descriptives Stratified by HIV Exposure (N = 11,022)**") %>%
  modify_footnote(everything() ~ "Total analytic sample size N=11,022. Percentages reflect survey-weighted distributions. Missing values handled via complete case analysis.")%>%
  as_gt() %>% 
  gtsave("/Users/nataliasmall/Downloads/EPI 553/outputs/Table1_WeightDesc.png")

print (ft0)
## [1] "/Users/nataliasmall/Downloads/EPI 553/outputs/Table1_WeightDesc.png"
p_bar <- ggplot(cdc3_full, aes(x = vitalstatus, weight = cases)) +
  geom_bar(fill = "darkblue", color = "white") +
  scale_y_continuous(labels = scales::comma) + #Note: this makes large numbers readable
  labs(
    title = "Figure 1: Actual Distribution of Mortality (Vital Status)",
    subtitle = "Weighted by case counts (Total N = 11,022)",
    x = "Vital Status",
    y = "Total Number of People"
  ) +
  theme_minimal(base_size = 13)
  
ggplotly(p_bar)
ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure1_barchart.png", 
       plot = p_bar, width = 8, height = 6, dpi = 300)
# Proportional Stacked Bar Chart: percentage of "Alive" vs "Dead" within each exposure group, adjusted by cases weight

# x = reorder(hivexposure, cases): reorder categories by the size of the case counts
# fill = vitalstatus: color the bars by "Alive" vs "Dead"
# weight = cases: count 11,022 people, not 685 rows
p_bivariate <- ggplot(cdc3_full, 
                      aes(x = reorder(hivexposure, cases), 
                          fill = vitalstatus,              
                          weight = cases)) + 
                          geom_bar(position = "fill") + 

# Flip the coordinates for readability
# Rotates the plot so long category names are easy to read
                          coord_flip() +                                            

# Format the y-axis (now the horizontal axis)
# Convert 0.25, 0.50, etc. into 25%, 50%
                          scale_y_continuous(labels = scales::percent_format()) +   

# Colors for the binary outcome
                          scale_fill_manual(values = c("Alive" = "darkblue",       
                                                       "Dead" = "grey")) +  
  labs(
    title = "Figure 2: Bivariate Relationship - Vital Status and HIV Exposure Group", 
    x = "HIV Exposure Category",                            
    y = "Percentage",                                       
    fill = "Status"                                         
  ) +
  theme_minimal()

ggplotly(p_bivariate)                       
ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure2_bivariate.png", 
       plot = p_bivariate, width = 8, height = 6, dpi = 300)
# Percentage of 'Dead' for each combination
heatmap_data <- cdc3_full %>%
  group_by(hivexposure, yeardiagnosed) %>%
  summarise(
    total_cases = sum(cases),
    percent_dead = sum(cases[vitalstatus == "Dead"]) / sum(cases) * 100,
    .groups = 'drop'
  )

# Create the Heatmap
p_heatmap <- ggplot(heatmap_data, aes(x = yeardiagnosed, y = hivexposure, fill = percent_dead)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "lightblue", high = "darkblue", mid = "blue", 
                       midpoint = 50, limit = c(0,100), name = "% Dead") +
  labs(
    title = "Figure 3.Exploratory Analysis: Mortality Heatmap by Exposure and Year",
    subtitle = "dark blue indicates higher mortality (N = 11,022)",
    x = "Year of Diagnosis",
    y = "HIV Exposure Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 3. Interactive Plot
ggplotly(p_heatmap)
ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure3_heatmap.png", 
       plot = p_heatmap, width = 8, height = 6, dpi = 300)

Future Logistic Regression and Modeling

#Future regression 
#simple logistic regression
model_logistic_simple <- glm(vitalstatus ~ hivexposure, 
             data = cdc3_full, 
             family = binomial(link = "logit"), 
             weights = cases) # This row is actually the amount of people, since data is agreggated 

summary(model_logistic_simple)
## 
## Call:
## glm(formula = vitalstatus ~ hivexposure, family = binomial(link = "logit"), 
##     data = cdc3_full, weights = cases)
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                             -0.86164    0.07299
## hivexposureIV drug use (female and hetero male)          1.63390    0.08544
## hivexposureMale homo/bisexual and IV drug use            0.93220    0.10184
## hivexposureMale homosexual/bisexual contact              0.82841    0.07685
## hivexposureReceipt of blood, blood components or tissue  0.82989    0.26236
##                                                         z value Pr(>|z|)    
## (Intercept)                                             -11.805  < 2e-16 ***
## hivexposureIV drug use (female and hetero male)          19.123  < 2e-16 ***
## hivexposureMale homo/bisexual and IV drug use             9.154  < 2e-16 ***
## hivexposureMale homosexual/bisexual contact              10.780  < 2e-16 ***
## hivexposureReceipt of blood, blood components or tissue   3.163  0.00156 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15264  on 684  degrees of freedom
## Residual deviance: 14799  on 680  degrees of freedom
## AIC: 14809
## 
## Number of Fisher Scoring iterations: 4
# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Simple Logistic Regression: vitalstatus ~ hivexposure (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Simple Logistic Regression: vitalstatus ~ hivexposure (Odds Ratios)
Term Odds Ratio Std. Error z-statistic p-value 95% CI Lower 95% CI Upper
(Intercept) 0.422 0.073 -11.805 0.000 0.366 0.487
hivexposureIV drug use (female and hetero male) 5.124 0.085 19.123 0.000 4.338 6.065
hivexposureMale homo/bisexual and IV drug use 2.540 0.102 9.154 0.000 2.082 3.104
hivexposureMale homosexual/bisexual contact 2.290 0.077 10.780 0.000 1.972 2.665
hivexposureReceipt of blood, blood components or tissue 2.293 0.262 3.163 0.002 1.367 3.842
#Future regression relationship model
exposure_coefs <- tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "hivexposure")) %>%
  mutate(
    # Cleans the prefix "hivexposure" off the category names
    exposure_level = str_remove(term, "hivexposure")
  )

ref_row <- data.frame(
  term = "Reference",
  estimate = 1.0,
  std.error = 0,
  statistic = NA,
  p.value = NA,
  conf.low = 1.0,
  conf.high = 1.0,
  exposure_level = "Hemophilia/coagulation disorder (Ref)"
)

exposure_plot_data <- bind_rows(ref_row, exposure_coefs)

p3 <- ggplot(exposure_plot_data, aes(x = reorder(exposure_level, estimate), y = estimate)) +
  # Line at 1.0: Estimates above this mean higher risk of "Dead" vs "Alive"
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
                  size = 0.8, color = "darkblue") +
  coord_flip() +
  labs(
    title = "Figure 4: Association Between HIV Exposure and Mortality",
    subtitle = "Weighted Odds Ratios (N = 11,022 total cases)",
    x = "HIV Exposure Category",
    y = "Odds Ratio (95% CI)"
  ) +
  theme_minimal(base_size = 11)

ggplotly(p3)
ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure4_pred_assoc.png", 
       plot = p3, width = 8, height = 6, dpi = 300)
#Future regression models
#compare models
# Model A: vitalstatus ~ hivexposure
m_A <- svyglm(vitalstatus ~ hivexposure,
  design = cdc_weighted,
  family  = quasibinomial()
)

# Model B: vitalstatus ~ hivexposure + casecriteria
m_B <- svyglm(vitalstatus ~ hivexposure + CaseCriteria,
  design = cdc_weighted,
  family  = quasibinomial()
)

# Model C: Full model
m_C <- svyglm(vitalstatus ~ hivexposure + CaseCriteria + yeardiagnosed + sexandorientation,
  design = cdc_weighted,
  family  = quasibinomial()
)
#Future regression modeling
# formatted model table
make_tbl <- function(model) {
  tbl_regression(
    model,
    exponentiate = TRUE,
    conf.int     = FALSE
  ) %>%
    add_significance_stars(
      hide_ci  = TRUE,
      hide_se  = FALSE,
      pattern  = "{estimate}{stars}"
    ) %>%
    remove_row_type(type = "reference") %>%
    bold_labels() %>%
    italicize_levels() %>%
    modify_header(estimate ~ "**OR**", std.error ~ "**SE**")
}

t_A <- make_tbl(m_A)
t_B <- make_tbl(m_B)
t_C <- make_tbl(m_C)

# Remove orphaned CI footnotes
for (tbl_obj in list(t_A, t_B, t_C)) {
  tbl_obj$table_styling$abbreviation <-
    tbl_obj$table_styling$abbreviation %>%
    dplyr::filter(column != "conf.low")
}

ft1 <- tbl_merge(
  tbls = list(t_A, t_B, t_C),
  tab_spanner = c(
    "**Model A: vitalstatus ~ hivexposure**",
    "**Model B: vitalstatus ~ hivexposure + casecriteria**",
    "**Model C: Full model**"
  )
) %>%
  bold_labels() %>%
  modify_caption("**Table 2. Logistic Regression: Predictors of Mortality (Vital Status)**") %>%
  as_gt() %>%
  tab_footnote(
    footnote = "OR = odds ratio; SE = standard error. Survey-weighted quasibinomial logistic regression. ***p < 0.001; **p < 0.01; *p < 0.05.",
    locations = cells_title()
  ) %>% 
  gtsave("/Users/nataliasmall/Downloads/EPI 553/outputs/Table2_logRegression.png")

print (ft1)
## [1] "/Users/nataliasmall/Downloads/EPI 553/outputs/Table2_logRegression.png"