# Load packages
library(dplyr)
library(tidyverse)
library(haven)
library(here)
library(knitr)
library(kableExtra)
library(broom)
library(plotly)
library (gtsummary)
library (plotly)
library(ggplot2)
library(stringr)
library(gt)
#this data was taken from the cdcwonder surveillance report for 1981-2002
# Load CDC Wonder 1981 - 2002 datasets
cdc03 <- read.csv("/Users/nataliasmall/Downloads/EPI 553/third.csv") %>%
janitor::clean_names()
# Display dataset dimensions
names(cdc03)## [1] "notes" "year_diagnosed"
## [3] "year_diagnosed_code" "sex_and_sexual_orientation"
## [5] "sex_and_sexual_orientation_code" "hiv_exposure_category"
## [7] "hiv_exposure_category_code" "case_definition"
## [9] "case_definition_code" "vital_status"
## [11] "vital_status_code" "cases"
#Recoding and cleaning
cdc3_clean <- cdc03 %>%
filter(notes != "Total") %>%
drop_na(`vital_status_code`)
# Select variables of interest
cdc3_full <- cdc3_clean %>%
select(year_diagnosed_code, sex_and_sexual_orientation, hiv_exposure_category, case_definition, vital_status_code, cases)
# Recode variables
cdc3_full <- cdc3_full %>%
mutate(
yeardiagnosed = factor(case_when(
year_diagnosed_code == 1981 ~ "Before 1982",
year_diagnosed_code == 1982 ~ "1982",
year_diagnosed_code == 1983 ~ "1983",
year_diagnosed_code == 1984 ~ "1984",
year_diagnosed_code == 1985 ~ "1985",
year_diagnosed_code == 1986 ~ "1986",
year_diagnosed_code == 1987 ~ "1987",
year_diagnosed_code == 1988 ~ "1988",
year_diagnosed_code == 1995 ~ "1995",
year_diagnosed_code == 1996 ~ "1996",
year_diagnosed_code == 1997 ~ "1997",
year_diagnosed_code == 1998 ~ "1998",
year_diagnosed_code == 1999 ~ "1999",
year_diagnosed_code == 2000 ~ "2000",
year_diagnosed_code == 2001 ~ "2001",
year_diagnosed_code == 2002 ~ "2002"), levels = c("Before 1982", "1982", "1983", "1984", "1985",
"1986", "1987", "1988", "1995", "1996", "1997",
"1998", "1999", "2000", "2001", "2002")),
sexandorientation = sex_and_sexual_orientation,
hivexposure = hiv_exposure_category,
CaseCriteria = case_definition,
vitalstatus = factor(ifelse(vital_status_code == 0, "Alive", "Dead")),
cases = cases
)
# Select recoded variables, apply filters, drop missing
set.seed(553)
cdc3_full <- cdc3_full %>%
select(yeardiagnosed, sexandorientation, hivexposure, CaseCriteria, vitalstatus, cases) %>%
filter(cases > 0) %>%
drop_na()
#cdcwonder data is pre-aggregated, so data points are not representative of only 1 person, could be 60, 40, 200, etc.
#total obs. in dataset is 685, but there are more individuals accounted for than that based on identified cases. weight is very important here.
#total number of counts of cases: 11022
sum(cdc3_full$cases)## [1] 11022
#total number of counts of cases reported as alive: 5306
sum(cdc3_full$cases[cdc3_full$vitalstatus == "Alive"], na.rm = TRUE)## [1] 5306
#total number of counts of cases reported as dead: 5716
sum(cdc3_full$cases[cdc3_full$vitalstatus == "Dead"], na.rm = TRUE)## [1] 5716
cdc3_full %>%
select(yeardiagnosed, vitalstatus, cases) %>%
summary() %>%
kable(caption = "Descriptive Statistics: Key Continuous Variables") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| yeardiagnosed | vitalstatus | cases | |
|---|---|---|---|
| 1995 :105 | Alive:388 | Min. : 1.00 | |
| 1996 : 77 | Dead :297 | 1st Qu.: 1.00 | |
| 1997 : 76 | NA | Median : 3.00 | |
| 1998 : 74 | NA | Mean : 16.09 | |
| 1999 : 59 | NA | 3rd Qu.: 10.00 | |
| 1988 : 53 | NA | Max. :976.00 | |
| (Other):241 | NA | NA |
library(gtsummary)
# library that handles math for weighting
library(survey)
# Create the weighted design object
# ids = ~1: independent sampling
# weights = ~cases:use the cases variable to "inflate" each row to its representative value in the population
cdc_weighted <- svydesign(ids = ~1, weights = ~cases, data = cdc3_full)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
#tbl_svysummary: The "weighted version" of tbl_summary
ft0 <- cdc_weighted %>%
tbl_svysummary(
by = hivexposure,
include = c(yeardiagnosed, sexandorientation, CaseCriteria, vitalstatus),
label = list(
yeardiagnosed ~ "Year Diagnosed",
sexandorientation ~ "Sex and Sexual Orientation",
CaseCriteria ~ "Case Criteria",
vitalstatus ~ "Vital Status"
),
statistic = list(
all_categorical() ~ "{n} ({p}%)"
)
) %>%
bold_labels() %>%
modify_caption("**Table 1: Survey-Weighted Descriptives Stratified by HIV Exposure (N = 11,022)**") %>%
modify_footnote(everything() ~ "Total analytic sample size N=11,022. Percentages reflect survey-weighted distributions. Missing values handled via complete case analysis.")%>%
as_gt() %>%
gtsave("/Users/nataliasmall/Downloads/EPI 553/outputs/Table1_WeightDesc.png")
print (ft0)## [1] "/Users/nataliasmall/Downloads/EPI 553/outputs/Table1_WeightDesc.png"
p_bar <- ggplot(cdc3_full, aes(x = vitalstatus, weight = cases)) +
geom_bar(fill = "darkblue", color = "white") +
scale_y_continuous(labels = scales::comma) + #Note: this makes large numbers readable
labs(
title = "Figure 1: Actual Distribution of Mortality (Vital Status)",
subtitle = "Weighted by case counts (Total N = 11,022)",
x = "Vital Status",
y = "Total Number of People"
) +
theme_minimal(base_size = 13)
ggplotly(p_bar)ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure1_barchart.png",
plot = p_bar, width = 8, height = 6, dpi = 300)# Proportional Stacked Bar Chart: percentage of "Alive" vs "Dead" within each exposure group, adjusted by cases weight
# x = reorder(hivexposure, cases): reorder categories by the size of the case counts
# fill = vitalstatus: color the bars by "Alive" vs "Dead"
# weight = cases: count 11,022 people, not 685 rows
p_bivariate <- ggplot(cdc3_full,
aes(x = reorder(hivexposure, cases),
fill = vitalstatus,
weight = cases)) +
geom_bar(position = "fill") +
# Flip the coordinates for readability
# Rotates the plot so long category names are easy to read
coord_flip() +
# Format the y-axis (now the horizontal axis)
# Convert 0.25, 0.50, etc. into 25%, 50%
scale_y_continuous(labels = scales::percent_format()) +
# Colors for the binary outcome
scale_fill_manual(values = c("Alive" = "darkblue",
"Dead" = "grey")) +
labs(
title = "Figure 2: Bivariate Relationship - Vital Status and HIV Exposure Group",
x = "HIV Exposure Category",
y = "Percentage",
fill = "Status"
) +
theme_minimal()
ggplotly(p_bivariate) ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure2_bivariate.png",
plot = p_bivariate, width = 8, height = 6, dpi = 300)# Percentage of 'Dead' for each combination
heatmap_data <- cdc3_full %>%
group_by(hivexposure, yeardiagnosed) %>%
summarise(
total_cases = sum(cases),
percent_dead = sum(cases[vitalstatus == "Dead"]) / sum(cases) * 100,
.groups = 'drop'
)
# Create the Heatmap
p_heatmap <- ggplot(heatmap_data, aes(x = yeardiagnosed, y = hivexposure, fill = percent_dead)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "lightblue", high = "darkblue", mid = "blue",
midpoint = 50, limit = c(0,100), name = "% Dead") +
labs(
title = "Figure 3.Exploratory Analysis: Mortality Heatmap by Exposure and Year",
subtitle = "dark blue indicates higher mortality (N = 11,022)",
x = "Year of Diagnosis",
y = "HIV Exposure Category"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# 3. Interactive Plot
ggplotly(p_heatmap)#Future regression
#simple logistic regression
model_logistic_simple <- glm(vitalstatus ~ hivexposure,
data = cdc3_full,
family = binomial(link = "logit"),
weights = cases) # This row is actually the amount of people, since data is agreggated
summary(model_logistic_simple)##
## Call:
## glm(formula = vitalstatus ~ hivexposure, family = binomial(link = "logit"),
## data = cdc3_full, weights = cases)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -0.86164 0.07299
## hivexposureIV drug use (female and hetero male) 1.63390 0.08544
## hivexposureMale homo/bisexual and IV drug use 0.93220 0.10184
## hivexposureMale homosexual/bisexual contact 0.82841 0.07685
## hivexposureReceipt of blood, blood components or tissue 0.82989 0.26236
## z value Pr(>|z|)
## (Intercept) -11.805 < 2e-16 ***
## hivexposureIV drug use (female and hetero male) 19.123 < 2e-16 ***
## hivexposureMale homo/bisexual and IV drug use 9.154 < 2e-16 ***
## hivexposureMale homosexual/bisexual contact 10.780 < 2e-16 ***
## hivexposureReceipt of blood, blood components or tissue 3.163 0.00156 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15264 on 684 degrees of freedom
## Residual deviance: 14799 on 680 degrees of freedom
## AIC: 14809
##
## Number of Fisher Scoring iterations: 4
# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: vitalstatus ~ hivexposure (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.422 | 0.073 | -11.805 | 0.000 | 0.366 | 0.487 |
| hivexposureIV drug use (female and hetero male) | 5.124 | 0.085 | 19.123 | 0.000 | 4.338 | 6.065 |
| hivexposureMale homo/bisexual and IV drug use | 2.540 | 0.102 | 9.154 | 0.000 | 2.082 | 3.104 |
| hivexposureMale homosexual/bisexual contact | 2.290 | 0.077 | 10.780 | 0.000 | 1.972 | 2.665 |
| hivexposureReceipt of blood, blood components or tissue | 2.293 | 0.262 | 3.163 | 0.002 | 1.367 | 3.842 |
#Future regression relationship model
exposure_coefs <- tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "hivexposure")) %>%
mutate(
# Cleans the prefix "hivexposure" off the category names
exposure_level = str_remove(term, "hivexposure")
)
ref_row <- data.frame(
term = "Reference",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
exposure_level = "Hemophilia/coagulation disorder (Ref)"
)
exposure_plot_data <- bind_rows(ref_row, exposure_coefs)
p3 <- ggplot(exposure_plot_data, aes(x = reorder(exposure_level, estimate), y = estimate)) +
# Line at 1.0: Estimates above this mean higher risk of "Dead" vs "Alive"
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Figure 4: Association Between HIV Exposure and Mortality",
subtitle = "Weighted Odds Ratios (N = 11,022 total cases)",
x = "HIV Exposure Category",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 11)
ggplotly(p3)ggsave(filename = "/Users/nataliasmall/Downloads/EPI 553/figures/Figure4_pred_assoc.png",
plot = p3, width = 8, height = 6, dpi = 300)#Future regression models
#compare models
# Model A: vitalstatus ~ hivexposure
m_A <- svyglm(vitalstatus ~ hivexposure,
design = cdc_weighted,
family = quasibinomial()
)
# Model B: vitalstatus ~ hivexposure + casecriteria
m_B <- svyglm(vitalstatus ~ hivexposure + CaseCriteria,
design = cdc_weighted,
family = quasibinomial()
)
# Model C: Full model
m_C <- svyglm(vitalstatus ~ hivexposure + CaseCriteria + yeardiagnosed + sexandorientation,
design = cdc_weighted,
family = quasibinomial()
)#Future regression modeling
# formatted model table
make_tbl <- function(model) {
tbl_regression(
model,
exponentiate = TRUE,
conf.int = FALSE
) %>%
add_significance_stars(
hide_ci = TRUE,
hide_se = FALSE,
pattern = "{estimate}{stars}"
) %>%
remove_row_type(type = "reference") %>%
bold_labels() %>%
italicize_levels() %>%
modify_header(estimate ~ "**OR**", std.error ~ "**SE**")
}
t_A <- make_tbl(m_A)
t_B <- make_tbl(m_B)
t_C <- make_tbl(m_C)
# Remove orphaned CI footnotes
for (tbl_obj in list(t_A, t_B, t_C)) {
tbl_obj$table_styling$abbreviation <-
tbl_obj$table_styling$abbreviation %>%
dplyr::filter(column != "conf.low")
}
ft1 <- tbl_merge(
tbls = list(t_A, t_B, t_C),
tab_spanner = c(
"**Model A: vitalstatus ~ hivexposure**",
"**Model B: vitalstatus ~ hivexposure + casecriteria**",
"**Model C: Full model**"
)
) %>%
bold_labels() %>%
modify_caption("**Table 2. Logistic Regression: Predictors of Mortality (Vital Status)**") %>%
as_gt() %>%
tab_footnote(
footnote = "OR = odds ratio; SE = standard error. Survey-weighted quasibinomial logistic regression. ***p < 0.001; **p < 0.01; *p < 0.05.",
locations = cells_title()
) %>%
gtsave("/Users/nataliasmall/Downloads/EPI 553/outputs/Table2_logRegression.png")
print (ft1)## [1] "/Users/nataliasmall/Downloads/EPI 553/outputs/Table2_logRegression.png"