This project simulates and analyzes a clinical dataset representing
antibody responses following COVID-19 vaccination. It includes
demographic, clinical, and immunological response features. Analysis
includes ANOVA, logistic regression, and insightful visualizations using
ggpubr, ggpmisc, and
gghighlight.
required_packages <- c("tidyverse", "ggpubr", "ggsignif", "ggpmisc", "gghighlight", "broom", "readr")
for (pkg in required_packages) {
if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
library(pkg, character.only = TRUE)
}
## Warning: package 'dplyr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'ggpubr' was built under R version 4.3.3
## Warning: package 'ggsignif' was built under R version 4.3.3
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
## Warning: package 'ggpmisc' was built under R version 4.3.3
## Loading required package: ggpp
## Warning: package 'ggpp' was built under R version 4.3.3
##
## Attaching package: 'ggpp'
##
## The following objects are masked from 'package:ggpubr':
##
## as_npc, as_npcx, as_npcy
##
## The following object is masked from 'package:ggplot2':
##
## annotate
## Warning: package 'gghighlight' was built under R version 4.3.3
set.seed(123)
n <- 500
patients <- tibble(
patient_id = 1:n,
age = round(rnorm(n, 45, 15)),
sex = sample(c("Male", "Female"), n, replace = TRUE),
bmi = round(rnorm(n, 25, 4), 1),
comorbidity = sample(c("None", "Hypertension", "Diabetes", "Both"),
n, replace = TRUE, prob = c(0.5, 0.2, 0.2, 0.1)),
dose = sample(c("Dose 1", "Dose 2", "Booster"), n, replace = TRUE)
)
timepoints <- c(0, 14, 28, 90)
sim_data <- expand.grid(patient_id = patients$patient_id, time = timepoints) %>%
left_join(patients, by = "patient_id") %>%
mutate(
base_titer = rnorm(nrow(.), 20, 5),
titer = base_titer +
case_when(
dose == "Dose 1" ~ time * 0.8,
dose == "Dose 2" ~ time * 1.2,
dose == "Booster" ~ time * 1.5
) +
ifelse(comorbidity != "None", -runif(nrow(.), 2, 5), 0) +
rnorm(nrow(.), 0, 10),
seropositive = ifelse(titer > 50, 1, 0)
)
write_csv(sim_data, "C:/Users/lukew/OneDrive/Desktop/Lukas/R_DataFiles/data/simulated_vaccine_data.csv")
day28 <- sim_data %>% filter(time == 28)
# ANOVA
anova_model <- aov(titer ~ dose, data = day28)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## dose 2 31249 15625 114.4 <2e-16 ***
## Residuals 497 67877 137
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(anova_model)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = titer ~ dose, data = day28)
##
## $dose
## diff lwr upr p adj
## Dose 1-Booster -19.363404 -22.372864 -16.353943 0
## Dose 2-Booster -9.037723 -11.991703 -6.083743 0
## Dose 2-Dose 1 10.325681 7.237859 13.413503 0
# Logistic Regression
logit_model <- glm(seropositive ~ age + sex + bmi + comorbidity + dose,
data = day28, family = binomial)
summary(logit_model)
##
## Call:
## glm(formula = seropositive ~ age + sex + bmi + comorbidity +
## dose, family = binomial, data = day28)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.713401 0.806492 3.364 0.000767 ***
## age -0.003731 0.007037 -0.530 0.595980
## sexMale 0.060879 0.206620 0.295 0.768269
## bmi -0.060654 0.025552 -2.374 0.017610 *
## comorbidityDiabetes -0.096433 0.395587 -0.244 0.807407
## comorbidityHypertension 0.256620 0.384350 0.668 0.504342
## comorbidityNone 0.776243 0.350019 2.218 0.026574 *
## doseDose 1 -2.638844 0.274974 -9.597 < 2e-16 ***
## doseDose 2 -1.414918 0.250863 -5.640 1.7e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 689.94 on 499 degrees of freedom
## Residual deviance: 563.72 on 491 degrees of freedom
## AIC: 581.72
##
## Number of Fisher Scoring iterations: 4
tidy(logit_model, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 9 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 15.1 0.806 3.36 7.67e- 4 3.16 75.0
## 2 age 0.996 0.00704 -0.530 5.96e- 1 0.983 1.01
## 3 sexMale 1.06 0.207 0.295 7.68e- 1 0.709 1.59
## 4 bmi 0.941 0.0256 -2.37 1.76e- 2 0.895 0.989
## 5 comorbidityDiabetes 0.908 0.396 -0.244 8.07e- 1 0.417 1.97
## 6 comorbidityHypertens… 1.29 0.384 0.668 5.04e- 1 0.608 2.76
## 7 comorbidityNone 2.17 0.350 2.22 2.66e- 2 1.10 4.34
## 8 doseDose 1 0.0714 0.275 -9.60 8.26e-22 0.0411 0.121
## 9 doseDose 2 0.243 0.251 -5.64 1.70e- 8 0.147 0.394
ggpubr and
ggsignifggboxplot(day28, x = "dose", y = "titer", color = "dose", palette = "jco", add = "jitter") +
stat_compare_means(method = "anova", label.y = 120) +
stat_compare_means(label = "p.signif", method = "t.test",
comparisons = list(c("Dose 1", "Dose 2"), c("Dose 2", "Booster"), c("Dose 1", "Booster")),
tip.length = 0.01) +
labs(title = "Antibody Titers by Vaccine Dose at Day 28",
y = "Antibody Titer", x = "Dose")
ggpmisc)mean_titers <- sim_data %>%
group_by(dose, time) %>%
summarise(mean_titer = mean(titer), .groups = "drop")
ggplot(mean_titers, aes(x = time, y = mean_titer, color = dose)) +
geom_line(linewidth = 1.2) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ poly(x, 2, raw = TRUE), se = FALSE) +
stat_poly_eq(
formula = y ~ poly(x, 2, raw = TRUE),
aes(label = paste(after_stat(eq.label), after_stat(adj.rr.label), sep = "~~~")),
parse = TRUE
) +
labs(
title = "Mean Antibody Response Over Time by Dose",
x = "Days Since Vaccination", y = "Mean Antibody Titer"
) +
theme_minimal()
gghighlight)top90 <- sim_data %>% filter(time == 90)
ggplot(top90, aes(x = age, y = titer, color = dose)) +
geom_point(size = 2, alpha = 0.6) +
gghighlight(
titer > quantile(titer, 0.9),
label_key = dose,
use_direct_label = TRUE
) +
labs(
title = "Top 10% Responders at Day 90",
x = "Age", y = "Antibody Titer"
) +
theme_minimal()
## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...
## Warning: ggrepel: 40 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
The polynomial regression fits shown for each dose group are based on mean antibody titers at four fixed timepoints. Because of the deterministic structure of the simulation, these yield near-perfect adjusted R² values (≈ 1.00), which is expected and not misleading.