🧪 Project Overview

This project simulates and analyzes a clinical dataset representing antibody responses following COVID-19 vaccination. It includes demographic, clinical, and immunological response features. Analysis includes ANOVA, logistic regression, and insightful visualizations using ggpubr, ggpmisc, and gghighlight.


1. Load Required Libraries

required_packages <- c("tidyverse", "ggpubr", "ggsignif", "ggpmisc", "gghighlight", "broom", "readr")
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
  library(pkg, character.only = TRUE)
}
## Warning: package 'dplyr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'ggpubr' was built under R version 4.3.3
## Warning: package 'ggsignif' was built under R version 4.3.3
## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2
## Warning: package 'ggpmisc' was built under R version 4.3.3
## Loading required package: ggpp
## Warning: package 'ggpp' was built under R version 4.3.3
## 
## Attaching package: 'ggpp'
## 
## The following objects are masked from 'package:ggpubr':
## 
##     as_npc, as_npcx, as_npcy
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## Warning: package 'gghighlight' was built under R version 4.3.3

2. Simulate Patient Demographics and Vaccine Data

set.seed(123)
n <- 500
patients <- tibble(
  patient_id = 1:n,
  age = round(rnorm(n, 45, 15)),
  sex = sample(c("Male", "Female"), n, replace = TRUE),
  bmi = round(rnorm(n, 25, 4), 1),
  comorbidity = sample(c("None", "Hypertension", "Diabetes", "Both"),
                       n, replace = TRUE, prob = c(0.5, 0.2, 0.2, 0.1)),
  dose = sample(c("Dose 1", "Dose 2", "Booster"), n, replace = TRUE)
)

3. Simulate Longitudinal Antibody Titers

timepoints <- c(0, 14, 28, 90)
sim_data <- expand.grid(patient_id = patients$patient_id, time = timepoints) %>%
  left_join(patients, by = "patient_id") %>%
  mutate(
    base_titer = rnorm(nrow(.), 20, 5),
    titer = base_titer +
      case_when(
        dose == "Dose 1" ~ time * 0.8,
        dose == "Dose 2" ~ time * 1.2,
        dose == "Booster" ~ time * 1.5
      ) +
      ifelse(comorbidity != "None", -runif(nrow(.), 2, 5), 0) +
      rnorm(nrow(.), 0, 10),
    seropositive = ifelse(titer > 50, 1, 0)
  )

write_csv(sim_data, "C:/Users/lukew/OneDrive/Desktop/Lukas/R_DataFiles/data/simulated_vaccine_data.csv")

4. Statistical Analysis (Day 28)

day28 <- sim_data %>% filter(time == 28)

# ANOVA
anova_model <- aov(titer ~ dose, data = day28)
summary(anova_model)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## dose          2  31249   15625   114.4 <2e-16 ***
## Residuals   497  67877     137                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(anova_model)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = titer ~ dose, data = day28)
## 
## $dose
##                      diff        lwr        upr p adj
## Dose 1-Booster -19.363404 -22.372864 -16.353943     0
## Dose 2-Booster  -9.037723 -11.991703  -6.083743     0
## Dose 2-Dose 1   10.325681   7.237859  13.413503     0
# Logistic Regression
logit_model <- glm(seropositive ~ age + sex + bmi + comorbidity + dose,
                   data = day28, family = binomial)
summary(logit_model)
## 
## Call:
## glm(formula = seropositive ~ age + sex + bmi + comorbidity + 
##     dose, family = binomial, data = day28)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              2.713401   0.806492   3.364 0.000767 ***
## age                     -0.003731   0.007037  -0.530 0.595980    
## sexMale                  0.060879   0.206620   0.295 0.768269    
## bmi                     -0.060654   0.025552  -2.374 0.017610 *  
## comorbidityDiabetes     -0.096433   0.395587  -0.244 0.807407    
## comorbidityHypertension  0.256620   0.384350   0.668 0.504342    
## comorbidityNone          0.776243   0.350019   2.218 0.026574 *  
## doseDose 1              -2.638844   0.274974  -9.597  < 2e-16 ***
## doseDose 2              -1.414918   0.250863  -5.640  1.7e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 689.94  on 499  degrees of freedom
## Residual deviance: 563.72  on 491  degrees of freedom
## AIC: 581.72
## 
## Number of Fisher Scoring iterations: 4
tidy(logit_model, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 9 × 7
##   term                  estimate std.error statistic  p.value conf.low conf.high
##   <chr>                    <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)            15.1      0.806       3.36  7.67e- 4   3.16      75.0  
## 2 age                     0.996    0.00704    -0.530 5.96e- 1   0.983      1.01 
## 3 sexMale                 1.06     0.207       0.295 7.68e- 1   0.709      1.59 
## 4 bmi                     0.941    0.0256     -2.37  1.76e- 2   0.895      0.989
## 5 comorbidityDiabetes     0.908    0.396      -0.244 8.07e- 1   0.417      1.97 
## 6 comorbidityHypertens…   1.29     0.384       0.668 5.04e- 1   0.608      2.76 
## 7 comorbidityNone         2.17     0.350       2.22  2.66e- 2   1.10       4.34 
## 8 doseDose 1              0.0714   0.275      -9.60  8.26e-22   0.0411     0.121
## 9 doseDose 2              0.243    0.251      -5.64  1.70e- 8   0.147      0.394

5. Visualizations

A. Boxplot with Significance using ggpubr and ggsignif

ggboxplot(day28, x = "dose", y = "titer", color = "dose", palette = "jco", add = "jitter") +
  stat_compare_means(method = "anova", label.y = 120) +
  stat_compare_means(label = "p.signif", method = "t.test",
                     comparisons = list(c("Dose 1", "Dose 2"), c("Dose 2", "Booster"), c("Dose 1", "Booster")),
                     tip.length = 0.01) +
  labs(title = "Antibody Titers by Vaccine Dose at Day 28",
       y = "Antibody Titer", x = "Dose")

B. Mean Titer Over Time + Poly Eq (using ggpmisc)

mean_titers <- sim_data %>%
  group_by(dose, time) %>%
  summarise(mean_titer = mean(titer), .groups = "drop")

ggplot(mean_titers, aes(x = time, y = mean_titer, color = dose)) +
  geom_line(linewidth = 1.2) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2, raw = TRUE), se = FALSE) +
  stat_poly_eq(
    formula = y ~ poly(x, 2, raw = TRUE),
    aes(label = paste(after_stat(eq.label), after_stat(adj.rr.label), sep = "~~~")),
    parse = TRUE
  ) +
  labs(
    title = "Mean Antibody Response Over Time by Dose",
    x = "Days Since Vaccination", y = "Mean Antibody Titer"
  ) +
  theme_minimal()

C. Top 10% Responders Highlight (using gghighlight)

top90 <- sim_data %>% filter(time == 90)

ggplot(top90, aes(x = age, y = titer, color = dose)) +
  geom_point(size = 2, alpha = 0.6) +
  gghighlight(
    titer > quantile(titer, 0.9),
    label_key = dose,
    use_direct_label = TRUE
  ) +
  labs(
    title = "Top 10% Responders at Day 90",
    x = "Age", y = "Antibody Titer"
  ) +
  theme_minimal()
## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...
## Warning: ggrepel: 40 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps


📌 Note on Model Fit

The polynomial regression fits shown for each dose group are based on mean antibody titers at four fixed timepoints. Because of the deterministic structure of the simulation, these yield near-perfect adjusted R² values (≈ 1.00), which is expected and not misleading.