This R Markdown document performs temporal trend analysis of resistance gene prevalence using isolate-level resistance gene data. It includes:
library(tidyverse)
library(lubridate)
library(stringr)
library(lme4)
library(lmerTest)
library(emmeans)
library(performance)
library(DHARMa)
library(moments)
library(lmtest)
library(car)
library(glmmTMB)
library(mgcv)
library(scales)
Update the file path below if needed.
library(readr)
library(dplyr)
df <- read_csv("rgi_combined_blast.csv")
This section keeps high-confidence hits, extracts patient ID,
extracts the sample date from Sample_ID, and converts it to
a proper date.
# Clean and parse data
dat0 <- df %>% # <-- was 'raw' before
filter(Cut_Off %in% c("Strict", "Perfect")) %>%
filter(!is.na(Sample_ID), !is.na(Best_Hit_ARO)) %>%
mutate(
patient_id = str_extract(Sample_ID, "^[^-]+"),
sample_date_chr = str_extract(Sample_ID, "\\d{2}-\\d{2}-\\d{4}$"),
sample_date = dmy(sample_date_chr)
) %>%
filter(!is.na(sample_date), !is.na(patient_id))
cat("Date range:\n")
## Date range:
print(range(dat0$sample_date))
## [1] "1980-01-02" "2020-12-03"
cat("Year distribution:\n")
## Year distribution:
print(
dat0 %>%
mutate(year = year(sample_date)) %>%
count(year) %>%
arrange(year)
)
## # A tibble: 38 × 2
## year n
## <dbl> <int>
## 1 1980 242
## 2 1981 61
## 3 1982 121
## 4 1985 60
## 5 1986 62
## 6 1987 245
## 7 1988 124
## 8 1989 61
## 9 1990 123
## 10 1991 242
## # ℹ 28 more rows
dat_unique <- dat0 %>%
distinct(Sample_ID, patient_id, sample_date, Best_Hit_ARO)
gene_mat <- dat_unique %>%
mutate(value = 1L) %>%
pivot_wider(
names_from = Best_Hit_ARO,
values_from = value,
values_fill = 0
)
meta_cols <- c("Sample_ID", "patient_id", "sample_date")
gene_cols <- setdiff(names(gene_mat), meta_cols)
iso <- gene_mat %>%
mutate(
gene_count = rowSums(across(all_of(gene_cols))),
total_genes = length(gene_cols),
gene_prevalence = gene_count / total_genes,
year = year(sample_date)
) %>%
arrange(sample_date) %>%
mutate(
time_years = as.numeric(sample_date - min(sample_date)) / 365.25
)
year_summary <- iso %>%
group_by(year) %>%
summarise(
n_isolates = n(),
mean_prev = mean(gene_prevalence),
sd_prev = sd(gene_prevalence),
se_prev = sd_prev / sqrt(n_isolates),
.groups = "drop"
)
year_summary
ggplot(year_summary, aes(x = year, y = mean_prev)) +
geom_point() +
geom_line() +
geom_errorbar(
aes(
ymin = mean_prev - se_prev,
ymax = mean_prev + se_prev
),
width = 0.2
) +
labs(
title = "Temporal Trend of Resistance Gene Prevalence",
x = "Year",
y = "Mean Gene Prevalence"
) +
theme_minimal()
hist(
iso$gene_prevalence,
breaks = 30,
main = "Distribution of Gene Prevalence"
)
qqnorm(iso$gene_prevalence)
qqline(iso$gene_prevalence)
cat("\n============================\n")
##
## ============================
cat("ASSUMPTION CHECK RESULTS\n")
## ASSUMPTION CHECK RESULTS
cat("============================\n")
## ============================
shapiro_test <- shapiro.test(iso$gene_prevalence)
cat("\nShapiro-Wilk test (raw data):\n")
##
## Shapiro-Wilk test (raw data):
print(shapiro_test)
##
## Shapiro-Wilk normality test
##
## data: iso$gene_prevalence
## W = 0.86363, p-value < 2.2e-16
skew_val <- skewness(iso$gene_prevalence)
kurt_val <- kurtosis(iso$gene_prevalence)
cat("\nSkewness:", round(skew_val, 3), "\n")
##
## Skewness: 1.154
cat("Kurtosis:", round(kurt_val, 3), "\n")
## Kurtosis: 12.292
lm_tmp <- lm(gene_prevalence ~ time_years, data = iso)
bp_test <- bptest(lm_tmp)
cat("\nBreusch-Pagan test:\n")
##
## Breusch-Pagan test:
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: lm_tmp
## BP = 0.99522, df = 1, p-value = 0.3185
res <- residuals(lm_tmp)
shapiro_res <- shapiro.test(res)
cat("\nShapiro-Wilk test (residuals):\n")
##
## Shapiro-Wilk test (residuals):
print(shapiro_res)
##
## Shapiro-Wilk normality test
##
## data: res
## W = 0.87392, p-value < 2.2e-16
cat("\nOutlier test:\n")
##
## Outlier test:
print(outlierTest(lm_tmp))
## rstudent unadjusted p-value Bonferroni p
## 963 6.513425 1.0167e-10 1.4478e-07
## 1017 6.510787 1.0342e-10 1.4727e-07
## 1024 6.510586 1.0355e-10 1.4746e-07
## 1309 6.499080 1.1154e-10 1.5884e-07
## 167 5.992648 2.6127e-09 3.7204e-06
## 1056 5.934648 3.6944e-09 5.2609e-06
## 1088 5.933692 3.7155e-09 5.2908e-06
## 1231 5.927581 3.8529e-09 5.4865e-06
## 105 5.430255 6.6105e-08 9.4134e-05
## 1326 4.785898 1.8797e-06 2.6767e-03
min_val <- min(iso$gene_prevalence)
max_val <- max(iso$gene_prevalence)
cat("\nBoundary check:\n")
##
## Boundary check:
cat("Min =", min_val, "\n")
## Min = 0.2374429
cat("Max =", max_val, "\n")
## Max = 0.3287671
normal_raw <- shapiro_test$p.value > 0.05
normal_res <- shapiro_res$p.value > 0.05
homo <- bp_test$p.value > 0.05
low_skew <- abs(skew_val) < 1
bounded <- (min_val >= 0 & max_val <= 1)
cat("\n============================\n")
##
## ============================
cat("INTERPRETATION\n")
## INTERPRETATION
cat("============================\n")
## ============================
if(normal_raw){
cat("✔ Raw data approximately normal\n")
} else {
cat("✖ Raw data NOT normal\n")
}
## ✖ Raw data NOT normal
if(normal_res){
cat("✔ Residuals approximately normal\n")
} else {
cat("✖ Residuals NOT normal\n")
}
## ✖ Residuals NOT normal
if(homo){
cat("✔ Homoscedasticity assumption satisfied\n")
} else {
cat("✖ Heteroscedasticity detected\n")
}
## ✔ Homoscedasticity assumption satisfied
if(low_skew){
cat("✔ Skewness acceptable\n")
} else {
cat("✖ High skewness detected\n")
}
## ✖ High skewness detected
if(bounded){
cat("✔ Outcome is bounded (0–1)\n")
}
## ✔ Outcome is bounded (0–1)
cat("\n============================\n")
##
## ============================
cat("PREPARING DATA\n")
## PREPARING DATA
cat("============================\n")
## ============================
iso <- iso %>%
mutate(patient_id = factor(patient_id))
if(any(iso$gene_prevalence < 0 | iso$gene_prevalence > 1, na.rm = TRUE)){
stop("gene_prevalence must be between 0 and 1.")
}
n_obs <- nrow(iso)
iso <- iso %>%
mutate(
gene_prev_adj = (gene_prevalence * (n_obs - 1) + 0.5) / n_obs
)
cat("\n============================\n")
##
## ============================
cat("FITTING CANDIDATE MODELS\n")
## FITTING CANDIDATE MODELS
cat("============================\n")
## ============================
model_linear <- glmmTMB(
gene_prev_adj ~ time_years + (1 | patient_id),
family = beta_family(link = "logit"),
data = iso
)
model_gam <- gam(
gene_prev_adj ~ s(time_years, k = 5) + s(patient_id, bs = "re"),
family = betar(link = "logit"),
data = iso,
method = "ML"
)
cat("\n============================\n")
##
## ============================
cat("MODEL COMPARISON\n")
## MODEL COMPARISON
cat("============================\n")
## ============================
AIC_linear <- AIC(model_linear)
AIC_gam <- AIC(model_gam)
delta_AIC <- AIC_linear - AIC_gam
cat("Linear model AIC:", round(AIC_linear, 2), "\n")
## Linear model AIC: -10378.27
cat("GAM model AIC:", round(AIC_gam, 2), "\n")
## GAM model AIC: -10639.74
cat("Delta AIC (Linear - GAM):", round(delta_AIC, 2), "\n")
## Delta AIC (Linear - GAM): 261.47
gam_sum <- summary(model_gam)
s_table <- gam_sum$s.table
time_smooth_row <- grep("^s\\(time_years", rownames(s_table))
edf_time <- if(length(time_smooth_row) > 0) s_table[time_smooth_row, "edf"] else NA
p_smooth <- if(length(time_smooth_row) > 0) s_table[time_smooth_row, "p-value"] else NA
cat("\nGAM smooth term for time_years:\n")
##
## GAM smooth term for time_years:
cat("edf =", round(edf_time, 3), "\n")
## edf = 1.001
cat("p-value =", signif(p_smooth, 3), "\n")
## p-value = 0.5
cat("\n============================\n")
##
## ============================
cat("FINAL MODEL SELECTION\n")
## FINAL MODEL SELECTION
cat("============================\n")
## ============================
if(!is.na(delta_AIC) && delta_AIC > 2 && !is.na(edf_time) && edf_time > 1.05){
final_model <- model_gam
final_model_type <- "gam"
cat("✔ Final model selected: NONLINEAR beta GAM\n")
cat("Reason: GAM improved AIC meaningfully and the time smooth suggests nonlinearity.\n")
} else {
final_model <- model_linear
final_model_type <- "linear"
cat("✔ Final model selected: LINEAR temporal effect beta mixed model\n")
if(abs(delta_AIC) <= 2){
cat("Reason: AIC difference is small; simpler linear model retained.\n")
} else if(!is.na(edf_time) && edf_time <= 1.05){
cat("Reason: GAM smooth is essentially linear (edf ≈ 1); linear model retained.\n")
} else {
cat("Reason: No convincing improvement from nonlinear modeling.\n")
}
}
## ✔ Final model selected: LINEAR temporal effect beta mixed model
## Reason: GAM smooth is essentially linear (edf ≈ 1); linear model retained.
cat("\n============================\n")
##
## ============================
cat("FINAL MODEL SUMMARY\n")
## FINAL MODEL SUMMARY
cat("============================\n")
## ============================
print(summary(final_model))
## Family: beta ( logit )
## Formula: gene_prev_adj ~ time_years + (1 | patient_id)
## Data: iso
##
## AIC BIC logLik -2*log(L) df.resid
## -10378.3 -10357.2 5193.1 -10386.3 1420
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## patient_id (Intercept) 0.0008929 0.02988
## Number of obs: 1424, groups: patient_id, 255
##
## Dispersion parameter for beta family (): 6.92e+03
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9603438 0.0052307 -183.60 <2e-16 ***
## time_years -0.0001027 0.0001522 -0.67 0.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n============================\n")
##
## ============================
cat("MODEL INTERPRETATION\n")
## MODEL INTERPRETATION
cat("============================\n")
## ============================
if(final_model_type == "linear"){
model_sum <- summary(final_model)
coef_table <- model_sum$coefficients$cond
beta <- coef_table["time_years", "Estimate"]
se <- coef_table["time_years", "Std. Error"]
pval <- coef_table["time_years", "Pr(>|z|)"]
odds_ratio <- exp(beta)
cat("\nFinal model: Linear temporal effect beta mixed model\n")
cat("Beta (time_years) =", round(beta, 4), "\n")
cat("SE =", round(se, 4), "\n")
cat("P-value =", signif(pval, 3), "\n")
cat("Odds ratio =", round(odds_ratio, 3), "\n")
if(pval < 0.05){
if(beta > 0){
cat("\n✔ Significant INCREASING temporal trend in mean gene prevalence.\n")
} else {
cat("\n✔ Significant DECREASING temporal trend in mean gene prevalence.\n")
}
} else {
cat("\n✖ No statistically significant temporal trend detected.\n")
}
cat("\nEffect size interpretation:\n")
if(odds_ratio > 1){
cat("→ Each 1-year increase multiplies the odds of the mean prevalence by", round(odds_ratio, 3), ".\n")
} else {
cat("→ Each 1-year increase multiplies the odds of the mean prevalence by", round(odds_ratio, 3), ", indicating a decrease.\n")
}
} else if(final_model_type == "gam"){
gam_sum <- summary(final_model)
s_table <- gam_sum$s.table
time_smooth_row <- grep("^s\\(time_years", rownames(s_table))
edf_time <- s_table[time_smooth_row, "edf"]
p_smooth <- s_table[time_smooth_row, "p-value"]
cat("\nFinal model: Nonlinear beta GAM\n")
cat("Smooth term for time_years:\n")
cat("edf =", round(edf_time, 3), "\n")
cat("P-value =", signif(p_smooth, 3), "\n")
if(p_smooth < 0.05){
cat("\n✔ Significant NONLINEAR temporal pattern detected in mean gene prevalence.\n")
} else {
cat("\n✖ No statistically significant nonlinear temporal trend detected.\n")
}
if(edf_time <= 1.05){
cat("The estimated smooth is effectively linear (edf ≈ 1).\n")
} else {
cat("The estimated smooth indicates a nonlinear pattern over time.\n")
}
}
##
## Final model: Linear temporal effect beta mixed model
## Beta (time_years) = -1e-04
## SE = 2e-04
## P-value = 0.5
## Odds ratio = 1
##
## ✖ No statistically significant temporal trend detected.
##
## Effect size interpretation:
## → Each 1-year increase multiplies the odds of the mean prevalence by 1 , indicating a decrease.
cat("\n============================\n")
##
## ============================
cat("RESULTS TEXT TEMPLATE\n")
## RESULTS TEXT TEMPLATE
cat("============================\n")
## ============================
if(final_model_type == "linear"){
rand_var <- as.numeric(VarCorr(final_model)$cond$patient_id[1])
cat(
paste0(
"A beta mixed-effects regression model was fitted to evaluate temporal trends in resistance gene prevalence while accounting for clustering at the patient level. ",
"The fixed effect for time was ",
ifelse(beta > 0, "positive", "negative"),
" but small (β = ", round(beta, 4),
", SE = ", round(se, 4),
", p = ", signif(pval, 3), "). ",
ifelse(pval < 0.05,
"This indicates a statistically significant temporal trend in mean gene prevalence. ",
"This indicates no statistically significant temporal trend in mean gene prevalence. "),
"The corresponding odds ratio was ", round(odds_ratio, 3),
", meaning that each additional year multiplied the odds of the mean prevalence by this factor. ",
"The random intercept variance for patient ID was ", round(rand_var, 5),
", reflecting the degree of between-patient variability in baseline prevalence."
),
"\n"
)
} else if(final_model_type == "gam"){
cat(
paste0(
"A beta generalized additive mixed model was fitted to evaluate potential nonlinear temporal trends in resistance gene prevalence while accounting for clustering at the patient level. ",
"The smooth term for time had an effective degrees of freedom (edf) of ", round(edf_time, 3),
" (p = ", signif(p_smooth, 3), "). ",
ifelse(p_smooth < 0.05,
"This indicates evidence of a statistically significant nonlinear temporal pattern in mean gene prevalence over time.",
"This indicates no statistically significant nonlinear temporal trend in mean gene prevalence over time.")
),
"\n"
)
}
## A beta mixed-effects regression model was fitted to evaluate temporal trends in resistance gene prevalence while accounting for clustering at the patient level. The fixed effect for time was negative but small (β = -1e-04, SE = 2e-04, p = 0.5). This indicates no statistically significant temporal trend in mean gene prevalence. The corresponding odds ratio was 1, meaning that each additional year multiplied the odds of the mean prevalence by this factor. The random intercept variance for patient ID was 0.00089, reflecting the degree of between-patient variability in baseline prevalence.
cat("\n============================\n")
##
## ============================
cat("FINAL MODEL DIAGNOSTICS\n")
## FINAL MODEL DIAGNOSTICS
cat("============================\n")
## ============================
if(final_model_type == "linear"){
sim_res <- simulateResiduals(final_model)
plot(sim_res)
uni_test <- testUniformity(sim_res)
cat("\nUniformity test:\n")
print(uni_test)
disp_test <- testDispersion(sim_res)
cat("\nDispersion test:\n")
print(disp_test)
out_test <- testOutliers(sim_res)
cat("\nOutlier test:\n")
print(out_test)
cat("\n============================\n")
cat("DIAGNOSTIC INTERPRETATION\n")
cat("============================\n")
if(uni_test$p.value > 0.05){
cat("✔ Residuals are approximately uniform → model fit is adequate.\n")
} else {
cat("✖ Residuals deviate from uniformity → possible model misspecification.\n")
}
if(disp_test$p.value > 0.05){
cat("✔ No over/under-dispersion detected.\n")
} else {
cat("✖ Dispersion issue detected → model may not capture the variance structure adequately.\n")
}
if(out_test$p.value > 0.05){
cat("✔ No major outlier problem detected.\n")
} else {
cat("✖ Potential outlier problem detected.\n")
}
if(uni_test$p.value > 0.05 &&
disp_test$p.value > 0.05 &&
out_test$p.value > 0.05){
cat("\n✔ FINAL: Model diagnostics indicate an ADEQUATE FIT.\n")
} else {
cat("\n⚠FINAL: Some diagnostic issues detected — interpret results with caution.\n")
}
} else if(final_model_type == "gam"){
cat("GAM diagnostic check:\n")
gam.check(final_model)
cat("\nAttempting DHARMa residual diagnostics for GAM...\n")
gam_dharma <- try(simulateResiduals(final_model), silent = TRUE)
if(!inherits(gam_dharma, "try-error")){
plot(gam_dharma)
uni_test <- testUniformity(gam_dharma)
disp_test <- testDispersion(gam_dharma)
out_test <- testOutliers(gam_dharma)
cat("\nUniformity test:\n")
print(uni_test)
cat("\nDispersion test:\n")
print(disp_test)
cat("\nOutlier test:\n")
print(out_test)
cat("\n============================\n")
cat("DIAGNOSTIC INTERPRETATION\n")
cat("============================\n")
if(uni_test$p.value > 0.05){
cat("✔ Residuals are approximately uniform.\n")
} else {
cat("✖ Residuals deviate from uniformity → possible model misspecification.\n")
}
if(disp_test$p.value > 0.05){
cat("✔ No strong dispersion issue detected.\n")
} else {
cat("✖ Dispersion issue detected.\n")
}
if(out_test$p.value > 0.05){
cat("✔ No major outlier problem detected.\n")
} else {
cat("✖ Potential outlier problem detected.\n")
}
} else {
cat("DHARMa simulation was not available for this GAM fit in the current setup.\n")
cat("Use gam.check() output and residual plots for GAM diagnostics.\n")
}
}
##
## Uniformity test:
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: simulationOutput$scaledResiduals
## D = 0.13537, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
## Dispersion test:
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 1.0641, p-value = 0.328
## alternative hypothesis: two.sided
##
## Outlier test:
##
## DHARMa outlier test based on exact binomial test with approximate
## expectations
##
## data: sim_res
## outliers at both margin(s) = 26, observations = 1424, p-value =
## 0.0001344
## alternative hypothesis: true probability of success is not equal to 0.007968127
## 95 percent confidence interval:
## 0.01196080 0.02663903
## sample estimates:
## frequency of outliers (expected: 0.00796812749003984 )
## 0.01825843
##
##
## ============================
## DIAGNOSTIC INTERPRETATION
## ============================
## ✖ Residuals deviate from uniformity → possible model misspecification.
## ✔ No over/under-dispersion detected.
## ✖ Potential outlier problem detected.
##
## ⚠FINAL: Some diagnostic issues detected — interpret results with caution.
cat("\n============================\n")
##
## ============================
cat("MODEL-BASED VISUALIZATION\n")
## MODEL-BASED VISUALIZATION
cat("============================\n")
## ============================
ref_patient <- levels(iso$patient_id)[1]
newdat <- data.frame(
time_years = seq(
min(iso$time_years, na.rm = TRUE),
max(iso$time_years, na.rm = TRUE),
length.out = 200
),
patient_id = factor(ref_patient, levels = levels(iso$patient_id))
)
newdat$pred_linear <- predict(
model_linear,
newdata = newdat,
type = "response",
re.form = NA
)
newdat$pred_gam <- predict(
model_gam,
newdata = newdat,
type = "response",
exclude = "s(patient_id)"
)
ggplot(iso, aes(x = time_years, y = gene_prev_adj)) +
geom_point(alpha = 0.25) +
geom_line(
data = newdat,
aes(x = time_years, y = pred_linear),
linetype = "dashed",
linewidth = 1
) +
geom_line(
data = newdat,
aes(x = time_years, y = pred_gam),
linewidth = 1.1,
color = "blue"
) +
labs(
title = "Linear vs Nonlinear Temporal Trend",
subtitle = "Dashed = linear beta mixed model | Solid blue = beta GAM",
x = "Time (years)",
y = "Adjusted gene prevalence"
) +
theme_minimal()
cat("\n============================\n")
##
## ============================
cat("POST-HOC ANALYSIS: CATEGORICAL TIME PERIODS\n")
## POST-HOC ANALYSIS: CATEGORICAL TIME PERIODS
cat("============================\n")
## ============================
iso <- iso %>%
mutate(
period = cut(
year,
breaks = c(1970, 1995, 2005, 2015, 2025),
labels = c("Early", "Mid1", "Mid2", "Late"),
include.lowest = TRUE,
right = TRUE
),
period = factor(period, levels = c("Early", "Mid1", "Mid2", "Late"))
)
if(any(is.na(iso$period))){
warning("Some observations were not assigned to a period. Check the range of 'year'.")
}
cat("\nNumber of isolates per period:\n")
##
## Number of isolates per period:
print(table(iso$period, useNA = "ifany"))
##
## Early Mid1 Mid2 Late
## 46 227 714 437
cat("\n============================\n")
##
## ============================
cat("FITTING PERIOD MODEL\n")
## FITTING PERIOD MODEL
cat("============================\n")
## ============================
model_cat <- glmmTMB(
gene_prev_adj ~ period + (1 | patient_id),
family = beta_family(link = "logit"),
data = iso
)
print(summary(model_cat))
## Family: beta ( logit )
## Formula: gene_prev_adj ~ period + (1 | patient_id)
## Data: iso
##
## AIC BIC logLik -2*log(L) df.resid
## -10374.3 -10342.8 5193.2 -10386.3 1418
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## patient_id (Intercept) 0.0008892 0.02982
## Number of obs: 1424, groups: patient_id, 255
##
## Dispersion parameter for beta family (): 6.91e+03
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.633e-01 6.266e-03 -153.74 <2e-16 ***
## periodMid1 -1.670e-03 6.262e-03 -0.27 0.790
## periodMid2 9.965e-05 6.257e-03 0.02 0.987
## periodLate -1.581e-04 6.351e-03 -0.02 0.980
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n============================\n")
##
## ============================
cat("OVERALL TEST OF PERIOD EFFECT\n")
## OVERALL TEST OF PERIOD EFFECT
cat("============================\n")
## ============================
model_cat_null <- glmmTMB(
gene_prev_adj ~ 1 + (1 | patient_id),
family = beta_family(link = "logit"),
data = iso
)
overall_test <- anova(model_cat_null, model_cat)
print(overall_test)
## Data: iso
## Models:
## model_cat_null: gene_prev_adj ~ 1 + (1 | patient_id), zi=~0, disp=~1
## model_cat: gene_prev_adj ~ period + (1 | patient_id), zi=~0, disp=~1
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## model_cat_null 3 -10380 -10364 5192.9 -10386
## model_cat 6 -10374 -10343 5193.2 -10386 0.5043 3 0.9179
overall_p <- overall_test$`Pr(>Chisq)`[2]
if(!is.na(overall_p)){
if(overall_p < 0.05){
cat("\n✔ Overall, gene prevalence differs significantly across time periods.\n")
} else {
cat("\n✖ No overall statistically significant difference in gene prevalence across time periods.\n")
}
} else {
cat("\nâš Could not extract overall p-value automatically. Please inspect the ANOVA table above.\n")
}
##
## ✖ No overall statistically significant difference in gene prevalence across time periods.
cat("\n============================\n")
##
## ============================
cat("ESTIMATED MARGINAL MEANS BY PERIOD\n")
## ESTIMATED MARGINAL MEANS BY PERIOD
cat("============================\n")
## ============================
period_emm <- emmeans(model_cat, ~ period, type = "response")
emm_summary <- summary(period_emm, infer = TRUE)
print(emm_summary)
## period response SE df asymp.LCL asymp.UCL null z.ratio p.value
## Early 0.2762 0.001250 Inf 0.2738 0.2787 0.5 -153.739 <.0001
## Mid1 0.2759 0.000583 Inf 0.2748 0.2770 0.5 -330.582 <.0001
## Mid2 0.2762 0.000464 Inf 0.2753 0.2772 0.5 -414.902 <.0001
## Late 0.2762 0.000487 Inf 0.2752 0.2771 0.5 -395.364 <.0001
##
## Confidence level used: 0.95
## Intervals are back-transformed from the logit scale
## Tests are performed on the logit scale
cat("\n============================\n")
##
## ============================
cat("PAIRWISE COMPARISONS BETWEEN PERIODS\n")
## PAIRWISE COMPARISONS BETWEEN PERIODS
cat("============================\n")
## ============================
period_pairs <- pairs(emmeans(model_cat, ~ period), adjust = "tukey")
pairs_summary <- summary(period_pairs, infer = TRUE)
print(pairs_summary)
## contrast estimate SE df asymp.LCL asymp.UCL z.ratio p.value
## Early - Mid1 1.67e-03 0.00626 Inf -0.01442 0.01776 0.267 0.9934
## Early - Mid2 -9.97e-05 0.00626 Inf -0.01617 0.01597 -0.016 1.0000
## Early - Late 1.58e-04 0.00635 Inf -0.01616 0.01647 0.025 1.0000
## Mid1 - Mid2 -1.77e-03 0.00253 Inf -0.00828 0.00474 -0.699 0.8976
## Mid1 - Late -1.51e-03 0.00281 Inf -0.00873 0.00571 -0.538 0.9497
## Mid2 - Late 2.58e-04 0.00189 Inf -0.00459 0.00510 0.137 0.9991
##
## Results are given on the log odds ratio (not the response) scale.
## Confidence level used: 0.95
## Conf-level adjustment: tukey method for comparing a family of 4 estimates
## P value adjustment: tukey method for comparing a family of 4 estimates
cat("\n============================\n")
##
## ============================
cat("INTERPRETATION: ESTIMATED MEAN PREVALENCE BY PERIOD\n")
## INTERPRETATION: ESTIMATED MEAN PREVALENCE BY PERIOD
cat("============================\n")
## ============================
emm_df <- as.data.frame(summary(period_emm, infer = TRUE))
resp_col <- intersect(c("response", "prob", "rate", "mean", "emmean"), names(emm_df))
if(length(resp_col) == 0){
stop("Could not identify the response-scale mean column in emm_df.")
}
resp_col <- resp_col[1]
lower_col <- intersect(c("lower.CL", "asymp.LCL", "LCL"), names(emm_df))
upper_col <- intersect(c("upper.CL", "asymp.UCL", "UCL"), names(emm_df))
if(length(lower_col) == 0 || length(upper_col) == 0){
stop("Could not identify confidence interval columns in emm_df. Check names(emm_df).")
}
lower_col <- lower_col[1]
upper_col <- upper_col[1]
for(i in seq_len(nrow(emm_df))){
cat(
paste0(
as.character(emm_df$period[i]), ": estimated mean prevalence = ",
round(as.numeric(emm_df[[resp_col]][i]), 3),
" (95% CI ",
round(as.numeric(emm_df[[lower_col]][i]), 3), " to ",
round(as.numeric(emm_df[[upper_col]][i]), 3), ")\n"
)
)
}
## Early: estimated mean prevalence = 0.276 (95% CI 0.274 to 0.279)
## Mid1: estimated mean prevalence = 0.276 (95% CI 0.275 to 0.277)
## Mid2: estimated mean prevalence = 0.276 (95% CI 0.275 to 0.277)
## Late: estimated mean prevalence = 0.276 (95% CI 0.275 to 0.277)
cat("\n============================\n")
##
## ============================
cat("INTERPRETATION: PAIRWISE DIFFERENCES\n")
## INTERPRETATION: PAIRWISE DIFFERENCES
cat("============================\n")
## ============================
pairs_df <- as.data.frame(pairs_summary)
period_means <- emm_df[[resp_col]]
names(period_means) <- as.character(emm_df$period)
sig_pairs <- pairs_df %>%
filter(p.value < 0.05)
if(nrow(sig_pairs) == 0){
cat("✖ No statistically significant pairwise differences were detected between periods after Tukey adjustment.\n")
} else {
cat("✔ Statistically significant pairwise differences were detected between some periods:\n\n")
for(i in seq_len(nrow(sig_pairs))){
contrast_i <- as.character(sig_pairs$contrast[i])
split_contrast <- strsplit(contrast_i, " - ")[[1]]
p1 <- split_contrast[1]
p2 <- split_contrast[2]
m1 <- period_means[p1]
m2 <- period_means[p2]
if(is.na(m1) || is.na(m2)){
direction_text <- paste0("Difference detected between ", p1, " and ", p2)
} else if(m1 > m2){
direction_text <- paste0(p1, " had higher estimated mean prevalence than ", p2)
} else if(m2 > m1){
direction_text <- paste0(p2, " had higher estimated mean prevalence than ", p1)
} else {
direction_text <- paste0(p1, " and ", p2, " had similar estimated means despite statistical significance")
}
cat(
paste0(
"- ", direction_text,
" (adjusted p = ", signif(sig_pairs$p.value[i], 3), ").\n"
)
)
}
}
## ✖ No statistically significant pairwise differences were detected between periods after Tukey adjustment.
cat("\n============================\n")
##
## ============================
cat("RESULTS TEXT TEMPLATE\n")
## RESULTS TEXT TEMPLATE
cat("============================\n")
## ============================
if(!is.na(overall_p) && overall_p < 0.05){
cat(
paste0(
"As a post-hoc analysis, calendar time was categorized into four periods (Early, Mid1, Mid2, and Late), ",
"and a beta mixed-effects model with a random intercept for patient ID was fitted. ",
"There was evidence of an overall difference in mean gene prevalence across periods (likelihood ratio test p = ",
signif(overall_p, 3), "). "
)
)
if(nrow(sig_pairs) == 0){
cat(
"However, no individual pairwise differences remained statistically significant after Tukey adjustment.\n"
)
} else {
cat(
"Tukey-adjusted pairwise comparisons indicated that at least some periods differed significantly in estimated mean prevalence. ",
"See the pairwise comparison output above for the specific contrasts.\n"
)
}
} else if(!is.na(overall_p) && overall_p >= 0.05){
cat(
paste0(
"In a post-hoc analysis, calendar time was categorized into four periods (Early, Mid1, Mid2, and Late), ",
"and a beta mixed-effects model with a random intercept for patient ID was fitted. ",
"No overall difference in mean gene prevalence was detected across periods (likelihood ratio test p = ",
signif(overall_p, 3), "). ",
"Consistent with this, Tukey-adjusted pairwise comparisons did not indicate strong evidence of differences between periods."
),
"\n"
)
} else {
cat(
"The categorical period model was fitted successfully, but the overall p-value could not be extracted automatically. Please use the ANOVA table above for interpretation.\n"
)
}
## In a post-hoc analysis, calendar time was categorized into four periods (Early, Mid1, Mid2, and Late), and a beta mixed-effects model with a random intercept for patient ID was fitted. No overall difference in mean gene prevalence was detected across periods (likelihood ratio test p = 0.918). Consistent with this, Tukey-adjusted pairwise comparisons did not indicate strong evidence of differences between periods.
cat("\n============================\n")
##
## ============================
cat("PER-GENE TREND ANALYSIS\n")
## PER-GENE TREND ANALYSIS
cat("============================\n")
## ============================
gene_long <- iso %>%
select(patient_id, time_years, all_of(gene_cols)) %>%
pivot_longer(
cols = all_of(gene_cols),
names_to = "gene",
values_to = "presence"
) %>%
mutate(
patient_id = factor(patient_id),
presence = case_when(
presence %in% c(1, "1", TRUE) ~ 1L,
presence %in% c(0, "0", FALSE) ~ 0L,
TRUE ~ suppressWarnings(as.integer(as.character(presence)))
)
)
fit_gene_model <- function(dat) {
dat <- dat %>%
filter(!is.na(presence), !is.na(time_years), !is.na(patient_id))
n_obs <- nrow(dat)
n_patients <- dplyr::n_distinct(dat$patient_id)
n_pos <- sum(dat$presence == 1, na.rm = TRUE)
n_neg <- sum(dat$presence == 0, na.rm = TRUE)
if(n_obs == 0 || n_pos == 0 || n_neg == 0){
return(data.frame(
n_obs = n_obs,
n_patients = n_patients,
n_pos = n_pos,
prevalence = ifelse(n_obs > 0, n_pos / n_obs, NA_real_),
beta = NA_real_,
se = NA_real_,
p_value = NA_real_,
odds_ratio = NA_real_,
ci_low = NA_real_,
ci_high = NA_real_,
note = "Not estimable: no variation in outcome"
))
}
fit <- try(
glmmTMB(
presence ~ time_years + (1 | patient_id),
family = binomial(link = "logit"),
data = dat
),
silent = TRUE
)
if(inherits(fit, "try-error")){
return(data.frame(
n_obs = n_obs,
n_patients = n_patients,
n_pos = n_pos,
prevalence = n_pos / n_obs,
beta = NA_real_,
se = NA_real_,
p_value = NA_real_,
odds_ratio = NA_real_,
ci_low = NA_real_,
ci_high = NA_real_,
note = "Model failed to converge"
))
}
coef_table <- summary(fit)$coefficients$cond
if(!"time_years" %in% rownames(coef_table)){
return(data.frame(
n_obs = n_obs,
n_patients = n_patients,
n_pos = n_pos,
prevalence = n_pos / n_obs,
beta = NA_real_,
se = NA_real_,
p_value = NA_real_,
odds_ratio = NA_real_,
ci_low = NA_real_,
ci_high = NA_real_,
note = "time_years coefficient not available"
))
}
beta <- coef_table["time_years", "Estimate"]
se <- coef_table["time_years", "Std. Error"]
p_value <- coef_table["time_years", "Pr(>|z|)"]
odds_ratio <- exp(beta)
ci_low <- exp(beta - 1.96 * se)
ci_high <- exp(beta + 1.96 * se)
note_text <- ifelse(
n_pos < 5 || n_neg < 5,
"Sparse outcome; interpret cautiously",
"OK"
)
data.frame(
n_obs = n_obs,
n_patients = n_patients,
n_pos = n_pos,
prevalence = n_pos / n_obs,
beta = beta,
se = se,
p_value = p_value,
odds_ratio = odds_ratio,
ci_low = ci_low,
ci_high = ci_high,
note = note_text
)
}
gene_results <- gene_long %>%
group_by(gene) %>%
group_modify(~ fit_gene_model(.x)) %>%
ungroup() %>%
mutate(
p_adj = p.adjust(p_value, method = "fdr"),
trend = case_when(
is.na(p_adj) ~ "Not estimable",
p_adj < 0.05 & beta > 0 ~ "Increasing",
p_adj < 0.05 & beta < 0 ~ "Decreasing",
TRUE ~ "No significant trend"
),
interpretation = case_when(
is.na(p_adj) ~ "Trend could not be estimated reliably.",
p_adj < 0.05 & beta > 0 ~ "Gene presence increased significantly over time.",
p_adj < 0.05 & beta < 0 ~ "Gene presence decreased significantly over time.",
TRUE ~ "No statistically significant temporal trend after FDR correction."
)
) %>%
arrange(p_adj)
gene_results
cat("\n============================\n")
##
## ============================
cat("SIGNIFICANT PER-GENE TRENDS (FDR < 0.05)\n")
## SIGNIFICANT PER-GENE TRENDS (FDR < 0.05)
cat("============================\n")
## ============================
sig_genes <- gene_results %>%
filter(!is.na(p_adj), p_adj < 0.05)
if(nrow(sig_genes) == 0){
cat("✖ No genes showed a statistically significant temporal trend after FDR correction.\n")
} else {
print(sig_genes)
}
## # A tibble: 5 × 15
## gene n_obs n_patients n_pos prevalence beta se p_value odds_ratio
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TriA 1424 255 1071 0.752 -0.0881 0.0174 0.000000410 0.916
## 2 MexI 1424 255 1409 0.989 0.104 0.0259 0.0000543 1.11
## 3 OpmD 1424 255 1409 0.989 0.104 0.0259 0.0000543 1.11
## 4 PDC-3 1424 255 649 0.456 -0.110 0.0316 0.000506 0.896
## 5 MexG 1424 255 1399 0.982 0.127 0.0381 0.000840 1.14
## # ℹ 6 more variables: ci_low <dbl>, ci_high <dbl>, note <chr>, p_adj <dbl>,
## # trend <chr>, interpretation <chr>
cat("\n============================\n")
##
## ============================
cat("INTERPRETATION\n")
## INTERPRETATION
cat("============================\n")
## ============================
n_tested <- sum(!is.na(gene_results$p_value))
n_sig <- sum(!is.na(gene_results$p_adj) & gene_results$p_adj < 0.05)
n_increasing <- sum(gene_results$trend == "Increasing", na.rm = TRUE)
n_decreasing <- sum(gene_results$trend == "Decreasing", na.rm = TRUE)
cat("Genes tested:", n_tested, "\n")
## Genes tested: 214
cat("Significant after FDR correction:", n_sig, "\n")
## Significant after FDR correction: 5
cat("Increasing trends:", n_increasing, "\n")
## Increasing trends: 3
cat("Decreasing trends:", n_decreasing, "\n")
## Decreasing trends: 2
if(n_sig == 0){
cat("\nOverall interpretation:\n")
cat("No individual resistance genes showed a statistically significant linear temporal trend after controlling for multiple testing using the false discovery rate (FDR).\n")
} else {
cat("\nOverall interpretation:\n")
cat("One or more individual resistance genes showed statistically significant temporal trends after FDR correction.\n\n")
for(i in seq_len(nrow(sig_genes))){
cat(
paste0(
"- ", sig_genes$gene[i], ": ",
sig_genes$trend[i],
" trend (β = ", round(sig_genes$beta[i], 4),
", OR = ", round(sig_genes$odds_ratio[i], 3),
", 95% CI ", round(sig_genes$ci_low[i], 3),
" to ", round(sig_genes$ci_high[i], 3),
", FDR-adjusted p = ", signif(sig_genes$p_adj[i], 3), ").\n"
)
)
}
}
##
## Overall interpretation:
## One or more individual resistance genes showed statistically significant temporal trends after FDR correction.
##
## - TriA: Decreasing trend (β = -0.0881, OR = 0.916, 95% CI 0.885 to 0.947, FDR-adjusted p = 8.78e-05).
## - MexI: Increasing trend (β = 0.1044, OR = 1.11, 95% CI 1.055 to 1.168, FDR-adjusted p = 0.00388).
## - OpmD: Increasing trend (β = 0.1044, OR = 1.11, 95% CI 1.055 to 1.168, FDR-adjusted p = 0.00388).
## - PDC-3: Decreasing trend (β = -0.1097, OR = 0.896, 95% CI 0.842 to 0.953, FDR-adjusted p = 0.0271).
## - MexG: Increasing trend (β = 0.1271, OR = 1.136, 95% CI 1.054 to 1.223, FDR-adjusted p = 0.036).
cat("\n============================\n")
##
## ============================
cat("RESULTS TEXT TEMPLATE\n")
## RESULTS TEXT TEMPLATE
cat("============================\n")
## ============================
if(n_sig == 0){
cat(
paste0(
"Per-gene temporal trend analyses were performed using mixed-effects logistic regression models with a random intercept for patient ID. ",
n_tested, " genes were evaluable. After controlling for multiple testing using the false discovery rate, ",
"no individual genes showed a statistically significant linear temporal trend. ",
"These findings suggest that the presence of individual resistance genes remained broadly stable over time in this dataset."
),
"\n"
)
} else {
cat(
paste0(
"Per-gene temporal trend analyses were performed using mixed-effects logistic regression models with a random intercept for patient ID. ",
n_tested, " genes were evaluable, and ", n_sig,
" showed statistically significant temporal trends after false discovery rate correction. ",
n_increasing, " gene(s) showed increasing trends and ",
n_decreasing, " gene(s) showed decreasing trends over time."
),
"\n"
)
}
## Per-gene temporal trend analyses were performed using mixed-effects logistic regression models with a random intercept for patient ID. 214 genes were evaluable, and 5 showed statistically significant temporal trends after false discovery rate correction. 3 gene(s) showed increasing trends and 2 gene(s) showed decreasing trends over time.
cat("\n============================\n")
##
## ============================
cat("VISUALIZE TOP GENES\n")
## VISUALIZE TOP GENES
cat("============================\n")
## ============================
gene_results_plot <- gene_results %>%
filter(!is.na(p_adj)) %>%
arrange(p_adj)
if(nrow(gene_results_plot) == 0){
stop("No genes with valid adjusted p-values available for plotting.")
}
top_genes <- gene_results_plot %>%
slice_head(n = 6) %>%
pull(gene)
cat("\nTop genes selected for plotting:\n")
##
## Top genes selected for plotting:
print(top_genes)
## [1] "TriA" "MexI" "OpmD" "PDC-3" "MexG" "mexM"
plot_data <- gene_long %>%
filter(gene %in% top_genes) %>%
left_join(
gene_results %>%
select(gene, beta, odds_ratio, p_value, p_adj, trend, interpretation),
by = "gene"
) %>%
mutate(
facet_label = paste0(
gene,
"\nTrend: ", trend,
"\nFDR p = ", ifelse(is.na(p_adj), "NA", signif(p_adj, 3))
)
)
facet_order <- plot_data %>%
distinct(gene, facet_label, p_adj) %>%
arrange(p_adj) %>%
pull(facet_label)
plot_data$facet_label <- factor(plot_data$facet_label, levels = facet_order)
pred_list <- lapply(top_genes, function(g){
dat_g <- gene_long %>%
filter(gene == g) %>%
mutate(patient_id = factor(patient_id))
if(length(unique(dat_g$presence[!is.na(dat_g$presence)])) < 2){
return(NULL)
}
fit_g <- try(
glmmTMB(
presence ~ time_years + (1 | patient_id),
family = binomial(link = "logit"),
data = dat_g
),
silent = TRUE
)
if(inherits(fit_g, "try-error")){
return(NULL)
}
ref_patient <- levels(dat_g$patient_id)[1]
newdat <- data.frame(
time_years = seq(
min(dat_g$time_years, na.rm = TRUE),
max(dat_g$time_years, na.rm = TRUE),
length.out = 200
),
patient_id = factor(ref_patient, levels = levels(dat_g$patient_id))
)
newdat$pred <- predict(
fit_g,
newdata = newdat,
type = "response",
re.form = NA
)
res_row <- gene_results %>% filter(gene == g) %>% slice(1)
newdat$gene <- g
newdat$facet_label <- paste0(
g,
"\nTrend: ", res_row$trend,
"\nFDR p = ", ifelse(is.na(res_row$p_adj), "NA", signif(res_row$p_adj, 3))
)
newdat
})
pred_data <- bind_rows(pred_list)
pred_data$facet_label <- factor(pred_data$facet_label, levels = facet_order)
p_top_genes <- ggplot(plot_data, aes(x = time_years, y = presence)) +
geom_jitter(
height = 0.05,
width = 0.1,
alpha = 0.25,
size = 1.1
) +
geom_line(
data = pred_data,
aes(x = time_years, y = pred),
inherit.aes = FALSE,
color = "steelblue",
linewidth = 1
) +
facet_wrap(~ facet_label, scales = "fixed") +
labs(
title = "Top Resistance Genes Over Time",
subtitle = "Model-based predicted probabilities from mixed-effects logistic regression",
x = "Time (years)",
y = "Probability of gene presence"
) +
theme_minimal(base_size = 14) +
theme(
strip.text = element_text(face = "bold", size = 11),
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
print(p_top_genes)
cat("\n============================\n")
##
## ============================
cat("INTERPRETATION\n")
## INTERPRETATION
cat("============================\n")
## ============================
top_gene_summary <- gene_results %>%
filter(gene %in% top_genes) %>%
arrange(p_adj)
if(nrow(top_gene_summary) == 0){
cat("No evaluable genes were available for visualization.\n")
} else {
cat("The figure displays the genes with the smallest FDR-adjusted p-values from the per-gene mixed-effects logistic regression analysis.\n")
cat("Each panel shows the observed binary data and the model-based predicted probability of gene presence over time.\n\n")
for(i in seq_len(nrow(top_gene_summary))){
cat(
paste0(
"- ", top_gene_summary$gene[i], ": ",
top_gene_summary$trend[i],
" trend",
if(!is.na(top_gene_summary$beta[i])) paste0(" (β = ", round(top_gene_summary$beta[i], 4), ")") else "",
if(!is.na(top_gene_summary$odds_ratio[i])) paste0(", OR = ", round(top_gene_summary$odds_ratio[i], 3)) else "",
if(!is.na(top_gene_summary$p_adj[i])) paste0(", FDR-adjusted p = ", signif(top_gene_summary$p_adj[i], 3)) else "",
".\n"
)
)
}
sig_top <- top_gene_summary %>%
filter(!is.na(p_adj), p_adj < 0.05)
cat("\nOverall interpretation:\n")
if(nrow(sig_top) == 0){
cat("These genes were prioritized for visualization because they had the smallest adjusted p-values, but none remained statistically significant after false discovery rate correction. Accordingly, the plotted trajectories should be interpreted as descriptive rather than confirmatory evidence of temporal change.\n")
} else {
cat("At least one of the plotted genes remained statistically significant after false discovery rate correction. The fitted trajectories illustrate the direction and magnitude of those gene-specific temporal trends.\n")
}
}
## The figure displays the genes with the smallest FDR-adjusted p-values from the per-gene mixed-effects logistic regression analysis.
## Each panel shows the observed binary data and the model-based predicted probability of gene presence over time.
##
## - TriA: Decreasing trend (β = -0.0881), OR = 0.916, FDR-adjusted p = 8.78e-05.
## - MexI: Increasing trend (β = 0.1044), OR = 1.11, FDR-adjusted p = 0.00388.
## - OpmD: Increasing trend (β = 0.1044), OR = 1.11, FDR-adjusted p = 0.00388.
## - PDC-3: Decreasing trend (β = -0.1097), OR = 0.896, FDR-adjusted p = 0.0271.
## - MexG: Increasing trend (β = 0.1271), OR = 1.136, FDR-adjusted p = 0.036.
## - mexM: No significant trend trend (β = -0.2704), OR = 0.763, FDR-adjusted p = 0.0873.
##
## Overall interpretation:
## At least one of the plotted genes remained statistically significant after false discovery rate correction. The fitted trajectories illustrate the direction and magnitude of those gene-specific temporal trends.
cat("\n============================\n")
##
## ============================
cat("COMBINED TOP-GENE TREND PLOT (USING sample_date)\n")
## COMBINED TOP-GENE TREND PLOT (USING sample_date)
cat("============================\n")
## ============================
iso_plot <- iso %>%
mutate(
sample_date = as.Date(sample_date),
sample_year = year(sample_date),
patient_id = factor(patient_id)
)
if(all(is.na(iso_plot$sample_date))){
stop("sample_date could not be converted to Date. Check its format.")
}
cat("\nSample date range:\n")
##
## Sample date range:
cat("First sample date:", as.character(min(iso_plot$sample_date, na.rm = TRUE)), "\n")
## First sample date: 1980-01-02
cat("Last sample date: ", as.character(max(iso_plot$sample_date, na.rm = TRUE)), "\n")
## Last sample date: 2020-12-03
cat("\nSample year range:\n")
##
## Sample year range:
cat("Start year:", min(iso_plot$sample_year, na.rm = TRUE), "\n")
## Start year: 1980
cat("End year: ", max(iso_plot$sample_year, na.rm = TRUE), "\n")
## End year: 2020
gene_results_plot <- gene_results %>%
filter(!is.na(p_adj)) %>%
arrange(p_adj)
if(nrow(gene_results_plot) == 0){
stop("No genes with valid adjusted p-values available for plotting.")
}
top_genes <- gene_results_plot %>%
slice_head(n = 6) %>%
pull(gene)
cat("\nTop genes selected for plotting:\n")
##
## Top genes selected for plotting:
print(top_genes)
## [1] "TriA" "MexI" "OpmD" "PDC-3" "MexG" "mexM"
plot_long <- iso_plot %>%
select(patient_id, sample_date, sample_year, all_of(gene_cols)) %>%
pivot_longer(
cols = all_of(gene_cols),
names_to = "gene",
values_to = "presence"
) %>%
mutate(
presence = case_when(
presence %in% c(1, "1", TRUE) ~ 1L,
presence %in% c(0, "0", FALSE) ~ 0L,
TRUE ~ suppressWarnings(as.integer(as.character(presence)))
)
) %>%
filter(
gene %in% top_genes,
!is.na(sample_year),
!is.na(presence),
!is.na(patient_id)
)
if(nrow(plot_long) == 0){
stop("No valid plotting data available after reshaping iso.")
}
start_year <- min(plot_long$sample_year, na.rm = TRUE)
end_year <- max(plot_long$sample_year, na.rm = TRUE)
cat("\nTime range used for plot:\n")
##
## Time range used for plot:
cat("Start year =", start_year, "\n")
## Start year = 1980
cat("End year =", end_year, "\n")
## End year = 2020
pred_list <- lapply(top_genes, function(g){
dat_g <- plot_long %>%
filter(gene == g) %>%
mutate(patient_id = factor(patient_id))
if(length(unique(dat_g$presence[!is.na(dat_g$presence)])) < 2){
return(NULL)
}
fit_g <- try(
glmmTMB(
presence ~ sample_year + (1 | patient_id),
family = binomial(link = "logit"),
data = dat_g
),
silent = TRUE
)
if(inherits(fit_g, "try-error")){
return(NULL)
}
ref_patient <- levels(dat_g$patient_id)[1]
newdat <- data.frame(
sample_year = seq(start_year, end_year, length.out = 200),
patient_id = factor(ref_patient, levels = levels(dat_g$patient_id))
)
newdat$pred <- predict(
fit_g,
newdata = newdat,
type = "response",
re.form = NA
)
g_info <- gene_results %>%
filter(gene == g) %>%
slice(1)
newdat$gene <- g
newdat$trend <- g_info$trend
newdat$p_adj <- g_info$p_adj
newdat
})
pred_data <- bind_rows(pred_list)
if(nrow(pred_data) == 0){
stop("Prediction data could not be generated for the selected genes.")
}
obs_yearly <- plot_long %>%
group_by(gene, sample_year) %>%
summarise(
prevalence = mean(presence, na.rm = TRUE),
n = n(),
.groups = "drop"
)
p_top_combined <- ggplot() +
geom_point(
data = obs_yearly,
aes(x = sample_year, y = prevalence, color = gene),
alpha = 0.65,
size = 2
) +
geom_line(
data = pred_data,
aes(x = sample_year, y = pred, color = gene),
linewidth = 1.2
) +
scale_x_continuous(
limits = c(start_year, end_year),
breaks = seq(start_year, end_year, by = 5)
) +
scale_y_continuous(
labels = label_percent(accuracy = 1),
limits = c(0, 1)
) +
labs(
title = "Temporal Trends of Top Resistance Genes",
subtitle = paste0("Based on sample_date (", start_year, " to ", end_year, ")"),
x = "Sample year",
y = "Gene prevalence (%)",
color = "Gene"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "right",
panel.grid.minor = element_blank()
)
print(p_top_combined)
cat("\n============================\n")
##
## ============================
cat("INTERPRETATION\n")
## INTERPRETATION
cat("============================\n")
## ============================
top_gene_summary <- gene_results %>%
filter(gene %in% top_genes) %>%
arrange(p_adj)
cat("This figure shows the top-ranked genes based on the smallest FDR-adjusted p-values from the per-gene trend analysis.\n")
## This figure shows the top-ranked genes based on the smallest FDR-adjusted p-values from the per-gene trend analysis.
cat("The x-axis is based on calendar year derived from iso$sample_date.\n")
## The x-axis is based on calendar year derived from iso$sample_date.
cat("Points show observed yearly prevalence, and colored lines show model-based predicted prevalence from mixed-effects logistic regression.\n\n")
## Points show observed yearly prevalence, and colored lines show model-based predicted prevalence from mixed-effects logistic regression.
for(i in seq_len(nrow(top_gene_summary))){
cat(
paste0(
"- ", top_gene_summary$gene[i], ": ",
top_gene_summary$trend[i],
" trend",
if(!is.na(top_gene_summary$odds_ratio[i])) paste0(", OR = ", round(top_gene_summary$odds_ratio[i], 3)) else "",
if(!is.na(top_gene_summary$p_adj[i])) paste0(", FDR-adjusted p = ", signif(top_gene_summary$p_adj[i], 3)) else "",
".\n"
)
)
}
## - TriA: Decreasing trend, OR = 0.916, FDR-adjusted p = 8.78e-05.
## - MexI: Increasing trend, OR = 1.11, FDR-adjusted p = 0.00388.
## - OpmD: Increasing trend, OR = 1.11, FDR-adjusted p = 0.00388.
## - PDC-3: Decreasing trend, OR = 0.896, FDR-adjusted p = 0.0271.
## - MexG: Increasing trend, OR = 1.136, FDR-adjusted p = 0.036.
## - mexM: No significant trend trend, OR = 0.763, FDR-adjusted p = 0.0873.
sig_top <- top_gene_summary %>%
filter(!is.na(p_adj), p_adj < 0.05)
cat("\nOverall interpretation:\n")
##
## Overall interpretation:
if(nrow(sig_top) == 0){
cat("Although these genes had the smallest adjusted p-values, none remained statistically significant after false discovery rate correction. Therefore, the plotted trajectories should be interpreted as descriptive rather than confirmatory.\n")
} else {
cat("One or more plotted genes remained statistically significant after false discovery rate correction. The lines illustrate the direction and relative magnitude of gene-specific temporal change over calendar time.\n")
}
## One or more plotted genes remained statistically significant after false discovery rate correction. The lines illustrate the direction and relative magnitude of gene-specific temporal change over calendar time.
cat("\n============================\n")
##
## ============================
cat("RESULTS TEXT TEMPLATE\n")
## RESULTS TEXT TEMPLATE
cat("============================\n")
## ============================
if(nrow(sig_top) == 0){
cat(
paste0(
"Figure X displays the top-ranked genes from the per-gene trend analysis using calendar year derived from sample_date. ",
"Observed yearly prevalence values are shown as points, and mixed-effects logistic regression predictions are shown as colored lines. ",
"Although these genes were prioritized for visualization, none remained statistically significant after false discovery rate correction, and the plotted trends should therefore be interpreted descriptively."
),
"\n"
)
} else {
cat(
paste0(
"Figure X displays the top-ranked genes from the per-gene trend analysis using calendar year derived from sample_date. ",
"Observed yearly prevalence values are shown as points, and mixed-effects logistic regression predictions are shown as colored lines. ",
"One or more plotted genes remained statistically significant after false discovery rate correction, illustrating gene-specific increases or decreases in prevalence over calendar time."
),
"\n"
)
}
## Figure X displays the top-ranked genes from the per-gene trend analysis using calendar year derived from sample_date. Observed yearly prevalence values are shown as points, and mixed-effects logistic regression predictions are shown as colored lines. One or more plotted genes remained statistically significant after false discovery rate correction, illustrating gene-specific increases or decreases in prevalence over calendar time.
To render this file to PDF in RStudio:
rmarkdown::render("temporal_trend_analysis.Rmd", output_format = "pdf_document")
Make sure you have a LaTeX engine installed. A common option is TinyTeX:
install.packages("tinytex")
tinytex::install_tinytex()