Overview
Load libraries
Load data
Clean and parse data
Quality check
Create isolate-level dataset
Year summary
Visualization of overall trend
Initial assumption checks
Assumption tests and interpretation
Prepare data for beta regression
Fit candidate models
Model comparison
Final model selection
Final model summary and interpretation
Automated results text
Final model diagnostics
Model-based visual comparison
Post-hoc analysis by time period
Period model
Overall test of period effect
Estimated marginal means by period
Pairwise comparisons between periods
Interpretation of estimated means and pairwise differences
Automated results text for period analysis
Per-gene trend analysis
Function to fit one gene model
Fit per-gene models
Significant genes only
Per-gene interpretation
Results text for per-gene analysis
Visualize top genes
Interpretation of top-gene figure
Combined top-gene trend plot using sample date
Interpretation of combined plot
Results text for combined figure
Notes

Overview

This R Markdown document performs temporal trend analysis of resistance gene prevalence using isolate-level resistance gene data. It includes:

data cleaning and date parsing
isolate-level prevalence calculation
yearly summaries and visualization
assumption checks
beta mixed-effects and beta GAM modeling
model comparison and diagnostics
post-hoc categorical time-period analysis
per-gene trend analysis
visualization of top-ranked genes

Load libraries

library(tidyverse)
library(lubridate)
library(stringr)
library(lme4)
library(lmerTest)
library(emmeans)
library(performance)
library(DHARMa)
library(moments)
library(lmtest)
library(car)
library(glmmTMB)
library(mgcv)
library(scales)

Load data

Update the file path below if needed.

library(readr)
library(dplyr)

df <- read_csv("rgi_combined_blast.csv")

Clean and parse data

This section keeps high-confidence hits, extracts patient ID, extracts the sample date from Sample_ID, and converts it to a proper date.

# Clean and parse data
dat0 <- df %>%   # <-- was 'raw' before
  filter(Cut_Off %in% c("Strict", "Perfect")) %>%
  filter(!is.na(Sample_ID), !is.na(Best_Hit_ARO)) %>%
  mutate(
    patient_id = str_extract(Sample_ID, "^[^-]+"),
    sample_date_chr = str_extract(Sample_ID, "\\d{2}-\\d{2}-\\d{4}$"),
    sample_date = dmy(sample_date_chr)
  ) %>%
  filter(!is.na(sample_date), !is.na(patient_id))

Quality check

cat("Date range:\n")

## Date range:

print(range(dat0$sample_date))

## [1] "1980-01-02" "2020-12-03"

cat("Year distribution:\n")

## Year distribution:

print(
  dat0 %>%
    mutate(year = year(sample_date)) %>%
    count(year) %>%
    arrange(year)
)

## # A tibble: 38 × 2
##     year     n
##    <dbl> <int>
##  1  1980   242
##  2  1981    61
##  3  1982   121
##  4  1985    60
##  5  1986    62
##  6  1987   245
##  7  1988   124
##  8  1989    61
##  9  1990   123
## 10  1991   242
## # ℹ 28 more rows

Create isolate-level dataset

dat_unique <- dat0 %>%
  distinct(Sample_ID, patient_id, sample_date, Best_Hit_ARO)

gene_mat <- dat_unique %>%
  mutate(value = 1L) %>%
  pivot_wider(
    names_from = Best_Hit_ARO,
    values_from = value,
    values_fill = 0
  )

meta_cols <- c("Sample_ID", "patient_id", "sample_date")
gene_cols <- setdiff(names(gene_mat), meta_cols)

iso <- gene_mat %>%
  mutate(
    gene_count = rowSums(across(all_of(gene_cols))),
    total_genes = length(gene_cols),
    gene_prevalence = gene_count / total_genes,
    year = year(sample_date)
  ) %>%
  arrange(sample_date) %>%
  mutate(
    time_years = as.numeric(sample_date - min(sample_date)) / 365.25
  )

Year summary

year_summary <- iso %>%
  group_by(year) %>%
  summarise(
    n_isolates = n(),
    mean_prev = mean(gene_prevalence),
    sd_prev = sd(gene_prevalence),
    se_prev = sd_prev / sqrt(n_isolates),
    .groups = "drop"
  )

year_summary

Visualization of overall trend

ggplot(year_summary, aes(x = year, y = mean_prev)) +
  geom_point() +
  geom_line() +
  geom_errorbar(
    aes(
      ymin = mean_prev - se_prev,
      ymax = mean_prev + se_prev
    ),
    width = 0.2
  ) +
  labs(
    title = "Temporal Trend of Resistance Gene Prevalence",
    x = "Year",
    y = "Mean Gene Prevalence"
  ) +
  theme_minimal()

Initial assumption checks

hist(
  iso$gene_prevalence,
  breaks = 30,
  main = "Distribution of Gene Prevalence"
)

qqnorm(iso$gene_prevalence)
qqline(iso$gene_prevalence)

Assumption tests and interpretation

cat("\n============================\n")

## 
## ============================

cat("ASSUMPTION CHECK RESULTS\n")

## ASSUMPTION CHECK RESULTS

cat("============================\n")

## ============================

shapiro_test <- shapiro.test(iso$gene_prevalence)
cat("\nShapiro-Wilk test (raw data):\n")

## 
## Shapiro-Wilk test (raw data):

print(shapiro_test)

## 
##  Shapiro-Wilk normality test
## 
## data:  iso$gene_prevalence
## W = 0.86363, p-value < 2.2e-16

skew_val <- skewness(iso$gene_prevalence)
kurt_val <- kurtosis(iso$gene_prevalence)

cat("\nSkewness:", round(skew_val, 3), "\n")

## 
## Skewness: 1.154

cat("Kurtosis:", round(kurt_val, 3), "\n")

## Kurtosis: 12.292

lm_tmp <- lm(gene_prevalence ~ time_years, data = iso)
bp_test <- bptest(lm_tmp)

cat("\nBreusch-Pagan test:\n")

## 
## Breusch-Pagan test:

print(bp_test)

## 
##  studentized Breusch-Pagan test
## 
## data:  lm_tmp
## BP = 0.99522, df = 1, p-value = 0.3185

res <- residuals(lm_tmp)
shapiro_res <- shapiro.test(res)

cat("\nShapiro-Wilk test (residuals):\n")

## 
## Shapiro-Wilk test (residuals):

print(shapiro_res)

## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.87392, p-value < 2.2e-16

cat("\nOutlier test:\n")

## 
## Outlier test:

print(outlierTest(lm_tmp))

##      rstudent unadjusted p-value Bonferroni p
## 963  6.513425         1.0167e-10   1.4478e-07
## 1017 6.510787         1.0342e-10   1.4727e-07
## 1024 6.510586         1.0355e-10   1.4746e-07
## 1309 6.499080         1.1154e-10   1.5884e-07
## 167  5.992648         2.6127e-09   3.7204e-06
## 1056 5.934648         3.6944e-09   5.2609e-06
## 1088 5.933692         3.7155e-09   5.2908e-06
## 1231 5.927581         3.8529e-09   5.4865e-06
## 105  5.430255         6.6105e-08   9.4134e-05
## 1326 4.785898         1.8797e-06   2.6767e-03

min_val <- min(iso$gene_prevalence)
max_val <- max(iso$gene_prevalence)

cat("\nBoundary check:\n")

## 
## Boundary check:

cat("Min =", min_val, "\n")

## Min = 0.2374429

cat("Max =", max_val, "\n")

## Max = 0.3287671

normal_raw <- shapiro_test$p.value > 0.05
normal_res <- shapiro_res$p.value > 0.05
homo <- bp_test$p.value > 0.05
low_skew <- abs(skew_val) < 1
bounded <- (min_val >= 0 & max_val <= 1)

cat("\n============================\n")

## 
## ============================

cat("INTERPRETATION\n")

## INTERPRETATION

cat("============================\n")

## ============================

if(normal_raw){
  cat("✔ Raw data approximately normal\n")
} else {
  cat("✖ Raw data NOT normal\n")
}

## ✖ Raw data NOT normal

if(normal_res){
  cat("✔ Residuals approximately normal\n")
} else {
  cat("✖ Residuals NOT normal\n")
}

## ✖ Residuals NOT normal

if(homo){
  cat("✔ Homoscedasticity assumption satisfied\n")
} else {
  cat("✖ Heteroscedasticity detected\n")
}

## ✔ Homoscedasticity assumption satisfied

if(low_skew){
  cat("✔ Skewness acceptable\n")
} else {
  cat("✖ High skewness detected\n")
}

## ✖ High skewness detected

if(bounded){
  cat("✔ Outcome is bounded (0–1)\n")
}

## ✔ Outcome is bounded (0–1)

Prepare data for beta regression

cat("\n============================\n")

## 
## ============================

cat("PREPARING DATA\n")

## PREPARING DATA

cat("============================\n")

## ============================

iso <- iso %>%
  mutate(patient_id = factor(patient_id))

if(any(iso$gene_prevalence < 0 | iso$gene_prevalence > 1, na.rm = TRUE)){
  stop("gene_prevalence must be between 0 and 1.")
}

n_obs <- nrow(iso)

iso <- iso %>%
  mutate(
    gene_prev_adj = (gene_prevalence * (n_obs - 1) + 0.5) / n_obs
  )

Fit candidate models

cat("\n============================\n")

## 
## ============================

cat("FITTING CANDIDATE MODELS\n")

## FITTING CANDIDATE MODELS

cat("============================\n")

## ============================

model_linear <- glmmTMB(
  gene_prev_adj ~ time_years + (1 | patient_id),
  family = beta_family(link = "logit"),
  data = iso
)

model_gam <- gam(
  gene_prev_adj ~ s(time_years, k = 5) + s(patient_id, bs = "re"),
  family = betar(link = "logit"),
  data = iso,
  method = "ML"
)

Model comparison

cat("\n============================\n")

## 
## ============================

cat("MODEL COMPARISON\n")

## MODEL COMPARISON

cat("============================\n")

## ============================

AIC_linear <- AIC(model_linear)
AIC_gam <- AIC(model_gam)
delta_AIC <- AIC_linear - AIC_gam

cat("Linear model AIC:", round(AIC_linear, 2), "\n")

## Linear model AIC: -10378.27

cat("GAM model AIC:", round(AIC_gam, 2), "\n")

## GAM model AIC: -10639.74

cat("Delta AIC (Linear - GAM):", round(delta_AIC, 2), "\n")

## Delta AIC (Linear - GAM): 261.47

gam_sum <- summary(model_gam)
s_table <- gam_sum$s.table

time_smooth_row <- grep("^s\\(time_years", rownames(s_table))
edf_time <- if(length(time_smooth_row) > 0) s_table[time_smooth_row, "edf"] else NA
p_smooth <- if(length(time_smooth_row) > 0) s_table[time_smooth_row, "p-value"] else NA

cat("\nGAM smooth term for time_years:\n")

## 
## GAM smooth term for time_years:

cat("edf =", round(edf_time, 3), "\n")

## edf = 1.001

cat("p-value =", signif(p_smooth, 3), "\n")

## p-value = 0.5

Final model selection

cat("\n============================\n")

## 
## ============================

cat("FINAL MODEL SELECTION\n")

## FINAL MODEL SELECTION

cat("============================\n")

## ============================

if(!is.na(delta_AIC) && delta_AIC > 2 && !is.na(edf_time) && edf_time > 1.05){

  final_model <- model_gam
  final_model_type <- "gam"

  cat("✔ Final model selected: NONLINEAR beta GAM\n")
  cat("Reason: GAM improved AIC meaningfully and the time smooth suggests nonlinearity.\n")

} else {

  final_model <- model_linear
  final_model_type <- "linear"

  cat("✔ Final model selected: LINEAR temporal effect beta mixed model\n")

  if(abs(delta_AIC) <= 2){
    cat("Reason: AIC difference is small; simpler linear model retained.\n")
  } else if(!is.na(edf_time) && edf_time <= 1.05){
    cat("Reason: GAM smooth is essentially linear (edf ≈ 1); linear model retained.\n")
  } else {
    cat("Reason: No convincing improvement from nonlinear modeling.\n")
  }
}

## ✔ Final model selected: LINEAR temporal effect beta mixed model
## Reason: GAM smooth is essentially linear (edf ≈ 1); linear model retained.

Final model summary and interpretation

cat("\n============================\n")

## 
## ============================

cat("FINAL MODEL SUMMARY\n")

## FINAL MODEL SUMMARY

cat("============================\n")

## ============================

print(summary(final_model))

##  Family: beta  ( logit )
## Formula:          gene_prev_adj ~ time_years + (1 | patient_id)
## Data: iso
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  -10378.3  -10357.2    5193.1  -10386.3      1420 
## 
## Random effects:
## 
## Conditional model:
##  Groups     Name        Variance  Std.Dev.
##  patient_id (Intercept) 0.0008929 0.02988 
## Number of obs: 1424, groups:  patient_id, 255
## 
## Dispersion parameter for beta family (): 6.92e+03 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.9603438  0.0052307 -183.60   <2e-16 ***
## time_years  -0.0001027  0.0001522   -0.67      0.5    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("\n============================\n")

## 
## ============================

cat("MODEL INTERPRETATION\n")

## MODEL INTERPRETATION

cat("============================\n")

## ============================

if(final_model_type == "linear"){

  model_sum <- summary(final_model)
  coef_table <- model_sum$coefficients$cond

  beta <- coef_table["time_years", "Estimate"]
  se <- coef_table["time_years", "Std. Error"]
  pval <- coef_table["time_years", "Pr(>|z|)"]
  odds_ratio <- exp(beta)

  cat("\nFinal model: Linear temporal effect beta mixed model\n")
  cat("Beta (time_years) =", round(beta, 4), "\n")
  cat("SE =", round(se, 4), "\n")
  cat("P-value =", signif(pval, 3), "\n")
  cat("Odds ratio =", round(odds_ratio, 3), "\n")

  if(pval < 0.05){
    if(beta > 0){
      cat("\n✔ Significant INCREASING temporal trend in mean gene prevalence.\n")
    } else {
      cat("\n✔ Significant DECREASING temporal trend in mean gene prevalence.\n")
    }
  } else {
    cat("\n✖ No statistically significant temporal trend detected.\n")
  }

  cat("\nEffect size interpretation:\n")
  if(odds_ratio > 1){
    cat("→ Each 1-year increase multiplies the odds of the mean prevalence by", round(odds_ratio, 3), ".\n")
  } else {
    cat("→ Each 1-year increase multiplies the odds of the mean prevalence by", round(odds_ratio, 3), ", indicating a decrease.\n")
  }

} else if(final_model_type == "gam"){

  gam_sum <- summary(final_model)
  s_table <- gam_sum$s.table

  time_smooth_row <- grep("^s\\(time_years", rownames(s_table))
  edf_time <- s_table[time_smooth_row, "edf"]
  p_smooth <- s_table[time_smooth_row, "p-value"]

  cat("\nFinal model: Nonlinear beta GAM\n")
  cat("Smooth term for time_years:\n")
  cat("edf =", round(edf_time, 3), "\n")
  cat("P-value =", signif(p_smooth, 3), "\n")

  if(p_smooth < 0.05){
    cat("\n✔ Significant NONLINEAR temporal pattern detected in mean gene prevalence.\n")
  } else {
    cat("\n✖ No statistically significant nonlinear temporal trend detected.\n")
  }

  if(edf_time <= 1.05){
    cat("The estimated smooth is effectively linear (edf ≈ 1).\n")
  } else {
    cat("The estimated smooth indicates a nonlinear pattern over time.\n")
  }
}

## 
## Final model: Linear temporal effect beta mixed model
## Beta (time_years) = -1e-04 
## SE = 2e-04 
## P-value = 0.5 
## Odds ratio = 1 
## 
## ✖ No statistically significant temporal trend detected.
## 
## Effect size interpretation:
## → Each 1-year increase multiplies the odds of the mean prevalence by 1 , indicating a decrease.

Automated results text

cat("\n============================\n")

## 
## ============================

cat("RESULTS TEXT TEMPLATE\n")

## RESULTS TEXT TEMPLATE

cat("============================\n")

## ============================

if(final_model_type == "linear"){

  rand_var <- as.numeric(VarCorr(final_model)$cond$patient_id[1])

  cat(
    paste0(
      "A beta mixed-effects regression model was fitted to evaluate temporal trends in resistance gene prevalence while accounting for clustering at the patient level. ",
      "The fixed effect for time was ",
      ifelse(beta > 0, "positive", "negative"),
      " but small (β = ", round(beta, 4),
      ", SE = ", round(se, 4),
      ", p = ", signif(pval, 3), "). ",
      ifelse(pval < 0.05,
             "This indicates a statistically significant temporal trend in mean gene prevalence. ",
             "This indicates no statistically significant temporal trend in mean gene prevalence. "),
      "The corresponding odds ratio was ", round(odds_ratio, 3),
      ", meaning that each additional year multiplied the odds of the mean prevalence by this factor. ",
      "The random intercept variance for patient ID was ", round(rand_var, 5),
      ", reflecting the degree of between-patient variability in baseline prevalence."
    ),
    "\n"
  )

} else if(final_model_type == "gam"){

  cat(
    paste0(
      "A beta generalized additive mixed model was fitted to evaluate potential nonlinear temporal trends in resistance gene prevalence while accounting for clustering at the patient level. ",
      "The smooth term for time had an effective degrees of freedom (edf) of ", round(edf_time, 3),
      " (p = ", signif(p_smooth, 3), "). ",
      ifelse(p_smooth < 0.05,
             "This indicates evidence of a statistically significant nonlinear temporal pattern in mean gene prevalence over time.",
             "This indicates no statistically significant nonlinear temporal trend in mean gene prevalence over time.")
    ),
    "\n"
  )
}

## A beta mixed-effects regression model was fitted to evaluate temporal trends in resistance gene prevalence while accounting for clustering at the patient level. The fixed effect for time was negative but small (β = -1e-04, SE = 2e-04, p = 0.5). This indicates no statistically significant temporal trend in mean gene prevalence. The corresponding odds ratio was 1, meaning that each additional year multiplied the odds of the mean prevalence by this factor. The random intercept variance for patient ID was 0.00089, reflecting the degree of between-patient variability in baseline prevalence.

Final model diagnostics

cat("\n============================\n")

## 
## ============================

cat("FINAL MODEL DIAGNOSTICS\n")

## FINAL MODEL DIAGNOSTICS

cat("============================\n")

## ============================

if(final_model_type == "linear"){

  sim_res <- simulateResiduals(final_model)

  plot(sim_res)

  uni_test <- testUniformity(sim_res)
  cat("\nUniformity test:\n")
  print(uni_test)

  disp_test <- testDispersion(sim_res)
  cat("\nDispersion test:\n")
  print(disp_test)

  out_test <- testOutliers(sim_res)
  cat("\nOutlier test:\n")
  print(out_test)

  cat("\n============================\n")
  cat("DIAGNOSTIC INTERPRETATION\n")
  cat("============================\n")

  if(uni_test$p.value > 0.05){
    cat("✔ Residuals are approximately uniform → model fit is adequate.\n")
  } else {
    cat("✖ Residuals deviate from uniformity → possible model misspecification.\n")
  }

  if(disp_test$p.value > 0.05){
    cat("✔ No over/under-dispersion detected.\n")
  } else {
    cat("✖ Dispersion issue detected → model may not capture the variance structure adequately.\n")
  }

  if(out_test$p.value > 0.05){
    cat("✔ No major outlier problem detected.\n")
  } else {
    cat("✖ Potential outlier problem detected.\n")
  }

  if(uni_test$p.value > 0.05 &&
     disp_test$p.value > 0.05 &&
     out_test$p.value > 0.05){

    cat("\n✔ FINAL: Model diagnostics indicate an ADEQUATE FIT.\n")

  } else {

    cat("\n⚠ FINAL: Some diagnostic issues detected — interpret results with caution.\n")
  }

} else if(final_model_type == "gam"){

  cat("GAM diagnostic check:\n")
  gam.check(final_model)

  cat("\nAttempting DHARMa residual diagnostics for GAM...\n")

  gam_dharma <- try(simulateResiduals(final_model), silent = TRUE)

  if(!inherits(gam_dharma, "try-error")){

    plot(gam_dharma)

    uni_test <- testUniformity(gam_dharma)
    disp_test <- testDispersion(gam_dharma)
    out_test <- testOutliers(gam_dharma)

    cat("\nUniformity test:\n")
    print(uni_test)

    cat("\nDispersion test:\n")
    print(disp_test)

    cat("\nOutlier test:\n")
    print(out_test)

    cat("\n============================\n")
    cat("DIAGNOSTIC INTERPRETATION\n")
    cat("============================\n")

    if(uni_test$p.value > 0.05){
      cat("✔ Residuals are approximately uniform.\n")
    } else {
      cat("✖ Residuals deviate from uniformity → possible model misspecification.\n")
    }

    if(disp_test$p.value > 0.05){
      cat("✔ No strong dispersion issue detected.\n")
    } else {
      cat("✖ Dispersion issue detected.\n")
    }

    if(out_test$p.value > 0.05){
      cat("✔ No major outlier problem detected.\n")
    } else {
      cat("✖ Potential outlier problem detected.\n")
    }

  } else {

    cat("DHARMa simulation was not available for this GAM fit in the current setup.\n")
    cat("Use gam.check() output and residual plots for GAM diagnostics.\n")
  }
}

## 
## Uniformity test:
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  simulationOutput$scaledResiduals
## D = 0.13537, p-value < 2.2e-16
## alternative hypothesis: two-sided

## 
## Dispersion test:
## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 1.0641, p-value = 0.328
## alternative hypothesis: two.sided

## 
## Outlier test:
## 
##  DHARMa outlier test based on exact binomial test with approximate
##  expectations
## 
## data:  sim_res
## outliers at both margin(s) = 26, observations = 1424, p-value =
## 0.0001344
## alternative hypothesis: true probability of success is not equal to 0.007968127
## 95 percent confidence interval:
##  0.01196080 0.02663903
## sample estimates:
## frequency of outliers (expected: 0.00796812749003984 ) 
##                                             0.01825843 
## 
## 
## ============================
## DIAGNOSTIC INTERPRETATION
## ============================
## ✖ Residuals deviate from uniformity → possible model misspecification.
## ✔ No over/under-dispersion detected.
## ✖ Potential outlier problem detected.
## 
## ⚠ FINAL: Some diagnostic issues detected — interpret results with caution.

Model-based visual comparison

cat("\n============================\n")

## 
## ============================

cat("MODEL-BASED VISUALIZATION\n")

## MODEL-BASED VISUALIZATION

cat("============================\n")

## ============================

ref_patient <- levels(iso$patient_id)[1]

newdat <- data.frame(
  time_years = seq(
    min(iso$time_years, na.rm = TRUE),
    max(iso$time_years, na.rm = TRUE),
    length.out = 200
  ),
  patient_id = factor(ref_patient, levels = levels(iso$patient_id))
)

newdat$pred_linear <- predict(
  model_linear,
  newdata = newdat,
  type = "response",
  re.form = NA
)

newdat$pred_gam <- predict(
  model_gam,
  newdata = newdat,
  type = "response",
  exclude = "s(patient_id)"
)

ggplot(iso, aes(x = time_years, y = gene_prev_adj)) +
  geom_point(alpha = 0.25) +
  geom_line(
    data = newdat,
    aes(x = time_years, y = pred_linear),
    linetype = "dashed",
    linewidth = 1
  ) +
  geom_line(
    data = newdat,
    aes(x = time_years, y = pred_gam),
    linewidth = 1.1,
    color = "blue"
  ) +
  labs(
    title = "Linear vs Nonlinear Temporal Trend",
    subtitle = "Dashed = linear beta mixed model | Solid blue = beta GAM",
    x = "Time (years)",
    y = "Adjusted gene prevalence"
  ) +
  theme_minimal()

Post-hoc analysis by time period

cat("\n============================\n")

## 
## ============================

cat("POST-HOC ANALYSIS: CATEGORICAL TIME PERIODS\n")

## POST-HOC ANALYSIS: CATEGORICAL TIME PERIODS

cat("============================\n")

## ============================

iso <- iso %>%
  mutate(
    period = cut(
      year,
      breaks = c(1970, 1995, 2005, 2015, 2025),
      labels = c("Early", "Mid1", "Mid2", "Late"),
      include.lowest = TRUE,
      right = TRUE
    ),
    period = factor(period, levels = c("Early", "Mid1", "Mid2", "Late"))
  )

if(any(is.na(iso$period))){
  warning("Some observations were not assigned to a period. Check the range of 'year'.")
}

cat("\nNumber of isolates per period:\n")

## 
## Number of isolates per period:

print(table(iso$period, useNA = "ifany"))

## 
## Early  Mid1  Mid2  Late 
##    46   227   714   437

Period model

cat("\n============================\n")

## 
## ============================

cat("FITTING PERIOD MODEL\n")

## FITTING PERIOD MODEL

cat("============================\n")

## ============================

model_cat <- glmmTMB(
  gene_prev_adj ~ period + (1 | patient_id),
  family = beta_family(link = "logit"),
  data = iso
)

print(summary(model_cat))

##  Family: beta  ( logit )
## Formula:          gene_prev_adj ~ period + (1 | patient_id)
## Data: iso
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  -10374.3  -10342.8    5193.2  -10386.3      1418 
## 
## Random effects:
## 
## Conditional model:
##  Groups     Name        Variance  Std.Dev.
##  patient_id (Intercept) 0.0008892 0.02982 
## Number of obs: 1424, groups:  patient_id, 255
## 
## Dispersion parameter for beta family (): 6.91e+03 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.633e-01  6.266e-03 -153.74   <2e-16 ***
## periodMid1  -1.670e-03  6.262e-03   -0.27    0.790    
## periodMid2   9.965e-05  6.257e-03    0.02    0.987    
## periodLate  -1.581e-04  6.351e-03   -0.02    0.980    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overall test of period effect

cat("\n============================\n")

## 
## ============================

cat("OVERALL TEST OF PERIOD EFFECT\n")

## OVERALL TEST OF PERIOD EFFECT

cat("============================\n")

## ============================

model_cat_null <- glmmTMB(
  gene_prev_adj ~ 1 + (1 | patient_id),
  family = beta_family(link = "logit"),
  data = iso
)

overall_test <- anova(model_cat_null, model_cat)
print(overall_test)

## Data: iso
## Models:
## model_cat_null: gene_prev_adj ~ 1 + (1 | patient_id), zi=~0, disp=~1
## model_cat: gene_prev_adj ~ period + (1 | patient_id), zi=~0, disp=~1
##                Df    AIC    BIC logLik deviance  Chisq Chi Df Pr(>Chisq)
## model_cat_null  3 -10380 -10364 5192.9   -10386                         
## model_cat       6 -10374 -10343 5193.2   -10386 0.5043      3     0.9179

overall_p <- overall_test$`Pr(>Chisq)`[2]

if(!is.na(overall_p)){
  if(overall_p < 0.05){
    cat("\n✔ Overall, gene prevalence differs significantly across time periods.\n")
  } else {
    cat("\n✖ No overall statistically significant difference in gene prevalence across time periods.\n")
  }
} else {
  cat("\n⚠ Could not extract overall p-value automatically. Please inspect the ANOVA table above.\n")
}

## 
## ✖ No overall statistically significant difference in gene prevalence across time periods.

Estimated marginal means by period

cat("\n============================\n")

## 
## ============================

cat("ESTIMATED MARGINAL MEANS BY PERIOD\n")

## ESTIMATED MARGINAL MEANS BY PERIOD

cat("============================\n")

## ============================

period_emm <- emmeans(model_cat, ~ period, type = "response")
emm_summary <- summary(period_emm, infer = TRUE)
print(emm_summary)

##  period response       SE  df asymp.LCL asymp.UCL null  z.ratio p.value
##  Early    0.2762 0.001250 Inf    0.2738    0.2787  0.5 -153.739  <.0001
##  Mid1     0.2759 0.000583 Inf    0.2748    0.2770  0.5 -330.582  <.0001
##  Mid2     0.2762 0.000464 Inf    0.2753    0.2772  0.5 -414.902  <.0001
##  Late     0.2762 0.000487 Inf    0.2752    0.2771  0.5 -395.364  <.0001
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the logit scale 
## Tests are performed on the logit scale

Pairwise comparisons between periods

cat("\n============================\n")

## 
## ============================

cat("PAIRWISE COMPARISONS BETWEEN PERIODS\n")

## PAIRWISE COMPARISONS BETWEEN PERIODS

cat("============================\n")

## ============================

period_pairs <- pairs(emmeans(model_cat, ~ period), adjust = "tukey")
pairs_summary <- summary(period_pairs, infer = TRUE)
print(pairs_summary)

##  contrast      estimate      SE  df asymp.LCL asymp.UCL z.ratio p.value
##  Early - Mid1  1.67e-03 0.00626 Inf  -0.01442   0.01776   0.267  0.9934
##  Early - Mid2 -9.97e-05 0.00626 Inf  -0.01617   0.01597  -0.016  1.0000
##  Early - Late  1.58e-04 0.00635 Inf  -0.01616   0.01647   0.025  1.0000
##  Mid1 - Mid2  -1.77e-03 0.00253 Inf  -0.00828   0.00474  -0.699  0.8976
##  Mid1 - Late  -1.51e-03 0.00281 Inf  -0.00873   0.00571  -0.538  0.9497
##  Mid2 - Late   2.58e-04 0.00189 Inf  -0.00459   0.00510   0.137  0.9991
## 
## Results are given on the log odds ratio (not the response) scale. 
## Confidence level used: 0.95 
## Conf-level adjustment: tukey method for comparing a family of 4 estimates 
## P value adjustment: tukey method for comparing a family of 4 estimates

Interpretation of estimated means and pairwise differences

cat("\n============================\n")

## 
## ============================

cat("INTERPRETATION: ESTIMATED MEAN PREVALENCE BY PERIOD\n")

## INTERPRETATION: ESTIMATED MEAN PREVALENCE BY PERIOD

cat("============================\n")

## ============================

emm_df <- as.data.frame(summary(period_emm, infer = TRUE))

resp_col <- intersect(c("response", "prob", "rate", "mean", "emmean"), names(emm_df))
if(length(resp_col) == 0){
  stop("Could not identify the response-scale mean column in emm_df.")
}
resp_col <- resp_col[1]

lower_col <- intersect(c("lower.CL", "asymp.LCL", "LCL"), names(emm_df))
upper_col <- intersect(c("upper.CL", "asymp.UCL", "UCL"), names(emm_df))

if(length(lower_col) == 0 || length(upper_col) == 0){
  stop("Could not identify confidence interval columns in emm_df. Check names(emm_df).")
}

lower_col <- lower_col[1]
upper_col <- upper_col[1]

for(i in seq_len(nrow(emm_df))){
  cat(
    paste0(
      as.character(emm_df$period[i]), ": estimated mean prevalence = ",
      round(as.numeric(emm_df[[resp_col]][i]), 3),
      " (95% CI ",
      round(as.numeric(emm_df[[lower_col]][i]), 3), " to ",
      round(as.numeric(emm_df[[upper_col]][i]), 3), ")\n"
    )
  )
}

## Early: estimated mean prevalence = 0.276 (95% CI 0.274 to 0.279)
## Mid1: estimated mean prevalence = 0.276 (95% CI 0.275 to 0.277)
## Mid2: estimated mean prevalence = 0.276 (95% CI 0.275 to 0.277)
## Late: estimated mean prevalence = 0.276 (95% CI 0.275 to 0.277)

cat("\n============================\n")

## 
## ============================

cat("INTERPRETATION: PAIRWISE DIFFERENCES\n")

## INTERPRETATION: PAIRWISE DIFFERENCES

cat("============================\n")

## ============================

pairs_df <- as.data.frame(pairs_summary)

period_means <- emm_df[[resp_col]]
names(period_means) <- as.character(emm_df$period)

sig_pairs <- pairs_df %>%
  filter(p.value < 0.05)

if(nrow(sig_pairs) == 0){

  cat("✖ No statistically significant pairwise differences were detected between periods after Tukey adjustment.\n")

} else {

  cat("✔ Statistically significant pairwise differences were detected between some periods:\n\n")

  for(i in seq_len(nrow(sig_pairs))){

    contrast_i <- as.character(sig_pairs$contrast[i])
    split_contrast <- strsplit(contrast_i, " - ")[[1]]

    p1 <- split_contrast[1]
    p2 <- split_contrast[2]

    m1 <- period_means[p1]
    m2 <- period_means[p2]

    if(is.na(m1) || is.na(m2)){
      direction_text <- paste0("Difference detected between ", p1, " and ", p2)
    } else if(m1 > m2){
      direction_text <- paste0(p1, " had higher estimated mean prevalence than ", p2)
    } else if(m2 > m1){
      direction_text <- paste0(p2, " had higher estimated mean prevalence than ", p1)
    } else {
      direction_text <- paste0(p1, " and ", p2, " had similar estimated means despite statistical significance")
    }

    cat(
      paste0(
        "- ", direction_text,
        " (adjusted p = ", signif(sig_pairs$p.value[i], 3), ").\n"
      )
    )
  }
}

## ✖ No statistically significant pairwise differences were detected between periods after Tukey adjustment.

Automated results text for period analysis

cat("\n============================\n")

## 
## ============================

cat("RESULTS TEXT TEMPLATE\n")

## RESULTS TEXT TEMPLATE

cat("============================\n")

## ============================

if(!is.na(overall_p) && overall_p < 0.05){

  cat(
    paste0(
      "As a post-hoc analysis, calendar time was categorized into four periods (Early, Mid1, Mid2, and Late), ",
      "and a beta mixed-effects model with a random intercept for patient ID was fitted. ",
      "There was evidence of an overall difference in mean gene prevalence across periods (likelihood ratio test p = ",
      signif(overall_p, 3), "). "
    )
  )

  if(nrow(sig_pairs) == 0){
    cat(
      "However, no individual pairwise differences remained statistically significant after Tukey adjustment.\n"
    )
  } else {
    cat(
      "Tukey-adjusted pairwise comparisons indicated that at least some periods differed significantly in estimated mean prevalence. ",
      "See the pairwise comparison output above for the specific contrasts.\n"
    )
  }

} else if(!is.na(overall_p) && overall_p >= 0.05){

  cat(
    paste0(
      "In a post-hoc analysis, calendar time was categorized into four periods (Early, Mid1, Mid2, and Late), ",
      "and a beta mixed-effects model with a random intercept for patient ID was fitted. ",
      "No overall difference in mean gene prevalence was detected across periods (likelihood ratio test p = ",
      signif(overall_p, 3), "). ",
      "Consistent with this, Tukey-adjusted pairwise comparisons did not indicate strong evidence of differences between periods."
    ),
    "\n"
  )

} else {

  cat(
    "The categorical period model was fitted successfully, but the overall p-value could not be extracted automatically. Please use the ANOVA table above for interpretation.\n"
  )
}

## In a post-hoc analysis, calendar time was categorized into four periods (Early, Mid1, Mid2, and Late), and a beta mixed-effects model with a random intercept for patient ID was fitted. No overall difference in mean gene prevalence was detected across periods (likelihood ratio test p = 0.918). Consistent with this, Tukey-adjusted pairwise comparisons did not indicate strong evidence of differences between periods.

Per-gene trend analysis

cat("\n============================\n")

## 
## ============================

cat("PER-GENE TREND ANALYSIS\n")

## PER-GENE TREND ANALYSIS

cat("============================\n")

## ============================

gene_long <- iso %>%
  select(patient_id, time_years, all_of(gene_cols)) %>%
  pivot_longer(
    cols = all_of(gene_cols),
    names_to = "gene",
    values_to = "presence"
  ) %>%
  mutate(
    patient_id = factor(patient_id),
    presence = case_when(
      presence %in% c(1, "1", TRUE) ~ 1L,
      presence %in% c(0, "0", FALSE) ~ 0L,
      TRUE ~ suppressWarnings(as.integer(as.character(presence)))
    )
  )

Function to fit one gene model

fit_gene_model <- function(dat) {

  dat <- dat %>%
    filter(!is.na(presence), !is.na(time_years), !is.na(patient_id))

  n_obs <- nrow(dat)
  n_patients <- dplyr::n_distinct(dat$patient_id)
  n_pos <- sum(dat$presence == 1, na.rm = TRUE)
  n_neg <- sum(dat$presence == 0, na.rm = TRUE)

  if(n_obs == 0 || n_pos == 0 || n_neg == 0){
    return(data.frame(
      n_obs = n_obs,
      n_patients = n_patients,
      n_pos = n_pos,
      prevalence = ifelse(n_obs > 0, n_pos / n_obs, NA_real_),
      beta = NA_real_,
      se = NA_real_,
      p_value = NA_real_,
      odds_ratio = NA_real_,
      ci_low = NA_real_,
      ci_high = NA_real_,
      note = "Not estimable: no variation in outcome"
    ))
  }

  fit <- try(
    glmmTMB(
      presence ~ time_years + (1 | patient_id),
      family = binomial(link = "logit"),
      data = dat
    ),
    silent = TRUE
  )

  if(inherits(fit, "try-error")){
    return(data.frame(
      n_obs = n_obs,
      n_patients = n_patients,
      n_pos = n_pos,
      prevalence = n_pos / n_obs,
      beta = NA_real_,
      se = NA_real_,
      p_value = NA_real_,
      odds_ratio = NA_real_,
      ci_low = NA_real_,
      ci_high = NA_real_,
      note = "Model failed to converge"
    ))
  }

  coef_table <- summary(fit)$coefficients$cond

  if(!"time_years" %in% rownames(coef_table)){
    return(data.frame(
      n_obs = n_obs,
      n_patients = n_patients,
      n_pos = n_pos,
      prevalence = n_pos / n_obs,
      beta = NA_real_,
      se = NA_real_,
      p_value = NA_real_,
      odds_ratio = NA_real_,
      ci_low = NA_real_,
      ci_high = NA_real_,
      note = "time_years coefficient not available"
    ))
  }

  beta <- coef_table["time_years", "Estimate"]
  se <- coef_table["time_years", "Std. Error"]
  p_value <- coef_table["time_years", "Pr(>|z|)"]
  odds_ratio <- exp(beta)
  ci_low <- exp(beta - 1.96 * se)
  ci_high <- exp(beta + 1.96 * se)

  note_text <- ifelse(
    n_pos < 5 || n_neg < 5,
    "Sparse outcome; interpret cautiously",
    "OK"
  )

  data.frame(
    n_obs = n_obs,
    n_patients = n_patients,
    n_pos = n_pos,
    prevalence = n_pos / n_obs,
    beta = beta,
    se = se,
    p_value = p_value,
    odds_ratio = odds_ratio,
    ci_low = ci_low,
    ci_high = ci_high,
    note = note_text
  )
}

Fit per-gene models

gene_results <- gene_long %>%
  group_by(gene) %>%
  group_modify(~ fit_gene_model(.x)) %>%
  ungroup() %>%
  mutate(
    p_adj = p.adjust(p_value, method = "fdr"),
    trend = case_when(
      is.na(p_adj) ~ "Not estimable",
      p_adj < 0.05 & beta > 0 ~ "Increasing",
      p_adj < 0.05 & beta < 0 ~ "Decreasing",
      TRUE ~ "No significant trend"
    ),
    interpretation = case_when(
      is.na(p_adj) ~ "Trend could not be estimated reliably.",
      p_adj < 0.05 & beta > 0 ~ "Gene presence increased significantly over time.",
      p_adj < 0.05 & beta < 0 ~ "Gene presence decreased significantly over time.",
      TRUE ~ "No statistically significant temporal trend after FDR correction."
    )
  ) %>%
  arrange(p_adj)

gene_results

Significant genes only

cat("\n============================\n")

## 
## ============================

cat("SIGNIFICANT PER-GENE TRENDS (FDR < 0.05)\n")

## SIGNIFICANT PER-GENE TRENDS (FDR < 0.05)

cat("============================\n")

## ============================

sig_genes <- gene_results %>%
  filter(!is.na(p_adj), p_adj < 0.05)

if(nrow(sig_genes) == 0){
  cat("✖ No genes showed a statistically significant temporal trend after FDR correction.\n")
} else {
  print(sig_genes)
}

## # A tibble: 5 × 15
##   gene  n_obs n_patients n_pos prevalence    beta     se     p_value odds_ratio
##   <chr> <int>      <int> <int>      <dbl>   <dbl>  <dbl>       <dbl>      <dbl>
## 1 TriA   1424        255  1071      0.752 -0.0881 0.0174 0.000000410      0.916
## 2 MexI   1424        255  1409      0.989  0.104  0.0259 0.0000543        1.11 
## 3 OpmD   1424        255  1409      0.989  0.104  0.0259 0.0000543        1.11 
## 4 PDC-3  1424        255   649      0.456 -0.110  0.0316 0.000506         0.896
## 5 MexG   1424        255  1399      0.982  0.127  0.0381 0.000840         1.14 
## # ℹ 6 more variables: ci_low <dbl>, ci_high <dbl>, note <chr>, p_adj <dbl>,
## #   trend <chr>, interpretation <chr>

Per-gene interpretation

cat("\n============================\n")

## 
## ============================

cat("INTERPRETATION\n")

## INTERPRETATION

cat("============================\n")

## ============================

n_tested <- sum(!is.na(gene_results$p_value))
n_sig <- sum(!is.na(gene_results$p_adj) & gene_results$p_adj < 0.05)
n_increasing <- sum(gene_results$trend == "Increasing", na.rm = TRUE)
n_decreasing <- sum(gene_results$trend == "Decreasing", na.rm = TRUE)

cat("Genes tested:", n_tested, "\n")

## Genes tested: 214

cat("Significant after FDR correction:", n_sig, "\n")

## Significant after FDR correction: 5

cat("Increasing trends:", n_increasing, "\n")

## Increasing trends: 3

cat("Decreasing trends:", n_decreasing, "\n")

## Decreasing trends: 2

if(n_sig == 0){

  cat("\nOverall interpretation:\n")
  cat("No individual resistance genes showed a statistically significant linear temporal trend after controlling for multiple testing using the false discovery rate (FDR).\n")

} else {

  cat("\nOverall interpretation:\n")
  cat("One or more individual resistance genes showed statistically significant temporal trends after FDR correction.\n\n")

  for(i in seq_len(nrow(sig_genes))){
    cat(
      paste0(
        "- ", sig_genes$gene[i], ": ",
        sig_genes$trend[i],
        " trend (β = ", round(sig_genes$beta[i], 4),
        ", OR = ", round(sig_genes$odds_ratio[i], 3),
        ", 95% CI ", round(sig_genes$ci_low[i], 3),
        " to ", round(sig_genes$ci_high[i], 3),
        ", FDR-adjusted p = ", signif(sig_genes$p_adj[i], 3), ").\n"
      )
    )
  }
}

## 
## Overall interpretation:
## One or more individual resistance genes showed statistically significant temporal trends after FDR correction.
## 
## - TriA: Decreasing trend (β = -0.0881, OR = 0.916, 95% CI 0.885 to 0.947, FDR-adjusted p = 8.78e-05).
## - MexI: Increasing trend (β = 0.1044, OR = 1.11, 95% CI 1.055 to 1.168, FDR-adjusted p = 0.00388).
## - OpmD: Increasing trend (β = 0.1044, OR = 1.11, 95% CI 1.055 to 1.168, FDR-adjusted p = 0.00388).
## - PDC-3: Decreasing trend (β = -0.1097, OR = 0.896, 95% CI 0.842 to 0.953, FDR-adjusted p = 0.0271).
## - MexG: Increasing trend (β = 0.1271, OR = 1.136, 95% CI 1.054 to 1.223, FDR-adjusted p = 0.036).

Results text for per-gene analysis

cat("\n============================\n")

## 
## ============================

cat("RESULTS TEXT TEMPLATE\n")

## RESULTS TEXT TEMPLATE

cat("============================\n")

## ============================

if(n_sig == 0){

  cat(
    paste0(
      "Per-gene temporal trend analyses were performed using mixed-effects logistic regression models with a random intercept for patient ID. ",
      n_tested, " genes were evaluable. After controlling for multiple testing using the false discovery rate, ",
      "no individual genes showed a statistically significant linear temporal trend. ",
      "These findings suggest that the presence of individual resistance genes remained broadly stable over time in this dataset."
    ),
    "\n"
  )

} else {

  cat(
    paste0(
      "Per-gene temporal trend analyses were performed using mixed-effects logistic regression models with a random intercept for patient ID. ",
      n_tested, " genes were evaluable, and ", n_sig,
      " showed statistically significant temporal trends after false discovery rate correction. ",
      n_increasing, " gene(s) showed increasing trends and ",
      n_decreasing, " gene(s) showed decreasing trends over time."
    ),
    "\n"
  )
}

## Per-gene temporal trend analyses were performed using mixed-effects logistic regression models with a random intercept for patient ID. 214 genes were evaluable, and 5 showed statistically significant temporal trends after false discovery rate correction. 3 gene(s) showed increasing trends and 2 gene(s) showed decreasing trends over time.

Visualize top genes

cat("\n============================\n")

## 
## ============================

cat("VISUALIZE TOP GENES\n")

## VISUALIZE TOP GENES

cat("============================\n")

## ============================

gene_results_plot <- gene_results %>%
  filter(!is.na(p_adj)) %>%
  arrange(p_adj)

if(nrow(gene_results_plot) == 0){
  stop("No genes with valid adjusted p-values available for plotting.")
}

top_genes <- gene_results_plot %>%
  slice_head(n = 6) %>%
  pull(gene)

cat("\nTop genes selected for plotting:\n")

## 
## Top genes selected for plotting:

print(top_genes)

## [1] "TriA"  "MexI"  "OpmD"  "PDC-3" "MexG"  "mexM"

plot_data <- gene_long %>%
  filter(gene %in% top_genes) %>%
  left_join(
    gene_results %>%
      select(gene, beta, odds_ratio, p_value, p_adj, trend, interpretation),
    by = "gene"
  ) %>%
  mutate(
    facet_label = paste0(
      gene,
      "\nTrend: ", trend,
      "\nFDR p = ", ifelse(is.na(p_adj), "NA", signif(p_adj, 3))
    )
  )

facet_order <- plot_data %>%
  distinct(gene, facet_label, p_adj) %>%
  arrange(p_adj) %>%
  pull(facet_label)

plot_data$facet_label <- factor(plot_data$facet_label, levels = facet_order)

pred_list <- lapply(top_genes, function(g){

  dat_g <- gene_long %>%
    filter(gene == g) %>%
    mutate(patient_id = factor(patient_id))

  if(length(unique(dat_g$presence[!is.na(dat_g$presence)])) < 2){
    return(NULL)
  }

  fit_g <- try(
    glmmTMB(
      presence ~ time_years + (1 | patient_id),
      family = binomial(link = "logit"),
      data = dat_g
    ),
    silent = TRUE
  )

  if(inherits(fit_g, "try-error")){
    return(NULL)
  }

  ref_patient <- levels(dat_g$patient_id)[1]

  newdat <- data.frame(
    time_years = seq(
      min(dat_g$time_years, na.rm = TRUE),
      max(dat_g$time_years, na.rm = TRUE),
      length.out = 200
    ),
    patient_id = factor(ref_patient, levels = levels(dat_g$patient_id))
  )

  newdat$pred <- predict(
    fit_g,
    newdata = newdat,
    type = "response",
    re.form = NA
  )

  res_row <- gene_results %>% filter(gene == g) %>% slice(1)

  newdat$gene <- g
  newdat$facet_label <- paste0(
    g,
    "\nTrend: ", res_row$trend,
    "\nFDR p = ", ifelse(is.na(res_row$p_adj), "NA", signif(res_row$p_adj, 3))
  )

  newdat
})

pred_data <- bind_rows(pred_list)
pred_data$facet_label <- factor(pred_data$facet_label, levels = facet_order)

p_top_genes <- ggplot(plot_data, aes(x = time_years, y = presence)) +
  geom_jitter(
    height = 0.05,
    width = 0.1,
    alpha = 0.25,
    size = 1.1
  ) +
  geom_line(
    data = pred_data,
    aes(x = time_years, y = pred),
    inherit.aes = FALSE,
    color = "steelblue",
    linewidth = 1
  ) +
  facet_wrap(~ facet_label, scales = "fixed") +
  labs(
    title = "Top Resistance Genes Over Time",
    subtitle = "Model-based predicted probabilities from mixed-effects logistic regression",
    x = "Time (years)",
    y = "Probability of gene presence"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    strip.text = element_text(face = "bold", size = 11),
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  )

print(p_top_genes)

Interpretation of top-gene figure

cat("\n============================\n")

## 
## ============================

cat("INTERPRETATION\n")

## INTERPRETATION

cat("============================\n")

## ============================

top_gene_summary <- gene_results %>%
  filter(gene %in% top_genes) %>%
  arrange(p_adj)

if(nrow(top_gene_summary) == 0){

  cat("No evaluable genes were available for visualization.\n")

} else {

  cat("The figure displays the genes with the smallest FDR-adjusted p-values from the per-gene mixed-effects logistic regression analysis.\n")
  cat("Each panel shows the observed binary data and the model-based predicted probability of gene presence over time.\n\n")

  for(i in seq_len(nrow(top_gene_summary))){
    cat(
      paste0(
        "- ", top_gene_summary$gene[i], ": ",
        top_gene_summary$trend[i],
        " trend",
        if(!is.na(top_gene_summary$beta[i])) paste0(" (β = ", round(top_gene_summary$beta[i], 4), ")") else "",
        if(!is.na(top_gene_summary$odds_ratio[i])) paste0(", OR = ", round(top_gene_summary$odds_ratio[i], 3)) else "",
        if(!is.na(top_gene_summary$p_adj[i])) paste0(", FDR-adjusted p = ", signif(top_gene_summary$p_adj[i], 3)) else "",
        ".\n"
      )
    )
  }

  sig_top <- top_gene_summary %>%
    filter(!is.na(p_adj), p_adj < 0.05)

  cat("\nOverall interpretation:\n")

  if(nrow(sig_top) == 0){
    cat("These genes were prioritized for visualization because they had the smallest adjusted p-values, but none remained statistically significant after false discovery rate correction. Accordingly, the plotted trajectories should be interpreted as descriptive rather than confirmatory evidence of temporal change.\n")
  } else {
    cat("At least one of the plotted genes remained statistically significant after false discovery rate correction. The fitted trajectories illustrate the direction and magnitude of those gene-specific temporal trends.\n")
  }
}

## The figure displays the genes with the smallest FDR-adjusted p-values from the per-gene mixed-effects logistic regression analysis.
## Each panel shows the observed binary data and the model-based predicted probability of gene presence over time.
## 
## - TriA: Decreasing trend (β = -0.0881), OR = 0.916, FDR-adjusted p = 8.78e-05.
## - MexI: Increasing trend (β = 0.1044), OR = 1.11, FDR-adjusted p = 0.00388.
## - OpmD: Increasing trend (β = 0.1044), OR = 1.11, FDR-adjusted p = 0.00388.
## - PDC-3: Decreasing trend (β = -0.1097), OR = 0.896, FDR-adjusted p = 0.0271.
## - MexG: Increasing trend (β = 0.1271), OR = 1.136, FDR-adjusted p = 0.036.
## - mexM: No significant trend trend (β = -0.2704), OR = 0.763, FDR-adjusted p = 0.0873.
## 
## Overall interpretation:
## At least one of the plotted genes remained statistically significant after false discovery rate correction. The fitted trajectories illustrate the direction and magnitude of those gene-specific temporal trends.

Combined top-gene trend plot using sample date

cat("\n============================\n")

## 
## ============================

cat("COMBINED TOP-GENE TREND PLOT (USING sample_date)\n")

## COMBINED TOP-GENE TREND PLOT (USING sample_date)

cat("============================\n")

## ============================

iso_plot <- iso %>%
  mutate(
    sample_date = as.Date(sample_date),
    sample_year = year(sample_date),
    patient_id = factor(patient_id)
  )

if(all(is.na(iso_plot$sample_date))){
  stop("sample_date could not be converted to Date. Check its format.")
}

cat("\nSample date range:\n")

## 
## Sample date range:

cat("First sample date:", as.character(min(iso_plot$sample_date, na.rm = TRUE)), "\n")

## First sample date: 1980-01-02

cat("Last sample date: ", as.character(max(iso_plot$sample_date, na.rm = TRUE)), "\n")

## Last sample date:  2020-12-03

cat("\nSample year range:\n")

## 
## Sample year range:

cat("Start year:", min(iso_plot$sample_year, na.rm = TRUE), "\n")

## Start year: 1980

cat("End year:  ", max(iso_plot$sample_year, na.rm = TRUE), "\n")

## End year:   2020

gene_results_plot <- gene_results %>%
  filter(!is.na(p_adj)) %>%
  arrange(p_adj)

if(nrow(gene_results_plot) == 0){
  stop("No genes with valid adjusted p-values available for plotting.")
}

top_genes <- gene_results_plot %>%
  slice_head(n = 6) %>%
  pull(gene)

cat("\nTop genes selected for plotting:\n")

## 
## Top genes selected for plotting:

print(top_genes)

## [1] "TriA"  "MexI"  "OpmD"  "PDC-3" "MexG"  "mexM"

plot_long <- iso_plot %>%
  select(patient_id, sample_date, sample_year, all_of(gene_cols)) %>%
  pivot_longer(
    cols = all_of(gene_cols),
    names_to = "gene",
    values_to = "presence"
  ) %>%
  mutate(
    presence = case_when(
      presence %in% c(1, "1", TRUE) ~ 1L,
      presence %in% c(0, "0", FALSE) ~ 0L,
      TRUE ~ suppressWarnings(as.integer(as.character(presence)))
    )
  ) %>%
  filter(
    gene %in% top_genes,
    !is.na(sample_year),
    !is.na(presence),
    !is.na(patient_id)
  )

if(nrow(plot_long) == 0){
  stop("No valid plotting data available after reshaping iso.")
}

start_year <- min(plot_long$sample_year, na.rm = TRUE)
end_year <- max(plot_long$sample_year, na.rm = TRUE)

cat("\nTime range used for plot:\n")

## 
## Time range used for plot:

cat("Start year =", start_year, "\n")

## Start year = 1980

cat("End year   =", end_year, "\n")

## End year   = 2020

pred_list <- lapply(top_genes, function(g){

  dat_g <- plot_long %>%
    filter(gene == g) %>%
    mutate(patient_id = factor(patient_id))

  if(length(unique(dat_g$presence[!is.na(dat_g$presence)])) < 2){
    return(NULL)
  }

  fit_g <- try(
    glmmTMB(
      presence ~ sample_year + (1 | patient_id),
      family = binomial(link = "logit"),
      data = dat_g
    ),
    silent = TRUE
  )

  if(inherits(fit_g, "try-error")){
    return(NULL)
  }

  ref_patient <- levels(dat_g$patient_id)[1]

  newdat <- data.frame(
    sample_year = seq(start_year, end_year, length.out = 200),
    patient_id = factor(ref_patient, levels = levels(dat_g$patient_id))
  )

  newdat$pred <- predict(
    fit_g,
    newdata = newdat,
    type = "response",
    re.form = NA
  )

  g_info <- gene_results %>%
    filter(gene == g) %>%
    slice(1)

  newdat$gene <- g
  newdat$trend <- g_info$trend
  newdat$p_adj <- g_info$p_adj

  newdat
})

pred_data <- bind_rows(pred_list)

if(nrow(pred_data) == 0){
  stop("Prediction data could not be generated for the selected genes.")
}

obs_yearly <- plot_long %>%
  group_by(gene, sample_year) %>%
  summarise(
    prevalence = mean(presence, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  )

p_top_combined <- ggplot() +
  geom_point(
    data = obs_yearly,
    aes(x = sample_year, y = prevalence, color = gene),
    alpha = 0.65,
    size = 2
  ) +
  geom_line(
    data = pred_data,
    aes(x = sample_year, y = pred, color = gene),
    linewidth = 1.2
  ) +
  scale_x_continuous(
    limits = c(start_year, end_year),
    breaks = seq(start_year, end_year, by = 5)
  ) +
  scale_y_continuous(
    labels = label_percent(accuracy = 1),
    limits = c(0, 1)
  ) +
  labs(
    title = "Temporal Trends of Top Resistance Genes",
    subtitle = paste0("Based on sample_date (", start_year, " to ", end_year, ")"),
    x = "Sample year",
    y = "Gene prevalence (%)",
    color = "Gene"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "right",
    panel.grid.minor = element_blank()
  )

print(p_top_combined)

Interpretation of combined plot

cat("\n============================\n")

## 
## ============================

cat("INTERPRETATION\n")

## INTERPRETATION

cat("============================\n")

## ============================

top_gene_summary <- gene_results %>%
  filter(gene %in% top_genes) %>%
  arrange(p_adj)

cat("This figure shows the top-ranked genes based on the smallest FDR-adjusted p-values from the per-gene trend analysis.\n")

## This figure shows the top-ranked genes based on the smallest FDR-adjusted p-values from the per-gene trend analysis.

cat("The x-axis is based on calendar year derived from iso$sample_date.\n")

## The x-axis is based on calendar year derived from iso$sample_date.

cat("Points show observed yearly prevalence, and colored lines show model-based predicted prevalence from mixed-effects logistic regression.\n\n")

## Points show observed yearly prevalence, and colored lines show model-based predicted prevalence from mixed-effects logistic regression.

for(i in seq_len(nrow(top_gene_summary))){
  cat(
    paste0(
      "- ", top_gene_summary$gene[i], ": ",
      top_gene_summary$trend[i],
      " trend",
      if(!is.na(top_gene_summary$odds_ratio[i])) paste0(", OR = ", round(top_gene_summary$odds_ratio[i], 3)) else "",
      if(!is.na(top_gene_summary$p_adj[i])) paste0(", FDR-adjusted p = ", signif(top_gene_summary$p_adj[i], 3)) else "",
      ".\n"
    )
  )
}

## - TriA: Decreasing trend, OR = 0.916, FDR-adjusted p = 8.78e-05.
## - MexI: Increasing trend, OR = 1.11, FDR-adjusted p = 0.00388.
## - OpmD: Increasing trend, OR = 1.11, FDR-adjusted p = 0.00388.
## - PDC-3: Decreasing trend, OR = 0.896, FDR-adjusted p = 0.0271.
## - MexG: Increasing trend, OR = 1.136, FDR-adjusted p = 0.036.
## - mexM: No significant trend trend, OR = 0.763, FDR-adjusted p = 0.0873.

sig_top <- top_gene_summary %>%
  filter(!is.na(p_adj), p_adj < 0.05)

cat("\nOverall interpretation:\n")

## 
## Overall interpretation:

if(nrow(sig_top) == 0){
  cat("Although these genes had the smallest adjusted p-values, none remained statistically significant after false discovery rate correction. Therefore, the plotted trajectories should be interpreted as descriptive rather than confirmatory.\n")
} else {
  cat("One or more plotted genes remained statistically significant after false discovery rate correction. The lines illustrate the direction and relative magnitude of gene-specific temporal change over calendar time.\n")
}

## One or more plotted genes remained statistically significant after false discovery rate correction. The lines illustrate the direction and relative magnitude of gene-specific temporal change over calendar time.

Results text for combined figure

cat("\n============================\n")

## 
## ============================

cat("RESULTS TEXT TEMPLATE\n")

## RESULTS TEXT TEMPLATE

cat("============================\n")

## ============================

if(nrow(sig_top) == 0){
  cat(
    paste0(
      "Figure X displays the top-ranked genes from the per-gene trend analysis using calendar year derived from sample_date. ",
      "Observed yearly prevalence values are shown as points, and mixed-effects logistic regression predictions are shown as colored lines. ",
      "Although these genes were prioritized for visualization, none remained statistically significant after false discovery rate correction, and the plotted trends should therefore be interpreted descriptively."
    ),
    "\n"
  )
} else {
  cat(
    paste0(
      "Figure X displays the top-ranked genes from the per-gene trend analysis using calendar year derived from sample_date. ",
      "Observed yearly prevalence values are shown as points, and mixed-effects logistic regression predictions are shown as colored lines. ",
      "One or more plotted genes remained statistically significant after false discovery rate correction, illustrating gene-specific increases or decreases in prevalence over calendar time."
    ),
    "\n"
  )
}

## Figure X displays the top-ranked genes from the per-gene trend analysis using calendar year derived from sample_date. Observed yearly prevalence values are shown as points, and mixed-effects logistic regression predictions are shown as colored lines. One or more plotted genes remained statistically significant after false discovery rate correction, illustrating gene-specific increases or decreases in prevalence over calendar time.

Notes

To render this file to PDF in RStudio:

rmarkdown::render("temporal_trend_analysis.Rmd", output_format = "pdf_document")

Make sure you have a LaTeX engine installed. A common option is TinyTeX:

install.packages("tinytex")
tinytex::install_tinytex()

Temporal Trend Analysis of Resistance Gene Prevalence

Melika Moradi

2026-03-27