1 Load data

2 QC Pipeline

QC is performed before rarefaction to identify and exclude low-quality samples that would otherwise drag down the rarefaction threshold for all samples.

bowtie_logs <- list.files(
  "/home/miguel/Documentos/analysis_sevilla_rescap/pipeline/results/bowtie2",
  pattern = "\\.bowtie2\\.log$", full.names = TRUE
)

qc_capture <- map_dfr(bowtie_logs, function(f) {
  txt       <- readLines(f)
  rate_line <- grep("overall alignment rate", txt, value = TRUE)
  rate      <- as.numeric(gsub("%.*", "", trimws(rate_line)))
  tibble(dataset = gsub("\\.bowtie2\\.log$", "", basename(f)), on_target_pct = rate)
}) %>%
  left_join(metadata, by = "dataset")

qc_capture %>%
  ggplot(aes(x = reorder(dataset, on_target_pct), y = on_target_pct, fill = Tipo)) +
  geom_col() +
  geom_hline(yintercept = 20, linetype = "dashed", color = "red", alpha = .5) +
  coord_flip() +
  scale_fill_manual(values = tipo_colors) +
  labs(title = "ResCap capture efficiency", x = NULL, y = "On-target reads (%)") +
  theme_classic() +
  theme(axis.text.y = element_text(size = 7))

kma_hits <- Tabla_inicial_prev %>%
  filter(p_value < 0.05) %>%
  group_by(dataset) %>%
  summarise(n_templates = n_distinct(template), .groups = "drop") %>%
  left_join(metadata, by = "dataset") %>%
  left_join(select(qc_capture, dataset, on_target_pct), by = "dataset")

kma_hits %>%
  ggplot(aes(x = on_target_pct, y = n_templates, color = Tipo, label = dataset)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "grey40", linetype = "dashed") +
  geom_text_repel(size = 2.5, max.overlaps = 10) +
  scale_color_manual(values = tipo_colors) +
  labs(title = "Capture efficiency vs resistome size",
       x = "On-target reads (%)", y = "KMA templates detected") +
  theme_classic()

datatable(arrange(kma_hits, desc(n_templates)), rownames = FALSE,
          options = list(pageLength = 15, scrollX = TRUE))
# Correlation between capture efficiency and raw gene richness (pre-rarefaction)
kma_hits %>%
  filter(!is.na(on_target_pct)) %>%
  ggplot(aes(x = on_target_pct, y = n_templates, color = Tipo, label = dataset)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "grey40", linetype = "dashed") +
  geom_text_repel(size = 2.5, max.overlaps = 10) +
  scale_color_manual(values = tipo_colors) +
  labs(title = "Capture efficiency vs gene richness (pre-rarefaction)",
       x = "On-target reads (%)", y = "KMA templates detected") +
  theme_classic()

# Total depth per sample — flag outliers before rarefaction
sample_depth <- Tabla_inicial_raw_collapsed %>%
  group_by(dataset) %>%
  summarise(total_depth = sum(depth), .groups = "drop") %>%
  left_join(metadata %>% distinct(dataset, Tipo), by = "dataset") %>%
  arrange(total_depth)

depth_threshold <- quantile(sample_depth$total_depth, 0.05)  # bottom 5%

sample_depth %>%
  ggplot(aes(x = reorder(dataset, total_depth), y = total_depth, fill = Tipo)) +
  geom_col() +
  geom_hline(yintercept = depth_threshold, linetype = "dashed", color = "red") +
  coord_flip() +
  scale_fill_manual(values = tipo_colors) +
  labs(title = "Total depth per sample (red = 5th percentile threshold)",
       x = NULL, y = "Total depth") +
  theme_classic() +
  theme(axis.text.y = element_text(size = 7))

# Samples below threshold — review before deciding to exclude
low_depth_samples <- sample_depth %>% filter(total_depth < depth_threshold)
if (nrow(low_depth_samples) > 0) {
  cat("Samples below 5th percentile depth threshold (", round(depth_threshold, 1), "):\n")
  print(low_depth_samples)
}
## Samples below 5th percentile depth threshold ( 466815.4 ):
## # A tibble: 3 × 3
##   dataset total_depth Tipo  
##   <chr>         <dbl> <chr> 
## 1 20_S10      348322. Salida
## 2 18_S9       419057. Salida
## 3 16_S7       466300. Salida
# ── Exclude outlier samples here if needed ────────────────────────────────────
# samples_to_exclude <- c("SAMPLE_ID_1", "SAMPLE_ID_2")
# Tabla_inicial_raw_collapsed <- Tabla_inicial_raw_collapsed %>%
#   filter(!dataset %in% samples_to_exclude)

3 Rarefaction (normalisation)

All databases (ABR + BAC + REL) are rarefied together since all reads come from the same Illumina run. KMA depth (coverage normalised by gene length) is used as an integer proxy for read counts. 1 000 rarefactions are averaged in parallel to reduce stochasticity.

# Rarefaction curves to assess whether raremax captures sufficient diversity
Tabla_rare_pre <- Tabla_inicial_raw_collapsed %>%
  select(template, depth, dataset) %>%
  group_by(dataset, template) %>%
  summarise(depth = sum(depth), .groups = "drop") %>%
  pivot_wider(names_from = template, values_from = depth, values_fill = 0) %>%
  column_to_rownames("dataset")

Tabla_rare_pre[] <- lapply(Tabla_rare_pre, as.integer)

# Colour by Tipo
tipo_vec <- metadata %>%
  filter(dataset %in% rownames(Tabla_rare_pre)) %>%
  arrange(match(dataset, rownames(Tabla_rare_pre))) %>%
  pull(Tipo)
col_vec <- tipo_colors[tipo_vec]

rarecurve(Tabla_rare_pre, step = 50, col = col_vec,
          label = FALSE, main = "Rarefaction curves (colour = Tipo)",
          ylab = "Gene richness", xlab = "Depth")
abline(v = min(rowSums(Tabla_rare_pre)), lty = 2, col = "black")
legend("bottomright", legend = names(tipo_colors), fill = tipo_colors, bty = "n")

3.1 Rarefaction summary

tibble(
  Metric = c("Rarefaction depth (raremax)", "Samples", "Genes before", "Genes after"),
  Value  = c(raremax, nrow(Tabla_rare),
             ncol(Tabla_rare),
             n_distinct(Tabla_inicial$template))
) %>% knitr::kable()
Metric Value
Rarefaction depth (raremax) 347690
Samples 48
Genes before 13308
Genes after 13305

3.2 Depth before rarefaction

Tabla_inicial_raw_collapsed %>%
  group_by(database, dataset) %>%
  summarise(total_depth = sum(depth), .groups = "drop") %>%
  left_join(metadata %>% distinct(dataset, Tipo), by = "dataset") %>%
  ggplot(aes(x = reorder(dataset, total_depth), y = total_depth, fill = Tipo)) +
  geom_col() +
  facet_wrap(~database, scales = "free") +
  scale_fill_manual(values = tipo_colors) +
  labs(title = "Total depth per sample — before rarefaction", x = NULL, y = "Total depth") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5),
        strip.background = element_blank())

3.3 Depth after rarefaction

Tabla_inicial %>%
  group_by(database, dataset, Tipo) %>%
  summarise(total_depth = sum(depth), .groups = "drop") %>%
  ggplot(aes(x = reorder(dataset, total_depth), y = total_depth, fill = Tipo)) +
  geom_col() +
  facet_wrap(~database, scales = "free") +
  scale_fill_manual(values = tipo_colors) +
  labs(title = "Total depth per sample — after rarefaction", x = NULL, y = "Total depth") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5),
        strip.background = element_blank())

4 Diversity index

4.1 Shapiro test

diver_gral %>%
  group_by(database2, index, Tipo) %>%
  group_modify(~ tidy(shapiro.test(.x$value))) %>%
  datatable(rownames = FALSE, filter = "top",
            options = list(pageLength = 30, scrollX = TRUE))

4.2 Kruskal-Wallis test

diver_gral %>%
  group_by(database2, index) %>%
  group_modify(~ tidy(kruskal.test(.x$value ~ .x$Tipo))) %>%
  datatable(rownames = FALSE, filter = "top",
            options = list(pageLength = 30, scrollX = TRUE))

4.3 Shannon Index

plot_diversity("shannon", "Shannon diversity")

diver_gral %>% filter(index == "shannon") %>%
  group_by(Tipo, database2) %>%
  summarise(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE), .groups = "drop") %>%
  datatable(rownames = FALSE, filter = "top", options = list(pageLength = 30, scrollX = TRUE))
grouped_ggbetweenstats(
  data = diver_gral %>% filter(index == "shannon", !database2 %in% c("gral","BAC")),
  x = Tipo, y = value, type = "np", grouping.var = database2,
  ggplot.component = list(scale_fill_manual(values = tipo_colors), theme_rescap)
)

4.4 Observed richness

plot_diversity("richness", "Observed gene richness")

diver_gral %>% filter(index == "richness") %>%
  group_by(Tipo, database2) %>%
  summarise(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE), .groups = "drop") %>%
  datatable(rownames = FALSE, filter = "top", options = list(pageLength = 30, scrollX = TRUE))
grouped_ggbetweenstats(
  data = diver_gral %>% filter(index == "richness", !database2 %in% c("gral","BAC")),
  x = Tipo, y = value, type = "np", grouping.var = database2,
  ggplot.component = list(scale_fill_manual(values = tipo_colors), theme_rescap)
)

4.5 Dominance Simpson

plot_diversity("dominance.simpson", "Dominance Simpson")

grouped_ggbetweenstats(
  data = diver_gral %>% filter(index == "dominance.simpson", !database2 %in% c("gral","BAC")),
  x = Tipo, y = value, type = "np", grouping.var = database2
)

4.6 Evenness Simpson

plot_diversity("evenness.simpson", "Evenness Simpson")

grouped_ggbetweenstats(
  data = diver_gral %>% filter(index == "evenness.simpson", !database2 %in% c("gral","BAC")),
  x = Tipo, y = value, type = "np", grouping.var = database2
)

5 PCA & UMAP

5.1 PCA — General

plot_pca("gral", "Tipo",        "PCA General — by Tipo") +
  scale_color_manual(values = tipo_colors)

plot_pca("gral", "EDAR_Nombre", "PCA General — by EDAR")

5.2 PCA — ABR / BAC / REL

plot_pca("ABR",        "Tipo", "PCA ABR")         + scale_color_manual(values = tipo_colors)

plot_pca("BAC",        "Tipo", "PCA BAC")         + scale_color_manual(values = tipo_colors)

plot_pca("REL",        "Tipo", "PCA REL")         + scale_color_manual(values = tipo_colors)

plot_pca("BAC_Metal",  "Tipo", "PCA BAC Metals")  + scale_color_manual(values = tipo_colors)

plot_pca("BAC_Biocide","Tipo", "PCA BAC Biocide") + scale_color_manual(values = tipo_colors)

5.3 UMAP — General

plot_umap("gral", "Tipo", "UMAP General") + scale_color_manual(values = tipo_colors)

5.4 UMAP — ABR / BAC / REL

plot_umap("ABR",        "Tipo", "UMAP ABR")         + scale_color_manual(values = tipo_colors)

plot_umap("BAC",        "Tipo", "UMAP BAC")         + scale_color_manual(values = tipo_colors)

plot_umap("REL",        "Tipo", "UMAP REL")         + scale_color_manual(values = tipo_colors)

plot_umap("BAC_Metal",  "Tipo", "UMAP BAC Metals")  + scale_color_manual(values = tipo_colors)

plot_umap("BAC_Biocide","Tipo", "UMAP BAC Biocide") + scale_color_manual(values = tipo_colors)

6 Adonis (PERMANOVA)

6.1 General — Tipo

run_adonis(spread_list$gral, "Tipo")

6.2 General — EDAR

run_adonis(spread_list$gral, "EDAR_Nombre")

6.3 ABR

run_adonis(spread_list$ABR, "Tipo")

6.4 BAC

run_adonis(spread_list$BAC, "Tipo")

6.5 REL

run_adonis(spread_list$REL, "Tipo")

7 Gene statistics

7.1 Distinct genes by Tipo

Tabla_inicial %>%
  group_by(database, Tipo, template) %>% summarise(.groups = "drop") %>%
  count(Tipo, database, name = "Different_genes") %>%
  ggplot(aes(x = Tipo, y = Different_genes, fill = database)) +
  geom_col() + theme_rescap + theme(legend.position = "right", aspect.ratio = 1)

Tabla_inicial %>%
  group_by(database, Tipo, dataset) %>%
  summarise(n_genes = n_distinct(template), .groups = "drop") %>%
  grouped_ggbetweenstats(
    data = ., x = Tipo, y = n_genes, grouping.var = database, type = "np"
  )

7.2 Proportion by Tipo & EDAR

Tabla_inicial %>%
  group_by(database, Tipo, template) %>% summarise(.groups = "drop") %>%
  count(Tipo, database, name = "n") %>%
  ggplot(aes(x = Tipo, y = n, fill = database)) +
  geom_col(position = "fill") + theme_rescap + theme(legend.position = "right", aspect.ratio = 1)

Tabla_inicial %>%
  group_by(database, EDAR_Nombre, template) %>% summarise(.groups = "drop") %>%
  count(EDAR_Nombre, database, name = "n") %>%
  ggplot(aes(x = EDAR_Nombre, y = n, fill = database)) +
  geom_col(position = "fill") + theme_rescap + theme(legend.position = "right", aspect.ratio = 1)

Tabla_inicial %>%
  group_by(database, Fecha, template) %>% summarise(.groups = "drop") %>%
  count(Fecha, database, name = "n") %>%
  ggplot(aes(x = Fecha, y = n, fill = database)) +
  geom_col(position = "fill") + theme_rescap + theme(legend.position = "right", aspect.ratio = 1)

7.3 Proportion per sample

Tabla_inicial %>%
  group_by(Tipo, database, dataset) %>%
  summarise(n_genes = n_distinct(template), .groups = "drop") %>%
  ggplot(aes(x = dataset, y = n_genes, fill = database)) +
  geom_col(position = "fill") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_blank())

7.4 Kruskal-Wallis genes by Tipo

Tabla_inicial %>%
  group_by(database, dataset, Tipo) %>%
  summarise(n_genes = n_distinct(template), .groups = "drop") %>%
  group_by(database) %>%
  group_modify(~ tidy(kruskal.test(.x$n_genes ~ .x$Tipo))) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
  datatable(rownames = FALSE, filter = "top", options = list(pageLength = 30, scrollX = TRUE))

8 Top ARGs

8.1 Top 20 ARGs — heatmap

top20_genes <- Tabla_inicial %>%
  filter(database == "ABR") %>%
  group_by(template) %>%
  summarise(mean_depth = mean(depth), .groups = "drop") %>%
  slice_max(mean_depth, n = 20) %>%
  pull(template)

top20_mat <- Tabla_inicial %>%
  filter(database == "ABR", template %in% top20_genes) %>%
  group_by(dataset, template) %>%
  summarise(depth = sum(depth), .groups = "drop") %>%
  pivot_wider(names_from = template, values_from = depth, values_fill = 0) %>%
  column_to_rownames("dataset")

annot_row <- Tabla_inicial %>%
  distinct(dataset, Tipo, EDAR_Nombre) %>%
  column_to_rownames("dataset")

annot_col <- abr_annotations %>%
  filter(template %in% top20_genes) %>%
  select(template, Class) %>%
  distinct() %>%
  mutate(Class = sub(",.*", "", Class)) %>%
  column_to_rownames("template")

pheatmap(log1p(top20_mat),
         annotation_row  = annot_row,
         annotation_col  = annot_col,
         clustering_distance_rows = "euclidean",
         clustering_distance_cols = "euclidean",
         fontsize_col = 8, fontsize_row = 7,
         main = "Top 20 ARGs by mean depth (log1p)")

8.2 Top 20 ARGs — bar chart

Tabla_inicial %>%
  filter(database == "ABR", template %in% top20_genes) %>%
  group_by(template, Tipo) %>%
  summarise(mean_depth = mean(depth), .groups = "drop") %>%
  left_join(abr_annotations %>% mutate(Class = sub(",.*", "", Class)) %>%
              select(template, Class) %>% distinct(), by = "template") %>%
  ggplot(aes(x = reorder(template, mean_depth), y = mean_depth, fill = Class)) +
  geom_col() +
  facet_wrap(~Tipo, scales = "free_x") +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE, option = "D") +
  labs(title = "Top 20 ARGs — mean depth by sample type",
       x = NULL, y = "Mean depth") +
  theme_classic() +
  theme(strip.background = element_blank())

8.3 Gene overlap (UpSet)

# Genes present in each Tipo (in ≥1 sample)
genes_by_tipo <- Tabla_inicial %>%
  filter(database == "ABR") %>%
  group_by(Tipo, template) %>%
  summarise(.groups = "drop") %>%
  split(.$Tipo) %>%
  map(~ pull(.x, template))

upset(fromList(genes_by_tipo),
      order.by = "freq",
      nsets    = length(genes_by_tipo),
      text.scale = 1.3,
      mainbar.y.label = "Shared ARGs",
      sets.x.label    = "ARGs per Tipo")

# Genes present per EDAR
genes_by_edar <- Tabla_inicial %>%
  filter(database == "ABR") %>%
  group_by(EDAR_Nombre, template) %>%
  summarise(.groups = "drop") %>%
  split(.$EDAR_Nombre) %>%
  map(~ pull(.x, template))

upset(fromList(genes_by_edar),
      order.by = "freq",
      nsets    = min(length(genes_by_edar), 8),
      text.scale = 1.1,
      mainbar.y.label = "Shared ARGs",
      sets.x.label    = "ARGs per EDAR")

9 ABR diversity

9.1 By Tipo

plot_abr_bar(abr_pct(abr_base, "Tipo"), "Tipo")

9.2 By EDAR (Hospital)

plot_abr_bar(abr_pct(filter(abr_base, Tipo == "Hospital"), "EDAR_Nombre"), "EDAR_Nombre")

9.3 By EDAR (Entrada)

plot_abr_bar(abr_pct(filter(abr_base, Tipo == "Entrada"), "EDAR_Nombre"), "EDAR_Nombre")

9.4 By EDAR (Salida)

plot_abr_bar(abr_pct(filter(abr_base, Tipo == "Salida"), "EDAR_Nombre"), "EDAR_Nombre")

9.5 By Tipo per EDAR

walk(edar_names, function(edar) {
  print(plot_abr_bar(abr_pct(filter(abr_base, EDAR_Nombre == edar), "Tipo"), "Tipo") +
          ggtitle(edar))
})

9.6 Temporal by EDAR (Salida)

walk(edar_names, function(edar) {
  dat <- filter(abr_base, EDAR_Nombre == edar, Tipo == "Salida")
  if (nrow(dat) > 0)
    print(plot_abr_bar(abr_pct(dat, "Fecha"), "Fecha") +
            ggtitle(paste(edar, "— Salida over time")))
})

10 BAC / REL diversity

10.1 BAC — by Class_compound

bac_pct(filter(Tabla_inicial_info, database == "BAC"), "Class_compound", "Tipo") %>%
  ggplot(aes(x = Tipo, y = pct, fill = Class_compound)) +
  geom_col(position = "fill") +
  scale_fill_viridis(discrete = TRUE, option = "D") + theme_classic()

10.2 BAC Metal — by Compound

bac_pct(filter(Tabla_inicial_info, database_BAC == "BAC_Metal"), "Compound", "Tipo") %>%
  ggplot(aes(x = Tipo, y = pct, fill = Compound)) +
  geom_col(position = "fill") +
  scale_fill_viridis(discrete = TRUE, option = "D") + theme_classic()

10.3 BAC Biocide — by Class_compound

bac_pct(filter(Tabla_inicial_info, database_BAC == "BAC_Biocide"), "Class_compound", "Tipo") %>%
  ggplot(aes(x = Tipo, y = pct, fill = Class_compound)) +
  geom_col(position = "fill") +
  scale_fill_viridis(discrete = TRUE, option = "D") + theme_classic()

10.4 REL — by mobility type

Tabla_inicial_info %>%
  filter(database == "REL") %>%
  separate(template, c("Tipo1","Gen"), sep = "\\|", remove = FALSE) %>%
  bac_pct("Tipo1", "Tipo") %>%
  ggplot(aes(x = Tipo, y = pct, fill = Tipo1)) +
  geom_col(position = "fill") +
  scale_fill_viridis(discrete = TRUE, option = "D") + theme_classic()

11 Temporal dynamics

11.1 ARG richness over time

Tabla_inicial %>%
  filter(database == "ABR", Tipo %in% c("Entrada","Salida")) %>%
  group_by(dataset, Fecha, EDAR_Nombre, Tipo) %>%
  summarise(n_genes = n_distinct(template), .groups = "drop") %>%
  ggplot(aes(x = Fecha, y = n_genes, color = Tipo,
             group = interaction(EDAR_Nombre, Tipo))) +
  geom_line() + geom_point(size = 2) +
  facet_wrap(~EDAR_Nombre, scales = "free_y") +
  scale_color_manual(values = tipo_colors) +
  labs(title = "ARG richness over time by EDAR", x = NULL, y = "Number of ARGs") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.background = element_blank())

11.2 Hospital resistome over time

Tabla_inicial %>%
  filter(database == "ABR", Tipo == "Hospital") %>%
  group_by(dataset, Fecha, EDAR_Nombre) %>%
  summarise(n_genes = n_distinct(template), .groups = "drop") %>%
  ggplot(aes(x = Fecha, y = n_genes, color = EDAR_Nombre, group = EDAR_Nombre)) +
  geom_line() + geom_point(size = 2) +
  labs(title = "Hospital ARG richness over time", x = NULL,
       y = "Number of ARGs", color = "Hospital") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

12 EDAR treatment efficiency

12.1 ARG removal per EDAR

edar_efficiency <- Tabla_inicial %>%
  filter(database == "ABR", Tipo %in% c("Entrada","Salida")) %>%
  group_by(Fecha, EDAR_Nombre, Tipo) %>%
  summarise(total_depth = sum(depth), n_genes = n_distinct(template), .groups = "drop") %>%
  pivot_wider(names_from = Tipo, values_from = c(total_depth, n_genes)) %>%
  mutate(
    removal_depth_pct = (total_depth_Entrada - total_depth_Salida) / total_depth_Entrada * 100,
    removal_genes_pct = (n_genes_Entrada    - n_genes_Salida)    / n_genes_Entrada    * 100
  )

edar_efficiency %>%
  pivot_longer(cols = c(removal_depth_pct, removal_genes_pct),
               names_to = "metric", values_to = "removal") %>%
  mutate(metric = recode(metric,
    removal_depth_pct = "Abundance (depth)",
    removal_genes_pct = "Richness (n genes)")) %>%
  ggplot(aes(x = Fecha, y = removal, fill = EDAR_Nombre)) +
  geom_col(position = "dodge") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  facet_wrap(~metric) +
  labs(title = "ARG removal efficiency (Entrada → Salida)",
       y = "% reduction", x = NULL, fill = "EDAR") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.background = element_blank())

datatable(arrange(edar_efficiency, EDAR_Nombre, Fecha), rownames = FALSE,
          options = list(pageLength = 20, scrollX = TRUE))

12.2 BAC / REL removal per EDAR

map_dfr(c("BAC","REL"), function(db) {
  Tabla_inicial %>%
    filter(database == db, Tipo %in% c("Entrada","Salida")) %>%
    group_by(Fecha, EDAR_Nombre, Tipo) %>%
    summarise(total_depth = sum(depth), n_genes = n_distinct(template), .groups = "drop") %>%
    pivot_wider(names_from = Tipo, values_from = c(total_depth, n_genes)) %>%
    mutate(
      database = db,
      removal_depth_pct = (total_depth_Entrada - total_depth_Salida) / total_depth_Entrada * 100,
      removal_genes_pct = (n_genes_Entrada    - n_genes_Salida)    / n_genes_Entrada    * 100
    )
}) %>%
  pivot_longer(cols = c(removal_depth_pct, removal_genes_pct),
               names_to = "metric", values_to = "removal") %>%
  mutate(metric = recode(metric,
    removal_depth_pct = "Abundance (depth)",
    removal_genes_pct = "Richness (n genes)")) %>%
  ggplot(aes(x = Fecha, y = removal, fill = EDAR_Nombre)) +
  geom_col(position = "dodge") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  facet_grid(database ~ metric) +
  labs(title = "BAC / REL removal efficiency (Entrada → Salida)",
       y = "% reduction", x = NULL, fill = "EDAR") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.background = element_blank())

12.3 Antibiotic class removal

class_removal <- Tabla_inicial_info %>%
  filter(database == "ABR", Tipo %in% c("Entrada","Salida")) %>%
  mutate(Class = sub(",.*", "", Class)) %>%
  group_by(Fecha, EDAR_Nombre, Tipo, Class) %>%
  summarise(total_depth = sum(depth), .groups = "drop") %>%
  pivot_wider(names_from = Tipo, values_from = total_depth, values_fill = 0) %>%
  mutate(removal_pct = (Entrada - Salida) / (Entrada + 1) * 100)

class_removal %>%
  group_by(EDAR_Nombre, Class) %>%
  summarise(mean_removal = mean(removal_pct, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = reorder(Class, mean_removal), y = mean_removal, fill = mean_removal > 0)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~EDAR_Nombre) +
  scale_fill_manual(values = c("TRUE" = "#00A087", "FALSE" = "#E64B35"),
                    labels = c("TRUE" = "Removed", "FALSE" = "Enriched")) +
  labs(title = "Mean ARG removal by antibiotic class per EDAR",
       x = NULL, y = "Mean % reduction", fill = NULL) +
  theme_classic() +
  theme(strip.background = element_blank(), axis.text.y = element_text(size = 7))

12.4 EDAR comparison

Tabla_inicial %>%
  filter(database == "ABR") %>%
  group_by(EDAR_Nombre, Tipo, Fecha, dataset) %>%
  summarise(n_genes = n_distinct(template), .groups = "drop") %>%
  ggplot(aes(x = EDAR_Nombre, y = n_genes, fill = Tipo)) +
  geom_boxplot(outlier.size = 1) +
  scale_fill_manual(values = tipo_colors) +
  labs(title = "ARG richness by EDAR and sample type", x = NULL, y = "Number of ARGs") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

13 Clinical priority ARGs

13.1 Prevalence by sample type

clinical_df %>%
  group_by(Tipo, gene_group, dataset) %>%
  summarise(present = n() > 0, .groups = "drop") %>%
  group_by(Tipo, gene_group) %>%
  summarise(prevalence = mean(present) * 100, .groups = "drop") %>%
  ggplot(aes(x = gene_group, y = prevalence, fill = Tipo)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = tipo_colors) +
  labs(title = "Prevalence of clinical priority ARGs", x = NULL, y = "% samples positive") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

13.2 Heatmap

clinical_matrix <- clinical_df %>%
  group_by(dataset, template) %>%
  summarise(depth = sum(depth), .groups = "drop") %>%
  pivot_wider(names_from = template, values_from = depth, values_fill = 0) %>%
  column_to_rownames("dataset")

if (ncol(clinical_matrix) > 1) {
  annot <- clinical_df %>% distinct(dataset, Tipo) %>% column_to_rownames("dataset")
  pheatmap(log1p(clinical_matrix),
           annotation_row  = annot,
           clustering_distance_rows = "euclidean",
           clustering_distance_cols = "euclidean",
           fontsize_col = 7, fontsize_row = 8,
           main = "Clinical priority ARGs (log1p depth)")
}

14 ARG co-occurrence

14.1 ARG–ARG Spearman correlations

abr_wide      <- spread_list$ABR
min_prev      <- ceiling(nrow(abr_wide) * 0.25)
abr_wide_filt <- abr_wide[, colSums(abr_wide > 0) >= min_prev]

if (ncol(abr_wide_filt) >= 4) {
  cor_res <- rcorr(as.matrix(abr_wide_filt), type = "spearman")
  cor_mat <- cor_res$r
  cor_mat[cor_res$P > 0.05] <- 0
  diag(cor_mat) <- NA

  pheatmap(cor_mat,
           color  = colorRampPalette(c("#2166AC","white","#B2182B"))(100),
           breaks = seq(-1, 1, length.out = 101),
           na_col = "grey90",
           clustering_distance_rows = "euclidean",
           clustering_distance_cols = "euclidean",
           fontsize = 7,
           main = paste0("ARG co-occurrence (Spearman, p<0.05, n=",
                         ncol(abr_wide_filt)," genes)"))
}

14.2 ARG–BAC / REL co-selection

db_totals <- Tabla_inicial %>%
  group_by(database, dataset) %>%
  summarise(depth = sum(depth), richness = n_distinct(template), .groups = "drop") %>%
  pivot_wider(names_from = database, values_from = c(depth, richness),
              names_glue = "{database}_{.value}") %>%
  left_join(metadata %>% distinct(dataset, Tipo), by = "dataset")

p1 <- db_totals %>%
  ggplot(aes(x = log1p(ABR_depth), y = log1p(BAC_depth), color = Tipo)) +
  geom_point(size = 2) + geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = tipo_colors) +
  labs(title = "ARG vs MRG/BRG", x = "log(ARG depth)", y = "log(MRG/BRG depth)") +
  theme_classic()

p2 <- db_totals %>%
  ggplot(aes(x = log1p(ABR_depth), y = log1p(REL_depth), color = Tipo)) +
  geom_point(size = 2) + geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = tipo_colors) +
  labs(title = "ARG vs Relaxases", x = "log(ARG depth)", y = "log(Relaxase depth)") +
  theme_classic()

grid.arrange(p1, p2, ncol = 2)

map_dfr(list(
  list(x = "ABR_depth", y = "BAC_depth", label = "ARG vs MRG/BRG"),
  list(x = "ABR_depth", y = "REL_depth", label = "ARG vs Relaxases")
), function(pair) {
  ct <- cor.test(log1p(db_totals[[pair$x]]), log1p(db_totals[[pair$y]]),
                 method = "spearman", use = "complete.obs")
  tibble(Comparison = pair$label,
         rho     = round(ct$estimate, 3),
         p_value = round(ct$p.value, 4))
}) %>%
  datatable(rownames = FALSE, options = list(pageLength = 5))

15 Hospital-exclusive resistome

Genes detected in Hospital samples but absent from all Entrada and Salida samples across the dataset.

15.1 Overview

cat("Hospital-exclusive ARGs (absent from all EDAR samples):", length(genes_hosp_only), "\n")
## Hospital-exclusive ARGs (absent from all EDAR samples): 110
hosp_excl %>%
  count(Class, sort = TRUE) %>%
  ggplot(aes(x = reorder(Class, n), y = n, fill = Class)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE, option = "D") +
  labs(title = "Hospital-exclusive ARGs by antibiotic class",
       x = NULL, y = "Number of genes") +
  theme_classic()

15.2 By hospital

hosp_excl %>%
  group_by(EDAR_Nombre, template, Class) %>%
  summarise(mean_depth = mean(depth), .groups = "drop") %>%
  ggplot(aes(x = reorder(template, mean_depth), y = mean_depth, fill = Class)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~EDAR_Nombre, scales = "free") +
  scale_fill_viridis(discrete = TRUE, option = "D") +
  labs(title = "Hospital-exclusive ARGs — mean depth by hospital",
       x = NULL, y = "Mean depth") +
  theme_classic() +
  theme(strip.background = element_blank(), axis.text.y = element_text(size = 7))

15.3 Table

hosp_excl %>%
  group_by(template, Class, Phenotype, EDAR_Nombre) %>%
  summarise(n_samples = n_distinct(dataset),
            mean_depth = round(mean(depth), 3), .groups = "drop") %>%
  arrange(desc(mean_depth)) %>%
  datatable(rownames = FALSE, filter = "top",
            options = list(pageLength = 20, scrollX = TRUE))

15.4 Clinical priority — hospital only

hosp_excl %>%
  filter(template %in% clinical_df$template) %>%
  group_by(template, Class, EDAR_Nombre, Fecha) %>%
  summarise(depth = sum(depth), .groups = "drop") %>%
  ggplot(aes(x = Fecha, y = depth, color = EDAR_Nombre, group = interaction(template, EDAR_Nombre))) +
  geom_line() + geom_point(size = 2) +
  facet_wrap(~template, scales = "free_y") +
  labs(title = "Clinical priority ARGs — hospital exclusive, over time",
       x = NULL, y = "Depth", color = "Hospital") +
  theme_classic() +
  theme(strip.background = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))

16 EDAR-specific resistome

Analysis of genes exclusive to EDAR samples (Entrada and Salida) and genes that distinguish Entrada from Salida (potentially enriched or eliminated during treatment).

16.1 EDAR vs Hospital overlap

upset(fromList(list(
  Hospital = genes_hosp,
  Entrada  = genes_entrada,
  Salida   = genes_salida
)),
order.by  = "freq",
text.scale = 1.3,
mainbar.y.label = "Shared ARGs",
sets.x.label    = "Total ARGs")

16.2 EDAR-exclusive ARGs

cat("ARGs exclusive to EDAR (Entrada+Salida, absent from all hospitals):", length(genes_edar_only), "\n")
## ARGs exclusive to EDAR (Entrada+Salida, absent from all hospitals): 95
Tabla_inicial_info %>%
  filter(database == "ABR", template %in% genes_edar_only) %>%
  mutate(Class = sub(",.*", "", Class)) %>%
  count(Class, sort = TRUE) %>%
  ggplot(aes(x = reorder(Class, n), y = n, fill = Class)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE, option = "D") +
  labs(title = "EDAR-exclusive ARGs by antibiotic class", x = NULL, y = "n genes") +
  theme_classic()

16.3 Entrada-exclusive (potentially removed)

cat("ARGs found ONLY in Entrada (not Salida):", length(genes_entrada_only), "\n")
## ARGs found ONLY in Entrada (not Salida): 180
Tabla_inicial_info %>%
  filter(database == "ABR", template %in% genes_entrada_only) %>%
  mutate(Class = sub(",.*", "", Class)) %>%
  group_by(template, Class, EDAR_Nombre) %>%
  summarise(mean_depth = mean(depth), .groups = "drop") %>%
  ggplot(aes(x = reorder(template, mean_depth), y = mean_depth, fill = Class)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~EDAR_Nombre, scales = "free") +
  scale_fill_viridis(discrete = TRUE, option = "D") +
  labs(title = "Entrada-exclusive ARGs (absent from Salida) — potentially removed by treatment",
       x = NULL, y = "Mean depth") +
  theme_classic() +
  theme(strip.background = element_blank(), axis.text.y = element_text(size = 7))

16.4 Salida-exclusive (potentially enriched)

cat("ARGs found ONLY in Salida (not Entrada):", length(genes_salida_only), "\n")
## ARGs found ONLY in Salida (not Entrada): 65
Tabla_inicial_info %>%
  filter(database == "ABR", template %in% genes_salida_only) %>%
  mutate(Class = sub(",.*", "", Class)) %>%
  group_by(template, Class, EDAR_Nombre) %>%
  summarise(mean_depth = mean(depth), .groups = "drop") %>%
  ggplot(aes(x = reorder(template, mean_depth), y = mean_depth, fill = Class)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~EDAR_Nombre, scales = "free") +
  scale_fill_viridis(discrete = TRUE, option = "D") +
  labs(title = "Salida-exclusive ARGs (absent from Entrada) — potentially enriched by treatment",
       x = NULL, y = "Mean depth") +
  theme_classic() +
  theme(strip.background = element_blank(), axis.text.y = element_text(size = 7))

16.5 Shared Entrada–Salida: abundance change

# For genes present in both, compare mean depth Entrada vs Salida
entrada_salida_shared <- Tabla_inicial_info %>%
  filter(database == "ABR", template %in% genes_shared_es,
         Tipo %in% c("Entrada","Salida")) %>%
  mutate(Class = sub(",.*", "", Class)) %>%
  group_by(template, Class, Tipo) %>%
  summarise(mean_depth = mean(depth), .groups = "drop") %>%
  pivot_wider(names_from = Tipo, values_from = mean_depth, values_fill = 0) %>%
  mutate(log2FC = log2((Salida + 1) / (Entrada + 1))) %>%
  arrange(log2FC)

# Top enriched and depleted genes
bind_rows(
  slice_max(entrada_salida_shared, log2FC, n = 15),
  slice_min(entrada_salida_shared, log2FC, n = 15)
) %>%
  distinct() %>%
  ggplot(aes(x = reorder(template, log2FC), y = log2FC,
             fill = log2FC > 0)) +
  geom_col() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip() +
  scale_fill_manual(values = c("TRUE" = "#E64B35", "FALSE" = "#00A087"),
                    labels = c("TRUE" = "Enriched in Salida",
                               "FALSE" = "Depleted in Salida")) +
  labs(title = "Top 15 ARGs enriched / depleted in Salida vs Entrada (log2 fold-change)",
       x = NULL, y = "log2(Salida / Entrada)", fill = NULL) +
  theme_classic() +
  theme(axis.text.y = element_text(size = 7))

16.6 Summary tables

cat("=== Entrada-exclusive ARGs ===\n")
## === Entrada-exclusive ARGs ===
Tabla_inicial_info %>%
  filter(database == "ABR", template %in% genes_entrada_only) %>%
  mutate(Class = sub(",.*", "", Class)) %>%
  group_by(template, Class, Phenotype) %>%
  summarise(n_samples = n_distinct(dataset),
            mean_depth = round(mean(depth), 3), .groups = "drop") %>%
  arrange(desc(mean_depth)) %>%
  datatable(rownames = FALSE, filter = "top",
            caption = "Entrada-exclusive ARGs",
            options = list(pageLength = 15, scrollX = TRUE))
cat("=== Salida-exclusive ARGs ===\n")
## === Salida-exclusive ARGs ===
Tabla_inicial_info %>%
  filter(database == "ABR", template %in% genes_salida_only) %>%
  mutate(Class = sub(",.*", "", Class)) %>%
  group_by(template, Class, Phenotype) %>%
  summarise(n_samples = n_distinct(dataset),
            mean_depth = round(mean(depth), 3), .groups = "drop") %>%
  arrange(desc(mean_depth)) %>%
  datatable(rownames = FALSE, filter = "top",
            caption = "Salida-exclusive ARGs",
            options = list(pageLength = 15, scrollX = TRUE))

17 Session info

sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
## 
## locale:
##  [1] LC_CTYPE=es_ES.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_ES.UTF-8        LC_COLLATE=es_ES.UTF-8    
##  [5] LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=es_ES.UTF-8   
##  [7] LC_PAPER=es_ES.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Madrid
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] UpSetR_1.4.0       Hmisc_5.2-5        ggstatsplot_0.13.6 doParallel_1.0.17 
##  [5] iterators_1.0.14   foreach_1.5.2      ggrepel_0.9.8      broom_1.0.12      
##  [9] microbiome_1.24.0  ggpubr_0.6.3       ggnewscale_0.5.2   microbial_0.0.22  
## [13] phyloseq_1.46.0    viridis_0.6.5      viridisLite_0.4.3  DT_0.34.0         
## [17] lubridate_1.9.5    forcats_1.0.1      stringr_1.6.0      dplyr_1.2.1       
## [21] purrr_1.2.1        readr_2.2.0        tidyr_1.3.2        tibble_3.3.1      
## [25] tidyverse_2.0.0    pheatmap_1.0.13    gridExtra_2.3      umap_0.2.10.0     
## [29] factoextra_2.0.0   ggplot2_4.0.2      vegan_2.7-3        permute_0.9-10    
## [33] MASS_7.3-60.0.1   
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.3               prismatic_1.1.2            
##   [3] bitops_1.0-9                datawizard_1.3.0           
##   [5] rpart_4.1.27                lifecycle_1.0.5            
##   [7] rstatix_0.7.3               edgeR_4.0.16               
##   [9] vroom_1.7.1                 lattice_0.22-9             
##  [11] crosstalk_1.2.2             insight_1.4.6              
##  [13] backports_1.5.1             magrittr_2.0.5             
##  [15] limma_3.58.1                sass_0.4.10                
##  [17] rmarkdown_2.31              jquerylib_0.1.4            
##  [19] yaml_2.3.12                 otel_0.2.0                 
##  [21] askpass_1.2.1               reticulate_1.45.0          
##  [23] RColorBrewer_1.1-3          ade4_1.7-24                
##  [25] abind_1.4-8                 zlibbioc_1.48.2            
##  [27] Rtsne_0.17                  quadprog_1.5-8             
##  [29] GenomicRanges_1.54.1        BiocGenerics_0.48.1        
##  [31] RCurl_1.98-1.18             nnet_7.3-20                
##  [33] GenomeInfoDbData_1.2.11     IRanges_2.36.0             
##  [35] S4Vectors_0.40.2            correlation_0.8.8          
##  [37] RSpectra_0.16-2             codetools_0.2-20           
##  [39] DelayedArray_0.28.0         SuppDists_1.1-9.9          
##  [41] tidyselect_1.2.1            farver_2.1.2               
##  [43] gmp_0.7-5.1                 effectsize_1.0.2           
##  [45] base64enc_0.1-6             matrixStats_1.5.0          
##  [47] stats4_4.3.3                jsonlite_2.0.0             
##  [49] multtest_2.58.0             Formula_1.2-5              
##  [51] survival_3.8-6              emmeans_2.0.2              
##  [53] BWStest_0.2.3               tools_4.3.3                
##  [55] PMCMRplus_1.9.12            Rcpp_1.1.1                 
##  [57] glue_1.8.0                  SparseArray_1.2.4          
##  [59] xfun_0.57                   mgcv_1.9-3                 
##  [61] DESeq2_1.42.1               kSamples_1.2-12            
##  [63] MatrixGenerics_1.14.0       GenomeInfoDb_1.38.8        
##  [65] withr_3.0.2                 fastmap_1.2.0              
##  [67] boot_1.3-32                 rhdf5filters_1.14.1        
##  [69] openssl_2.3.5               digest_0.6.39              
##  [71] timechange_0.4.0            R6_2.6.1                   
##  [73] estimability_1.5.1          colorspace_2.1-2           
##  [75] utf8_1.2.6                  generics_0.1.4             
##  [77] data.table_1.18.2.1         htmlwidgets_1.6.4          
##  [79] S4Arrays_1.2.1              parameters_0.28.3          
##  [81] pkgconfig_2.0.3             gtable_0.3.6               
##  [83] Rmpfr_1.1-2                 statsExpressions_1.7.4     
##  [85] S7_0.2.1                    XVector_0.42.0             
##  [87] htmltools_0.5.9             carData_3.0-6              
##  [89] biomformat_1.30.0           multcompView_0.1-11        
##  [91] scales_1.4.0                Biobase_2.62.0             
##  [93] png_0.1-9                   knitr_1.51                 
##  [95] rstudioapi_0.18.0           tzdb_0.5.0                 
##  [97] reshape2_1.4.5              checkmate_2.3.4            
##  [99] coda_0.19-4.1               nlme_3.1-169               
## [101] cachem_1.1.0                rhdf5_2.46.1               
## [103] foreign_0.8-91              pillar_1.11.1              
## [105] grid_4.3.3                  vctrs_0.7.2                
## [107] randomForest_4.7-1.2        car_3.1-5                  
## [109] cluster_2.1.8.2             htmlTable_2.4.3            
## [111] paletteer_1.7.0             evaluate_1.0.5             
## [113] zeallot_0.2.0               mvtnorm_1.3-6              
## [115] cli_3.6.5                   locfit_1.5-9.12            
## [117] compiler_4.3.3              rlang_1.2.0                
## [119] crayon_1.5.3                rstantools_2.6.0           
## [121] ggsignif_0.6.4              labeling_0.4.3             
## [123] rematch2_2.1.2              plyr_1.8.9                 
## [125] stringi_1.8.7               BiocParallel_1.36.0        
## [127] Biostrings_2.70.3           bayestestR_0.17.0          
## [129] Matrix_1.6-5                hms_1.1.4                  
## [131] patchwork_1.3.2             bit64_4.6.0-1              
## [133] Rhdf5lib_1.24.2             statmod_1.5.1              
## [135] SummarizedExperiment_1.32.0 memoise_2.0.1              
## [137] igraph_2.2.3                RcppParallel_5.1.11-2      
## [139] bslib_0.10.0                phangorn_2.12.1            
## [141] fastmatch_1.1-8             bit_4.6.0                  
## [143] ape_5.8-1