QC is performed before rarefaction to identify and exclude low-quality samples that would otherwise drag down the rarefaction threshold for all samples.
bowtie_logs <- list.files(
"/home/miguel/Documentos/analysis_sevilla_rescap/pipeline/results/bowtie2",
pattern = "\\.bowtie2\\.log$", full.names = TRUE
)
qc_capture <- map_dfr(bowtie_logs, function(f) {
txt <- readLines(f)
rate_line <- grep("overall alignment rate", txt, value = TRUE)
rate <- as.numeric(gsub("%.*", "", trimws(rate_line)))
tibble(dataset = gsub("\\.bowtie2\\.log$", "", basename(f)), on_target_pct = rate)
}) %>%
left_join(metadata, by = "dataset")
qc_capture %>%
ggplot(aes(x = reorder(dataset, on_target_pct), y = on_target_pct, fill = Tipo)) +
geom_col() +
geom_hline(yintercept = 20, linetype = "dashed", color = "red", alpha = .5) +
coord_flip() +
scale_fill_manual(values = tipo_colors) +
labs(title = "ResCap capture efficiency", x = NULL, y = "On-target reads (%)") +
theme_classic() +
theme(axis.text.y = element_text(size = 7))
kma_hits <- Tabla_inicial_prev %>%
filter(p_value < 0.05) %>%
group_by(dataset) %>%
summarise(n_templates = n_distinct(template), .groups = "drop") %>%
left_join(metadata, by = "dataset") %>%
left_join(select(qc_capture, dataset, on_target_pct), by = "dataset")
kma_hits %>%
ggplot(aes(x = on_target_pct, y = n_templates, color = Tipo, label = dataset)) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = TRUE, color = "grey40", linetype = "dashed") +
geom_text_repel(size = 2.5, max.overlaps = 10) +
scale_color_manual(values = tipo_colors) +
labs(title = "Capture efficiency vs resistome size",
x = "On-target reads (%)", y = "KMA templates detected") +
theme_classic()
datatable(arrange(kma_hits, desc(n_templates)), rownames = FALSE,
options = list(pageLength = 15, scrollX = TRUE))
# Correlation between capture efficiency and raw gene richness (pre-rarefaction)
kma_hits %>%
filter(!is.na(on_target_pct)) %>%
ggplot(aes(x = on_target_pct, y = n_templates, color = Tipo, label = dataset)) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = TRUE, color = "grey40", linetype = "dashed") +
geom_text_repel(size = 2.5, max.overlaps = 10) +
scale_color_manual(values = tipo_colors) +
labs(title = "Capture efficiency vs gene richness (pre-rarefaction)",
x = "On-target reads (%)", y = "KMA templates detected") +
theme_classic()
# Total depth per sample — flag outliers before rarefaction
sample_depth <- Tabla_inicial_raw_collapsed %>%
group_by(dataset) %>%
summarise(total_depth = sum(depth), .groups = "drop") %>%
left_join(metadata %>% distinct(dataset, Tipo), by = "dataset") %>%
arrange(total_depth)
depth_threshold <- quantile(sample_depth$total_depth, 0.05) # bottom 5%
sample_depth %>%
ggplot(aes(x = reorder(dataset, total_depth), y = total_depth, fill = Tipo)) +
geom_col() +
geom_hline(yintercept = depth_threshold, linetype = "dashed", color = "red") +
coord_flip() +
scale_fill_manual(values = tipo_colors) +
labs(title = "Total depth per sample (red = 5th percentile threshold)",
x = NULL, y = "Total depth") +
theme_classic() +
theme(axis.text.y = element_text(size = 7))
# Samples below threshold — review before deciding to exclude
low_depth_samples <- sample_depth %>% filter(total_depth < depth_threshold)
if (nrow(low_depth_samples) > 0) {
cat("Samples below 5th percentile depth threshold (", round(depth_threshold, 1), "):\n")
print(low_depth_samples)
}
## Samples below 5th percentile depth threshold ( 466815.4 ):
## # A tibble: 3 × 3
## dataset total_depth Tipo
## <chr> <dbl> <chr>
## 1 20_S10 348322. Salida
## 2 18_S9 419057. Salida
## 3 16_S7 466300. Salida
# ── Exclude outlier samples here if needed ────────────────────────────────────
# samples_to_exclude <- c("SAMPLE_ID_1", "SAMPLE_ID_2")
# Tabla_inicial_raw_collapsed <- Tabla_inicial_raw_collapsed %>%
# filter(!dataset %in% samples_to_exclude)
All databases (ABR + BAC + REL) are rarefied together since all reads
come from the same Illumina run. KMA depth (coverage
normalised by gene length) is used as an integer proxy for read counts.
1 000 rarefactions are averaged in parallel to reduce
stochasticity.
# Rarefaction curves to assess whether raremax captures sufficient diversity
Tabla_rare_pre <- Tabla_inicial_raw_collapsed %>%
select(template, depth, dataset) %>%
group_by(dataset, template) %>%
summarise(depth = sum(depth), .groups = "drop") %>%
pivot_wider(names_from = template, values_from = depth, values_fill = 0) %>%
column_to_rownames("dataset")
Tabla_rare_pre[] <- lapply(Tabla_rare_pre, as.integer)
# Colour by Tipo
tipo_vec <- metadata %>%
filter(dataset %in% rownames(Tabla_rare_pre)) %>%
arrange(match(dataset, rownames(Tabla_rare_pre))) %>%
pull(Tipo)
col_vec <- tipo_colors[tipo_vec]
rarecurve(Tabla_rare_pre, step = 50, col = col_vec,
label = FALSE, main = "Rarefaction curves (colour = Tipo)",
ylab = "Gene richness", xlab = "Depth")
abline(v = min(rowSums(Tabla_rare_pre)), lty = 2, col = "black")
legend("bottomright", legend = names(tipo_colors), fill = tipo_colors, bty = "n")
tibble(
Metric = c("Rarefaction depth (raremax)", "Samples", "Genes before", "Genes after"),
Value = c(raremax, nrow(Tabla_rare),
ncol(Tabla_rare),
n_distinct(Tabla_inicial$template))
) %>% knitr::kable()
| Metric | Value |
|---|---|
| Rarefaction depth (raremax) | 347690 |
| Samples | 48 |
| Genes before | 13308 |
| Genes after | 13305 |
Tabla_inicial_raw_collapsed %>%
group_by(database, dataset) %>%
summarise(total_depth = sum(depth), .groups = "drop") %>%
left_join(metadata %>% distinct(dataset, Tipo), by = "dataset") %>%
ggplot(aes(x = reorder(dataset, total_depth), y = total_depth, fill = Tipo)) +
geom_col() +
facet_wrap(~database, scales = "free") +
scale_fill_manual(values = tipo_colors) +
labs(title = "Total depth per sample — before rarefaction", x = NULL, y = "Total depth") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5),
strip.background = element_blank())
Tabla_inicial %>%
group_by(database, dataset, Tipo) %>%
summarise(total_depth = sum(depth), .groups = "drop") %>%
ggplot(aes(x = reorder(dataset, total_depth), y = total_depth, fill = Tipo)) +
geom_col() +
facet_wrap(~database, scales = "free") +
scale_fill_manual(values = tipo_colors) +
labs(title = "Total depth per sample — after rarefaction", x = NULL, y = "Total depth") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5),
strip.background = element_blank())
diver_gral %>%
group_by(database2, index, Tipo) %>%
group_modify(~ tidy(shapiro.test(.x$value))) %>%
datatable(rownames = FALSE, filter = "top",
options = list(pageLength = 30, scrollX = TRUE))
diver_gral %>%
group_by(database2, index) %>%
group_modify(~ tidy(kruskal.test(.x$value ~ .x$Tipo))) %>%
datatable(rownames = FALSE, filter = "top",
options = list(pageLength = 30, scrollX = TRUE))
plot_diversity("shannon", "Shannon diversity")
diver_gral %>% filter(index == "shannon") %>%
group_by(Tipo, database2) %>%
summarise(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE), .groups = "drop") %>%
datatable(rownames = FALSE, filter = "top", options = list(pageLength = 30, scrollX = TRUE))
grouped_ggbetweenstats(
data = diver_gral %>% filter(index == "shannon", !database2 %in% c("gral","BAC")),
x = Tipo, y = value, type = "np", grouping.var = database2,
ggplot.component = list(scale_fill_manual(values = tipo_colors), theme_rescap)
)
plot_diversity("richness", "Observed gene richness")
diver_gral %>% filter(index == "richness") %>%
group_by(Tipo, database2) %>%
summarise(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE), .groups = "drop") %>%
datatable(rownames = FALSE, filter = "top", options = list(pageLength = 30, scrollX = TRUE))
grouped_ggbetweenstats(
data = diver_gral %>% filter(index == "richness", !database2 %in% c("gral","BAC")),
x = Tipo, y = value, type = "np", grouping.var = database2,
ggplot.component = list(scale_fill_manual(values = tipo_colors), theme_rescap)
)
plot_diversity("dominance.simpson", "Dominance Simpson")
grouped_ggbetweenstats(
data = diver_gral %>% filter(index == "dominance.simpson", !database2 %in% c("gral","BAC")),
x = Tipo, y = value, type = "np", grouping.var = database2
)
plot_diversity("evenness.simpson", "Evenness Simpson")
grouped_ggbetweenstats(
data = diver_gral %>% filter(index == "evenness.simpson", !database2 %in% c("gral","BAC")),
x = Tipo, y = value, type = "np", grouping.var = database2
)
plot_pca("gral", "Tipo", "PCA General — by Tipo") +
scale_color_manual(values = tipo_colors)
plot_pca("gral", "EDAR_Nombre", "PCA General — by EDAR")
plot_pca("ABR", "Tipo", "PCA ABR") + scale_color_manual(values = tipo_colors)
plot_pca("BAC", "Tipo", "PCA BAC") + scale_color_manual(values = tipo_colors)
plot_pca("REL", "Tipo", "PCA REL") + scale_color_manual(values = tipo_colors)
plot_pca("BAC_Metal", "Tipo", "PCA BAC Metals") + scale_color_manual(values = tipo_colors)
plot_pca("BAC_Biocide","Tipo", "PCA BAC Biocide") + scale_color_manual(values = tipo_colors)
plot_umap("gral", "Tipo", "UMAP General") + scale_color_manual(values = tipo_colors)
plot_umap("ABR", "Tipo", "UMAP ABR") + scale_color_manual(values = tipo_colors)
plot_umap("BAC", "Tipo", "UMAP BAC") + scale_color_manual(values = tipo_colors)
plot_umap("REL", "Tipo", "UMAP REL") + scale_color_manual(values = tipo_colors)
plot_umap("BAC_Metal", "Tipo", "UMAP BAC Metals") + scale_color_manual(values = tipo_colors)
plot_umap("BAC_Biocide","Tipo", "UMAP BAC Biocide") + scale_color_manual(values = tipo_colors)
run_adonis(spread_list$gral, "Tipo")
run_adonis(spread_list$gral, "EDAR_Nombre")
run_adonis(spread_list$ABR, "Tipo")
run_adonis(spread_list$BAC, "Tipo")
run_adonis(spread_list$REL, "Tipo")
Tabla_inicial %>%
group_by(database, Tipo, template) %>% summarise(.groups = "drop") %>%
count(Tipo, database, name = "Different_genes") %>%
ggplot(aes(x = Tipo, y = Different_genes, fill = database)) +
geom_col() + theme_rescap + theme(legend.position = "right", aspect.ratio = 1)
Tabla_inicial %>%
group_by(database, Tipo, dataset) %>%
summarise(n_genes = n_distinct(template), .groups = "drop") %>%
grouped_ggbetweenstats(
data = ., x = Tipo, y = n_genes, grouping.var = database, type = "np"
)
Tabla_inicial %>%
group_by(database, Tipo, template) %>% summarise(.groups = "drop") %>%
count(Tipo, database, name = "n") %>%
ggplot(aes(x = Tipo, y = n, fill = database)) +
geom_col(position = "fill") + theme_rescap + theme(legend.position = "right", aspect.ratio = 1)
Tabla_inicial %>%
group_by(database, EDAR_Nombre, template) %>% summarise(.groups = "drop") %>%
count(EDAR_Nombre, database, name = "n") %>%
ggplot(aes(x = EDAR_Nombre, y = n, fill = database)) +
geom_col(position = "fill") + theme_rescap + theme(legend.position = "right", aspect.ratio = 1)
Tabla_inicial %>%
group_by(database, Fecha, template) %>% summarise(.groups = "drop") %>%
count(Fecha, database, name = "n") %>%
ggplot(aes(x = Fecha, y = n, fill = database)) +
geom_col(position = "fill") + theme_rescap + theme(legend.position = "right", aspect.ratio = 1)
Tabla_inicial %>%
group_by(Tipo, database, dataset) %>%
summarise(n_genes = n_distinct(template), .groups = "drop") %>%
ggplot(aes(x = dataset, y = n_genes, fill = database)) +
geom_col(position = "fill") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_blank())
Tabla_inicial %>%
group_by(database, dataset, Tipo) %>%
summarise(n_genes = n_distinct(template), .groups = "drop") %>%
group_by(database) %>%
group_modify(~ tidy(kruskal.test(.x$n_genes ~ .x$Tipo))) %>%
mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
datatable(rownames = FALSE, filter = "top", options = list(pageLength = 30, scrollX = TRUE))
top20_genes <- Tabla_inicial %>%
filter(database == "ABR") %>%
group_by(template) %>%
summarise(mean_depth = mean(depth), .groups = "drop") %>%
slice_max(mean_depth, n = 20) %>%
pull(template)
top20_mat <- Tabla_inicial %>%
filter(database == "ABR", template %in% top20_genes) %>%
group_by(dataset, template) %>%
summarise(depth = sum(depth), .groups = "drop") %>%
pivot_wider(names_from = template, values_from = depth, values_fill = 0) %>%
column_to_rownames("dataset")
annot_row <- Tabla_inicial %>%
distinct(dataset, Tipo, EDAR_Nombre) %>%
column_to_rownames("dataset")
annot_col <- abr_annotations %>%
filter(template %in% top20_genes) %>%
select(template, Class) %>%
distinct() %>%
mutate(Class = sub(",.*", "", Class)) %>%
column_to_rownames("template")
pheatmap(log1p(top20_mat),
annotation_row = annot_row,
annotation_col = annot_col,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
fontsize_col = 8, fontsize_row = 7,
main = "Top 20 ARGs by mean depth (log1p)")
Tabla_inicial %>%
filter(database == "ABR", template %in% top20_genes) %>%
group_by(template, Tipo) %>%
summarise(mean_depth = mean(depth), .groups = "drop") %>%
left_join(abr_annotations %>% mutate(Class = sub(",.*", "", Class)) %>%
select(template, Class) %>% distinct(), by = "template") %>%
ggplot(aes(x = reorder(template, mean_depth), y = mean_depth, fill = Class)) +
geom_col() +
facet_wrap(~Tipo, scales = "free_x") +
coord_flip() +
scale_fill_viridis(discrete = TRUE, option = "D") +
labs(title = "Top 20 ARGs — mean depth by sample type",
x = NULL, y = "Mean depth") +
theme_classic() +
theme(strip.background = element_blank())
# Genes present in each Tipo (in ≥1 sample)
genes_by_tipo <- Tabla_inicial %>%
filter(database == "ABR") %>%
group_by(Tipo, template) %>%
summarise(.groups = "drop") %>%
split(.$Tipo) %>%
map(~ pull(.x, template))
upset(fromList(genes_by_tipo),
order.by = "freq",
nsets = length(genes_by_tipo),
text.scale = 1.3,
mainbar.y.label = "Shared ARGs",
sets.x.label = "ARGs per Tipo")
# Genes present per EDAR
genes_by_edar <- Tabla_inicial %>%
filter(database == "ABR") %>%
group_by(EDAR_Nombre, template) %>%
summarise(.groups = "drop") %>%
split(.$EDAR_Nombre) %>%
map(~ pull(.x, template))
upset(fromList(genes_by_edar),
order.by = "freq",
nsets = min(length(genes_by_edar), 8),
text.scale = 1.1,
mainbar.y.label = "Shared ARGs",
sets.x.label = "ARGs per EDAR")
plot_abr_bar(abr_pct(abr_base, "Tipo"), "Tipo")
plot_abr_bar(abr_pct(filter(abr_base, Tipo == "Hospital"), "EDAR_Nombre"), "EDAR_Nombre")
plot_abr_bar(abr_pct(filter(abr_base, Tipo == "Entrada"), "EDAR_Nombre"), "EDAR_Nombre")
plot_abr_bar(abr_pct(filter(abr_base, Tipo == "Salida"), "EDAR_Nombre"), "EDAR_Nombre")
walk(edar_names, function(edar) {
print(plot_abr_bar(abr_pct(filter(abr_base, EDAR_Nombre == edar), "Tipo"), "Tipo") +
ggtitle(edar))
})
walk(edar_names, function(edar) {
dat <- filter(abr_base, EDAR_Nombre == edar, Tipo == "Salida")
if (nrow(dat) > 0)
print(plot_abr_bar(abr_pct(dat, "Fecha"), "Fecha") +
ggtitle(paste(edar, "— Salida over time")))
})
bac_pct(filter(Tabla_inicial_info, database == "BAC"), "Class_compound", "Tipo") %>%
ggplot(aes(x = Tipo, y = pct, fill = Class_compound)) +
geom_col(position = "fill") +
scale_fill_viridis(discrete = TRUE, option = "D") + theme_classic()
bac_pct(filter(Tabla_inicial_info, database_BAC == "BAC_Metal"), "Compound", "Tipo") %>%
ggplot(aes(x = Tipo, y = pct, fill = Compound)) +
geom_col(position = "fill") +
scale_fill_viridis(discrete = TRUE, option = "D") + theme_classic()
bac_pct(filter(Tabla_inicial_info, database_BAC == "BAC_Biocide"), "Class_compound", "Tipo") %>%
ggplot(aes(x = Tipo, y = pct, fill = Class_compound)) +
geom_col(position = "fill") +
scale_fill_viridis(discrete = TRUE, option = "D") + theme_classic()
Tabla_inicial_info %>%
filter(database == "REL") %>%
separate(template, c("Tipo1","Gen"), sep = "\\|", remove = FALSE) %>%
bac_pct("Tipo1", "Tipo") %>%
ggplot(aes(x = Tipo, y = pct, fill = Tipo1)) +
geom_col(position = "fill") +
scale_fill_viridis(discrete = TRUE, option = "D") + theme_classic()
Tabla_inicial %>%
filter(database == "ABR", Tipo %in% c("Entrada","Salida")) %>%
group_by(dataset, Fecha, EDAR_Nombre, Tipo) %>%
summarise(n_genes = n_distinct(template), .groups = "drop") %>%
ggplot(aes(x = Fecha, y = n_genes, color = Tipo,
group = interaction(EDAR_Nombre, Tipo))) +
geom_line() + geom_point(size = 2) +
facet_wrap(~EDAR_Nombre, scales = "free_y") +
scale_color_manual(values = tipo_colors) +
labs(title = "ARG richness over time by EDAR", x = NULL, y = "Number of ARGs") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.background = element_blank())
Tabla_inicial %>%
filter(database == "ABR", Tipo == "Hospital") %>%
group_by(dataset, Fecha, EDAR_Nombre) %>%
summarise(n_genes = n_distinct(template), .groups = "drop") %>%
ggplot(aes(x = Fecha, y = n_genes, color = EDAR_Nombre, group = EDAR_Nombre)) +
geom_line() + geom_point(size = 2) +
labs(title = "Hospital ARG richness over time", x = NULL,
y = "Number of ARGs", color = "Hospital") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
edar_efficiency <- Tabla_inicial %>%
filter(database == "ABR", Tipo %in% c("Entrada","Salida")) %>%
group_by(Fecha, EDAR_Nombre, Tipo) %>%
summarise(total_depth = sum(depth), n_genes = n_distinct(template), .groups = "drop") %>%
pivot_wider(names_from = Tipo, values_from = c(total_depth, n_genes)) %>%
mutate(
removal_depth_pct = (total_depth_Entrada - total_depth_Salida) / total_depth_Entrada * 100,
removal_genes_pct = (n_genes_Entrada - n_genes_Salida) / n_genes_Entrada * 100
)
edar_efficiency %>%
pivot_longer(cols = c(removal_depth_pct, removal_genes_pct),
names_to = "metric", values_to = "removal") %>%
mutate(metric = recode(metric,
removal_depth_pct = "Abundance (depth)",
removal_genes_pct = "Richness (n genes)")) %>%
ggplot(aes(x = Fecha, y = removal, fill = EDAR_Nombre)) +
geom_col(position = "dodge") +
geom_hline(yintercept = 0, linetype = "dashed") +
facet_wrap(~metric) +
labs(title = "ARG removal efficiency (Entrada → Salida)",
y = "% reduction", x = NULL, fill = "EDAR") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.background = element_blank())
datatable(arrange(edar_efficiency, EDAR_Nombre, Fecha), rownames = FALSE,
options = list(pageLength = 20, scrollX = TRUE))
map_dfr(c("BAC","REL"), function(db) {
Tabla_inicial %>%
filter(database == db, Tipo %in% c("Entrada","Salida")) %>%
group_by(Fecha, EDAR_Nombre, Tipo) %>%
summarise(total_depth = sum(depth), n_genes = n_distinct(template), .groups = "drop") %>%
pivot_wider(names_from = Tipo, values_from = c(total_depth, n_genes)) %>%
mutate(
database = db,
removal_depth_pct = (total_depth_Entrada - total_depth_Salida) / total_depth_Entrada * 100,
removal_genes_pct = (n_genes_Entrada - n_genes_Salida) / n_genes_Entrada * 100
)
}) %>%
pivot_longer(cols = c(removal_depth_pct, removal_genes_pct),
names_to = "metric", values_to = "removal") %>%
mutate(metric = recode(metric,
removal_depth_pct = "Abundance (depth)",
removal_genes_pct = "Richness (n genes)")) %>%
ggplot(aes(x = Fecha, y = removal, fill = EDAR_Nombre)) +
geom_col(position = "dodge") +
geom_hline(yintercept = 0, linetype = "dashed") +
facet_grid(database ~ metric) +
labs(title = "BAC / REL removal efficiency (Entrada → Salida)",
y = "% reduction", x = NULL, fill = "EDAR") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), strip.background = element_blank())
class_removal <- Tabla_inicial_info %>%
filter(database == "ABR", Tipo %in% c("Entrada","Salida")) %>%
mutate(Class = sub(",.*", "", Class)) %>%
group_by(Fecha, EDAR_Nombre, Tipo, Class) %>%
summarise(total_depth = sum(depth), .groups = "drop") %>%
pivot_wider(names_from = Tipo, values_from = total_depth, values_fill = 0) %>%
mutate(removal_pct = (Entrada - Salida) / (Entrada + 1) * 100)
class_removal %>%
group_by(EDAR_Nombre, Class) %>%
summarise(mean_removal = mean(removal_pct, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = reorder(Class, mean_removal), y = mean_removal, fill = mean_removal > 0)) +
geom_col() +
coord_flip() +
facet_wrap(~EDAR_Nombre) +
scale_fill_manual(values = c("TRUE" = "#00A087", "FALSE" = "#E64B35"),
labels = c("TRUE" = "Removed", "FALSE" = "Enriched")) +
labs(title = "Mean ARG removal by antibiotic class per EDAR",
x = NULL, y = "Mean % reduction", fill = NULL) +
theme_classic() +
theme(strip.background = element_blank(), axis.text.y = element_text(size = 7))
Tabla_inicial %>%
filter(database == "ABR") %>%
group_by(EDAR_Nombre, Tipo, Fecha, dataset) %>%
summarise(n_genes = n_distinct(template), .groups = "drop") %>%
ggplot(aes(x = EDAR_Nombre, y = n_genes, fill = Tipo)) +
geom_boxplot(outlier.size = 1) +
scale_fill_manual(values = tipo_colors) +
labs(title = "ARG richness by EDAR and sample type", x = NULL, y = "Number of ARGs") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
clinical_df %>%
group_by(Tipo, gene_group, dataset) %>%
summarise(present = n() > 0, .groups = "drop") %>%
group_by(Tipo, gene_group) %>%
summarise(prevalence = mean(present) * 100, .groups = "drop") %>%
ggplot(aes(x = gene_group, y = prevalence, fill = Tipo)) +
geom_col(position = "dodge") +
scale_fill_manual(values = tipo_colors) +
labs(title = "Prevalence of clinical priority ARGs", x = NULL, y = "% samples positive") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
clinical_matrix <- clinical_df %>%
group_by(dataset, template) %>%
summarise(depth = sum(depth), .groups = "drop") %>%
pivot_wider(names_from = template, values_from = depth, values_fill = 0) %>%
column_to_rownames("dataset")
if (ncol(clinical_matrix) > 1) {
annot <- clinical_df %>% distinct(dataset, Tipo) %>% column_to_rownames("dataset")
pheatmap(log1p(clinical_matrix),
annotation_row = annot,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
fontsize_col = 7, fontsize_row = 8,
main = "Clinical priority ARGs (log1p depth)")
}
abr_wide <- spread_list$ABR
min_prev <- ceiling(nrow(abr_wide) * 0.25)
abr_wide_filt <- abr_wide[, colSums(abr_wide > 0) >= min_prev]
if (ncol(abr_wide_filt) >= 4) {
cor_res <- rcorr(as.matrix(abr_wide_filt), type = "spearman")
cor_mat <- cor_res$r
cor_mat[cor_res$P > 0.05] <- 0
diag(cor_mat) <- NA
pheatmap(cor_mat,
color = colorRampPalette(c("#2166AC","white","#B2182B"))(100),
breaks = seq(-1, 1, length.out = 101),
na_col = "grey90",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
fontsize = 7,
main = paste0("ARG co-occurrence (Spearman, p<0.05, n=",
ncol(abr_wide_filt)," genes)"))
}
db_totals <- Tabla_inicial %>%
group_by(database, dataset) %>%
summarise(depth = sum(depth), richness = n_distinct(template), .groups = "drop") %>%
pivot_wider(names_from = database, values_from = c(depth, richness),
names_glue = "{database}_{.value}") %>%
left_join(metadata %>% distinct(dataset, Tipo), by = "dataset")
p1 <- db_totals %>%
ggplot(aes(x = log1p(ABR_depth), y = log1p(BAC_depth), color = Tipo)) +
geom_point(size = 2) + geom_smooth(method = "lm", se = FALSE) +
scale_color_manual(values = tipo_colors) +
labs(title = "ARG vs MRG/BRG", x = "log(ARG depth)", y = "log(MRG/BRG depth)") +
theme_classic()
p2 <- db_totals %>%
ggplot(aes(x = log1p(ABR_depth), y = log1p(REL_depth), color = Tipo)) +
geom_point(size = 2) + geom_smooth(method = "lm", se = FALSE) +
scale_color_manual(values = tipo_colors) +
labs(title = "ARG vs Relaxases", x = "log(ARG depth)", y = "log(Relaxase depth)") +
theme_classic()
grid.arrange(p1, p2, ncol = 2)
map_dfr(list(
list(x = "ABR_depth", y = "BAC_depth", label = "ARG vs MRG/BRG"),
list(x = "ABR_depth", y = "REL_depth", label = "ARG vs Relaxases")
), function(pair) {
ct <- cor.test(log1p(db_totals[[pair$x]]), log1p(db_totals[[pair$y]]),
method = "spearman", use = "complete.obs")
tibble(Comparison = pair$label,
rho = round(ct$estimate, 3),
p_value = round(ct$p.value, 4))
}) %>%
datatable(rownames = FALSE, options = list(pageLength = 5))
Genes detected in Hospital samples but absent from all Entrada and Salida samples across the dataset.
cat("Hospital-exclusive ARGs (absent from all EDAR samples):", length(genes_hosp_only), "\n")
## Hospital-exclusive ARGs (absent from all EDAR samples): 110
hosp_excl %>%
count(Class, sort = TRUE) %>%
ggplot(aes(x = reorder(Class, n), y = n, fill = Class)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_fill_viridis(discrete = TRUE, option = "D") +
labs(title = "Hospital-exclusive ARGs by antibiotic class",
x = NULL, y = "Number of genes") +
theme_classic()
hosp_excl %>%
group_by(EDAR_Nombre, template, Class) %>%
summarise(mean_depth = mean(depth), .groups = "drop") %>%
ggplot(aes(x = reorder(template, mean_depth), y = mean_depth, fill = Class)) +
geom_col() +
coord_flip() +
facet_wrap(~EDAR_Nombre, scales = "free") +
scale_fill_viridis(discrete = TRUE, option = "D") +
labs(title = "Hospital-exclusive ARGs — mean depth by hospital",
x = NULL, y = "Mean depth") +
theme_classic() +
theme(strip.background = element_blank(), axis.text.y = element_text(size = 7))
hosp_excl %>%
group_by(template, Class, Phenotype, EDAR_Nombre) %>%
summarise(n_samples = n_distinct(dataset),
mean_depth = round(mean(depth), 3), .groups = "drop") %>%
arrange(desc(mean_depth)) %>%
datatable(rownames = FALSE, filter = "top",
options = list(pageLength = 20, scrollX = TRUE))
hosp_excl %>%
filter(template %in% clinical_df$template) %>%
group_by(template, Class, EDAR_Nombre, Fecha) %>%
summarise(depth = sum(depth), .groups = "drop") %>%
ggplot(aes(x = Fecha, y = depth, color = EDAR_Nombre, group = interaction(template, EDAR_Nombre))) +
geom_line() + geom_point(size = 2) +
facet_wrap(~template, scales = "free_y") +
labs(title = "Clinical priority ARGs — hospital exclusive, over time",
x = NULL, y = "Depth", color = "Hospital") +
theme_classic() +
theme(strip.background = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1))
Analysis of genes exclusive to EDAR samples (Entrada and Salida) and genes that distinguish Entrada from Salida (potentially enriched or eliminated during treatment).
upset(fromList(list(
Hospital = genes_hosp,
Entrada = genes_entrada,
Salida = genes_salida
)),
order.by = "freq",
text.scale = 1.3,
mainbar.y.label = "Shared ARGs",
sets.x.label = "Total ARGs")
cat("ARGs exclusive to EDAR (Entrada+Salida, absent from all hospitals):", length(genes_edar_only), "\n")
## ARGs exclusive to EDAR (Entrada+Salida, absent from all hospitals): 95
Tabla_inicial_info %>%
filter(database == "ABR", template %in% genes_edar_only) %>%
mutate(Class = sub(",.*", "", Class)) %>%
count(Class, sort = TRUE) %>%
ggplot(aes(x = reorder(Class, n), y = n, fill = Class)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_fill_viridis(discrete = TRUE, option = "D") +
labs(title = "EDAR-exclusive ARGs by antibiotic class", x = NULL, y = "n genes") +
theme_classic()
cat("ARGs found ONLY in Entrada (not Salida):", length(genes_entrada_only), "\n")
## ARGs found ONLY in Entrada (not Salida): 180
Tabla_inicial_info %>%
filter(database == "ABR", template %in% genes_entrada_only) %>%
mutate(Class = sub(",.*", "", Class)) %>%
group_by(template, Class, EDAR_Nombre) %>%
summarise(mean_depth = mean(depth), .groups = "drop") %>%
ggplot(aes(x = reorder(template, mean_depth), y = mean_depth, fill = Class)) +
geom_col() +
coord_flip() +
facet_wrap(~EDAR_Nombre, scales = "free") +
scale_fill_viridis(discrete = TRUE, option = "D") +
labs(title = "Entrada-exclusive ARGs (absent from Salida) — potentially removed by treatment",
x = NULL, y = "Mean depth") +
theme_classic() +
theme(strip.background = element_blank(), axis.text.y = element_text(size = 7))
cat("ARGs found ONLY in Salida (not Entrada):", length(genes_salida_only), "\n")
## ARGs found ONLY in Salida (not Entrada): 65
Tabla_inicial_info %>%
filter(database == "ABR", template %in% genes_salida_only) %>%
mutate(Class = sub(",.*", "", Class)) %>%
group_by(template, Class, EDAR_Nombre) %>%
summarise(mean_depth = mean(depth), .groups = "drop") %>%
ggplot(aes(x = reorder(template, mean_depth), y = mean_depth, fill = Class)) +
geom_col() +
coord_flip() +
facet_wrap(~EDAR_Nombre, scales = "free") +
scale_fill_viridis(discrete = TRUE, option = "D") +
labs(title = "Salida-exclusive ARGs (absent from Entrada) — potentially enriched by treatment",
x = NULL, y = "Mean depth") +
theme_classic() +
theme(strip.background = element_blank(), axis.text.y = element_text(size = 7))
cat("=== Entrada-exclusive ARGs ===\n")
## === Entrada-exclusive ARGs ===
Tabla_inicial_info %>%
filter(database == "ABR", template %in% genes_entrada_only) %>%
mutate(Class = sub(",.*", "", Class)) %>%
group_by(template, Class, Phenotype) %>%
summarise(n_samples = n_distinct(dataset),
mean_depth = round(mean(depth), 3), .groups = "drop") %>%
arrange(desc(mean_depth)) %>%
datatable(rownames = FALSE, filter = "top",
caption = "Entrada-exclusive ARGs",
options = list(pageLength = 15, scrollX = TRUE))
cat("=== Salida-exclusive ARGs ===\n")
## === Salida-exclusive ARGs ===
Tabla_inicial_info %>%
filter(database == "ABR", template %in% genes_salida_only) %>%
mutate(Class = sub(",.*", "", Class)) %>%
group_by(template, Class, Phenotype) %>%
summarise(n_samples = n_distinct(dataset),
mean_depth = round(mean(depth), 3), .groups = "drop") %>%
arrange(desc(mean_depth)) %>%
datatable(rownames = FALSE, filter = "top",
caption = "Salida-exclusive ARGs",
options = list(pageLength = 15, scrollX = TRUE))
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 24.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
##
## locale:
## [1] LC_CTYPE=es_ES.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=es_ES.UTF-8 LC_COLLATE=es_ES.UTF-8
## [5] LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=es_ES.UTF-8
## [7] LC_PAPER=es_ES.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Madrid
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] UpSetR_1.4.0 Hmisc_5.2-5 ggstatsplot_0.13.6 doParallel_1.0.17
## [5] iterators_1.0.14 foreach_1.5.2 ggrepel_0.9.8 broom_1.0.12
## [9] microbiome_1.24.0 ggpubr_0.6.3 ggnewscale_0.5.2 microbial_0.0.22
## [13] phyloseq_1.46.0 viridis_0.6.5 viridisLite_0.4.3 DT_0.34.0
## [17] lubridate_1.9.5 forcats_1.0.1 stringr_1.6.0 dplyr_1.2.1
## [21] purrr_1.2.1 readr_2.2.0 tidyr_1.3.2 tibble_3.3.1
## [25] tidyverse_2.0.0 pheatmap_1.0.13 gridExtra_2.3 umap_0.2.10.0
## [29] factoextra_2.0.0 ggplot2_4.0.2 vegan_2.7-3 permute_0.9-10
## [33] MASS_7.3-60.0.1
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.3 prismatic_1.1.2
## [3] bitops_1.0-9 datawizard_1.3.0
## [5] rpart_4.1.27 lifecycle_1.0.5
## [7] rstatix_0.7.3 edgeR_4.0.16
## [9] vroom_1.7.1 lattice_0.22-9
## [11] crosstalk_1.2.2 insight_1.4.6
## [13] backports_1.5.1 magrittr_2.0.5
## [15] limma_3.58.1 sass_0.4.10
## [17] rmarkdown_2.31 jquerylib_0.1.4
## [19] yaml_2.3.12 otel_0.2.0
## [21] askpass_1.2.1 reticulate_1.45.0
## [23] RColorBrewer_1.1-3 ade4_1.7-24
## [25] abind_1.4-8 zlibbioc_1.48.2
## [27] Rtsne_0.17 quadprog_1.5-8
## [29] GenomicRanges_1.54.1 BiocGenerics_0.48.1
## [31] RCurl_1.98-1.18 nnet_7.3-20
## [33] GenomeInfoDbData_1.2.11 IRanges_2.36.0
## [35] S4Vectors_0.40.2 correlation_0.8.8
## [37] RSpectra_0.16-2 codetools_0.2-20
## [39] DelayedArray_0.28.0 SuppDists_1.1-9.9
## [41] tidyselect_1.2.1 farver_2.1.2
## [43] gmp_0.7-5.1 effectsize_1.0.2
## [45] base64enc_0.1-6 matrixStats_1.5.0
## [47] stats4_4.3.3 jsonlite_2.0.0
## [49] multtest_2.58.0 Formula_1.2-5
## [51] survival_3.8-6 emmeans_2.0.2
## [53] BWStest_0.2.3 tools_4.3.3
## [55] PMCMRplus_1.9.12 Rcpp_1.1.1
## [57] glue_1.8.0 SparseArray_1.2.4
## [59] xfun_0.57 mgcv_1.9-3
## [61] DESeq2_1.42.1 kSamples_1.2-12
## [63] MatrixGenerics_1.14.0 GenomeInfoDb_1.38.8
## [65] withr_3.0.2 fastmap_1.2.0
## [67] boot_1.3-32 rhdf5filters_1.14.1
## [69] openssl_2.3.5 digest_0.6.39
## [71] timechange_0.4.0 R6_2.6.1
## [73] estimability_1.5.1 colorspace_2.1-2
## [75] utf8_1.2.6 generics_0.1.4
## [77] data.table_1.18.2.1 htmlwidgets_1.6.4
## [79] S4Arrays_1.2.1 parameters_0.28.3
## [81] pkgconfig_2.0.3 gtable_0.3.6
## [83] Rmpfr_1.1-2 statsExpressions_1.7.4
## [85] S7_0.2.1 XVector_0.42.0
## [87] htmltools_0.5.9 carData_3.0-6
## [89] biomformat_1.30.0 multcompView_0.1-11
## [91] scales_1.4.0 Biobase_2.62.0
## [93] png_0.1-9 knitr_1.51
## [95] rstudioapi_0.18.0 tzdb_0.5.0
## [97] reshape2_1.4.5 checkmate_2.3.4
## [99] coda_0.19-4.1 nlme_3.1-169
## [101] cachem_1.1.0 rhdf5_2.46.1
## [103] foreign_0.8-91 pillar_1.11.1
## [105] grid_4.3.3 vctrs_0.7.2
## [107] randomForest_4.7-1.2 car_3.1-5
## [109] cluster_2.1.8.2 htmlTable_2.4.3
## [111] paletteer_1.7.0 evaluate_1.0.5
## [113] zeallot_0.2.0 mvtnorm_1.3-6
## [115] cli_3.6.5 locfit_1.5-9.12
## [117] compiler_4.3.3 rlang_1.2.0
## [119] crayon_1.5.3 rstantools_2.6.0
## [121] ggsignif_0.6.4 labeling_0.4.3
## [123] rematch2_2.1.2 plyr_1.8.9
## [125] stringi_1.8.7 BiocParallel_1.36.0
## [127] Biostrings_2.70.3 bayestestR_0.17.0
## [129] Matrix_1.6-5 hms_1.1.4
## [131] patchwork_1.3.2 bit64_4.6.0-1
## [133] Rhdf5lib_1.24.2 statmod_1.5.1
## [135] SummarizedExperiment_1.32.0 memoise_2.0.1
## [137] igraph_2.2.3 RcppParallel_5.1.11-2
## [139] bslib_0.10.0 phangorn_2.12.1
## [141] fastmatch_1.1-8 bit_4.6.0
## [143] ape_5.8-1