A quick list of the available groups in the datasets.
plot_list_counts <- list()
for (dname in names(meta_list)) {
meta <- meta_list[[dname]]
gcol <- group_cols[[dname]]
# For Malabirade_2018, show panels for each of the new grouping columns
if (dname == "Malabirade_2018" && !is.null(meta)) {
mala_groups <- c("growth_medium", "growth_phase", "rna_sample_type")
for (mg in mala_groups) {
if (mg %in% names(meta)) {
p <- ggplot(meta, aes(x = .data[[mg]])) +
geom_bar(fill = "#4C78A8") +
labs(x = NULL, y = "Count", title = paste("Malabirade_2018:", mg)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 10))
plot_list_counts[[paste0(dname, "_", mg)]] <- p
}
}
next
}
if (!is.null(meta) && gcol %in% names(meta)) {
p <- ggplot(meta, aes(x = .data[[gcol]])) +
geom_bar(fill = "#4C78A8") +
labs(x = NULL, y = "Count", title = dname) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 10))
plot_list_counts[[dname]] <- p
}
}
ggarrange(plotlist = plot_list_counts, ncol = 3, nrow = 3)On Malabirade_2018, the SPI-1ind-LB was only sequenced in HighOD condition thus the plots lack LowOD samples.
We obtained read lengths directly from the per-sample fastp QC JSON files. The summary statistics is after filtering (type == “after_filtering”), where the length column corresponds to the mean read length reported by fastp.
plot_list_lengths <- list()
for (dname in names(meta_list)) {
meta <- meta_list[[dname]]
gcol <- group_cols[[dname]]
# For Malabirade_2018, show separate panels per new grouping column
if (dname == "Malabirade_2018" && !is.null(meta)) {
mala_groups <- c("growth_medium", "growth_phase", "rna_sample_type")
for (mg in mala_groups) {
if (!mg %in% names(meta)) next
lengths_df <- fastp_lengths(meta, mg)
if (nrow(lengths_df) == 0) next
if ("type" %in% names(lengths_df) && any(lengths_df$type == "distribution")) {
p <- lengths_df %>%
filter(type == "distribution") %>%
ggplot(aes(x = length, y = count, color = condition)) +
geom_line(alpha = 0.7) +
scale_y_continuous(labels = scales::comma) +
labs(x = "Length (bp)", y = "Count", title = paste("Malabirade_2018:", mg), color = mg) +
theme(plot.title = element_text(size = 10))
} else {
len_summ <- lengths_df %>%
filter(type == "after_filtering") %>%
group_by(sample_id, condition) %>%
summarise(mean_length = first(length), .groups = "drop")
p <- ggplot(len_summ, aes(x = condition, y = mean_length)) +
geom_boxplot(alpha = 0.7, fill = "grey90") +
geom_jitter(width = 0.2, alpha = 0.6, color = "#2ca02c") +
labs(x = NULL, y = "Mean Length (bp)", title = paste("Malabirade_2018:", mg)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 10))
}
plot_list_lengths[[paste0(dname, "_", mg)]] <- p
}
next
}
if (!is.null(meta)) {
lengths_df <- fastp_lengths(meta, gcol)
if (nrow(lengths_df) > 0) {
if ("type" %in% names(lengths_df) && any(lengths_df$type == "distribution")) {
# Distribution plot
p <- lengths_df %>%
filter(type == "distribution") %>%
ggplot(aes(x = length, y = count, color = condition)) +
geom_line(alpha = 0.7) +
scale_y_continuous(labels = scales::comma) +
labs(x = "Length (bp)", y = "Count", title = dname, color = "Group") +
theme(plot.title = element_text(size = 10))
} else {
# Mean length plot
len_summ <- lengths_df %>%
filter(type == "after_filtering") %>%
group_by(sample_id, condition) %>%
summarise(mean_length = first(length), .groups = "drop")
p <- ggplot(len_summ, aes(x = condition, y = mean_length)) +
geom_boxplot(alpha = 0.7, fill = "grey90") +
geom_jitter(width = 0.2, alpha = 0.6, color = "#2ca02c") +
labs(x = NULL, y = "Mean Length (bp)", title = dname) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 10))
}
plot_list_lengths[[dname]] <- p
}
}
}
ggarrange(plotlist = plot_list_lengths, ncol = 3, nrow = 2)## $`1`
##
## $`2`
##
## attr(,"class")
## [1] "list" "ggarrange"
We filtered sRNAs by requiring RPKM ≥ 1 in at least 60% of samples (frac_gt1 >= 0.6) within any biological group before including them in downstream analyses.
filter_summary_list <- list()
pass_ids_list <- list()
for (dname in names(meta_list)) {
meta <- meta_list[[dname]]
gcol <- group_cols[[dname]]
# For Malabirade_2018, produce summaries for each grouping column
if (dname == "Malabirade_2018" && !is.null(meta)) {
counts <- load_counts_long(meta)
mala_groups <- c("growth_medium", "growth_phase", "rna_sample_type")
for (mg in mala_groups) {
if (!mg %in% names(meta)) next
res <- filter_srna(counts, meta, mg)
pass_ids_list[[paste0(dname, "_", mg)]] <- res$pass_ids
if (length(res$pass_ids) > 0) {
summ <- res$per_group %>%
group_by(group) %>%
summarise(n_srna_pass = n_distinct(srna_id), .groups = "drop") %>%
mutate(dataset = paste(dname, mg, sep = "_"))
filter_summary_list[[paste0(dname, "_", mg)]] <- summ
}
}
next
}
if (!is.null(meta)) {
counts <- load_counts_long(meta)
res <- filter_srna(counts, meta, gcol)
pass_ids_list[[dname]] <- res$pass_ids
if (length(res$pass_ids) > 0) {
summ <- res$per_group %>%
group_by(group) %>%
summarise(n_srna_pass = n_distinct(srna_id), .groups = "drop") %>%
mutate(dataset = dname)
filter_summary_list[[dname]] <- summ
}
}
}
all_filter_summary <- bind_rows(filter_summary_list)
ggplot(all_filter_summary, aes(x = group, y = n_srna_pass, fill = dataset)) +
geom_col() +
facet_wrap(~dataset, scales = "free_x", ncol = 3) +
labs(x = "Group", y = "Number of passing sRNAs", title = "sRNAs with stable expression") +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")Fan_2023 had to be dropped off from the analysis as the pipeline didnt detected any sRNAs The methodology mentioned in the paper states that ” mRNA was purified from total RNA using poly(T) oligonucleotide-attached magnetic beads” which would only retain RNA sequences with poly-A tail (eukaryotic/human mRNAs).
blen_data <- NULL
if ("Blenkiron_2016" %in% names(meta_list)) {
meta <- meta_list$Blenkiron_2016
counts <- load_counts_long(meta)
# Relaxed filter: keep sRNAs with mean RPKM > 0.6 in ANY library size
sRNA_keep <- counts %>%
left_join(meta %>% select(sample_id, library_size), by = "sample_id") %>%
filter(library_size %in% c("small", "medium", "large")) %>%
group_by(srna_id, library_size) %>%
summarise(mean_expr = mean(RPKM, na.rm = TRUE), .groups = "drop") %>%
group_by(srna_id) %>%
filter(any(mean_expr > 0.6)) %>%
pull(srna_id)
# Median per library size (small / medium / large)
med <- counts %>%
filter(srna_id %in% sRNA_keep) %>%
left_join(meta %>% select(sample_id, library_size), by = "sample_id") %>%
filter(library_size %in% c("small", "medium", "large")) %>%
group_by(srna_id, library_size) %>%
summarise(median_RPKM = median(RPKM, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = library_size, values_from = median_RPKM) %>%
mutate(across(where(is.numeric), ~ replace_na(., 0.01))) # avoid zeros for log scales
blen_data <- med
}
p_blen <- if (!is.null(blen_data) && nrow(blen_data) > 0) {
# Build pairwise comparisons: small–large, medium–large, small–medium
blen_pairs <- bind_rows(
blen_data %>%
transmute(x = small, y = large, comparison = "Small vs Large"),
blen_data %>%
transmute(x = medium, y = large, comparison = "Medium vs Large"),
blen_data %>%
transmute(x = small, y = medium, comparison = "Small vs Medium")
)
ggplot(blen_pairs, aes(x = x, y = y)) +
geom_point(alpha = 0.6, color = "#2ca02c") +
scale_x_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
scale_y_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
labs(
title = "Blenkiron_2016: Pairwise library-size comparisons",
x = "Median RPKM (X library)",
y = "Median RPKM (Y library)"
) +
geom_abline(slope = 1, linetype = "dashed") +
facet_wrap(~ comparison)
} else {
ggplot() + labs(title = "Blenkiron_2016: No Data")
}
p_blensahr2_data <- NULL
if ("Sahr_2022_2" %in% names(meta_list)) {
meta <- meta_list$Sahr_2022_2
counts <- load_counts_long(meta)
sRNA_keep <- counts %>%
left_join(meta %>% select(sample_id, condition), by = "sample_id") %>%
group_by(srna_id, condition) %>%
summarise(mean_expr = mean(RPKM, na.rm = TRUE), .groups = "drop") %>%
group_by(srna_id) %>%
filter(any(mean_expr > 0.6)) %>%
pull(srna_id)
med <- counts %>%
filter(srna_id %in% sRNA_keep) %>%
left_join(meta %>% select(sample_id, condition), by = "sample_id") %>%
group_by(srna_id, condition) %>%
summarise(median_RPKM = median(RPKM, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = condition, values_from = median_RPKM) %>%
mutate(across(where(is.numeric), ~replace_na(., 0.01)))
sahr2_data <- med
}
p_sahr2 <- if (!is.null(sahr2_data) && nrow(sahr2_data) > 0) {
ggplot(sahr2_data, aes(x = RNAseq, y = MVseq)) +
geom_point(alpha = 0.6, color = "#d62728") +
scale_x_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
scale_y_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
labs(title = "Sahr_2022_2: EV vs Total", x = "Total RNA (RPKM)", y = "EV RNA (RPKM)") +
geom_abline(slope = 1, linetype = "dashed")
} else { ggplot() + labs(title = "Sahr_2022_2: No Data") }
p_sahr2mala_data_list <- list()
if ("Malabirade_2018" %in% names(meta_list)) {
meta <- meta_list$Malabirade_2018
counts <- load_counts_long(meta)
meta_proc <- meta %>%
mutate(
type = case_when(
str_detect(condition, "OMV-RNAs") ~ "OMV",
str_detect(condition, "intracellularRNAs") ~ "Intracellular",
TRUE ~ NA_character_
),
base_condition = condition %>%
str_remove("_OMV-RNAs") %>%
str_remove("_intracellularRNAs") %>%
str_remove(" ") %>% # Remove stray spaces
str_trim()
) %>%
filter(!is.na(type))
conditions <- unique(meta_proc$base_condition)
for (cond in conditions) {
# Subset metadata for this condition
sub_meta <- meta_proc %>% filter(base_condition == cond)
# Mean > 0.6
sRNA_keep <- counts %>%
filter(sample_id %in% sub_meta$sample_id) %>%
left_join(sub_meta %>% select(sample_id, type), by = "sample_id") %>%
group_by(srna_id, type) %>%
summarise(mean_expr = mean(RPKM, na.rm = TRUE), .groups = "drop") %>%
group_by(srna_id) %>%
filter(any(mean_expr > 0.6)) %>%
pull(srna_id)
if (length(sRNA_keep) > 0) {
med <- counts %>%
filter(sample_id %in% sub_meta$sample_id) %>%
filter(srna_id %in% sRNA_keep) %>%
left_join(sub_meta %>% select(sample_id, type), by = "sample_id") %>%
group_by(srna_id, type) %>%
summarise(median_RPKM = median(RPKM, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = type, values_from = median_RPKM) %>%
mutate(across(where(is.numeric), ~replace_na(., 0.01))) %>%
mutate(condition = cond) %>%
left_join(srna_lengths, by = "srna_id")
mala_data_list[[cond]] <- med
}
}
}
if (length(mala_data_list) > 0) {
all_mala <- bind_rows(mala_data_list)
# Annotate with growth metadata for downstream plots
cond_annotations <- meta_proc %>%
distinct(base_condition, growth_medium, growth_phase, rna_sample_type)
all_mala <- all_mala %>%
left_join(cond_annotations, by = c("condition" = "base_condition"))
ggplot(all_mala, aes(x = Intracellular, y = OMV, color = seq_len)) +
geom_point(alpha = 0.6, size = 2) +
scale_color_gradient(low = "blue", high = "red", na.value = "grey70") +
scale_x_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
scale_y_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
geom_abline(slope = 1, linetype = "dashed") +
facet_wrap(~condition, scales = "fixed") +
labs(title = "Malabirade_2018: OMV vs Intracellular by Condition",
x = "Intracellular (RPKM)", y = "OMV (RPKM)", color = "sRNA length (nt)") +
theme(strip.text = element_text(size = 8))
} else {
ggplot() + labs(title = "Malabirade_2018: No Data")
}# Check if the data frame exists from the previous chunk
if (exists("all_mala")) {
# Get unique conditions (facets)
conditions <- unique(all_mala$condition)
for (cond in conditions) {
# Filter for sRNAs enriched in OMV (OMV > Intracellular)
omv_enriched <- all_mala %>%
filter(condition == cond) %>%
filter(OMV > Intracellular) %>%
select(srna_id, seq_len, OMV, Intracellular) %>%
distinct() %>% # Deduplicate entries created by the metadata join
arrange(desc(OMV))
# Create a Markdown header for each condition
cat(paste("\n####", cond, "\n\n"))
# Print the table if data exists
if (nrow(omv_enriched) > 0) {
print(knitr::kable(omv_enriched, caption = paste("sRNAs with OMV > Intracellular in", cond)))
} else {
cat("No sRNAs found with higher OMV expression in this condition.\n")
}
# Add newlines for spacing between tables
cat("\n")
}
}| srna_id | seq_len | OMV | Intracellular |
|---|---|---|---|
| sRNA-Sente-37 | 46 | 35.45908 | 11.82353 |
| srna_id | seq_len | OMV | Intracellular |
|---|---|---|---|
| sRNA-Sente-37 | 46 | 139.1449 | 4.086566 |
| srna_id | seq_len | OMV | Intracellular |
|---|---|---|---|
| sRNA-Sente-36 | 108 | 8405.86870 | 438.5577361 |
| sRNA-Styph-1 | 296 | 6145.35312 | 10.5823070 |
| sRNA-Sente-46 | 266 | 3876.27595 | 26.6404897 |
| sRNA-Sente-52 | 1236 | 2627.36197 | 410.7585078 |
| sRNA-Sente-33 | 82 | 1965.82992 | 1135.2027459 |
| sRNA-Ecoli-61 | 106 | 1538.10064 | 38.0156916 |
| sRNA-Sflex-21 | 105 | 1511.47483 | 37.6797771 |
| sRNA-Sente-5 | 32 | 1384.16287 | 261.9766245 |
| sRNA-Sente-35 | 90 | 924.38397 | 124.7045432 |
| sRNA-Ecoli-46 | 93 | 788.35502 | 8.9652078 |
| sRNA-Sflex-15 | 92 | 780.72819 | 8.9504852 |
| sRNA-Sente-50 | 191 | 720.88289 | 309.8276334 |
| sRNA-Sflex-41 | 94 | 582.97100 | 79.9381348 |
| manual_IsrA | NA | 578.47832 | 0.5131556 |
| sRNA-Sente-41 | 146 | 564.70392 | 406.5214043 |
| sRNA-Ecoli-60 | 73 | 491.71249 | 141.3234982 |
| sRNA-Ecoli-13 | 84 | 474.64201 | 362.3001297 |
| sRNA-Sente-1 | 144 | 467.73287 | 122.0943470 |
| sRNA-Sente-44 | 144 | 467.73287 | 122.0943470 |
| sRNA-Ecoli-103 | 114 | 443.73451 | 136.1333928 |
| sRNA-Sflex-6 | 114 | 443.73451 | 136.1333928 |
| sRNA-Sente-51 | 284 | 402.64798 | 38.8113518 |
| sRNA-Ecoli-70 | 95 | 402.30762 | 63.8035681 |
| sRNA-Sflex-23 | 79 | 385.80073 | 147.6851742 |
| sRNA-Ecoli-64 | 79 | 357.78880 | 159.8833597 |
| sRNA-Ecoli-17 | 246 | 355.48712 | 29.4636840 |
| sRNA-Sflex-3 | 245 | 355.48712 | 29.4636840 |
| sRNA-Sente-49 | 85 | 349.02792 | 16.3039797 |
| sRNA-Sflex-46 | 109 | 263.61919 | 188.3603141 |
| sRNA-Ecoli-86 | 111 | 263.03041 | 190.3157689 |
| sRNA-Sente-55 | 486 | 254.93458 | 127.3269480 |
| sRNA-Ecoli-87 | 57 | 234.16588 | 63.3461872 |
| sRNA-Sflex-47 | 57 | 234.16588 | 63.3461872 |
| sRNA-Sente-28 | 113 | 200.04483 | 36.4781997 |
| sRNA-Sente-37 | 46 | 183.23240 | 4.0336417 |
| sRNA-Sente-13 | 162 | 180.74064 | 127.6364763 |
| sRNA-Ecoli-2 | 105 | 161.98028 | 44.5074169 |
| sRNA-Sflex-5 | 87 | 151.47334 | 20.7369153 |
| sRNA-Sente-25 | 211 | 134.01291 | 128.7029000 |
| sRNA-Ecoli-52 | 88 | 124.07863 | 18.8340761 |
| sRNA-Sflex-33 | 88 | 124.07863 | 18.4647805 |
| sRNA-Sente-23 | 119 | 108.63634 | 87.9016456 |
| sRNA-Sflex-13 | 77 | 102.32459 | 31.5357441 |
| sRNA-Sente-48 | 196 | 99.52849 | 80.8043643 |
| sRNA-Ecoli-49 | 184 | 93.69258 | 14.6060289 |
| srna_id | seq_len | OMV | Intracellular |
|---|---|---|---|
| sRNA-Ecoli-13 | 84 | 46.593704 | 36.861068 |
| sRNA-Sente-37 | 46 | 33.538715 | 1.304677 |
| sRNA-Styph-1 | 296 | 27.037588 | 9.544384 |
| sRNA-Sente-46 | 266 | 20.178142 | 9.568794 |
| sRNA-Ecoli-46 | 93 | 9.323033 | 8.858069 |
| sRNA-Sflex-15 | 92 | 9.272199 | 8.993306 |
| manual_IsrA | NA | 3.743399 | 0.000000 |
| srna_id | seq_len | OMV | Intracellular |
|---|---|---|---|
| sRNA-Ecoli-13 | 84 | 6266.6253 | 146.18883 |
| sRNA-Sente-37 | 46 | 282.3882 | 17.99624 |
| sRNA-Sente-48 | 196 | 105.1909 | 47.86915 |
# Save OMV-enriched tables to files
if (exists("all_mala")) {
output_dir <- path_from_root("explore/summary/output")
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
conditions <- unique(all_mala$condition)
for (cond in conditions) {
omv_enriched <- all_mala %>%
filter(condition == cond) %>%
filter(OMV > Intracellular) %>%
select(srna_id, seq_len, OMV, Intracellular) %>%
distinct() %>%
arrange(desc(OMV))
# Write CSV per condition
fn <- file.path(output_dir, paste0("omv_enriched_", cond, ".csv"))
readr::write_csv(omv_enriched, fn)
}
}# List sRNAs with OMV RPKM >= 10 per condition (export for downstream analysis)
if (exists("all_mala")) {
output_dir <- path_from_root("explore/summary/output")
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
# Use the same wide table created in the scatter plot chunk; rebuild if missing
if (!exists("mala_wide")) {
cond_ctrl <- "ControlPCN_HighOD"
cond_spi1 <- "SPI-1ind-LB_HighOD"
cond_spi2 <- "SPI-2ind_HighOD"
cond_ctrl_low <- "ControlPCN_LowOD"
cond_spi2_low <- "SPI-2ind_LowOD"
required_conditions <- c(cond_ctrl, cond_spi1, cond_spi2, cond_ctrl_low, cond_spi2_low)
mala_base <- all_mala %>%
filter(condition %in% required_conditions) %>%
select(srna_id, seq_len, condition, Intracellular, OMV)
mala_wide <- mala_base %>%
group_by(srna_id, seq_len, condition) %>%
summarise(
Intracellular = median(Intracellular, na.rm = TRUE),
OMV = median(OMV, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_wider(
id_cols = c(srna_id, seq_len),
names_from = condition,
values_from = c(Intracellular, OMV),
names_glue = "{.value}_{condition}"
)
}
# Long table with both OMV and Intracellular per condition
mala_long <- mala_wide %>%
pivot_longer(
cols = -c(srna_id, seq_len),
names_to = c("type", "condition"),
names_pattern = "(Intracellular|OMV)_(.*)",
values_to = "RPKM"
)
omv_candidates <- mala_long %>%
filter(type == "OMV", RPKM >= 10) %>%
select(srna_id, seq_len, condition, OMV_RPKM = RPKM) %>%
left_join(
mala_long %>%
filter(type == "Intracellular") %>%
select(srna_id, seq_len, condition, Intracellular_RPKM = RPKM),
by = c("srna_id", "seq_len", "condition")
) %>%
arrange(condition, desc(OMV_RPKM))
# Save combined list
readr::write_csv(omv_candidates, file.path(output_dir, "omv_candidates_rpkm_ge10.csv"))
# Show a concise preview
head_preview <- omv_candidates %>%
group_by(condition) %>%
slice_max(order_by = OMV_RPKM, n = 10, with_ties = FALSE) %>%
ungroup()
knitr::kable(head_preview, caption = "Top OMV-packed sRNAs (RPKM >= 10) by condition")
}| srna_id | seq_len | condition | OMV_RPKM | Intracellular_RPKM |
|---|---|---|---|---|
| manual_10Sa | NA | ControlPCN_HighOD | 180.54065 | 6.543296e+05 |
| sRNA-Sflex-59 | 456 | ControlPCN_HighOD | 158.27740 | 5.254134e+05 |
| sRNA-Sente-36 | 108 | ControlPCN_HighOD | 137.29876 | 3.758984e+02 |
| sRNA-Paeru-10 | 209 | ControlPCN_HighOD | 91.48866 | 2.943083e+05 |
| sRNA-Apleu-10 | 176 | ControlPCN_HighOD | 87.36320 | 6.110659e+02 |
| sRNA-Sente-52 | 1236 | ControlPCN_HighOD | 62.79487 | 4.475461e+02 |
| sRNA-Ecoli-13 | 84 | ControlPCN_HighOD | 46.59370 | 3.686107e+01 |
| sRNA-Sente-29 | 109 | ControlPCN_HighOD | 33.74556 | 2.775994e+02 |
| manual_CsrC | NA | ControlPCN_HighOD | 33.56022 | 2.219529e+04 |
| sRNA-Sente-37 | 46 | ControlPCN_HighOD | 33.53872 | 1.304677e+00 |
| manual_10Sa | NA | ControlPCN_LowOD | 413.62867 | 4.132594e+05 |
| sRNA-Sflex-59 | 456 | ControlPCN_LowOD | 328.70732 | 3.286413e+05 |
| sRNA-Sflex-60 | 184 | ControlPCN_LowOD | 184.28206 | 4.696713e+04 |
| sRNA-Paeru-10 | 209 | ControlPCN_LowOD | 123.74163 | 1.261786e+05 |
| sRNA-Sente-29 | 109 | ControlPCN_LowOD | 97.31942 | 7.213708e+02 |
| manual_SsrS | NA | ControlPCN_LowOD | 93.35581 | 2.438831e+04 |
| sRNA-Sente-34 | 62 | ControlPCN_LowOD | 40.93169 | 3.841463e+03 |
| sRNA-Sente-37 | 46 | ControlPCN_LowOD | 35.45908 | 1.182353e+01 |
| sRNA-Sente-32 | 53 | ControlPCN_LowOD | 16.31676 | 7.743698e+02 |
| manual_CsrC | NA | ControlPCN_LowOD | 15.18939 | 1.064690e+04 |
| sRNA-Sente-36 | 108 | SPI-1ind-LB_HighOD | 8405.86870 | 4.385577e+02 |
| sRNA-Styph-1 | 296 | SPI-1ind-LB_HighOD | 6145.35312 | 1.058231e+01 |
| sRNA-Sente-46 | 266 | SPI-1ind-LB_HighOD | 3876.27595 | 2.664049e+01 |
| manual_10Sa | NA | SPI-1ind-LB_HighOD | 3519.38426 | 1.324867e+06 |
| sRNA-Sflex-59 | 456 | SPI-1ind-LB_HighOD | 3333.01371 | 1.065598e+06 |
| sRNA-Paeru-10 | 209 | SPI-1ind-LB_HighOD | 2939.72955 | 2.347986e+05 |
| sRNA-Sente-52 | 1236 | SPI-1ind-LB_HighOD | 2627.36197 | 4.107585e+02 |
| sRNA-Sente-33 | 82 | SPI-1ind-LB_HighOD | 1965.82992 | 1.135203e+03 |
| sRNA-Sflex-60 | 184 | SPI-1ind-LB_HighOD | 1887.11978 | 6.176864e+04 |
| sRNA-Ecoli-61 | 106 | SPI-1ind-LB_HighOD | 1538.10064 | 3.801569e+01 |
| manual_10Sa | NA | SPI-2ind_HighOD | 11648.83476 | 6.199890e+05 |
| sRNA-Sflex-59 | 456 | SPI-2ind_HighOD | 9320.62878 | 4.938366e+05 |
| sRNA-Ecoli-13 | 84 | SPI-2ind_HighOD | 6266.62528 | 1.461888e+02 |
| sRNA-Paeru-10 | 209 | SPI-2ind_HighOD | 5083.34974 | 2.084876e+05 |
| sRNA-Sflex-60 | 184 | SPI-2ind_HighOD | 978.25380 | 6.537325e+04 |
| manual_SsrS | NA | SPI-2ind_HighOD | 495.56124 | 3.702340e+04 |
| sRNA-Sente-37 | 46 | SPI-2ind_HighOD | 282.38818 | 1.799624e+01 |
| manual_CsrC | NA | SPI-2ind_HighOD | 273.16379 | 1.917958e+04 |
| sRNA-Sente-29 | 109 | SPI-2ind_HighOD | 176.22329 | 2.518915e+02 |
| sRNA-Sente-52 | 1236 | SPI-2ind_HighOD | 164.10322 | 2.584162e+02 |
| manual_10Sa | NA | SPI-2ind_LowOD | 3878.13087 | 3.681235e+05 |
| sRNA-Sflex-59 | 456 | SPI-2ind_LowOD | 3280.31683 | 3.317412e+05 |
| sRNA-Paeru-10 | 209 | SPI-2ind_LowOD | 1792.78847 | 1.072380e+05 |
| sRNA-Sflex-60 | 184 | SPI-2ind_LowOD | 1213.94850 | 7.409502e+04 |
| manual_SsrS | NA | SPI-2ind_LowOD | 611.16085 | 3.802551e+04 |
| sRNA-Sente-29 | 109 | SPI-2ind_LowOD | 540.78830 | 1.286075e+03 |
| sRNA-Sente-34 | 62 | SPI-2ind_LowOD | 168.39499 | 3.445253e+03 |
| sRNA-Sente-37 | 46 | SPI-2ind_LowOD | 139.14487 | 4.086566e+00 |
| manual_CsrC | NA | SPI-2ind_LowOD | 85.24838 | 6.005684e+03 |
| sRNA-Sente-52 | 1236 | SPI-2ind_LowOD | 67.12749 | 1.568392e+02 |
To establish the baseline profile of “normal” sRNA packaging, we examine both ControlPCN_LowOD (log phase) and ControlPCN_HighOD (stationary phase).
sRNA-Sente-37 (This sRNA is
enriched across all conditions)Does general stress increase RNA expression?
Yes.
Does general stress increase selective RNA packaging into
OMVs?
Yes.
We compare SPI-1ind-LB_HighOD against ControlPCN_HighOD.
Biological context:
SPI-1 induction in LB medium mimics the early intestinal infection stage
by activating Salmonella Pathogenicity Island 1.
Key observations: - A greater number of points fall left of the diagonal compared to the control, indicating more sRNAs enriched in OMVs during the invasion-like condition. - Overall expression of sRNAs increases in both intracellular and OMV fractions compared to the control. - These OMV-enriched sRNAs may represent candidates involved in the intestinal invasion phase.
Does sRNA expression increase during intestinal
invasion?
Yes.
Does selective sRNA packaging increase during invasion
compared to the control?
Yes.
Biological context:
SPI-2 medium mimics the intracellular macrophage environment by inducing
SPI-2 genes under acidic (pH 5.8), phosphate-limited PCN conditions.
Key observations: - Three sRNAs are located to the left of the diagonal, indicating OMV-specific enrichment relative to intracellular abundance. - These may represent candidates important for Salmonella survival or interaction with host (human) pathways during the macrophage phase.
This figure combines sRNA detection summary, UMAP embedding, and scatter plots as requested.
library(umap)
# ============================================================
# PANEL A: sRNA Detection Summary by Dataset
# ============================================================
# Count sRNAs detected per dataset (using existing pass_ids_list or recalculate)
detection_summary <- tibble(
dataset = character(),
n_srna = integer()
)
dataset_order <- c("Blenkiron_2016", "Sahr_2022_2", "Malabirade_2018")
for (dname in dataset_order) {
if (dname %in% names(meta_list) && !is.null(meta_list[[dname]])) {
meta <- meta_list[[dname]]
counts <- load_counts_long(meta)
gcol <- group_cols[[dname]]
if (nrow(counts) > 0 && gcol %in% names(meta)) {
# Count sRNAs with mean RPKM > 0.6 in ANY group (aligned filter)
n_detected <- counts %>%
left_join(meta %>% select(sample_id, group = .data[[gcol]]), by = "sample_id") %>%
group_by(srna_id, group) %>%
summarise(mean_expr = mean(RPKM, na.rm = TRUE), .groups = "drop") %>%
group_by(srna_id) %>%
filter(any(mean_expr > 0.6)) %>%
pull(srna_id) %>%
n_distinct()
detection_summary <- bind_rows(detection_summary,
tibble(dataset = dname, n_srna = n_detected))
}
}
}
# Create compact bar chart - use short dataset names
detection_summary <- detection_summary %>%
mutate(dataset_short = case_when(
dataset == "Blenkiron_2016" ~ "Blenkiron",
dataset == "Sahr_2022_2" ~ "Sahr",
dataset == "Malabirade_2018" ~ "Malabirade",
TRUE ~ dataset
))
fig1_panel_a <- ggplot(detection_summary, aes(x = factor(dataset_short, levels = c("Blenkiron", "Sahr", "Malabirade")), y = n_srna)) +
geom_col(fill = "#4C78A8", width = 0.6) +
geom_text(aes(label = n_srna), vjust = -0.3, size = 2.2) +
labs(x = NULL, y = "sRNAs detected", title = "sRNA Detection") +
scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
theme_bw(base_size = 7) +
theme(
plot.title = element_text(size = 7, face = "bold", hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
axis.text.y = element_text(size = 5),
axis.title.y = element_text(size = 6),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = margin(2, 2, 2, 2)
)
# ============================================================
# PANEL B: Combined UMAP of All Samples
# ============================================================
# Color by RNA fraction type (OMV vs Intracellular)
# Shape by study name
# Build sample × sRNA expression matrix from all datasets
all_samples_list <- list()
for (dname in dataset_order) {
if (dname %in% names(meta_list) && !is.null(meta_list[[dname]])) {
meta <- meta_list[[dname]]
counts <- load_counts_long(meta)
gcol <- group_cols[[dname]]
if (nrow(counts) > 0 && gcol %in% names(meta)) {
# Determine RNA fraction type based on dataset
# Different datasets encode this differently
if (dname == "Malabirade_2018" && "rna_sample_type" %in% names(meta)) {
meta_fraction <- meta %>%
mutate(fraction = case_when(
str_detect(rna_sample_type, "OMV") ~ "OMV",
str_detect(rna_sample_type, "intracellular") ~ "Intracellular",
TRUE ~ "Unknown"
))
} else if (dname == "Sahr_2022_2" && "condition" %in% names(meta)) {
# MVseq = EV/OMV fraction, RNAseq = Total/Intracellular
meta_fraction <- meta %>%
mutate(fraction = case_when(
condition == "MVseq" ~ "OMV",
condition == "RNAseq" ~ "Intracellular",
TRUE ~ "Unknown"
))
} else if (dname == "Blenkiron_2016") {
# Blenkiron_2016: ALL samples are from MVs/OMVs (E. coli)
# The small/medium/large refers to RNA size fractionation WITHIN vesicles
meta_fraction <- meta %>%
mutate(fraction = "OMV (size-fractionated)")
} else {
# Default: use grouping column as fraction
meta_fraction <- meta %>%
mutate(fraction = .data[[gcol]])
}
# Add dataset and fraction info
sample_data <- counts %>%
left_join(meta_fraction %>% select(sample_id, fraction), by = "sample_id") %>%
mutate(dataset = dname)
all_samples_list[[dname]] <- sample_data
}
}
}
all_samples_long <- bind_rows(all_samples_list)
# Create wide matrix for UMAP (samples × sRNAs)
if (nrow(all_samples_long) > 0) {
# Pivot to wide format
expr_wide <- all_samples_long %>%
select(sample_id, srna_id, RPKM, dataset, fraction) %>%
distinct(sample_id, srna_id, .keep_all = TRUE) %>%
pivot_wider(
id_cols = c(sample_id, dataset, fraction),
names_from = srna_id,
values_from = RPKM,
values_fill = 0
)
# Extract metadata and expression matrix
sample_meta <- expr_wide %>% select(sample_id, dataset, fraction)
expr_matrix <- expr_wide %>% select(-sample_id, -dataset, -fraction) %>% as.matrix()
# Log transform for UMAP
expr_log <- log1p(expr_matrix)
# Run UMAP
set.seed(42)
umap_result <- umap::umap(expr_log, n_neighbors = min(15, nrow(expr_log) - 1))
# Create UMAP data frame
umap_df <- tibble(
UMAP1 = umap_result$layout[, 1],
UMAP2 = umap_result$layout[, 2],
sample_id = sample_meta$sample_id,
dataset = sample_meta$dataset,
fraction = sample_meta$fraction
)
# Simplify fraction labels for cleaner legend
umap_df <- umap_df %>%
mutate(
fraction_clean = case_when(
fraction == "OMV" ~ "OMV",
fraction == "Intracellular" ~ "Intracellular",
fraction == "OMV (size-fractionated)" ~ "OMV (size-frac.)",
TRUE ~ fraction
)
)
# Create compact UMAP plot - Color by fraction, Shape by study
# Use shorter study names for legend
umap_df <- umap_df %>%
mutate(study_short = case_when(
dataset == "Blenkiron_2016" ~ "Blenkiron",
dataset == "Sahr_2022_2" ~ "Sahr",
dataset == "Malabirade_2018" ~ "Malabirade",
TRUE ~ dataset
))
# Calculate axis limits for SQUARE plot (same approach as Malabirade UMAP)
x_range_b <- range(umap_df$UMAP1, na.rm = TRUE)
y_range_b <- range(umap_df$UMAP2, na.rm = TRUE)
x_center_b <- mean(x_range_b)
y_center_b <- mean(y_range_b)
x_span_b <- diff(x_range_b)
y_span_b <- diff(y_range_b)
max_span_b <- max(x_span_b, y_span_b) * 0.55 # half-span with padding
fig1_panel_b <- ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = fraction_clean, shape = study_short)) +
geom_point(size = 1.2, alpha = 0.8) +
scale_color_manual(
values = c(
"OMV" = "#E45756",
"Intracellular" = "#4C78A8",
"OMV (size-frac.)" = "#F28E2B"
),
name = "Fraction"
) +
scale_shape_manual(
values = c(
"Blenkiron" = 16,
"Sahr" = 17,
"Malabirade" = 18
),
name = "Study"
) +
coord_fixed(ratio = 1,
xlim = c(x_center_b - max_span_b, x_center_b + max_span_b),
ylim = c(y_center_b - max_span_b, y_center_b + max_span_b)) +
labs(x = "UMAP 1", y = "UMAP 2", title = "All Studies UMAP") +
theme_bw(base_size = 7) +
theme(
plot.title = element_text(size = 7, face = "bold", hjust = 0.5),
legend.position = "right",
legend.text = element_text(size = 5.5),
legend.title = element_text(size = 6, face = "bold"),
legend.key.size = unit(0.3, "cm"),
legend.key.height = unit(0.35, "cm"),
legend.spacing.y = unit(0.05, "cm"),
legend.margin = margin(0, 0, 0, 0),
legend.box.margin = margin(0, 0, 0, -5),
axis.text = element_text(size = 5),
axis.title = element_text(size = 6),
plot.margin = margin(2, 2, 2, 2)
) +
guides(
color = guide_legend(order = 1, override.aes = list(size = 2)),
shape = guide_legend(order = 2, override.aes = list(size = 2))
)
} else {
fig1_panel_b <- ggplot() +
annotate("text", x = 0.5, y = 0.5, label = "No data for UMAP") +
theme_void()
}
print(fig1_panel_a)Since Malabirade_2018 has many conditions (OMV/Intracellular × growth phases × treatments), here’s a dedicated UMAP for clearer visualization.
# ============================================================
# MALABIRADE-SPECIFIC UMAP
# ============================================================
if ("Malabirade_2018" %in% names(meta_list) && !is.null(meta_list$Malabirade_2018)) {
meta_mala <- meta_list$Malabirade_2018
counts_mala <- load_counts_long(meta_mala)
if (nrow(counts_mala) > 0) {
# Add detailed condition info
mala_sample_data <- counts_mala %>%
left_join(
meta_mala %>% select(sample_id, condition, growth_medium, growth_phase, rna_sample_type),
by = "sample_id"
)
# Create wide matrix for UMAP
mala_wide <- mala_sample_data %>%
select(sample_id, srna_id, RPKM, growth_medium, growth_phase, rna_sample_type) %>%
distinct(sample_id, srna_id, .keep_all = TRUE) %>%
pivot_wider(
id_cols = c(sample_id, growth_medium, growth_phase, rna_sample_type),
names_from = srna_id,
values_from = RPKM,
values_fill = 0
)
# Extract metadata and matrix
mala_meta <- mala_wide %>% select(sample_id, growth_medium, growth_phase, rna_sample_type)
mala_matrix <- mala_wide %>% select(-sample_id, -growth_medium, -growth_phase, -rna_sample_type) %>% as.matrix()
# Log transform
mala_log <- log1p(mala_matrix)
# Run UMAP
set.seed(42)
mala_umap <- umap::umap(mala_log, n_neighbors = min(15, nrow(mala_log) - 1))
# Create UMAP data frame with detailed labels
mala_umap_df <- tibble(
UMAP1 = mala_umap$layout[, 1],
UMAP2 = mala_umap$layout[, 2],
sample_id = mala_meta$sample_id,
growth_medium = mala_meta$growth_medium,
growth_phase = mala_meta$growth_phase,
rna_sample_type = mala_meta$rna_sample_type
) %>%
mutate(
# Cleaner labels
fraction = case_when(
str_detect(rna_sample_type, "OMV") ~ "OMV",
str_detect(rna_sample_type, "intracellular") ~ "Intracellular",
TRUE ~ rna_sample_type
),
treatment = case_when(
str_detect(growth_medium, "Control") ~ "Control",
str_detect(growth_medium, "SPI-1") ~ "SPI-1",
str_detect(growth_medium, "SPI-2") ~ "SPI-2",
TRUE ~ growth_medium
),
phase = case_when(
growth_phase == "HighOD" ~ "Stationary",
growth_phase == "LowOD" ~ "Log phase",
TRUE ~ growth_phase
)
)
# Plot with cleaner aesthetics (full version for standalone)
mala_umap_plot <- ggplot(mala_umap_df, aes(x = UMAP1, y = UMAP2,
color = treatment,
shape = fraction)) +
geom_point(size = 3, alpha = 0.8) +
scale_color_manual(values = c("Control" = "#4C78A8", "SPI-1" = "#E45756", "SPI-2" = "#72B7B2")) +
scale_shape_manual(values = c("Intracellular" = 16, "OMV" = 17)) +
facet_wrap(~phase, ncol = 2) +
labs(
x = "UMAP 1",
y = "UMAP 2",
color = "Treatment",
shape = "Fraction",
title = "Malabirade_2018: Sample Clustering by Treatment and Fraction"
) +
theme_bw(base_size = 11) +
theme(
legend.position = "right",
strip.background = element_rect(fill = "grey90"),
strip.text = element_text(face = "bold")
)
print(mala_umap_plot)
# --- COMPACT VERSION FOR FIGURE 1 (Malabirade UMAP for top row) ---
# Create separate UMAP plots for each phase to match Panel C box dimensions
# Will be arranged HORIZONTALLY in top row next to Panel B
# Function to create a single UMAP box matching Panel C's scatter boxes
# Use FIXED axis limits for SQUARE plots (same span for X and Y)
make_umap_box <- function(data, title_text = NULL) {
# Calculate ranges from FULL data to ensure all points visible
x_range <- range(mala_umap_df$UMAP1, na.rm = TRUE)
y_range <- range(mala_umap_df$UMAP2, na.rm = TRUE)
# Find centers
x_center <- mean(x_range)
y_center <- mean(y_range)
# Use the LARGER span (with padding) for both axes to keep square
x_span <- diff(x_range)
y_span <- diff(y_range)
max_span <- max(x_span, y_span) * 0.55 # half-span with 10% padding
ggplot(data, aes(x = UMAP1, y = UMAP2, color = treatment, shape = fraction)) +
geom_point(size = 1.2, alpha = 0.8, stroke = 0) +
scale_color_manual(
values = c("Control" = "#4C78A8", "SPI-1" = "#E45756", "SPI-2" = "#72B7B2")
) +
scale_shape_manual(
values = c("Intracellular" = 16, "OMV" = 17)
) +
coord_fixed(ratio = 1,
xlim = c(x_center - max_span, x_center + max_span),
ylim = c(y_center - max_span, y_center + max_span)) +
theme_bw(base_size = 6) +
theme(
axis.title = element_blank(),
axis.text = element_text(size = 4),
axis.ticks = element_line(linewidth = 0.2),
axis.ticks.length = unit(0.5, "mm"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(linewidth = 0.15, color = "grey88"),
panel.border = element_rect(linewidth = 0.25),
plot.margin = margin(1, 1, 1, 1),
legend.position = "none"
)
}
# Create separate plots for each phase
mala_umap_log <- make_umap_box(mala_umap_df %>% filter(phase == "Log phase"))
mala_umap_stat <- make_umap_box(mala_umap_df %>% filter(phase == "Stationary"))
# Create a shared legend - two lines, centered
mala_legend_plot <- ggplot(mala_umap_df, aes(x = UMAP1, y = UMAP2, color = treatment, shape = fraction)) +
geom_point(size = 1.5) +
scale_color_manual(
values = c("Control" = "#4C78A8", "SPI-1" = "#E45756", "SPI-2" = "#72B7B2"),
name = "Treatment:"
) +
scale_shape_manual(
values = c("Intracellular" = 16, "OMV" = 17),
name = "Fraction:"
) +
theme_bw(base_size = 6) +
theme(
legend.position = "bottom",
legend.box = "vertical",
legend.box.just = "center",
legend.text = element_text(size = 5),
legend.title = element_text(size = 5.5, face = "bold"),
legend.key.size = unit(0.25, "cm"),
legend.margin = margin(0, 0, 0, 0),
legend.spacing.y = unit(0.02, "cm")
) +
guides(
color = guide_legend(order = 1, override.aes = list(size = 2), nrow = 1, title.position = "left"),
shape = guide_legend(order = 2, override.aes = list(size = 2), nrow = 1, title.position = "left")
)
# Extract legend
mala_legend_grob <- cowplot::get_legend(mala_legend_plot)
# Store for figure assembly
fig1_mala_umap_log <- mala_umap_log
fig1_mala_umap_stat <- mala_umap_stat
fig1_mala_umap_legend <- mala_legend_grob
# Save full version
output_dir <- path_from_root("explore/summary/output")
ggplot2::ggsave(
filename = file.path(output_dir, "umap_malabirade_detailed.png"),
plot = mala_umap_plot, width = 8, height = 6, dpi = 600
)
}
}# ============================================================
# PANEL D: Blenkiron Scatter Plots (compact, matching Panel C style)
# ============================================================
blen_scatter_data <- NULL
if ("Blenkiron_2016" %in% names(meta_list)) {
meta <- meta_list$Blenkiron_2016
counts <- load_counts_long(meta)
# Relaxed filter: keep sRNAs with mean RPKM > 0.6 in ANY library size
sRNA_keep <- counts %>%
left_join(meta %>% select(sample_id, library_size), by = "sample_id") %>%
filter(library_size %in% c("small", "medium", "large")) %>%
group_by(srna_id, library_size) %>%
summarise(mean_expr = mean(RPKM, na.rm = TRUE), .groups = "drop") %>%
group_by(srna_id) %>%
filter(any(mean_expr > 0.6)) %>%
pull(srna_id)
# Median per library size (small / medium / large)
blen_scatter_data <- counts %>%
filter(srna_id %in% sRNA_keep) %>%
left_join(meta %>% select(sample_id, library_size), by = "sample_id") %>%
filter(library_size %in% c("small", "medium", "large")) %>%
group_by(srna_id, library_size) %>%
summarise(median_RPKM = median(RPKM, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(names_from = library_size, values_from = median_RPKM) %>%
mutate(across(where(is.numeric), ~ replace_na(., 0.01)))
}
# Create compact scatter plots matching Panel C style
if (!is.null(blen_scatter_data) && nrow(blen_scatter_data) > 0) {
# Get axis limits
blen_vals <- c(blen_scatter_data$small, blen_scatter_data$medium, blen_scatter_data$large)
blen_axis_max <- 10^ceiling(log10(max(blen_vals, na.rm = TRUE)))
# Function to create compact Blenkiron scatter (matching make_scatter style)
make_blen_scatter <- function(data, x_col, y_col) {
plot_data <- data %>% filter(!is.na(.data[[x_col]]) & !is.na(.data[[y_col]]))
n_pts <- nrow(plot_data)
ggplot(plot_data, aes(x = .data[[x_col]], y = .data[[y_col]])) +
geom_point(alpha = 0.55, color = "#2ca02c", size = 0.5, stroke = 0) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50", linewidth = 0.3) +
annotate("text", x = 0.01, y = blen_axis_max * 0.6, label = paste0("n=", n_pts),
hjust = 0, vjust = 1, size = 1.8, color = "grey30") +
scale_x_continuous(trans = "log1p", breaks = c(0, 10, 100, 1000),
labels = c("0", "10", "100", "1k"), limits = c(0, blen_axis_max)) +
scale_y_continuous(trans = "log1p", breaks = c(0, 10, 100, 1000),
labels = c("0", "10", "100", "1k"), limits = c(0, blen_axis_max)) +
coord_fixed(ratio = 1) +
theme_bw(base_size = 6) +
theme(axis.title = element_blank(),
axis.text = element_text(size = 3),
axis.ticks = element_line(linewidth = 0.2),
axis.ticks.length = unit(0.5, "mm"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(linewidth = 0.15, color = "grey88"),
panel.border = element_rect(linewidth = 0.25),
plot.margin = margin(1, 1, 1, 1))
}
# Create three scatter plots
blen_p1 <- make_blen_scatter(blen_scatter_data, "small", "large")
blen_p2 <- make_blen_scatter(blen_scatter_data, "medium", "large")
blen_p3 <- make_blen_scatter(blen_scatter_data, "small", "medium")
# Store for figure assembly
fig1_blen_p1 <- blen_p1 # Small vs Large
fig1_blen_p2 <- blen_p2 # Medium vs Large
fig1_blen_p3 <- blen_p3 # Small vs Medium
}# ============================================================
# COMBINE INTO FIGURE 1 (Full A4)
# ============================================================
# First, we need the scatter plots from Panel C
# These will be created in the next chunk and combined hereif (exists("all_mala")) {
# Define condition names mapping to what is in all_mala$condition
cond_ctrl <- "ControlPCN_HighOD"
cond_spi1 <- "SPI-1ind-LB_HighOD"
cond_spi2 <- "SPI-2ind_HighOD"
cond_ctrl_low <- "ControlPCN_LowOD"
cond_spi2_low <- "SPI-2ind_LowOD"
output_dir <- path_from_root("explore/summary/output")
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
avail_conds <- unique(all_mala$condition)
required_conditions <- c(cond_ctrl, cond_spi1, cond_spi2, cond_ctrl_low, cond_spi2_low)
if (all(required_conditions %in% avail_conds)) {
# Prepare wide data
mala_wide <- all_mala %>%
filter(condition %in% required_conditions) %>%
select(srna_id, seq_len, condition, Intracellular, OMV) %>%
group_by(srna_id, seq_len, condition) %>%
summarise(Intracellular = median(Intracellular, na.rm = TRUE),
OMV = median(OMV, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(id_cols = c(srna_id, seq_len), names_from = condition,
values_from = c(Intracellular, OMV), names_glue = "{.value}_{condition}")
# Uniform axis limits
all_vals <- unlist(mala_wide[, grep("^(OMV|Intracellular)_", names(mala_wide))], use.names = FALSE)
axis_max <- 10^ceiling(log10(max(all_vals, na.rm = TRUE)))
# Ultra-compact scatter plot for 4x3 combined figure
make_scatter <- function(data, x_var, y_var) {
plot_data <- data %>% filter(!is.na(.data[[x_var]]) & !is.na(.data[[y_var]]))
n_pts <- nrow(plot_data)
ggplot(plot_data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
geom_point(alpha = 0.55, color = "#2166AC", size = 0.5, stroke = 0) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50", linewidth = 0.3) +
annotate("text", x = 0, y = axis_max * 0.6, label = paste0("n=", n_pts),
hjust = 0, vjust = 1, size = 1.8, color = "grey30") +
scale_x_continuous(trans = "log1p", breaks = c(0, 1000, 10000),
labels = c("0", "1k", "10k"), limits = c(0, axis_max)) +
scale_y_continuous(trans = "log1p", breaks = c(0, 1000, 10000),
labels = c("0", "1k", "10k"), limits = c(0, axis_max)) +
coord_fixed(ratio = 1) +
theme_bw(base_size = 6) +
theme(axis.title = element_blank(),
axis.text = element_text(size = 3),
axis.ticks = element_line(linewidth = 0.2),
axis.ticks.length = unit(0.5, "mm"),
panel.grid.minor = element_blank(),
panel.grid.major = element_line(linewidth = 0.15, color = "grey88"),
panel.border = element_rect(linewidth = 0.25),
plot.margin = margin(1, 1, 1, 1))
}
# Layout: 5 rows x 2 cols
# Row 1: Ctrl vs SPI-1 (Stationary) - treatment effect
# Row 2: Ctrl vs SPI-2 (Stationary) - treatment effect
# Row 3: Ctrl vs SPI-2 (Log phase) - treatment effect in log phase
# Row 4: Control: Stationary vs Log - growth phase effect on baseline
# Row 5: SPI-2: Stationary vs Log - growth phase effect on SPI-2
# Row 1: Ctrl vs SPI-1 (Stationary)
p1 <- make_scatter(mala_wide, paste0("Intracellular_", cond_ctrl),
paste0("Intracellular_", cond_spi1))
p2 <- make_scatter(mala_wide, paste0("OMV_", cond_ctrl),
paste0("OMV_", cond_spi1))
# Row 2: Ctrl vs SPI-2 (Stationary)
p3 <- make_scatter(mala_wide, paste0("Intracellular_", cond_ctrl),
paste0("Intracellular_", cond_spi2))
p4 <- make_scatter(mala_wide, paste0("OMV_", cond_ctrl),
paste0("OMV_", cond_spi2))
# Row 3: Ctrl vs SPI-2 (Log phase)
p5 <- make_scatter(mala_wide, paste0("Intracellular_", cond_ctrl_low),
paste0("Intracellular_", cond_spi2_low))
p6 <- make_scatter(mala_wide, paste0("OMV_", cond_ctrl_low),
paste0("OMV_", cond_spi2_low))
# Row 4: Control: Stationary vs Log (x = stationary, y = log)
p7 <- make_scatter(mala_wide, paste0("Intracellular_", cond_ctrl),
paste0("Intracellular_", cond_ctrl_low))
p8 <- make_scatter(mala_wide, paste0("OMV_", cond_ctrl),
paste0("OMV_", cond_ctrl_low))
# Row 5: SPI-2: Stationary vs Log (x = stationary, y = log)
p9 <- make_scatter(mala_wide, paste0("Intracellular_", cond_spi2),
paste0("Intracellular_", cond_spi2_low))
p10 <- make_scatter(mala_wide, paste0("OMV_", cond_spi2),
paste0("OMV_", cond_spi2_low))
# ============================================================
# COMBINED FIGURE: Panels A & B side by side in 4x3 inches
# ============================================================
# Row labels - full words, no shorthand
row_labs_a <- c("Stationary: Control vs SPI-1",
"Stationary: Control vs SPI-2",
"Log phase: Control vs SPI-2")
row_labs_b <- c("Control: Stationary vs Log",
"SPI-2: Stationary vs Log")
# Helper to make a row (label on top, 2 plots below)
make_row <- function(lab, p_left, p_right, lab_size = 4.5) {
plots <- cowplot::plot_grid(p_left, p_right, ncol = 2)
label <- cowplot::ggdraw() + cowplot::draw_label(lab, size = lab_size)
cowplot::plot_grid(label, plots, ncol = 1, rel_heights = c(0.12, 1))
}
# --- PANEL A: Treatment Effects (3 rows) ---
header_a <- cowplot::plot_grid(
cowplot::ggdraw() + cowplot::draw_label("Intracellular", size = 5, fontface = "bold"),
cowplot::ggdraw() + cowplot::draw_label("OMV", size = 5, fontface = "bold"),
ncol = 2
)
title_a <- cowplot::ggdraw() +
cowplot::draw_label("1. Treatment Effects", size = 6, fontface = "bold", hjust = 0, x = 0.02, y = 0.3)
panel_a <- cowplot::plot_grid(
NULL,
title_a,
NULL,
header_a,
make_row(row_labs_a[1], p1, p2),
make_row(row_labs_a[2], p3, p4),
make_row(row_labs_a[3], p5, p6),
ncol = 1, rel_heights = c(0.02, 0.05, 0.05, 0.04, 1, 1, 1)
)
# --- PANEL B: Growth Phase Effects (2 rows) ---
header_b <- cowplot::plot_grid(
cowplot::ggdraw() + cowplot::draw_label("Intracellular", size = 5, fontface = "bold"),
cowplot::ggdraw() + cowplot::draw_label("OMV", size = 5, fontface = "bold"),
ncol = 2
)
title_b <- cowplot::ggdraw() +
cowplot::draw_label("2. Phase Effects", size = 6, fontface = "bold", hjust = 0, x = 0.02, y = 0.3)
panel_b <- cowplot::plot_grid(
NULL,
title_b,
NULL,
header_b,
make_row(row_labs_b[1], p7, p8),
make_row(row_labs_b[2], p9, p10),
NULL, # empty row to align with Panel A's 3 rows
ncol = 1, rel_heights = c(0.02, 0.05, 0.05, 0.04, 1, 1, 1)
)
# Combine panels side by side
main_grid <- cowplot::plot_grid(panel_a, panel_b, ncol = 2, rel_widths = c(1, 1))
# Shared axis labels
x_lab <- cowplot::ggdraw() + cowplot::draw_label("RPKM (condition 1)", size = 5, hjust = 0.5)
y_lab <- cowplot::ggdraw() + cowplot::draw_label("RPKM (condition 2)", size = 5, angle = 90)
# Assemble final plot
with_y <- cowplot::plot_grid(y_lab, main_grid, ncol = 2, rel_widths = c(0.03, 1))
combined_plot <- cowplot::plot_grid(with_y, x_lab, ncol = 1, rel_heights = c(1, 0.04))
print(combined_plot)
# Save scatter plot separately
ggplot2::ggsave(filename = file.path(output_dir, "scatter_combined.png"),
plot = combined_plot, width = 4, height = 3, dpi = 600)
# Store for Figure 1 assembly
panel_c_scatter <- combined_plot
} else {
message("Missing conditions. Available: ", paste(avail_conds, collapse = ", "))
panel_c_scatter <- NULL
}
}# ============================================================
# FIGURE 1: New Layout (7 x 6 inches)
# ============================================================
# Top row: A (bar) | B (Overall UMAP) | Malabirade UMAP (horizontal)
# Bottom row: C (Malabirade scatter) | D (Blenkiron scatter)
if (exists("fig1_panel_a") && exists("fig1_panel_b") && exists("panel_c_scatter") && !is.null(panel_c_scatter)) {
output_dir <- path_from_root("explore/summary/output")
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
# === TOP ROW: A + B + Malabirade UMAP ===
# Malabirade UMAP panel (horizontal: Log | Stationary)
if (exists("fig1_mala_umap_log") && exists("fig1_mala_umap_stat")) {
# Create column labels
lab_log <- cowplot::ggdraw() + cowplot::draw_label("Log", size = 5)
lab_stat <- cowplot::ggdraw() + cowplot::draw_label("Stationary", size = 5)
# Arrange horizontally: label on top of each plot
umap_log_col <- cowplot::plot_grid(lab_log, fig1_mala_umap_log, ncol = 1, rel_heights = c(0.1, 1))
umap_stat_col <- cowplot::plot_grid(lab_stat, fig1_mala_umap_stat, ncol = 1, rel_heights = c(0.1, 1))
# Combine the two columns horizontally
mala_umap_plots <- cowplot::plot_grid(umap_log_col, umap_stat_col, ncol = 2)
# Add title and legend
mala_title <- cowplot::ggdraw() +
cowplot::draw_label("Malabirade UMAP", size = 7, fontface = "bold", hjust = 0.5)
mala_umap_panel <- cowplot::plot_grid(
mala_title,
mala_umap_plots,
fig1_mala_umap_legend,
ncol = 1,
rel_heights = c(0.08, 1, 0.18)
)
} else {
mala_umap_panel <- cowplot::ggdraw() +
cowplot::draw_label("Malabirade UMAP\n(not available)", size = 6)
}
# Combine A + B + Malabirade UMAP
# Panel B has 1 square plot + legend, Panel C has 2 square plots + legend
# So C should be ~2x B's plot width (accounting for B's legend on right)
top_row <- cowplot::plot_grid(
fig1_panel_a, fig1_panel_b, mala_umap_panel,
ncol = 3,
rel_widths = c(0.13, 0.39, 0.48),
labels = c("A", "B", "C"),
label_size = 10,
label_fontface = "bold"
)
# === BOTTOM ROW: Malabirade scatter + Blenkiron scatter ===
# Blenkiron scatter panel
if (exists("fig1_blen_p1") && exists("fig1_blen_p2") && exists("fig1_blen_p3")) {
# Create row labels for each comparison
blen_lab1 <- cowplot::ggdraw() + cowplot::draw_label("Small vs Large", size = 4.5)
blen_lab2 <- cowplot::ggdraw() + cowplot::draw_label("Medium vs Large", size = 4.5)
blen_lab3 <- cowplot::ggdraw() + cowplot::draw_label("Small vs Medium", size = 4.5)
# Stack each label + plot
blen_row1 <- cowplot::plot_grid(blen_lab1, fig1_blen_p1, ncol = 1, rel_heights = c(0.12, 1))
blen_row2 <- cowplot::plot_grid(blen_lab2, fig1_blen_p2, ncol = 1, rel_heights = c(0.12, 1))
blen_row3 <- cowplot::plot_grid(blen_lab3, fig1_blen_p3, ncol = 1, rel_heights = c(0.12, 1))
# Stack vertically
blen_plots <- cowplot::plot_grid(blen_row1, blen_row2, blen_row3, ncol = 1)
# Add title
blen_title <- cowplot::ggdraw() +
cowplot::draw_label("Blenkiron: Size Fractions", size = 6, fontface = "bold", hjust = 0.5)
blen_panel <- cowplot::plot_grid(
blen_title,
blen_plots,
ncol = 1,
rel_heights = c(0.06, 1)
)
} else {
blen_panel <- cowplot::ggdraw() +
cowplot::draw_label("Blenkiron scatter\n(not available)", size = 6)
}
# Combine C (Malabirade scatter) + D (Blenkiron scatter)
panel_c_labeled <- cowplot::plot_grid(
panel_c_scatter,
ncol = 1,
labels = c("D"),
label_size = 10,
label_fontface = "bold"
)
panel_d_labeled <- cowplot::plot_grid(
blen_panel,
ncol = 1,
labels = c("E"),
label_size = 10,
label_fontface = "bold"
)
bottom_row <- cowplot::plot_grid(
panel_c_labeled, panel_d_labeled,
ncol = 2,
rel_widths = c(0.78, 0.22)
)
# === COMBINE TOP + BOTTOM ===
figure1 <- cowplot::plot_grid(
top_row,
bottom_row,
ncol = 1,
rel_heights = c(0.38, 0.62)
)
print(figure1)
# Save Figure 1
ggplot2::ggsave(
filename = file.path(output_dir, "Figure1_complete.png"),
plot = figure1, width = 7, height = 6, dpi = 600
)
ggplot2::ggsave(
filename = file.path(output_dir, "Figure1_complete.pdf"),
plot = figure1, width = 7, height = 6, dpi = 600
)
ggplot2::ggsave(
filename = file.path(output_dir, "Figure1_complete.svg"),
plot = figure1, width = 7, height = 6, dpi = 600
)
message("Figure 1 saved to: ", file.path(output_dir, "Figure1_complete.png"))
} else {
message("Required panels not available for Figure 1 assembly")
}Because the corresponding intracellular panel is stable, it may mean selective export/packaging preference of the sRNA during intestinal infection mimic stage.
if (exists("all_mala") && exists("mala_wide")) {
output_dir <- path_from_root("explore/summary/output")
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
# Pivot wide data back to long for filtering
mala_long <- mala_wide %>%
pivot_longer(
cols = -c(srna_id, seq_len),
names_to = c("type", "condition"),
names_pattern = "(Intracellular|OMV)_(.*)",
values_to = "RPKM"
)
# Filter sRNAs with OMV RPKM >= 10, include matched intracellular RPKM
omv_candidates <- mala_long %>%
filter(type == "OMV", RPKM >= 10) %>%
select(srna_id, seq_len, condition, OMV_RPKM = RPKM) %>%
left_join(
mala_long %>%
filter(type == "Intracellular") %>%
select(srna_id, seq_len, condition, Intracellular_RPKM = RPKM),
by = c("srna_id", "seq_len", "condition")
) %>%
arrange(condition, desc(OMV_RPKM))
# Save full list
readr::write_csv(omv_candidates, file.path(output_dir, "omv_candidates_rpkm_ge10.csv"))
# Preview: top 10 per condition
head_preview <- omv_candidates %>%
group_by(condition) %>%
slice_max(order_by = OMV_RPKM, n = 10, with_ties = FALSE) %>%
ungroup()
knitr::kable(head_preview, caption = "Top OMV-packed sRNAs (RPKM >= 10) by condition")
}| srna_id | seq_len | condition | OMV_RPKM | Intracellular_RPKM |
|---|---|---|---|---|
| manual_10Sa | NA | ControlPCN_HighOD | 180.54065 | 6.543296e+05 |
| sRNA-Sflex-59 | 456 | ControlPCN_HighOD | 158.27740 | 5.254134e+05 |
| sRNA-Sente-36 | 108 | ControlPCN_HighOD | 137.29876 | 3.758984e+02 |
| sRNA-Paeru-10 | 209 | ControlPCN_HighOD | 91.48866 | 2.943083e+05 |
| sRNA-Apleu-10 | 176 | ControlPCN_HighOD | 87.36320 | 6.110659e+02 |
| sRNA-Sente-52 | 1236 | ControlPCN_HighOD | 62.79487 | 4.475461e+02 |
| sRNA-Ecoli-13 | 84 | ControlPCN_HighOD | 46.59370 | 3.686107e+01 |
| sRNA-Sente-29 | 109 | ControlPCN_HighOD | 33.74556 | 2.775994e+02 |
| manual_CsrC | NA | ControlPCN_HighOD | 33.56022 | 2.219529e+04 |
| sRNA-Sente-37 | 46 | ControlPCN_HighOD | 33.53872 | 1.304677e+00 |
| manual_10Sa | NA | ControlPCN_LowOD | 413.62867 | 4.132594e+05 |
| sRNA-Sflex-59 | 456 | ControlPCN_LowOD | 328.70732 | 3.286413e+05 |
| sRNA-Sflex-60 | 184 | ControlPCN_LowOD | 184.28206 | 4.696713e+04 |
| sRNA-Paeru-10 | 209 | ControlPCN_LowOD | 123.74163 | 1.261786e+05 |
| sRNA-Sente-29 | 109 | ControlPCN_LowOD | 97.31942 | 7.213708e+02 |
| manual_SsrS | NA | ControlPCN_LowOD | 93.35581 | 2.438831e+04 |
| sRNA-Sente-34 | 62 | ControlPCN_LowOD | 40.93169 | 3.841463e+03 |
| sRNA-Sente-37 | 46 | ControlPCN_LowOD | 35.45908 | 1.182353e+01 |
| sRNA-Sente-32 | 53 | ControlPCN_LowOD | 16.31676 | 7.743698e+02 |
| manual_CsrC | NA | ControlPCN_LowOD | 15.18939 | 1.064690e+04 |
| sRNA-Sente-36 | 108 | SPI-1ind-LB_HighOD | 8405.86870 | 4.385577e+02 |
| sRNA-Styph-1 | 296 | SPI-1ind-LB_HighOD | 6145.35312 | 1.058231e+01 |
| sRNA-Sente-46 | 266 | SPI-1ind-LB_HighOD | 3876.27595 | 2.664049e+01 |
| manual_10Sa | NA | SPI-1ind-LB_HighOD | 3519.38426 | 1.324867e+06 |
| sRNA-Sflex-59 | 456 | SPI-1ind-LB_HighOD | 3333.01371 | 1.065598e+06 |
| sRNA-Paeru-10 | 209 | SPI-1ind-LB_HighOD | 2939.72955 | 2.347986e+05 |
| sRNA-Sente-52 | 1236 | SPI-1ind-LB_HighOD | 2627.36197 | 4.107585e+02 |
| sRNA-Sente-33 | 82 | SPI-1ind-LB_HighOD | 1965.82992 | 1.135203e+03 |
| sRNA-Sflex-60 | 184 | SPI-1ind-LB_HighOD | 1887.11978 | 6.176864e+04 |
| sRNA-Ecoli-61 | 106 | SPI-1ind-LB_HighOD | 1538.10064 | 3.801569e+01 |
| manual_10Sa | NA | SPI-2ind_HighOD | 11648.83476 | 6.199890e+05 |
| sRNA-Sflex-59 | 456 | SPI-2ind_HighOD | 9320.62878 | 4.938366e+05 |
| sRNA-Ecoli-13 | 84 | SPI-2ind_HighOD | 6266.62528 | 1.461888e+02 |
| sRNA-Paeru-10 | 209 | SPI-2ind_HighOD | 5083.34974 | 2.084876e+05 |
| sRNA-Sflex-60 | 184 | SPI-2ind_HighOD | 978.25380 | 6.537325e+04 |
| manual_SsrS | NA | SPI-2ind_HighOD | 495.56124 | 3.702340e+04 |
| sRNA-Sente-37 | 46 | SPI-2ind_HighOD | 282.38818 | 1.799624e+01 |
| manual_CsrC | NA | SPI-2ind_HighOD | 273.16379 | 1.917958e+04 |
| sRNA-Sente-29 | 109 | SPI-2ind_HighOD | 176.22329 | 2.518915e+02 |
| sRNA-Sente-52 | 1236 | SPI-2ind_HighOD | 164.10322 | 2.584162e+02 |
| manual_10Sa | NA | SPI-2ind_LowOD | 3878.13087 | 3.681235e+05 |
| sRNA-Sflex-59 | 456 | SPI-2ind_LowOD | 3280.31683 | 3.317412e+05 |
| sRNA-Paeru-10 | 209 | SPI-2ind_LowOD | 1792.78847 | 1.072380e+05 |
| sRNA-Sflex-60 | 184 | SPI-2ind_LowOD | 1213.94850 | 7.409502e+04 |
| manual_SsrS | NA | SPI-2ind_LowOD | 611.16085 | 3.802551e+04 |
| sRNA-Sente-29 | 109 | SPI-2ind_LowOD | 540.78830 | 1.286075e+03 |
| sRNA-Sente-34 | 62 | SPI-2ind_LowOD | 168.39499 | 3.445253e+03 |
| sRNA-Sente-37 | 46 | SPI-2ind_LowOD | 139.14487 | 4.086566e+00 |
| manual_CsrC | NA | SPI-2ind_LowOD | 85.24838 | 6.005684e+03 |
| sRNA-Sente-52 | 1236 | SPI-2ind_LowOD | 67.12749 | 1.568392e+02 |