We will standardize condition strings and split into growth_medium, growth_phase, rna_sample_type for clarity: explore/malabirade_2018/Malabirade_2018
sra_path <- path_from_root("data", "Malabirade_2018", "SraRunInfo.csv")
metadata_path <- path_from_root("results_malabirade_2018", "misc", "rnaseq_metadata.tsv")
metadata_enriched_path <- path_from_root("results_malabirade_2018", "misc", "rnaseq_metadata_enriched.tsv")
sra_lookup <- read_csv(sra_path, show_col_types = FALSE) %>%
transmute(
sample_id = Run,
raw_condition = str_squish(SampleName),
condition = raw_condition %>%
str_replace_all("_\\s+", "_") %>%
str_replace("HighOD _", "HighOD_") %>%
str_replace("LowOD-OMV", "LowOD_OMV") %>%
str_remove("_\\d+$")
)
metadata_enriched <- read_tsv(metadata_path, show_col_types = FALSE) %>%
select(-condition) %>%
left_join(sra_lookup, by = "sample_id") %>%
relocate(condition, .after = taxon_taxid) %>%
relocate(raw_condition, .after = condition) %>%
# Split condition into growth_medium, growth_phase, rna_sample_type
tidyr::separate(
condition,
into = c("growth_medium", "growth_phase", "rna_sample_type"),
sep = "_",
remove = FALSE,
extra = "merge",
fill = "right"
)
if (anyNA(metadata_enriched$condition)) {
stop("Some samples are missing condition annotations after the join.")
}
write_tsv(metadata_enriched, metadata_enriched_path)
# Sample counts by separated condition fields
metadata_enriched %>%
dplyr::count(growth_medium, growth_phase, rna_sample_type, name = "n_samples")The conditions are separated by the following:
growth_phase (2): LowOD (represents log phase optical density at 600nm for the culture medium. Absorbance: ControlPCN = 0.4, SPI-2ind = 0.4) and HighOD (represents stationary phase at OD[600nm]. Absorbance: ControlPCN =1.6, SPI-1ind=1.6, SPI-2ind= 0.8) rna_sample_type: OMV-RNAs and IntracellularRNAs. growth_medium: ControlPCN (minimal growth control phosphate-carbon-nitrogen medium), SPI-1ind-LB (Lysogeny broth medium which mimics intestinal infection by expressing Salmonella pathogenicity island 1), SPI-2ind (mimics macrophage environment by expressing Salmonella pathogenicity island 2 genes in acidic (pH =5.8) and phosphate-depleted PCN medium)
| Condition Name (from R) | RNA Source | Core Condition | Growth Medium | Growth Phase (OD) | Biological Purpose |
|---|---|---|---|---|---|
ControlPCN_LowOD_intracellularRNAs |
Intracellular | Control | PCN (pH 7.4) | Low OD (0.4) | Baseline control |
ControlPCN_LowOD_OMV-RNAs |
OMV | Control | PCN (pH 7.4) | Low OD (0.4) | Baseline control |
ControlPCN_HighOD_intracellularRNAs |
Intracellular | Control | PCN (pH 7.4) | High OD (1.6) | Baseline control |
ControlPCN_HighOD_OMV-RNAs |
OMV | Control | PCN (pH 7.4) | High OD (1.6) | Baseline control |
SPI-1ind-LB_HighOD_intracellularRNAs |
Intracellular | SPI-1 Inducing | LB | High OD (1.6) | Mimics intestinal infection |
SPI-1ind-LB_HighOD_OMV-RNAs |
OMV | SPI-1 Inducing | LB | High OD (1.6) | Mimics intestinal infection |
SPI-2ind_LowOD_intracellularRNAs |
Intracellular | SPI-2 Inducing | PCN (pH 5.8, low P) | Low OD (0.4) | Mimics macrophage environment |
SPI-2ind_LowOD_OMV-RNAs |
OMV | SPI-2 Inducing | PCN (pH 5.8, low P) | Low OD (0.4) | Mimics macrophage environment |
SPI-2ind_HighOD_intracellularRNAs |
Intracellular | SPI-2 Inducing | PCN (pH 5.8, low P) | High OD (0.8) | Mimics macrophage environment |
SPI-2ind_HighOD_OMV-RNAs |
OMV | SPI-2 Inducing | PCN (pH 5.8, low P) | High OD (0.8) | Mimics macrophage environment |
rpkm_path <- path_from_root("results_malabirade_2018", "misc", "rnaseq_counts-rpkm.tsv")
condition_levels <- metadata_enriched %>%
distinct(condition) %>%
arrange(condition) %>%
pull(condition)
rpkm_long <- read_tsv(rpkm_path, show_col_types = FALSE) %>%
pivot_longer(-smallBARNA_ID, names_to = "sample_genome", values_to = "rpkm") %>%
separate(sample_genome, into = c("sample_id", "genbank_acc"), sep = "\\|") %>%
left_join(metadata_enriched, by = c("sample_id", "genbank_acc")) %>%
mutate(
log2_rpkm = log2(rpkm + 1),
condition = factor(condition, levels = condition_levels),
# Set useful factor levels for split condition columns
growth_phase = factor(growth_phase, levels = c("LowOD", "HighOD")),
rna_sample_type = factor(rna_sample_type, levels = c("intracellularRNAs", "OMV-RNAs")),
growth_medium = factor(growth_medium, levels = c("ControlPCN", "SPI-1ind-LB", "SPI-2ind"))
)
if (anyNA(rpkm_long$condition)) {
missing_samples <- rpkm_long %>%
filter(is.na(condition)) %>%
distinct(sample_id) %>%
pull(sample_id)
stop("Missing conditions for samples: ", paste(missing_samples, collapse = ", "))
}
cat("RPKM Stats:")## RPKM Stats:
rpkm_long %>%
summarise(
min_log2 = min(log2_rpkm, na.rm = TRUE),
median_log2 = median(log2_rpkm, na.rm = TRUE),
max_log2 = max(log2_rpkm, na.rm = TRUE)
)We can prepare plots showing log(RPKM+1) on x-axis and growth_medium conditions on the y-axis. Each growth_medium condition is separated by two boxplots: Intraceullar-RNA and OMV-RNA distributions. Each point for the samples are colored by LowOD(Cyan), HighOD(red).
output_dir <- path_from_root("explore", "malabirade_2018", "malabirade_2018_metadata_output")
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}
# Plot reflecting split condition columns
srna_plot_factors <- ggplot(
rpkm_long,
aes(x = growth_medium, y = log2_rpkm, fill = rna_sample_type)
) +
geom_boxplot(outlier.shape = NA, position = position_dodge(width = 0.75)) +
geom_jitter(
aes(color = growth_phase),
alpha = 0.35,
size = 0.8,
position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.75)
) +
facet_wrap(~ smallBARNA_ID, scales = "free_y") +
labs(
x = "Growth medium / condition",
y = "log2(RPKM + 1)",
fill = "RNA source",
color = "Growth phase",
title = "sRNA profiles by growth medium, phase, and RNA source"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
srna_plot_factorsggsave(
filename = file.path(output_dir, "srna_log2rpkm_boxplots_by_factors.png"),
plot = srna_plot_factors,
width = 32,
height = 24,
units = "in",
dpi = 300
)Observation Visually, it seems most sRNA has different distributions in intracellular vs OMV. We may assume some intracellular sRNA may not be present in the OMVs but the plot suggests that all sRNA are found in both intracellular and OMVs.
rpkm_long %>%
group_by(smallBARNA_ID) %>%
summarise(
p_shapiro = shapiro.test(log2_rpkm)$p.value
) %>%
summarise(prop_normal = mean(p_shapiro > 0.05))Observation: prop_normal < 0.5. sRNAs are non-normal. So we should use non-parametric Wilcoxon/Kruskal-Wallis tests.
rna_sample_type is related to vesicle cargo packaging (OMV vs intracellular). We can look if these sRNAs are selectively packaged/exported to OMVs first. The RPKM data is non-normal/unbalanced/non-parametric so for rna_sample_type (=2 groups), we should perform Wilcoxon Rank-sum test to check significance.
We can also use the same test for Growth_Phase (=2 factors; LowOD represents Salmonella’s log growth phase and HighOD represents stationary phase.)
For growth_medium (=3 factors; ControlPCN, SPI-2ind, SPI-1ind-LB), we will use Kruskal-Wallis test.
# Summary maxima for annotation positions per sRNA
ymax_by_srna <- rpkm_long %>%
group_by(smallBARNA_ID) %>%
summarise(ymax = max(log2_rpkm, na.rm = TRUE), .groups = "drop")
# 1) Wilcoxon: OMV vs intracellular (rna_sample_type)
wilcox_type <- rpkm_long %>%
group_by(smallBARNA_ID) %>%
wilcox_test(log2_rpkm ~ rna_sample_type) %>%
adjust_pvalue(method = "BH") %>%
add_significance(p.col = "p.adj") %>%
mutate(group1 = "intracellularRNAs", group2 = "OMV-RNAs") %>%
left_join(ymax_by_srna, by = "smallBARNA_ID") %>%
mutate(y.position = ymax * 1.10)
# 2) Wilcoxon: HighOD vs LowOD (growth_phase)
wilcox_phase <- rpkm_long %>%
group_by(smallBARNA_ID) %>%
wilcox_test(log2_rpkm ~ growth_phase) %>%
adjust_pvalue(method = "BH") %>%
add_significance(p.col = "p.adj") %>%
mutate(group1 = "LowOD", group2 = "HighOD") %>%
left_join(ymax_by_srna, by = "smallBARNA_ID") %>%
mutate(y.position = ymax * 1.10)
# 3) Kruskal–Wallis across growth_medium (3 levels)
kw_global <- rpkm_long %>%
group_by(smallBARNA_ID) %>%
kruskal_test(log2_rpkm ~ growth_medium) %>%
adjust_pvalue(method = "BH") %>%
mutate(label = paste0("KW FDR=", signif(p.adj, 3))) %>%
left_join(ymax_by_srna, by = "smallBARNA_ID") %>%
mutate(x = 2, y = ymax * 1.05)
# Post-hoc Dunn tests for growth_medium, annotate only significant pairs
dunn_pairs <- rpkm_long %>%
group_by(smallBARNA_ID) %>%
dunn_test(log2_rpkm ~ growth_medium, p.adjust.method = "BH") %>%
add_significance(p.col = "p.adj") %>%
filter(!is.na(p.adj) & p.adj < 0.05) %>%
add_x_position(x = "growth_medium") %>%
add_y_position(data = rpkm_long, fun = "max", step.increase = 0.1)srna_plot_type_sig <- ggplot(rpkm_long, aes(x = rna_sample_type, y = log2_rpkm)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.15, alpha = 0.35, size = 0.8) +
facet_wrap(~ smallBARNA_ID, scales = "free_y") +
stat_pvalue_manual(
wilcox_type,
label = "p.adj.signif",
y.position = "y.position",
xmin = "group1",
xmax = "group2",
tip.length = 0.01,
bracket.size = 0.3,
size = 3
) +
labs(x = "RNA source", y = "log2(RPKM + 1)", title = "OMV vs intracellular (Wilcoxon, BH-FDR)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
srna_plot_type_sigggsave(
filename = file.path(output_dir, "srna_log2rpkm_by_type_sig.png"),
plot = srna_plot_type_sig,
width = 24,
height = 18,
units = "in",
dpi = 300
)Observation: Most of the sRNAs are significantly different in between intracellular vs OMV distribution.
wilcox_type %>%
dplyr::ungroup() %>%
dplyr::transmute(
smallBARNA_ID,
significant = !is.na(p.adj) & p.adj < 0.05
) %>%
dplyr::distinct(smallBARNA_ID, .keep_all = TRUE) %>%
dplyr::count(significant, name = "n_sRNAs") %>%
dplyr::mutate(prop = n_sRNAs / sum(n_sRNAs))Observation: 97 out of 109 sRNAs are significantly different in between intracellular vs OMV distribution. The result suggest differential distribution of sRNAs in intracellular vs OMVs (which aligns with the findings from Malabirade_2018 et al) which might be caused by selective/preferential packaging rather than passive leakage of sRNAs.
Do packaging pattern influenced by e.g. sRNA_length, GC Content, secondary structure, thermodynamics etc. cause differential abundance in intracellular vs OMVs?
Ghosal_2015 et al. suggests that most RNA isolated from OMVs has length less than 60 nt and with an enrichment in the 15-40nt size range especially for tRNA fragments and ncRNA.
We may want to check if shorter sRNAs are more likely to be significantly more abundant in OMVs vs intracellular distribution due to any selective packaging mechanism or extracellular stability or protein-mediated transfer etc.
or/and,
GC-rich or U-rich sequences are favored (stability/structure)
or/and,
Stable hairpins or specific terminal secondary structures are enriched and/or protect RNAs in OMVs. For example: Koeppen_2016 et al suggests sRNA52320 (P. aeruginosa methionine tRNA fragment (tRF)) was abundant in OMVs and Li et al. suggests tRFs have conserved and stable secondary structures. We may want to check if secondary structural features influences selective packaging of sRNAs.
or/and,
sRNA that can bind to RNA-binding proteins (RBP) are more likely to be selectively packaged. For example: Velez_2025 suggests that Hfq protein may have possible role in translocating RNA from the cytoplasm to periplasm and then to OMVs.
We can pull length from the data/positive-sRNA-mapped_20250701.csv and test Spearman between OMV enrichment effect (e.g., Wilcoxon) and sRNA length;
library(tidyverse)
library(rstatix)
library(stringr)
seq_path <- "/home/mrahman/srna_bevs/data/positive-sRNA-mapped_20250701.csv"
relevant_ids <- rpkm_long %>% distinct(smallBARNA_ID) %>% pull()
# 1) Read sequences, keep only relevant 109 sRNAs, clean and compute lengths
seq_raw <- readr::read_tsv(seq_path, show_col_types = FALSE)
srna_lengths <- seq_raw %>%
# Keep only needed columns with exact names
dplyr::select(smallBARNA_ID, Sequence_from_PMID) %>%
dplyr::filter(smallBARNA_ID %in% relevant_ids) %>%
dplyr::mutate(
sequence_clean = Sequence_from_PMID %>%
replace_na("") %>%
# Keep only IUPAC RNA/DNA letters; allow U or T
str_replace_all("[^ACGTUacgtu]", ""),
seq_len = nchar(sequence_clean)
) %>%
# If multiple rows per smallBARNA_ID, keep the longest sequence
dplyr::group_by(smallBARNA_ID) %>%
dplyr::slice_max(order_by = seq_len, n = 1, with_ties = FALSE) %>%
dplyr::ungroup()
# 2) Per‑sRNA OMV vs intracellular stats from expression data (rpkm_long)
# Rank-biserial effect size (OMV vs intracellular)
type_eff <- rpkm_long %>%
dplyr::group_by(smallBARNA_ID) %>%
rstatix::wilcox_effsize(log2_rpkm ~ rna_sample_type) %>%
tibble::as_tibble() %>%
dplyr::select(smallBARNA_ID, effsize) %>%
dplyr::rename(type_effsize = effsize)
# Wilcoxon p-values (BH-adjusted across sRNAs)
wilcox_type <- rpkm_long %>%
dplyr::group_by(smallBARNA_ID) %>%
rstatix::wilcox_test(log2_rpkm ~ rna_sample_type) %>%
rstatix::adjust_pvalue(method = "BH") %>%
rstatix::add_significance(p.col = "p.adj") %>%
tibble::as_tibble()
# Median OMV - intracellular difference (for direction/magnitude)
type_medians <- rpkm_long %>%
dplyr::group_by(smallBARNA_ID, rna_sample_type) %>%
dplyr::summarise(med = median(log2_rpkm, na.rm = TRUE), .groups = "drop") %>%
tidyr::pivot_wider(names_from = rna_sample_type, values_from = med) %>%
dplyr::mutate(delta_med_OMV_vs_intra = (`OMV-RNAs`) - intracellularRNAs)
# 3) Join lengths with stats for hypothesis testing
length_effect_df <- srna_lengths %>%
dplyr::select(smallBARNA_ID, seq_len) %>%
dplyr::left_join(type_eff, by = "smallBARNA_ID") %>%
dplyr::left_join(wilcox_type %>% dplyr::select(smallBARNA_ID, p, p.adj, p.adj.signif), by = "smallBARNA_ID") %>%
dplyr::left_join(type_medians %>% dplyr::select(smallBARNA_ID, delta_med_OMV_vs_intra), by = "smallBARNA_ID") %>%
dplyr::mutate(significant = !is.na(p.adj) & p.adj < 0.05)
# 4) Length hypothesis tests
# 4a) Spearman correlation: length vs rank-biserial effect size (OMV vs intracellular)
cor_len_eff <- length_effect_df %>%
tidyr::drop_na(seq_len, type_effsize) %>%
rstatix::cor_test(seq_len, type_effsize, method = "spearman")
# 4b) Spearman correlation: length vs median difference (OMV - intracellular)
cor_len_delta <- length_effect_df %>%
tidyr::drop_na(seq_len, delta_med_OMV_vs_intra) %>%
rstatix::cor_test(seq_len, delta_med_OMV_vs_intra, method = "spearman")
# 4c) Wilcoxon test: do significant sRNAs have different lengths?
len_sig_test <- length_effect_df %>%
dplyr::mutate(sig_flag = if_else(significant, "Significant", "Not Significant")) %>%
rstatix::wilcox_test(seq_len ~ sig_flag)
cat("Quick summaries and sanity checks:")## Quick summaries and sanity checks:
list(
n_relevant = length(relevant_ids),
n_with_lengths = nrow(srna_lengths),
n_missing_lengths = length(relevant_ids) - nrow(srna_lengths),
head_lengths = head(srna_lengths),
length_by_sig = length_effect_df %>%
dplyr::group_by(significant) %>%
dplyr::summarise(
n = dplyr::n(),
median_len = median(seq_len, na.rm = TRUE),
IQR_len = IQR(seq_len, na.rm = TRUE),
.groups = "drop"
),
cor_len_eff = cor_len_eff,
cor_len_delta = cor_len_delta,
len_sig_test = len_sig_test
)## $n_relevant
## [1] 109
##
## $n_with_lengths
## [1] 105
##
## $n_missing_lengths
## [1] 4
##
## $head_lengths
## # A tibble: 6 × 4
## smallBARNA_ID Sequence_from_PMID sequence_clean seq_len
## <chr> <chr> <chr> <int>
## 1 sRNA-Apleu-10 CTGCTCTACCGACTGAGCTAACGACCCTTTAGAAGGCGT… CTGCTCTACCGAC… 176
## 2 sRNA-Apleu-3 ATAAATTCGCAAAAAGTGCTTGCATTGGTTTTGGAAATC… ATAAATTCGCAAA… 200
## 3 sRNA-Apleu-9 AAAGTTCAAGAATTGCAAAGAATTGACAAGTTAGGTTAT… AAAGTTCAAGAAT… 278
## 4 sRNA-Ecoli-103 GGGGGCTCTGTTGGTTCTCCCGCAACGCTACTCTGTTTA… GGGGGCTCTGTTG… 114
## 5 sRNA-Ecoli-11 CCATGGAACAGCAGTAATTTATAAATAAGCATTAACCCT… CCATGGAACAGCA… 100
## 6 sRNA-Ecoli-13 ACACCGTCGCTTAAAGTGACGGCATAATAATAAAAAAAT… ACACCGTCGCTTA… 84
##
## $length_by_sig
## # A tibble: 2 × 4
## significant n median_len IQR_len
## <lgl> <int> <dbl> <dbl>
## 1 FALSE 11 144 132
## 2 TRUE 94 111 93.2
##
## $cor_len_eff
## # A tibble: 1 × 6
## var1 var2 cor statistic p method
## <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 seq_len type_effsize -0.016 196094. 0.868 Spearman
##
## $cor_len_delta
## # A tibble: 1 × 6
## var1 var2 cor statistic p method
## <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 seq_len delta_med_OMV_vs_intra -0.046 201717. 0.644 Spearman
##
## $len_sig_test
## # A tibble: 1 × 7
## .y. group1 group2 n1 n2 statistic p
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 seq_len Not Significant Significant 11 94 640. 0.202
Observation: 4 out of 109 sRNA sequences are not found. New final candidate: 105. Spearman stats (p=0.644) are not significant.
library(tidyverse)
library(ggpubr)
library(rstatix)
if (!"delta_med_OMV_vs_intra" %in% names(length_effect_df)) {
stopifnot(exists("rpkm_long"))
type_medians <- rpkm_long %>%
group_by(smallBARNA_ID, rna_sample_type) %>%
summarise(med = median(log2_rpkm, na.rm = TRUE), .groups = "drop") %>%
tidyr::pivot_wider(names_from = rna_sample_type, values_from = med) %>%
mutate(delta_med_OMV_vs_intra = (`OMV-RNAs`) - intracellularRNAs) %>%
select(smallBARNA_ID, delta_med_OMV_vs_intra)
length_effect_df <- length_effect_df %>%
left_join(type_medians, by = "smallBARNA_ID")
}
# Correlation labels
corr_eff <- length_effect_df %>%
drop_na(seq_len, type_effsize) %>%
cor_test(seq_len, type_effsize, method = "spearman")
corr_delta <- length_effect_df %>%
drop_na(seq_len, delta_med_OMV_vs_intra) %>%
cor_test(seq_len, delta_med_OMV_vs_intra, method = "spearman")
eff_lab <- paste0("Spearman rho=", signif(corr_eff$cor, 2),
", p=", signif(corr_eff$p, 2))
delta_lab <- paste0("Spearman rho=", signif(corr_delta$cor, 2),
", p=", signif(corr_delta$p, 2))
# Helper for annotation positions
x_max <- max(length_effect_df$seq_len, na.rm = TRUE)
y1_max <- max(length_effect_df$type_effsize, na.rm = TRUE)
y2_max <- max(length_effect_df$delta_med_OMV_vs_intra, na.rm = TRUE)
# Plot 1: Length vs OMV enrichment effect size
p1 <- ggplot(length_effect_df, aes(x = seq_len, y = type_effsize, color = significant)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "loess", se = TRUE, color = "grey30") +
annotate("text", x = 0.98 * x_max, y = 0.95 * y1_max, hjust = 1, vjust = 1,
label = eff_lab, size = 4) +
scale_color_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "#0072B2")) +
labs(x = "sRNA length (nt)", y = "OMV vs intracellular effect size (r)",
color = "Significant (BH<0.05)",
title = "Length vs OMV enrichment effect size") +
theme_bw()
# Plot 2: Length vs median difference (OMV − intracellular)
p2 <- ggplot(length_effect_df, aes(x = seq_len, y = delta_med_OMV_vs_intra, color = significant)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "loess", se = TRUE, color = "grey30") +
annotate("text", x = 0.98 * x_max, y = 0.95 * y2_max, hjust = 1, vjust = 1,
label = delta_lab, size = 4) +
scale_color_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "#0072B2")) +
labs(x = "sRNA length (nt)", y = "Δ median log2(RPKM+1) (OMV − intra)",
color = "Significant (BH<0.05)",
title = "Length vs OMV − intracellular median difference") +
theme_bw()
# Plot 3: Length distributions by significance
p3 <- ggplot(length_effect_df, aes(x = significant, y = seq_len, fill = significant)) +
geom_violin(alpha = 0.6, color = NA) +
geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.8) +
stat_compare_means(method = "wilcox.test", label = "p.signif") +
scale_fill_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "#0072B2")) +
scale_x_discrete(labels = c("FALSE" = "Not significant", "TRUE" = "Significant")) +
labs(x = NULL, y = "sRNA length (nt)",
title = "Length distribution by significance (OMV vs intracellular)") +
theme_bw() +
theme(legend.position = "none")
# Arrange and save
fig_len <- ggpubr::ggarrange(p1, p2, p3, ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom")
if (!exists("output_dir")) output_dir <- "explore/malabirade_2018/malabirade_2018_metadata_output"
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
ggsave(file.path(output_dir, "srna_length_effects_summary.png"),
fig_len, width = 14, height = 10, dpi = 300)
fig_lenObservation: There is no detectable relationship between sRNA length and OMV enrichment. Short and long sRNAs are exported to OMVs with similar likelihood.
# suppressPackageStartupMessages({
# library(tidyverse)
# library(stringr)
# library(Biostrings)
# library(rstatix)
# })
#
# # Clean sequence: strip whitespace, non-letters, uppercase, DNA->RNA, keep IUPAC RNA (N, R, Y, S, W, K, M, B, D, H, V)
# clean_srna_sequence <- function(seq_raw) {
# if (is.na(seq_raw)) return(NA_character_)
# seq <- seq_raw %>%
# str_replace_all("\\s+", "") %>%
# str_replace_all("[^A-Za-z]", "") %>%
# toupper() %>%
# str_replace_all("T", "U") %>%
# str_replace_all("[^ACGUNRYSWKMBDHV]", "")
# if (nchar(seq) < 5) return(NA_character_)
# return(seq)
# }
#
# # Inputs
# seq_path <- "/home/mrahman/srna_bevs/data/positive-sRNA-mapped_20250701.csv"
# seq_raw <- readr::read_tsv(seq_path, show_col_types = FALSE)
# sRNA_ids <- rpkm_long %>% distinct(smallBARNA_ID) %>% pull()
#
# # Deduplicate to one sequence per sRNA (longest cleaned)
# srna_seqs <- seq_raw %>%
# filter(smallBARNA_ID %in% sRNA_ids) %>%
# mutate(sequence_clean = purrr::map_chr(Sequence_from_PMID, clean_srna_sequence)) %>%
# group_by(smallBARNA_ID) %>%
# slice_max(order_by = nchar(sequence_clean), n = 1, with_ties = FALSE) %>%
# ungroup() %>%
# mutate(seq_len = nchar(sequence_clean))
#
# foldable <- srna_seqs %>% filter(!is.na(sequence_clean), seq_len >= 5)
#
# # Write FASTA and fold
# rnafold_bin <- Sys.which("RNAfold")
#
# fa_path <- tempfile(fileext = ".fa")
# fasta_lines <- foldable %>% transmute(txt = paste0(">", smallBARNA_ID, "\n", sequence_clean)) %>% pull()
# writeLines(fasta_lines, con = fa_path)
#
# fold_output <- system2(
# rnafold_bin,
# args = c("--noPS", fa_path),
# stdout = TRUE,
# stderr = TRUE
# )
#
# # Parse RNAfold text output
# parse_rnafold_output <- function(lines) {
# tibble(raw = lines) %>%
# mutate(is_header = str_starts(raw, ">"),
# block = cumsum(is_header)) %>%
# group_by(block) %>%
# summarise(lines = list(raw), .groups = "drop") %>%
# mutate(
# id = str_remove(lines[[1]][1], "^>"),
# struct_line = ifelse(length(lines[[1]]) >= 3, lines[[1]][3], NA_character_),
# mfe = struct_line %>%
# str_extract("\\([^()]*-?\\d+\\.?\\d*[^()]*\\)") %>%
# str_extract("-?\\d+\\.?\\d*") %>%
# as.numeric()
# ) %>%
# transmute(smallBARNA_ID = id, structure = struct_line, mfe = mfe)
# }
#
# fold_df <- if (nrow(foldable) > 0) parse_rnafold_output(fold_output) else
# tibble(smallBARNA_ID = character(), structure = character(), mfe = numeric())
#
# # GC with Biostrings (robust to ambiguous IUPAC)
# gc_tbl <- foldable %>%
# transmute(smallBARNA_ID,
# sequence_clean,
# seq_len) %>%
# mutate(
# gc_frac = {
# rset <- RNAStringSet(sequence_clean)
# # Count only A/C/G/U letters in denominator
# counts <- letterFrequency(rset, letters = c("A","C","G","U"), as.prob = FALSE)
# num_gc <- counts[, "C"] + counts[, "G"]
# den_acgu <- rowSums(counts)
# ifelse(den_acgu > 0, num_gc / den_acgu, NA_real_)
# }
# )
#
# # Combine sequence features and folding metrics
# final_srna_features <- gc_tbl %>%
# left_join(fold_df, by = "smallBARNA_ID") %>%
# mutate(
# paired_count = ifelse(!is.na(structure), str_count(structure, "\\("), NA_integer_),
# paired_frac = ifelse(seq_len > 0, paired_count / seq_len, NA_real_),
# mfe_per_nt = ifelse(seq_len > 0, mfe / seq_len, NA_real_)
# )
#
# # Full table aligned to all sRNAs in srna_seqs (including those not foldable)
# final_out <- srna_seqs %>%
# select(smallBARNA_ID, sequence_clean, seq_len) %>%
# left_join(final_srna_features, by = "smallBARNA_ID")
#
# # Effect size and BH-FDR from your Wilcoxon OMV vs intracellular test
# type_eff <- rpkm_long %>%
# group_by(smallBARNA_ID) %>%
# rstatix::wilcox_effsize(log2_rpkm ~ rna_sample_type) %>%
# as_tibble() %>%
# select(smallBARNA_ID, effsize) %>%
# dplyr::rename(type_effsize = effsize)
#
# wilcox_type <- rpkm_long %>%
# group_by(smallBARNA_ID) %>%
# rstatix::wilcox_test(log2_rpkm ~ rna_sample_type) %>%
# rstatix::adjust_pvalue(method = "BH") %>%
# rstatix::add_significance(p.col = "p.adj") %>%
# as_tibble() %>%
# select(smallBARNA_ID, p, p.adj, p.adj.signif)
#
# type_medians <- rpkm_long %>%
# group_by(smallBARNA_ID, rna_sample_type) %>%
# summarise(med = median(log2_rpkm, na.rm = TRUE), .groups = "drop") %>%
# tidyr::pivot_wider(names_from = rna_sample_type, values_from = med) %>%
# mutate(delta_med_OMV_vs_intra = (`OMV-RNAs`) - intracellularRNAs) %>%
# select(smallBARNA_ID, delta_med_OMV_vs_intra)
#
# feat_df <- final_out %>%
# left_join(type_eff, by = "smallBARNA_ID") %>%
# left_join(wilcox_type, by = "smallBARNA_ID") %>%
# left_join(type_medians, by = "smallBARNA_ID") %>%
# mutate(significant = !is.na(p.adj) & p.adj < 0.05)
#
# # Return feature tables
# list(
# final_srna_features = final_srna_features, # only those successfully folded
# final_out = final_out, # all with sequence, GC, and folding fields where available
# feat_df = feat_df # merged with OMV enrichment stats (optional downstream analyses)
# )# library(tidyverse)
# library(stringr)
#
# # 0) Ensure feat_df has required columns; pull from srna_seqs/final_out/final_srna_features if needed
# needed_seq <- c("sequence_clean","seq_len")
# if (!all(needed_seq %in% names(feat_df))) {
# if (exists("srna_seqs")) {
# feat_df <- feat_df %>%
# left_join(srna_seqs %>% select(smallBARNA_ID, sequence_clean, seq_len), by = "smallBARNA_ID")
# } else if (exists("final_out")) {
# feat_df <- feat_df %>%
# left_join(final_out %>% select(smallBARNA_ID, sequence_clean, seq_len), by = "smallBARNA_ID")
# } else {
# stop("sequence_clean/seq_len not found and srna_seqs/final_out not available. Run the sequence-cleaning step first.")
# }
# }
#
# needed_fold <- c("structure","mfe","paired_frac","mfe_per_nt")
# if (!any(needed_fold %in% names(feat_df)) && exists("final_srna_features")) {
# feat_df <- feat_df %>%
# left_join(final_srna_features %>%
# select(smallBARNA_ID, structure, mfe, paired_frac, mfe_per_nt),
# by = "smallBARNA_ID")
# }
#
# # 1) Optional abundance covariate
# expr_summary <- rpkm_long %>%
# group_by(smallBARNA_ID) %>%
# summarise(expr_med = median(log2_rpkm, na.rm = TRUE), .groups = "drop")
#
# # 2) Recompute extended features (now that sequence_clean exists)
# features_ex <- feat_df %>%
# left_join(expr_summary, by = "smallBARNA_ID") %>%
# mutate(
# # U fraction among A/C/G/U
# u_count = if_else(!is.na(sequence_clean), str_count(sequence_clean, "U"), NA_integer_),
# acgu_count = if_else(!is.na(sequence_clean), str_count(sequence_clean, "[ACGU]"), NA_integer_),
# u_frac = if_else(acgu_count > 0, u_count / acgu_count, NA_real_),
#
# # U-rich kmers and densities
# uuu_count = if_else(!is.na(sequence_clean), str_count(sequence_clean, "UUU"), NA_integer_),
# uuuu_count = if_else(!is.na(sequence_clean), str_count(sequence_clean, "UUUU"), NA_integer_),
# uuu_density = if_else(seq_len > 0, uuu_count / pmax(seq_len, 1), NA_real_),
# uuuu_density = if_else(seq_len > 0, uuuu_count / pmax(seq_len, 1), NA_real_),
#
# # 3' U-run length (terminator proxy)
# urun3p_len = if_else(!is.na(sequence_clean),
# nchar(coalesce(str_extract(sequence_clean, "U+$"), "")),
# NA_integer_),
# urun3p_flag = !is.na(urun3p_len) & urun3p_len >= 4,
#
# # 3' structure proxy: paired fraction in last 15 nts (requires RNAfold structure)
# struct_tail = if_else(!is.na(structure),
# substr(structure, pmax(nchar(structure) - 14, 1), nchar(structure)),
# NA_character_),
# paired_tail = if_else(!is.na(struct_tail), str_count(struct_tail, "\\("), NA_integer_),
# tail_len = if_else(!is.na(struct_tail), nchar(struct_tail), NA_integer_),
# paired_frac_3p = if_else(tail_len > 0, paired_tail / tail_len, NA_real_)
# )
#
# # features_ex now has sequence-based metrics and will work with the downstream tests
# features_ex %>% select(smallBARNA_ID, seq_len, u_frac, uuu_density, uuuu_density, urun3p_len, paired_frac_3p) %>% head()# # Visualize GC/U-richness and structure vs OMV enrichment significance
# suppressPackageStartupMessages({
# library(tidyverse)
# library(ggpubr)
# library(rstatix)
# })
#
# # Expect features_ex from the previous step; if missing, stop to avoid confusion
# if (!exists("features_ex")) stop("features_ex not found. Run the features/extraction block first.")
#
# # Ensure output dir
# if (!exists("output_dir")) {
# output_dir <- "explore/malabirade_2018/malabirade_2018_metadata_output"
# }
# dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
#
# # ---------------------------
# # 1) Scatter panels: continuous features vs effect size (Spearman + BH-FDR)
# # ---------------------------
# scatter_feats <- c(
# gc_frac = "GC fraction",
# u_frac = "U fraction",
# mfe_per_nt = "MFE per nt",
# paired_frac = "Paired fraction",
# paired_frac_3p = "3' paired fraction",
# urun3p_len = "3' U-run length"
# )
#
# scatter_df <- features_ex %>%
# select(smallBARNA_ID, significant, type_effsize, all_of(names(scatter_feats))) %>%
# pivot_longer(cols = all_of(names(scatter_feats)), names_to = "feature", values_to = "value") %>%
# mutate(feature_label = scatter_feats[feature])
#
# # Spearman per feature with BH across features
# corr_labels <- scatter_df %>%
# group_by(feature, feature_label) %>%
# summarise(
# n = sum(!is.na(value) & !is.na(type_effsize)),
# rho = ifelse(n >= 3, suppressWarnings(cor(value, type_effsize, method = "spearman", use = "pairwise.complete.obs")), NA_real_),
# p = ifelse(n >= 3, suppressWarnings(cor.test(value, type_effsize, method = "spearman"))$p.value, NA_real_),
# .groups = "drop"
# ) %>%
# mutate(p_adj = p.adjust(p, method = "BH"),
# lab = paste0("rho=", signif(rho, 2), ", FDR=", signif(p_adj, 2)))
#
# # Axis maxima for annotation placement
# scat_max <- scatter_df %>%
# group_by(feature) %>%
# summarise(xmax = max(value, na.rm = TRUE), ymax = max(type_effsize, na.rm = TRUE), .groups = "drop") %>%
# left_join(corr_labels, by = "feature")
#
# p_scatter <- ggplot(scatter_df, aes(x = value, y = type_effsize, color = significant)) +
# geom_point(alpha = 0.7) +
# geom_smooth(method = "loess", se = TRUE, color = "grey40") +
# geom_text(
# data = scat_max,
# aes(x = 0.98 * xmax, y = 0.95 * ymax, label = lab),
# inherit.aes = FALSE, hjust = 1, vjust = 1, size = 3.2
# ) +
# scale_color_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "#0072B2")) +
# facet_wrap(~ feature_label, scales = "free_x") +
# labs(x = "Feature value", y = "OMV vs intracellular effect size (r)",
# color = "Significant (BH<0.05)",
# title = "Continuous features vs OMV enrichment (Spearman + BH-FDR)") +
# theme_bw()
#
# ggsave(file.path(output_dir, "srna_features_vs_effect_scatter.png"),
# p_scatter, width = 12, height = 8, dpi = 300)
#
# # ---------------------------
# # 2) Violin/box: feature distributions by significance (Wilcoxon + BH-FDR)
# # ---------------------------
# viol_feats <- c(
# u_frac = "U fraction",
# gc_frac = "GC fraction",
# uuu_density = "UUU density",
# uuuu_density = "UUUU density",
# mfe_per_nt = "MFE per nt",
# paired_frac = "Paired fraction",
# paired_frac_3p = "3' paired fraction",
# urun3p_len = "3' U-run length"
# )
#
# viol_df <- features_ex %>%
# select(significant, all_of(names(viol_feats))) %>%
# pivot_longer(cols = all_of(names(viol_feats)), names_to = "feature", values_to = "value") %>%
# mutate(feature_label = viol_feats[feature])
#
# # Wilcoxon per feature, BH across features
# grp_tests <- viol_df %>%
# group_by(feature, feature_label) %>%
# summarise(
# p = tryCatch({
# d <- drop_na(cur_data(), value, significant)
# if (nrow(d) >= 4 && n_distinct(d$significant) == 2) {
# wilcox.test(value ~ significant, data = d)$p.value
# } else NA_real_
# }, error = function(e) NA_real_),
# ymax = suppressWarnings(max(value, na.rm = TRUE)),
# .groups = "drop"
# ) %>%
# mutate(p_adj = p.adjust(p, method = "BH"),
# stars = case_when(
# is.na(p_adj) ~ "n/a",
# p_adj < 0.001 ~ "***",
# p_adj < 0.01 ~ "**",
# p_adj < 0.05 ~ "*",
# TRUE ~ "ns"
# ),
# lab = paste0("FDR=", signif(p_adj, 2), " (", stars, ")"),
# y = ymax * 1.05)
#
# p_violin <- ggplot(viol_df, aes(x = significant, y = value, fill = significant)) +
# geom_violin(alpha = 0.6, color = NA) +
# geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.85) +
# geom_text(
# data = grp_tests,
# aes(x = 1.5, y = y, label = lab),
# inherit.aes = FALSE,
# size = 3.2
# ) +
# scale_fill_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "#0072B2")) +
# scale_x_discrete(labels = c(`FALSE` = "Not significant", `TRUE` = "Significant")) +
# facet_wrap(~ feature_label, scales = "free_y") +
# labs(x = NULL, y = "Feature value",
# title = "Feature distributions by OMV-enrichment significance (Wilcoxon + BH-FDR)") +
# theme_bw() +
# theme(legend.position = "none")
#
# ggsave(file.path(output_dir, "srna_features_by_significance_violin.png"),
# p_violin, width = 12, height = 8, dpi = 300)
#
# # ---------------------------
# # 3) Motif enrichment bars: UUU and UUUU presence by significance
# # ---------------------------
# motif_tbl <- features_ex %>%
# transmute(
# significant,
# has_UUU = !is.na(uuu_count) & uuu_count > 0,
# has_UUUU = !is.na(uuuu_count) & uuuu_count > 0
# ) %>%
# pivot_longer(cols = c(has_UUU, has_UUUU), names_to = "motif", values_to = "present") %>%
# mutate(motif = recode(motif, has_UUU = "UUU", has_UUUU = "UUUU"))
#
# motif_prop <- motif_tbl %>%
# group_by(motif, significant) %>%
# summarise(n = n(), k = sum(present, na.rm = TRUE), prop = k / n, .groups = "drop")
#
# p_motif <- ggplot(motif_prop, aes(x = significant, y = prop, fill = significant)) +
# geom_col(alpha = 0.85) +
# geom_text(aes(label = scales::percent(prop, accuracy = 0.1)),
# vjust = -0.3, size = 3.2) +
# scale_fill_manual(values = c(`TRUE` = "#D55E00", `FALSE` = "#0072B2")) +
# scale_x_discrete(labels = c(`FALSE` = "Not significant", `TRUE` = "Significant")) +
# facet_wrap(~ motif) +
# labs(x = NULL, y = "Proportion with motif",
# title = "U-rich motif presence by OMV-enrichment significance") +
# theme_bw() +
# theme(legend.position = "none")
#
# ggsave(file.path(output_dir, "srna_motif_enrichment_bars.png"),
# p_motif, width = 8, height = 4, dpi = 300)srna_plot_phase_sig <- ggplot(rpkm_long, aes(x = growth_phase, y = log2_rpkm)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.15, alpha = 0.35, size = 0.8) +
facet_wrap(~ smallBARNA_ID, scales = "free_y") +
stat_pvalue_manual(
wilcox_phase,
label = "p.adj.signif",
y.position = "y.position",
xmin = "group1",
xmax = "group2",
tip.length = 0.01,
bracket.size = 0.3,
size = 3
) +
labs(x = "Growth phase", y = "log2(RPKM + 1)", title = "HighOD vs LowOD (Wilcoxon, BH-FDR)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
srna_plot_phase_sigggsave(
filename = file.path(output_dir, "srna_log2rpkm_by_phase_sig.png"),
plot = srna_plot_phase_sig,
width = 24,
height = 18,
units = "in",
dpi = 300
)Observation: sRNA selective packaging is NOT significantly influenced by growth phases (log vs stationary phase).
# Plot C: by growth medium with KW global + Dunn post-hoc annotations
srna_plot_medium_sig <- ggplot(rpkm_long, aes(x = growth_medium, y = log2_rpkm)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.15, alpha = 0.35, size = 0.8) +
facet_wrap(~ smallBARNA_ID, scales = "free_y") +
# Dunn pairwise (significant pairs only)
stat_pvalue_manual(
dunn_pairs,
label = "p.adj.signif",
y.position = "y.position",
xmin = "xmin",
xmax = "xmax",
tip.length = 0.01,
bracket.size = 0.3,
size = 3
) +
# label
geom_text(
data = kw_global,
aes(x = x, y = y, label = label),
inherit.aes = FALSE,
size = 3
) +
labs(x = "Growth medium", y = "log2(RPKM + 1)", title = "Growth medium (Kruskal–Wallis + Dunn post-hoc)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
srna_plot_medium_sigggsave(
filename = file.path(output_dir, "srna_log2rpkm_by_medium_sig.png"),
plot = srna_plot_medium_sig,
width = 24,
height = 18,
units = "in",
dpi = 300
)Observation: Most sRNA are NOT significant (>0.05). 16 of them are significant.
List of sRNAs that are significant:
sig_kw <- kw_global %>%
filter(p.adj < 0.05) %>% # significant
arrange(p.adj) %>%
select(smallBARNA_ID, kw_padj = p.adj, kw_label = label)
sig_dunn <- dunn_pairs %>%
filter(smallBARNA_ID %in% sig_kw$smallBARNA_ID) %>%
filter(p.adj < 0.05) %>% # only significant comparisons
select(smallBARNA_ID, group1, group2, dunn_padj = p.adj, dunn_sig = p.adj.signif)
sig_medium_table <- sig_kw %>%
left_join(sig_dunn, by = "smallBARNA_ID") %>%
arrange(kw_padj, smallBARNA_ID)
print(sig_medium_table, n = Inf)## # A tibble: 16 × 7
## smallBARNA_ID kw_padj kw_label group1 group2 dunn_padj dunn_sig
## <chr> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 sRNA-Sente-1 0.0290 KW FDR=0.029 ControlPCN SPI-1ind-… 0.00443 **
## 2 sRNA-Sente-1 0.0290 KW FDR=0.029 SPI-1ind-LB SPI-2ind 0.000364 ***
## 3 sRNA-Sente-44 0.0290 KW FDR=0.029 ControlPCN SPI-1ind-… 0.00443 **
## 4 sRNA-Sente-44 0.0290 KW FDR=0.029 SPI-1ind-LB SPI-2ind 0.000364 ***
## 5 sRNA-Ecoli-15 0.0316 KW FDR=0.0316 ControlPCN SPI-1ind-… 0.00175 **
## 6 sRNA-Ecoli-15 0.0316 KW FDR=0.0316 SPI-1ind-LB SPI-2ind 0.00107 **
## 7 sRNA-Sente-36 0.0376 KW FDR=0.0376 ControlPCN SPI-1ind-… 0.00148 **
## 8 sRNA-Sente-36 0.0376 KW FDR=0.0376 SPI-1ind-LB SPI-2ind 0.00148 **
## 9 sRNA-Ecoli-61 0.0461 KW FDR=0.0461 ControlPCN SPI-1ind-… 0.00309 **
## 10 sRNA-Ecoli-61 0.0461 KW FDR=0.0461 SPI-1ind-LB SPI-2ind 0.00589 **
## 11 sRNA-Sente-52 0.0461 KW FDR=0.0461 ControlPCN SPI-1ind-… 0.00211 **
## 12 sRNA-Sente-52 0.0461 KW FDR=0.0461 SPI-1ind-LB SPI-2ind 0.00638 **
## 13 sRNA-Sflex-21 0.0461 KW FDR=0.0461 ControlPCN SPI-1ind-… 0.00309 **
## 14 sRNA-Sflex-21 0.0461 KW FDR=0.0461 SPI-1ind-LB SPI-2ind 0.00589 **
## 15 sRNA-Ecoli-13 0.0484 KW FDR=0.0484 ControlPCN SPI-1ind-… 0.00360 **
## 16 sRNA-Ecoli-13 0.0484 KW FDR=0.0484 ControlPCN SPI-2ind 0.0492 *