Combined Analysis Summary

1. Sample Counts by Dataset and Group

A quick list of the available groups in the datasets.

plot_list_counts <- list()

for (dname in names(meta_list)) {
  meta <- meta_list[[dname]]
  gcol <- group_cols[[dname]]
  
  # For Malabirade_2018, show panels for each of the new grouping columns
  if (dname == "Malabirade_2018" && !is.null(meta)) {
    mala_groups <- c("growth_medium", "growth_phase", "rna_sample_type")
    for (mg in mala_groups) {
      if (mg %in% names(meta)) {
        p <- ggplot(meta, aes(x = .data[[mg]])) +
          geom_bar(fill = "#4C78A8") +
          labs(x = NULL, y = "Count", title = paste("Malabirade_2018:", mg)) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 10))
        plot_list_counts[[paste0(dname, "_", mg)]] <- p
      }
    }
    next
  }
  
  if (!is.null(meta) && gcol %in% names(meta)) {
    p <- ggplot(meta, aes(x = .data[[gcol]])) +
      geom_bar(fill = "#4C78A8") +
      labs(x = NULL, y = "Count", title = dname) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 10))
    plot_list_counts[[dname]] <- p
  }
}

ggarrange(plotlist = plot_list_counts, ncol = 3, nrow = 3)

On Malabirade_2018, the SPI-1ind-LB was only sequenced in HighOD condition thus the plots lack LowOD samples.

2. Read Length Distributions

We obtained read lengths directly from the per-sample fastp QC JSON files. The summary statistics is after filtering (type == “after_filtering”), where the length column corresponds to the mean read length reported by fastp.

plot_list_lengths <- list()

for (dname in names(meta_list)) {
  meta <- meta_list[[dname]]
  gcol <- group_cols[[dname]]
  
  # For Malabirade_2018, show separate panels per new grouping column
  if (dname == "Malabirade_2018" && !is.null(meta)) {
    mala_groups <- c("growth_medium", "growth_phase", "rna_sample_type")
    for (mg in mala_groups) {
      if (!mg %in% names(meta)) next
      lengths_df <- fastp_lengths(meta, mg)
      if (nrow(lengths_df) == 0) next
      
      if ("type" %in% names(lengths_df) && any(lengths_df$type == "distribution")) {
        p <- lengths_df %>%
          filter(type == "distribution") %>%
          ggplot(aes(x = length, y = count, color = condition)) +
          geom_line(alpha = 0.7) +
          scale_y_continuous(labels = scales::comma) +
          labs(x = "Length (bp)", y = "Count", title = paste("Malabirade_2018:", mg), color = mg) +
          theme(plot.title = element_text(size = 10))
      } else {
        len_summ <- lengths_df %>%
          filter(type == "after_filtering") %>%
          group_by(sample_id, condition) %>%
          summarise(mean_length = first(length), .groups = "drop")
        
        p <- ggplot(len_summ, aes(x = condition, y = mean_length)) +
          geom_boxplot(alpha = 0.7, fill = "grey90") +
          geom_jitter(width = 0.2, alpha = 0.6, color = "#2ca02c") +
          labs(x = NULL, y = "Mean Length (bp)", title = paste("Malabirade_2018:", mg)) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 10))
      }
      plot_list_lengths[[paste0(dname, "_", mg)]] <- p
    }
    next
  }
  
  if (!is.null(meta)) {
    lengths_df <- fastp_lengths(meta, gcol)
    
    if (nrow(lengths_df) > 0) {
      if ("type" %in% names(lengths_df) && any(lengths_df$type == "distribution")) {
        # Distribution plot
        p <- lengths_df %>%
          filter(type == "distribution") %>%
          ggplot(aes(x = length, y = count, color = condition)) +
          geom_line(alpha = 0.7) +
          scale_y_continuous(labels = scales::comma) +
          labs(x = "Length (bp)", y = "Count", title = dname, color = "Group") +
          theme(plot.title = element_text(size = 10))
      } else {
        # Mean length plot
        len_summ <- lengths_df %>%
          filter(type == "after_filtering") %>%
          group_by(sample_id, condition) %>%
          summarise(mean_length = first(length), .groups = "drop")
        
        p <- ggplot(len_summ, aes(x = condition, y = mean_length)) +
          geom_boxplot(alpha = 0.7, fill = "grey90") +
          geom_jitter(width = 0.2, alpha = 0.6, color = "#2ca02c") +
          labs(x = NULL, y = "Mean Length (bp)", title = dname) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(size = 10))
      }
      plot_list_lengths[[dname]] <- p
    }
  }
}

ggarrange(plotlist = plot_list_lengths, ncol = 3, nrow = 2)

## $`1`

## 
## $`2`

## 
## attr(,"class")
## [1] "list"      "ggarrange"

3. sRNA Filtering Summary (>1 RPKM in ≥60% samples)

We filtered sRNAs by requiring RPKM ≥ 1 in at least 60% of samples (frac_gt1 >= 0.6) within any biological group before including them in downstream analyses.

filter_summary_list <- list()
pass_ids_list <- list()

for (dname in names(meta_list)) {
  meta <- meta_list[[dname]]
  gcol <- group_cols[[dname]]
  
  # For Malabirade_2018, produce summaries for each grouping column
  if (dname == "Malabirade_2018" && !is.null(meta)) {
    counts <- load_counts_long(meta)
    mala_groups <- c("growth_medium", "growth_phase", "rna_sample_type")
    for (mg in mala_groups) {
      if (!mg %in% names(meta)) next
      res <- filter_srna(counts, meta, mg)
      pass_ids_list[[paste0(dname, "_", mg)]] <- res$pass_ids
      
      if (length(res$pass_ids) > 0) {
        summ <- res$per_group %>%
          group_by(group) %>%
          summarise(n_srna_pass = n_distinct(srna_id), .groups = "drop") %>%
          mutate(dataset = paste(dname, mg, sep = "_"))
        filter_summary_list[[paste0(dname, "_", mg)]] <- summ
      }
    }
    next
  }
  
  if (!is.null(meta)) {
    counts <- load_counts_long(meta)
    res <- filter_srna(counts, meta, gcol)
    
    pass_ids_list[[dname]] <- res$pass_ids
    
    if (length(res$pass_ids) > 0) {
      summ <- res$per_group %>%
        group_by(group) %>%
        summarise(n_srna_pass = n_distinct(srna_id), .groups = "drop") %>%
        mutate(dataset = dname)
      filter_summary_list[[dname]] <- summ
    }
  }
}

all_filter_summary <- bind_rows(filter_summary_list)

ggplot(all_filter_summary, aes(x = group, y = n_srna_pass, fill = dataset)) +
  geom_col() +
  facet_wrap(~dataset, scales = "free_x", ncol = 3) +
  labs(x = "Group", y = "Number of passing sRNAs", title = "sRNAs with stable expression") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

4. Scatterplots: Enrichment Analysis

Fan_2023:

Fan_2023 had to be dropped off from the analysis as the pipeline didnt detected any sRNAs The methodology mentioned in the paper states that ” mRNA was purified from total RNA using poly(T) oligonucleotide-attached magnetic beads” which would only retain RNA sequences with poly-A tail (eukaryotic/human mRNAs).

Blenkiron_2016: Small vs Large Fraction

blen_data <- NULL
if ("Blenkiron_2016" %in% names(meta_list)) {
  meta   <- meta_list$Blenkiron_2016
  counts <- load_counts_long(meta)
  
  # Relaxed filter: keep sRNAs with mean RPKM > 0.6 in ANY library size
  sRNA_keep <- counts %>%
    left_join(meta %>% select(sample_id, library_size), by = "sample_id") %>%
    filter(library_size %in% c("small", "medium", "large")) %>%
    group_by(srna_id, library_size) %>%
    summarise(mean_expr = mean(RPKM, na.rm = TRUE), .groups = "drop") %>%
    group_by(srna_id) %>%
    filter(any(mean_expr > 0.6)) %>%
    pull(srna_id)
  
  # Median per library size (small / medium / large)
  med <- counts %>%
    filter(srna_id %in% sRNA_keep) %>%
    left_join(meta %>% select(sample_id, library_size), by = "sample_id") %>%
    filter(library_size %in% c("small", "medium", "large")) %>%
    group_by(srna_id, library_size) %>%
    summarise(median_RPKM = median(RPKM, na.rm = TRUE), .groups = "drop") %>%
    pivot_wider(names_from = library_size, values_from = median_RPKM) %>%
    mutate(across(where(is.numeric), ~ replace_na(., 0.01)))  # avoid zeros for log scales
  
  blen_data <- med
}

p_blen <- if (!is.null(blen_data) && nrow(blen_data) > 0) {
  
  # Build pairwise comparisons: small–large, medium–large, small–medium
  blen_pairs <- bind_rows(
    blen_data %>%
      transmute(x = small,  y = large,  comparison = "Small vs Large"),
    blen_data %>%
      transmute(x = medium, y = large,  comparison = "Medium vs Large"),
    blen_data %>%
      transmute(x = small,  y = medium, comparison = "Small vs Medium")
  )
  
  ggplot(blen_pairs, aes(x = x, y = y)) +
    geom_point(alpha = 0.6, color = "#2ca02c") +
    scale_x_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
    scale_y_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
    labs(
      title = "Blenkiron_2016: Pairwise library-size comparisons",
      x = "Median RPKM (X library)",
      y = "Median RPKM (Y library)"
    ) +
    geom_abline(slope = 1, linetype = "dashed") +
    facet_wrap(~ comparison)
  
} else {
  ggplot() + labs(title = "Blenkiron_2016: No Data")
}

p_blen

Sahr_2022_2: MVseq vs RNAseq

sahr2_data <- NULL
if ("Sahr_2022_2" %in% names(meta_list)) {
  meta <- meta_list$Sahr_2022_2
  counts <- load_counts_long(meta)
  
  sRNA_keep <- counts %>%
    left_join(meta %>% select(sample_id, condition), by = "sample_id") %>%
    group_by(srna_id, condition) %>%
    summarise(mean_expr = mean(RPKM, na.rm = TRUE), .groups = "drop") %>%
    group_by(srna_id) %>%
    filter(any(mean_expr > 0.6)) %>%
    pull(srna_id)
  
  med <- counts %>%
    filter(srna_id %in% sRNA_keep) %>%
    left_join(meta %>% select(sample_id, condition), by = "sample_id") %>%
    group_by(srna_id, condition) %>%
    summarise(median_RPKM = median(RPKM, na.rm = TRUE), .groups = "drop") %>%
    pivot_wider(names_from = condition, values_from = median_RPKM) %>%
    mutate(across(where(is.numeric), ~replace_na(., 0.01)))
  
  sahr2_data <- med
}

p_sahr2 <- if (!is.null(sahr2_data) && nrow(sahr2_data) > 0) {
  ggplot(sahr2_data, aes(x = RNAseq, y = MVseq)) +
    geom_point(alpha = 0.6, color = "#d62728") +
    scale_x_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) + 
    scale_y_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
    labs(title = "Sahr_2022_2: EV vs Total", x = "Total RNA (RPKM)", y = "EV RNA (RPKM)") +
    geom_abline(slope = 1, linetype = "dashed")
} else { ggplot() + labs(title = "Sahr_2022_2: No Data") }
p_sahr2

Malabirade_2018: Comparison by Biological Condition

mala_data_list <- list()
if ("Malabirade_2018" %in% names(meta_list)) {
  meta <- meta_list$Malabirade_2018
  counts <- load_counts_long(meta)
  
  meta_proc <- meta %>%
    mutate(
      type = case_when(
        str_detect(condition, "OMV-RNAs") ~ "OMV",
        str_detect(condition, "intracellularRNAs") ~ "Intracellular",
        TRUE ~ NA_character_
      ),

      base_condition = condition %>%
        str_remove("_OMV-RNAs") %>%
        str_remove("_intracellularRNAs") %>%
        str_remove(" ") %>% # Remove stray spaces
        str_trim()
    ) %>%
    filter(!is.na(type))
  
  conditions <- unique(meta_proc$base_condition)
  
  for (cond in conditions) {
    # Subset metadata for this condition
    sub_meta <- meta_proc %>% filter(base_condition == cond)

    # Mean > 0.6
    sRNA_keep <- counts %>%
      filter(sample_id %in% sub_meta$sample_id) %>%
      left_join(sub_meta %>% select(sample_id, type), by = "sample_id") %>%
      group_by(srna_id, type) %>%
      summarise(mean_expr = mean(RPKM, na.rm = TRUE), .groups = "drop") %>%
      group_by(srna_id) %>%
      filter(any(mean_expr > 0.6)) %>%
      pull(srna_id)
    
    if (length(sRNA_keep) > 0) {
      med <- counts %>%
        filter(sample_id %in% sub_meta$sample_id) %>%
        filter(srna_id %in% sRNA_keep) %>%
        left_join(sub_meta %>% select(sample_id, type), by = "sample_id") %>%
        group_by(srna_id, type) %>%
        summarise(median_RPKM = median(RPKM, na.rm = TRUE), .groups = "drop") %>%
        pivot_wider(names_from = type, values_from = median_RPKM) %>%
        mutate(across(where(is.numeric), ~replace_na(., 0.01))) %>%
        mutate(condition = cond) %>%
        left_join(srna_lengths, by = "srna_id")
      
      mala_data_list[[cond]] <- med
    }
  }
}

if (length(mala_data_list) > 0) {
  all_mala <- bind_rows(mala_data_list)
  
  # Annotate with growth metadata for downstream plots
  cond_annotations <- meta_proc %>%
    distinct(base_condition, growth_medium, growth_phase, rna_sample_type)
  all_mala <- all_mala %>%
    left_join(cond_annotations, by = c("condition" = "base_condition"))
  
  ggplot(all_mala, aes(x = Intracellular, y = OMV, color = seq_len)) +
    geom_point(alpha = 0.6, size = 2) +
    scale_color_gradient(low = "blue", high = "red", na.value = "grey70") +
    scale_x_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
    scale_y_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000)) +
    geom_abline(slope = 1, linetype = "dashed") +
    facet_wrap(~condition, scales = "fixed") +
    labs(title = "Malabirade_2018: OMV vs Intracellular by Condition",
         x = "Intracellular (RPKM)", y = "OMV (RPKM)", color = "sRNA length (nt)") +
    theme(strip.text = element_text(size = 8))
} else {
  ggplot() + labs(title = "Malabirade_2018: No Data")
}

Tables containing OMV-enriched RNAs per condition

# Check if the data frame exists from the previous chunk
if (exists("all_mala")) {
  # Get unique conditions (facets)
  conditions <- unique(all_mala$condition)
  
  for (cond in conditions) {
    # Filter for sRNAs enriched in OMV (OMV > Intracellular)
    omv_enriched <- all_mala %>%
      filter(condition == cond) %>%
      filter(OMV > Intracellular) %>%
      select(srna_id, seq_len, OMV, Intracellular) %>%
      distinct() %>% # Deduplicate entries created by the metadata join
      arrange(desc(OMV))
    
    # Create a Markdown header for each condition
    cat(paste("\n####", cond, "\n\n"))
    
    # Print the table if data exists
    if (nrow(omv_enriched) > 0) {
      print(knitr::kable(omv_enriched, caption = paste("sRNAs with OMV > Intracellular in", cond)))
    } else {
      cat("No sRNAs found with higher OMV expression in this condition.\n")
    }
    
    # Add newlines for spacing between tables
    cat("\n")
  }
}

ControlPCN_LowOD

sRNAs with OMV > Intracellular in ControlPCN_LowOD
srna_id	seq_len	OMV	Intracellular
sRNA-Sente-37	46	35.45908	11.82353

SPI-2ind_LowOD

sRNAs with OMV > Intracellular in SPI-2ind_LowOD
srna_id	seq_len	OMV	Intracellular
sRNA-Sente-37	46	139.1449	4.086566

SPI-1ind-LB_HighOD

sRNAs with OMV > Intracellular in SPI-1ind-LB_HighOD
srna_id	seq_len	OMV	Intracellular
sRNA-Sente-36	108	8405.86870	438.5577361
sRNA-Styph-1	296	6145.35312	10.5823070
sRNA-Sente-46	266	3876.27595	26.6404897
sRNA-Sente-52	1236	2627.36197	410.7585078
sRNA-Sente-33	82	1965.82992	1135.2027459
sRNA-Ecoli-61	106	1538.10064	38.0156916
sRNA-Sflex-21	105	1511.47483	37.6797771
sRNA-Sente-5	32	1384.16287	261.9766245
sRNA-Sente-35	90	924.38397	124.7045432
sRNA-Ecoli-46	93	788.35502	8.9652078
sRNA-Sflex-15	92	780.72819	8.9504852
sRNA-Sente-50	191	720.88289	309.8276334
sRNA-Sflex-41	94	582.97100	79.9381348
manual_IsrA	NA	578.47832	0.5131556
sRNA-Sente-41	146	564.70392	406.5214043
sRNA-Ecoli-60	73	491.71249	141.3234982
sRNA-Ecoli-13	84	474.64201	362.3001297
sRNA-Sente-1	144	467.73287	122.0943470
sRNA-Sente-44	144	467.73287	122.0943470
sRNA-Ecoli-103	114	443.73451	136.1333928
sRNA-Sflex-6	114	443.73451	136.1333928
sRNA-Sente-51	284	402.64798	38.8113518
sRNA-Ecoli-70	95	402.30762	63.8035681
sRNA-Sflex-23	79	385.80073	147.6851742
sRNA-Ecoli-64	79	357.78880	159.8833597
sRNA-Ecoli-17	246	355.48712	29.4636840
sRNA-Sflex-3	245	355.48712	29.4636840
sRNA-Sente-49	85	349.02792	16.3039797
sRNA-Sflex-46	109	263.61919	188.3603141
sRNA-Ecoli-86	111	263.03041	190.3157689
sRNA-Sente-55	486	254.93458	127.3269480
sRNA-Ecoli-87	57	234.16588	63.3461872
sRNA-Sflex-47	57	234.16588	63.3461872
sRNA-Sente-28	113	200.04483	36.4781997
sRNA-Sente-37	46	183.23240	4.0336417
sRNA-Sente-13	162	180.74064	127.6364763
sRNA-Ecoli-2	105	161.98028	44.5074169
sRNA-Sflex-5	87	151.47334	20.7369153
sRNA-Sente-25	211	134.01291	128.7029000
sRNA-Ecoli-52	88	124.07863	18.8340761
sRNA-Sflex-33	88	124.07863	18.4647805
sRNA-Sente-23	119	108.63634	87.9016456
sRNA-Sflex-13	77	102.32459	31.5357441
sRNA-Sente-48	196	99.52849	80.8043643
sRNA-Ecoli-49	184	93.69258	14.6060289

ControlPCN_HighOD

sRNAs with OMV > Intracellular in ControlPCN_HighOD
srna_id	seq_len	OMV	Intracellular
sRNA-Ecoli-13	84	46.593704	36.861068
sRNA-Sente-37	46	33.538715	1.304677
sRNA-Styph-1	296	27.037588	9.544384
sRNA-Sente-46	266	20.178142	9.568794
sRNA-Ecoli-46	93	9.323033	8.858069
sRNA-Sflex-15	92	9.272199	8.993306
manual_IsrA	NA	3.743399	0.000000

SPI-2ind_HighOD

sRNAs with OMV > Intracellular in SPI-2ind_HighOD
srna_id	seq_len	OMV	Intracellular
sRNA-Ecoli-13	84	6266.6253	146.18883
sRNA-Sente-37	46	282.3882	17.99624
sRNA-Sente-48	196	105.1909	47.86915

# Save OMV-enriched tables to files
if (exists("all_mala")) {
  output_dir <- path_from_root("explore/summary/output")
  if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
  
  conditions <- unique(all_mala$condition)
  
  for (cond in conditions) {
    omv_enriched <- all_mala %>%
      filter(condition == cond) %>%
      filter(OMV > Intracellular) %>%
      select(srna_id, seq_len, OMV, Intracellular) %>%
      distinct() %>%
      arrange(desc(OMV))
    
    # Write CSV per condition
    fn <- file.path(output_dir, paste0("omv_enriched_", cond, ".csv"))
    readr::write_csv(omv_enriched, fn)
  }
}

Observations

1. Baseline Condition (ControlPCN)

To establish the baseline profile of “normal” sRNA packaging, we examine both ControlPCN_LowOD (log phase) and ControlPCN_HighOD (stationary phase).

(a) LowOD: Logarithmic growth phase

sRNA RPKMs are strongly skewed to the right (higher intracellular abundance).
Intracellular sRNAs generally have higher RPKM values than OMV-associated sRNAs.
Only a single point lies to the left of the diagonal, indicating OMV enrichment which is sRNA-Sente-37 (This sRNA is enriched across all conditions)
This point may represent an sRNA preferentially packaged into OMVs even during active growth. (See table above)

(b) HighOD: Stationary phase

The global distribution shifts leftward, and many points move closer to the diagonal.
Most sRNAs remain on the right side, confirming higher intracellular abundance.
Approximately five sRNAs fall to the left of the diagonal, indicating OMV enrichment under stationary-phase stress.
These OMV-enriched sRNAs may be associated with general stress responses (e.g., nutrient limitation).

Does general stress increase RNA expression?
Yes.

Does general stress increase selective RNA packaging into OMVs?
Yes.

2. “Invasion” Mimic Condition (SPI-1 Induction)

We compare SPI-1ind-LB_HighOD against ControlPCN_HighOD.

Biological context:
SPI-1 induction in LB medium mimics the early intestinal infection stage by activating Salmonella Pathogenicity Island 1.

Key observations: - A greater number of points fall left of the diagonal compared to the control, indicating more sRNAs enriched in OMVs during the invasion-like condition. - Overall expression of sRNAs increases in both intracellular and OMV fractions compared to the control. - These OMV-enriched sRNAs may represent candidates involved in the intestinal invasion phase.

Does sRNA expression increase during intestinal invasion?
Yes.

Does selective sRNA packaging increase during invasion compared to the control?
Yes.

3. “Macrophage Survival” Mimic (SPI-2 Induction)

Biological context:
SPI-2 medium mimics the intracellular macrophage environment by inducing SPI-2 genes under acidic (pH 5.8), phosphate-limited PCN conditions.

Key observations: - Three sRNAs are located to the left of the diagonal, indicating OMV-specific enrichment relative to intracellular abundance. - These may represent candidates important for Salmonella survival or interaction with host (human) pathways during the macrophage phase.

Scatter Plots comparing conditions

if (exists("all_mala")) {
  
  # Define condition names mapping to what is in all_mala$condition
  # High OD (stationary) comparisons and Low OD (log phase) control vs SPI-2
  cond_ctrl      <- "ControlPCN_HighOD"
  cond_spi1      <- "SPI-1ind-LB_HighOD"
  cond_spi2      <- "SPI-2ind_HighOD"
  cond_ctrl_low  <- "ControlPCN_LowOD"
  cond_spi2_low  <- "SPI-2ind_LowOD"
  
  # Where to save outputs
  output_dir <- path_from_root("explore/summary/output")
  if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
  
  # Check if these conditions exist in the data
  avail_conds <- unique(all_mala$condition)
  required_conditions <- c(cond_ctrl, cond_spi1, cond_spi2, cond_ctrl_low, cond_spi2_low)
  has_conds <- required_conditions %in% avail_conds
  
  if (all(has_conds)) {
    
    # --- Pairwise scatter panels ---
    
    conditions_keep <- required_conditions
    
    # Collapse any duplicates per (srna_id, seq_len, condition) to avoid list-cols
    mala_base <- all_mala %>%
      filter(condition %in% conditions_keep) %>%
      select(srna_id, seq_len, condition, Intracellular, OMV)
    
    mala_wide <- mala_base %>%
      group_by(srna_id, seq_len, condition) %>%
      summarise(
        Intracellular = median(Intracellular, na.rm = TRUE),
        OMV = median(OMV, na.rm = TRUE),
        .groups = "drop"
      ) %>%
      pivot_wider(
        id_cols = c(srna_id, seq_len),
        names_from = condition,
        values_from = c(Intracellular, OMV),
        names_glue = "{.value}_{condition}"
      )
    
    # Helper for consistency
    plot_pairwise <- function(data, x_var, y_var, title_str) {
      ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
        geom_point(alpha = 0.6, color = "#4C78A8") +
        scale_x_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000, 10000)) +
        scale_y_continuous(trans = "log1p", breaks = c(0, 1, 10, 100, 1000, 10000)) +
        geom_abline(slope = 1, linetype = "dashed", color = "grey50") +
        labs(title = title_str, x = x_var, y = y_var) +
        theme_minimal() +
        theme(plot.title = element_text(size = 11))
    }
    
    # 1. Intracellular: Control vs SPI1
    p_ic_spi1 <- plot_pairwise(mala_wide, 
                               paste0("Intracellular_", cond_ctrl), 
                               paste0("Intracellular_", cond_spi1), 
                               "Intracellular: Control vs SPI1")
    
    # 2. OMV: Control vs SPI1
    p_omv_spi1 <- plot_pairwise(mala_wide, 
                                paste0("OMV_", cond_ctrl), 
                                paste0("OMV_", cond_spi1), 
                                "OMV: Control vs SPI1")
    
    # 3. Intracellular: Control vs SPI2
    p_ic_spi2 <- plot_pairwise(mala_wide, 
                               paste0("Intracellular_", cond_ctrl), 
                               paste0("Intracellular_", cond_spi2), 
                               "Intracellular: Control vs SPI2")
    
    # 4. OMV: Control vs SPI2
    p_omv_spi2 <- plot_pairwise(mala_wide, 
                                paste0("OMV_", cond_ctrl), 
                                paste0("OMV_", cond_spi2), 
                                "OMV: Control vs SPI2")
    
    # 5. Intracellular: Control LowOD vs SPI2 LowOD
    p_ic_low_spi2 <- plot_pairwise(mala_wide,
                                   paste0("Intracellular_", cond_ctrl_low),
                                   paste0("Intracellular_", cond_spi2_low),
                                   "Intracellular: Control LowOD vs SPI2 LowOD")
    
    # 6. OMV: Control LowOD vs SPI2 LowOD
    p_omv_low_spi2 <- plot_pairwise(mala_wide,
                                    paste0("OMV_", cond_ctrl_low),
                                    paste0("OMV_", cond_spi2_low),
                                    "OMV: Control LowOD vs SPI2 LowOD")
    
    combined_plot <- ggpubr::ggarrange(
      p_ic_spi1, p_omv_spi1,
      p_ic_spi2, p_omv_spi2,
      p_ic_low_spi2, p_omv_low_spi2,
      ncol = 2, nrow = 3
    )
    
    print(combined_plot)
    
    # Save combined panel (PNG)
    ggplot2::ggsave(
      filename = file.path(output_dir, "malabirade_pairwise_scatter.png"),
      plot = combined_plot,
      width = 10, height = 16, dpi = 300
    )
  } else {
    print(paste("Missing required conditions for plots. Available:", paste(avail_conds, collapse=", ")))
  }
}

Observation:

1. Intracellular baseline is generally stable accross conditions. SPI-1 and SPI-2 induction do not strongly change intracellular sRNA levels. There are no clear upward or downward shift at both log phase and stationary phase.
1. SPI-1 causes a global increase of sRNAs in OMVs. All points are above the diagonal compared to Control stationary phase.

Because the corresponding intracellular panel is stable, it may mean selective export/packaging preference of the sRNA during intestinal infection mimic stage.

1. SPI-2 heterogenously increases/decreases OMV sRNAs at stationary phase. Many points lie above the diagonal, but the pattern is dispersed. There are a few sRNAs which are expressed in the control but NOT expressed at all on the SPI-2ind samples at stationary phase.
1. Most sRNA abundance increases in SPI-2ind samples compared to the Control samples during log growth phase. There are few sRNAs that are only detected in the SPI-2ind samples but NOT in the control samples.

Combined Analysis Summary: All Datasets

Md Mobashir Rahman

2025-11-22