1 Goal

This R Markdown file implements the project using the airway bulk RNA-seq dataset.

The analysis tests how different upstream gene-filtering strategies interact with three differential expression methods:

  1. DESeq2
  2. edgeR quasi-likelihood F-test
  3. limma-voom

The airway dataset contains human airway smooth muscle cell RNA-seq samples treated with dexamethasone versus untreated controls. The samples are paired by cell line, so the default model uses:

~ cell + condition

This keeps the two-condition comparison while accounting for cell-line effects.

2 Package setup

knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.width = 10,
  fig.height = 6
)

if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install("airway")
library(airway)
data(airway)

required_pkgs <- c(
  "airway",
  "SummarizedExperiment",
  "DESeq2",
  "edgeR",
  "limma",
  "ggplot2",
  "dplyr",
  "tidyr",
  "tibble",
  "readr",
  "UpSetR",
  "knitr"
)

missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]

if (length(missing_pkgs) > 0) {
  stop(
    "Missing packages: ", paste(missing_pkgs, collapse = ", "),
    "\nInstall CRAN packages with install.packages(). ",
    "Install Bioconductor packages with BiocManager::install()."
  )
}

suppressPackageStartupMessages({
  library(airway)
  library(SummarizedExperiment)
  library(DESeq2)
  library(edgeR)
  library(limma)
  library(ggplot2)
  library(dplyr)
  library(tidyr)
  library(tibble)
  library(readr)
  library(UpSetR)
})

alpha <- params$alpha
lfc_threshold <- params$lfc_threshold

# Output folders
dir.create("results", showWarnings = FALSE)
dir.create("figures", showWarnings = FALSE)
dir.create("figures/upset", recursive = TRUE, showWarnings = FALSE)
dir.create("figures/go", recursive = TRUE, showWarnings = FALSE)

2.1 Optional installation commands

Run this only if packages are missing. Restart R after installing packages.

install.packages(c("ggplot2", "dplyr", "tidyr", "tibble", "readr", "UpSetR", "knitr"))

if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install(c(
  "airway",
  "SummarizedExperiment",
  "DESeq2",
  "edgeR",
  "limma"
))

# Optional GO enrichment packages
BiocManager::install(c("clusterProfiler", "org.Hs.eg.db"))

3 Load airway data

data("airway", package = "airway")
airway_se <- airway

counts_raw <- as.matrix(SummarizedExperiment::assay(airway_se))
metadata <- as.data.frame(SummarizedExperiment::colData(airway_se))

# Keep clean sample IDs.
metadata$sample_id <- rownames(metadata)

# Airway uses dex == "trt" and "untrt".
metadata$condition <- factor(metadata$dex, levels = c("untrt", "trt"))
metadata$cell <- factor(metadata$cell)

# Make gene IDs cleaner for downstream annotation.
# Airway rownames are Ensembl IDs. This also handles cases where version suffixes exist.
gene_id_original <- rownames(counts_raw)
gene_id_clean <- sub("\\\\..*$", "", gene_id_original)
rownames(counts_raw) <- make.unique(gene_id_clean)

# Ensure sample order matches.
counts_raw <- counts_raw[, metadata$sample_id]

metadata %>%
  dplyr::select(sample_id, cell, dex, condition) %>%
  knitr::kable()
sample_id cell dex condition
SRR1039508 SRR1039508 N61311 untrt untrt
SRR1039509 SRR1039509 N61311 trt trt
SRR1039512 SRR1039512 N052611 untrt untrt
SRR1039513 SRR1039513 N052611 trt trt
SRR1039516 SRR1039516 N080611 untrt untrt
SRR1039517 SRR1039517 N080611 trt trt
SRR1039520 SRR1039520 N061011 untrt untrt
SRR1039521 SRR1039521 N061011 trt trt

3.1 Dataset summary

dataset_summary <- tibble::tibble(
  n_genes = nrow(counts_raw),
  n_samples = ncol(counts_raw),
  n_untrt = sum(metadata$condition == "untrt"),
  n_trt = sum(metadata$condition == "trt"),
  n_cells = length(unique(metadata$cell))
)

dataset_summary %>% knitr::kable()
n_genes n_samples n_untrt n_trt n_cells
63677 8 4 4 4

4 Helper functions

make_design_matrix <- function(meta, paired_design = TRUE) {
  if (paired_design && "cell" %in% colnames(meta)) {
    design <- stats::model.matrix(~ cell + condition, data = meta)
  } else {
    design <- stats::model.matrix(~ condition, data = meta)
  }
  design
}

make_deseq_formula <- function(paired_design = TRUE) {
  if (paired_design) {
    stats::as.formula("~ cell + condition")
  } else {
    stats::as.formula("~ condition")
  }
}

get_condition_coef <- function(design_matrix) {
  coef_name <- grep("^condition", colnames(design_matrix), value = TRUE)
  if (length(coef_name) != 1) {
    stop(
      "Could not identify exactly one condition coefficient. Found: ",
      paste(coef_name, collapse = ", ")
    )
  }
  coef_name
}

standardize_gene_ids <- function(gene_ids) {
  sub("\\\\..*$", "", gene_ids)
}

run_deseq2 <- function(counts_mat, meta, alpha = 0.05, lfc_threshold = 0,
                       paired_design = TRUE) {
  dds <- DESeq2::DESeqDataSetFromMatrix(
    countData = round(counts_mat),
    colData = meta,
    design = make_deseq_formula(paired_design)
  )

  dds <- DESeq2::DESeq(dds, quiet = TRUE)

  res <- DESeq2::results(
    dds,
    contrast = c("condition", "trt", "untrt"),
    alpha = alpha,
    independentFiltering = TRUE
  )

  res_df <- as.data.frame(res) %>%
    tibble::rownames_to_column("gene_id") %>%
    dplyr::transmute(
      gene_id = gene_id,
      ensembl_id = standardize_gene_ids(gene_id),
      log2FoldChange = log2FoldChange,
      statistic = stat,
      pvalue = pvalue,
      padj = padj,
      is_de = !is.na(padj) &
        padj < alpha &
        abs(log2FoldChange) >= lfc_threshold
    )

  deseq_metadata <- metadata(res)
  filter_threshold <- NA_real_
  if (!is.null(deseq_metadata$filterThreshold)) {
    filter_threshold <- as.numeric(deseq_metadata$filterThreshold)
  }

  attr(res_df, "deseq_filter_threshold") <- filter_threshold
  attr(res_df, "deseq_filter_num_rej") <- deseq_metadata$filterNumRej
  attr(res_df, "deseq_alpha") <- deseq_metadata$alpha

  res_df
}

run_edger_qlf <- function(counts_mat, meta, alpha = 0.05, lfc_threshold = 0,
                          paired_design = TRUE) {
  design <- make_design_matrix(meta, paired_design)
  condition_coef <- get_condition_coef(design)

  y <- edgeR::DGEList(counts = counts_mat)
  y <- edgeR::calcNormFactors(y)
  y <- edgeR::estimateDisp(y, design)

  fit <- edgeR::glmQLFit(y, design)
  qlf <- edgeR::glmQLFTest(fit, coef = condition_coef)

  tab <- edgeR::topTags(qlf, n = Inf, sort.by = "none")$table

  tab %>%
    tibble::rownames_to_column("gene_id") %>%
    dplyr::transmute(
      gene_id = gene_id,
      ensembl_id = standardize_gene_ids(gene_id),
      log2FoldChange = logFC,
      statistic = F,
      pvalue = PValue,
      padj = FDR,
      is_de = !is.na(padj) &
        padj < alpha &
        abs(log2FoldChange) >= lfc_threshold
    )
}

run_limma_voom <- function(counts_mat, meta, alpha = 0.05, lfc_threshold = 0,
                           paired_design = TRUE) {
  design <- make_design_matrix(meta, paired_design)
  condition_coef <- get_condition_coef(design)

  y <- edgeR::DGEList(counts = counts_mat)
  y <- edgeR::calcNormFactors(y)

  v <- limma::voom(y, design, plot = FALSE)
  fit <- limma::lmFit(v, design)
  fit <- limma::eBayes(fit)

  tab <- limma::topTable(
    fit,
    coef = condition_coef,
    number = Inf,
    sort.by = "none"
  )

  tab %>%
    tibble::rownames_to_column("gene_id") %>%
    dplyr::transmute(
      gene_id = gene_id,
      ensembl_id = standardize_gene_ids(gene_id),
      log2FoldChange = logFC,
      statistic = t,
      pvalue = P.Value,
      padj = adj.P.Val,
      is_de = !is.na(padj) &
        padj < alpha &
        abs(log2FoldChange) >= lfc_threshold
    )
}

run_de_method <- function(method, counts_mat, meta, alpha = 0.05,
                          lfc_threshold = 0, paired_design = TRUE) {
  if (method == "DESeq2") {
    return(run_deseq2(counts_mat, meta, alpha, lfc_threshold, paired_design))
  }

  if (method == "edgeR_QLF") {
    return(run_edger_qlf(counts_mat, meta, alpha, lfc_threshold, paired_design))
  }

  if (method == "limma_voom") {
    return(run_limma_voom(counts_mat, meta, alpha, lfc_threshold, paired_design))
  }

  stop("Unknown method: ", method)
}

5 Filtering functions

filter_no_filter <- function(counts_mat) {
  rep(TRUE, nrow(counts_mat))
}

filter_total_count_ge_10 <- function(counts_mat) {
  rowSums(counts_mat) >= 10
}

filter_cpm_ge_0_5_in_ge_2 <- function(counts_mat) {
  cpm_mat <- edgeR::cpm(counts_mat)
  rowSums(cpm_mat >= 0.5) >= 2
}

filter_cpm_ge_1_in_ge_half <- function(counts_mat) {
  cpm_mat <- edgeR::cpm(counts_mat)
  min_samples <- ceiling(ncol(counts_mat) / 2)
  rowSums(cpm_mat >= 1) >= min_samples
}

filter_edgeR_filterByExpr <- function(counts_mat, meta, paired_design = TRUE) {
  y <- edgeR::DGEList(counts = counts_mat)
  design <- make_design_matrix(meta, paired_design)
  edgeR::filterByExpr(y, design = design)
}

make_adaptive_candidate_filters <- function(counts_mat, meta) {
  cpm_mat <- edgeR::cpm(counts_mat)
  expr_cpm1 <- cpm_mat >= 1
  group <- meta$condition

  min_group_1 <- ceiling(sum(group == "untrt") / 2)
  min_group_2 <- ceiling(sum(group == "trt") / 2)

  group_presence <-
    rowSums(expr_cpm1[, group == "untrt", drop = FALSE]) >= min_group_1 |
    rowSums(expr_cpm1[, group == "trt", drop = FALSE]) >= min_group_2

  list(
    mean_count_ge_1 = rowMeans(counts_mat) >= 1,
    mean_count_ge_5 = rowMeans(counts_mat) >= 5,
    max_count_ge_10 = apply(counts_mat, 1, max) >= 10,
    total_count_ge_10 = rowSums(counts_mat) >= 10,
    cpm_0_5_in_2 = rowSums(cpm_mat >= 0.5) >= 2,
    cpm_1_in_half = rowSums(cpm_mat >= 1) >= ceiling(ncol(counts_mat) / 2),
    zero_prop_le_0_75 = rowMeans(counts_mat == 0) <= 0.75,
    group_presence_cpm1 = group_presence
  )
}

select_adaptive_filter <- function(counts_mat, meta, alpha = 0.05,
                                   lfc_threshold = 0,
                                   paired_design = TRUE,
                                   minimum_genes = 100,
                                   fallback_name = "cpm_0_5_in_2") {
  candidate_filters <- make_adaptive_candidate_filters(counts_mat, meta)

  scores <- tibble::tibble(
    candidate = names(candidate_filters),
    retained_genes = NA_integer_,
    n_rejections = NA_integer_,
    status = NA_character_
  )

  for (i in seq_along(candidate_filters)) {
    candidate_name <- names(candidate_filters)[i]
    mask <- candidate_filters[[i]]
    retained <- sum(mask)
    scores$retained_genes[i] <- retained

    if (retained < minimum_genes) {
      scores$n_rejections[i] <- NA_integer_
      scores$status[i] <- "skipped_too_few_genes"
      next
    }

    candidate_result <- tryCatch(
      {
        run_limma_voom(
          counts_mat[mask, , drop = FALSE],
          meta,
          alpha = alpha,
          lfc_threshold = lfc_threshold,
          paired_design = paired_design
        )
      },
      error = function(e) {
        attr(e, "message") <- conditionMessage(e)
        e
      }
    )

    if (inherits(candidate_result, "error")) {
      scores$n_rejections[i] <- NA_integer_
      scores$status[i] <- paste0("failed: ", conditionMessage(candidate_result))
    } else {
      scores$n_rejections[i] <- sum(candidate_result$is_de, na.rm = TRUE)
      scores$status[i] <- "ok"
    }
  }

  ok_scores <- scores %>%
    dplyr::filter(status == "ok", !is.na(n_rejections))

  if (nrow(ok_scores) == 0 || max(ok_scores$n_rejections, na.rm = TRUE) == 0) {
    selected_name <- fallback_name
  } else {
    selected_name <- ok_scores %>%
      dplyr::arrange(dplyr::desc(n_rejections), retained_genes) %>%
      dplyr::slice(1) %>%
      dplyr::pull(candidate)
  }

  list(
    selected_name = selected_name,
    selected_mask = candidate_filters[[selected_name]],
    scores = scores
  )
}

6 Create filters

adaptive_selection <- select_adaptive_filter(
  counts_raw,
  metadata,
  alpha = alpha,
  lfc_threshold = lfc_threshold,
  paired_design = params$paired_design
)

filter_masks <- list(
  no_filter = filter_no_filter(counts_raw),
  total_count_ge_10 = filter_total_count_ge_10(counts_raw),
  cpm_ge_0_5_in_ge_2 = filter_cpm_ge_0_5_in_ge_2(counts_raw),
  cpm_ge_1_in_ge_half = filter_cpm_ge_1_in_ge_half(counts_raw),
  edgeR_filterByExpr = filter_edgeR_filterByExpr(
    counts_raw,
    metadata,
    paired_design = params$paired_design
  ),
  adaptive_limma_voom_selected = adaptive_selection$selected_mask
)

filter_summary <- tibble::tibble(
  filter = names(filter_masks),
  retained_genes = vapply(filter_masks, sum, integer(1)),
  removed_genes = nrow(counts_raw) - retained_genes,
  retained_percent = round(100 * retained_genes / nrow(counts_raw), 2)
)

filter_summary %>% knitr::kable()
filter retained_genes removed_genes retained_percent
no_filter 63677 0 100.00
total_count_ge_10 22369 41308 35.13
cpm_ge_0_5_in_ge_2 16730 46947 26.27
cpm_ge_1_in_ge_half 14332 49345 22.51
edgeR_filterByExpr 16860 46817 26.48
adaptive_limma_voom_selected 18086 45591 28.40
adaptive_selection$scores %>% knitr::kable()
candidate retained_genes n_rejections status
mean_count_ge_1 23171 5083 ok
mean_count_ge_5 17852 5152 ok
max_count_ge_10 18086 5211 ok
total_count_ge_10 22369 5108 ok
cpm_0_5_in_2 16730 5161 ok
cpm_1_in_half 14332 4847 ok
zero_prop_le_0_75 28877 4738 ok
group_presence_cpm1 14827 5001 ok
cat("Adaptive filter selected:", adaptive_selection$selected_name, "\n")
## Adaptive filter selected: max_count_ge_10
readr::write_csv(filter_summary, "results/filter_summary_airway.csv")
readr::write_csv(adaptive_selection$scores, "results/adaptive_filter_scores_airway.csv")

7 Run DESeq2, edgeR-QLF, and limma-voom under each filter

methods <- c("DESeq2", "edgeR_QLF", "limma_voom")
all_results <- list()
deseq_filter_diagnostics <- list()

for (filter_name in names(filter_masks)) {
  mask <- filter_masks[[filter_name]]
  counts_filtered <- counts_raw[mask, , drop = FALSE]

  for (method_name in methods) {
    message("Running ", method_name, " under filter: ", filter_name)

    result <- run_de_method(
      method = method_name,
      counts_mat = counts_filtered,
      meta = metadata,
      alpha = alpha,
      lfc_threshold = lfc_threshold,
      paired_design = params$paired_design
    )

    analysis_id <- paste(filter_name, method_name, sep = "__")
    all_results[[analysis_id]] <- result

    if (method_name == "DESeq2") {
      deseq_filter_diagnostics[[filter_name]] <- tibble::tibble(
        filter = filter_name,
        upstream_retained_genes = nrow(counts_filtered),
        deseq_filter_threshold = attr(result, "deseq_filter_threshold"),
        n_padj_na = sum(is.na(result$padj)),
        n_pvalue_na = sum(is.na(result$pvalue)),
        n_de = sum(result$is_de, na.rm = TRUE)
      )
    }
  }
}

de_results_long <- dplyr::bind_rows(all_results, .id = "analysis_id") %>%
  tidyr::separate(
    analysis_id,
    into = c("filter", "method"),
    sep = "__",
    remove = FALSE
  ) %>%
  dplyr::mutate(
    filter = factor(filter, levels = names(filter_masks)),
    method = factor(method, levels = methods)
  )

deseq_filter_diagnostics_df <- dplyr::bind_rows(deseq_filter_diagnostics)

readr::write_csv(de_results_long, "results/all_de_results_airway.csv")
readr::write_csv(deseq_filter_diagnostics_df, "results/deseq_independent_filtering_diagnostics_airway.csv")

8 DEG count comparison

deg_summary <- de_results_long %>%
  dplyr::group_by(filter, method) %>%
  dplyr::summarise(
    n_tested = dplyr::n(),
    n_pvalue_na = sum(is.na(pvalue)),
    n_padj_na = sum(is.na(padj)),
    n_de = sum(is_de, na.rm = TRUE),
    .groups = "drop"
  )

readr::write_csv(deg_summary, "results/deg_count_summary_airway.csv")

deg_summary %>% knitr::kable()
filter method n_tested n_pvalue_na n_padj_na n_de
no_filter DESeq2 63677 30208 46895 4028
no_filter edgeR_QLF 63677 0 0 3704
no_filter limma_voom 63677 0 0 6723
total_count_ge_10 DESeq2 22369 0 5204 4030
total_count_ge_10 edgeR_QLF 22369 0 0 5088
total_count_ge_10 limma_voom 22369 0 0 5108
cpm_ge_0_5_in_ge_2 DESeq2 16730 0 0 4122
cpm_ge_0_5_in_ge_2 edgeR_QLF 16730 0 0 5119
cpm_ge_0_5_in_ge_2 limma_voom 16730 0 0 5161
cpm_ge_1_in_ge_half DESeq2 14332 0 0 3994
cpm_ge_1_in_ge_half edgeR_QLF 14332 0 0 4764
cpm_ge_1_in_ge_half limma_voom 14332 0 0 4847
edgeR_filterByExpr DESeq2 16860 0 0 4113
edgeR_filterByExpr edgeR_QLF 16860 0 0 5127
edgeR_filterByExpr limma_voom 16860 0 0 5170
adaptive_limma_voom_selected DESeq2 18086 0 702 4062
adaptive_limma_voom_selected edgeR_QLF 18086 0 0 5200
adaptive_limma_voom_selected limma_voom 18086 0 0 5211
p_deg_counts <- ggplot2::ggplot(
  deg_summary,
  ggplot2::aes(x = filter, y = n_de, fill = method)
) +
  ggplot2::geom_col(position = "dodge") +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Number of DE genes by filtering strategy and DE method",
    x = "Filtering strategy",
    y = paste0("Number of DE genes, FDR < ", alpha),
    fill = "DE method"
  ) +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))

p_deg_counts

ggplot2::ggsave(
  "figures/deg_counts_by_filter_and_method_airway.png",
  p_deg_counts,
  width = 11,
  height = 6,
  dpi = 300
)

9 P-value distribution diagnostics

p_pvalue <- de_results_long %>%
  dplyr::filter(!is.na(pvalue)) %>%
  ggplot2::ggplot(ggplot2::aes(x = pvalue)) +
  ggplot2::geom_histogram(bins = 40, boundary = 0) +
  ggplot2::facet_grid(method ~ filter) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "P-value distributions by filtering strategy and DE method",
    x = "Raw p-value",
    y = "Number of genes"
  ) +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))

p_pvalue

ggplot2::ggsave(
  "figures/pvalue_distributions_airway.png",
  p_pvalue,
  width = 14,
  height = 8,
  dpi = 300
)

10 DESeq2 independent filtering diagnostics

deseq_filter_diagnostics_df %>% knitr::kable()
filter upstream_retained_genes deseq_filter_threshold n_padj_na n_pvalue_na n_de
no_filter 63677 7.163845 46895 30208 4028
total_count_ge_10 22369 6.136364 5204 0 4030
cpm_ge_0_5_in_ge_2 16730 3.612137 0 0 4122
cpm_ge_1_in_ge_half 14332 14.594640 0 0 3994
edgeR_filterByExpr 16860 3.611121 0 0 4113
adaptive_limma_voom_selected 18086 5.441465 702 0 4062
p_deseq_na <- ggplot2::ggplot(
  deseq_filter_diagnostics_df,
  ggplot2::aes(x = filter, y = n_padj_na)
) +
  ggplot2::geom_col() +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "DESeq2 genes with NA adjusted p-values",
    subtitle = "NA padj values are one way to inspect the effect of DESeq2 independent filtering",
    x = "Upstream filtering strategy",
    y = "Number of genes with NA padj"
  ) +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))

p_deseq_na

ggplot2::ggsave(
  "figures/deseq2_na_padj_by_filter_airway.png",
  p_deseq_na,
  width = 10,
  height = 5,
  dpi = 300
)

11 Pairwise DEG overlap across methods

get_deg_set <- function(results_df, filter_name, method_name) {
  results_df %>%
    dplyr::filter(
      filter == filter_name,
      method == method_name,
      is_de
    ) %>%
    dplyr::pull(gene_id) %>%
    unique()
}

compute_pairwise_overlap <- function(results_df, filters, methods) {
  output <- list()
  counter <- 1

  for (filter_name in filters) {
    for (method_1 in methods) {
      for (method_2 in methods) {
        genes_1 <- get_deg_set(results_df, filter_name, method_1)
        genes_2 <- get_deg_set(results_df, filter_name, method_2)

        intersection_size <- length(intersect(genes_1, genes_2))
        union_size <- length(union(genes_1, genes_2))
        jaccard <- ifelse(union_size == 0, NA_real_, intersection_size / union_size)

        output[[counter]] <- tibble::tibble(
          filter = filter_name,
          method_1 = method_1,
          method_2 = method_2,
          n_method_1 = length(genes_1),
          n_method_2 = length(genes_2),
          intersection = intersection_size,
          union = union_size,
          jaccard = jaccard
        )
        counter <- counter + 1
      }
    }
  }

  dplyr::bind_rows(output)
}

pairwise_overlap <- compute_pairwise_overlap(
  de_results_long,
  filters = names(filter_masks),
  methods = methods
)

readr::write_csv(pairwise_overlap, "results/pairwise_overlap_airway.csv")

pairwise_overlap %>% knitr::kable()
filter method_1 method_2 n_method_1 n_method_2 intersection union jaccard
no_filter DESeq2 DESeq2 4028 4028 4028 4028 1.0000000
no_filter DESeq2 edgeR_QLF 4028 3704 3405 4327 0.7869193
no_filter DESeq2 limma_voom 4028 6723 4027 6724 0.5988995
no_filter edgeR_QLF DESeq2 3704 4028 3405 4327 0.7869193
no_filter edgeR_QLF edgeR_QLF 3704 3704 3704 3704 1.0000000
no_filter edgeR_QLF limma_voom 3704 6723 3699 6728 0.5497919
no_filter limma_voom DESeq2 6723 4028 4027 6724 0.5988995
no_filter limma_voom edgeR_QLF 6723 3704 3699 6728 0.5497919
no_filter limma_voom limma_voom 6723 6723 6723 6723 1.0000000
total_count_ge_10 DESeq2 DESeq2 4030 4030 4030 4030 1.0000000
total_count_ge_10 DESeq2 edgeR_QLF 4030 5088 3985 5133 0.7763491
total_count_ge_10 DESeq2 limma_voom 4030 5108 4017 5121 0.7844171
total_count_ge_10 edgeR_QLF DESeq2 5088 4030 3985 5133 0.7763491
total_count_ge_10 edgeR_QLF edgeR_QLF 5088 5088 5088 5088 1.0000000
total_count_ge_10 edgeR_QLF limma_voom 5088 5108 4920 5276 0.9325246
total_count_ge_10 limma_voom DESeq2 5108 4030 4017 5121 0.7844171
total_count_ge_10 limma_voom edgeR_QLF 5108 5088 4920 5276 0.9325246
total_count_ge_10 limma_voom limma_voom 5108 5108 5108 5108 1.0000000
cpm_ge_0_5_in_ge_2 DESeq2 DESeq2 4122 4122 4122 4122 1.0000000
cpm_ge_0_5_in_ge_2 DESeq2 edgeR_QLF 4122 5119 4074 5167 0.7884653
cpm_ge_0_5_in_ge_2 DESeq2 limma_voom 4122 5161 4102 5181 0.7917390
cpm_ge_0_5_in_ge_2 edgeR_QLF DESeq2 5119 4122 4074 5167 0.7884653
cpm_ge_0_5_in_ge_2 edgeR_QLF edgeR_QLF 5119 5119 5119 5119 1.0000000
cpm_ge_0_5_in_ge_2 edgeR_QLF limma_voom 5119 5161 5017 5263 0.9532586
cpm_ge_0_5_in_ge_2 limma_voom DESeq2 5161 4122 4102 5181 0.7917390
cpm_ge_0_5_in_ge_2 limma_voom edgeR_QLF 5161 5119 5017 5263 0.9532586
cpm_ge_0_5_in_ge_2 limma_voom limma_voom 5161 5161 5161 5161 1.0000000
cpm_ge_1_in_ge_half DESeq2 DESeq2 3994 3994 3994 3994 1.0000000
cpm_ge_1_in_ge_half DESeq2 edgeR_QLF 3994 4764 3920 4838 0.8102522
cpm_ge_1_in_ge_half DESeq2 limma_voom 3994 4847 3951 4890 0.8079755
cpm_ge_1_in_ge_half edgeR_QLF DESeq2 4764 3994 3920 4838 0.8102522
cpm_ge_1_in_ge_half edgeR_QLF edgeR_QLF 4764 4764 4764 4764 1.0000000
cpm_ge_1_in_ge_half edgeR_QLF limma_voom 4764 4847 4718 4893 0.9642346
cpm_ge_1_in_ge_half limma_voom DESeq2 4847 3994 3951 4890 0.8079755
cpm_ge_1_in_ge_half limma_voom edgeR_QLF 4847 4764 4718 4893 0.9642346
cpm_ge_1_in_ge_half limma_voom limma_voom 4847 4847 4847 4847 1.0000000
edgeR_filterByExpr DESeq2 DESeq2 4113 4113 4113 4113 1.0000000
edgeR_filterByExpr DESeq2 edgeR_QLF 4113 5127 4065 5175 0.7855072
edgeR_filterByExpr DESeq2 limma_voom 4113 5170 4093 5190 0.7886320
edgeR_filterByExpr edgeR_QLF DESeq2 5127 4113 4065 5175 0.7855072
edgeR_filterByExpr edgeR_QLF edgeR_QLF 5127 5127 5127 5127 1.0000000
edgeR_filterByExpr edgeR_QLF limma_voom 5127 5170 5022 5275 0.9520379
edgeR_filterByExpr limma_voom DESeq2 5170 4113 4093 5190 0.7886320
edgeR_filterByExpr limma_voom edgeR_QLF 5170 5127 5022 5275 0.9520379
edgeR_filterByExpr limma_voom limma_voom 5170 5170 5170 5170 1.0000000
adaptive_limma_voom_selected DESeq2 DESeq2 4062 4062 4062 4062 1.0000000
adaptive_limma_voom_selected DESeq2 edgeR_QLF 4062 5200 4020 5242 0.7668829
adaptive_limma_voom_selected DESeq2 limma_voom 4062 5211 4044 5229 0.7733792
adaptive_limma_voom_selected edgeR_QLF DESeq2 5200 4062 4020 5242 0.7668829
adaptive_limma_voom_selected edgeR_QLF edgeR_QLF 5200 5200 5200 5200 1.0000000
adaptive_limma_voom_selected edgeR_QLF limma_voom 5200 5211 5047 5364 0.9409023
adaptive_limma_voom_selected limma_voom DESeq2 5211 4062 4044 5229 0.7733792
adaptive_limma_voom_selected limma_voom edgeR_QLF 5211 5200 5047 5364 0.9409023
adaptive_limma_voom_selected limma_voom limma_voom 5211 5211 5211 5211 1.0000000
p_overlap <- ggplot2::ggplot(
  pairwise_overlap,
  ggplot2::aes(x = method_1, y = method_2, fill = jaccard)
) +
  ggplot2::geom_tile() +
  ggplot2::geom_text(ggplot2::aes(label = round(jaccard, 2)), size = 3) +
  ggplot2::facet_wrap(~ filter) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Pairwise Jaccard overlap of DE gene sets",
    x = "Method 1",
    y = "Method 2",
    fill = "Jaccard"
  ) +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))

p_overlap

ggplot2::ggsave(
  "figures/pairwise_jaccard_overlap_airway.png",
  p_overlap,
  width = 13,
  height = 8,
  dpi = 300
)

12 UpSet plots

for (filter_name in names(filter_masks)) {
  gene_sets <- list()

  for (method_name in methods) {
    gene_sets[[method_name]] <- get_deg_set(de_results_long, filter_name, method_name)
  }

  total_de_genes <- length(unique(unlist(gene_sets)))

  if (total_de_genes == 0) {
    message("Skipping UpSet plot for ", filter_name, ": no DE genes.")
    next
  }

  upset_input <- UpSetR::fromList(gene_sets)

  pdf(
    file = file.path("figures/upset", paste0("upset_", filter_name, "_airway.pdf")),
    width = 8,
    height = 6
  )

  UpSetR::upset(
    upset_input,
    nsets = length(methods),
    order.by = "freq",
    mainbar.y.label = "DE gene intersection size",
    sets.x.label = "DE genes per method"
  )

  dev.off()
}

13 Compare log2 fold-change estimates

lfc_wide <- de_results_long %>%
  dplyr::select(filter, method, gene_id, log2FoldChange) %>%
  tidyr::pivot_wider(
    names_from = method,
    values_from = log2FoldChange
  )

readr::write_csv(lfc_wide, "results/log2fc_wide_airway.csv")

p_lfc_deseq_edger <- ggplot2::ggplot(
  lfc_wide,
  ggplot2::aes(x = DESeq2, y = edgeR_QLF)
) +
  ggplot2::geom_point(alpha = 0.25, size = 0.7) +
  ggplot2::facet_wrap(~ filter) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Log2 fold-change agreement: DESeq2 vs edgeR-QLF",
    x = "DESeq2 log2FC",
    y = "edgeR-QLF log2FC"
  )

p_lfc_deseq_edger

ggplot2::ggsave(
  "figures/log2fc_deseq2_vs_edger_airway.png",
  p_lfc_deseq_edger,
  width = 13,
  height = 8,
  dpi = 300
)

p_lfc_deseq_limma <- ggplot2::ggplot(
  lfc_wide,
  ggplot2::aes(x = DESeq2, y = limma_voom)
) +
  ggplot2::geom_point(alpha = 0.25, size = 0.7) +
  ggplot2::facet_wrap(~ filter) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Log2 fold-change agreement: DESeq2 vs limma-voom",
    x = "DESeq2 log2FC",
    y = "limma-voom log2FC"
  )

p_lfc_deseq_limma

ggplot2::ggsave(
  "figures/log2fc_deseq2_vs_limma_airway.png",
  p_lfc_deseq_limma,
  width = 13,
  height = 8,
  dpi = 300
)

14 Optional GO enrichment consistency

This section is disabled by default. To run it, set this in the YAML header:

params:
  run_go: true
go_pkgs <- c("clusterProfiler", "org.Hs.eg.db")
missing_go_pkgs <- go_pkgs[
  !vapply(go_pkgs, requireNamespace, logical(1), quietly = TRUE)
]

if (length(missing_go_pkgs) > 0) {
  stop(
    "Missing GO enrichment packages: ", paste(missing_go_pkgs, collapse = ", "),
    "\nInstall with BiocManager::install(c('clusterProfiler', 'org.Hs.eg.db'))."
  )
}

suppressPackageStartupMessages({
  library(clusterProfiler)
  library(org.Hs.eg.db)
})

map_ensembl_to_entrez <- function(ensembl_ids) {
  mapped <- AnnotationDbi::mapIds(
    org.Hs.eg.db::org.Hs.eg.db,
    keys = unique(ensembl_ids),
    keytype = "ENSEMBL",
    column = "ENTREZID",
    multiVals = "first"
  )
  stats::na.omit(as.character(mapped))
}

run_go_for_deg_set <- function(de_ensembl_ids, universe_ensembl_ids) {
  entrez_de <- map_ensembl_to_entrez(de_ensembl_ids)
  entrez_universe <- map_ensembl_to_entrez(universe_ensembl_ids)

  if (length(entrez_de) < 10) {
    return(NULL)
  }

  clusterProfiler::enrichGO(
    gene = entrez_de,
    universe = entrez_universe,
    OrgDb = org.Hs.eg.db::org.Hs.eg.db,
    keyType = "ENTREZID",
    ont = "BP",
    pAdjustMethod = "BH",
    pvalueCutoff = 0.05,
    qvalueCutoff = 0.2,
    readable = TRUE
  )
}

go_results <- list()

for (filter_name in names(filter_masks)) {
  universe_ids <- rownames(counts_raw)[filter_masks[[filter_name]]]

  for (method_name in methods) {
    de_ids <- de_results_long %>%
      dplyr::filter(filter == filter_name, method == method_name, is_de) %>%
      dplyr::pull(ensembl_id) %>%
      unique()

    ego <- run_go_for_deg_set(de_ids, universe_ids)
    analysis_id <- paste(filter_name, method_name, sep = "__")
    go_results[[analysis_id]] <- ego

    if (!is.null(ego)) {
      ego_df <- as.data.frame(ego)
      readr::write_csv(
        ego_df,
        file.path("results", paste0("GO_", analysis_id, "_airway.csv"))
      )
    }
  }
}

# Summarize top GO terms.
go_top_terms <- list()

for (analysis_id in names(go_results)) {
  ego <- go_results[[analysis_id]]
  if (is.null(ego)) next

  ego_df <- as.data.frame(ego)
  if (nrow(ego_df) == 0) next

  pieces <- strsplit(analysis_id, "__")[[1]]

  go_top_terms[[analysis_id]] <- ego_df %>%
    dplyr::slice_head(n = 10) %>%
    dplyr::mutate(
      filter = pieces[1],
      method = pieces[2]
    ) %>%
    dplyr::select(filter, method, ID, Description, p.adjust, Count)
}

go_top_terms_df <- dplyr::bind_rows(go_top_terms)
readr::write_csv(go_top_terms_df, "results/top_go_terms_airway.csv")

go_top_terms_df %>% knitr::kable()

15 Interpretation guide

Use these outputs to answer the main project question:

  1. DEG count summary: Does one DE method become more or less sensitive under stricter filters?
  2. P-value distributions: Do some filters create more uniform, inflated, or enriched low-p-value distributions?
  3. Pairwise overlap / UpSet plots: Are method differences stable across filters, or does overlap change dramatically?
  4. DESeq2 independent filtering diagnostics: Does DESeq2 compensate for weak upstream filtering by producing more NA adjusted p-values?
  5. GO enrichment: Do filtering choices change the biological interpretation, even when the same DE method is used?

16 Session information

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] UpSetR_1.4.0                readr_2.2.0                
##  [3] tibble_3.3.1                tidyr_1.3.2                
##  [5] dplyr_1.2.1                 ggplot2_4.0.3              
##  [7] edgeR_4.6.3                 limma_3.64.3               
##  [9] DESeq2_1.48.2               airway_1.28.0              
## [11] SummarizedExperiment_1.38.1 Biobase_2.68.0             
## [13] GenomicRanges_1.60.0        GenomeInfoDb_1.44.3        
## [15] IRanges_2.42.0              S4Vectors_0.46.0           
## [17] BiocGenerics_0.54.1         generics_0.1.4             
## [19] MatrixGenerics_1.20.0       matrixStats_1.5.0          
## [21] BiocManager_1.30.27        
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6            xfun_0.57               bslib_0.10.0           
##  [4] lattice_0.22-9          tzdb_0.5.0              vctrs_0.7.3            
##  [7] tools_4.5.1             parallel_4.5.1          pkgconfig_2.0.3        
## [10] Matrix_1.7-5            RColorBrewer_1.1-3      S7_0.2.2               
## [13] lifecycle_1.0.5         GenomeInfoDbData_1.2.14 compiler_4.5.1         
## [16] farver_2.1.2            textshaping_1.0.5       statmod_1.5.1          
## [19] codetools_0.2-20        htmltools_0.5.9         sass_0.4.10            
## [22] yaml_2.3.12             pillar_1.11.1           crayon_1.5.3           
## [25] jquerylib_0.1.4         BiocParallel_1.42.2     DelayedArray_0.34.1    
## [28] cachem_1.1.0            abind_1.4-8             tidyselect_1.2.1       
## [31] locfit_1.5-9.12         digest_0.6.39           purrr_1.2.2            
## [34] labeling_0.4.3          fastmap_1.2.0           grid_4.5.1             
## [37] cli_3.6.6               SparseArray_1.8.1       magrittr_2.0.5         
## [40] S4Arrays_1.8.1          withr_3.0.2             scales_1.4.0           
## [43] UCSC.utils_1.4.0        bit64_4.8.0             rmarkdown_2.31         
## [46] XVector_0.48.0          httr_1.4.8              bit_4.6.0              
## [49] otel_0.2.0              gridExtra_2.3           ragg_1.5.2             
## [52] hms_1.1.4               evaluate_1.0.5          knitr_1.51             
## [55] rlang_1.2.0             Rcpp_1.1.1-1.1          glue_1.8.1             
## [58] vroom_1.7.1             rstudioapi_0.18.0       jsonlite_2.0.0         
## [61] plyr_1.8.9              R6_2.6.1                systemfonts_1.3.2