1 Goal
2 Package setup
- 2.1 Optional installation commands
3 Load airway data
- 3.1 Dataset summary
4 Helper functions
5 Filtering functions
6 Create filters
7 Run DESeq2, edgeR-QLF, and limma-voom under each filter
8 DEG count comparison
9 P-value distribution diagnostics
10 DESeq2 independent filtering diagnostics
11 Pairwise DEG overlap across methods
12 UpSet plots
13 Compare log2 fold-change estimates
14 Optional GO enrichment consistency
15 Interpretation guide
16 Session information

1 Goal

This R Markdown file implements the project using the airway bulk RNA-seq dataset.

The analysis tests how different upstream gene-filtering strategies interact with three differential expression methods:

DESeq2
edgeR quasi-likelihood F-test
limma-voom

The airway dataset contains human airway smooth muscle cell RNA-seq samples treated with dexamethasone versus untreated controls. The samples are paired by cell line, so the default model uses:

~ cell + condition

This keeps the two-condition comparison while accounting for cell-line effects.

2 Package setup

knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.width = 10,
  fig.height = 6
)

if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install("airway")
library(airway)
data(airway)

required_pkgs <- c(
  "airway",
  "SummarizedExperiment",
  "DESeq2",
  "edgeR",
  "limma",
  "ggplot2",
  "dplyr",
  "tidyr",
  "tibble",
  "readr",
  "UpSetR",
  "knitr"
)

missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]

if (length(missing_pkgs) > 0) {
  stop(
    "Missing packages: ", paste(missing_pkgs, collapse = ", "),
    "\nInstall CRAN packages with install.packages(). ",
    "Install Bioconductor packages with BiocManager::install()."
  )
}

suppressPackageStartupMessages({
  library(airway)
  library(SummarizedExperiment)
  library(DESeq2)
  library(edgeR)
  library(limma)
  library(ggplot2)
  library(dplyr)
  library(tidyr)
  library(tibble)
  library(readr)
  library(UpSetR)
})

alpha <- params$alpha
lfc_threshold <- params$lfc_threshold

# Output folders
dir.create("results", showWarnings = FALSE)
dir.create("figures", showWarnings = FALSE)
dir.create("figures/upset", recursive = TRUE, showWarnings = FALSE)
dir.create("figures/go", recursive = TRUE, showWarnings = FALSE)

2.1 Optional installation commands

Run this only if packages are missing. Restart R after installing packages.

install.packages(c("ggplot2", "dplyr", "tidyr", "tibble", "readr", "UpSetR", "knitr"))

if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install(c(
  "airway",
  "SummarizedExperiment",
  "DESeq2",
  "edgeR",
  "limma"
))

# Optional GO enrichment packages
BiocManager::install(c("clusterProfiler", "org.Hs.eg.db"))

3 Load airway data

data("airway", package = "airway")
airway_se <- airway

counts_raw <- as.matrix(SummarizedExperiment::assay(airway_se))
metadata <- as.data.frame(SummarizedExperiment::colData(airway_se))

# Keep clean sample IDs.
metadata$sample_id <- rownames(metadata)

# Airway uses dex == "trt" and "untrt".
metadata$condition <- factor(metadata$dex, levels = c("untrt", "trt"))
metadata$cell <- factor(metadata$cell)

# Make gene IDs cleaner for downstream annotation.
# Airway rownames are Ensembl IDs. This also handles cases where version suffixes exist.
gene_id_original <- rownames(counts_raw)
gene_id_clean <- sub("\\\\..*$", "", gene_id_original)
rownames(counts_raw) <- make.unique(gene_id_clean)

# Ensure sample order matches.
counts_raw <- counts_raw[, metadata$sample_id]

metadata %>%
  dplyr::select(sample_id, cell, dex, condition) %>%
  knitr::kable()

	sample_id	cell	dex	condition
SRR1039508	SRR1039508	N61311	untrt	untrt
SRR1039509	SRR1039509	N61311	trt	trt
SRR1039512	SRR1039512	N052611	untrt	untrt
SRR1039513	SRR1039513	N052611	trt	trt
SRR1039516	SRR1039516	N080611	untrt	untrt
SRR1039517	SRR1039517	N080611	trt	trt
SRR1039520	SRR1039520	N061011	untrt	untrt
SRR1039521	SRR1039521	N061011	trt	trt

3.1 Dataset summary

dataset_summary <- tibble::tibble(
  n_genes = nrow(counts_raw),
  n_samples = ncol(counts_raw),
  n_untrt = sum(metadata$condition == "untrt"),
  n_trt = sum(metadata$condition == "trt"),
  n_cells = length(unique(metadata$cell))
)

dataset_summary %>% knitr::kable()

n_genes	n_samples	n_untrt	n_trt	n_cells
63677	8	4	4	4

4 Helper functions

make_design_matrix <- function(meta, paired_design = TRUE) {
  if (paired_design && "cell" %in% colnames(meta)) {
    design <- stats::model.matrix(~ cell + condition, data = meta)
  } else {
    design <- stats::model.matrix(~ condition, data = meta)
  }
  design
}

make_deseq_formula <- function(paired_design = TRUE) {
  if (paired_design) {
    stats::as.formula("~ cell + condition")
  } else {
    stats::as.formula("~ condition")
  }
}

get_condition_coef <- function(design_matrix) {
  coef_name <- grep("^condition", colnames(design_matrix), value = TRUE)
  if (length(coef_name) != 1) {
    stop(
      "Could not identify exactly one condition coefficient. Found: ",
      paste(coef_name, collapse = ", ")
    )
  }
  coef_name
}

standardize_gene_ids <- function(gene_ids) {
  sub("\\\\..*$", "", gene_ids)
}

run_deseq2 <- function(counts_mat, meta, alpha = 0.05, lfc_threshold = 0,
                       paired_design = TRUE) {
  dds <- DESeq2::DESeqDataSetFromMatrix(
    countData = round(counts_mat),
    colData = meta,
    design = make_deseq_formula(paired_design)
  )

  dds <- DESeq2::DESeq(dds, quiet = TRUE)

  res <- DESeq2::results(
    dds,
    contrast = c("condition", "trt", "untrt"),
    alpha = alpha,
    independentFiltering = TRUE
  )

  res_df <- as.data.frame(res) %>%
    tibble::rownames_to_column("gene_id") %>%
    dplyr::transmute(
      gene_id = gene_id,
      ensembl_id = standardize_gene_ids(gene_id),
      log2FoldChange = log2FoldChange,
      statistic = stat,
      pvalue = pvalue,
      padj = padj,
      is_de = !is.na(padj) &
        padj < alpha &
        abs(log2FoldChange) >= lfc_threshold
    )

  deseq_metadata <- metadata(res)
  filter_threshold <- NA_real_
  if (!is.null(deseq_metadata$filterThreshold)) {
    filter_threshold <- as.numeric(deseq_metadata$filterThreshold)
  }

  attr(res_df, "deseq_filter_threshold") <- filter_threshold
  attr(res_df, "deseq_filter_num_rej") <- deseq_metadata$filterNumRej
  attr(res_df, "deseq_alpha") <- deseq_metadata$alpha

  res_df
}

run_edger_qlf <- function(counts_mat, meta, alpha = 0.05, lfc_threshold = 0,
                          paired_design = TRUE) {
  design <- make_design_matrix(meta, paired_design)
  condition_coef <- get_condition_coef(design)

  y <- edgeR::DGEList(counts = counts_mat)
  y <- edgeR::calcNormFactors(y)
  y <- edgeR::estimateDisp(y, design)

  fit <- edgeR::glmQLFit(y, design)
  qlf <- edgeR::glmQLFTest(fit, coef = condition_coef)

  tab <- edgeR::topTags(qlf, n = Inf, sort.by = "none")$table

  tab %>%
    tibble::rownames_to_column("gene_id") %>%
    dplyr::transmute(
      gene_id = gene_id,
      ensembl_id = standardize_gene_ids(gene_id),
      log2FoldChange = logFC,
      statistic = F,
      pvalue = PValue,
      padj = FDR,
      is_de = !is.na(padj) &
        padj < alpha &
        abs(log2FoldChange) >= lfc_threshold
    )
}

run_limma_voom <- function(counts_mat, meta, alpha = 0.05, lfc_threshold = 0,
                           paired_design = TRUE) {
  design <- make_design_matrix(meta, paired_design)
  condition_coef <- get_condition_coef(design)

  y <- edgeR::DGEList(counts = counts_mat)
  y <- edgeR::calcNormFactors(y)

  v <- limma::voom(y, design, plot = FALSE)
  fit <- limma::lmFit(v, design)
  fit <- limma::eBayes(fit)

  tab <- limma::topTable(
    fit,
    coef = condition_coef,
    number = Inf,
    sort.by = "none"
  )

  tab %>%
    tibble::rownames_to_column("gene_id") %>%
    dplyr::transmute(
      gene_id = gene_id,
      ensembl_id = standardize_gene_ids(gene_id),
      log2FoldChange = logFC,
      statistic = t,
      pvalue = P.Value,
      padj = adj.P.Val,
      is_de = !is.na(padj) &
        padj < alpha &
        abs(log2FoldChange) >= lfc_threshold
    )
}

run_de_method <- function(method, counts_mat, meta, alpha = 0.05,
                          lfc_threshold = 0, paired_design = TRUE) {
  if (method == "DESeq2") {
    return(run_deseq2(counts_mat, meta, alpha, lfc_threshold, paired_design))
  }

  if (method == "edgeR_QLF") {
    return(run_edger_qlf(counts_mat, meta, alpha, lfc_threshold, paired_design))
  }

  if (method == "limma_voom") {
    return(run_limma_voom(counts_mat, meta, alpha, lfc_threshold, paired_design))
  }

  stop("Unknown method: ", method)
}

5 Filtering functions

filter_no_filter <- function(counts_mat) {
  rep(TRUE, nrow(counts_mat))
}

filter_total_count_ge_10 <- function(counts_mat) {
  rowSums(counts_mat) >= 10
}

filter_cpm_ge_0_5_in_ge_2 <- function(counts_mat) {
  cpm_mat <- edgeR::cpm(counts_mat)
  rowSums(cpm_mat >= 0.5) >= 2
}

filter_cpm_ge_1_in_ge_half <- function(counts_mat) {
  cpm_mat <- edgeR::cpm(counts_mat)
  min_samples <- ceiling(ncol(counts_mat) / 2)
  rowSums(cpm_mat >= 1) >= min_samples
}

filter_edgeR_filterByExpr <- function(counts_mat, meta, paired_design = TRUE) {
  y <- edgeR::DGEList(counts = counts_mat)
  design <- make_design_matrix(meta, paired_design)
  edgeR::filterByExpr(y, design = design)
}

make_adaptive_candidate_filters <- function(counts_mat, meta) {
  cpm_mat <- edgeR::cpm(counts_mat)
  expr_cpm1 <- cpm_mat >= 1
  group <- meta$condition

  min_group_1 <- ceiling(sum(group == "untrt") / 2)
  min_group_2 <- ceiling(sum(group == "trt") / 2)

  group_presence <-
    rowSums(expr_cpm1[, group == "untrt", drop = FALSE]) >= min_group_1 |
    rowSums(expr_cpm1[, group == "trt", drop = FALSE]) >= min_group_2

  list(
    mean_count_ge_1 = rowMeans(counts_mat) >= 1,
    mean_count_ge_5 = rowMeans(counts_mat) >= 5,
    max_count_ge_10 = apply(counts_mat, 1, max) >= 10,
    total_count_ge_10 = rowSums(counts_mat) >= 10,
    cpm_0_5_in_2 = rowSums(cpm_mat >= 0.5) >= 2,
    cpm_1_in_half = rowSums(cpm_mat >= 1) >= ceiling(ncol(counts_mat) / 2),
    zero_prop_le_0_75 = rowMeans(counts_mat == 0) <= 0.75,
    group_presence_cpm1 = group_presence
  )
}

select_adaptive_filter <- function(counts_mat, meta, alpha = 0.05,
                                   lfc_threshold = 0,
                                   paired_design = TRUE,
                                   minimum_genes = 100,
                                   fallback_name = "cpm_0_5_in_2") {
  candidate_filters <- make_adaptive_candidate_filters(counts_mat, meta)

  scores <- tibble::tibble(
    candidate = names(candidate_filters),
    retained_genes = NA_integer_,
    n_rejections = NA_integer_,
    status = NA_character_
  )

  for (i in seq_along(candidate_filters)) {
    candidate_name <- names(candidate_filters)[i]
    mask <- candidate_filters[[i]]
    retained <- sum(mask)
    scores$retained_genes[i] <- retained

    if (retained < minimum_genes) {
      scores$n_rejections[i] <- NA_integer_
      scores$status[i] <- "skipped_too_few_genes"
      next
    }

    candidate_result <- tryCatch(
      {
        run_limma_voom(
          counts_mat[mask, , drop = FALSE],
          meta,
          alpha = alpha,
          lfc_threshold = lfc_threshold,
          paired_design = paired_design
        )
      },
      error = function(e) {
        attr(e, "message") <- conditionMessage(e)
        e
      }
    )

    if (inherits(candidate_result, "error")) {
      scores$n_rejections[i] <- NA_integer_
      scores$status[i] <- paste0("failed: ", conditionMessage(candidate_result))
    } else {
      scores$n_rejections[i] <- sum(candidate_result$is_de, na.rm = TRUE)
      scores$status[i] <- "ok"
    }
  }

  ok_scores <- scores %>%
    dplyr::filter(status == "ok", !is.na(n_rejections))

  if (nrow(ok_scores) == 0 || max(ok_scores$n_rejections, na.rm = TRUE) == 0) {
    selected_name <- fallback_name
  } else {
    selected_name <- ok_scores %>%
      dplyr::arrange(dplyr::desc(n_rejections), retained_genes) %>%
      dplyr::slice(1) %>%
      dplyr::pull(candidate)
  }

  list(
    selected_name = selected_name,
    selected_mask = candidate_filters[[selected_name]],
    scores = scores
  )
}

6 Create filters

adaptive_selection <- select_adaptive_filter(
  counts_raw,
  metadata,
  alpha = alpha,
  lfc_threshold = lfc_threshold,
  paired_design = params$paired_design
)

filter_masks <- list(
  no_filter = filter_no_filter(counts_raw),
  total_count_ge_10 = filter_total_count_ge_10(counts_raw),
  cpm_ge_0_5_in_ge_2 = filter_cpm_ge_0_5_in_ge_2(counts_raw),
  cpm_ge_1_in_ge_half = filter_cpm_ge_1_in_ge_half(counts_raw),
  edgeR_filterByExpr = filter_edgeR_filterByExpr(
    counts_raw,
    metadata,
    paired_design = params$paired_design
  ),
  adaptive_limma_voom_selected = adaptive_selection$selected_mask
)

filter_summary <- tibble::tibble(
  filter = names(filter_masks),
  retained_genes = vapply(filter_masks, sum, integer(1)),
  removed_genes = nrow(counts_raw) - retained_genes,
  retained_percent = round(100 * retained_genes / nrow(counts_raw), 2)
)

filter_summary %>% knitr::kable()

filter	retained_genes	removed_genes	retained_percent
no_filter	63677	0	100.00
total_count_ge_10	22369	41308	35.13
cpm_ge_0_5_in_ge_2	16730	46947	26.27
cpm_ge_1_in_ge_half	14332	49345	22.51
edgeR_filterByExpr	16860	46817	26.48
adaptive_limma_voom_selected	18086	45591	28.40

adaptive_selection$scores %>% knitr::kable()

candidate	retained_genes	n_rejections	status
mean_count_ge_1	23171	5083	ok
mean_count_ge_5	17852	5152	ok
max_count_ge_10	18086	5211	ok
total_count_ge_10	22369	5108	ok
cpm_0_5_in_2	16730	5161	ok
cpm_1_in_half	14332	4847	ok
zero_prop_le_0_75	28877	4738	ok
group_presence_cpm1	14827	5001	ok

cat("Adaptive filter selected:", adaptive_selection$selected_name, "\n")

## Adaptive filter selected: max_count_ge_10

readr::write_csv(filter_summary, "results/filter_summary_airway.csv")
readr::write_csv(adaptive_selection$scores, "results/adaptive_filter_scores_airway.csv")

7 Run DESeq2, edgeR-QLF, and limma-voom under each filter

methods <- c("DESeq2", "edgeR_QLF", "limma_voom")
all_results <- list()
deseq_filter_diagnostics <- list()

for (filter_name in names(filter_masks)) {
  mask <- filter_masks[[filter_name]]
  counts_filtered <- counts_raw[mask, , drop = FALSE]

  for (method_name in methods) {
    message("Running ", method_name, " under filter: ", filter_name)

    result <- run_de_method(
      method = method_name,
      counts_mat = counts_filtered,
      meta = metadata,
      alpha = alpha,
      lfc_threshold = lfc_threshold,
      paired_design = params$paired_design
    )

    analysis_id <- paste(filter_name, method_name, sep = "__")
    all_results[[analysis_id]] <- result

    if (method_name == "DESeq2") {
      deseq_filter_diagnostics[[filter_name]] <- tibble::tibble(
        filter = filter_name,
        upstream_retained_genes = nrow(counts_filtered),
        deseq_filter_threshold = attr(result, "deseq_filter_threshold"),
        n_padj_na = sum(is.na(result$padj)),
        n_pvalue_na = sum(is.na(result$pvalue)),
        n_de = sum(result$is_de, na.rm = TRUE)
      )
    }
  }
}

de_results_long <- dplyr::bind_rows(all_results, .id = "analysis_id") %>%
  tidyr::separate(
    analysis_id,
    into = c("filter", "method"),
    sep = "__",
    remove = FALSE
  ) %>%
  dplyr::mutate(
    filter = factor(filter, levels = names(filter_masks)),
    method = factor(method, levels = methods)
  )

deseq_filter_diagnostics_df <- dplyr::bind_rows(deseq_filter_diagnostics)

readr::write_csv(de_results_long, "results/all_de_results_airway.csv")
readr::write_csv(deseq_filter_diagnostics_df, "results/deseq_independent_filtering_diagnostics_airway.csv")

8 DEG count comparison

deg_summary <- de_results_long %>%
  dplyr::group_by(filter, method) %>%
  dplyr::summarise(
    n_tested = dplyr::n(),
    n_pvalue_na = sum(is.na(pvalue)),
    n_padj_na = sum(is.na(padj)),
    n_de = sum(is_de, na.rm = TRUE),
    .groups = "drop"
  )

readr::write_csv(deg_summary, "results/deg_count_summary_airway.csv")

deg_summary %>% knitr::kable()

filter	method	n_tested	n_pvalue_na	n_padj_na	n_de
no_filter	DESeq2	63677	30208	46895	4028
no_filter	edgeR_QLF	63677	0	0	3704
no_filter	limma_voom	63677	0	0	6723
total_count_ge_10	DESeq2	22369	0	5204	4030
total_count_ge_10	edgeR_QLF	22369	0	0	5088
total_count_ge_10	limma_voom	22369	0	0	5108
cpm_ge_0_5_in_ge_2	DESeq2	16730	0	0	4122
cpm_ge_0_5_in_ge_2	edgeR_QLF	16730	0	0	5119
cpm_ge_0_5_in_ge_2	limma_voom	16730	0	0	5161
cpm_ge_1_in_ge_half	DESeq2	14332	0	0	3994
cpm_ge_1_in_ge_half	edgeR_QLF	14332	0	0	4764
cpm_ge_1_in_ge_half	limma_voom	14332	0	0	4847
edgeR_filterByExpr	DESeq2	16860	0	0	4113
edgeR_filterByExpr	edgeR_QLF	16860	0	0	5127
edgeR_filterByExpr	limma_voom	16860	0	0	5170
adaptive_limma_voom_selected	DESeq2	18086	0	702	4062
adaptive_limma_voom_selected	edgeR_QLF	18086	0	0	5200
adaptive_limma_voom_selected	limma_voom	18086	0	0	5211

p_deg_counts <- ggplot2::ggplot(
  deg_summary,
  ggplot2::aes(x = filter, y = n_de, fill = method)
) +
  ggplot2::geom_col(position = "dodge") +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Number of DE genes by filtering strategy and DE method",
    x = "Filtering strategy",
    y = paste0("Number of DE genes, FDR < ", alpha),
    fill = "DE method"
  ) +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))

p_deg_counts

ggplot2::ggsave(
  "figures/deg_counts_by_filter_and_method_airway.png",
  p_deg_counts,
  width = 11,
  height = 6,
  dpi = 300
)

9 P-value distribution diagnostics

p_pvalue <- de_results_long %>%
  dplyr::filter(!is.na(pvalue)) %>%
  ggplot2::ggplot(ggplot2::aes(x = pvalue)) +
  ggplot2::geom_histogram(bins = 40, boundary = 0) +
  ggplot2::facet_grid(method ~ filter) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "P-value distributions by filtering strategy and DE method",
    x = "Raw p-value",
    y = "Number of genes"
  ) +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))

p_pvalue

ggplot2::ggsave(
  "figures/pvalue_distributions_airway.png",
  p_pvalue,
  width = 14,
  height = 8,
  dpi = 300
)

10 DESeq2 independent filtering diagnostics

deseq_filter_diagnostics_df %>% knitr::kable()

filter	upstream_retained_genes	deseq_filter_threshold	n_padj_na	n_pvalue_na	n_de
no_filter	63677	7.163845	46895	30208	4028
total_count_ge_10	22369	6.136364	5204	0	4030
cpm_ge_0_5_in_ge_2	16730	3.612137	0	0	4122
cpm_ge_1_in_ge_half	14332	14.594640	0	0	3994
edgeR_filterByExpr	16860	3.611121	0	0	4113
adaptive_limma_voom_selected	18086	5.441465	702	0	4062

p_deseq_na <- ggplot2::ggplot(
  deseq_filter_diagnostics_df,
  ggplot2::aes(x = filter, y = n_padj_na)
) +
  ggplot2::geom_col() +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "DESeq2 genes with NA adjusted p-values",
    subtitle = "NA padj values are one way to inspect the effect of DESeq2 independent filtering",
    x = "Upstream filtering strategy",
    y = "Number of genes with NA padj"
  ) +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))

p_deseq_na

ggplot2::ggsave(
  "figures/deseq2_na_padj_by_filter_airway.png",
  p_deseq_na,
  width = 10,
  height = 5,
  dpi = 300
)

11 Pairwise DEG overlap across methods

get_deg_set <- function(results_df, filter_name, method_name) {
  results_df %>%
    dplyr::filter(
      filter == filter_name,
      method == method_name,
      is_de
    ) %>%
    dplyr::pull(gene_id) %>%
    unique()
}

compute_pairwise_overlap <- function(results_df, filters, methods) {
  output <- list()
  counter <- 1

  for (filter_name in filters) {
    for (method_1 in methods) {
      for (method_2 in methods) {
        genes_1 <- get_deg_set(results_df, filter_name, method_1)
        genes_2 <- get_deg_set(results_df, filter_name, method_2)

        intersection_size <- length(intersect(genes_1, genes_2))
        union_size <- length(union(genes_1, genes_2))
        jaccard <- ifelse(union_size == 0, NA_real_, intersection_size / union_size)

        output[[counter]] <- tibble::tibble(
          filter = filter_name,
          method_1 = method_1,
          method_2 = method_2,
          n_method_1 = length(genes_1),
          n_method_2 = length(genes_2),
          intersection = intersection_size,
          union = union_size,
          jaccard = jaccard
        )
        counter <- counter + 1
      }
    }
  }

  dplyr::bind_rows(output)
}

pairwise_overlap <- compute_pairwise_overlap(
  de_results_long,
  filters = names(filter_masks),
  methods = methods
)

readr::write_csv(pairwise_overlap, "results/pairwise_overlap_airway.csv")

pairwise_overlap %>% knitr::kable()

filter	method_1	method_2	n_method_1	n_method_2	intersection	union	jaccard
no_filter	DESeq2	DESeq2	4028	4028	4028	4028	1.0000000
no_filter	DESeq2	edgeR_QLF	4028	3704	3405	4327	0.7869193
no_filter	DESeq2	limma_voom	4028	6723	4027	6724	0.5988995
no_filter	edgeR_QLF	DESeq2	3704	4028	3405	4327	0.7869193
no_filter	edgeR_QLF	edgeR_QLF	3704	3704	3704	3704	1.0000000
no_filter	edgeR_QLF	limma_voom	3704	6723	3699	6728	0.5497919
no_filter	limma_voom	DESeq2	6723	4028	4027	6724	0.5988995
no_filter	limma_voom	edgeR_QLF	6723	3704	3699	6728	0.5497919
no_filter	limma_voom	limma_voom	6723	6723	6723	6723	1.0000000
total_count_ge_10	DESeq2	DESeq2	4030	4030	4030	4030	1.0000000
total_count_ge_10	DESeq2	edgeR_QLF	4030	5088	3985	5133	0.7763491
total_count_ge_10	DESeq2	limma_voom	4030	5108	4017	5121	0.7844171
total_count_ge_10	edgeR_QLF	DESeq2	5088	4030	3985	5133	0.7763491
total_count_ge_10	edgeR_QLF	edgeR_QLF	5088	5088	5088	5088	1.0000000
total_count_ge_10	edgeR_QLF	limma_voom	5088	5108	4920	5276	0.9325246
total_count_ge_10	limma_voom	DESeq2	5108	4030	4017	5121	0.7844171
total_count_ge_10	limma_voom	edgeR_QLF	5108	5088	4920	5276	0.9325246
total_count_ge_10	limma_voom	limma_voom	5108	5108	5108	5108	1.0000000
cpm_ge_0_5_in_ge_2	DESeq2	DESeq2	4122	4122	4122	4122	1.0000000
cpm_ge_0_5_in_ge_2	DESeq2	edgeR_QLF	4122	5119	4074	5167	0.7884653
cpm_ge_0_5_in_ge_2	DESeq2	limma_voom	4122	5161	4102	5181	0.7917390
cpm_ge_0_5_in_ge_2	edgeR_QLF	DESeq2	5119	4122	4074	5167	0.7884653
cpm_ge_0_5_in_ge_2	edgeR_QLF	edgeR_QLF	5119	5119	5119	5119	1.0000000
cpm_ge_0_5_in_ge_2	edgeR_QLF	limma_voom	5119	5161	5017	5263	0.9532586
cpm_ge_0_5_in_ge_2	limma_voom	DESeq2	5161	4122	4102	5181	0.7917390
cpm_ge_0_5_in_ge_2	limma_voom	edgeR_QLF	5161	5119	5017	5263	0.9532586
cpm_ge_0_5_in_ge_2	limma_voom	limma_voom	5161	5161	5161	5161	1.0000000
cpm_ge_1_in_ge_half	DESeq2	DESeq2	3994	3994	3994	3994	1.0000000
cpm_ge_1_in_ge_half	DESeq2	edgeR_QLF	3994	4764	3920	4838	0.8102522
cpm_ge_1_in_ge_half	DESeq2	limma_voom	3994	4847	3951	4890	0.8079755
cpm_ge_1_in_ge_half	edgeR_QLF	DESeq2	4764	3994	3920	4838	0.8102522
cpm_ge_1_in_ge_half	edgeR_QLF	edgeR_QLF	4764	4764	4764	4764	1.0000000
cpm_ge_1_in_ge_half	edgeR_QLF	limma_voom	4764	4847	4718	4893	0.9642346
cpm_ge_1_in_ge_half	limma_voom	DESeq2	4847	3994	3951	4890	0.8079755
cpm_ge_1_in_ge_half	limma_voom	edgeR_QLF	4847	4764	4718	4893	0.9642346
cpm_ge_1_in_ge_half	limma_voom	limma_voom	4847	4847	4847	4847	1.0000000
edgeR_filterByExpr	DESeq2	DESeq2	4113	4113	4113	4113	1.0000000
edgeR_filterByExpr	DESeq2	edgeR_QLF	4113	5127	4065	5175	0.7855072
edgeR_filterByExpr	DESeq2	limma_voom	4113	5170	4093	5190	0.7886320
edgeR_filterByExpr	edgeR_QLF	DESeq2	5127	4113	4065	5175	0.7855072
edgeR_filterByExpr	edgeR_QLF	edgeR_QLF	5127	5127	5127	5127	1.0000000
edgeR_filterByExpr	edgeR_QLF	limma_voom	5127	5170	5022	5275	0.9520379
edgeR_filterByExpr	limma_voom	DESeq2	5170	4113	4093	5190	0.7886320
edgeR_filterByExpr	limma_voom	edgeR_QLF	5170	5127	5022	5275	0.9520379
edgeR_filterByExpr	limma_voom	limma_voom	5170	5170	5170	5170	1.0000000
adaptive_limma_voom_selected	DESeq2	DESeq2	4062	4062	4062	4062	1.0000000
adaptive_limma_voom_selected	DESeq2	edgeR_QLF	4062	5200	4020	5242	0.7668829
adaptive_limma_voom_selected	DESeq2	limma_voom	4062	5211	4044	5229	0.7733792
adaptive_limma_voom_selected	edgeR_QLF	DESeq2	5200	4062	4020	5242	0.7668829
adaptive_limma_voom_selected	edgeR_QLF	edgeR_QLF	5200	5200	5200	5200	1.0000000
adaptive_limma_voom_selected	edgeR_QLF	limma_voom	5200	5211	5047	5364	0.9409023
adaptive_limma_voom_selected	limma_voom	DESeq2	5211	4062	4044	5229	0.7733792
adaptive_limma_voom_selected	limma_voom	edgeR_QLF	5211	5200	5047	5364	0.9409023
adaptive_limma_voom_selected	limma_voom	limma_voom	5211	5211	5211	5211	1.0000000

p_overlap <- ggplot2::ggplot(
  pairwise_overlap,
  ggplot2::aes(x = method_1, y = method_2, fill = jaccard)
) +
  ggplot2::geom_tile() +
  ggplot2::geom_text(ggplot2::aes(label = round(jaccard, 2)), size = 3) +
  ggplot2::facet_wrap(~ filter) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Pairwise Jaccard overlap of DE gene sets",
    x = "Method 1",
    y = "Method 2",
    fill = "Jaccard"
  ) +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))

p_overlap

ggplot2::ggsave(
  "figures/pairwise_jaccard_overlap_airway.png",
  p_overlap,
  width = 13,
  height = 8,
  dpi = 300
)

12 UpSet plots

for (filter_name in names(filter_masks)) {
  gene_sets <- list()

  for (method_name in methods) {
    gene_sets[[method_name]] <- get_deg_set(de_results_long, filter_name, method_name)
  }

  total_de_genes <- length(unique(unlist(gene_sets)))

  if (total_de_genes == 0) {
    message("Skipping UpSet plot for ", filter_name, ": no DE genes.")
    next
  }

  upset_input <- UpSetR::fromList(gene_sets)

  pdf(
    file = file.path("figures/upset", paste0("upset_", filter_name, "_airway.pdf")),
    width = 8,
    height = 6
  )

  UpSetR::upset(
    upset_input,
    nsets = length(methods),
    order.by = "freq",
    mainbar.y.label = "DE gene intersection size",
    sets.x.label = "DE genes per method"
  )

  dev.off()
}

13 Compare log2 fold-change estimates

lfc_wide <- de_results_long %>%
  dplyr::select(filter, method, gene_id, log2FoldChange) %>%
  tidyr::pivot_wider(
    names_from = method,
    values_from = log2FoldChange
  )

readr::write_csv(lfc_wide, "results/log2fc_wide_airway.csv")

p_lfc_deseq_edger <- ggplot2::ggplot(
  lfc_wide,
  ggplot2::aes(x = DESeq2, y = edgeR_QLF)
) +
  ggplot2::geom_point(alpha = 0.25, size = 0.7) +
  ggplot2::facet_wrap(~ filter) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Log2 fold-change agreement: DESeq2 vs edgeR-QLF",
    x = "DESeq2 log2FC",
    y = "edgeR-QLF log2FC"
  )

p_lfc_deseq_edger

ggplot2::ggsave(
  "figures/log2fc_deseq2_vs_edger_airway.png",
  p_lfc_deseq_edger,
  width = 13,
  height = 8,
  dpi = 300
)

p_lfc_deseq_limma <- ggplot2::ggplot(
  lfc_wide,
  ggplot2::aes(x = DESeq2, y = limma_voom)
) +
  ggplot2::geom_point(alpha = 0.25, size = 0.7) +
  ggplot2::facet_wrap(~ filter) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Log2 fold-change agreement: DESeq2 vs limma-voom",
    x = "DESeq2 log2FC",
    y = "limma-voom log2FC"
  )

p_lfc_deseq_limma

ggplot2::ggsave(
  "figures/log2fc_deseq2_vs_limma_airway.png",
  p_lfc_deseq_limma,
  width = 13,
  height = 8,
  dpi = 300
)

14 Optional GO enrichment consistency

This section is disabled by default. To run it, set this in the YAML header:

params:
  run_go: true

go_pkgs <- c("clusterProfiler", "org.Hs.eg.db")
missing_go_pkgs <- go_pkgs[
  !vapply(go_pkgs, requireNamespace, logical(1), quietly = TRUE)
]

if (length(missing_go_pkgs) > 0) {
  stop(
    "Missing GO enrichment packages: ", paste(missing_go_pkgs, collapse = ", "),
    "\nInstall with BiocManager::install(c('clusterProfiler', 'org.Hs.eg.db'))."
  )
}

suppressPackageStartupMessages({
  library(clusterProfiler)
  library(org.Hs.eg.db)
})

map_ensembl_to_entrez <- function(ensembl_ids) {
  mapped <- AnnotationDbi::mapIds(
    org.Hs.eg.db::org.Hs.eg.db,
    keys = unique(ensembl_ids),
    keytype = "ENSEMBL",
    column = "ENTREZID",
    multiVals = "first"
  )
  stats::na.omit(as.character(mapped))
}

run_go_for_deg_set <- function(de_ensembl_ids, universe_ensembl_ids) {
  entrez_de <- map_ensembl_to_entrez(de_ensembl_ids)
  entrez_universe <- map_ensembl_to_entrez(universe_ensembl_ids)

  if (length(entrez_de) < 10) {
    return(NULL)
  }

  clusterProfiler::enrichGO(
    gene = entrez_de,
    universe = entrez_universe,
    OrgDb = org.Hs.eg.db::org.Hs.eg.db,
    keyType = "ENTREZID",
    ont = "BP",
    pAdjustMethod = "BH",
    pvalueCutoff = 0.05,
    qvalueCutoff = 0.2,
    readable = TRUE
  )
}

go_results <- list()

for (filter_name in names(filter_masks)) {
  universe_ids <- rownames(counts_raw)[filter_masks[[filter_name]]]

  for (method_name in methods) {
    de_ids <- de_results_long %>%
      dplyr::filter(filter == filter_name, method == method_name, is_de) %>%
      dplyr::pull(ensembl_id) %>%
      unique()

    ego <- run_go_for_deg_set(de_ids, universe_ids)
    analysis_id <- paste(filter_name, method_name, sep = "__")
    go_results[[analysis_id]] <- ego

    if (!is.null(ego)) {
      ego_df <- as.data.frame(ego)
      readr::write_csv(
        ego_df,
        file.path("results", paste0("GO_", analysis_id, "_airway.csv"))
      )
    }
  }
}

# Summarize top GO terms.
go_top_terms <- list()

for (analysis_id in names(go_results)) {
  ego <- go_results[[analysis_id]]
  if (is.null(ego)) next

  ego_df <- as.data.frame(ego)
  if (nrow(ego_df) == 0) next

  pieces <- strsplit(analysis_id, "__")[[1]]

  go_top_terms[[analysis_id]] <- ego_df %>%
    dplyr::slice_head(n = 10) %>%
    dplyr::mutate(
      filter = pieces[1],
      method = pieces[2]
    ) %>%
    dplyr::select(filter, method, ID, Description, p.adjust, Count)
}

go_top_terms_df <- dplyr::bind_rows(go_top_terms)
readr::write_csv(go_top_terms_df, "results/top_go_terms_airway.csv")

go_top_terms_df %>% knitr::kable()

15 Interpretation guide

Use these outputs to answer the main project question:

DEG count summary: Does one DE method become more or less sensitive under stricter filters?
P-value distributions: Do some filters create more uniform, inflated, or enriched low-p-value distributions?
Pairwise overlap / UpSet plots: Are method differences stable across filters, or does overlap change dramatically?
DESeq2 independent filtering diagnostics: Does DESeq2 compensate for weak upstream filtering by producing more NA adjusted p-values?
GO enrichment: Do filtering choices change the biological interpretation, even when the same DE method is used?

16 Session information

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] UpSetR_1.4.0                readr_2.2.0                
##  [3] tibble_3.3.1                tidyr_1.3.2                
##  [5] dplyr_1.2.1                 ggplot2_4.0.3              
##  [7] edgeR_4.6.3                 limma_3.64.3               
##  [9] DESeq2_1.48.2               airway_1.28.0              
## [11] SummarizedExperiment_1.38.1 Biobase_2.68.0             
## [13] GenomicRanges_1.60.0        GenomeInfoDb_1.44.3        
## [15] IRanges_2.42.0              S4Vectors_0.46.0           
## [17] BiocGenerics_0.54.1         generics_0.1.4             
## [19] MatrixGenerics_1.20.0       matrixStats_1.5.0          
## [21] BiocManager_1.30.27        
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6            xfun_0.57               bslib_0.10.0           
##  [4] lattice_0.22-9          tzdb_0.5.0              vctrs_0.7.3            
##  [7] tools_4.5.1             parallel_4.5.1          pkgconfig_2.0.3        
## [10] Matrix_1.7-5            RColorBrewer_1.1-3      S7_0.2.2               
## [13] lifecycle_1.0.5         GenomeInfoDbData_1.2.14 compiler_4.5.1         
## [16] farver_2.1.2            textshaping_1.0.5       statmod_1.5.1          
## [19] codetools_0.2-20        htmltools_0.5.9         sass_0.4.10            
## [22] yaml_2.3.12             pillar_1.11.1           crayon_1.5.3           
## [25] jquerylib_0.1.4         BiocParallel_1.42.2     DelayedArray_0.34.1    
## [28] cachem_1.1.0            abind_1.4-8             tidyselect_1.2.1       
## [31] locfit_1.5-9.12         digest_0.6.39           purrr_1.2.2            
## [34] labeling_0.4.3          fastmap_1.2.0           grid_4.5.1             
## [37] cli_3.6.6               SparseArray_1.8.1       magrittr_2.0.5         
## [40] S4Arrays_1.8.1          withr_3.0.2             scales_1.4.0           
## [43] UCSC.utils_1.4.0        bit64_4.8.0             rmarkdown_2.31         
## [46] XVector_0.48.0          httr_1.4.8              bit_4.6.0              
## [49] otel_0.2.0              gridExtra_2.3           ragg_1.5.2             
## [52] hms_1.1.4               evaluate_1.0.5          knitr_1.51             
## [55] rlang_1.2.0             Rcpp_1.1.1-1.1          glue_1.8.1             
## [58] vroom_1.7.1             rstudioapi_0.18.0       jsonlite_2.0.0         
## [61] plyr_1.8.9              R6_2.6.1                systemfonts_1.3.2

Airway RNA-seq: Filtering Strategy x Differential Expression Method

Generated analysis template

2026-04-29