1 Study aim

This analysis evaluates whether differential expression results depend only on the DE method, only on the upstream gene-filtering rule, or on the interaction between both choices.

The workflow crosses six filtering strategies with three RNA-seq DE pipelines:

  1. no upstream filter,
  2. total count >= 10,
  3. CPM >= 0.5 in at least 2 samples,
  4. CPM >= 1 in at least half of samples,
  5. edgeR filterByExpr(),
  6. a Zehetmayer-style adaptive filter that scores candidate filters by the number of detected DE genes.

The three DE methods are:

  1. DESeq2,
  2. edgeR quasi-likelihood F-test,
  3. limma-voom.

The main outputs are the number of detected DE genes, method overlap, p-value distributions, DESeq2 independent-filtering diagnostics, and optional GO enrichment consistency.

Note: the adaptive filter implemented here is a practical scaffold inspired by the proposal. It evaluates candidate mean-, max-, CPM-, zero-, and Jaccard-like filters and selects the one producing the most rejections using a reference DE method. For a publication-level analysis, replace this section with the exact Zehetmayer et al. implementation if required by your instructor or committee.

2 Package loading

# Load only the packages this analysis directly needs.
# Do not load the full tidyverse here. In Bioconductor workflows, loading tidyverse
# after packages such as DESeq2/UpSetR can trigger a ggplot2 namespace conflict.
required_pkgs <- c(
  "readr", "dplyr", "tidyr", "tibble", "purrr", "ggplot2",
  "DESeq2", "edgeR", "limma", "UpSetR"
)

missing_pkgs <- required_pkgs[!vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing_pkgs) > 0) {
  stop(
    "Please install these packages before knitting: ",
    paste(missing_pkgs, collapse = ", "),
    call. = FALSE
  )
}

suppressPackageStartupMessages({
  library(readr)
  library(dplyr)
  library(tidyr)
  library(tibble)
  library(purrr)
  library(ggplot2)

  library(DESeq2)
  library(edgeR)
  library(limma)
  library(UpSetR)
})

# Helper functions are embedded below, so this file can run by itself.
# Helper functions for RNA-seq filtering x DE-method interaction analysis
# Author: generated template

required_columns <- function(data, cols, data_name = "data") {
  missing_cols <- setdiff(cols, colnames(data))
  if (length(missing_cols) > 0) {
    stop(data_name, " is missing required column(s): ", paste(missing_cols, collapse = ", "), call. = FALSE)
  }
}

safe_dir_create <- function(path) {
  if (!dir.exists(path)) dir.create(path, recursive = TRUE, showWarnings = FALSE)
}

load_rnaseq_inputs <- function(count_file,
                               metadata_file,
                               sample_id_col = "sample_id",
                               condition_col = "condition",
                               gene_id_col = NULL,
                               reference_condition = NULL) {
  count_df <- readr::read_csv(count_file, show_col_types = FALSE)
  metadata <- readr::read_csv(metadata_file, show_col_types = FALSE)

  if (is.null(gene_id_col) || gene_id_col == "") {
    gene_id_col <- colnames(count_df)[1]
  }

  required_columns(count_df, gene_id_col, "count file")
  required_columns(metadata, c(sample_id_col, condition_col), "metadata file")

  gene_ids <- as.character(count_df[[gene_id_col]])
  if (anyDuplicated(gene_ids)) {
    stop("Gene IDs must be unique. Duplicated gene IDs found in the count file.", call. = FALSE)
  }

  count_mat <- count_df[, setdiff(colnames(count_df), gene_id_col), drop = FALSE]
  count_mat <- as.matrix(count_mat)
  rownames(count_mat) <- gene_ids

  if (any(is.na(count_mat))) stop("Count matrix contains missing values.", call. = FALSE)
  if (any(count_mat < 0)) stop("Count matrix contains negative values.", call. = FALSE)
  if (any(count_mat != round(count_mat))) {
    warning("Some count values are not integers. They will be rounded for count-based DE tools.")
    count_mat <- round(count_mat)
  }
  storage.mode(count_mat) <- "integer"

  sample_ids <- as.character(metadata[[sample_id_col]])
  missing_in_counts <- setdiff(sample_ids, colnames(count_mat))
  missing_in_metadata <- setdiff(colnames(count_mat), sample_ids)
  if (length(missing_in_counts) > 0) {
    stop("Metadata contains samples not present in the count matrix: ", paste(missing_in_counts, collapse = ", "), call. = FALSE)
  }
  if (length(missing_in_metadata) > 0) {
    stop("Count matrix contains samples not present in metadata: ", paste(missing_in_metadata, collapse = ", "), call. = FALSE)
  }

  metadata <- metadata[match(colnames(count_mat), sample_ids), , drop = FALSE]
  metadata[[condition_col]] <- factor(metadata[[condition_col]])

  if (!is.null(reference_condition) && reference_condition != "") {
    if (!reference_condition %in% levels(metadata[[condition_col]])) {
      stop("reference_condition is not one of the observed condition levels.", call. = FALSE)
    }
    metadata[[condition_col]] <- stats::relevel(metadata[[condition_col]], ref = reference_condition)
  }

  if (nlevels(metadata[[condition_col]]) != 2) {
    stop("This template expects exactly two groups in the condition column.", call. = FALSE)
  }

  list(counts = count_mat, metadata = metadata)
}

make_static_filters <- function(counts, group) {
  group <- factor(group)
  cpm_mat <- edgeR::cpm(counts)
  n_samples <- ncol(counts)

  filters <- list(
    no_filter = rownames(counts),
    total_count_ge_10 = rownames(counts)[rowSums(counts) >= 10],
    cpm_ge_0_5_in_ge_2_samples = rownames(counts)[rowSums(cpm_mat >= 0.5) >= 2],
    cpm_ge_1_in_ge_half_samples = rownames(counts)[rowSums(cpm_mat >= 1) >= ceiling(n_samples / 2)],
    edgeR_filterByExpr = rownames(counts)[edgeR::filterByExpr(edgeR::DGEList(counts = counts), group = group)]
  )

  filters <- lapply(filters, unique)
  filters
}

calculate_detection_jaccard_like <- function(counts, group) {
  group <- factor(group)
  if (nlevels(group) != 2) stop("Jaccard-like detection score expects two groups.", call. = FALSE)

  expressed <- counts > 0
  g1 <- group == levels(group)[1]
  g2 <- group == levels(group)[2]

  p1 <- rowMeans(expressed[, g1, drop = FALSE])
  p2 <- rowMeans(expressed[, g2, drop = FALSE])
  intersection_like <- pmin(p1, p2)
  union_like <- pmax(p1, p2)

  score <- ifelse(union_like == 0, 0, intersection_like / union_like)
  names(score) <- rownames(counts)
  score
}

make_adaptive_candidate_filters <- function(counts, group) {
  group <- factor(group)
  cpm_mat <- edgeR::cpm(counts)
  n_samples <- ncol(counts)
  min_group_size <- min(table(group))

  mean_counts <- rowMeans(counts)
  max_counts <- apply(counts, 1, max)
  zero_prop <- rowMeans(counts == 0)
  jaccard_like <- calculate_detection_jaccard_like(counts, group)

  positive_mean <- mean_counts[mean_counts > 0]
  positive_max <- max_counts[max_counts > 0]

  q_mean_10 <- if (length(positive_mean) > 0) stats::quantile(positive_mean, 0.10, na.rm = TRUE) else Inf
  q_mean_25 <- if (length(positive_mean) > 0) stats::quantile(positive_mean, 0.25, na.rm = TRUE) else Inf
  q_max_10 <- if (length(positive_max) > 0) stats::quantile(positive_max, 0.10, na.rm = TRUE) else Inf
  q_max_25 <- if (length(positive_max) > 0) stats::quantile(positive_max, 0.25, na.rm = TRUE) else Inf

  candidates <- list(
    mean_count_q10 = rownames(counts)[mean_counts >= q_mean_10],
    mean_count_q25 = rownames(counts)[mean_counts >= q_mean_25],
    max_count_q10 = rownames(counts)[max_counts >= q_max_10],
    max_count_q25 = rownames(counts)[max_counts >= q_max_25],
    cpm_0_5_min_group_size = rownames(counts)[rowSums(cpm_mat >= 0.5) >= min_group_size],
    cpm_1_half_samples = rownames(counts)[rowSums(cpm_mat >= 1) >= ceiling(n_samples / 2)],
    zero_prop_le_0_90 = rownames(counts)[zero_prop <= 0.90],
    zero_prop_le_0_80 = rownames(counts)[zero_prop <= 0.80],
    jaccard_like_ge_0_10 = rownames(counts)[jaccard_like >= 0.10],
    jaccard_like_ge_0_25 = rownames(counts)[jaccard_like >= 0.25]
  )

  candidates <- lapply(candidates, unique)
  candidates[lengths(candidates) > 0]
}

standardize_de_result <- function(tbl, method, filter_name, alpha = 0.05, lfc_threshold = 0) {
  tbl <- dplyr::as_tibble(tbl)

  if (!"gene_id" %in% colnames(tbl)) stop("DE result table must include gene_id.", call. = FALSE)
  if (!"log2FoldChange" %in% colnames(tbl)) stop("DE result table must include log2FoldChange.", call. = FALSE)
  if (!"pvalue" %in% colnames(tbl)) stop("DE result table must include pvalue.", call. = FALSE)
  if (!"padj" %in% colnames(tbl)) stop("DE result table must include padj.", call. = FALSE)

  tbl |>
    dplyr::mutate(
      method = method,
      filter = filter_name,
      significant = !is.na(.data$padj) & .data$padj < alpha & abs(.data$log2FoldChange) >= lfc_threshold,
      direction = dplyr::case_when(
        .data$significant & .data$log2FoldChange > 0 ~ "up",
        .data$significant & .data$log2FoldChange < 0 ~ "down",
        TRUE ~ "not_significant"
      )
    ) |>
    dplyr::select("filter", "method", "gene_id", dplyr::everything())
}

run_deseq2 <- function(counts, metadata, condition_col, filter_name, alpha = 0.05, lfc_threshold = 0) {
  design_formula <- stats::as.formula(paste0("~ ", condition_col))
  condition_levels <- levels(metadata[[condition_col]])
  contrast_vec <- c(condition_col, condition_levels[2], condition_levels[1])

  dds <- DESeq2::DESeqDataSetFromMatrix(
    countData = counts,
    colData = metadata,
    design = design_formula
  )
  dds <- DESeq2::DESeq(dds, quiet = TRUE)

  res <- DESeq2::results(dds, contrast = contrast_vec, alpha = alpha, independentFiltering = TRUE)
  res_no_if <- DESeq2::results(dds, contrast = contrast_vec, alpha = alpha, independentFiltering = FALSE)

  out <- as.data.frame(res)
  out$gene_id <- rownames(out)
  out <- standardize_de_result(out, method = "DESeq2", filter_name = filter_name, alpha = alpha, lfc_threshold = lfc_threshold)

  res_meta <- S4Vectors::metadata(res)
  filter_threshold <- if ("filterThreshold" %in% names(res_meta)) res_meta$filterThreshold else NA_real_

  no_if_tbl <- as.data.frame(res_no_if)
  diagnostics <- dplyr::tibble(
    filter = filter_name,
    method = "DESeq2",
    n_tested = nrow(out),
    n_with_pvalue = sum(!is.na(out$pvalue)),
    n_with_padj = sum(!is.na(out$padj)),
    n_na_padj = sum(is.na(out$padj)),
    n_na_padj_but_has_pvalue = sum(is.na(out$padj) & !is.na(out$pvalue)),
    n_sig = sum(out$significant, na.rm = TRUE),
    n_sig_without_internal_independent_filtering = sum(!is.na(no_if_tbl$padj) & no_if_tbl$padj < alpha & abs(no_if_tbl$log2FoldChange) >= lfc_threshold, na.rm = TRUE),
    independent_filter_threshold = as.numeric(filter_threshold)[1]
  )

  list(results = out, diagnostics = diagnostics)
}

run_edger_qlf <- function(counts, metadata, condition_col, filter_name, alpha = 0.05, lfc_threshold = 0) {
  group <- factor(metadata[[condition_col]])
  design <- stats::model.matrix(~ group)

  dge <- edgeR::DGEList(counts = counts, group = group)
  dge <- edgeR::calcNormFactors(dge)
  dge <- edgeR::estimateDisp(dge, design)

  fit <- edgeR::glmQLFit(dge, design)
  qlf <- edgeR::glmQLFTest(fit, coef = 2)
  res <- edgeR::topTags(qlf, n = Inf, sort.by = "none")$table

  out <- as.data.frame(res)
  out$gene_id <- rownames(out)
  out <- out |>
    dplyr::transmute(
      gene_id = .data$gene_id,
      log2FoldChange = .data$logFC,
      logCPM = .data$logCPM,
      F = .data$F,
      pvalue = .data$PValue,
      padj = .data$FDR
    )

  out <- standardize_de_result(out, method = "edgeR_QLF", filter_name = filter_name, alpha = alpha, lfc_threshold = lfc_threshold)

  diagnostics <- dplyr::tibble(
    filter = filter_name,
    method = "edgeR_QLF",
    n_tested = nrow(out),
    n_with_pvalue = sum(!is.na(out$pvalue)),
    n_with_padj = sum(!is.na(out$padj)),
    n_na_padj = sum(is.na(out$padj)),
    n_na_padj_but_has_pvalue = sum(is.na(out$padj) & !is.na(out$pvalue)),
    n_sig = sum(out$significant, na.rm = TRUE),
    n_sig_without_internal_independent_filtering = NA_integer_,
    independent_filter_threshold = NA_real_
  )

  list(results = out, diagnostics = diagnostics)
}

run_limma_voom <- function(counts, metadata, condition_col, filter_name, alpha = 0.05, lfc_threshold = 0) {
  group <- factor(metadata[[condition_col]])
  design <- stats::model.matrix(~ group)

  dge <- edgeR::DGEList(counts = counts, group = group)
  dge <- edgeR::calcNormFactors(dge)
  v <- limma::voom(dge, design, plot = FALSE)
  fit <- limma::lmFit(v, design)
  fit <- limma::eBayes(fit)
  res <- limma::topTable(fit, coef = 2, number = Inf, sort.by = "none")

  out <- as.data.frame(res)
  out$gene_id <- rownames(out)
  out <- out |>
    dplyr::transmute(
      gene_id = .data$gene_id,
      log2FoldChange = .data$logFC,
      AveExpr = .data$AveExpr,
      t = .data$t,
      B = .data$B,
      pvalue = .data$P.Value,
      padj = .data$adj.P.Val
    )

  out <- standardize_de_result(out, method = "limma_voom", filter_name = filter_name, alpha = alpha, lfc_threshold = lfc_threshold)

  diagnostics <- dplyr::tibble(
    filter = filter_name,
    method = "limma_voom",
    n_tested = nrow(out),
    n_with_pvalue = sum(!is.na(out$pvalue)),
    n_with_padj = sum(!is.na(out$padj)),
    n_na_padj = sum(is.na(out$padj)),
    n_na_padj_but_has_pvalue = sum(is.na(out$padj) & !is.na(out$pvalue)),
    n_sig = sum(out$significant, na.rm = TRUE),
    n_sig_without_internal_independent_filtering = NA_integer_,
    independent_filter_threshold = NA_real_
  )

  list(results = out, diagnostics = diagnostics)
}

run_de_method <- function(method, counts, metadata, condition_col, filter_name, alpha = 0.05, lfc_threshold = 0) {
  method <- match.arg(method, c("DESeq2", "edgeR_QLF", "limma_voom"))

  if (nrow(counts) < 5) {
    stop("Too few genes remain after filtering to run DE method safely.", call. = FALSE)
  }

  if (method == "DESeq2") {
    run_deseq2(counts, metadata, condition_col, filter_name, alpha, lfc_threshold)
  } else if (method == "edgeR_QLF") {
    run_edger_qlf(counts, metadata, condition_col, filter_name, alpha, lfc_threshold)
  } else {
    run_limma_voom(counts, metadata, condition_col, filter_name, alpha, lfc_threshold)
  }
}

choose_adaptive_filter <- function(counts,
                                   metadata,
                                   condition_col,
                                   reference_method = "limma_voom",
                                   alpha = 0.05,
                                   lfc_threshold = 0,
                                   min_rejections_for_adaptive = 10) {
  group <- metadata[[condition_col]]
  candidates <- make_adaptive_candidate_filters(counts, group)
  static_filters <- make_static_filters(counts, group)

  scores <- purrr::imap_dfr(candidates, function(genes, candidate_name) {
    counts_sub <- counts[genes, , drop = FALSE]
    fit <- tryCatch(
      run_de_method(reference_method, counts_sub, metadata, condition_col, candidate_name, alpha, lfc_threshold),
      error = function(e) e
    )

    if (inherits(fit, "error")) {
      dplyr::tibble(
        candidate_filter = candidate_name,
        n_genes = length(genes),
        n_rejections = NA_integer_,
        error = fit$message
      )
    } else {
      dplyr::tibble(
        candidate_filter = candidate_name,
        n_genes = length(genes),
        n_rejections = sum(fit$results$significant, na.rm = TRUE),
        error = NA_character_
      )
    }
  })

  usable_scores <- scores |>
    dplyr::filter(!is.na(.data$n_rejections)) |>
    dplyr::arrange(dplyr::desc(.data$n_rejections), .data$n_genes)

  if (nrow(usable_scores) == 0 || max(usable_scores$n_rejections, na.rm = TRUE) < min_rejections_for_adaptive) {
    selected_name <- "fallback_edgeR_filterByExpr"
    selected_genes <- static_filters$edgeR_filterByExpr
    fallback_used <- TRUE
  } else {
    selected_name <- usable_scores$candidate_filter[1]
    selected_genes <- candidates[[selected_name]]
    fallback_used <- FALSE
  }

  list(
    selected_name = selected_name,
    selected_genes = selected_genes,
    scores = scores,
    fallback_used = fallback_used,
    reference_method = reference_method
  )
}

run_pipeline <- function(counts,
                         metadata,
                         condition_col = "condition",
                         alpha = 0.05,
                         lfc_threshold = 0,
                         adaptive_reference_method = "limma_voom",
                         min_rejections_for_adaptive = 10) {
  group <- metadata[[condition_col]]
  static_filters <- make_static_filters(counts, group)

  adaptive <- choose_adaptive_filter(
    counts = counts,
    metadata = metadata,
    condition_col = condition_col,
    reference_method = adaptive_reference_method,
    alpha = alpha,
    lfc_threshold = lfc_threshold,
    min_rejections_for_adaptive = min_rejections_for_adaptive
  )

  filter_sets <- static_filters
  filter_sets$adaptive_Zehetmayer_style <- adaptive$selected_genes

  filter_summary <- purrr::imap_dfr(filter_sets, function(genes, filter_name) {
    dplyr::tibble(
      filter = filter_name,
      n_genes_retained = length(unique(genes)),
      pct_genes_retained = 100 * length(unique(genes)) / nrow(counts)
    )
  })

  methods <- c("DESeq2", "edgeR_QLF", "limma_voom")

  all_fits <- list()
  counter <- 1

  for (filter_name in names(filter_sets)) {
    genes <- unique(filter_sets[[filter_name]])
    counts_sub <- counts[genes, , drop = FALSE]

    for (method in methods) {
      message("Running ", method, " under filter: ", filter_name)
      fit <- tryCatch(
        run_de_method(method, counts_sub, metadata, condition_col, filter_name, alpha, lfc_threshold),
        error = function(e) {
          list(
            results = dplyr::tibble(),
            diagnostics = dplyr::tibble(
              filter = filter_name,
              method = method,
              n_tested = nrow(counts_sub),
              n_with_pvalue = NA_integer_,
              n_with_padj = NA_integer_,
              n_na_padj = NA_integer_,
              n_na_padj_but_has_pvalue = NA_integer_,
              n_sig = NA_integer_,
              n_sig_without_internal_independent_filtering = NA_integer_,
              independent_filter_threshold = NA_real_,
              error = e$message
            )
          )
        }
      )
      all_fits[[counter]] <- fit
      counter <- counter + 1
    }
  }

  results <- dplyr::bind_rows(purrr::map(all_fits, "results"))
  diagnostics <- dplyr::bind_rows(purrr::map(all_fits, "diagnostics"))

  de_summary <- results |>
    dplyr::group_by(.data$filter, .data$method) |>
    dplyr::summarise(
      n_tested = dplyr::n(),
      n_significant = sum(.data$significant, na.rm = TRUE),
      n_up = sum(.data$direction == "up", na.rm = TRUE),
      n_down = sum(.data$direction == "down", na.rm = TRUE),
      median_pvalue = stats::median(.data$pvalue, na.rm = TRUE),
      .groups = "drop"
    ) |>
    dplyr::left_join(filter_summary, by = "filter")

  list(
    filter_sets = filter_sets,
    filter_summary = filter_summary,
    adaptive = adaptive,
    results = results,
    diagnostics = diagnostics,
    de_summary = de_summary
  )
}

calculate_pairwise_overlap <- function(results) {
  results |>
    dplyr::filter(.data$significant) |>
    dplyr::select("filter", "method", "gene_id") |>
    dplyr::group_by(.data$filter) |>
    dplyr::group_modify(function(.x, .y) {
      methods <- sort(unique(.x$method))
      if (length(methods) < 2) return(dplyr::tibble())
      pairs <- utils::combn(methods, 2, simplify = FALSE)
      purrr::map_dfr(pairs, function(pair) {
        a <- unique(.x$gene_id[.x$method == pair[1]])
        b <- unique(.x$gene_id[.x$method == pair[2]])
        union_n <- length(union(a, b))
        inter_n <- length(intersect(a, b))
        dplyr::tibble(
          method_1 = pair[1],
          method_2 = pair[2],
          n_method_1 = length(a),
          n_method_2 = length(b),
          n_overlap = inter_n,
          n_union = union_n,
          jaccard = ifelse(union_n == 0, NA_real_, inter_n / union_n)
        )
      })
    }) |>
    dplyr::ungroup()
}

make_sig_sets_for_filter <- function(results, filter_name) {
  res_sub <- results |>
    dplyr::filter(.data$filter == filter_name, .data$significant) |>
    dplyr::select("method", "gene_id")

  split(res_sub$gene_id, res_sub$method)
}

run_go_enrichment <- function(results,
                              annotation_file,
                              filter_sets,
                              orgdb_package = "org.Hs.eg.db",
                              alpha = 0.05,
                              ontology = "BP") {
  if (is.null(annotation_file) || annotation_file == "" || !file.exists(annotation_file)) {
    message("No annotation file provided. Skipping GO enrichment.")
    return(dplyr::tibble())
  }
  if (!requireNamespace("clusterProfiler", quietly = TRUE)) {
    message("clusterProfiler is not installed. Skipping GO enrichment.")
    return(dplyr::tibble())
  }
  if (!requireNamespace(orgdb_package, quietly = TRUE)) {
    message(orgdb_package, " is not installed. Skipping GO enrichment.")
    return(dplyr::tibble())
  }

  annot <- readr::read_csv(annotation_file, show_col_types = FALSE)
  required_columns(annot, c("gene_id", "entrez_id"), "annotation file")
  annot <- annot |>
    dplyr::mutate(entrez_id = as.character(.data$entrez_id)) |>
    dplyr::filter(!is.na(.data$entrez_id), .data$entrez_id != "")

  OrgDb <- get(orgdb_package, envir = asNamespace(orgdb_package))

  sig_groups <- results |>
    dplyr::filter(.data$significant) |>
    dplyr::select("filter", "method", "gene_id") |>
    dplyr::group_by(.data$filter, .data$method) |>
    tidyr::nest()

  if (nrow(sig_groups) == 0) return(dplyr::tibble())

  purrr::pmap_dfr(sig_groups, function(filter, method, data) {
    sig_entrez <- data |>
      dplyr::inner_join(annot, by = "gene_id") |>
      dplyr::pull("entrez_id") |>
      unique()

    universe_entrez <- annot |>
      dplyr::filter(.data$gene_id %in% filter_sets[[filter]]) |>
      dplyr::pull("entrez_id") |>
      unique()

    if (length(sig_entrez) < 5 || length(universe_entrez) < 20) {
      return(dplyr::tibble())
    }

    ego <- tryCatch(
      clusterProfiler::enrichGO(
        gene = sig_entrez,
        universe = universe_entrez,
        OrgDb = OrgDb,
        keyType = "ENTREZID",
        ont = ontology,
        pAdjustMethod = "BH",
        pvalueCutoff = alpha,
        qvalueCutoff = alpha,
        readable = TRUE
      ),
      error = function(e) NULL
    )

    if (is.null(ego)) return(dplyr::tibble())
    ego_df <- as.data.frame(ego)
    if (nrow(ego_df) == 0) return(dplyr::tibble())

    ego_df |>
      dplyr::as_tibble() |>
      dplyr::mutate(filter = filter, method = method, .before = 1)
  })
}

3 Analysis parameters

alpha <- params$alpha
lfc_threshold <- params$lfc_threshold
output_dir <- params$output_dir

table_dir <- file.path(output_dir, "tables")
rds_dir <- file.path(output_dir, "rds")
figure_dir <- "figures"

safe_dir_create(output_dir)
safe_dir_create(table_dir)
safe_dir_create(rds_dir)
safe_dir_create(figure_dir)

analysis_parameters <- tibble::tibble(
  parameter = c(
    "count_file", "metadata_file", "annotation_file", "sample_id_col",
    "condition_col", "gene_id_col", "reference_condition", "alpha",
    "lfc_threshold", "adaptive_reference_method", "min_rejections_for_adaptive",
    "orgdb_package", "go_ontology"
  ),
  value = c(
    params$count_file, params$metadata_file, params$annotation_file, params$sample_id_col,
    params$condition_col, params$gene_id_col, params$reference_condition, as.character(params$alpha),
    as.character(params$lfc_threshold), params$adaptive_reference_method,
    as.character(params$min_rejections_for_adaptive), params$orgdb_package, params$go_ontology
  )
)

analysis_parameters
## # A tibble: 13 × 2
##    parameter                   value                      
##    <chr>                       <chr>                      
##  1 count_file                  "data/counts.csv"          
##  2 metadata_file               "data/metadata.csv"        
##  3 annotation_file             "data/gene_annotations.csv"
##  4 sample_id_col               "sample_id"                
##  5 condition_col               "condition"                
##  6 gene_id_col                 "gene_id"                  
##  7 reference_condition         ""                         
##  8 alpha                       "0.05"                     
##  9 lfc_threshold               "0"                        
## 10 adaptive_reference_method   "limma_voom"               
## 11 min_rejections_for_adaptive "10"                       
## 12 orgdb_package               "org.Hs.eg.db"             
## 13 go_ontology                 "BP"

4 Load count matrix and metadata

The count matrix should contain raw integer counts, with genes in rows and samples in columns. The metadata file should contain one row per sample and at least two columns: sample ID and condition.

inputs <- load_rnaseq_inputs(
  count_file = params$count_file,
  metadata_file = params$metadata_file,
  sample_id_col = params$sample_id_col,
  condition_col = params$condition_col,
  gene_id_col = params$gene_id_col,
  reference_condition = params$reference_condition
)

counts <- inputs$counts
metadata <- inputs$metadata

group <- metadata[[params$condition_col]]
condition_levels <- levels(group)

cat("Number of genes:", nrow(counts), "\n")
## Number of genes: 5
cat("Number of samples:", ncol(counts), "\n")
## Number of samples: 6
cat("Condition levels:", paste(condition_levels, collapse = " vs "), "\n")
## Condition levels: control vs treatment
metadata |>
  dplyr::count(.data[[params$condition_col]], name = "n_samples")
## # A tibble: 2 × 2
##   condition n_samples
##   <fct>         <int>
## 1 control           3
## 2 treatment         3

5 Basic QC before filtering

lib_sizes <- colSums(counts)
qc_tbl <- tibble::tibble(
  sample_id = colnames(counts),
  library_size = lib_sizes,
  condition = group
)

readr::write_csv(qc_tbl, file.path(table_dir, "sample_library_sizes.csv"))

qc_tbl |>
  ggplot(aes(x = reorder(sample_id, library_size), y = library_size, fill = condition)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Library sizes by sample",
    x = "Sample",
    y = "Total counts"
  ) +
  theme_bw()

gene_detection_tbl <- tibble::tibble(
  gene_id = rownames(counts),
  total_count = rowSums(counts),
  mean_count = rowMeans(counts),
  n_samples_detected = rowSums(counts > 0),
  zero_proportion = rowMeans(counts == 0)
)

readr::write_csv(gene_detection_tbl, file.path(table_dir, "gene_detection_summary.csv"))

gene_detection_tbl |>
  ggplot(aes(x = log10(total_count + 1))) +
  geom_histogram(bins = 60) +
  labs(
    title = "Distribution of total gene counts before filtering",
    x = "log10(total count + 1)",
    y = "Number of genes"
  ) +
  theme_bw()

6 Define and inspect filtering strategies

static_filters <- make_static_filters(counts, group)

adaptive_preview <- choose_adaptive_filter(
  counts = counts,
  metadata = metadata,
  condition_col = params$condition_col,
  reference_method = params$adaptive_reference_method,
  alpha = alpha,
  lfc_threshold = lfc_threshold,
  min_rejections_for_adaptive = params$min_rejections_for_adaptive
)

filter_sets_preview <- static_filters
filter_sets_preview$adaptive_Zehetmayer_style <- adaptive_preview$selected_genes

filter_summary_preview <- purrr::imap_dfr(filter_sets_preview, function(genes, filter_name) {
  tibble::tibble(
    filter = filter_name,
    n_genes_retained = length(unique(genes)),
    pct_genes_retained = 100 * length(unique(genes)) / nrow(counts)
  )
})

readr::write_csv(filter_summary_preview, file.path(table_dir, "filter_summary_preview.csv"))
readr::write_csv(adaptive_preview$scores, file.path(table_dir, "adaptive_filter_candidate_scores.csv"))

filter_summary_preview
## # A tibble: 6 × 3
##   filter                      n_genes_retained pct_genes_retained
##   <chr>                                  <int>              <dbl>
## 1 no_filter                                  5                100
## 2 total_count_ge_10                          5                100
## 3 cpm_ge_0_5_in_ge_2_samples                 5                100
## 4 cpm_ge_1_in_ge_half_samples                5                100
## 5 edgeR_filterByExpr                         4                 80
## 6 adaptive_Zehetmayer_style                  4                 80
filter_summary_preview |>
  ggplot(aes(x = reorder(filter, n_genes_retained), y = n_genes_retained)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Number of genes retained by each upstream filter",
    x = "Filtering strategy",
    y = "Genes retained"
  ) +
  theme_bw()

adaptive_preview$scores |>
  dplyr::arrange(dplyr::desc(n_rejections), n_genes)
## # A tibble: 10 × 4
##    candidate_filter       n_genes n_rejections error                            
##    <chr>                    <int>        <int> <chr>                            
##  1 cpm_0_5_min_group_size       5            5 <NA>                             
##  2 cpm_1_half_samples           5            5 <NA>                             
##  3 zero_prop_le_0_90            5            5 <NA>                             
##  4 zero_prop_le_0_80            5            5 <NA>                             
##  5 jaccard_like_ge_0_10         5            5 <NA>                             
##  6 jaccard_like_ge_0_25         5            5 <NA>                             
##  7 mean_count_q10               4           NA Too few genes remain after filte…
##  8 mean_count_q25               4           NA Too few genes remain after filte…
##  9 max_count_q10                4           NA Too few genes remain after filte…
## 10 max_count_q25                4           NA Too few genes remain after filte…
cat("Selected adaptive candidate:", adaptive_preview$selected_name, "\n")
## Selected adaptive candidate: fallback_edgeR_filterByExpr
cat("Reference method used for adaptive scoring:", adaptive_preview$reference_method, "\n")
## Reference method used for adaptive scoring: limma_voom
cat("Fallback used:", adaptive_preview$fallback_used, "\n")
## Fallback used: TRUE

7 Run full DE pipeline

This chunk runs all three methods under all filters. Depending on dataset size, this may take several minutes.

pipeline <- run_pipeline(
  counts = counts,
  metadata = metadata,
  condition_col = params$condition_col,
  alpha = alpha,
  lfc_threshold = lfc_threshold,
  adaptive_reference_method = params$adaptive_reference_method,
  min_rejections_for_adaptive = params$min_rejections_for_adaptive
)

all_results <- pipeline$results
de_summary <- pipeline$de_summary
diagnostics <- pipeline$diagnostics
pairwise_overlap <- calculate_pairwise_overlap(all_results)

readr::write_csv(all_results, file.path(table_dir, "all_de_results.csv"))
readr::write_csv(de_summary, file.path(table_dir, "de_summary_by_filter_and_method.csv"))
readr::write_csv(diagnostics, file.path(table_dir, "method_diagnostics.csv"))
readr::write_csv(pairwise_overlap, file.path(table_dir, "pairwise_deg_overlap.csv"))
readr::write_csv(pipeline$filter_summary, file.path(table_dir, "final_filter_summary.csv"))
readr::write_csv(pipeline$adaptive$scores, file.path(table_dir, "final_adaptive_filter_scores.csv"))

saveRDS(pipeline, file.path(rds_dir, "pipeline_results.rds"))

de_summary
## # A tibble: 12 × 9
##    filter               method n_tested n_significant  n_up n_down median_pvalue
##    <chr>                <chr>     <int>         <int> <int>  <int>         <dbl>
##  1 cpm_ge_0_5_in_ge_2_… DESeq2        5             3     2      1 0.000194     
##  2 cpm_ge_0_5_in_ge_2_… edgeR…        5             4     2      2 0.000000136  
##  3 cpm_ge_0_5_in_ge_2_… limma…        5             5     2      3 0.00000000170
##  4 cpm_ge_1_in_ge_half… DESeq2        5             3     2      1 0.000194     
##  5 cpm_ge_1_in_ge_half… edgeR…        5             4     2      2 0.000000136  
##  6 cpm_ge_1_in_ge_half… limma…        5             5     2      3 0.00000000170
##  7 no_filter            DESeq2        5             3     2      1 0.000194     
##  8 no_filter            edgeR…        5             4     2      2 0.000000136  
##  9 no_filter            limma…        5             5     2      3 0.00000000170
## 10 total_count_ge_10    DESeq2        5             3     2      1 0.000194     
## 11 total_count_ge_10    edgeR…        5             4     2      2 0.000000136  
## 12 total_count_ge_10    limma…        5             5     2      3 0.00000000170
## # ℹ 2 more variables: n_genes_retained <int>, pct_genes_retained <dbl>

8 Compare number of DE genes

de_summary |>
  ggplot(aes(x = filter, y = n_significant, fill = method)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Detected DE genes by filtering strategy and DE method",
    x = "Filtering strategy",
    y = paste0("Number of DE genes at FDR < ", alpha),
    fill = "DE method"
  ) +
  theme_bw()

de_summary |>
  dplyr::select(filter, method, n_up, n_down) |>
  tidyr::pivot_longer(cols = c(n_up, n_down), names_to = "direction", values_to = "n_genes") |>
  ggplot(aes(x = filter, y = n_genes, fill = direction)) +
  geom_col(position = "dodge") +
  facet_wrap(~ method, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Up- and down-regulated DE genes",
    x = "Filtering strategy",
    y = "Number of genes",
    fill = "Direction"
  ) +
  theme_bw()

9 Pairwise overlap across DE methods

if (nrow(pairwise_overlap) > 0) {
  pairwise_overlap |>
    dplyr::arrange(filter, dplyr::desc(jaccard))
} else {
  cat("No pairwise overlap table was produced because fewer than two methods detected significant genes within the same filter.\n")
}
## # A tibble: 12 × 8
##    filter      method_1 method_2 n_method_1 n_method_2 n_overlap n_union jaccard
##    <chr>       <chr>    <chr>         <int>      <int>     <int>   <int>   <dbl>
##  1 cpm_ge_0_5… edgeR_Q… limma_v…          4          5         4       5    0.8 
##  2 cpm_ge_0_5… DESeq2   edgeR_Q…          3          4         3       4    0.75
##  3 cpm_ge_0_5… DESeq2   limma_v…          3          5         3       5    0.6 
##  4 cpm_ge_1_i… edgeR_Q… limma_v…          4          5         4       5    0.8 
##  5 cpm_ge_1_i… DESeq2   edgeR_Q…          3          4         3       4    0.75
##  6 cpm_ge_1_i… DESeq2   limma_v…          3          5         3       5    0.6 
##  7 no_filter   edgeR_Q… limma_v…          4          5         4       5    0.8 
##  8 no_filter   DESeq2   edgeR_Q…          3          4         3       4    0.75
##  9 no_filter   DESeq2   limma_v…          3          5         3       5    0.6 
## 10 total_coun… edgeR_Q… limma_v…          4          5         4       5    0.8 
## 11 total_coun… DESeq2   edgeR_Q…          3          4         3       4    0.75
## 12 total_coun… DESeq2   limma_v…          3          5         3       5    0.6
if (nrow(pairwise_overlap) > 0) {
  pairwise_overlap |>
    dplyr::mutate(method_pair = paste(method_1, method_2, sep = " vs ")) |>
    ggplot(aes(x = filter, y = jaccard, fill = method_pair)) +
    geom_col(position = "dodge") +
    coord_flip() +
    labs(
      title = "Pairwise Jaccard overlap of significant DE genes",
      x = "Filtering strategy",
      y = "Jaccard overlap",
      fill = "Method pair"
    ) +
    theme_bw()
} else {
  cat("Pairwise overlap plot skipped.\n")
}

10 UpSet plots

for (filter_name in names(pipeline$filter_sets)) {
  sig_sets <- make_sig_sets_for_filter(all_results, filter_name)
  sig_sets <- sig_sets[lengths(sig_sets) > 0]

  if (length(sig_sets) < 2) {
    cat("Skipping UpSet plot for", filter_name, "because fewer than two methods produced significant genes.\n")
    next
  }

  cat("\n\n### UpSet plot:", filter_name, "\n\n")
  upset_input <- UpSetR::fromList(sig_sets)
  print(
    UpSetR::upset(
      upset_input,
      nsets = length(sig_sets),
      order.by = "freq",
      mainbar.y.label = "DE gene intersections",
      sets.x.label = "DE genes per method"
    )
  )
}
## 
## 
## ### UpSet plot: no_filter

## 
## 
## ### UpSet plot: total_count_ge_10

## 
## 
## ### UpSet plot: cpm_ge_0_5_in_ge_2_samples

## 
## 
## ### UpSet plot: cpm_ge_1_in_ge_half_samples

## Skipping UpSet plot for edgeR_filterByExpr because fewer than two methods produced significant genes.
## Skipping UpSet plot for adaptive_Zehetmayer_style because fewer than two methods produced significant genes.

11 P-value distribution diagnostics

all_results |>
  dplyr::filter(!is.na(pvalue)) |>
  ggplot(aes(x = pvalue)) +
  geom_histogram(bins = 50) +
  facet_grid(method ~ filter) +
  labs(
    title = "P-value distributions by filtering strategy and DE method",
    x = "Raw p-value",
    y = "Number of genes"
  ) +
  theme_bw()

12 DESeq2 independent filtering diagnostics

DESeq2 applies independent filtering internally after model fitting. This section compares how many genes remain with adjusted p-values under each upstream filter and whether the number of DE genes changes when internal independent filtering is turned off.

deseq2_diagnostics <- diagnostics |>
  dplyr::filter(method == "DESeq2") |>
  dplyr::mutate(
    difference_due_to_internal_filtering = n_sig - n_sig_without_internal_independent_filtering
  )

readr::write_csv(deseq2_diagnostics, file.path(table_dir, "deseq2_independent_filtering_diagnostics.csv"))

deseq2_diagnostics
## # A tibble: 6 × 12
##   filter                     method n_tested n_with_pvalue n_with_padj n_na_padj
##   <chr>                      <chr>     <int>         <int>       <int>     <int>
## 1 no_filter                  DESeq2        5             5           5         0
## 2 total_count_ge_10          DESeq2        5             5           5         0
## 3 cpm_ge_0_5_in_ge_2_samples DESeq2        5             5           5         0
## 4 cpm_ge_1_in_ge_half_sampl… DESeq2        5             5           5         0
## 5 edgeR_filterByExpr         DESeq2        4            NA          NA        NA
## 6 adaptive_Zehetmayer_style  DESeq2        4            NA          NA        NA
## # ℹ 6 more variables: n_na_padj_but_has_pvalue <int>, n_sig <int>,
## #   n_sig_without_internal_independent_filtering <int>,
## #   independent_filter_threshold <dbl>, error <chr>,
## #   difference_due_to_internal_filtering <int>
deseq2_diagnostics |>
  dplyr::select(filter, n_sig, n_sig_without_internal_independent_filtering) |>
  tidyr::pivot_longer(
    cols = c(n_sig, n_sig_without_internal_independent_filtering),
    names_to = "setting",
    values_to = "n_significant"
  ) |>
  ggplot(aes(x = filter, y = n_significant, fill = setting)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "DESeq2 results with vs without internal independent filtering",
    x = "Upstream filtering strategy",
    y = "Number of DE genes",
    fill = "DESeq2 setting"
  ) +
  theme_bw()

13 Optional GO enrichment consistency

This section runs GO enrichment only if data/gene_annotations.csv exists and contains gene_id and entrez_id columns. The enrichment background is defined separately for each upstream filter, using the genes retained by that filter.

go_results <- run_go_enrichment(
  results = all_results,
  annotation_file = params$annotation_file,
  filter_sets = pipeline$filter_sets,
  orgdb_package = params$orgdb_package,
  alpha = alpha,
  ontology = params$go_ontology
)

readr::write_csv(go_results, file.path(table_dir, "go_enrichment_results.csv"))

if (nrow(go_results) > 0) {
  go_results |>
    dplyr::select(filter, method, ID, Description, GeneRatio, BgRatio, p.adjust, qvalue, Count) |>
    dplyr::arrange(filter, method, p.adjust) |>
    dplyr::slice_head(n = 30)
} else {
  cat("GO enrichment was skipped or returned no significant terms.\n")
}
## GO enrichment was skipped or returned no significant terms.
if (nrow(go_results) > 0) {
  top_terms <- go_results |>
    dplyr::group_by(Description) |>
    dplyr::summarise(best_padj = min(p.adjust, na.rm = TRUE), .groups = "drop") |>
    dplyr::slice_min(best_padj, n = 20) |>
    dplyr::pull(Description)

  go_results |>
    dplyr::filter(Description %in% top_terms) |>
    dplyr::mutate(minus_log10_padj = -log10(p.adjust)) |>
    ggplot(aes(x = method, y = Description, size = Count, fill = minus_log10_padj)) +
    geom_point(shape = 21) +
    facet_wrap(~ filter) +
    labs(
      title = "GO enrichment consistency across filters and DE methods",
      x = "DE method",
      y = "GO biological process",
      size = "DE genes in term",
      fill = "-log10(adj. p)"
    ) +
    theme_bw()
} else {
  cat("GO enrichment was skipped or returned no significant terms.\n")
}
## GO enrichment was skipped or returned no significant terms.

14 Final interpretation checklist

Use the following questions when writing the Results/Discussion section:

  1. Does increasing filter stringency monotonically increase or decrease the number of DE genes?
  2. Does the answer depend on the DE method?
  3. Which method is most sensitive to filtering choice?
  4. Do the same genes remain significant across methods at a fixed filter level?
  5. Does DESeq2’s internal independent filtering compensate for weak upstream filtering?
  6. Does biological interpretation change when the filter changes, as measured by GO enrichment?

15 Reproducibility

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] UpSetR_1.4.0                edgeR_4.6.3                
##  [3] limma_3.64.3                DESeq2_1.48.2              
##  [5] SummarizedExperiment_1.38.1 Biobase_2.68.0             
##  [7] MatrixGenerics_1.20.0       matrixStats_1.5.0          
##  [9] GenomicRanges_1.60.0        GenomeInfoDb_1.44.3        
## [11] IRanges_2.42.0              S4Vectors_0.46.0           
## [13] BiocGenerics_0.54.1         generics_0.1.4             
## [15] ggplot2_4.0.3               purrr_1.2.2                
## [17] tibble_3.3.1                tidyr_1.3.2                
## [19] dplyr_1.2.1                 readr_2.2.0                
## 
## loaded via a namespace (and not attached):
##   [1] DBI_1.3.0               gson_0.1.0              gridExtra_2.3          
##   [4] rlang_1.2.0             magrittr_2.0.5          DOSE_4.2.0             
##   [7] otel_0.2.0              compiler_4.5.1          RSQLite_2.4.6          
##  [10] reshape2_1.4.5          png_0.1-9               vctrs_0.7.3            
##  [13] stringr_1.6.0           pkgconfig_2.0.3         crayon_1.5.3           
##  [16] fastmap_1.2.0           XVector_0.48.0          labeling_0.4.3         
##  [19] utf8_1.2.6              rmarkdown_2.31          enrichplot_1.28.4      
##  [22] tzdb_0.5.0              UCSC.utils_1.4.0        bit_4.6.0              
##  [25] xfun_0.57               cachem_1.1.0            aplot_0.2.9            
##  [28] jsonlite_2.0.0          blob_1.3.0              DelayedArray_0.34.1    
##  [31] BiocParallel_1.42.2     parallel_4.5.1          R6_2.6.1               
##  [34] stringi_1.8.7           bslib_0.10.0            RColorBrewer_1.1-3     
##  [37] jquerylib_0.1.4         GOSemSim_2.34.0         Rcpp_1.1.1-1.1         
##  [40] knitr_1.51              ggtangle_0.1.2          R.utils_2.13.0         
##  [43] igraph_2.3.0            Matrix_1.7-5            splines_4.5.1          
##  [46] tidyselect_1.2.1        qvalue_2.40.0           rstudioapi_0.18.0      
##  [49] abind_1.4-8             yaml_2.3.12             codetools_0.2-20       
##  [52] lattice_0.22-9          plyr_1.8.9              treeio_1.32.0          
##  [55] withr_3.0.2             KEGGREST_1.48.1         S7_0.2.2               
##  [58] evaluate_1.0.5          gridGraphics_0.5-1      Biostrings_2.76.0      
##  [61] ggtree_3.16.3           pillar_1.11.1           clusterProfiler_4.16.0 
##  [64] ggfun_0.2.0             vroom_1.7.1             hms_1.1.4              
##  [67] tidytree_0.4.7          scales_1.4.0            glue_1.8.1             
##  [70] lazyeval_0.2.3          tools_4.5.1             data.table_1.18.2.1    
##  [73] fgsea_1.34.2            locfit_1.5-9.12         fs_2.1.0               
##  [76] fastmatch_1.1-8         cowplot_1.2.0           grid_4.5.1             
##  [79] ape_5.8-1               colorspace_2.1-2        AnnotationDbi_1.70.0   
##  [82] nlme_3.1-169            patchwork_1.3.2         GenomeInfoDbData_1.2.14
##  [85] cli_3.6.6               rappdirs_0.3.4          S4Arrays_1.8.1         
##  [88] gtable_0.3.6            R.methodsS3_1.8.2       yulab.utils_0.2.4      
##  [91] sass_0.4.10             digest_0.6.39           ggrepel_0.9.8          
##  [94] ggplotify_0.1.3         SparseArray_1.8.1       farver_2.1.2           
##  [97] memoise_2.0.1           htmltools_0.5.9         R.oo_1.27.1            
## [100] lifecycle_1.0.5         httr_1.4.8              GO.db_3.21.0           
## [103] statmod_1.5.1           bit64_4.8.0