This R Markdown file implements the project using the airway bulk RNA-seq dataset.
The analysis tests how different upstream gene-filtering strategies interact with three differential expression methods:
The airway dataset contains human airway smooth muscle cell RNA-seq samples treated with dexamethasone versus untreated controls. The samples are paired by cell line, so the default model uses:
~ cell + condition
This keeps the two-condition comparison while accounting for cell-line effects.
knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE,
fig.width = 10,
fig.height = 6
)
if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install("airway")
library(airway)
data(airway)
required_pkgs <- c(
"airway",
"SummarizedExperiment",
"DESeq2",
"edgeR",
"limma",
"ggplot2",
"dplyr",
"tidyr",
"tibble",
"readr",
"UpSetR",
"knitr"
)
missing_pkgs <- required_pkgs[
!vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
stop(
"Missing packages: ", paste(missing_pkgs, collapse = ", "),
"\nInstall CRAN packages with install.packages(). ",
"Install Bioconductor packages with BiocManager::install()."
)
}
suppressPackageStartupMessages({
library(airway)
library(SummarizedExperiment)
library(DESeq2)
library(edgeR)
library(limma)
library(ggplot2)
library(dplyr)
library(tidyr)
library(tibble)
library(readr)
library(UpSetR)
})
alpha <- params$alpha
lfc_threshold <- params$lfc_threshold
# Output folders
dir.create("results", showWarnings = FALSE)
dir.create("figures", showWarnings = FALSE)
dir.create("figures/upset", recursive = TRUE, showWarnings = FALSE)
dir.create("figures/go", recursive = TRUE, showWarnings = FALSE)
Run this only if packages are missing. Restart R after installing packages.
install.packages(c("ggplot2", "dplyr", "tidyr", "tibble", "readr", "UpSetR", "knitr"))
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
BiocManager::install(c(
"airway",
"SummarizedExperiment",
"DESeq2",
"edgeR",
"limma"
))
# Optional GO enrichment packages
BiocManager::install(c("clusterProfiler", "org.Hs.eg.db"))
data("airway", package = "airway")
airway_se <- airway
counts_raw <- as.matrix(SummarizedExperiment::assay(airway_se))
metadata <- as.data.frame(SummarizedExperiment::colData(airway_se))
# Keep clean sample IDs.
metadata$sample_id <- rownames(metadata)
# Airway uses dex == "trt" and "untrt".
metadata$condition <- factor(metadata$dex, levels = c("untrt", "trt"))
metadata$cell <- factor(metadata$cell)
# Make gene IDs cleaner for downstream annotation.
# Airway rownames are Ensembl IDs. This also handles cases where version suffixes exist.
gene_id_original <- rownames(counts_raw)
gene_id_clean <- sub("\\\\..*$", "", gene_id_original)
rownames(counts_raw) <- make.unique(gene_id_clean)
# Ensure sample order matches.
counts_raw <- counts_raw[, metadata$sample_id]
metadata %>%
dplyr::select(sample_id, cell, dex, condition) %>%
knitr::kable()
| sample_id | cell | dex | condition | |
|---|---|---|---|---|
| SRR1039508 | SRR1039508 | N61311 | untrt | untrt |
| SRR1039509 | SRR1039509 | N61311 | trt | trt |
| SRR1039512 | SRR1039512 | N052611 | untrt | untrt |
| SRR1039513 | SRR1039513 | N052611 | trt | trt |
| SRR1039516 | SRR1039516 | N080611 | untrt | untrt |
| SRR1039517 | SRR1039517 | N080611 | trt | trt |
| SRR1039520 | SRR1039520 | N061011 | untrt | untrt |
| SRR1039521 | SRR1039521 | N061011 | trt | trt |
dataset_summary <- tibble::tibble(
n_genes = nrow(counts_raw),
n_samples = ncol(counts_raw),
n_untrt = sum(metadata$condition == "untrt"),
n_trt = sum(metadata$condition == "trt"),
n_cells = length(unique(metadata$cell))
)
dataset_summary %>% knitr::kable()
| n_genes | n_samples | n_untrt | n_trt | n_cells |
|---|---|---|---|---|
| 63677 | 8 | 4 | 4 | 4 |
make_design_matrix <- function(meta, paired_design = TRUE) {
if (paired_design && "cell" %in% colnames(meta)) {
design <- stats::model.matrix(~ cell + condition, data = meta)
} else {
design <- stats::model.matrix(~ condition, data = meta)
}
design
}
make_deseq_formula <- function(paired_design = TRUE) {
if (paired_design) {
stats::as.formula("~ cell + condition")
} else {
stats::as.formula("~ condition")
}
}
get_condition_coef <- function(design_matrix) {
coef_name <- grep("^condition", colnames(design_matrix), value = TRUE)
if (length(coef_name) != 1) {
stop(
"Could not identify exactly one condition coefficient. Found: ",
paste(coef_name, collapse = ", ")
)
}
coef_name
}
standardize_gene_ids <- function(gene_ids) {
sub("\\\\..*$", "", gene_ids)
}
run_deseq2 <- function(counts_mat, meta, alpha = 0.05, lfc_threshold = 0,
paired_design = TRUE) {
dds <- DESeq2::DESeqDataSetFromMatrix(
countData = round(counts_mat),
colData = meta,
design = make_deseq_formula(paired_design)
)
dds <- DESeq2::DESeq(dds, quiet = TRUE)
res <- DESeq2::results(
dds,
contrast = c("condition", "trt", "untrt"),
alpha = alpha,
independentFiltering = TRUE
)
res_df <- as.data.frame(res) %>%
tibble::rownames_to_column("gene_id") %>%
dplyr::transmute(
gene_id = gene_id,
ensembl_id = standardize_gene_ids(gene_id),
log2FoldChange = log2FoldChange,
statistic = stat,
pvalue = pvalue,
padj = padj,
is_de = !is.na(padj) &
padj < alpha &
abs(log2FoldChange) >= lfc_threshold
)
deseq_metadata <- metadata(res)
filter_threshold <- NA_real_
if (!is.null(deseq_metadata$filterThreshold)) {
filter_threshold <- as.numeric(deseq_metadata$filterThreshold)
}
attr(res_df, "deseq_filter_threshold") <- filter_threshold
attr(res_df, "deseq_filter_num_rej") <- deseq_metadata$filterNumRej
attr(res_df, "deseq_alpha") <- deseq_metadata$alpha
res_df
}
run_edger_qlf <- function(counts_mat, meta, alpha = 0.05, lfc_threshold = 0,
paired_design = TRUE) {
design <- make_design_matrix(meta, paired_design)
condition_coef <- get_condition_coef(design)
y <- edgeR::DGEList(counts = counts_mat)
y <- edgeR::calcNormFactors(y)
y <- edgeR::estimateDisp(y, design)
fit <- edgeR::glmQLFit(y, design)
qlf <- edgeR::glmQLFTest(fit, coef = condition_coef)
tab <- edgeR::topTags(qlf, n = Inf, sort.by = "none")$table
tab %>%
tibble::rownames_to_column("gene_id") %>%
dplyr::transmute(
gene_id = gene_id,
ensembl_id = standardize_gene_ids(gene_id),
log2FoldChange = logFC,
statistic = F,
pvalue = PValue,
padj = FDR,
is_de = !is.na(padj) &
padj < alpha &
abs(log2FoldChange) >= lfc_threshold
)
}
run_limma_voom <- function(counts_mat, meta, alpha = 0.05, lfc_threshold = 0,
paired_design = TRUE) {
design <- make_design_matrix(meta, paired_design)
condition_coef <- get_condition_coef(design)
y <- edgeR::DGEList(counts = counts_mat)
y <- edgeR::calcNormFactors(y)
v <- limma::voom(y, design, plot = FALSE)
fit <- limma::lmFit(v, design)
fit <- limma::eBayes(fit)
tab <- limma::topTable(
fit,
coef = condition_coef,
number = Inf,
sort.by = "none"
)
tab %>%
tibble::rownames_to_column("gene_id") %>%
dplyr::transmute(
gene_id = gene_id,
ensembl_id = standardize_gene_ids(gene_id),
log2FoldChange = logFC,
statistic = t,
pvalue = P.Value,
padj = adj.P.Val,
is_de = !is.na(padj) &
padj < alpha &
abs(log2FoldChange) >= lfc_threshold
)
}
run_de_method <- function(method, counts_mat, meta, alpha = 0.05,
lfc_threshold = 0, paired_design = TRUE) {
if (method == "DESeq2") {
return(run_deseq2(counts_mat, meta, alpha, lfc_threshold, paired_design))
}
if (method == "edgeR_QLF") {
return(run_edger_qlf(counts_mat, meta, alpha, lfc_threshold, paired_design))
}
if (method == "limma_voom") {
return(run_limma_voom(counts_mat, meta, alpha, lfc_threshold, paired_design))
}
stop("Unknown method: ", method)
}
filter_no_filter <- function(counts_mat) {
rep(TRUE, nrow(counts_mat))
}
filter_total_count_ge_10 <- function(counts_mat) {
rowSums(counts_mat) >= 10
}
filter_cpm_ge_0_5_in_ge_2 <- function(counts_mat) {
cpm_mat <- edgeR::cpm(counts_mat)
rowSums(cpm_mat >= 0.5) >= 2
}
filter_cpm_ge_1_in_ge_half <- function(counts_mat) {
cpm_mat <- edgeR::cpm(counts_mat)
min_samples <- ceiling(ncol(counts_mat) / 2)
rowSums(cpm_mat >= 1) >= min_samples
}
filter_edgeR_filterByExpr <- function(counts_mat, meta, paired_design = TRUE) {
y <- edgeR::DGEList(counts = counts_mat)
design <- make_design_matrix(meta, paired_design)
edgeR::filterByExpr(y, design = design)
}
make_adaptive_candidate_filters <- function(counts_mat, meta) {
cpm_mat <- edgeR::cpm(counts_mat)
expr_cpm1 <- cpm_mat >= 1
group <- meta$condition
min_group_1 <- ceiling(sum(group == "untrt") / 2)
min_group_2 <- ceiling(sum(group == "trt") / 2)
group_presence <-
rowSums(expr_cpm1[, group == "untrt", drop = FALSE]) >= min_group_1 |
rowSums(expr_cpm1[, group == "trt", drop = FALSE]) >= min_group_2
list(
mean_count_ge_1 = rowMeans(counts_mat) >= 1,
mean_count_ge_5 = rowMeans(counts_mat) >= 5,
max_count_ge_10 = apply(counts_mat, 1, max) >= 10,
total_count_ge_10 = rowSums(counts_mat) >= 10,
cpm_0_5_in_2 = rowSums(cpm_mat >= 0.5) >= 2,
cpm_1_in_half = rowSums(cpm_mat >= 1) >= ceiling(ncol(counts_mat) / 2),
zero_prop_le_0_75 = rowMeans(counts_mat == 0) <= 0.75,
group_presence_cpm1 = group_presence
)
}
select_adaptive_filter <- function(counts_mat, meta, alpha = 0.05,
lfc_threshold = 0,
paired_design = TRUE,
minimum_genes = 100,
fallback_name = "cpm_0_5_in_2") {
candidate_filters <- make_adaptive_candidate_filters(counts_mat, meta)
scores <- tibble::tibble(
candidate = names(candidate_filters),
retained_genes = NA_integer_,
n_rejections = NA_integer_,
status = NA_character_
)
for (i in seq_along(candidate_filters)) {
candidate_name <- names(candidate_filters)[i]
mask <- candidate_filters[[i]]
retained <- sum(mask)
scores$retained_genes[i] <- retained
if (retained < minimum_genes) {
scores$n_rejections[i] <- NA_integer_
scores$status[i] <- "skipped_too_few_genes"
next
}
candidate_result <- tryCatch(
{
run_limma_voom(
counts_mat[mask, , drop = FALSE],
meta,
alpha = alpha,
lfc_threshold = lfc_threshold,
paired_design = paired_design
)
},
error = function(e) {
attr(e, "message") <- conditionMessage(e)
e
}
)
if (inherits(candidate_result, "error")) {
scores$n_rejections[i] <- NA_integer_
scores$status[i] <- paste0("failed: ", conditionMessage(candidate_result))
} else {
scores$n_rejections[i] <- sum(candidate_result$is_de, na.rm = TRUE)
scores$status[i] <- "ok"
}
}
ok_scores <- scores %>%
dplyr::filter(status == "ok", !is.na(n_rejections))
if (nrow(ok_scores) == 0 || max(ok_scores$n_rejections, na.rm = TRUE) == 0) {
selected_name <- fallback_name
} else {
selected_name <- ok_scores %>%
dplyr::arrange(dplyr::desc(n_rejections), retained_genes) %>%
dplyr::slice(1) %>%
dplyr::pull(candidate)
}
list(
selected_name = selected_name,
selected_mask = candidate_filters[[selected_name]],
scores = scores
)
}
adaptive_selection <- select_adaptive_filter(
counts_raw,
metadata,
alpha = alpha,
lfc_threshold = lfc_threshold,
paired_design = params$paired_design
)
filter_masks <- list(
no_filter = filter_no_filter(counts_raw),
total_count_ge_10 = filter_total_count_ge_10(counts_raw),
cpm_ge_0_5_in_ge_2 = filter_cpm_ge_0_5_in_ge_2(counts_raw),
cpm_ge_1_in_ge_half = filter_cpm_ge_1_in_ge_half(counts_raw),
edgeR_filterByExpr = filter_edgeR_filterByExpr(
counts_raw,
metadata,
paired_design = params$paired_design
),
adaptive_limma_voom_selected = adaptive_selection$selected_mask
)
filter_summary <- tibble::tibble(
filter = names(filter_masks),
retained_genes = vapply(filter_masks, sum, integer(1)),
removed_genes = nrow(counts_raw) - retained_genes,
retained_percent = round(100 * retained_genes / nrow(counts_raw), 2)
)
filter_summary %>% knitr::kable()
| filter | retained_genes | removed_genes | retained_percent |
|---|---|---|---|
| no_filter | 63677 | 0 | 100.00 |
| total_count_ge_10 | 22369 | 41308 | 35.13 |
| cpm_ge_0_5_in_ge_2 | 16730 | 46947 | 26.27 |
| cpm_ge_1_in_ge_half | 14332 | 49345 | 22.51 |
| edgeR_filterByExpr | 16860 | 46817 | 26.48 |
| adaptive_limma_voom_selected | 18086 | 45591 | 28.40 |
adaptive_selection$scores %>% knitr::kable()
| candidate | retained_genes | n_rejections | status |
|---|---|---|---|
| mean_count_ge_1 | 23171 | 5083 | ok |
| mean_count_ge_5 | 17852 | 5152 | ok |
| max_count_ge_10 | 18086 | 5211 | ok |
| total_count_ge_10 | 22369 | 5108 | ok |
| cpm_0_5_in_2 | 16730 | 5161 | ok |
| cpm_1_in_half | 14332 | 4847 | ok |
| zero_prop_le_0_75 | 28877 | 4738 | ok |
| group_presence_cpm1 | 14827 | 5001 | ok |
cat("Adaptive filter selected:", adaptive_selection$selected_name, "\n")
## Adaptive filter selected: max_count_ge_10
readr::write_csv(filter_summary, "results/filter_summary_airway.csv")
readr::write_csv(adaptive_selection$scores, "results/adaptive_filter_scores_airway.csv")
methods <- c("DESeq2", "edgeR_QLF", "limma_voom")
all_results <- list()
deseq_filter_diagnostics <- list()
for (filter_name in names(filter_masks)) {
mask <- filter_masks[[filter_name]]
counts_filtered <- counts_raw[mask, , drop = FALSE]
for (method_name in methods) {
message("Running ", method_name, " under filter: ", filter_name)
result <- run_de_method(
method = method_name,
counts_mat = counts_filtered,
meta = metadata,
alpha = alpha,
lfc_threshold = lfc_threshold,
paired_design = params$paired_design
)
analysis_id <- paste(filter_name, method_name, sep = "__")
all_results[[analysis_id]] <- result
if (method_name == "DESeq2") {
deseq_filter_diagnostics[[filter_name]] <- tibble::tibble(
filter = filter_name,
upstream_retained_genes = nrow(counts_filtered),
deseq_filter_threshold = attr(result, "deseq_filter_threshold"),
n_padj_na = sum(is.na(result$padj)),
n_pvalue_na = sum(is.na(result$pvalue)),
n_de = sum(result$is_de, na.rm = TRUE)
)
}
}
}
de_results_long <- dplyr::bind_rows(all_results, .id = "analysis_id") %>%
tidyr::separate(
analysis_id,
into = c("filter", "method"),
sep = "__",
remove = FALSE
) %>%
dplyr::mutate(
filter = factor(filter, levels = names(filter_masks)),
method = factor(method, levels = methods)
)
deseq_filter_diagnostics_df <- dplyr::bind_rows(deseq_filter_diagnostics)
readr::write_csv(de_results_long, "results/all_de_results_airway.csv")
readr::write_csv(deseq_filter_diagnostics_df, "results/deseq_independent_filtering_diagnostics_airway.csv")
deg_summary <- de_results_long %>%
dplyr::group_by(filter, method) %>%
dplyr::summarise(
n_tested = dplyr::n(),
n_pvalue_na = sum(is.na(pvalue)),
n_padj_na = sum(is.na(padj)),
n_de = sum(is_de, na.rm = TRUE),
.groups = "drop"
)
readr::write_csv(deg_summary, "results/deg_count_summary_airway.csv")
deg_summary %>% knitr::kable()
| filter | method | n_tested | n_pvalue_na | n_padj_na | n_de |
|---|---|---|---|---|---|
| no_filter | DESeq2 | 63677 | 30208 | 46895 | 4028 |
| no_filter | edgeR_QLF | 63677 | 0 | 0 | 3704 |
| no_filter | limma_voom | 63677 | 0 | 0 | 6723 |
| total_count_ge_10 | DESeq2 | 22369 | 0 | 5204 | 4030 |
| total_count_ge_10 | edgeR_QLF | 22369 | 0 | 0 | 5088 |
| total_count_ge_10 | limma_voom | 22369 | 0 | 0 | 5108 |
| cpm_ge_0_5_in_ge_2 | DESeq2 | 16730 | 0 | 0 | 4122 |
| cpm_ge_0_5_in_ge_2 | edgeR_QLF | 16730 | 0 | 0 | 5119 |
| cpm_ge_0_5_in_ge_2 | limma_voom | 16730 | 0 | 0 | 5161 |
| cpm_ge_1_in_ge_half | DESeq2 | 14332 | 0 | 0 | 3994 |
| cpm_ge_1_in_ge_half | edgeR_QLF | 14332 | 0 | 0 | 4764 |
| cpm_ge_1_in_ge_half | limma_voom | 14332 | 0 | 0 | 4847 |
| edgeR_filterByExpr | DESeq2 | 16860 | 0 | 0 | 4113 |
| edgeR_filterByExpr | edgeR_QLF | 16860 | 0 | 0 | 5127 |
| edgeR_filterByExpr | limma_voom | 16860 | 0 | 0 | 5170 |
| adaptive_limma_voom_selected | DESeq2 | 18086 | 0 | 702 | 4062 |
| adaptive_limma_voom_selected | edgeR_QLF | 18086 | 0 | 0 | 5200 |
| adaptive_limma_voom_selected | limma_voom | 18086 | 0 | 0 | 5211 |
p_deg_counts <- ggplot2::ggplot(
deg_summary,
ggplot2::aes(x = filter, y = n_de, fill = method)
) +
ggplot2::geom_col(position = "dodge") +
ggplot2::theme_bw() +
ggplot2::labs(
title = "Number of DE genes by filtering strategy and DE method",
x = "Filtering strategy",
y = paste0("Number of DE genes, FDR < ", alpha),
fill = "DE method"
) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))
p_deg_counts
ggplot2::ggsave(
"figures/deg_counts_by_filter_and_method_airway.png",
p_deg_counts,
width = 11,
height = 6,
dpi = 300
)
p_pvalue <- de_results_long %>%
dplyr::filter(!is.na(pvalue)) %>%
ggplot2::ggplot(ggplot2::aes(x = pvalue)) +
ggplot2::geom_histogram(bins = 40, boundary = 0) +
ggplot2::facet_grid(method ~ filter) +
ggplot2::theme_bw() +
ggplot2::labs(
title = "P-value distributions by filtering strategy and DE method",
x = "Raw p-value",
y = "Number of genes"
) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))
p_pvalue
ggplot2::ggsave(
"figures/pvalue_distributions_airway.png",
p_pvalue,
width = 14,
height = 8,
dpi = 300
)
deseq_filter_diagnostics_df %>% knitr::kable()
| filter | upstream_retained_genes | deseq_filter_threshold | n_padj_na | n_pvalue_na | n_de |
|---|---|---|---|---|---|
| no_filter | 63677 | 7.163845 | 46895 | 30208 | 4028 |
| total_count_ge_10 | 22369 | 6.136364 | 5204 | 0 | 4030 |
| cpm_ge_0_5_in_ge_2 | 16730 | 3.612137 | 0 | 0 | 4122 |
| cpm_ge_1_in_ge_half | 14332 | 14.594640 | 0 | 0 | 3994 |
| edgeR_filterByExpr | 16860 | 3.611121 | 0 | 0 | 4113 |
| adaptive_limma_voom_selected | 18086 | 5.441465 | 702 | 0 | 4062 |
p_deseq_na <- ggplot2::ggplot(
deseq_filter_diagnostics_df,
ggplot2::aes(x = filter, y = n_padj_na)
) +
ggplot2::geom_col() +
ggplot2::theme_bw() +
ggplot2::labs(
title = "DESeq2 genes with NA adjusted p-values",
subtitle = "NA padj values are one way to inspect the effect of DESeq2 independent filtering",
x = "Upstream filtering strategy",
y = "Number of genes with NA padj"
) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))
p_deseq_na
ggplot2::ggsave(
"figures/deseq2_na_padj_by_filter_airway.png",
p_deseq_na,
width = 10,
height = 5,
dpi = 300
)
get_deg_set <- function(results_df, filter_name, method_name) {
results_df %>%
dplyr::filter(
filter == filter_name,
method == method_name,
is_de
) %>%
dplyr::pull(gene_id) %>%
unique()
}
compute_pairwise_overlap <- function(results_df, filters, methods) {
output <- list()
counter <- 1
for (filter_name in filters) {
for (method_1 in methods) {
for (method_2 in methods) {
genes_1 <- get_deg_set(results_df, filter_name, method_1)
genes_2 <- get_deg_set(results_df, filter_name, method_2)
intersection_size <- length(intersect(genes_1, genes_2))
union_size <- length(union(genes_1, genes_2))
jaccard <- ifelse(union_size == 0, NA_real_, intersection_size / union_size)
output[[counter]] <- tibble::tibble(
filter = filter_name,
method_1 = method_1,
method_2 = method_2,
n_method_1 = length(genes_1),
n_method_2 = length(genes_2),
intersection = intersection_size,
union = union_size,
jaccard = jaccard
)
counter <- counter + 1
}
}
}
dplyr::bind_rows(output)
}
pairwise_overlap <- compute_pairwise_overlap(
de_results_long,
filters = names(filter_masks),
methods = methods
)
readr::write_csv(pairwise_overlap, "results/pairwise_overlap_airway.csv")
pairwise_overlap %>% knitr::kable()
| filter | method_1 | method_2 | n_method_1 | n_method_2 | intersection | union | jaccard |
|---|---|---|---|---|---|---|---|
| no_filter | DESeq2 | DESeq2 | 4028 | 4028 | 4028 | 4028 | 1.0000000 |
| no_filter | DESeq2 | edgeR_QLF | 4028 | 3704 | 3405 | 4327 | 0.7869193 |
| no_filter | DESeq2 | limma_voom | 4028 | 6723 | 4027 | 6724 | 0.5988995 |
| no_filter | edgeR_QLF | DESeq2 | 3704 | 4028 | 3405 | 4327 | 0.7869193 |
| no_filter | edgeR_QLF | edgeR_QLF | 3704 | 3704 | 3704 | 3704 | 1.0000000 |
| no_filter | edgeR_QLF | limma_voom | 3704 | 6723 | 3699 | 6728 | 0.5497919 |
| no_filter | limma_voom | DESeq2 | 6723 | 4028 | 4027 | 6724 | 0.5988995 |
| no_filter | limma_voom | edgeR_QLF | 6723 | 3704 | 3699 | 6728 | 0.5497919 |
| no_filter | limma_voom | limma_voom | 6723 | 6723 | 6723 | 6723 | 1.0000000 |
| total_count_ge_10 | DESeq2 | DESeq2 | 4030 | 4030 | 4030 | 4030 | 1.0000000 |
| total_count_ge_10 | DESeq2 | edgeR_QLF | 4030 | 5088 | 3985 | 5133 | 0.7763491 |
| total_count_ge_10 | DESeq2 | limma_voom | 4030 | 5108 | 4017 | 5121 | 0.7844171 |
| total_count_ge_10 | edgeR_QLF | DESeq2 | 5088 | 4030 | 3985 | 5133 | 0.7763491 |
| total_count_ge_10 | edgeR_QLF | edgeR_QLF | 5088 | 5088 | 5088 | 5088 | 1.0000000 |
| total_count_ge_10 | edgeR_QLF | limma_voom | 5088 | 5108 | 4920 | 5276 | 0.9325246 |
| total_count_ge_10 | limma_voom | DESeq2 | 5108 | 4030 | 4017 | 5121 | 0.7844171 |
| total_count_ge_10 | limma_voom | edgeR_QLF | 5108 | 5088 | 4920 | 5276 | 0.9325246 |
| total_count_ge_10 | limma_voom | limma_voom | 5108 | 5108 | 5108 | 5108 | 1.0000000 |
| cpm_ge_0_5_in_ge_2 | DESeq2 | DESeq2 | 4122 | 4122 | 4122 | 4122 | 1.0000000 |
| cpm_ge_0_5_in_ge_2 | DESeq2 | edgeR_QLF | 4122 | 5119 | 4074 | 5167 | 0.7884653 |
| cpm_ge_0_5_in_ge_2 | DESeq2 | limma_voom | 4122 | 5161 | 4102 | 5181 | 0.7917390 |
| cpm_ge_0_5_in_ge_2 | edgeR_QLF | DESeq2 | 5119 | 4122 | 4074 | 5167 | 0.7884653 |
| cpm_ge_0_5_in_ge_2 | edgeR_QLF | edgeR_QLF | 5119 | 5119 | 5119 | 5119 | 1.0000000 |
| cpm_ge_0_5_in_ge_2 | edgeR_QLF | limma_voom | 5119 | 5161 | 5017 | 5263 | 0.9532586 |
| cpm_ge_0_5_in_ge_2 | limma_voom | DESeq2 | 5161 | 4122 | 4102 | 5181 | 0.7917390 |
| cpm_ge_0_5_in_ge_2 | limma_voom | edgeR_QLF | 5161 | 5119 | 5017 | 5263 | 0.9532586 |
| cpm_ge_0_5_in_ge_2 | limma_voom | limma_voom | 5161 | 5161 | 5161 | 5161 | 1.0000000 |
| cpm_ge_1_in_ge_half | DESeq2 | DESeq2 | 3994 | 3994 | 3994 | 3994 | 1.0000000 |
| cpm_ge_1_in_ge_half | DESeq2 | edgeR_QLF | 3994 | 4764 | 3920 | 4838 | 0.8102522 |
| cpm_ge_1_in_ge_half | DESeq2 | limma_voom | 3994 | 4847 | 3951 | 4890 | 0.8079755 |
| cpm_ge_1_in_ge_half | edgeR_QLF | DESeq2 | 4764 | 3994 | 3920 | 4838 | 0.8102522 |
| cpm_ge_1_in_ge_half | edgeR_QLF | edgeR_QLF | 4764 | 4764 | 4764 | 4764 | 1.0000000 |
| cpm_ge_1_in_ge_half | edgeR_QLF | limma_voom | 4764 | 4847 | 4718 | 4893 | 0.9642346 |
| cpm_ge_1_in_ge_half | limma_voom | DESeq2 | 4847 | 3994 | 3951 | 4890 | 0.8079755 |
| cpm_ge_1_in_ge_half | limma_voom | edgeR_QLF | 4847 | 4764 | 4718 | 4893 | 0.9642346 |
| cpm_ge_1_in_ge_half | limma_voom | limma_voom | 4847 | 4847 | 4847 | 4847 | 1.0000000 |
| edgeR_filterByExpr | DESeq2 | DESeq2 | 4113 | 4113 | 4113 | 4113 | 1.0000000 |
| edgeR_filterByExpr | DESeq2 | edgeR_QLF | 4113 | 5127 | 4065 | 5175 | 0.7855072 |
| edgeR_filterByExpr | DESeq2 | limma_voom | 4113 | 5170 | 4093 | 5190 | 0.7886320 |
| edgeR_filterByExpr | edgeR_QLF | DESeq2 | 5127 | 4113 | 4065 | 5175 | 0.7855072 |
| edgeR_filterByExpr | edgeR_QLF | edgeR_QLF | 5127 | 5127 | 5127 | 5127 | 1.0000000 |
| edgeR_filterByExpr | edgeR_QLF | limma_voom | 5127 | 5170 | 5022 | 5275 | 0.9520379 |
| edgeR_filterByExpr | limma_voom | DESeq2 | 5170 | 4113 | 4093 | 5190 | 0.7886320 |
| edgeR_filterByExpr | limma_voom | edgeR_QLF | 5170 | 5127 | 5022 | 5275 | 0.9520379 |
| edgeR_filterByExpr | limma_voom | limma_voom | 5170 | 5170 | 5170 | 5170 | 1.0000000 |
| adaptive_limma_voom_selected | DESeq2 | DESeq2 | 4062 | 4062 | 4062 | 4062 | 1.0000000 |
| adaptive_limma_voom_selected | DESeq2 | edgeR_QLF | 4062 | 5200 | 4020 | 5242 | 0.7668829 |
| adaptive_limma_voom_selected | DESeq2 | limma_voom | 4062 | 5211 | 4044 | 5229 | 0.7733792 |
| adaptive_limma_voom_selected | edgeR_QLF | DESeq2 | 5200 | 4062 | 4020 | 5242 | 0.7668829 |
| adaptive_limma_voom_selected | edgeR_QLF | edgeR_QLF | 5200 | 5200 | 5200 | 5200 | 1.0000000 |
| adaptive_limma_voom_selected | edgeR_QLF | limma_voom | 5200 | 5211 | 5047 | 5364 | 0.9409023 |
| adaptive_limma_voom_selected | limma_voom | DESeq2 | 5211 | 4062 | 4044 | 5229 | 0.7733792 |
| adaptive_limma_voom_selected | limma_voom | edgeR_QLF | 5211 | 5200 | 5047 | 5364 | 0.9409023 |
| adaptive_limma_voom_selected | limma_voom | limma_voom | 5211 | 5211 | 5211 | 5211 | 1.0000000 |
p_overlap <- ggplot2::ggplot(
pairwise_overlap,
ggplot2::aes(x = method_1, y = method_2, fill = jaccard)
) +
ggplot2::geom_tile() +
ggplot2::geom_text(ggplot2::aes(label = round(jaccard, 2)), size = 3) +
ggplot2::facet_wrap(~ filter) +
ggplot2::theme_bw() +
ggplot2::labs(
title = "Pairwise Jaccard overlap of DE gene sets",
x = "Method 1",
y = "Method 2",
fill = "Jaccard"
) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 35, hjust = 1))
p_overlap
ggplot2::ggsave(
"figures/pairwise_jaccard_overlap_airway.png",
p_overlap,
width = 13,
height = 8,
dpi = 300
)
for (filter_name in names(filter_masks)) {
gene_sets <- list()
for (method_name in methods) {
gene_sets[[method_name]] <- get_deg_set(de_results_long, filter_name, method_name)
}
total_de_genes <- length(unique(unlist(gene_sets)))
if (total_de_genes == 0) {
message("Skipping UpSet plot for ", filter_name, ": no DE genes.")
next
}
upset_input <- UpSetR::fromList(gene_sets)
pdf(
file = file.path("figures/upset", paste0("upset_", filter_name, "_airway.pdf")),
width = 8,
height = 6
)
UpSetR::upset(
upset_input,
nsets = length(methods),
order.by = "freq",
mainbar.y.label = "DE gene intersection size",
sets.x.label = "DE genes per method"
)
dev.off()
}
lfc_wide <- de_results_long %>%
dplyr::select(filter, method, gene_id, log2FoldChange) %>%
tidyr::pivot_wider(
names_from = method,
values_from = log2FoldChange
)
readr::write_csv(lfc_wide, "results/log2fc_wide_airway.csv")
p_lfc_deseq_edger <- ggplot2::ggplot(
lfc_wide,
ggplot2::aes(x = DESeq2, y = edgeR_QLF)
) +
ggplot2::geom_point(alpha = 0.25, size = 0.7) +
ggplot2::facet_wrap(~ filter) +
ggplot2::theme_bw() +
ggplot2::labs(
title = "Log2 fold-change agreement: DESeq2 vs edgeR-QLF",
x = "DESeq2 log2FC",
y = "edgeR-QLF log2FC"
)
p_lfc_deseq_edger
ggplot2::ggsave(
"figures/log2fc_deseq2_vs_edger_airway.png",
p_lfc_deseq_edger,
width = 13,
height = 8,
dpi = 300
)
p_lfc_deseq_limma <- ggplot2::ggplot(
lfc_wide,
ggplot2::aes(x = DESeq2, y = limma_voom)
) +
ggplot2::geom_point(alpha = 0.25, size = 0.7) +
ggplot2::facet_wrap(~ filter) +
ggplot2::theme_bw() +
ggplot2::labs(
title = "Log2 fold-change agreement: DESeq2 vs limma-voom",
x = "DESeq2 log2FC",
y = "limma-voom log2FC"
)
p_lfc_deseq_limma
ggplot2::ggsave(
"figures/log2fc_deseq2_vs_limma_airway.png",
p_lfc_deseq_limma,
width = 13,
height = 8,
dpi = 300
)
This section is disabled by default. To run it, set this in the YAML header:
params:
run_go: true
go_pkgs <- c("clusterProfiler", "org.Hs.eg.db")
missing_go_pkgs <- go_pkgs[
!vapply(go_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_go_pkgs) > 0) {
stop(
"Missing GO enrichment packages: ", paste(missing_go_pkgs, collapse = ", "),
"\nInstall with BiocManager::install(c('clusterProfiler', 'org.Hs.eg.db'))."
)
}
suppressPackageStartupMessages({
library(clusterProfiler)
library(org.Hs.eg.db)
})
map_ensembl_to_entrez <- function(ensembl_ids) {
mapped <- AnnotationDbi::mapIds(
org.Hs.eg.db::org.Hs.eg.db,
keys = unique(ensembl_ids),
keytype = "ENSEMBL",
column = "ENTREZID",
multiVals = "first"
)
stats::na.omit(as.character(mapped))
}
run_go_for_deg_set <- function(de_ensembl_ids, universe_ensembl_ids) {
entrez_de <- map_ensembl_to_entrez(de_ensembl_ids)
entrez_universe <- map_ensembl_to_entrez(universe_ensembl_ids)
if (length(entrez_de) < 10) {
return(NULL)
}
clusterProfiler::enrichGO(
gene = entrez_de,
universe = entrez_universe,
OrgDb = org.Hs.eg.db::org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.2,
readable = TRUE
)
}
go_results <- list()
for (filter_name in names(filter_masks)) {
universe_ids <- rownames(counts_raw)[filter_masks[[filter_name]]]
for (method_name in methods) {
de_ids <- de_results_long %>%
dplyr::filter(filter == filter_name, method == method_name, is_de) %>%
dplyr::pull(ensembl_id) %>%
unique()
ego <- run_go_for_deg_set(de_ids, universe_ids)
analysis_id <- paste(filter_name, method_name, sep = "__")
go_results[[analysis_id]] <- ego
if (!is.null(ego)) {
ego_df <- as.data.frame(ego)
readr::write_csv(
ego_df,
file.path("results", paste0("GO_", analysis_id, "_airway.csv"))
)
}
}
}
# Summarize top GO terms.
go_top_terms <- list()
for (analysis_id in names(go_results)) {
ego <- go_results[[analysis_id]]
if (is.null(ego)) next
ego_df <- as.data.frame(ego)
if (nrow(ego_df) == 0) next
pieces <- strsplit(analysis_id, "__")[[1]]
go_top_terms[[analysis_id]] <- ego_df %>%
dplyr::slice_head(n = 10) %>%
dplyr::mutate(
filter = pieces[1],
method = pieces[2]
) %>%
dplyr::select(filter, method, ID, Description, p.adjust, Count)
}
go_top_terms_df <- dplyr::bind_rows(go_top_terms)
readr::write_csv(go_top_terms_df, "results/top_go_terms_airway.csv")
go_top_terms_df %>% knitr::kable()
Use these outputs to answer the main project question:
NA adjusted p-values?sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] UpSetR_1.4.0 readr_2.2.0
## [3] tibble_3.3.1 tidyr_1.3.2
## [5] dplyr_1.2.1 ggplot2_4.0.3
## [7] edgeR_4.6.3 limma_3.64.3
## [9] DESeq2_1.48.2 airway_1.28.0
## [11] SummarizedExperiment_1.38.1 Biobase_2.68.0
## [13] GenomicRanges_1.60.0 GenomeInfoDb_1.44.3
## [15] IRanges_2.42.0 S4Vectors_0.46.0
## [17] BiocGenerics_0.54.1 generics_0.1.4
## [19] MatrixGenerics_1.20.0 matrixStats_1.5.0
## [21] BiocManager_1.30.27
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.57 bslib_0.10.0
## [4] lattice_0.22-9 tzdb_0.5.0 vctrs_0.7.3
## [7] tools_4.5.1 parallel_4.5.1 pkgconfig_2.0.3
## [10] Matrix_1.7-5 RColorBrewer_1.1-3 S7_0.2.2
## [13] lifecycle_1.0.5 GenomeInfoDbData_1.2.14 compiler_4.5.1
## [16] farver_2.1.2 textshaping_1.0.5 statmod_1.5.1
## [19] codetools_0.2-20 htmltools_0.5.9 sass_0.4.10
## [22] yaml_2.3.12 pillar_1.11.1 crayon_1.5.3
## [25] jquerylib_0.1.4 BiocParallel_1.42.2 DelayedArray_0.34.1
## [28] cachem_1.1.0 abind_1.4-8 tidyselect_1.2.1
## [31] locfit_1.5-9.12 digest_0.6.39 purrr_1.2.2
## [34] labeling_0.4.3 fastmap_1.2.0 grid_4.5.1
## [37] cli_3.6.6 SparseArray_1.8.1 magrittr_2.0.5
## [40] S4Arrays_1.8.1 withr_3.0.2 scales_1.4.0
## [43] UCSC.utils_1.4.0 bit64_4.8.0 rmarkdown_2.31
## [46] XVector_0.48.0 httr_1.4.8 bit_4.6.0
## [49] otel_0.2.0 gridExtra_2.3 ragg_1.5.2
## [52] hms_1.1.4 evaluate_1.0.5 knitr_1.51
## [55] rlang_1.2.0 Rcpp_1.1.1-1.1 glue_1.8.1
## [58] vroom_1.7.1 rstudioapi_0.18.0 jsonlite_2.0.0
## [61] plyr_1.8.9 R6_2.6.1 systemfonts_1.3.2