Part 1: Environment Setup

1.1 Set up library path

.libPaths("/courses/BINF6430.202630/students/tran.nhi2/module06_QC_and_Quantification/renv/library/linux-ubuntu-noble/R-4.5/x86_64-pc-linux-gnu/")
cat("Library path set.\n")
## Library path set.

1.2 Load libraries

suppressPackageStartupMessages({
  library(tidyverse)
  library(DESeq2)
  library(DEGreport)
  library(org.Hs.eg.db)
  library(clusterProfiler)
  library(pathview)
  library(ggplot2)
  library(pheatmap)
  library(ggrepel)
  library(cowplot)
  library(enrichplot)
  library(ggridges)
  library(DOSE)
})
cat("\033[32mAll libraries loaded successfully.\033[0m\n")
## All libraries loaded successfully.

1.3 Load Project 07 workspace

p07_path <- "/courses/BINF6430.202630/students/tran.nhi2/module06_QC_and_Quantification/project07.RData"

if (file.exists(p07_path)) {
  load(p07_path)
  cat("\033[32mProject 07 workspace loaded successfully.\033[0m\n")
} else {
  stop(paste("File not found:", p07_path))
}
## Project 07 workspace loaded successfully.
# Add threshold_OE and genelabels if not present
if (!"threshold_OE" %in% colnames(res_bmat_tb)) {
  res_bmat_tb <- res_bmat_tb %>%
    mutate(
      threshold_OE = !is.na(padj) & padj < 0.1 & abs(log2FoldChange) >= 1,
      genelabels   = ifelse(threshold_OE, gene, "")
    )
}
if (!"threshold_OE" %in% colnames(res_mm_tb)) {
  res_mm_tb <- res_mm_tb %>%
    mutate(
      threshold_OE = !is.na(padj) & padj < 0.1 & abs(log2FoldChange) >= 1,
      genelabels   = ifelse(threshold_OE, gene, "")
    )
}
cat("\033[32mthreshold_OE and genelabels columns verified.\033[0m\n")
## threshold_OE and genelabels columns verified.

Part 2: Reviewing the Data

2.1 Summary of DE results

We use alpha = 0.1 because this is an underpowered pilot study. A relaxed p-value threshold allows us to include genes that lean toward statistical significance without demanding the power of a fully-replicated experiment.

cat("=== BMAT Summary ===\n")
## === BMAT Summary ===
summary(res_bmat, alpha = 0.1)
## 
## out of 23567 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 40, 0.17%
## LFC < 0 (down)     : 10, 0.042%
## outliers [1]       : 0, 0%
## low counts [2]     : 5891, 25%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
cat("\n=== MM Summary ===\n")
## 
## === MM Summary ===
summary(res_mm, alpha = 0.1)
## 
## out of 22474 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 29, 0.13%
## LFC < 0 (down)     : 38, 0.17%
## outliers [1]       : 0, 0%
## low counts [2]     : 16337, 73%
## (mean count < 196)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

2.2 Build DE gene tables with ID mapping

We filter by padj < 0.1 and |log2FoldChange| >= 1. We map ENSEMBL IDs to gene symbols (for readability) and Entrez IDs (required by KEGG and Disease Ontology).

# ── BMAT ─────────────────────────────────────────────────────────────────────
DE_bmat <- res_bmat_tb %>%
  filter(padj < 0.1 & abs(log2FoldChange) >= 1)

DE_bmat$symbol <- mapIds(org.Hs.eg.db, keys = DE_bmat$gene,
  column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first")
DE_bmat$entrez <- mapIds(org.Hs.eg.db, keys = DE_bmat$gene,
  column = "ENTREZID", keytype = "ENSEMBL", multiVals = "first")

write.csv(DE_bmat, "DE_BMAT.csv", row.names = TRUE)
cat(sprintf("BMAT: %d DE genes after filtering\n", nrow(DE_bmat)))
## BMAT: 14 DE genes after filtering
knitr::kable(head(DE_bmat), caption = "Top BMAT DE Genes")
Top BMAT DE Genes
gene baseMean log2FoldChange lfcSE pvalue padj threshold_OE symbol genelabels entrez
ENSG00000273768 81.53610 -10.838511 3.0497175 0e+00 0.0000000 TRUE LOC124904613 RNVU1-29 124904613
ENSG00000177084 365.08065 1.685195 0.2540747 0e+00 0.0000000 TRUE POLE POLE 5426
ENSG00000170522 1658.62441 1.138227 0.1770923 0e+00 0.0000000 TRUE ELOVL6 ELOVL6 79071
ENSG00000174697 267.24841 1.479131 0.2847458 0e+00 0.0000264 TRUE LEP LEP 3952
ENSG00000260914 22.85321 8.456502 2.8325525 0e+00 0.0001048 TRUE NA NA
ENSG00000159307 250.30474 1.163067 0.2696740 7e-07 0.0014518 TRUE SCUBE1 SCUBE1 80274
# ── MM ───────────────────────────────────────────────────────────────────────
DE_mm <- res_mm_tb %>%
  filter(padj < 0.1 & abs(log2FoldChange) >= 1)

DE_mm$symbol <- mapIds(org.Hs.eg.db, keys = DE_mm$gene,
  column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first")
DE_mm$entrez <- mapIds(org.Hs.eg.db, keys = DE_mm$gene,
  column = "ENTREZID", keytype = "ENSEMBL", multiVals = "first")

write.csv(DE_mm, "DE_MM.csv", row.names = TRUE)
cat(sprintf("MM: %d DE genes after filtering\n", nrow(DE_mm)))
## MM: 11 DE genes after filtering
knitr::kable(head(DE_mm), caption = "Top MM DE Genes")
Top MM DE Genes
gene baseMean log2FoldChange lfcSE pvalue padj threshold_OE symbol genelabels entrez
ENSG00000168209 1212.6055 1.962096 0.2663245 0e+00 0.0000000 TRUE DDIT4 DDIT4 54541
ENSG00000184557 220.5533 3.982983 0.6178745 0e+00 0.0000000 TRUE SOCS3 SOCS3 9021
ENSG00000157514 1170.0315 2.115485 0.3442793 0e+00 0.0000000 TRUE TSC22D3 TSC22D3 1831
ENSG00000173821 2025.1851 1.303084 0.2124927 0e+00 0.0000000 TRUE RNF213 RNF213 57674
ENSG00000138496 260.1226 2.973312 0.6607364 0e+00 0.0000230 TRUE PARP9 PARP9 83666
ENSG00000134107 437.9017 1.385086 0.3069167 2e-07 0.0001682 TRUE BHLHE40 BHLHE40 8553

Part 3: Sanity Checks

3.1 Sample correlation heatmaps

These heatmaps verify that samples cluster as expected. Co-cultured samples should group separately from alone samples. A clean 1:1 diagonal confirms each sample correlates perfectly with itself.

# ── BMAT ─────────────────────────────────────────────────────────────────────
vst_mat_bmat <- assay(vsd_bmat)
vst_cor_bmat <- cor(vst_mat_bmat)
bmat_annotation <- data.frame(
  row.names   = colnames(vst_mat_bmat),
  co_cultured = s2c_bmat$co_cultured
)
pheatmap(vst_cor_bmat, annotation_col = bmat_annotation,
         main = "BMAT Sample Correlation",
         show_rownames = FALSE, show_colnames = FALSE)

# ── MM ───────────────────────────────────────────────────────────────────────
vst_mat_mm <- assay(vsd_mm)
vst_cor_mm <- cor(vst_mat_mm)
mm_annotation <- data.frame(
  row.names   = colnames(vst_mat_mm),
  co_cultured = s2c_mm$co_cultured
)
pheatmap(vst_cor_mm, annotation_col = mm_annotation,
         main = "MM Sample Correlation",
         show_rownames = FALSE, show_colnames = FALSE)

3.2 Volcano plots

degVolcano(
  data.frame(res_bmat_tb[, c("log2FoldChange", "padj")]),
  plot_text = data.frame(
    res_bmat_tb[res_bmat_tb$threshold_OE == TRUE,
                c("log2FoldChange", "padj", "genelabels")])
)

degVolcano(
  data.frame(res_mm_tb[, c("log2FoldChange", "padj")]),
  plot_text = data.frame(
    res_mm_tb[res_mm_tb$threshold_OE == TRUE,
              c("log2FoldChange", "padj", "genelabels")])
)

3.3 MA plots

An MA plot shows log2 fold change (M) on the y-axis versus mean normalized expression (A) on the x-axis. Genes near M = 0 are similarly expressed between conditions. Genes above are upregulated in co-cultured cells; below are downregulated.

# Build DEGSets from the original split model dds objects
res_bmat_degset <- degComps(dds_bmat, contrast = list("co_cultured_TRUE_vs_FALSE"))
res_mm_degset   <- degComps(dds_mm,   contrast = list("co_cultured_TRUE_vs_FALSE"))

# Basic MA plots for BMAT and MM
degMA(res_bmat_degset, raw = TRUE, diff = 1)

degMA(res_mm_degset,   raw = TRUE, diff = 1)

Question 1: How do you interpret the MA plots? What do the colors mean?

The x-axis (A) represents mean normalized expression and the y-axis (M) is the log2 fold change between co-cultured and alone conditions. Genes near M = 0 show little difference between conditions. Red/colored points are significantly DE (padj < 0.1, |LFC| ≥ 1), while gray points are not significant. The “fanning” pattern at low expression is expected — lowly expressed genes have higher variance, producing more scatter around zero.

# Side-by-side: standard MA (bottom) vs correlation MA (top)
ma1 <- degMA(res_mm_degset, correlation = TRUE, diff = 1)
ma2 <- degMA(res_mm_degset, diff = 1)
plot_grid(ma2, ma1, align = "v", nrow = 2, ncol = 1)

Question 2: How do you interpret the side-by-side MA plots?

These two-panel plots come from degMA with raw = TRUE (top panel) and correlation = TRUE (bottom panel).

The top panel shows raw unshrunken fold changes vs mean expression. The dramatic vertical spread of blue points at low expression levels represents genes with extreme but unreliable fold change estimates caused by low counts — this illustrates exactly why shrinkage is necessary. Red points are significant genes.

The bottom panel plots the shrunken fold change (y-axis) against the unshrunken fold change (x-axis). The tight diagonal relationship for moderately and highly expressed genes shows that shrinkage changes these estimates only slightly. However, genes with very large unshrunken fold changes get pulled dramatically toward zero by apeglm — this is the shrinkage doing its job, removing noise from lowly expressed genes. The red significant genes cluster near the center-right diagonal, confirming that our DE calls come from genes where shrunken and unshrunken estimates agree reasonably well, indicating reliable results.


Part 4: Gene Set Enrichment Analysis

4.1 Prepare ranked gene lists

# ── BMAT: ENSEMBL ─────────────────────────────────────────────────────────────
original_gene_list_bmat        <- DE_bmat$log2FoldChange
names(original_gene_list_bmat) <- DE_bmat$gene
gene_list_bmat <- sort(na.omit(original_gene_list_bmat), decreasing = TRUE)

# ── BMAT: Entrez ──────────────────────────────────────────────────────────────
original_gene_list_bmat_entrez        <- DE_bmat$log2FoldChange
names(original_gene_list_bmat_entrez) <- DE_bmat$entrez
gene_list_bmat_entrez <- sort(na.omit(original_gene_list_bmat_entrez), decreasing = TRUE)

# ── MM: ENSEMBL ───────────────────────────────────────────────────────────────
original_gene_list_mm        <- DE_mm$log2FoldChange
names(original_gene_list_mm) <- DE_mm$gene
gene_list_mm <- sort(na.omit(original_gene_list_mm), decreasing = TRUE)

# ── MM: Entrez ────────────────────────────────────────────────────────────────
original_gene_list_mm_entrez        <- DE_mm$log2FoldChange
names(original_gene_list_mm_entrez) <- DE_mm$entrez
gene_list_mm_entrez <- sort(na.omit(original_gene_list_mm_entrez), decreasing = TRUE)

cat("BMAT gene list length:", length(gene_list_bmat), "\n")
## BMAT gene list length: 14
cat("MM gene list length:",   length(gene_list_mm),   "\n")
## MM gene list length: 11

4.2 Gene Ontology (GO) Analysis

# ── BMAT ORA ──────────────────────────────────────────────────────────────────
gse_bmat_enrich <- tryCatch(
  enrichGO(names(gene_list_bmat), ont = "BP", keyType = "ENSEMBL",
    minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.1,
    OrgDb = org.Hs.eg.db, pAdjustMethod = "none"),
  error = function(e) { cat("BMAT enrichGO skipped:", e$message, "\n"); NULL }
)

# ── BMAT GSEA ─────────────────────────────────────────────────────────────────
gse_bmat <- tryCatch(
  gseGO(geneList = gene_list_bmat, ont = "BP", keyType = "ENSEMBL",
    minGSSize = 3, maxGSSize = 800, pvalueCutoff = 1,
    verbose = TRUE, OrgDb = org.Hs.eg.db, pAdjustMethod = "none"),
  error = function(e) { cat("BMAT gseGO skipped:", e$message, "\n"); NULL }
)
gse_bmat_pairwise <- if (!is.null(gse_bmat)) pairwise_termsim(gse_bmat) else NULL

# ── MM ORA ────────────────────────────────────────────────────────────────────
gse_mm_enrich <- tryCatch(
  enrichGO(names(gene_list_mm), ont = "BP", keyType = "ENSEMBL",
    minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.1,
    OrgDb = org.Hs.eg.db, pAdjustMethod = "none"),
  error = function(e) { cat("MM enrichGO skipped:", e$message, "\n"); NULL }
)

# ── MM GSEA ───────────────────────────────────────────────────────────────────
gse_mm <- tryCatch(
  gseGO(geneList = gene_list_mm, ont = "BP", keyType = "ENSEMBL",
    minGSSize = 3, maxGSSize = 800, pvalueCutoff = 1,
    verbose = TRUE, OrgDb = org.Hs.eg.db, pAdjustMethod = "none"),
  error = function(e) { cat("MM gseGO skipped:", e$message, "\n"); NULL }
)
gse_mm_pairwise <- if (!is.null(gse_mm)) pairwise_termsim(gse_mm) else NULL

4.3 Review GO result summaries

gse_bmat_enrich
## #
## # over-representation test
## #
## #...@organism     Homo sapiens 
## #...@ontology     BP 
## #...@keytype      ENSEMBL 
## #...@gene     chr [1:14] "ENSG00000260914" "ENSG00000289685" "ENSG00000103426" ...
## #...pvalues adjusted by 'none' with cutoff <0.1 
## #...515 enriched terms found
## 'data.frame':    515 obs. of  12 variables:
##  $ ID            : chr  "GO:0006631" "GO:0044283" "GO:0032787" "GO:0097305" ...
##  $ Description   : chr  "fatty acid metabolic process" "small molecule biosynthetic process" "monocarboxylic acid metabolic process" "response to alcohol" ...
##  $ GeneRatio     : chr  "4/9" "4/9" "4/9" "3/9" ...
##  $ BgRatio       : chr  "456/21338" "689/21338" "691/21338" "311/21338" ...
##  $ RichFactor    : num  0.00877 0.00581 0.00579 0.00965 0.0303 ...
##  $ FoldEnrichment: num  20.8 13.8 13.7 22.9 71.8 ...
##  $ zScore        : num  8.78 7 6.98 7.98 11.84 ...
##  $ pvalue        : num  2.38e-05 1.19e-04 1.21e-04 2.41e-04 3.34e-04 ...
##  $ p.adjust      : num  2.38e-05 1.19e-04 1.21e-04 2.41e-04 3.34e-04 ...
##  $ qvalue        : num  0.00975 0.01647 0.01647 0.0209 0.0209 ...
##  $ geneID        : chr  "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000012048" "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000012048" "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000012048" "ENSG00000174697/ENSG00000164742/ENSG00000012048" ...
##  $ Count         : int  4 4 4 3 2 2 3 3 2 2 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320
gse_bmat
## #
## # Gene Set Enrichment Analysis
## #
## #...@organism     Homo sapiens 
## #...@setType      BP 
## #...@keytype      ENSEMBL 
## #...@geneList     Named num [1:14] 8.46 6.81 6.72 3.41 3.3 ...
##  - attr(*, "names")= chr [1:14] "ENSG00000260914" "ENSG00000289685" "ENSG00000103426" "ENSG00000293495" ...
## #...nPerm     
## #...pvalues adjusted by 'none' with cutoff <1 
## #...95 enriched terms found
## 'data.frame':    95 obs. of  11 variables:
##  $ ID             : chr  "GO:0008150" "GO:0009987" "GO:0008152" "GO:0044238" ...
##  $ Description    : chr  "biological_process" "cellular process" "metabolic process" "primary metabolic process" ...
##  $ setSize        : int  9 9 7 7 6 6 8 8 3 3 ...
##  $ enrichmentScore: num  -0.8 -0.8 -0.714 -0.714 -0.75 ...
##  $ NES            : num  -1.78 -1.78 -1.5 -1.5 -1.5 ...
##  $ pvalue         : num  0.0143 0.0143 0.043 0.043 0.0558 ...
##  $ p.adjust       : num  0.0143 0.0143 0.043 0.043 0.0558 ...
##  $ qvalue         : num  0.558 0.558 0.558 0.558 0.558 ...
##  $ rank           : num  11 11 10 10 9 9 11 11 5 5 ...
##  $ leading_edge   : chr  "tags=100%, list=79%, signal=60%" "tags=100%, list=79%, signal=60%" "tags=100%, list=71%, signal=57%" "tags=100%, list=71%, signal=57%" ...
##  $ core_enrichment: chr  "ENSG00000177084/ENSG00000174697/ENSG00000159307/ENSG00000170522/ENSG00000099998/ENSG00000164742/ENSG00000012048"| __truncated__ "ENSG00000177084/ENSG00000174697/ENSG00000159307/ENSG00000170522/ENSG00000099998/ENSG00000164742/ENSG00000012048"| __truncated__ "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000164742/ENSG00000012048/ENSG00000132207" "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000164742/ENSG00000012048/ENSG00000132207" ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320
gse_mm_enrich
## #
## # over-representation test
## #
## #...@organism     Homo sapiens 
## #...@ontology     BP 
## #...@keytype      ENSEMBL 
## #...@gene     chr [1:11] "ENSG00000184557" "ENSG00000224389" "ENSG00000138496" ...
## #...pvalues adjusted by 'none' with cutoff <0.1 
## #...547 enriched terms found
## 'data.frame':    547 obs. of  12 variables:
##  $ ID            : chr  "GO:0072539" "GO:0072538" "GO:0042093" "GO:0002294" ...
##  $ Description   : chr  "T-helper 17 cell differentiation" "T-helper 17 type immune response" "T-helper cell differentiation" "CD4-positive, alpha-beta T cell differentiation involved in immune response" ...
##  $ GeneRatio     : chr  "3/11" "3/11" "3/11" "3/11" ...
##  $ BgRatio       : chr  "47/21338" "64/21338" "108/21338" "110/21338" ...
##  $ RichFactor    : num  0.0638 0.0469 0.0278 0.0273 0.027 ...
##  $ FoldEnrichment: num  123.8 90.9 53.9 52.9 52.4 ...
##  $ zScore        : num  19.1 16.4 12.5 12.4 12.3 ...
##  $ pvalue        : num  1.63e-06 4.17e-06 2.02e-05 2.13e-05 2.19e-05 ...
##  $ p.adjust      : num  1.63e-06 4.17e-06 2.02e-05 2.13e-05 2.19e-05 ...
##  $ qvalue        : num  0.000574 0.000734 0.001285 0.001285 0.001285 ...
##  $ geneID        : chr  "ENSG00000184557/ENSG00000171223/ENSG00000168610" "ENSG00000184557/ENSG00000171223/ENSG00000168610" "ENSG00000184557/ENSG00000171223/ENSG00000168610" "ENSG00000184557/ENSG00000171223/ENSG00000168610" ...
##  $ Count         : int  3 3 3 3 3 3 3 3 2 2 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320
gse_mm
## #
## # Gene Set Enrichment Analysis
## #
## #...@organism     Homo sapiens 
## #...@setType      BP 
## #...@keytype      ENSEMBL 
## #...@geneList     Named num [1:11] 3.98 3.44 2.97 2.33 2.12 ...
##  - attr(*, "names")= chr [1:11] "ENSG00000184557" "ENSG00000224389" "ENSG00000138496" "ENSG00000187323" ...
## #...nPerm     
## #...pvalues adjusted by 'none' with cutoff <1 
## #...194 enriched terms found
## 'data.frame':    194 obs. of  11 variables:
##  $ ID             : chr  "GO:0006954" "GO:0035239" "GO:0035295" "GO:0006955" ...
##  $ Description    : chr  "inflammatory response" "tube morphogenesis" "tube development" "immune response" ...
##  $ setSize        : int  3 4 4 5 5 5 4 4 4 3 ...
##  $ enrichmentScore: num  0.879 -1 -1 0.827 0.827 ...
##  $ NES            : num  1.61 -2.21 -2.21 1.63 1.63 ...
##  $ pvalue         : num  0.0112 0.0118 0.0118 0.0302 0.0302 ...
##  $ p.adjust       : num  0.0112 0.0118 0.0118 0.0302 0.0302 ...
##  $ qvalue         : num  0.708 0.708 0.708 0.708 0.708 ...
##  $ rank           : num  2 5 5 3 3 3 2 2 2 5 ...
##  $ leading_edge   : chr  "tags=67%, list=18%, signal=75%" "tags=100%, list=45%, signal=86%" "tags=100%, list=45%, signal=86%" "tags=60%, list=27%, signal=80%" ...
##  $ core_enrichment: chr  "ENSG00000184557/ENSG00000224389" "ENSG00000171223/ENSG00000168610/ENSG00000100628" "ENSG00000171223/ENSG00000168610/ENSG00000100628" "ENSG00000184557/ENSG00000224389/ENSG00000138496" ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320

Question 3: Do we generate meaningful results given our relaxed thresholds?

With 14 DE genes in BMAT and 11 in MM, both lists fall below the recommended minimum of 50 genes for reliable ORA. Results are hypothesis-generating only, not statistically rigorous conclusions.

MM shows biologically coherent signals — immune regulation GO terms, T cell differentiation, and regulation of defense response — consistent with the multiple myeloma co-culture biology. Key DE genes like SOCS3, TSC22D3, and DDIT4 are established players in immune signaling and cancer pathways, lending biological face validity to the results. BMAT results are noisier given fewer DE genes, with DNA replication terms likely reflecting small-sample overlap noise rather than a true biological signal.


Part 5: GO Visualizations

5.1 Dotplots

if (!is.null(gse_bmat_enrich)) {
  print(dotplot(gse_bmat_enrich, showCategory = 10) +
    ggtitle("BMAT: GO BP Over-Representation"))
} else { cat("BMAT enrichGO: no results to plot\n") }

if (!is.null(gse_mm)) {
  print(dotplot(gse_mm, showCategory = 10) +
    ggtitle("MM: GO BP GSEA"))
} else { cat("MM gseGO: no results to plot\n") }

5.2 Enrichment map plots

if (!is.null(gse_bmat_pairwise)) {
  print(emapplot(gse_bmat_pairwise, showCategory = 10))
} else { cat("BMAT emapplot: no results to plot\n") }

if (!is.null(gse_mm_pairwise)) {
  print(emapplot(gse_mm_pairwise, showCategory = 10))
} else { cat("MM emapplot: no results to plot\n") }

5.3 Ridgeplots

if (!is.null(gse_bmat)) {
  print(ridgeplot(gse_bmat) +
    labs(x = "Enrichment distribution", title = "BMAT Ridgeplot") +
    theme(plot.margin = margin(10, 10, 10, 10)))
} else { cat("BMAT ridgeplot: no results to plot\n") }

if (!is.null(gse_mm)) {
  print(ridgeplot(gse_mm) +
    labs(x = "Enrichment distribution", title = "MM Ridgeplot") +
    theme(plot.margin = margin(10, 10, 10, 10)))
} else { cat("MM ridgeplot: no results to plot\n") }

5.4 Classic GSEA enrichment plot

if (!is.null(gse_mm) && nrow(gse_mm) > 0) {
  gseaplot(gse_mm, by = "all", title = gse_mm$Description[1], geneSetID = 1)
} else { cat("MM gseaplot: no results to plot\n") }


Part 6: Disease Ontology

gse_bmat_do <- tryCatch(
  gseDO(gene_list_bmat_entrez, pvalueCutoff = 1),
  error = function(e) { cat("BMAT gseDO skipped:", e$message, "\n"); NULL }
)
## BMAT gseDO skipped: NAs in names(stats) are not allowed
gse_mm_do <- tryCatch(
  gseDO(gene_list_mm_entrez[!duplicated(names(gene_list_mm_entrez))],
        pvalueCutoff = 1),
  error = function(e) { cat("MM gseDO skipped:", e$message, "\n"); NULL }
)
if (!is.null(gse_mm_do) && nrow(gse_mm_do) > 0) {
  print(dotplot(gse_mm_do) + ggtitle("MM: Disease Ontology Enrichment"))
} else { cat("MM DO dotplot: no results to plot\n") }
## MM DO dotplot: no results to plot

Question 4: What do you see in the Disease Ontology dotplot?

The MM Disease Ontology analysis returned no results to plot. This is because our 11-gene MM list is too small to achieve statistically meaningful overlap with Disease Ontology terms even at a fully relaxed pvalueCutoff of 1. The BMAT gseDO was similarly skipped due to NAs in the Entrez ID mapping. While disappointing, this absence is itself informative — it reflects the fundamental power limitation of this pilot dataset. It is not a failure of the biology but a consequence of sample size. The GO and KEGG results already point toward immune and metabolic associations consistent with multiple myeloma biology. In a well-powered study, we would expect Disease Ontology analysis to reveal enrichment for hematologic malignancy, immune dysregulation, and bone marrow disease terms directly relevant to the Reagan lab’s research questions. —

Part 7: KEGG Pathway Analysis

7.1 Run KEGG enrichment

kk_bmat <- tryCatch(
  enrichKEGG(names(gene_list_bmat_entrez), organism = "hsa",
    minGSSize = 3, maxGSSize = 800,
    pvalueCutoff = 1, pAdjustMethod = "BH", keyType = "kegg"),
  error = function(e) { cat("BMAT KEGG skipped:", e$message, "\n"); NULL }
)

kk_mm <- tryCatch(
  enrichKEGG(names(gene_list_mm_entrez), organism = "hsa",
    minGSSize = 3, maxGSSize = 800,
    pvalueCutoff = 1, pAdjustMethod = "BH", keyType = "kegg"),
  error = function(e) { cat("MM KEGG skipped:", e$message, "\n"); NULL }
)

kk_bmat
## #
## # over-representation test
## #
## #...@organism     hsa 
## #...@ontology     KEGG 
## #...@keytype      kegg 
## #...@gene     chr [1:13] NA "100529144" "84798" "3683" "5426" "3952" "80274" "79071" ...
## #...pvalues adjusted by 'BH' with cutoff <1 
## #...75 enriched terms found
## 'data.frame':    75 obs. of  14 variables:
##  $ category      : chr  "Genetic Information Processing" "Metabolism" "Environmental Information Processing" "Environmental Information Processing" ...
##  $ subcategory   : chr  "Replication and repair" "Metabolism of other amino acids" "Signal transduction" "Signaling molecules and interaction" ...
##  $ ID            : chr  "hsa03460" "hsa00430" "hsa04015" "hsa04081" ...
##  $ Description   : chr  "Fanconi anemia pathway" "Taurine and hypotaurine metabolism" "Rap1 signaling pathway" "Hormone signaling" ...
##  $ GeneRatio     : chr  "2/9" "1/9" "2/9" "2/9" ...
##  $ BgRatio       : chr  "54/9370" "17/9370" "212/9370" "219/9370" ...
##  $ RichFactor    : num  0.03704 0.05882 0.00943 0.00913 0.00893 ...
##  $ FoldEnrichment: num  38.56 61.24 9.82 9.51 9.3 ...
##  $ zScore        : num  8.58 7.71 4.03 3.95 3.9 ...
##  $ pvalue        : num  0.00114 0.01622 0.01652 0.01757 0.01834 ...
##  $ p.adjust      : num  0.101 0.183 0.183 0.183 0.183 ...
##  $ qvalue        : num  0.0915 0.166 0.166 0.166 0.166 ...
##  $ geneID        : chr  "672/79008" "2687" "3683/107" "3952/107" ...
##  $ Count         : int  2 1 2 2 2 1 1 1 1 1 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320
kk_mm
## #
## # over-representation test
## #
## #...@organism     hsa 
## #...@ontology     KEGG 
## #...@keytype      kegg 
## #...@gene     chr [1:11] "9021" "721" "83666" "1630" "1831" "54541" "8553" "57674" ...
## #...pvalues adjusted by 'BH' with cutoff <1 
## #...53 enriched terms found
## 'data.frame':    53 obs. of  14 variables:
##  $ category      : chr  "Organismal Systems" "Organismal Systems" "Organismal Systems" "Human Diseases" ...
##  $ subcategory   : chr  "Endocrine system" "Endocrine system" "Endocrine system" "Endocrine and metabolic disease" ...
##  $ ID            : chr  "hsa04935" "hsa04920" "hsa04917" "hsa04931" ...
##  $ Description   : chr  "Growth hormone synthesis, secretion and action" "Adipocytokine signaling pathway" "Prolactin signaling pathway" "Insulin resistance" ...
##  $ GeneRatio     : chr  "3/7" "2/7" "2/7" "2/7" ...
##  $ BgRatio       : chr  "122/9370" "70/9370" "71/9370" "109/9370" ...
##  $ RichFactor    : num  0.0246 0.0286 0.0282 0.0183 0.0168 ...
##  $ FoldEnrichment: num  32.9 38.2 37.7 24.6 22.5 ...
##  $ zScore        : num  9.7 8.55 8.49 6.76 6.45 ...
##  $ pvalue        : num  7.26e-05 1.13e-03 1.16e-03 2.71e-03 3.22e-03 ...
##  $ p.adjust      : num  0.00385 0.02049 0.02049 0.03415 0.03415 ...
##  $ qvalue        : num  0.00298 0.01587 0.01587 0.02645 0.02645 ...
##  $ geneID        : chr  "9021/3726/6774" "9021/6774" "9021/6774" "9021/6774" ...
##  $ Count         : int  3 2 2 2 2 2 2 2 2 2 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320

7.2 KEGG dotplots

if (!is.null(kk_bmat) && nrow(kk_bmat) > 0) {
  print(dotplot(kk_bmat, showCategory = 10) + ggtitle("BMAT: KEGG Pathways"))
} else { cat("BMAT KEGG dotplot: no results to plot\n") }

if (!is.null(kk_mm) && nrow(kk_mm) > 0) {
  print(dotplot(kk_mm, showCategory = 10) + ggtitle("MM: KEGG Pathways"))
} else { cat("MM KEGG dotplot: no results to plot\n") }

7.3 Pathview

tryCatch({
  pathview(gene_list_bmat_entrez, pathway.id = "hsa00430", species = "hsa")
  knitr::include_graphics("hsa00430.pathview.png")
}, error = function(e) { cat("Pathview skipped:", e$message, "\n") })