Part 1: Environment Setup

1.1 Set up library path

.libPaths("/courses/BINF6430.202630/students/tran.nhi2/module06_QC_and_Quantification/renv/library/linux-ubuntu-noble/R-4.5/x86_64-pc-linux-gnu/")
cat("Library path set.\n")

## Library path set.

1.2 Load libraries

suppressPackageStartupMessages({
  library(tidyverse)
  library(DESeq2)
  library(DEGreport)
  library(org.Hs.eg.db)
  library(clusterProfiler)
  library(pathview)
  library(ggplot2)
  library(pheatmap)
  library(ggrepel)
  library(cowplot)
  library(enrichplot)
  library(ggridges)
  library(DOSE)
})
cat("\033[32mAll libraries loaded successfully.\033[0m\n")

## [32mAll libraries loaded successfully.[0m

1.3 Load Project 07 workspace

p07_path <- "/courses/BINF6430.202630/students/tran.nhi2/module06_QC_and_Quantification/project07.RData"

if (file.exists(p07_path)) {
  load(p07_path)
  cat("\033[32mProject 07 workspace loaded successfully.\033[0m\n")
} else {
  stop(paste("File not found:", p07_path))
}

## [32mProject 07 workspace loaded successfully.[0m

# Add threshold_OE and genelabels if not present
if (!"threshold_OE" %in% colnames(res_bmat_tb)) {
  res_bmat_tb <- res_bmat_tb %>%
    mutate(
      threshold_OE = !is.na(padj) & padj < 0.1 & abs(log2FoldChange) >= 1,
      genelabels   = ifelse(threshold_OE, gene, "")
    )
}
if (!"threshold_OE" %in% colnames(res_mm_tb)) {
  res_mm_tb <- res_mm_tb %>%
    mutate(
      threshold_OE = !is.na(padj) & padj < 0.1 & abs(log2FoldChange) >= 1,
      genelabels   = ifelse(threshold_OE, gene, "")
    )
}
cat("\033[32mthreshold_OE and genelabels columns verified.\033[0m\n")

## [32mthreshold_OE and genelabels columns verified.[0m

Part 2: Reviewing the Data

2.1 Summary of DE results

We use alpha = 0.1 because this is an underpowered pilot study. A relaxed p-value threshold allows us to include genes that lean toward statistical significance without demanding the power of a fully-replicated experiment.

cat("=== BMAT Summary ===\n")

## === BMAT Summary ===

summary(res_bmat, alpha = 0.1)

## 
## out of 23567 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 40, 0.17%
## LFC < 0 (down)     : 10, 0.042%
## outliers [1]       : 0, 0%
## low counts [2]     : 5891, 25%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

cat("\n=== MM Summary ===\n")

## 
## === MM Summary ===

summary(res_mm, alpha = 0.1)

## 
## out of 22474 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 29, 0.13%
## LFC < 0 (down)     : 38, 0.17%
## outliers [1]       : 0, 0%
## low counts [2]     : 16337, 73%
## (mean count < 196)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

2.2 Build DE gene tables with ID mapping

We filter by padj < 0.1 and |log2FoldChange| >= 1. We map ENSEMBL IDs to gene symbols (for readability) and Entrez IDs (required by KEGG and Disease Ontology).

# ── BMAT ─────────────────────────────────────────────────────────────────────
DE_bmat <- res_bmat_tb %>%
  filter(padj < 0.1 & abs(log2FoldChange) >= 1)

DE_bmat$symbol <- mapIds(org.Hs.eg.db, keys = DE_bmat$gene,
  column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first")
DE_bmat$entrez <- mapIds(org.Hs.eg.db, keys = DE_bmat$gene,
  column = "ENTREZID", keytype = "ENSEMBL", multiVals = "first")

write.csv(DE_bmat, "DE_BMAT.csv", row.names = TRUE)
cat(sprintf("BMAT: %d DE genes after filtering\n", nrow(DE_bmat)))

## BMAT: 14 DE genes after filtering

knitr::kable(head(DE_bmat), caption = "Top BMAT DE Genes")

Top BMAT DE Genes
gene	baseMean	log2FoldChange	lfcSE	pvalue	padj	threshold_OE	symbol	genelabels	entrez
ENSG00000273768	81.53610	-10.838511	3.0497175	0e+00	0.0000000	TRUE	LOC124904613	RNVU1-29	124904613
ENSG00000177084	365.08065	1.685195	0.2540747	0e+00	0.0000000	TRUE	POLE	POLE	5426
ENSG00000170522	1658.62441	1.138227	0.1770923	0e+00	0.0000000	TRUE	ELOVL6	ELOVL6	79071
ENSG00000174697	267.24841	1.479131	0.2847458	0e+00	0.0000264	TRUE	LEP	LEP	3952
ENSG00000260914	22.85321	8.456502	2.8325525	0e+00	0.0001048	TRUE	NA		NA
ENSG00000159307	250.30474	1.163067	0.2696740	7e-07	0.0014518	TRUE	SCUBE1	SCUBE1	80274

# ── MM ───────────────────────────────────────────────────────────────────────
DE_mm <- res_mm_tb %>%
  filter(padj < 0.1 & abs(log2FoldChange) >= 1)

DE_mm$symbol <- mapIds(org.Hs.eg.db, keys = DE_mm$gene,
  column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first")
DE_mm$entrez <- mapIds(org.Hs.eg.db, keys = DE_mm$gene,
  column = "ENTREZID", keytype = "ENSEMBL", multiVals = "first")

write.csv(DE_mm, "DE_MM.csv", row.names = TRUE)
cat(sprintf("MM: %d DE genes after filtering\n", nrow(DE_mm)))

## MM: 11 DE genes after filtering

knitr::kable(head(DE_mm), caption = "Top MM DE Genes")

Top MM DE Genes
gene	baseMean	log2FoldChange	lfcSE	pvalue	padj	threshold_OE	symbol	genelabels	entrez
ENSG00000168209	1212.6055	1.962096	0.2663245	0e+00	0.0000000	TRUE	DDIT4	DDIT4	54541
ENSG00000184557	220.5533	3.982983	0.6178745	0e+00	0.0000000	TRUE	SOCS3	SOCS3	9021
ENSG00000157514	1170.0315	2.115485	0.3442793	0e+00	0.0000000	TRUE	TSC22D3	TSC22D3	1831
ENSG00000173821	2025.1851	1.303084	0.2124927	0e+00	0.0000000	TRUE	RNF213	RNF213	57674
ENSG00000138496	260.1226	2.973312	0.6607364	0e+00	0.0000230	TRUE	PARP9	PARP9	83666
ENSG00000134107	437.9017	1.385086	0.3069167	2e-07	0.0001682	TRUE	BHLHE40	BHLHE40	8553

Part 3: Sanity Checks

3.1 Sample correlation heatmaps

These heatmaps verify that samples cluster as expected. Co-cultured samples should group separately from alone samples. A clean 1:1 diagonal confirms each sample correlates perfectly with itself.

# ── BMAT ─────────────────────────────────────────────────────────────────────
vst_mat_bmat <- assay(vsd_bmat)
vst_cor_bmat <- cor(vst_mat_bmat)
bmat_annotation <- data.frame(
  row.names   = colnames(vst_mat_bmat),
  co_cultured = s2c_bmat$co_cultured
)
pheatmap(vst_cor_bmat, annotation_col = bmat_annotation,
         main = "BMAT Sample Correlation",
         show_rownames = FALSE, show_colnames = FALSE)

# ── MM ───────────────────────────────────────────────────────────────────────
vst_mat_mm <- assay(vsd_mm)
vst_cor_mm <- cor(vst_mat_mm)
mm_annotation <- data.frame(
  row.names   = colnames(vst_mat_mm),
  co_cultured = s2c_mm$co_cultured
)
pheatmap(vst_cor_mm, annotation_col = mm_annotation,
         main = "MM Sample Correlation",
         show_rownames = FALSE, show_colnames = FALSE)

3.2 Volcano plots

degVolcano(
  data.frame(res_bmat_tb[, c("log2FoldChange", "padj")]),
  plot_text = data.frame(
    res_bmat_tb[res_bmat_tb$threshold_OE == TRUE,
                c("log2FoldChange", "padj", "genelabels")])
)

degVolcano(
  data.frame(res_mm_tb[, c("log2FoldChange", "padj")]),
  plot_text = data.frame(
    res_mm_tb[res_mm_tb$threshold_OE == TRUE,
              c("log2FoldChange", "padj", "genelabels")])
)

3.3 MA plots

An MA plot shows log2 fold change (M) on the y-axis versus mean normalized expression (A) on the x-axis. Genes near M = 0 are similarly expressed between conditions. Genes above are upregulated in co-cultured cells; below are downregulated.

# Build DEGSets from the original split model dds objects
res_bmat_degset <- degComps(dds_bmat, contrast = list("co_cultured_TRUE_vs_FALSE"))
res_mm_degset   <- degComps(dds_mm,   contrast = list("co_cultured_TRUE_vs_FALSE"))

# Basic MA plots for BMAT and MM
degMA(res_bmat_degset, raw = TRUE, diff = 1)

degMA(res_mm_degset,   raw = TRUE, diff = 1)

Question 1: How do you interpret the MA plots? What do the colors mean?

The x-axis (A) represents mean normalized expression and the y-axis (M) is the log2 fold change between co-cultured and alone conditions. Genes near M = 0 show little difference between conditions. Red/colored points are significantly DE (padj < 0.1, |LFC| ≥ 1), while gray points are not significant. The “fanning” pattern at low expression is expected — lowly expressed genes have higher variance, producing more scatter around zero.

# Side-by-side: standard MA (bottom) vs correlation MA (top)
ma1 <- degMA(res_mm_degset, correlation = TRUE, diff = 1)
ma2 <- degMA(res_mm_degset, diff = 1)
plot_grid(ma2, ma1, align = "v", nrow = 2, ncol = 1)

Question 2: How do you interpret the side-by-side MA plots?

These two-panel plots come from degMA with raw = TRUE (top panel) and correlation = TRUE (bottom panel).

The top panel shows raw unshrunken fold changes vs mean expression. The dramatic vertical spread of blue points at low expression levels represents genes with extreme but unreliable fold change estimates caused by low counts — this illustrates exactly why shrinkage is necessary. Red points are significant genes.

The bottom panel plots the shrunken fold change (y-axis) against the unshrunken fold change (x-axis). The tight diagonal relationship for moderately and highly expressed genes shows that shrinkage changes these estimates only slightly. However, genes with very large unshrunken fold changes get pulled dramatically toward zero by apeglm — this is the shrinkage doing its job, removing noise from lowly expressed genes. The red significant genes cluster near the center-right diagonal, confirming that our DE calls come from genes where shrunken and unshrunken estimates agree reasonably well, indicating reliable results.

Part 4: Gene Set Enrichment Analysis

4.1 Prepare ranked gene lists

# ── BMAT: ENSEMBL ─────────────────────────────────────────────────────────────
original_gene_list_bmat        <- DE_bmat$log2FoldChange
names(original_gene_list_bmat) <- DE_bmat$gene
gene_list_bmat <- sort(na.omit(original_gene_list_bmat), decreasing = TRUE)

# ── BMAT: Entrez ──────────────────────────────────────────────────────────────
original_gene_list_bmat_entrez        <- DE_bmat$log2FoldChange
names(original_gene_list_bmat_entrez) <- DE_bmat$entrez
gene_list_bmat_entrez <- sort(na.omit(original_gene_list_bmat_entrez), decreasing = TRUE)

# ── MM: ENSEMBL ───────────────────────────────────────────────────────────────
original_gene_list_mm        <- DE_mm$log2FoldChange
names(original_gene_list_mm) <- DE_mm$gene
gene_list_mm <- sort(na.omit(original_gene_list_mm), decreasing = TRUE)

# ── MM: Entrez ────────────────────────────────────────────────────────────────
original_gene_list_mm_entrez        <- DE_mm$log2FoldChange
names(original_gene_list_mm_entrez) <- DE_mm$entrez
gene_list_mm_entrez <- sort(na.omit(original_gene_list_mm_entrez), decreasing = TRUE)

cat("BMAT gene list length:", length(gene_list_bmat), "\n")

## BMAT gene list length: 14

cat("MM gene list length:",   length(gene_list_mm),   "\n")

## MM gene list length: 11

4.2 Gene Ontology (GO) Analysis

# ── BMAT ORA ──────────────────────────────────────────────────────────────────
gse_bmat_enrich <- tryCatch(
  enrichGO(names(gene_list_bmat), ont = "BP", keyType = "ENSEMBL",
    minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.1,
    OrgDb = org.Hs.eg.db, pAdjustMethod = "none"),
  error = function(e) { cat("BMAT enrichGO skipped:", e$message, "\n"); NULL }
)

# ── BMAT GSEA ─────────────────────────────────────────────────────────────────
gse_bmat <- tryCatch(
  gseGO(geneList = gene_list_bmat, ont = "BP", keyType = "ENSEMBL",
    minGSSize = 3, maxGSSize = 800, pvalueCutoff = 1,
    verbose = TRUE, OrgDb = org.Hs.eg.db, pAdjustMethod = "none"),
  error = function(e) { cat("BMAT gseGO skipped:", e$message, "\n"); NULL }
)
gse_bmat_pairwise <- if (!is.null(gse_bmat)) pairwise_termsim(gse_bmat) else NULL

# ── MM ORA ────────────────────────────────────────────────────────────────────
gse_mm_enrich <- tryCatch(
  enrichGO(names(gene_list_mm), ont = "BP", keyType = "ENSEMBL",
    minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.1,
    OrgDb = org.Hs.eg.db, pAdjustMethod = "none"),
  error = function(e) { cat("MM enrichGO skipped:", e$message, "\n"); NULL }
)

# ── MM GSEA ───────────────────────────────────────────────────────────────────
gse_mm <- tryCatch(
  gseGO(geneList = gene_list_mm, ont = "BP", keyType = "ENSEMBL",
    minGSSize = 3, maxGSSize = 800, pvalueCutoff = 1,
    verbose = TRUE, OrgDb = org.Hs.eg.db, pAdjustMethod = "none"),
  error = function(e) { cat("MM gseGO skipped:", e$message, "\n"); NULL }
)
gse_mm_pairwise <- if (!is.null(gse_mm)) pairwise_termsim(gse_mm) else NULL

4.3 Review GO result summaries

gse_bmat_enrich

## #
## # over-representation test
## #
## #...@organism     Homo sapiens 
## #...@ontology     BP 
## #...@keytype      ENSEMBL 
## #...@gene     chr [1:14] "ENSG00000260914" "ENSG00000289685" "ENSG00000103426" ...
## #...pvalues adjusted by 'none' with cutoff <0.1 
## #...515 enriched terms found
## 'data.frame':    515 obs. of  12 variables:
##  $ ID            : chr  "GO:0006631" "GO:0044283" "GO:0032787" "GO:0097305" ...
##  $ Description   : chr  "fatty acid metabolic process" "small molecule biosynthetic process" "monocarboxylic acid metabolic process" "response to alcohol" ...
##  $ GeneRatio     : chr  "4/9" "4/9" "4/9" "3/9" ...
##  $ BgRatio       : chr  "456/21338" "689/21338" "691/21338" "311/21338" ...
##  $ RichFactor    : num  0.00877 0.00581 0.00579 0.00965 0.0303 ...
##  $ FoldEnrichment: num  20.8 13.8 13.7 22.9 71.8 ...
##  $ zScore        : num  8.78 7 6.98 7.98 11.84 ...
##  $ pvalue        : num  2.38e-05 1.19e-04 1.21e-04 2.41e-04 3.34e-04 ...
##  $ p.adjust      : num  2.38e-05 1.19e-04 1.21e-04 2.41e-04 3.34e-04 ...
##  $ qvalue        : num  0.00975 0.01647 0.01647 0.0209 0.0209 ...
##  $ geneID        : chr  "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000012048" "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000012048" "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000012048" "ENSG00000174697/ENSG00000164742/ENSG00000012048" ...
##  $ Count         : int  4 4 4 3 2 2 3 3 2 2 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320

gse_bmat

## #
## # Gene Set Enrichment Analysis
## #
## #...@organism     Homo sapiens 
## #...@setType      BP 
## #...@keytype      ENSEMBL 
## #...@geneList     Named num [1:14] 8.46 6.81 6.72 3.41 3.3 ...
##  - attr(*, "names")= chr [1:14] "ENSG00000260914" "ENSG00000289685" "ENSG00000103426" "ENSG00000293495" ...
## #...nPerm     
## #...pvalues adjusted by 'none' with cutoff <1 
## #...95 enriched terms found
## 'data.frame':    95 obs. of  11 variables:
##  $ ID             : chr  "GO:0008150" "GO:0009987" "GO:0008152" "GO:0044238" ...
##  $ Description    : chr  "biological_process" "cellular process" "metabolic process" "primary metabolic process" ...
##  $ setSize        : int  9 9 7 7 6 6 8 8 3 3 ...
##  $ enrichmentScore: num  -0.8 -0.8 -0.714 -0.714 -0.75 ...
##  $ NES            : num  -1.78 -1.78 -1.5 -1.5 -1.5 ...
##  $ pvalue         : num  0.0143 0.0143 0.043 0.043 0.0558 ...
##  $ p.adjust       : num  0.0143 0.0143 0.043 0.043 0.0558 ...
##  $ qvalue         : num  0.558 0.558 0.558 0.558 0.558 ...
##  $ rank           : num  11 11 10 10 9 9 11 11 5 5 ...
##  $ leading_edge   : chr  "tags=100%, list=79%, signal=60%" "tags=100%, list=79%, signal=60%" "tags=100%, list=71%, signal=57%" "tags=100%, list=71%, signal=57%" ...
##  $ core_enrichment: chr  "ENSG00000177084/ENSG00000174697/ENSG00000159307/ENSG00000170522/ENSG00000099998/ENSG00000164742/ENSG00000012048"| __truncated__ "ENSG00000177084/ENSG00000174697/ENSG00000159307/ENSG00000170522/ENSG00000099998/ENSG00000164742/ENSG00000012048"| __truncated__ "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000164742/ENSG00000012048/ENSG00000132207" "ENSG00000174697/ENSG00000170522/ENSG00000099998/ENSG00000164742/ENSG00000012048/ENSG00000132207" ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320

gse_mm_enrich

## #
## # over-representation test
## #
## #...@organism     Homo sapiens 
## #...@ontology     BP 
## #...@keytype      ENSEMBL 
## #...@gene     chr [1:11] "ENSG00000184557" "ENSG00000224389" "ENSG00000138496" ...
## #...pvalues adjusted by 'none' with cutoff <0.1 
## #...547 enriched terms found
## 'data.frame':    547 obs. of  12 variables:
##  $ ID            : chr  "GO:0072539" "GO:0072538" "GO:0042093" "GO:0002294" ...
##  $ Description   : chr  "T-helper 17 cell differentiation" "T-helper 17 type immune response" "T-helper cell differentiation" "CD4-positive, alpha-beta T cell differentiation involved in immune response" ...
##  $ GeneRatio     : chr  "3/11" "3/11" "3/11" "3/11" ...
##  $ BgRatio       : chr  "47/21338" "64/21338" "108/21338" "110/21338" ...
##  $ RichFactor    : num  0.0638 0.0469 0.0278 0.0273 0.027 ...
##  $ FoldEnrichment: num  123.8 90.9 53.9 52.9 52.4 ...
##  $ zScore        : num  19.1 16.4 12.5 12.4 12.3 ...
##  $ pvalue        : num  1.63e-06 4.17e-06 2.02e-05 2.13e-05 2.19e-05 ...
##  $ p.adjust      : num  1.63e-06 4.17e-06 2.02e-05 2.13e-05 2.19e-05 ...
##  $ qvalue        : num  0.000574 0.000734 0.001285 0.001285 0.001285 ...
##  $ geneID        : chr  "ENSG00000184557/ENSG00000171223/ENSG00000168610" "ENSG00000184557/ENSG00000171223/ENSG00000168610" "ENSG00000184557/ENSG00000171223/ENSG00000168610" "ENSG00000184557/ENSG00000171223/ENSG00000168610" ...
##  $ Count         : int  3 3 3 3 3 3 3 3 2 2 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320

gse_mm

## #
## # Gene Set Enrichment Analysis
## #
## #...@organism     Homo sapiens 
## #...@setType      BP 
## #...@keytype      ENSEMBL 
## #...@geneList     Named num [1:11] 3.98 3.44 2.97 2.33 2.12 ...
##  - attr(*, "names")= chr [1:11] "ENSG00000184557" "ENSG00000224389" "ENSG00000138496" "ENSG00000187323" ...
## #...nPerm     
## #...pvalues adjusted by 'none' with cutoff <1 
## #...194 enriched terms found
## 'data.frame':    194 obs. of  11 variables:
##  $ ID             : chr  "GO:0006954" "GO:0035239" "GO:0035295" "GO:0006955" ...
##  $ Description    : chr  "inflammatory response" "tube morphogenesis" "tube development" "immune response" ...
##  $ setSize        : int  3 4 4 5 5 5 4 4 4 3 ...
##  $ enrichmentScore: num  0.879 -1 -1 0.827 0.827 ...
##  $ NES            : num  1.61 -2.21 -2.21 1.63 1.63 ...
##  $ pvalue         : num  0.0112 0.0118 0.0118 0.0302 0.0302 ...
##  $ p.adjust       : num  0.0112 0.0118 0.0118 0.0302 0.0302 ...
##  $ qvalue         : num  0.708 0.708 0.708 0.708 0.708 ...
##  $ rank           : num  2 5 5 3 3 3 2 2 2 5 ...
##  $ leading_edge   : chr  "tags=67%, list=18%, signal=75%" "tags=100%, list=45%, signal=86%" "tags=100%, list=45%, signal=86%" "tags=60%, list=27%, signal=80%" ...
##  $ core_enrichment: chr  "ENSG00000184557/ENSG00000224389" "ENSG00000171223/ENSG00000168610/ENSG00000100628" "ENSG00000171223/ENSG00000168610/ENSG00000100628" "ENSG00000184557/ENSG00000224389/ENSG00000138496" ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320

Question 3: Do we generate meaningful results given our relaxed thresholds?

With 14 DE genes in BMAT and 11 in MM, both lists fall below the recommended minimum of 50 genes for reliable ORA. Results are hypothesis-generating only, not statistically rigorous conclusions.

MM shows biologically coherent signals — immune regulation GO terms, T cell differentiation, and regulation of defense response — consistent with the multiple myeloma co-culture biology. Key DE genes like SOCS3, TSC22D3, and DDIT4 are established players in immune signaling and cancer pathways, lending biological face validity to the results. BMAT results are noisier given fewer DE genes, with DNA replication terms likely reflecting small-sample overlap noise rather than a true biological signal.

Part 5: GO Visualizations

5.1 Dotplots

if (!is.null(gse_bmat_enrich)) {
  print(dotplot(gse_bmat_enrich, showCategory = 10) +
    ggtitle("BMAT: GO BP Over-Representation"))
} else { cat("BMAT enrichGO: no results to plot\n") }

if (!is.null(gse_mm)) {
  print(dotplot(gse_mm, showCategory = 10) +
    ggtitle("MM: GO BP GSEA"))
} else { cat("MM gseGO: no results to plot\n") }

5.2 Enrichment map plots

if (!is.null(gse_bmat_pairwise)) {
  print(emapplot(gse_bmat_pairwise, showCategory = 10))
} else { cat("BMAT emapplot: no results to plot\n") }

if (!is.null(gse_mm_pairwise)) {
  print(emapplot(gse_mm_pairwise, showCategory = 10))
} else { cat("MM emapplot: no results to plot\n") }

5.3 Ridgeplots

if (!is.null(gse_bmat)) {
  print(ridgeplot(gse_bmat) +
    labs(x = "Enrichment distribution", title = "BMAT Ridgeplot") +
    theme(plot.margin = margin(10, 10, 10, 10)))
} else { cat("BMAT ridgeplot: no results to plot\n") }

if (!is.null(gse_mm)) {
  print(ridgeplot(gse_mm) +
    labs(x = "Enrichment distribution", title = "MM Ridgeplot") +
    theme(plot.margin = margin(10, 10, 10, 10)))
} else { cat("MM ridgeplot: no results to plot\n") }

5.4 Classic GSEA enrichment plot

if (!is.null(gse_mm) && nrow(gse_mm) > 0) {
  gseaplot(gse_mm, by = "all", title = gse_mm$Description[1], geneSetID = 1)
} else { cat("MM gseaplot: no results to plot\n") }

Part 6: Disease Ontology

gse_bmat_do <- tryCatch(
  gseDO(gene_list_bmat_entrez, pvalueCutoff = 1),
  error = function(e) { cat("BMAT gseDO skipped:", e$message, "\n"); NULL }
)

## BMAT gseDO skipped: NAs in names(stats) are not allowed

gse_mm_do <- tryCatch(
  gseDO(gene_list_mm_entrez[!duplicated(names(gene_list_mm_entrez))],
        pvalueCutoff = 1),
  error = function(e) { cat("MM gseDO skipped:", e$message, "\n"); NULL }
)

if (!is.null(gse_mm_do) && nrow(gse_mm_do) > 0) {
  print(dotplot(gse_mm_do) + ggtitle("MM: Disease Ontology Enrichment"))
} else { cat("MM DO dotplot: no results to plot\n") }

## MM DO dotplot: no results to plot

Question 4: What do you see in the Disease Ontology dotplot?

The MM Disease Ontology analysis returned no results to plot. This is because our 11-gene MM list is too small to achieve statistically meaningful overlap with Disease Ontology terms even at a fully relaxed pvalueCutoff of 1. The BMAT gseDO was similarly skipped due to NAs in the Entrez ID mapping. While disappointing, this absence is itself informative — it reflects the fundamental power limitation of this pilot dataset. It is not a failure of the biology but a consequence of sample size. The GO and KEGG results already point toward immune and metabolic associations consistent with multiple myeloma biology. In a well-powered study, we would expect Disease Ontology analysis to reveal enrichment for hematologic malignancy, immune dysregulation, and bone marrow disease terms directly relevant to the Reagan lab’s research questions. —

Part 7: KEGG Pathway Analysis

7.1 Run KEGG enrichment

kk_bmat <- tryCatch(
  enrichKEGG(names(gene_list_bmat_entrez), organism = "hsa",
    minGSSize = 3, maxGSSize = 800,
    pvalueCutoff = 1, pAdjustMethod = "BH", keyType = "kegg"),
  error = function(e) { cat("BMAT KEGG skipped:", e$message, "\n"); NULL }
)

kk_mm <- tryCatch(
  enrichKEGG(names(gene_list_mm_entrez), organism = "hsa",
    minGSSize = 3, maxGSSize = 800,
    pvalueCutoff = 1, pAdjustMethod = "BH", keyType = "kegg"),
  error = function(e) { cat("MM KEGG skipped:", e$message, "\n"); NULL }
)

kk_bmat

## #
## # over-representation test
## #
## #...@organism     hsa 
## #...@ontology     KEGG 
## #...@keytype      kegg 
## #...@gene     chr [1:13] NA "100529144" "84798" "3683" "5426" "3952" "80274" "79071" ...
## #...pvalues adjusted by 'BH' with cutoff <1 
## #...75 enriched terms found
## 'data.frame':    75 obs. of  14 variables:
##  $ category      : chr  "Genetic Information Processing" "Metabolism" "Environmental Information Processing" "Environmental Information Processing" ...
##  $ subcategory   : chr  "Replication and repair" "Metabolism of other amino acids" "Signal transduction" "Signaling molecules and interaction" ...
##  $ ID            : chr  "hsa03460" "hsa00430" "hsa04015" "hsa04081" ...
##  $ Description   : chr  "Fanconi anemia pathway" "Taurine and hypotaurine metabolism" "Rap1 signaling pathway" "Hormone signaling" ...
##  $ GeneRatio     : chr  "2/9" "1/9" "2/9" "2/9" ...
##  $ BgRatio       : chr  "54/9370" "17/9370" "212/9370" "219/9370" ...
##  $ RichFactor    : num  0.03704 0.05882 0.00943 0.00913 0.00893 ...
##  $ FoldEnrichment: num  38.56 61.24 9.82 9.51 9.3 ...
##  $ zScore        : num  8.58 7.71 4.03 3.95 3.9 ...
##  $ pvalue        : num  0.00114 0.01622 0.01652 0.01757 0.01834 ...
##  $ p.adjust      : num  0.101 0.183 0.183 0.183 0.183 ...
##  $ qvalue        : num  0.0915 0.166 0.166 0.166 0.166 ...
##  $ geneID        : chr  "672/79008" "2687" "3683/107" "3952/107" ...
##  $ Count         : int  2 1 2 2 2 1 1 1 1 1 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320

kk_mm

## #
## # over-representation test
## #
## #...@organism     hsa 
## #...@ontology     KEGG 
## #...@keytype      kegg 
## #...@gene     chr [1:11] "9021" "721" "83666" "1630" "1831" "54541" "8553" "57674" ...
## #...pvalues adjusted by 'BH' with cutoff <1 
## #...53 enriched terms found
## 'data.frame':    53 obs. of  14 variables:
##  $ category      : chr  "Organismal Systems" "Organismal Systems" "Organismal Systems" "Human Diseases" ...
##  $ subcategory   : chr  "Endocrine system" "Endocrine system" "Endocrine system" "Endocrine and metabolic disease" ...
##  $ ID            : chr  "hsa04935" "hsa04920" "hsa04917" "hsa04931" ...
##  $ Description   : chr  "Growth hormone synthesis, secretion and action" "Adipocytokine signaling pathway" "Prolactin signaling pathway" "Insulin resistance" ...
##  $ GeneRatio     : chr  "3/7" "2/7" "2/7" "2/7" ...
##  $ BgRatio       : chr  "122/9370" "70/9370" "71/9370" "109/9370" ...
##  $ RichFactor    : num  0.0246 0.0286 0.0282 0.0183 0.0168 ...
##  $ FoldEnrichment: num  32.9 38.2 37.7 24.6 22.5 ...
##  $ zScore        : num  9.7 8.55 8.49 6.76 6.45 ...
##  $ pvalue        : num  7.26e-05 1.13e-03 1.16e-03 2.71e-03 3.22e-03 ...
##  $ p.adjust      : num  0.00385 0.02049 0.02049 0.03415 0.03415 ...
##  $ qvalue        : num  0.00298 0.01587 0.01587 0.02645 0.02645 ...
##  $ geneID        : chr  "9021/3726/6774" "9021/6774" "9021/6774" "9021/6774" ...
##  $ Count         : int  3 2 2 2 2 2 2 2 2 2 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, 19(11):3292-3320

7.2 KEGG dotplots

if (!is.null(kk_bmat) && nrow(kk_bmat) > 0) {
  print(dotplot(kk_bmat, showCategory = 10) + ggtitle("BMAT: KEGG Pathways"))
} else { cat("BMAT KEGG dotplot: no results to plot\n") }

if (!is.null(kk_mm) && nrow(kk_mm) > 0) {
  print(dotplot(kk_mm, showCategory = 10) + ggtitle("MM: KEGG Pathways"))
} else { cat("MM KEGG dotplot: no results to plot\n") }

7.3 Pathview

tryCatch({
  pathview(gene_list_bmat_entrez, pathway.id = "hsa00430", species = "hsa")
  knitr::include_graphics("hsa00430.pathview.png")
}, error = function(e) { cat("Pathview skipped:", e$message, "\n") })

Project 08: GSEA, GO, and Pathway Analysis

Lily Lee

2026-03-16