# Complete GSEA test data generation code
library(msigdbr)
## Warning: package 'msigdbr' was built under R version 4.4.2
library(clusterProfiler)
## Warning: package 'clusterProfiler' was built under R version 4.4.2
##
## clusterProfiler v4.14.4 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
## clusterProfiler: an R package for comparing biological themes among
## gene clusters. OMICS: A Journal of Integrative Biology. 2012,
## 16(5):284-287
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:stats':
##
## filter
library(limma)
## Warning: package 'limma' was built under R version 4.4.2
# 1. Get MsigDB pathway data
msig_df <- msigdbr(species = "Homo sapiens",
category = "C5",
subcategory = "MF") %>%
dplyr::select(gs_name, gene_symbol)
cat("\n1. Pathway data dimensions and content:\n")
##
## 1. Pathway data dimensions and content:
print(dim(msig_df))
## [1] 122674 2
cat("\nFirst 6 rows of pathway data:\n")
##
## First 6 rows of pathway data:
print(head(msig_df))
## # A tibble: 6 × 2
## gs_name gene_symbol
## <chr> <chr>
## 1 GOMF_11_CIS_RETINAL_BINDING ABCA4
## 2 GOMF_11_CIS_RETINAL_BINDING CYP27C1
## 3 GOMF_11_CIS_RETINAL_BINDING OPN3
## 4 GOMF_11_CIS_RETINAL_BINDING OPN4
## 5 GOMF_11_CIS_RETINAL_BINDING OPN5
## 6 GOMF_11_CIS_RETINAL_BINDING RHO
# 2. Randomly select 1000 genes
set.seed(42)
selected_genes <- sample(unique(msig_df$gene_symbol), 1000)
cat("\n2. Number and content of selected genes:\n")
##
## 2. Number and content of selected genes:
print(length(selected_genes))
## [1] 1000
cat("\nFirst 6 selected genes:\n")
##
## First 6 selected genes:
print(head(selected_genes))
## [1] "PPOX" "ANGPTL8" "KIF11" "PCDHGC3" "STOX2" "ATP6V1E2"
# 3. Generate expression matrix
n_samples <- 6 # 3 samples per group
expr_data <- matrix(rnorm(length(selected_genes) * n_samples),
ncol = n_samples,
dimnames = list(selected_genes, paste0("sample", 1:n_samples)))
cat("\n3. Expression matrix dimensions and content:\n")
##
## 3. Expression matrix dimensions and content:
print(dim(expr_data))
## [1] 1000 6
cat("\nFirst 6 rows of expression matrix:\n")
##
## First 6 rows of expression matrix:
print(head(expr_data))
## sample1 sample2 sample3 sample4 sample5 sample6
## PPOX 0.06999775 0.8274220 0.5240592 -0.470569909 0.7118706 0.397203036
## ANGPTL8 -0.56167118 -1.9415441 -0.3760024 1.222223044 -0.1332234 0.584407977
## KIF11 1.26949632 1.6289717 -1.1941578 0.796876140 1.4003775 0.115466272
## PCDHGC3 -1.63101018 -0.3619278 0.6085870 0.660395481 -0.1558488 0.651715393
## STOX2 0.51144652 -0.6270708 -0.4202626 1.263689473 -0.5985805 -0.710668738
## ATP6V1E2 -0.69261311 -0.9131730 0.9346901 -0.005806305 1.6161382 0.007681934
# 4. Differential analysis
group <- factor(rep(c("ctrl", "treat"), each = 3))
design <- model.matrix(~group)
cat("\n4. Design matrix:\n")
##
## 4. Design matrix:
print(design)
## (Intercept) grouptreat
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 1
## 5 1 1
## 6 1 1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
fit <- lmFit(expr_data, design)
fit <- eBayes(fit)
# Get all differential analysis results without sorting
diff_result <- data.frame(
logFC = fit$coefficients[,2],
P.Value = fit$p.value[,2],
row.names = rownames(expr_data)
)
cat("\nDifferential analysis results dimensions and content:\n")
##
## Differential analysis results dimensions and content:
print(dim(diff_result))
## [1] 1000 2
cat("\nFirst 6 rows of differential analysis results:\n")
##
## First 6 rows of differential analysis results:
print(head(diff_result))
## logFC P.Value
## PPOX -0.2609917 0.74985096
## ANGPTL8 1.5175418 0.06381192
## KIF11 0.2028032 0.80432663
## PCDHGC3 0.8468710 0.30090141
## STOX2 0.1634424 0.84174083
## ATP6V1E2 0.7630366 0.35128204
# 5. Calculate ranking scores
ranking_score <- -log10(diff_result$P.Value) * diff_result$logFC
names(ranking_score) <- rownames(diff_result)
cat("\n5. Number and content of ranking scores:\n")
##
## 5. Number and content of ranking scores:
print(length(ranking_score))
## [1] 1000
cat("\nFirst 6 ranking scores:\n")
##
## First 6 ranking scores:
print(head(ranking_score))
## PPOX ANGPTL8 KIF11 PCDHGC3 STOX2 ATP6V1E2
## -0.03263050 1.81361139 0.01917860 0.44170741 0.01222902 0.34668115
# Sort ranking scores
gene_list <- sort(ranking_score, decreasing = TRUE)
cat("\nFirst 6 values after sorting:\n")
##
## First 6 values after sorting:
print(head(gene_list))
## NHLH2 PTGIS SDK1 HIVEP3 FGF8 SLC35D2
## 6.470878 5.669138 4.534839 4.347385 3.616556 3.298340
cat("\nLast 6 values after sorting:\n")
##
## Last 6 values after sorting:
print(tail(gene_list))
## SFMBT1 SP8 SND1 KLHL1 SLC22A7 H3Y2
## -3.404710 -3.462217 -4.377968 -4.477577 -4.834388 -5.539049
# 6. GSEA analysis
gsea_result <- GSEA(
geneList = gene_list,
TERM2GENE = msig_df,
minGSSize = 1, # Minimum gene set size
maxGSSize = 500, # Maximum gene set size
pvalueCutoff = 1, # P-value cutoff
pAdjustMethod = "BH" # BH correction method
)
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
cat("\n6. GSEA results overview:\n")
##
## 6. GSEA results overview:
print(dim(gsea_result@result))
## [1] 1105 11
cat("\nFirst 6 rows of GSEA results:\n")
##
## First 6 rows of GSEA results:
print(head(gsea_result@result[, c("ID", "setSize", "NES", "pvalue", "p.adjust")]))
## ID
## GOMF_ISOMERASE_ACTIVITY GOMF_ISOMERASE_ACTIVITY
## GOMF_LIPASE_ACTIVITY GOMF_LIPASE_ACTIVITY
## GOMF_HISTONE_BINDING GOMF_HISTONE_BINDING
## GOMF_PROTEIN_HETERODIMERIZATION_ACTIVITY GOMF_PROTEIN_HETERODIMERIZATION_ACTIVITY
## GOMF_LIGAND_GATED_ION_CHANNEL_ACTIVITY GOMF_LIGAND_GATED_ION_CHANNEL_ACTIVITY
## GOMF_MRNA_BINDING GOMF_MRNA_BINDING
## setSize NES pvalue
## GOMF_ISOMERASE_ACTIVITY 9 1.632146 0.005515035
## GOMF_LIPASE_ACTIVITY 13 1.545157 0.033300694
## GOMF_HISTONE_BINDING 20 -1.540655 0.029968665
## GOMF_PROTEIN_HETERODIMERIZATION_ACTIVITY 22 -1.535055 0.020551872
## GOMF_LIGAND_GATED_ION_CHANNEL_ACTIVITY 12 1.528080 0.050660157
## GOMF_MRNA_BINDING 39 1.499095 0.022535921
## p.adjust
## GOMF_ISOMERASE_ACTIVITY 0.9498484
## GOMF_LIPASE_ACTIVITY 0.9498484
## GOMF_HISTONE_BINDING 0.9498484
## GOMF_PROTEIN_HETERODIMERIZATION_ACTIVITY 0.9498484
## GOMF_LIGAND_GATED_ION_CHANNEL_ACTIVITY 0.9498484
## GOMF_MRNA_BINDING 0.9498484
# 7. Save results
saveRDS(gsea_result, "human_gsea_data.rds")
####################################################################################plot
# Compare gseaplot2 and gseaNb
library(enrichplot)
## Warning: package 'enrichplot' was built under R version 4.4.2
## enrichplot v1.26.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, 19(11):3292-3320
library(GseaVis) # for gseaNb
## Warning: package 'GseaVis' was built under R version 4.4.2
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
library(ggplot2)
# Load GSEA result
gsea_result <- readRDS("human_gsea_data.rds")
#gsea_result@result
# Show available pathways
cat("Available pathways:\n")
## Available pathways:
# Select first pathway
head(gsea_result@result,2)
## ID Description setSize
## GOMF_ISOMERASE_ACTIVITY GOMF_ISOMERASE_ACTIVITY GOMF_ISOMERASE_ACTIVITY 9
## GOMF_LIPASE_ACTIVITY GOMF_LIPASE_ACTIVITY GOMF_LIPASE_ACTIVITY 13
## enrichmentScore NES pvalue p.adjust
## GOMF_ISOMERASE_ACTIVITY 0.8872200 1.632146 0.005515035 0.9498484
## GOMF_LIPASE_ACTIVITY 0.7541595 1.545157 0.033300694 0.9498484
## qvalue rank leading_edge
## GOMF_ISOMERASE_ACTIVITY 0.9498484 22 tags=22%, list=2%, signal=22%
## GOMF_LIPASE_ACTIVITY 0.9498484 47 tags=15%, list=5%, signal=15%
## core_enrichment
## GOMF_ISOMERASE_ACTIVITY PTGIS/AIPL1
## GOMF_LIPASE_ACTIVITY PNLIPRP3/CLC
pathway_id <- gsea_result@result$ID[1]
pathway_id
## [1] "GOMF_ISOMERASE_ACTIVITY"
# 1. Using gseaplot2
p1 <- gseaplot2(gsea_result,
geneSetID = pathway_id,
title = "GSEA by gseaplot2",
pvalue = TRUE,
base_size = 11)
p1

# Save gseaplot2 result
#pdf("gsea_plot_enrichplot.pdf", width = 8, height = 6)
print(p1)

#dev.off()
# 2. Using gseaNb
p2 <- gseaNb(object = gsea_result,
geneSetID = pathway_id,
subPlot = 3, # show all three plots
lineSize = 1, # line thickness
addPval = TRUE, # add p-value to plot
rmPrefix = TRUE) # remove prefix from pathway name
# Save gseaNb result
#pdf("gsea_plot_gseaVis.pdf", width = 8, height = 6)
print(p2)

#dev.off()
cat("\nPlots have been saved as 'gsea_plot_enrichplot.pdf' and 'gsea_plot_gseaVis.pdf'\n")
##
## Plots have been saved as 'gsea_plot_enrichplot.pdf' and 'gsea_plot_gseaVis.pdf'
#############################################plot custom