# Complete GSEA test data generation code
library(msigdbr)
## Warning: package 'msigdbr' was built under R version 4.4.2
library(clusterProfiler)
## Warning: package 'clusterProfiler' was built under R version 4.4.2
## 
## clusterProfiler v4.14.4 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
## clusterProfiler: an R package for comparing biological themes among
## gene clusters. OMICS: A Journal of Integrative Biology. 2012,
## 16(5):284-287
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:stats':
## 
##     filter
library(limma)
## Warning: package 'limma' was built under R version 4.4.2
# 1. Get MsigDB pathway data
msig_df <- msigdbr(species = "Homo sapiens", 
                   category = "C5", 
                   subcategory = "MF") %>%
  dplyr::select(gs_name, gene_symbol)

cat("\n1. Pathway data dimensions and content:\n")
## 
## 1. Pathway data dimensions and content:
print(dim(msig_df))
## [1] 122674      2
cat("\nFirst 6 rows of pathway data:\n")
## 
## First 6 rows of pathway data:
print(head(msig_df))
## # A tibble: 6 × 2
##   gs_name                     gene_symbol
##   <chr>                       <chr>      
## 1 GOMF_11_CIS_RETINAL_BINDING ABCA4      
## 2 GOMF_11_CIS_RETINAL_BINDING CYP27C1    
## 3 GOMF_11_CIS_RETINAL_BINDING OPN3       
## 4 GOMF_11_CIS_RETINAL_BINDING OPN4       
## 5 GOMF_11_CIS_RETINAL_BINDING OPN5       
## 6 GOMF_11_CIS_RETINAL_BINDING RHO
# 2. Randomly select 1000 genes
set.seed(42)
selected_genes <- sample(unique(msig_df$gene_symbol), 1000)

cat("\n2. Number and content of selected genes:\n")
## 
## 2. Number and content of selected genes:
print(length(selected_genes))
## [1] 1000
cat("\nFirst 6 selected genes:\n")
## 
## First 6 selected genes:
print(head(selected_genes))
## [1] "PPOX"     "ANGPTL8"  "KIF11"    "PCDHGC3"  "STOX2"    "ATP6V1E2"
# 3. Generate expression matrix
n_samples <- 6  # 3 samples per group
expr_data <- matrix(rnorm(length(selected_genes) * n_samples), 
                    ncol = n_samples,
                    dimnames = list(selected_genes, paste0("sample", 1:n_samples)))

cat("\n3. Expression matrix dimensions and content:\n")
## 
## 3. Expression matrix dimensions and content:
print(dim(expr_data))
## [1] 1000    6
cat("\nFirst 6 rows of expression matrix:\n")
## 
## First 6 rows of expression matrix:
print(head(expr_data))
##              sample1    sample2    sample3      sample4    sample5      sample6
## PPOX      0.06999775  0.8274220  0.5240592 -0.470569909  0.7118706  0.397203036
## ANGPTL8  -0.56167118 -1.9415441 -0.3760024  1.222223044 -0.1332234  0.584407977
## KIF11     1.26949632  1.6289717 -1.1941578  0.796876140  1.4003775  0.115466272
## PCDHGC3  -1.63101018 -0.3619278  0.6085870  0.660395481 -0.1558488  0.651715393
## STOX2     0.51144652 -0.6270708 -0.4202626  1.263689473 -0.5985805 -0.710668738
## ATP6V1E2 -0.69261311 -0.9131730  0.9346901 -0.005806305  1.6161382  0.007681934
# 4. Differential analysis
group <- factor(rep(c("ctrl", "treat"), each = 3))
design <- model.matrix(~group)

cat("\n4. Design matrix:\n")
## 
## 4. Design matrix:
print(design)
##   (Intercept) grouptreat
## 1           1          0
## 2           1          0
## 3           1          0
## 4           1          1
## 5           1          1
## 6           1          1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
fit <- lmFit(expr_data, design)
fit <- eBayes(fit)

# Get all differential analysis results without sorting
diff_result <- data.frame(
  logFC = fit$coefficients[,2],
  P.Value = fit$p.value[,2],
  row.names = rownames(expr_data)
)

cat("\nDifferential analysis results dimensions and content:\n")
## 
## Differential analysis results dimensions and content:
print(dim(diff_result))
## [1] 1000    2
cat("\nFirst 6 rows of differential analysis results:\n")
## 
## First 6 rows of differential analysis results:
print(head(diff_result))
##               logFC    P.Value
## PPOX     -0.2609917 0.74985096
## ANGPTL8   1.5175418 0.06381192
## KIF11     0.2028032 0.80432663
## PCDHGC3   0.8468710 0.30090141
## STOX2     0.1634424 0.84174083
## ATP6V1E2  0.7630366 0.35128204
# 5. Calculate ranking scores
ranking_score <- -log10(diff_result$P.Value) * diff_result$logFC
names(ranking_score) <- rownames(diff_result)

cat("\n5. Number and content of ranking scores:\n")
## 
## 5. Number and content of ranking scores:
print(length(ranking_score))
## [1] 1000
cat("\nFirst 6 ranking scores:\n")
## 
## First 6 ranking scores:
print(head(ranking_score))
##        PPOX     ANGPTL8       KIF11     PCDHGC3       STOX2    ATP6V1E2 
## -0.03263050  1.81361139  0.01917860  0.44170741  0.01222902  0.34668115
# Sort ranking scores
gene_list <- sort(ranking_score, decreasing = TRUE)

cat("\nFirst 6 values after sorting:\n")
## 
## First 6 values after sorting:
print(head(gene_list))
##    NHLH2    PTGIS     SDK1   HIVEP3     FGF8  SLC35D2 
## 6.470878 5.669138 4.534839 4.347385 3.616556 3.298340
cat("\nLast 6 values after sorting:\n")
## 
## Last 6 values after sorting:
print(tail(gene_list))
##    SFMBT1       SP8      SND1     KLHL1   SLC22A7      H3Y2 
## -3.404710 -3.462217 -4.377968 -4.477577 -4.834388 -5.539049
# 6. GSEA analysis
gsea_result <- GSEA(
  geneList = gene_list,      
  TERM2GENE = msig_df,       
  minGSSize = 1,            # Minimum gene set size
  maxGSSize = 500,           # Maximum gene set size
  pvalueCutoff = 1,       # P-value cutoff
  pAdjustMethod = "BH"       # BH correction method
)
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
cat("\n6. GSEA results overview:\n")
## 
## 6. GSEA results overview:
print(dim(gsea_result@result))
## [1] 1105   11
cat("\nFirst 6 rows of GSEA results:\n")
## 
## First 6 rows of GSEA results:
print(head(gsea_result@result[, c("ID", "setSize", "NES", "pvalue", "p.adjust")]))
##                                                                                ID
## GOMF_ISOMERASE_ACTIVITY                                   GOMF_ISOMERASE_ACTIVITY
## GOMF_LIPASE_ACTIVITY                                         GOMF_LIPASE_ACTIVITY
## GOMF_HISTONE_BINDING                                         GOMF_HISTONE_BINDING
## GOMF_PROTEIN_HETERODIMERIZATION_ACTIVITY GOMF_PROTEIN_HETERODIMERIZATION_ACTIVITY
## GOMF_LIGAND_GATED_ION_CHANNEL_ACTIVITY     GOMF_LIGAND_GATED_ION_CHANNEL_ACTIVITY
## GOMF_MRNA_BINDING                                               GOMF_MRNA_BINDING
##                                          setSize       NES      pvalue
## GOMF_ISOMERASE_ACTIVITY                        9  1.632146 0.005515035
## GOMF_LIPASE_ACTIVITY                          13  1.545157 0.033300694
## GOMF_HISTONE_BINDING                          20 -1.540655 0.029968665
## GOMF_PROTEIN_HETERODIMERIZATION_ACTIVITY      22 -1.535055 0.020551872
## GOMF_LIGAND_GATED_ION_CHANNEL_ACTIVITY        12  1.528080 0.050660157
## GOMF_MRNA_BINDING                             39  1.499095 0.022535921
##                                           p.adjust
## GOMF_ISOMERASE_ACTIVITY                  0.9498484
## GOMF_LIPASE_ACTIVITY                     0.9498484
## GOMF_HISTONE_BINDING                     0.9498484
## GOMF_PROTEIN_HETERODIMERIZATION_ACTIVITY 0.9498484
## GOMF_LIGAND_GATED_ION_CHANNEL_ACTIVITY   0.9498484
## GOMF_MRNA_BINDING                        0.9498484
# 7. Save results
saveRDS(gsea_result, "human_gsea_data.rds")

####################################################################################plot
# Compare gseaplot2 and gseaNb
library(enrichplot)
## Warning: package 'enrichplot' was built under R version 4.4.2
## enrichplot v1.26.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, 19(11):3292-3320
library(GseaVis)  # for gseaNb
## Warning: package 'GseaVis' was built under R version 4.4.2
## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2
library(ggplot2)

# Load GSEA result
gsea_result <- readRDS("human_gsea_data.rds")
#gsea_result@result
# Show available pathways
cat("Available pathways:\n")
## Available pathways:
# Select first pathway
head(gsea_result@result,2)
##                                              ID             Description setSize
## GOMF_ISOMERASE_ACTIVITY GOMF_ISOMERASE_ACTIVITY GOMF_ISOMERASE_ACTIVITY       9
## GOMF_LIPASE_ACTIVITY       GOMF_LIPASE_ACTIVITY    GOMF_LIPASE_ACTIVITY      13
##                         enrichmentScore      NES      pvalue  p.adjust
## GOMF_ISOMERASE_ACTIVITY       0.8872200 1.632146 0.005515035 0.9498484
## GOMF_LIPASE_ACTIVITY          0.7541595 1.545157 0.033300694 0.9498484
##                            qvalue rank                  leading_edge
## GOMF_ISOMERASE_ACTIVITY 0.9498484   22 tags=22%, list=2%, signal=22%
## GOMF_LIPASE_ACTIVITY    0.9498484   47 tags=15%, list=5%, signal=15%
##                         core_enrichment
## GOMF_ISOMERASE_ACTIVITY     PTGIS/AIPL1
## GOMF_LIPASE_ACTIVITY       PNLIPRP3/CLC
pathway_id <- gsea_result@result$ID[1]
pathway_id
## [1] "GOMF_ISOMERASE_ACTIVITY"
# 1. Using gseaplot2
p1 <- gseaplot2(gsea_result, 
                geneSetID = pathway_id,
                title = "GSEA by gseaplot2",
                pvalue = TRUE, 
                base_size = 11)
p1

# Save gseaplot2 result
#pdf("gsea_plot_enrichplot.pdf", width = 8, height = 6)
print(p1)

#dev.off()

# 2. Using gseaNb
p2 <- gseaNb(object = gsea_result,
             geneSetID = pathway_id,
             subPlot = 3,              # show all three plots
             lineSize = 1,             # line thickness
             addPval = TRUE,           # add p-value to plot
             rmPrefix = TRUE)          # remove prefix from pathway name

# Save gseaNb result
#pdf("gsea_plot_gseaVis.pdf", width = 8, height = 6)
print(p2)

#dev.off()

cat("\nPlots have been saved as 'gsea_plot_enrichplot.pdf' and 'gsea_plot_gseaVis.pdf'\n")
## 
## Plots have been saved as 'gsea_plot_enrichplot.pdf' and 'gsea_plot_gseaVis.pdf'
#############################################plot custom