setwd("C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-19-practice6/6")
# 已经完成了 Stage4 vs Normal 的差异基因分析
deg_stage4 <- read.csv("Final_DEG_Stage4_vs_Normal.csv", row.names = 1)
# 提取 logFC 列,并去除 NA 值
geneList <- deg_stage4$logFC
# 将基因名称作为 geneList 的名字
names(geneList) <- rownames(deg_stage4)
# 去除 NA 值
geneList <- na.omit(geneList)
# 根据 logFC 从大到小排序
geneList <- sort(geneList, decreasing = TRUE)
# 加载 MSigDB 等数据库(需要安装 msigdbr 和 clusterProfiler 包)
library(msigdbr)
library(clusterProfiler)
##
## clusterProfiler v4.14.4 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## G Yu. Thirteen years of clusterProfiler. The Innovation. 2024,
## 5(6):100722
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:stats':
##
## filter
# 获取 MSigDB Hallmark 基因集(例如 HALLMARK 基因集)
msigdbr_df <- msigdbr(species="Homo sapiens", category="H")
# 创建从基因集名称到基因符号的映射
msigdbr_t2g <- msigdbr_df %>%
dplyr::distinct(gs_name, gene_symbol) %>%
as.data.frame()
# 进行 GSEA 分析
gsea_result <- GSEA(geneList = geneList,
minGSSize = 1,
maxGSSize = 1000,
pvalueCutoff = 0.2,
TERM2GENE = msigdbr_t2g)
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
# 导出结果
write.csv(gsea_result, file="gsea_result_N4.csv")
gsea_result_N4<- read.csv("gsea_result_N4.csv", header = TRUE)
head(gsea_result_N4)
## X ID
## 1 HALLMARK_E2F_TARGETS HALLMARK_E2F_TARGETS
## 2 HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT
## 3 HALLMARK_TNFA_SIGNALING_VIA_NFKB HALLMARK_TNFA_SIGNALING_VIA_NFKB
## 4 HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE
## 5 HALLMARK_FATTY_ACID_METABOLISM HALLMARK_FATTY_ACID_METABOLISM
## 6 HALLMARK_XENOBIOTIC_METABOLISM HALLMARK_XENOBIOTIC_METABOLISM
## Description setSize enrichmentScore NES
## 1 HALLMARK_E2F_TARGETS 46 0.4701923 2.694023
## 2 HALLMARK_G2M_CHECKPOINT 44 0.4752417 2.661733
## 3 HALLMARK_TNFA_SIGNALING_VIA_NFKB 32 -0.5407841 -2.615474
## 4 HALLMARK_INTERFERON_GAMMA_RESPONSE 22 -0.6017602 -2.515406
## 5 HALLMARK_FATTY_ACID_METABOLISM 21 -0.5718637 -2.348903
## 6 HALLMARK_XENOBIOTIC_METABOLISM 28 -0.5124435 -2.342257
## pvalue p.adjust qvalue rank leading_edge
## 1 6.175927e-07 3.087964e-05 2.210332e-05 441 tags=89%, list=47%, signal=49%
## 2 1.694533e-06 4.236333e-05 3.032323e-05 441 tags=91%, list=47%, signal=50%
## 3 3.154977e-06 5.258295e-05 3.763832e-05 376 tags=91%, list=40%, signal=56%
## 4 1.327700e-05 1.659625e-04 1.187942e-04 385 tags=100%, list=41%, signal=60%
## 5 6.756784e-05 6.756784e-04 4.836435e-04 284 tags=81%, list=31%, signal=58%
## 6 1.466081e-04 1.221734e-03 8.745045e-04 356 tags=86%, list=38%, signal=55%
## core_enrichment
## 1 MKI67/BIRC5/CDKN3/UBE2T/CDC20/MYBL2/SPC24/PTTG1/CDK1/SPC25/KIF4A/MELK/CCNB2/DEPDC1/TOP2A/DLGAP5/PLK1/KIF18B/CDCA3/KIF2C/E2F8/AURKA/CDKN2A/CENPM/BUB1B/RRM2/ASF1B/CDCA8/TRIP13/AURKB/HMMR/CENPE/ORC6/MXD3/TCF19/STMN1/CDKN2C/CCNE1/RACGAP1/DDX39A/HMGA1
## 2 MKI67/BIRC5/UBE2C/CDKN3/NEK2/CENPF/CENPA/CDC20/MYBL2/PTTG1/CDK1/KIF4A/CCNB2/TROAP/TOP2A/CDC6/E2F1/EXO1/PLK1/BUB1/CCNA2/KIF2C/AURKA/TTK/KIF15/NDC80/PBK/CDC45/TRAIP/AURKB/HMMR/CENPE/ORC6/KIF23/STMN1/CDKN2C/RACGAP1/UCK2/DDX39A/HMGA1
## 3 KDM6B/KLF6/RHOB/ETS2/PNRC1/CD69/NFIL3/GADD45B/PLPP3/NAMPT/SDC4/ZFP36/SOCS3/CCL4/JUN/SERPINB8/ATF3/CCN1/EGR1/TRIB1/NR4A2/PHLDA1/NR4A1/CXCL2/EGR2/RCAN1/FOS/NR4A3/FOSB
## 4 DDX60/USP18/PLSCR1/IRF8/STAT4/OASL/CD69/HERC6/EPSTI1/FGL2/APOL6/NAMPT/IFI44L/SOCS3/CD274/RSAD2/C1S/C1R/XAF1/IL2RB/FPR1
## 5 ACADM/ACADS/ETFDH/ACSL5/ACAA1/BCKDHB/ACAA2/CD1D/EHHADH/ACSL1/ENO3/GSTZ1/CYP4A11/ACSM3/RDH16/AADAT/HAO2
## 6 ETS2/IRF8/PEMT/ACOX1/MAN1A1/SLC35D1/LPIN2/ACOX2/EPHA2/NDRG2/DHRS1/CYB5A/MTHFD1/CAT/ETFDH/ALDH2/CSAD/PLG/ALAS1/LCAT/MBL2/FMO3/ESR1/CYP1A2
# 提取 NES > 0 的 Top 3 上调基因集
top_upregulated <- gsea_result_N4 %>%
filter(NES > 0) %>% # 过滤 NES > 0
arrange(desc(NES)) %>% # 按 NES 从大到小排序
head(3) # 取前 3 条记录
# 提取 NES < 0 的 Top 3 下调基因集
top_downregulated <- gsea_result_N4 %>%
filter(NES < 0) %>% # 过滤 NES < 0
arrange(NES) %>% # 按 NES 从小到大排序
head(3) # 取前 3 条记录
# 合并两组数据
top_gene_sets <- rbind(top_upregulated, top_downregulated)
# 添加组别信息(上调/下调)
top_gene_sets$Group <- ifelse(top_gene_sets$NES > 0, "Upregulated", "Downregulated")
# 提取 NES 和 FDR 阈值
nes_threshold <- sort(gsea_result_N4$NES, decreasing = TRUE)[3]
fdr_threshold <- sort(gsea_result_N4$qvalue)[3]
# 文本指示 NES 和 FDR 阈值
cat("NES Threshold:", round(nes_threshold, 2), "\n")
## NES Threshold: 2.19
cat("FDR Threshold:", round(fdr_threshold, 2), "\n")
## FDR Threshold: 0
# 设置 NES 和 FDR 阈值
nes_threshold_up <- 2.15 # NES 上调阈值
nes_threshold_down <- -2.15 # NES 下调阈值
fdr_threshold <- 0.001 # FDR 阈值
# 绘制条形图
library(ggplot2)
ggplot(top_gene_sets, aes(x = reorder(Description, NES), y = NES, fill = Group)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_manual(values = c("Upregulated" = "red", "Downregulated" = "blue")) +
labs(title = "Top 3 Upregulated and Downregulated Gene Sets in Stage4 vs Normal",
x = "Gene Set Name",
y = "Normalized Enrichment Score (NES)",
fill = "Group") +
theme_minimal() +
theme(axis.text.x = element_text(size = 10, angle = 0, hjust = 1),
plot.title = element_text(hjust = 0.5), # 居中标题
panel.grid.major.x = element_blank(), # 去掉x轴的主要网格线
panel.grid.minor.x = element_blank()) + # 去掉x轴的次要网格线
geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.8) + # 修改 linewidth 替代 size
annotate("text", x = 3.5, y = nes_threshold_up + 0.2, label = "NES > 2.13,NES < -2.13", hjust = 0.7) +
annotate("text", x = 3.5, y = 0.5, label = paste("FDR < 0.001"))

# ##绘制20个条形码图
# 对20个基因集可视化生成二维码图并保存为PDF
library(enrichplot)
## enrichplot v1.26.3 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## G Yu. Thirteen years of clusterProfiler. The Innovation. 2024,
## 5(6):100722
# 提取前20个基因集
top20_gene_sets <- head(gsea_result_N4[order(gsea_result_N4$NES, decreasing = TRUE), ], 20)
# 循环生成并保存每个基因集的二维码图
for (i in 1:20) {
# 获取当前基因集的名称
gene_set_name <- top20_gene_sets$Description[i]
# 创建二维码图并保存为 PDF 文件
gsea_plot <- gseaplot2(gsea_result,
geneSetID = i, # 选择当前基因集
color = "green", # 设置颜色
pvalue_table = TRUE) # 显示 p-value 表格
# 保存每个基因集的二维码图为单独的 PDF 文件
file_name <- paste0("gsea_result_", i, ".png")
ggsave(gsea_plot, file = file_name, width = 11.69, height = 8.27, units = 'in')
cat("Saved:", file_name, "\n")
}
## Saved: gsea_result_1.png
## Saved: gsea_result_2.png
## Saved: gsea_result_3.png
## Saved: gsea_result_4.png
## Saved: gsea_result_5.png
## Saved: gsea_result_6.png
## Saved: gsea_result_7.png
## Saved: gsea_result_8.png
## Saved: gsea_result_9.png
## Saved: gsea_result_10.png
## Saved: gsea_result_11.png
## Saved: gsea_result_12.png
## Saved: gsea_result_13.png
## Saved: gsea_result_14.png
## Saved: gsea_result_15.png
## Saved: gsea_result_16.png
## Saved: gsea_result_17.png
## Saved: gsea_result_18.png
## Saved: gsea_result_19.png
## Saved: gsea_result_20.png