从差异基因开始的GSEA分析 注意此时应纳入所有的基因,但logFC已经计算完成
require(tidyverse)
## Loading required package: tidyverse
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.2.0 √ purrr 0.3.2
## √ tibble 2.1.3 √ dplyr 0.8.3
## √ tidyr 0.8.3 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## Warning: package 'dplyr' was built under R version 3.6.1
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
require(ggplot2)
require(clusterProfiler)
## Loading required package: clusterProfiler
##
## Registered S3 method overwritten by 'enrichplot':
## method from
## fortify.enrichResult DOSE
## clusterProfiler v3.12.0 For help: https://guangchuangyu.github.io/software/clusterProfiler
##
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:purrr':
##
## simplify
require(org.Hs.eg.db)
## Loading required package: org.Hs.eg.db
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unsplit, which,
## which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## The following object is masked from 'package:grDevices':
##
## windows
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
require(enrichplot)
## Loading required package: enrichplot
### GSEA
## DEG_3读入所有基因
DEG_3<-read.csv("F:/Bioinfor_project/Breast/AS_research/AS/result/TNBC_mRNA_edgeR.csv",row.names = 1,header = TRUE)
DEG_3<-DEG_3 %>%
rownames_to_column("genename")
dim(DEG_3)
## [1] 24166 6
head(DEG_3)
## genename logFC logCPM F PValue FDR
## 1 CA4 -2.081217 5.566371 581.0757 5.922684e-65 1.431276e-60
## 2 VEGFD -1.208131 5.835019 720.0615 1.387734e-64 1.676799e-60
## 3 CD300LG -1.353763 5.918397 502.2992 7.848027e-60 6.321847e-56
## 4 HSD17B13 -1.472940 5.460879 481.1757 2.303186e-58 1.248068e-54
## 5 MYOC -2.603188 5.361846 480.4715 2.582281e-58 1.248068e-54
## 6 ATP1A2 -1.401117 5.864584 474.6783 6.645676e-58 2.676657e-54
genelist<-DEG_3$genename
## ID转换
genelist<- genelist %>% bitr(fromType = "SYMBOL",
toType = c("ENSEMBL", "ENTREZID"),
OrgDb = org.Hs.eg.db)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(., fromType = "SYMBOL", toType = c("ENSEMBL",
## "ENTREZID"), : 2.58% of input gene IDs are fail to map...
head(genelist)
## SYMBOL ENSEMBL ENTREZID
## 1 CA4 ENSG00000167434 762
## 2 VEGFD ENSG00000165197 2277
## 3 CD300LG ENSG00000161649 146894
## 4 HSD17B13 ENSG00000170509 345275
## 5 MYOC ENSG00000034971 4653
## 6 ATP1A2 ENSG00000018625 477
dim(genelist)## 1689
## [1] 26062 3
colnames(genelist)[1]<-"genename"
DEG_3<-genelist %>%
inner_join(DEG_3,by='genename') %>%
## select配合everything排序把,改变变量顺序
dplyr::select(ENTREZID,logFC,everything())
head(DEG_3)
## ENTREZID logFC genename ENSEMBL logCPM F
## 1 762 -2.081217 CA4 ENSG00000167434 5.566371 581.0757
## 2 2277 -1.208131 VEGFD ENSG00000165197 5.835019 720.0615
## 3 146894 -1.353763 CD300LG ENSG00000161649 5.918397 502.2992
## 4 345275 -1.472940 HSD17B13 ENSG00000170509 5.460879 481.1757
## 5 4653 -2.603188 MYOC ENSG00000034971 5.361846 480.4715
## 6 477 -1.401117 ATP1A2 ENSG00000018625 5.864584 474.6783
## PValue FDR
## 1 5.922684e-65 1.431276e-60
## 2 1.387734e-64 1.676799e-60
## 3 7.848027e-60 6.321847e-56
## 4 2.303186e-58 1.248068e-54
## 5 2.582281e-58 1.248068e-54
## 6 6.645676e-58 2.676657e-54
### 构建genelist
geneList<-DEG_3$logFC
names(geneList)<-as.character(DEG_3$ENTREZID)
geneList<-geneList[!duplicated(names(geneList))]
geneList<-sort(geneList,decreasing=TRUE)
head(geneList)##构建geneList对象成功
## 100038246 30012 158511 106146148 53940 3651
## 3.886999 3.770181 3.593276 3.579486 3.425571 3.283401
require(enrichplot)
library(msigdbr)
## Warning: package 'msigdbr' was built under R version 3.6.1
msigdbr_show_species()
## [1] "Bos taurus" "Caenorhabditis elegans"
## [3] "Canis lupus familiaris" "Danio rerio"
## [5] "Drosophila melanogaster" "Gallus gallus"
## [7] "Homo sapiens" "Mus musculus"
## [9] "Rattus norvegicus" "Saccharomyces cerevisiae"
## [11] "Sus scrofa"
## 获取所有人类genesets
#m_df <- msigdbr(species = "Homo sapiens")
#head(m_df, 2) %>% as.data.frame
## 获取感兴趣的genesets
## Msigdbr包括H:hallmark genes, C1-C7等
m_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>%
dplyr::select(gs_name, entrez_gene)
head(m_t2g)
## # A tibble: 6 x 2
## gs_name entrez_gene
## <chr> <int>
## 1 HALLMARK_ADIPOGENESIS 19
## 2 HALLMARK_ADIPOGENESIS 11194
## 3 HALLMARK_ADIPOGENESIS 10449
## 4 HALLMARK_ADIPOGENESIS 33
## 5 HALLMARK_ADIPOGENESIS 34
## 6 HALLMARK_ADIPOGENESIS 35
##
em <- GSEA(geneList,nPerm = 10000 ,TERM2GENE = m_t2g,pvalueCutoff = 0.05,verbose = FALSE)
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize = minGSSize, : There are ties in the preranked stats (0.01% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
em[1:5,1:5]
## ID Description
## HALLMARK_MYC_TARGETS_V1 HALLMARK_MYC_TARGETS_V1 HALLMARK_MYC_TARGETS_V1
## HALLMARK_E2F_TARGETS HALLMARK_E2F_TARGETS HALLMARK_E2F_TARGETS
## HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT
## HALLMARK_MITOTIC_SPINDLE HALLMARK_MITOTIC_SPINDLE HALLMARK_MITOTIC_SPINDLE
## HALLMARK_MYC_TARGETS_V2 HALLMARK_MYC_TARGETS_V2 HALLMARK_MYC_TARGETS_V2
## setSize enrichmentScore NES
## HALLMARK_MYC_TARGETS_V1 195 0.5475417 1.847455
## HALLMARK_E2F_TARGETS 191 0.6713205 2.258858
## HALLMARK_G2M_CHECKPOINT 192 0.6568366 2.211457
## HALLMARK_MITOTIC_SPINDLE 199 0.4619989 1.562069
## HALLMARK_MYC_TARGETS_V2 57 0.6113801 1.730700
#write.csv(em,"F:/Bioinfor_project/Breast/AS_research/AS/result/GSEA_hallmark_enrich.csv")
## 整体GSEA分析富集情况
dotplot(em,showCategory=12,split=".sign")+facet_grid(~.sign)##
## wrong orderBy parameter; set to default `orderBy = "x"`
#ggsave(filename = "GSEA_dot_plot.pdf",width = 10,height =6)
gseaplot2(em, geneSetID = 1, title = em$ID[1],color="red",pvalue_table=T)
#ggsave(filename = "GSEA_dot_plot_ID1.pdf",width = 8,height =5)
gseaplot2(em, geneSetID = 1:3, title ="Activated Pathway",pvalue_table=T)
#ggsave(filename = "GSEA_dot_plot_Activated.pdf",width = 8,height =5 )
require(cowplot)
## Loading required package: cowplot
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
p<-list()
for (i in c(1:11)){
p[[i]]<-gseaplot2(em, geneSetID = i, title=em$ID[i],color="red",pvalue_table=T)
ggsave(sprintf("GSEAp%s.pdf", i), width = 8, height = 6, onefile = T)##批量保存
p[[i]]
dev.off()
}
p3<-marrangeGrob(p,nrow = 2,ncol=2)##marrangeGrob接受list对象组图,指定ncol,nrow
p3
#ggsave("all_gsea.pdf",width = 14,height = 8,plot = p3)
GSEAplot<-function(x){
gseaplot2(em, geneSetID =x,title=em$ID[x],color="red",subplots = ,
pvalue_table=T)
}
GSEAplot(1)
nrow(em@result)## GSEA富集结果数
## [1] 11
p4<-lapply(c(1:nrow(em@result)), GSEAplot)## 批量绘制
print(p4)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]