GSEA

从差异基因开始的GSEA分析 注意此时应纳入所有的基因,但logFC已经计算完成

require(tidyverse)
## Loading required package: tidyverse
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.2.0     √ purrr   0.3.2
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   0.8.3     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## Warning: package 'dplyr' was built under R version 3.6.1
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
require(ggplot2)
require(clusterProfiler)
## Loading required package: clusterProfiler
## 
## Registered S3 method overwritten by 'enrichplot':
##   method               from
##   fortify.enrichResult DOSE
## clusterProfiler v3.12.0  For help: https://guangchuangyu.github.io/software/clusterProfiler
## 
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:purrr':
## 
##     simplify
require(org.Hs.eg.db)
## Loading required package: org.Hs.eg.db
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## The following object is masked from 'package:grDevices':
## 
##     windows
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
require(enrichplot)
## Loading required package: enrichplot
### GSEA
## DEG_3读入所有基因
DEG_3<-read.csv("F:/Bioinfor_project/Breast/AS_research/AS/result/TNBC_mRNA_edgeR.csv",row.names = 1,header = TRUE)
DEG_3<-DEG_3 %>% 
  rownames_to_column("genename") 
dim(DEG_3)
## [1] 24166     6
head(DEG_3)
##   genename     logFC   logCPM        F       PValue          FDR
## 1      CA4 -2.081217 5.566371 581.0757 5.922684e-65 1.431276e-60
## 2    VEGFD -1.208131 5.835019 720.0615 1.387734e-64 1.676799e-60
## 3  CD300LG -1.353763 5.918397 502.2992 7.848027e-60 6.321847e-56
## 4 HSD17B13 -1.472940 5.460879 481.1757 2.303186e-58 1.248068e-54
## 5     MYOC -2.603188 5.361846 480.4715 2.582281e-58 1.248068e-54
## 6   ATP1A2 -1.401117 5.864584 474.6783 6.645676e-58 2.676657e-54
genelist<-DEG_3$genename

## ID转换
genelist<- genelist %>%  bitr(fromType = "SYMBOL",
        toType = c("ENSEMBL", "ENTREZID"),
        OrgDb = org.Hs.eg.db)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(., fromType = "SYMBOL", toType = c("ENSEMBL",
## "ENTREZID"), : 2.58% of input gene IDs are fail to map...
head(genelist)
##     SYMBOL         ENSEMBL ENTREZID
## 1      CA4 ENSG00000167434      762
## 2    VEGFD ENSG00000165197     2277
## 3  CD300LG ENSG00000161649   146894
## 4 HSD17B13 ENSG00000170509   345275
## 5     MYOC ENSG00000034971     4653
## 6   ATP1A2 ENSG00000018625      477
dim(genelist)## 1689
## [1] 26062     3
colnames(genelist)[1]<-"genename"
DEG_3<-genelist %>%
  inner_join(DEG_3,by='genename') %>% 
  ## select配合everything排序把,改变变量顺序
  dplyr::select(ENTREZID,logFC,everything())
head(DEG_3)
##   ENTREZID     logFC genename         ENSEMBL   logCPM        F
## 1      762 -2.081217      CA4 ENSG00000167434 5.566371 581.0757
## 2     2277 -1.208131    VEGFD ENSG00000165197 5.835019 720.0615
## 3   146894 -1.353763  CD300LG ENSG00000161649 5.918397 502.2992
## 4   345275 -1.472940 HSD17B13 ENSG00000170509 5.460879 481.1757
## 5     4653 -2.603188     MYOC ENSG00000034971 5.361846 480.4715
## 6      477 -1.401117   ATP1A2 ENSG00000018625 5.864584 474.6783
##         PValue          FDR
## 1 5.922684e-65 1.431276e-60
## 2 1.387734e-64 1.676799e-60
## 3 7.848027e-60 6.321847e-56
## 4 2.303186e-58 1.248068e-54
## 5 2.582281e-58 1.248068e-54
## 6 6.645676e-58 2.676657e-54
### 构建genelist  
geneList<-DEG_3$logFC
names(geneList)<-as.character(DEG_3$ENTREZID)
geneList<-geneList[!duplicated(names(geneList))]
geneList<-sort(geneList,decreasing=TRUE)
head(geneList)##构建geneList对象成功
## 100038246     30012    158511 106146148     53940      3651 
##  3.886999  3.770181  3.593276  3.579486  3.425571  3.283401

Msigdbr

require(enrichplot)
library(msigdbr)
## Warning: package 'msigdbr' was built under R version 3.6.1
msigdbr_show_species()
##  [1] "Bos taurus"               "Caenorhabditis elegans"  
##  [3] "Canis lupus familiaris"   "Danio rerio"             
##  [5] "Drosophila melanogaster"  "Gallus gallus"           
##  [7] "Homo sapiens"             "Mus musculus"            
##  [9] "Rattus norvegicus"        "Saccharomyces cerevisiae"
## [11] "Sus scrofa"
## 获取所有人类genesets
#m_df <- msigdbr(species = "Homo sapiens")
#head(m_df, 2) %>% as.data.frame

## 获取感兴趣的genesets
## Msigdbr包括H:hallmark genes, C1-C7等
m_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>% 
  dplyr::select(gs_name, entrez_gene)
head(m_t2g)
## # A tibble: 6 x 2
##   gs_name               entrez_gene
##   <chr>                       <int>
## 1 HALLMARK_ADIPOGENESIS          19
## 2 HALLMARK_ADIPOGENESIS       11194
## 3 HALLMARK_ADIPOGENESIS       10449
## 4 HALLMARK_ADIPOGENESIS          33
## 5 HALLMARK_ADIPOGENESIS          34
## 6 HALLMARK_ADIPOGENESIS          35
##
em <- GSEA(geneList,nPerm = 10000 ,TERM2GENE = m_t2g,pvalueCutoff = 0.05,verbose = FALSE)
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize = minGSSize, : There are ties in the preranked stats (0.01% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
em[1:5,1:5]
##                                                ID              Description
## HALLMARK_MYC_TARGETS_V1   HALLMARK_MYC_TARGETS_V1  HALLMARK_MYC_TARGETS_V1
## HALLMARK_E2F_TARGETS         HALLMARK_E2F_TARGETS     HALLMARK_E2F_TARGETS
## HALLMARK_G2M_CHECKPOINT   HALLMARK_G2M_CHECKPOINT  HALLMARK_G2M_CHECKPOINT
## HALLMARK_MITOTIC_SPINDLE HALLMARK_MITOTIC_SPINDLE HALLMARK_MITOTIC_SPINDLE
## HALLMARK_MYC_TARGETS_V2   HALLMARK_MYC_TARGETS_V2  HALLMARK_MYC_TARGETS_V2
##                          setSize enrichmentScore      NES
## HALLMARK_MYC_TARGETS_V1      195       0.5475417 1.847455
## HALLMARK_E2F_TARGETS         191       0.6713205 2.258858
## HALLMARK_G2M_CHECKPOINT      192       0.6568366 2.211457
## HALLMARK_MITOTIC_SPINDLE     199       0.4619989 1.562069
## HALLMARK_MYC_TARGETS_V2       57       0.6113801 1.730700
#write.csv(em,"F:/Bioinfor_project/Breast/AS_research/AS/result/GSEA_hallmark_enrich.csv")

## 整体GSEA分析富集情况
dotplot(em,showCategory=12,split=".sign")+facet_grid(~.sign)##
## wrong orderBy parameter; set to default `orderBy = "x"`

#ggsave(filename = "GSEA_dot_plot.pdf",width = 10,height =6)
gseaplot2(em, geneSetID = 1, title = em$ID[1],color="red",pvalue_table=T)

#ggsave(filename = "GSEA_dot_plot_ID1.pdf",width = 8,height =5)
gseaplot2(em, geneSetID = 1:3, title ="Activated Pathway",pvalue_table=T)

#ggsave(filename = "GSEA_dot_plot_Activated.pdf",width = 8,height =5 )

批量绘图-保存为单个文件

require(cowplot)
## Loading required package: cowplot
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
p<-list()
for (i in c(1:11)){
  p[[i]]<-gseaplot2(em, geneSetID = i, title=em$ID[i],color="red",pvalue_table=T)
  ggsave(sprintf("GSEAp%s.pdf", i), width = 8, height = 6, onefile = T)##批量保存
  p[[i]]
  dev.off()
}
p3<-marrangeGrob(p,nrow = 2,ncol=2)##marrangeGrob接受list对象组图,指定ncol,nrow
p3
#ggsave("all_gsea.pdf",width = 14,height = 8,plot = p3)

lapply批量出图

GSEAplot<-function(x){
    gseaplot2(em, geneSetID =x,title=em$ID[x],color="red",subplots = ,
              pvalue_table=T)
}
GSEAplot(1)

nrow(em@result)## GSEA富集结果数
## [1] 11
p4<-lapply(c(1:nrow(em@result)), GSEAplot)## 批量绘制
print(p4)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]