1 GDCRNATools-TCGA数据挖掘

1.1 基本介绍

  • TCGA ceRNA网络构建神器-几乎要终结那个著名的ceRNA套路了
  • 这是一个用户友好的R/Bioconductor软件包,可用于下载,组织和分析GDC中的RNA数据,亮点功能是能分析ceRNA调控网络。我们的软件包中使用了许多广泛使用的生物信息学工具和数据库。用户可以轻易的集成到自己的工作流程中。

1.2 GDCRNATools主要分析功能

  1. 基于limma,edgeR,DESeq2的差异分析
  2. CoxPH and KM单因素生存分析
  3. ceRNA网络分析
  4. GO,KEGG,DO的功能富集分析

1.2.1 工作目录

temp<-("D:/R/GDCRNATools")
setwd(temp)

1.2.2 install pacakge

## try http:// if https:// URLs are not supported
if (!requireNamespace("BiocManager", quietly=TRUE))
    install.packages("BiocManager")
library(GDCRNATools)
## 
## 
## Registered S3 method overwritten by 'enrichplot':
##   method               from
##   fortify.enrichResult DOSE
## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
## formally cite the original Pathview paper (not just mention it) in publications
## or products. For details, do citation("pathview") within R.
## 
## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
## license agreement (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################

1.3 简单案例分析-牛刀小试

1.3.1 count data 标准化

library(DT)

### load RNA counts data
data(rnaCounts)
rnaCounts[1:5,1:5]
##                 TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
## ENSG00000003989            1520             960            2177
## ENSG00000004799            7659             957            2295
## ENSG00000005812            2246            1698            2454
## ENSG00000007908             214              39              30
## ENSG00000011105            5246            1664            4028
##                 TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
## ENSG00000003989             454            1211
## ENSG00000004799            9393             908
## ENSG00000005812            2377            1561
## ENSG00000007908             237              51
## ENSG00000011105            1310            1971
dim(rnaCounts)
## [1] 1000   45
### load miRNAs counts data
data(mirCounts)
mirCounts[1:5,1:5]
##                 TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
## hsa-let-7a-5p            165141          132094          210259
## hsa-let-7a-3p               204             169             298
## hsa-let-7a-2-3p              30              26              50
## hsa-let-7b-5p             63905           38917          138022
## hsa-let-7b-3p                50              60             138
##                 TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
## hsa-let-7a-5p             84663          117897
## hsa-let-7a-3p               127             149
## hsa-let-7a-2-3p               1              39
## hsa-let-7b-5p             55664          101545
## hsa-let-7b-3p                65             125
####### Normalization of RNAseq data #######
rnaExpr <- gdcVoomNormalization(counts = rnaCounts, filter = FALSE)
rnaExpr[1:5,1:5]
##                 TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
## ENSG00000003989       10.435492       10.007138       10.990261
## ENSG00000004799       12.768196       10.002625       11.066397
## ENSG00000005812       10.998625       10.829542       11.163017
## ENSG00000007908        7.609995        5.403277        4.832541
## ENSG00000011105       12.222301       10.800370       11.877831
##                 TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
## ENSG00000003989        8.859192       10.094094
## ENSG00000004799       13.228502        9.678858
## ENSG00000005812       11.246285       10.460233
## ENSG00000007908        7.922839        5.538016
## ENSG00000011105       10.386957       10.796594
####### Normalization of miRNAs data #######
mirExpr <- gdcVoomNormalization(counts = mirCounts, filter = FALSE)
mirExpr[1:5,1:5]
##                 TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
## hsa-let-7a-5p         14.781661       14.850358       15.066842
## hsa-let-7a-3p          5.124275        5.244289        5.606618
## hsa-let-7a-2-3p        2.379055        2.567068        3.043242
## hsa-let-7b-5p         13.411970       13.087279       14.459575
## hsa-let-7b-3p          3.106529        3.758011        4.498773
##                 TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
## hsa-let-7a-5p         13.937813       14.653520
## hsa-let-7a-3p          4.562713        5.030348
## hsa-let-7a-2-3p       -1.846678        3.110128
## hsa-let-7b-5p         13.332830       14.438114
## hsa-let-7b-3p          3.601783        4.777890

1.3.2 解析metadata

####### Parse and filter RNAseq metadata #######
metaMatrix.RNA <- gdcParseMetadata(project.id = 'TCGA-CHOL',
                                   data.type  = 'RNAseq', 
                                   write.meta = FALSE)
metaMatrix.RNA[1:5,1:5]
## # A tibble: 5 x 5
##   file_name               file_id           patient  sample   submitter_id 
##   <chr>                   <chr>             <chr>    <chr>    <chr>        
## 1 725eaa94-5221-4c22-bce~ 85bc7f81-51fb-44~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAV9~
## 2 b6a2c03a-c8ad-41e9-8a1~ 42b8d463-6209-4e~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVA~
## 3 c2765336-c804-4fd2-b45~ 6e2031e9-df75-48~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVB~
## 4 8b20cba8-9fd5-4d56-bd0~ 19e8fd21-f6c8-49~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVC~
## 5 4082f7d5-5656-476a-9aa~ 1ace0df3-9837-46~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVE~
colnames(metaMatrix.RNA)
##  [1] "file_name"              "file_id"               
##  [3] "patient"                "sample"                
##  [5] "submitter_id"           "entity_submitter_id"   
##  [7] "sample_type"            "gender"                
##  [9] "age_at_diagnosis"       "tumor_stage"           
## [11] "tumor_grade"            "days_to_death"         
## [13] "days_to_last_follow_up" "vital_status"          
## [15] "project_id"
## metadata过滤
metaMatrix.RNA <- gdcFilterDuplicate(metaMatrix.RNA)
## Removed 0 samples
metaMatrix.RNA <- gdcFilterSampleType(metaMatrix.RNA)
## Removed 0 samples
metaMatrix.RNA[1:5,1:5]
## # A tibble: 5 x 5
##   file_name               file_id           patient  sample   submitter_id 
##   <chr>                   <chr>             <chr>    <chr>    <chr>        
## 1 725eaa94-5221-4c22-bce~ 85bc7f81-51fb-44~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAV9~
## 2 b6a2c03a-c8ad-41e9-8a1~ 42b8d463-6209-4e~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVA~
## 3 c2765336-c804-4fd2-b45~ 6e2031e9-df75-48~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVB~
## 4 8b20cba8-9fd5-4d56-bd0~ 19e8fd21-f6c8-49~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVC~
## 5 4082f7d5-5656-476a-9aa~ 1ace0df3-9837-46~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVE~
table(metaMatrix.RNA$sample_type)
## 
##      PrimaryTumor SolidTissueNormal 
##                36                 9

1.4 ceRNA网络分析

1.4.1 筛选差异mRNA,miRNA,lncRNA

DEGAll <- gdcDEAnalysis(counts     = rnaCounts, # rnacounts数据
                        group      = metaMatrix.RNA$sample_type, # 分组
                        comparison = 'PrimaryTumor-SolidTissueNormal', # 比较
                        method     = 'limma')
## 展示所有基因
DEGAll[1:5,1:5]
## # A tibble: 5 x 5
##   symbol group          logFC AveExpr     t
##   <fct>  <fct>          <dbl>   <dbl> <dbl>
## 1 NR1I3  protein_coding -6.92    7.02 -17.3
## 2 ETFRF1 protein_coding -2.49    9.52 -16.1
## 3 SOX5   protein_coding -4.87    6.23 -15.0
## 4 ABCA8  protein_coding -5.65    7.52 -14.9
## 5 ISOC1  protein_coding -2.37   10.5  -14.6
dim(DEGAll)
## [1] 565   8
table(DEGAll$group)
## 
##              IG long_non_coding           ncRNA  protein_coding 
##               2             101               8             403 
##      pseudogene             TEC              TR 
##              45               4               2
range(DEGAll$logFC)
## [1] -10.855558   5.919461
### All DEGs
deALL <- gdcDEReport(deg = DEGAll, gene.type = 'all')
dim(deALL)
## [1] 283   8
### DE long-noncoding
deLNC <- gdcDEReport(deg = DEGAll, gene.type = 'long_non_coding')

### DE protein coding genes
dePC <- gdcDEReport(deg = DEGAll, gene.type = 'protein_coding')

1.4.2 构建差异基因ceRNA网络

ceOutput <- gdcCEAnalysis(lnc         = rownames(deLNC), # lncRNA
                          pc          = rownames(dePC), # mRNA
                          lnc.targets = 'starBase', 
                          pc.targets  = 'starBase', 
                          rna.expr    = rnaExpr, 
                          mir.expr    = mirExpr)
## Step 1/3: Hypergenometric test done !
## Step 2/3: Correlation analysis done !
## Step 3/3: Regulation pattern analysis done !
dim(ceOutput)
## [1] 453  13
ceOutput[1:5,1:13]
## # A tibble: 5 x 13
##   lncRNAs Genes Counts listTotal popHits popTotal foldEnrichment
##   <chr>   <chr> <chr>  <chr>     <chr>   <chr>    <chr>         
## 1 ENSG00~ ENSG~ 2      2         95      277      2.91578947368~
## 2 ENSG00~ ENSG~ 2      2         24      277      11.5416666666~
## 3 ENSG00~ ENSG~ 2      2         8       277      34.625        
## 4 ENSG00~ ENSG~ 2      2         20      277      13.85         
## 5 ENSG00~ ENSG~ 2      2         28      277      9.89285714285~
## # ... with 6 more variables: hyperPValue <chr>, miRNAs <chr>, cor <dbl>,
## #   corPValue <dbl>, regSim <dbl>, sppc <dbl>

1.4.3 导出ceRNA网络到cytoscape

ceOutput2 <- ceOutput[ceOutput$hyperPValue<0.01 ## hyperPvalue少选
    & ceOutput$corPValue<0.01 & ceOutput$regSim != 0,] ## corPvalue筛选

## 导出edges
edges <- gdcExportNetwork(ceNetwork = ceOutput2, net = 'edges')
str(edges)
## 'data.frame':    452 obs. of  3 variables:
##  $ fromNode    : chr  "ENSG00000234456" "ENSG00000234456" "ENSG00000234741" "ENSG00000255717" ...
##  $ toNode      : chr  "hsa-miR-374b-5p" "hsa-miR-374a-5p" "hsa-miR-137" "hsa-miR-377-3p" ...
##  $ altNode1Name: chr  "MAGI2-AS3" "MAGI2-AS3" "GAS5" "SNHG1" ...
head(edges)
## # A tibble: 6 x 3
##   fromNode        toNode          altNode1Name
##   <chr>           <chr>           <chr>       
## 1 ENSG00000234456 hsa-miR-374b-5p MAGI2-AS3   
## 2 ENSG00000234456 hsa-miR-374a-5p MAGI2-AS3   
## 3 ENSG00000234741 hsa-miR-137     GAS5        
## 4 ENSG00000255717 hsa-miR-377-3p  SNHG1       
## 5 ENSG00000255717 hsa-miR-421     SNHG1       
## 6 ENSG00000255717 hsa-miR-326     SNHG1
## 导出nodes
nodes <- gdcExportNetwork(ceNetwork = ceOutput2, net = 'nodes')
str(nodes)
## 'data.frame':    187 obs. of  4 variables:
##  $ gene           : Factor w/ 187 levels "ENSG00000003989",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ symbol         : Factor w/ 187 levels "ABCC5","AC015813.1",..: 162 130 31 154 131 129 124 118 113 15 ...
##  $ type           : Factor w/ 3 levels "lnc","mir","pc": 3 3 3 3 3 3 3 3 3 3 ...
##  $ numInteractions: num  2 5 3 3 3 2 2 2 2 1 ...
head(nodes)
## # A tibble: 6 x 4
##   gene            symbol type  numInteractions
##   <fct>           <fct>  <fct>           <dbl>
## 1 ENSG00000003989 SLC7A2 pc                  2
## 2 ENSG00000004799 PDK4   pc                  5
## 3 ENSG00000021826 CPS1   pc                  3
## 4 ENSG00000047634 SCML1  pc                  3
## 5 ENSG00000049246 PER3   pc                  3
## 6 ENSG00000065320 NTN1   pc                  2