Contents

1 Comparing datasets

This small vignette compares two resources for Liver Hepatocellular Carcinoma (LIHC) methylation data: the dataset in curatedTCGAData and the lihc450k dataset.

1.1 Origin

The dataset in curatedTCGAData is the output of a pipeline that begins from the Broad Firehose. It makes its way through RTCGAToolbox and is delivered via ExperimentHub.

The lihc450k dataset was retrieved using the download utility of MethylMix, and missing values were imputed via the K-Nearest Neighbors method.

1.2 Load packages

library(curatedTCGAData)
library(TCGAutils)
# BiocInstaller::biocLite("vjcitn/lihc450k")
library(lihc450k)

2 Loading the data

example("loadLIHC450k")
## 
## lLIHC4> lim = loadLIHC450k()
## found SE 'shell' in cache, will retrieve
## 
## lLIHC4> lim
## class: SummarizedExperiment 
## dim: 396065 429 
## metadata(0):
## assays(1): betas
## rownames(396065): cg00000029 cg00000165 ... rs966367 rs9839873
## rowData names(4): Hybridization.REF sym chr loc
## colnames(429): TCGA-2V-A95S-01A-11D-A36Y-05
##   TCGA-2Y-A9GS-01A-12D-A383-05 ... TCGA-ZS-A9CF-02A-11D-A383-05
##   TCGA-ZS-A9CG-01A-11D-A36Y-05
## colData names(0):
## 
## lLIHC4> assay(lim)
## <396065 x 429> DelayedMatrix object of type "double":
##            TCGA-2V-A95S-01A-11D-A36Y-05 ... TCGA-ZS-A9CG-01A-11D-A36Y-05
## cg00000029                    0.4681022   .                    0.6463057
## cg00000165                    0.3131972   .                    0.1056761
## cg00000236                    0.8809183   .                    0.8645760
## cg00000289                    0.5141503   .                    0.5573469
## cg00000292                    0.8566222   .                    0.7202898
##        ...                            .   .                            .
##  rs9363764                   0.54155102   .                   0.06610568
##   rs939290                   0.56285069   .                   0.26734545
##   rs951295                   0.49516651   .                   0.94955461
##   rs966367                   0.04857567   .                   0.53159356
##  rs9839873                   0.90184986   .                   0.85871264
## 
## lLIHC4> assay(lim)[1,429]
## <1 x 1> DelayedMatrix object of type "double":
##            TCGA-ZS-A9CG-01A-11D-A36Y-05
## cg00000029                    0.6463057
## 
## lLIHC4> # 'confirm' with restfulSE interface to ISB-CGC
## lLIHC4> # cgcConnection() %>% tbl("DNA_Methylation_chr16") %>% 
## lLIHC4> #    filter( ParticipantBarcode=="TCGA-ZS-A9CG", Probe_Id == "cg00000029") %>% 
## lLIHC4> #    glimpse()
## lLIHC4> # Observations: ??
## lLIHC4> # Running job \:  1s:18.9 gigabytes processed
## lLIHC4> # Variables: 8                                                                  
## lLIHC4> # $ ParticipantBarcode   <chr> "TCGA-ZS-A9CG"
## lLIHC4> # $ SampleBarcode        <chr> "TCGA-ZS-A9CG-01A"
## lLIHC4> # $ SampleTypeLetterCode <chr> "TP"
## lLIHC4> # $ AliquotBarcode       <chr> "TCGA-ZS-A9CG-01A-11D-A36Y-05"
## lLIHC4> # $ Platform             <chr> "HumanMethylation450"
## lLIHC4> # $ Study                <chr> "LIHC"
## lLIHC4> # $ Probe_Id             <chr> "cg00000029"
## lLIHC4> # $ Beta_Value           <dbl> 0.65
## lLIHC4> #
## lLIHC4> 
## lLIHC4> 
## lLIHC4>
system.time(lihc <- curatedTCGAData("LIHC", "Methyl*", FALSE))
## snapshotDate(): 2018-03-16
## snapshotDate(): 2018-03-16
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache 
##     '/home/mramos//.ExperimentHub/772'
## snapshotDate(): 2018-03-16
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache 
##     '/home/mramos//.ExperimentHub/768'
## snapshotDate(): 2018-03-16
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache 
##     '/home/mramos//.ExperimentHub/771'
## snapshotDate(): 2018-03-16
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache 
##     '/home/mramos//.ExperimentHub/778'
## harmonizing input:
##   removing 3515 sampleMap rows not in names(experiments)
##    user  system elapsed 
## 378.900  35.744 415.296

3 Dimnames

3.1 Check if all column names are the same

all.equal(sort(colnames(lim)), sort(colnames(lihc)[[1]]))
## [1] TRUE

3.2 Check if all row names are the same

all.equal(sort(rownames(lim)), sort(rownames(lihc)[[1]]))
## [1] "Lengths (396065, 485577) differ (string compare on first 396065)"
## [2] "396064 string mismatches"

3.3 Remote HDF5 data is missing 89512 rows

setdiff(rownames(lihc)[[1]], rownames(lim))[1:42]
##     [1] "cg00000108"       "cg00000109"       "cg00000807"      
##     [4] "cg00000884"       "cg00001099"       "cg00001269"      
##     [7] "cg00001534"       "cg00001593"       "cg00001594"      
##    [10] "cg00001930"       "cg00002028"       "cg00002080"      
##    [13] "cg00002473"       "cg00003014"       "cg00003187"      
##    [16] "cg00003722"       "cg00003818"       "cg00003858"      
##    [19] "cg00003900"       "cg00003965"       "cg00004073"      
##    [22] "cg00004219"       "cg00004667"       "cg00004700"      
##    [25] "cg00004771"       "cg00004775"       "cg00004806"      
##    [28] "cg00004859"       "cg00005164"       "cg00005541"      
##    [31] "cg00005622"       "cg00006090"       "cg00006414"      
##    [34] "cg00006867"       "cg00007540"       "cg00007800"      
##    [37] "cg00007810"       "cg00008033"       "cg00008621"      
##    [40] "cg00008819"       "cg00009088"       "cg00009306"      

3.4 colnames in the same order

identical(order(colnames(lim)), order(colnames(lihc)[[1]]))
## [1] TRUE

4 Compare and Subset rows

4.1 Find rows in both datasets

rowinboth <- intersect(rownames(lim), rownames(lihc)[[1]])

4.2 subset to only rows in both datasets

lihc <- lihc[rownames(lihc)[[1]] %in% rowinboth, ]

4.3 Extract from MultiAssayExperiment

  • represented as SummarizedExperiment with a matrix assay
lihc1 <- lihc[[1]]

5 Internal assay representation

5.1 curatedTCGAData: matrix of type “character”

  • Here we convert matrix to numeric:
lihcassay <- assay(lihc1, 1L)
mode(lihcassay) <- "numeric"

5.2 Remote HDF5 data: DelayedMatrix of type “double”

limassay <- assay(lim, 1L)
class(limassay)
## [1] "DelayedMatrix"
## attr(,"package")
## [1] "DelayedArray"

6 Actual values

6.1 data is more or less the same

(At least for the first 10 by 10 values!)

all.equal(lihcassay[1:10, 1:10], as.matrix(limassay[1:10, 1:10]))
## [1] TRUE

6.2 More missing values are present in the curatedTCGAData dataset

all.equal(lihcassay[40:100, 20:30], as.matrix(limassay[40:100, 20:30]))
## [1] "'is.NA' value mismatch: 0 in current 1 in target"

6.3 Finding NAs

  • This runs ok:
namae <- rowSums(is.na(lihcassay[1:10, 1:10]))
  • This throws an error:
nalim <- rowSums(is.na(limassay[1:10, 1:10]))
Error: BiocParallel errors
element index: 2, 3, 4, 5, 6, 7, ...
first error: Failed to connect to 52.4.181.237 port 5101: Connection refused
  • Realize the data first:
nalim <- rowSums(is.na(as.matrix(limassay[1:10, 1:10])))

6.4 Comparing metadata

  • Remote HDF5: rowData contains Hybridization.REF
rowData(lim)
## DataFrame with 396065 rows and 4 columns
##        Hybridization.REF         sym         chr       loc
##              <character> <character> <character> <numeric>
## 1             cg00000029        RBL2          16  53468112
## 2             cg00000165          NA           1  91194674
## 3             cg00000236       VDAC3           8  42263294
## 4             cg00000289       ACTN1          14  69341139
## 5             cg00000292      ATP2A1          16  28890100
## ...                  ...         ...         ...       ...
## 396061         rs9363764          NA          NA         0
## 396062          rs939290          NA          NA         0
## 396063          rs951295          NA          NA         0
## 396064          rs966367          NA          NA         0
## 396065         rs9839873          NA          NA         0
  • curatedTCGAData: more standardized rowData colnames
rowData(lihc[[1]])
## DataFrame with 396065 rows and 3 columns
##        Gene_Symbol  Chromosome Genomic_Coordinate
##        <character> <character>        <character>
## 1             RBL2          16           53468112
## 2               NA           1           91194674
## 3            VDAC3           8           42263294
## 4            ACTN1          14           69341139
## 5           ATP2A1          16           28890100
## ...            ...         ...                ...
## 396061          NA          NA                  0
## 396062          NA          NA                  0
## 396063          NA          NA                  0
## 396064          NA          NA                  0
## 396065          NA          NA                  0

6.5 Session Info

sessionInfo()
## R Under development (unstable) (2018-02-05 r74214)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/atlas-base/atlas/libblas.so.3.0
## LAPACK: /usr/lib/atlas-base/atlas/liblapack.so.3.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] BiocStyle_2.7.8             knitr_1.20                 
##  [3] markdown_0.8                bindrcpp_0.2.2             
##  [5] lihc450k_0.99.3             BiocFileCache_1.3.40       
##  [7] dbplyr_1.2.1                SummarizedExperiment_1.9.16
##  [9] DelayedArray_0.5.23         BiocParallel_1.13.3        
## [11] matrixStats_0.53.1          Biobase_2.39.2             
## [13] GenomicRanges_1.31.23       GenomeInfoDb_1.15.5        
## [15] IRanges_2.13.28             S4Vectors_0.17.38          
## [17] BiocGenerics_0.25.3         TCGAutils_0.99.55          
## [19] curatedTCGAData_1.1.12      MultiAssayExperiment_1.5.98
## [21] nvimcom_0.9-41             
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6                  bigrquery_0.4.1              
##  [3] bit64_0.9-7                   httr_1.3.1                   
##  [5] rprojroot_1.3-2               GenomicDataCommons_1.3.4     
##  [7] backports_1.1.2               tools_3.5.0                  
##  [9] R6_2.2.2                      HDF5Array_1.7.9              
## [11] DBI_0.8                       lazyeval_0.2.1               
## [13] bit_1.1-12                    curl_3.2                     
## [15] compiler_3.5.0                rvest_0.3.2                  
## [17] xml2_1.2.0                    bookdown_0.7                 
## [19] restfulSE_1.1.8               readr_1.1.1                  
## [21] rappdirs_0.3.1                RCircos_1.2.0                
## [23] stringr_1.3.0                 digest_0.6.15                
## [25] rmarkdown_1.9                 XVector_0.19.9               
## [27] pkgconfig_2.0.1               htmltools_0.3.6              
## [29] limma_3.35.14                 rlang_0.2.0                  
## [31] RSQLite_2.0                   BiocInstaller_1.29.4         
## [33] shiny_1.0.5                   bindr_0.1.1                  
## [35] jsonlite_1.5                  dplyr_0.7.4                  
## [37] RCurl_1.95-4.10               magrittr_1.5                 
## [39] GO.db_3.5.0                   GenomeInfoDbData_1.1.0       
## [41] Matrix_1.2-12                 Rcpp_0.12.16                 
## [43] Rhdf5lib_1.1.5                stringi_1.1.7                
## [45] yaml_2.1.18                   RaggedExperiment_1.3.12      
## [47] RJSONIO_1.3-0                 zlibbioc_1.25.0              
## [49] rhdf5_2.23.5                  plyr_1.8.4                   
## [51] AnnotationHub_2.11.2          grid_3.5.0                   
## [53] blob_1.1.1                    ExperimentHub_1.5.2          
## [55] lattice_0.20-35               splines_3.5.0                
## [57] rhdf5client_1.1.10            hms_0.4.2                    
## [59] pillar_1.2.1                  rjson_0.2.15                 
## [61] codetools_0.2-15              reshape2_1.4.3               
## [63] XML_3.98-1.10                 glue_1.2.0                   
## [65] evaluate_0.10.1               data.table_1.10.4-3          
## [67] httpuv_1.3.6.2                purrr_0.2.4                  
## [69] tidyr_0.8.0                   assertthat_0.2.0             
## [71] xfun_0.1                      mime_0.5                     
## [73] xtable_1.8-2                  survival_2.41-3              
## [75] tibble_1.4.2                  RTCGAToolbox_2.9.38          
## [77] AnnotationDbi_1.41.4          memoise_1.1.0                
## [79] interactiveDisplayBase_1.17.0