This small vignette compares the resources for LIHC or LIver Hepatocellular Carcinoma.
The dataset in curatedTCGAData is the output of a pipeline that begins from the Broad Firehose. It makes its way through RTCGAToolbox and is delivered via ExperimentHub.
The lihc450k dataset was retrieved using the download utility of MethylMix and missing values were imputed via the K-Nearest Neighbors method.
library(curatedTCGAData)
library(TCGAutils)
# BiocInstaller::biocLite("vjcitn/lihc450k")
library(lihc450k)
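Running the example below loads the `lihc450k` data as `lim` into `.GlobalEnv`; by comparison, loading from `curatedTCGAData` is considerably slower.
example("loadLIHC450k")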
##
## lLIHC4> lim = loadLIHC450k()
## found SE 'shell' in cache, will retrieve
##
## lLIHC4> lim
## class: SummarizedExperiment
## dim: 396065 429
## metadata(0):
## assays(1): betas
## rownames(396065): cg00000029 cg00000165 ... rs966367 rs9839873
## rowData names(4): Hybridization.REF sym chr loc
## colnames(429): TCGA-2V-A95S-01A-11D-A36Y-05
## TCGA-2Y-A9GS-01A-12D-A383-05 ... TCGA-ZS-A9CF-02A-11D-A383-05
## TCGA-ZS-A9CG-01A-11D-A36Y-05
## colData names(0):
##
## lLIHC4> assay(lim)
## <396065 x 429> DelayedMatrix object of type "double":
## TCGA-2V-A95S-01A-11D-A36Y-05 ... TCGA-ZS-A9CG-01A-11D-A36Y-05
## cg00000029 0.4681022 . 0.6463057
## cg00000165 0.3131972 . 0.1056761
## cg00000236 0.8809183 . 0.8645760
## cg00000289 0.5141503 . 0.5573469
## cg00000292 0.8566222 . 0.7202898
## ... . . .
## rs9363764 0.54155102 . 0.06610568
## rs939290 0.56285069 . 0.26734545
## rs951295 0.49516651 . 0.94955461
## rs966367 0.04857567 . 0.53159356
## rs9839873 0.90184986 . 0.85871264
##
## lLIHC4> assay(lim)[1,429]
## <1 x 1> DelayedMatrix object of type "double":
## TCGA-ZS-A9CG-01A-11D-A36Y-05
## cg00000029 0.6463057
##
## lLIHC4> # 'confirm' with restfulSE interface to ISB-CGC
## lLIHC4> # cgcConnection() %>% tbl("DNA_Methylation_chr16") %>%
## lLIHC4> # filter( ParticipantBarcode=="TCGA-ZS-A9CG", Probe_Id == "cg00000029") %>%
## lLIHC4> # glimpse()
## lLIHC4> # Observations: ??
## lLIHC4> # Running job \: 1s:18.9 gigabytes processed
## lLIHC4> # Variables: 8
## lLIHC4> # $ ParticipantBarcode <chr> "TCGA-ZS-A9CG"
## lLIHC4> # $ SampleBarcode <chr> "TCGA-ZS-A9CG-01A"
## lLIHC4> # $ SampleTypeLetterCode <chr> "TP"
## lLIHC4> # $ AliquotBarcode <chr> "TCGA-ZS-A9CG-01A-11D-A36Y-05"
## lLIHC4> # $ Platform <chr> "HumanMethylation450"
## lLIHC4> # $ Study <chr> "LIHC"
## lLIHC4> # $ Probe_Id <chr> "cg00000029"
## lLIHC4> # $ Beta_Value <dbl> 0.65
## lLIHC4> #
## lLIHC4>
## lLIHC4>
## lLIHC4>
# Time the retrieval of the LIHC methylation data from curatedTCGAData and
# keep the result in `lihc`.
# NOTE(review): the third positional argument (FALSE) is presumably `dry.run`,
# i.e. actually load the data rather than only list it -- confirm in
# ?curatedTCGAData.
system.time(lihc <- curatedTCGAData("LIHC", "Methyl*", FALSE))
## snapshotDate(): 2018-03-16
## snapshotDate(): 2018-03-16
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache
## '/home/mramos//.ExperimentHub/772'
## snapshotDate(): 2018-03-16
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache
## '/home/mramos//.ExperimentHub/768'
## snapshotDate(): 2018-03-16
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache
## '/home/mramos//.ExperimentHub/771'
## snapshotDate(): 2018-03-16
## see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
## downloading 0 resources
## loading from cache
## '/home/mramos//.ExperimentHub/778'
## harmonizing input:
## removing 3515 sampleMap rows not in names(experiments)
## user system elapsed
## 378.900 35.744 415.296
all.equal(sort(colnames(lim)), sort(colnames(lihc)[[1]]))
## [1] TRUE
all.equal(sort(rownames(lim)), sort(rownames(lihc)[[1]]))
## [1] "Lengths (396065, 485577) differ (string compare on first 396065)"
## [2] "396064 string mismatches"
setdiff(rownames(lihc)[[1]], rownames(lim))[1:42]
## [1] "cg00000108" "cg00000109" "cg00000807"
## [4] "cg00000884" "cg00001099" "cg00001269"
## [7] "cg00001534" "cg00001593" "cg00001594"
## [10] "cg00001930" "cg00002028" "cg00002080"
## [13] "cg00002473" "cg00003014" "cg00003187"
## [16] "cg00003722" "cg00003818" "cg00003858"
## [19] "cg00003900" "cg00003965" "cg00004073"
## [22] "cg00004219" "cg00004667" "cg00004700"
## [25] "cg00004771" "cg00004775" "cg00004806"
## [28] "cg00004859" "cg00005164" "cg00005541"
## [31] "cg00005622" "cg00006090" "cg00006414"
## [34] "cg00006867" "cg00007540" "cg00007800"
## [37] "cg00007810" "cg00008033" "cg00008621"
## [40] "cg00008819" "cg00009088" "cg00009306"
# Compare the sort permutations of the two sets of column names. Since the
# sorted names were already shown to be equal, identical permutations mean
# the columns appear in the same order in both objects.
identical(order(colnames(lim)), order(colnames(lihc)[[1]]))
## [1] TRUE
# Probe IDs (row names) that occur in both datasets.
rowinboth <- intersect(rownames(lim), rownames(lihc)[[1]])
# Keep only the shared probes; the logical row index is built from the first
# element of rownames(lihc).
lihc <- lihc[rownames(lihc)[[1]] %in% rowinboth, ]
# Pull out the first element of `lihc` and then its first assay.
lihc1 <- lihc[[1]]
lihcassay <- assay(lihc1, 1L)
# Force numeric storage so the values can be compared with those of `lim`.
mode(lihcassay) <- "numeric"
# First assay of `lim`; its class is printed to show how the data is held.
limassay <- assay(lim, 1L)
class(limassay)
## [1] "DelayedMatrix"
## attr(,"package")
## [1] "DelayedArray"
The assay values agree (at least for the first 10 by 10 values!):
all.equal(lihcassay[1:10, 1:10], as.matrix(limassay[1:10, 1:10]))
## [1] TRUE
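However, values differ elsewhere: the `curatedTCGAData` dataset contains missing values (they were imputed in `lihc450k`).
all.equal(lihcassay[40:100, 20:30], as.matrix(limassay[40:100, 20:30]))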
## [1] "'is.NA' value mismatch: 0 in current 1 in target"
namae <- rowSums(is.na(lihcassay[1:10, 1:10]))
nalim <- rowSums(is.na(limassay[1:10, 1:10]))
## Error: BiocParallel errors
## element index: 2, 3, 4, 5, 6, 7, ...
## first error: Failed to connect to 52.4.181.237 port 5101: Connection refused
nalim <- rowSums(is.na(as.matrix(limassay[1:10, 1:10])))
The row data of `lim` additionally carries a `Hybridization.REF` column:
rowData(lim)
## DataFrame with 396065 rows and 4 columns
## Hybridization.REF sym chr loc
## <character> <character> <character> <numeric>
## 1 cg00000029 RBL2 16 53468112
## 2 cg00000165 NA 1 91194674
## 3 cg00000236 VDAC3 8 42263294
## 4 cg00000289 ACTN1 14 69341139
## 5 cg00000292 ATP2A1 16 28890100
## ... ... ... ... ...
## 396061 rs9363764 NA NA 0
## 396062 rs939290 NA NA 0
## 396063 rs951295 NA NA 0
## 396064 rs966367 NA NA 0
## 396065 rs9839873 NA NA 0
rowData(lihc[[1]])
## DataFrame with 396065 rows and 3 columns
## Gene_Symbol Chromosome Genomic_Coordinate
## <character> <character> <character>
## 1 RBL2 16 53468112
## 2 NA 1 91194674
## 3 VDAC3 8 42263294
## 4 ACTN1 14 69341139
## 5 ATP2A1 16 28890100
## ... ... ... ...
## 396061 NA NA 0
## 396062 NA NA 0
## 396063 NA NA 0
## 396064 NA NA 0
## 396065 NA NA 0
sessionInfo()
## R Under development (unstable) (2018-02-05 r74214)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/atlas-base/atlas/libblas.so.3.0
## LAPACK: /usr/lib/atlas-base/atlas/liblapack.so.3.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] BiocStyle_2.7.8 knitr_1.20
## [3] markdown_0.8 bindrcpp_0.2.2
## [5] lihc450k_0.99.3 BiocFileCache_1.3.40
## [7] dbplyr_1.2.1 SummarizedExperiment_1.9.16
## [9] DelayedArray_0.5.23 BiocParallel_1.13.3
## [11] matrixStats_0.53.1 Biobase_2.39.2
## [13] GenomicRanges_1.31.23 GenomeInfoDb_1.15.5
## [15] IRanges_2.13.28 S4Vectors_0.17.38
## [17] BiocGenerics_0.25.3 TCGAutils_0.99.55
## [19] curatedTCGAData_1.1.12 MultiAssayExperiment_1.5.98
## [21] nvimcom_0.9-41
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bigrquery_0.4.1
## [3] bit64_0.9-7 httr_1.3.1
## [5] rprojroot_1.3-2 GenomicDataCommons_1.3.4
## [7] backports_1.1.2 tools_3.5.0
## [9] R6_2.2.2 HDF5Array_1.7.9
## [11] DBI_0.8 lazyeval_0.2.1
## [13] bit_1.1-12 curl_3.2
## [15] compiler_3.5.0 rvest_0.3.2
## [17] xml2_1.2.0 bookdown_0.7
## [19] restfulSE_1.1.8 readr_1.1.1
## [21] rappdirs_0.3.1 RCircos_1.2.0
## [23] stringr_1.3.0 digest_0.6.15
## [25] rmarkdown_1.9 XVector_0.19.9
## [27] pkgconfig_2.0.1 htmltools_0.3.6
## [29] limma_3.35.14 rlang_0.2.0
## [31] RSQLite_2.0 BiocInstaller_1.29.4
## [33] shiny_1.0.5 bindr_0.1.1
## [35] jsonlite_1.5 dplyr_0.7.4
## [37] RCurl_1.95-4.10 magrittr_1.5
## [39] GO.db_3.5.0 GenomeInfoDbData_1.1.0
## [41] Matrix_1.2-12 Rcpp_0.12.16
## [43] Rhdf5lib_1.1.5 stringi_1.1.7
## [45] yaml_2.1.18 RaggedExperiment_1.3.12
## [47] RJSONIO_1.3-0 zlibbioc_1.25.0
## [49] rhdf5_2.23.5 plyr_1.8.4
## [51] AnnotationHub_2.11.2 grid_3.5.0
## [53] blob_1.1.1 ExperimentHub_1.5.2
## [55] lattice_0.20-35 splines_3.5.0
## [57] rhdf5client_1.1.10 hms_0.4.2
## [59] pillar_1.2.1 rjson_0.2.15
## [61] codetools_0.2-15 reshape2_1.4.3
## [63] XML_3.98-1.10 glue_1.2.0
## [65] evaluate_0.10.1 data.table_1.10.4-3
## [67] httpuv_1.3.6.2 purrr_0.2.4
## [69] tidyr_0.8.0 assertthat_0.2.0
## [71] xfun_0.1 mime_0.5
## [73] xtable_1.8-2 survival_2.41-3
## [75] tibble_1.4.2 RTCGAToolbox_2.9.38
## [77] AnnotationDbi_1.41.4 memoise_1.1.0
## [79] interactiveDisplayBase_1.17.0