Contents

1 GitHub Repository

https://github.com/LiNk-NY/raggedchk cloned from https://github.com/vjcitn/raggedchk

2 Installation

# Install BiocManager only when it is missing. requireNamespace() tests for the
# package without attaching it, which is all that is needed here because the
# install call below is namespace-qualified.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
# Install RaggedExperiment through BiocManager.
BiocManager::install("RaggedExperiment")
# Attach the packages used to obtain and manipulate the data.
library(RaggedExperiment)
library(curatedTCGAData)
# Request the BRCA assays matching "CN*" and "Mutation" from data version
# 2.0.1. dry.run = FALSE presumably retrieves the data rather than only
# listing what is available -- confirm against the curatedTCGAData docs.
BRCA <- curatedTCGAData(
    "BRCA", version = "2.0.1", assays = c("CN*", "Mutation"), dry.run = FALSE
)
# Print a summary of the returned object.
BRCA
## A MultiAssayExperiment object of 4 listed
##  experiments with user-defined names and respective classes.
##  Containing an ExperimentList class object of length 4:
##  [1] BRCA_CNASeq-20160128: RaggedExperiment with 5298 rows and 38 columns
##  [2] BRCA_CNASNP-20160128: RaggedExperiment with 1132786 rows and 2209 columns
##  [3] BRCA_CNVSNP-20160128: RaggedExperiment with 284458 rows and 2199 columns
##  [4] BRCA_Mutation-20160128: RaggedExperiment with 90490 rows and 993 columns
## Functionality:
##  experiments() - obtain the ExperimentList instance
##  colData() - the primary/phenotype DataFrame
##  sampleMap() - the sample coordination DataFrame
##  `$`, `[`, `[[` - extract colData columns, subset, or experiment
##  *Format() - convert into a long or wide DataFrame
##  assays() - convert ExperimentList to a SimpleList of matrices
##  exportClass() - save data to flat files

2.1 Measuring size

Functions used to measure object size

#' Report the in-memory size of an object as a formatted string.
#'
#' @param x Any R object.
#' @return The result of `as.object_size()` applied to `object.size(x)`,
#'   using that helper's default unit.
object_size <- function(x) {
    bytes <- object.size(x)
    as.object_size(bytes)
}

#' Format a size value as a string in the requested unit (SI standard).
#'
#' @param num Size value to format; the caller in this file passes the result
#'   of `object.size()`.
#' @param unit Unit forwarded to `format()` as `units`. Defaults to "MB".
#' @return A character string produced by `format()` for an "object_size"
#'   object, e.g. "1.5 MB".
as.object_size <- function(num, unit = "MB") {
    # Tag the value so that format() dispatches on the "object_size" class.
    tagged <- `class<-`(num, "object_size")
    format(tagged, units = unit, standard = "SI")
}

2.1.0.1 dims

BRCA_CNASeq-20160128

# Number of rows and columns of the CNASeq experiment.
dim(BRCA[["BRCA_CNASeq-20160128"]])
## [1] 5298   38

BRCA_Mutation-20160128

# Number of rows and columns of the Mutation experiment.
dim(BRCA[["BRCA_Mutation-20160128"]])
## [1] 90490   993

2.1.1 CNAseq

# Size of the CNASeq RaggedExperiment itself, followed by three assay
# representations derived from it. The order appears to match the columns of
# the results table at the end of this document.
object_size(BRCA[["BRCA_CNASeq-20160128"]])
# sparseAssay() with sparse = TRUE (the "sparse Matrix" column, presumably).
object_size(sparseAssay(BRCA[["BRCA_CNASeq-20160128"]], sparse = TRUE))
# compactAssay() (the "matrix (reduced rows)" column, presumably).
object_size(compactAssay(BRCA[["BRCA_CNASeq-20160128"]]))
# sparseAssay() with default arguments (the "matrix (sparse)" column, presumably).
object_size(sparseAssay(BRCA[["BRCA_CNASeq-20160128"]]))

2.1.2 Mutation

# Size of the Mutation RaggedExperiment itself, followed by the assay
# representations that can be computed for it.
object_size(BRCA[["BRCA_Mutation-20160128"]])
# Left disabled: presumably because the Mutation assay holds character data
# (the equivalent RTCGAToolbox call later in this document is commented out
# with the note "typeof character").
# object_size(sparseAssay(BRCA[["BRCA_Mutation-20160128"]], sparse = TRUE))
# compactAssay() representation.
object_size(compactAssay(BRCA[["BRCA_Mutation-20160128"]]))
# sparseAssay() with default arguments.
object_size(sparseAssay(BRCA[["BRCA_Mutation-20160128"]]))

3 Object Sizes from curatedTCGAData

library(TxDb.Hsapiens.UCSC.hg19.knownGene)

3.1 Extract all gene regions from TxDb

# Use the hg19 knownGene annotation as the source of gene coordinates.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
# NOTE(review): the variable `genes` shadows the genes() function name. The
# call still resolves to the function, but a distinct name would read better.
# single.strand.genes.only = FALSE presumably yields a list-like result with
# one element per gene (it is unlist()-ed below) -- confirm against the
# GenomicFeatures docs.
genes <- genes(txdb, single.strand.genes.only = FALSE)
# Keep only the standard chromosomes.
genes <- keepStandardChromosomes(genes, pruning.mode = "coarse")
# Flatten into a single set of ranges; used by the subsetByOverlaps() calls.
ugenes <- unlist(genes)

3.2 Standardize seqlevelsStyle to UCSC

# Reference report for the BRCA low-pass copy-number data:
# https://gdac.broadinstitute.org/runs/analyses__2016_01_28/reports/cancer/BRCA-TP/CopyNumberLowPass_Gistic2/nozzle.html
# Work on a copy of the CNASeq experiment.
re <- BRCA[["BRCA_CNASeq-20160128"]]
## possible bug -- seqlevelsStyle must go first, then genome
# Switch the sequence names to UCSC style ("chr1", ...) before setting the
# genome; the next section shows the result differs when the order is swapped.
seqlevelsStyle(re) <- "UCSC"
# Label the row ranges with the hg19 genome build.
genome(rowRanges(re)) <- "hg19"
# Print the ranges to check seqnames and seqinfo.
rowRanges(re)
## GRanges object with 5298 ranges and 0 metadata columns:
##          seqnames              ranges strand
##             <Rle>           <IRanges>  <Rle>
##      [1]     chr1       10209-2583075      *
##      [2]     chr1   2583076-249240606      *
##      [3]     chr2     10002-243189359      *
##      [4]     chr3     60175-162511435      *
##      [5]     chr3 162511436-162626067      *
##      ...      ...                 ...    ...
##   [5294]    chr20      60001-62965506      *
##   [5295]    chr21    9422166-48119869      *
##   [5296]    chr22   16051206-51244552      *
##   [5297]    chr23   2699503-116067549      *
##   [5298]    chr24    2649450-28784074      *
##   -------
##   seqinfo: 24 sequences from hg19 genome; no seqlengths

3.3 It looks like the order matters, otherwise you get NCBI seqlevels…

# Repeat the standardization on a fresh copy with the two steps swapped
# (genome first, then seqlevelsStyle) to show that the order matters.
re2 <- BRCA[["BRCA_CNASeq-20160128"]]
genome(rowRanges(re2)) <- "hg19"
seqlevelsStyle(re2) <- "UCSC"
# Print the ranges to inspect the resulting seqlevels.
rowRanges(re2)
# Compare with the object built in the other order; the recorded result
# below is FALSE.
identical(re, re2)
#' [1] FALSE

3.4 CNAseq in genes

# Keep only the CNASeq ranges that overlap a gene range in `ugenes`.
ingenes <- subsetByOverlaps(re, ugenes)
# Sizes of the compactAssay() and default sparseAssay() representations of
# the gene-restricted data.
object_size(compactAssay(ingenes))
object_size(sparseAssay(ingenes))

3.5 Mutations in genes

# NOTE(review): this URL is the CopyNumberLowPass_Gistic2 report, identical to
# the one cited for CNASeq -- confirm it is the intended reference for the
# mutation data.
# https://gdac.broadinstitute.org/runs/analyses__2016_01_28/reports/cancer/BRCA-TP/CopyNumberLowPass_Gistic2/nozzle.html
# Work on a copy of the Mutation experiment.
mre <- BRCA[["BRCA_Mutation-20160128"]]
## possible bug -- seqlevelsStyle must go first, then genome
# Switch sequence names to UCSC style first. The recorded warning below
# reports that some seqlevels could not be switched.
seqlevelsStyle(mre) <- "UCSC"
## Warning in (function (seqlevels, genome, new_style) : cannot switch some of
## GRCh37's seqlevels from NCBI to UCSC style
# Label the row ranges with the hg19 genome build.
genome(rowRanges(mre)) <- "hg19"
# Print the ranges to check seqnames and seqinfo.
rowRanges(mre)
## GRanges object with 90490 ranges and 0 metadata columns:
##           seqnames    ranges strand
##              <Rle> <IRanges>  <Rle>
##       [1]    chr10 116247760      +
##       [2]    chr12  43944926      +
##       [3]     chr3  85932472      +
##       [4]     chr2  25678299      +
##       [5]    chr17  40272381      +
##       ...      ...       ...    ...
##   [90486]     chr3  48299430      +
##   [90487]    chr19  52394623      +
##   [90488]    chr16  30537313      +
##   [90489]    chr19  35449130      +
##   [90490]    chr19  53994951      +
##   -------
##   seqinfo: 26 sequences from hg19 genome; no seqlengths
# Keep only the mutation ranges that overlap a gene range in `ugenes`.
mingenes <- subsetByOverlaps(mre, ugenes)
# Sizes of the compactAssay() and default sparseAssay() representations of
# the gene-restricted data.
object_size(compactAssay(mingenes))
object_size(sparseAssay(mingenes))

3.6 Obtaining data directly from RTCGAToolbox

library(RTCGAToolbox)
# Show the Firehose archive that provides the BRCA CNASeq data.
getLinks("BRCA", CNASeq = TRUE)
##                                                                                                                                       href 
## "gdac.broadinstitute.org_BRCA.Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg.Level_3.2016012800.0.0.tar.gz"
# Retrieve the BRCA data with the CNASeq component included.
BRCAseq <- getFirehoseData("BRCA", CNASeq = TRUE)
## gdac.broadinstitute.org_BRCA.Clinical_Pick_Tier1.Level_4.2016012800.0.0
## gdac.broadinstitute.org_BRCA.Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg.Level_3.2016012800.0.0
# Convert the CNASeq component into a Bioconductor object; rowRanges() is
# available on the result.
cnatoolbox <- biocExtract(BRCAseq, "CNASeq")
## working on: CNASeq
# Same order as for the curatedTCGAData objects: style first, then genome.
seqlevelsStyle(cnatoolbox) <- "UCSC"
# NOTE(review): genome() is set on the object directly here, whereas the
# curatedTCGAData sections set it on rowRanges(); confirm both are equivalent.
genome(cnatoolbox) <- "hg19"
# Print the ranges to check seqnames and seqinfo.
rowRanges(cnatoolbox)
## GRanges object with 5298 ranges and 0 metadata columns:
##          seqnames              ranges strand
##             <Rle>           <IRanges>  <Rle>
##      [1]     chr1       10209-2583075      *
##      [2]     chr1   2583076-249240606      *
##      [3]     chr2     10002-243189359      *
##      [4]     chr3     60175-162511435      *
##      [5]     chr3 162511436-162626067      *
##      ...      ...                 ...    ...
##   [5294]    chr20      60001-62965506      *
##   [5295]    chr21    9422166-48119869      *
##   [5296]    chr22   16051206-51244552      *
##   [5297]    chr23   2699503-116067549      *
##   [5298]    chr24    2649450-28784074      *
##   -------
##   seqinfo: 24 sequences from hg19 genome; no seqlengths
# Show the Firehose archive that provides the BRCA mutation calls.
getLinks("BRCA", Mutation = TRUE)
##                                                                                 href 
## "gdac.broadinstitute.org_BRCA.Mutation_Packager_Calls.Level_3.2016012800.0.0.tar.gz"
# Retrieve the BRCA data with the Mutation component included.
BRCAmut <- getFirehoseData("BRCA", Mutation = TRUE)
## Using locally cached version of /tmp/RtmpndYeq8/20160128-BRCA-Clinical.txt
# Convert the Mutation component into a Bioconductor object; rowRanges() is
# available on the result.
muttoolbox <- biocExtract(BRCAmut, "Mutation")
## working on: Mutation
# Style first, then genome. The recorded warning below reports that some
# seqlevels could not be switched to UCSC style.
seqlevelsStyle(muttoolbox) <- "UCSC"
## Warning in (function (seqlevels, genome, new_style) : cannot switch some of
## GRCh37's seqlevels from NCBI to UCSC style
genome(muttoolbox) <- "hg19"
# Print the ranges to check seqnames and seqinfo.
rowRanges(muttoolbox)
## GRanges object with 90490 ranges and 0 metadata columns:
##           seqnames    ranges strand
##              <Rle> <IRanges>  <Rle>
##       [1]    chr10 116247760      +
##       [2]    chr12  43944926      +
##       [3]     chr3  85932472      +
##       [4]     chr2  25678299      +
##       [5]    chr17  40272381      +
##       ...      ...       ...    ...
##   [90486]     chr3  48299430      +
##   [90487]    chr19  52394623      +
##   [90488]    chr16  30537313      +
##   [90489]    chr19  35449130      +
##   [90490]    chr19  53994951      +
##   -------
##   seqinfo: 26 sequences from hg19 genome; no seqlengths

3.7 Sizes for CNASeq and Mutation from RTCGAToolbox

# CNASeq: size of the extracted object, of the CNASeq slot of the Firehose
# download it was extracted from, and of the three assay representations.
object_size(cnatoolbox)
object_size(BRCAseq@CNASeq)
object_size(sparseAssay(cnatoolbox, sparse = TRUE))
object_size(compactAssay(cnatoolbox))
object_size(sparseAssay(cnatoolbox))
# Mutation: the same measurements, except that the sparse = TRUE variant is
# left disabled (see the author's note on that line).
object_size(muttoolbox)
object_size(BRCAmut@Mutation)
# object_size(sparseAssay(muttoolbox, sparse = TRUE)) # typeof character
object_size(compactAssay(muttoolbox))
object_size(sparseAssay(muttoolbox))

3.8 Restrict to genic regions

# Restrict both RTCGAToolbox-derived objects to ranges overlapping `ugenes`.
incnabox <- subsetByOverlaps(cnatoolbox, ugenes)
inmutbox <- subsetByOverlaps(muttoolbox, ugenes)
# CNASeq (in genes): object size, then the three assay representations.
object_size(incnabox) 
# NOTE(review): the meaning of the "NA" marker below is not evident from this
# file -- confirm whether it records a result or means "not applicable".
# NA
object_size(sparseAssay(incnabox, sparse = TRUE))
object_size(compactAssay(incnabox))
object_size(sparseAssay(incnabox))
# Mutation (in genes): same measurements; the sparse = TRUE variant is left
# disabled, as in the other Mutation chunks of this document.
object_size(inmutbox) 
# NA
# object_size(sparseAssay(inmutbox, sparse = TRUE))
object_size(compactAssay(inmutbox))
object_size(sparseAssay(inmutbox))

4 Table of object sizes by data type and source

| Assay | Data Type | RaggedExperiment | sparse Matrix | matrix (reduced rows) | matrix (sparse) |
|---|---|---|---|---|---|
| CNASeq | numeric | 0.2 MB | 0.3 MB | 1 MB | 1.9 MB |
| CNASeq (in genes) | numeric | 0.2 MB | 0.3 MB | 0.9 MB | 1.7 MB |
| Mutation | character | 70.6 MB | NA | 680.3 MB | 726.2 MB |
| Mutation (in genes) | character | 37.6 MB | NA | 351.3 MB | 375.5 MB |