2 GISTIC copy number

2.1 Called loss/gain/amplification as SummarizedExperiment

Read data:

cn <- read.table(system.file("extdata/tcga_ov/raw", "all_lesions.conf_99.txt", package="biocMultiAssay"), 
                 row.names=1, header=TRUE, sep="\t", as.is=TRUE)

Extract copy number ranges, using region limits. First the chromosomes:

chr <- sub(":.+", "", cn$Region.Limits)
head(chr)
## [1] "chr1" "chr1" "chr1" "chr2" "chr2" "chr3"

Then the ranges:

head(cn$Region.Limits, 1)
## [1] "chr1:34959389-47792362(probes 16723:23408)          "
ranges <- sub(".+?:", "", cn$Region.Limits)
head(ranges, 1)
## [1] "34959389-47792362(probes 16723:23408)          "
ranges <- sub("\\(.+", "", ranges)
head(ranges, 1)
## [1] "34959389-47792362"
ranges <- data.frame(t(do.call(cbind, strsplit(ranges, "-"))), stringsAsFactors=FALSE)
ranges[, 1] <- as.integer(ranges[, 1])
ranges[, 2] <- as.integer(ranges[, 2])
colnames(ranges) <- c("start", "end")
head(ranges, 1)
##      start      end
## 1 34959389 47792362

Construct a GRanges object:

library(GenomicRanges)
gr <- GRanges(seqnames=Rle(chr),
        ranges=IRanges(start=ranges$start, end=ranges$end),
        q.values=cn$q.values)
genome(gr) <- "hg19"
gr
## GRanges object with 136 ranges and 1 metadata column:
##         seqnames                 ranges strand   |   q.values
##            <Rle>              <IRanges>  <Rle>   |  <numeric>
##     [1]     chr1 [ 34959389,  47792362]      *   | 6.4195e-27
##     [2]     chr1 [120527362, 187199283]      *   |  3.679e-26
##     [3]     chr1 [222335196, 249250621]      *   | 1.2932e-13
##     [4]     chr2 [ 28645949,  29243661]      *   |   0.011986
##     [5]     chr2 [170891504, 193661214]      *   | 2.5197e-07
##     ...      ...                    ...    ... ...        ...
##   [132]    chr19   [40358754, 59128983]      *   | 6.0419e-17
##   [133]    chr19   [40358754, 59128983]      *   | 4.2607e-17
##   [134]    chr21   [44539727, 48129895]      *   |   0.018397
##   [135]    chr22   [40785987, 51304566]      *   | 8.8866e-64
##   [136]     chrX   [32077135, 32079659]      *   | 1.0638e-13
##   -------
##   seqinfo: 23 sequences from hg19 genome; no seqlengths

Create a matrix of peak calls, with the patient component of the barcode as column names:

cna.dat <- cn[, -1:-8]
colnames(cna.dat) <- sub("\\.[0-9]*[ABCD].+", "", colnames(cna.dat))
cna.dat$X <- NULL
table(sapply(cna.dat, class))
## 
## numeric 
##     569
cna.dat <- as.matrix(cna.dat)
class(cna.dat)
## [1] "matrix"
dim(cna.dat)
## [1] 136 569
cna.dat[1:3, 1:3]
##                       TCGA.04.1331 TCGA.04.1332 TCGA.04.1335
## Amplification Peak  1            0            1            0
## Amplification Peak  2            1            1            0
## Amplification Peak  3            2            0            0

And finally create a SummarizedExperiment with the copy number data:

cna01.se <- SummarizedExperiment(assays=SimpleList(cna.dat), 
                                  rowData=gr)
colnames(cna01.se) <- colnames(cna.dat)
save(cna01.se, file="cna01.se.rda")
cna01.se
## class: SummarizedExperiment 
## dim: 136 569 
## exptData(0):
## assays(1): ''
## rownames: NULL
## rowData metadata column names(1): q.values
## colnames(569): TCGA.04.1331 TCGA.04.1332 ... TCGA.72.4240
##   TCGA.72.4241
## colData names(0):

2.2 Raw copy number as GRangesList

This section creates a GRangesList object containing raw copy number numeric (non-integer) values. Levi: I am unsure what class to use when there is a different set of GRanges for each patient, with copy number data associated with each of those ranges. In this section I create a GRangesList, even though I don’t imagine that is the correct solution.

cn <- read.table(system.file("extdata/tcga_ov/raw", "focal_input.seg.txt", 
                             package="biocMultiAssay"),
                 header=TRUE, sep="\t", as.is=TRUE)
cn$id <- make.names(sub("\\-[0-9]*[ABCD].+", "", cn[, 1]))

Make a GRangesList, one element per patient.

grl <- lapply(unique(cn$id), function(x){
  cn1 <- cn[cn$id==x, ]
  gr <- GRanges(seqnames=Rle(paste0("chr", cn1$Chromosome)),
          ranges=IRanges(start=cn1$Start.bp, end=cn1$End.bp),
          Num.Markers=cn1$Num.Markers,
          Seg.CN=cn1$Seg.CN)
  genome(gr) <- "hg19"
  gr
})
names(grl) <- unique(cn$id)
cna.grl <- GRangesList(grl)
save(cna.grl, file="cna.grl.rda")

2.3 Raw copy number as SummarizedExperiment

For methods to obtain the file all_data_by_genes.txt, see the inst/extdata/raw directory of biocMultiAssay.

download.file("https://dl.dropboxusercontent.com/u/15152544/TCGA/all_data_by_genes.txt.gz",
              destfile="all_data_by_genes.txt.gz", method="wget")
cn <- read.table(gzfile("all_data_by_genes.txt.gz"), as.is=TRUE, row.names=1, sep="\t", header=TRUE)
colnames(cn) <- sub("\\.[0-9]*[ABCD].+", "", colnames(cn))
cn <- cn[, -1:-2]

Get ranges for each gene:

rowd <- key2GRanges(rownames(cn))
hasrowd = match(names(rowd), rownames(cn), nomatch=0)
ex = as.matrix(cn[hasrowd,])
stopifnot(identical(names(rowd), rownames(ex)))
cna.se <- SummarizedExperiment(assays=SimpleList(ex), 
                                  rowData=rowd)
colnames(cna.se) <- colnames(ex)
save(cna.se, file="cna.se.rda")
cna.se
## class: SummarizedExperiment 
## dim: 21483 569 
## exptData(0):
## assays(1): ''
## rownames(21483): ACAP3 ACTRT2 ... SPRY3 VAMP7
## rowData metadata column names(0):
## colnames(569): TCGA.04.1331 TCGA.04.1332 ... TCGA.72.4240
##   TCGA.72.4241
## colData names(0):

3 Gene Expression

3.1 As ExpressionSet

library(affy)
load(system.file("extdata/tcga_ov","TCGA_eset.rda",package="biocMultiAssay"))
expr.eset <- TCGA_eset
pdata <- pData(TCGA_eset)
save(expr.eset, file="expr.eset.rda")
save(pdata, file="pdata.rda")

3.2 As SummarizedExperiment

featureNames(expr.eset) <- as.character(featureData(expr.eset)$probeset)
expr.eset@annotation <- "hgu133a"
expr.se <- exs2se(expr.eset)
save(expr.se, file="expr.se.rda")