Read data:
cn <- read.table(system.file("extdata/tcga_ov/raw", "all_lesions.conf_99.txt", package="biocMultiAssay"),
row.names=1, header=TRUE, sep="\t", as.is=TRUE)
Extract copy number ranges, using region limits. First the chromosomes:
chr <- sub(":.+", "", cn$Region.Limits)
head(chr)
## [1] "chr1" "chr1" "chr1" "chr2" "chr2" "chr3"
Then the ranges:
head(cn$Region.Limits, 1)
## [1] "chr1:34959389-47792362(probes 16723:23408) "
ranges <- sub(".+?:", "", cn$Region.Limits)
head(ranges, 1)
## [1] "34959389-47792362(probes 16723:23408) "
ranges <- sub("\\(.+", "", ranges)
head(ranges, 1)
## [1] "34959389-47792362"
ranges <- data.frame(t(do.call(cbind, strsplit(ranges, "-"))), stringsAsFactors=FALSE)
ranges[, 1] <- as.integer(ranges[, 1])
ranges[, 2] <- as.integer(ranges[, 2])
colnames(ranges) <- c("start", "end")
head(ranges, 1)
## start end
## 1 34959389 47792362
Construct a GRanges object:
library(GenomicRanges)
gr <- GRanges(seqnames=Rle(chr),
ranges=IRanges(start=ranges$start, end=ranges$end),
q.values=cn$q.values)
genome(gr) <- "hg19"
gr
## GRanges object with 136 ranges and 1 metadata column:
## seqnames ranges strand | q.values
## <Rle> <IRanges> <Rle> | <numeric>
## [1] chr1 [ 34959389, 47792362] * | 6.4195e-27
## [2] chr1 [120527362, 187199283] * | 3.679e-26
## [3] chr1 [222335196, 249250621] * | 1.2932e-13
## [4] chr2 [ 28645949, 29243661] * | 0.011986
## [5] chr2 [170891504, 193661214] * | 2.5197e-07
## ... ... ... ... ... ...
## [132] chr19 [40358754, 59128983] * | 6.0419e-17
## [133] chr19 [40358754, 59128983] * | 4.2607e-17
## [134] chr21 [44539727, 48129895] * | 0.018397
## [135] chr22 [40785987, 51304566] * | 8.8866e-64
## [136] chrX [32077135, 32079659] * | 1.0638e-13
## -------
## seqinfo: 23 sequences from hg19 genome; no seqlengths
Create a matrix of peak calls, with the patient component of the barcode as column names:
cna.dat <- cn[, -1:-8]
colnames(cna.dat) <- sub("\\.[0-9]*[ABCD].+", "", colnames(cna.dat))
cna.dat$X <- NULL
table(sapply(cna.dat, class))
##
## numeric
## 569
cna.dat <- as.matrix(cna.dat)
class(cna.dat)
## [1] "matrix"
dim(cna.dat)
## [1] 136 569
cna.dat[1:3, 1:3]
## TCGA.04.1331 TCGA.04.1332 TCGA.04.1335
## Amplification Peak 1 0 1 0
## Amplification Peak 2 1 1 0
## Amplification Peak 3 2 0 0
And finally create a SummarizedExperiment with the copy number data:
cna01.se <- SummarizedExperiment(assays=SimpleList(cna.dat),
rowData=gr)
colnames(cna01.se) <- colnames(cna.dat)
save(cna01.se, file="cna01.se.rda")
cna01.se
## class: SummarizedExperiment
## dim: 136 569
## exptData(0):
## assays(1): ''
## rownames: NULL
## rowData metadata column names(1): q.values
## colnames(569): TCGA.04.1331 TCGA.04.1332 ... TCGA.72.4240
## TCGA.72.4241
## colData names(0):
This section creates a GRangesList object containing raw copy number numeric (non-integer) values. Levi: I am unsure what class to use when there is a different set of GRanges for each patient, with copy number data associated with each of those ranges. In this section I create a GRangesList, even though I don’t imagine that is the correct solution.
cn <- read.table(system.file("extdata/tcga_ov/raw", "focal_input.seg.txt",
package="biocMultiAssay"),
header=TRUE, sep="\t", as.is=TRUE)
cn$id <- make.names(sub("\\-[0-9]*[ABCD].+", "", cn[, 1]))
Make a GRangesList, one element per patient.
grl <- lapply(unique(cn$id), function(x){
cn1 <- cn[cn$id==x, ]
gr <- GRanges(seqnames=Rle(paste0("chr", cn1$Chromosome)),
ranges=IRanges(start=cn1$Start.bp, end=cn1$End.bp),
Num.Markers=cn1$Num.Markers,
Seg.CN=cn1$Seg.CN)
genome(gr) <- "hg19"
gr
})
names(grl) <- unique(cn$id)
cna.grl <- GRangesList(grl)
save(cna.grl, file="cna.grl.rda")
For methods to obtain the file all_data_by_genes.txt
, see the inst/extdata/raw directory of biocMultiAssay.
download.file("https://dl.dropboxusercontent.com/u/15152544/TCGA/all_data_by_genes.txt.gz",
destfile="all_data_by_genes.txt.gz", method="wget")
cn <- read.table(gzfile("all_data_by_genes.txt.gz"), as.is=TRUE, row.names=1, sep="\t", header=TRUE)
colnames(cn) <- sub("\\.[0-9]*[ABCD].+", "", colnames(cn))
cn <- cn[, -1:-2]
Get ranges for each gene:
rowd <- key2GRanges(rownames(cn))
hasrowd = match(names(rowd), rownames(cn), nomatch=0)
ex = as.matrix(cn[hasrowd,])
stopifnot(identical(names(rowd), rownames(ex)))
cna.se <- SummarizedExperiment(assays=SimpleList(ex),
rowData=rowd)
colnames(cna.se) <- colnames(ex)
save(cna.se, file="cna.se.rda")
cna.se
## class: SummarizedExperiment
## dim: 21483 569
## exptData(0):
## assays(1): ''
## rownames(21483): ACAP3 ACTRT2 ... SPRY3 VAMP7
## rowData metadata column names(0):
## colnames(569): TCGA.04.1331 TCGA.04.1332 ... TCGA.72.4240
## TCGA.72.4241
## colData names(0):
library(affy)
load(system.file("extdata/tcga_ov","TCGA_eset.rda",package="biocMultiAssay"))
expr.eset <- TCGA_eset
pdata <- pData(TCGA_eset)
save(expr.eset, file="expr.eset.rda")
save(pdata, file="pdata.rda")
featureNames(expr.eset) <- as.character(featureData(expr.eset)$probeset)
expr.eset@annotation <- "hgu133a"
expr.se <- exs2se(expr.eset)
save(expr.se, file="expr.se.rda")