11.1 Primary Tumors with matched RNA-seq
# Number of IDs (sample names truncated to their first 15 characters) that
# have both RNA-seq data and a primary tumor ("TP") DNA methylation sample.
# The sample column is indexed directly instead of subsetting the whole
# object; which() drops NA sample types. intersect() already returns unique
# values, so no extra unique() is needed.
length(
  intersect(
    substr(rsem.values$sample, 1, 15),
    substr(betas$sample[which(betas$shortLetterCode == "TP")], 1, 15)
  )
)
## [1] 454
This document shows the steps used to process the LUAD DNA methylation data from GDC.
GDCprepare
uses openSesame from the sesame package with default arguments.
## function (files, barcode, summarizedExperiment, platform, legacy)
## {
## if (!requireNamespace("sesame", quietly = TRUE)) {
## stop("sesame package is needed for this function to work. Please install it.",
## call. = FALSE)
## }
## moved.files <- sapply(files, USE.NAMES = FALSE, function(x) {
## if (grepl("Raw_intensities", dirname(dirname(x)))) {
## return(file.path(dirname(dirname(x)), basename(x)))
## }
## return(x)
## })
## plyr::a_ply(files, 1, function(x) {
## if (grepl("Raw_intensities", dirname(dirname(x)))) {
## tryCatch(move(x, file.path(dirname(dirname(x)), basename(x)),
## keep.copy = FALSE), error = function(e) {
## })
## }
## })
## samples <- unique(gsub("_Grn.idat|_Red.idat", "", moved.files))
## message("Processing IDATs with Sesame - http://bioconductor.org/packages/sesame/")
## message("Running opensesame - applying quality masking and nondetection masking (threshold P-value 0.05)")
## message("Please cite: doi: 10.1093/nar/gky691 and 10.1093/nar/gkt090")
## betas <- sesame::openSesame(samples)
## barcode <- unique(data.frame(file = gsub("_Grn.idat|_Red.idat",
## "", basename(moved.files)), barcode = barcode))
## colnames(betas) <- barcode$barcode[match(basename(samples),
## barcode$file)]
## if (summarizedExperiment) {
## met.platform <- "EPIC"
## if (grepl("450", platform))
## met.platform <- "450K"
## if (grepl("27", platform))
## met.platform <- "27K"
## betas <- makeSEFromDNAMethylationMatrix(betas, genome = ifelse(legacy,
## "hg19", "hg38"), met.platform = met.platform)
## colData(betas) <- DataFrame(colDataPrepare(colnames(betas)))
## }
## return(betas)
## }
## <bytecode: 0x55b2b31d1740>
## <environment: namespace:TCGAbiolinks>
## Raw data has 485577 probes
## Raw data has 507 samples
Removing FFPE samples: http://gdac.broadinstitute.org/runs/stddata__2016_01_28/samples_report/FFPE_Cases.html
##
## FALSE TRUE
## 496 11
Remove sample replicates. For example, if a patient has two samples (-01A and -01B), we keep -01A.
# IDs (first 15 characters of the sample name) that occur more than once,
# i.e. cases with replicated DNA methylation samples.
patients.with.duplicated.samples <- unique(
  substr(betas$sample, 1, 15)[duplicated(substr(betas$sample, 1, 15))]
)

# Show, for each duplicated ID, the sorted full barcodes that contain it.
# fixed = TRUE matches the ID literally rather than as a regular expression.
# sapply() is kept on purpose: this call is only for display, and its
# simplification gives the compact side-by-side matrix.
sapply(
  patients.with.duplicated.samples,
  function(x) {
    sort(grep(x, colnames(betas), value = TRUE, fixed = TRUE))
  }
)
## TCGA-44-6775-01 TCGA-44-5645-01
## [1,] "TCGA-44-6775-01A-11D-1856-05" "TCGA-44-5645-01A-01D-1626-05"
## [2,] "TCGA-44-6775-01A-11D-A276-05" "TCGA-44-5645-01A-01D-A276-05"
## TCGA-44-6146-01 TCGA-44-6147-01
## [1,] "TCGA-44-6146-01A-11D-1756-05" "TCGA-44-6147-01A-11D-1756-05"
## [2,] "TCGA-44-6146-01A-11D-A276-05" "TCGA-44-6147-01A-11D-A276-05"
# For each duplicated ID, keep the first barcode in sort order and collect the
# remaining ones for removal. lapply() always returns a list, so the result
# type does not depend on how many replicates each ID has.
samples.to.be.removed <- lapply(
  patients.with.duplicated.samples,
  function(x) {
    sort(grep(x, colnames(betas), value = TRUE, fixed = TRUE))[-1]
  }
) %>%
  unlist() %>%
  as.character()

# Remove duplicated samples (-01A, -01B)
betas <- betas[, !betas$barcode %in% samples.to.be.removed]

# Sanity check: no truncated sample ID should be duplicated any more
table(duplicated(substr(betas$sample, 1, 15)))
##
## FALSE
## 492
# Remove chrX and chrY probes
betas <- betas[!as.character(seqnames(rowRanges(betas))) %in% c("chrX", "chrY"), ]

# Keep only cg probes. The pattern is anchored so that only probe IDs that
# start with "cg" are kept, not IDs that merely contain "cg" somewhere.
betas <- betas[grepl("^cg", names(betas)), ]

# Masked probes are NA for all samples. This computes the index of probes with
# at least one non-NA value and reports how many are masked; the subsetting
# itself is not part of this snippet (presumably applied in a chunk not shown).
not.masked.probes <- which(rowSums(is.na(assay(betas))) < ncol(betas))
message("Masked probes: ", nrow(betas) - length(not.masked.probes), " out of ", nrow(betas))
## Masked probes: 61913 out of 470869
##
## Primary solid Tumor Recurrent Solid Tumor Solid Tissue Normal
## 458 2 32
## Raw data has 408956 probes
## Raw data has 492 samples
# Interactive table of the sample metadata. Columns whose names match the
# exclusion pattern (case-insensitive) are dropped: invert = TRUE makes grep()
# return the positions of the columns that do NOT match, and those positions
# are passed to dplyr::select().
colData(betas) %>%
  as.data.frame() %>%
  dplyr::select(
    grep(
      "paper|elea|_id|state|_uuid|bcr|tumor_grade|alcohol_history|progression_or_recurrence",
      colnames(colData(betas)),
      ignore.case = TRUE,
      invert = TRUE
    )
  ) %>%
  DT::datatable(
    filter = "top",
    options = list(
      scrollX = TRUE,
      keys = TRUE,
      pageLength = 10
    ),
    rownames = FALSE,
    caption = "Samples metadata"
  )
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
# Project whose RNA-seq data is retrieved
proj <- "TCGA-LUAD"
# Gene expression aligned against hg19. RSEM values
# The query is restricted to RNA-Seq "normalized_results" files from the
# Illumina HiSeq platform and uses legacy = TRUE.
query.exp.hg19 <- GDCquery(project = proj,
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "normalized_results",
experimental.strategy = "RNA-Seq",
legacy = TRUE)
# Download the files matched by the query through the API method,
# with files.per.chunk set to 50.
GDCdownload(query.exp.hg19, method = "api",files.per.chunk = 50)
# Prepare the downloaded files into a single object; save = TRUE with
# save.filename requests that the result also be written to
# "LUAD-RSEM_normalized.rda".
rsem.values <- GDCprepare(query = query.exp.hg19,
save = TRUE,
save.filename = "LUAD-RSEM_normalized.rda"
)
##
## FALSE
## 576
# Number of IDs (sample names truncated to their first 15 characters) that
# have both RNA-seq data and a primary tumor ("TP") DNA methylation sample.
# The sample column is indexed directly instead of subsetting the whole
# object; which() drops NA sample types. intersect() already returns unique
# values, so no extra unique() is needed.
length(
  intersect(
    substr(rsem.values$sample, 1, 15),
    substr(betas$sample[which(betas$shortLetterCode == "TP")], 1, 15)
  )
)
## [1] 454
# Number of IDs (sample names truncated to their first 15 characters) that
# have both RNA-seq data and a solid tissue normal ("NT") DNA methylation
# sample. The sample column is indexed directly instead of subsetting the
# whole object; which() drops NA sample types. intersect() already returns
# unique values, so no extra unique() is needed.
length(
  intersect(
    substr(rsem.values$sample, 1, 15),
    substr(betas$sample[which(betas$shortLetterCode == "NT")], 1, 15)
  )
)
## [1] 21
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.1 (2019-07-05)
## os Ubuntu 19.10
## system x86_64, linux-gnu
## ui X11
## language en_US
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2019-12-09
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## ! package * version date lib source
## annotate 1.64.0 2019-10-29 [1] Bioconductor
## AnnotationDbi 1.48.0 2019-10-29 [1] Bioconductor
## AnnotationHub * 2.18.0 2019-10-29 [1] Bioconductor
## aroma.light 3.16.0 2019-10-29 [1] Bioconductor
## askpass 1.1 2019-01-13 [2] CRAN (R 3.6.1)
## assertthat 0.2.1 2019-03-21 [2] CRAN (R 3.6.1)
## backports 1.1.5 2019-10-02 [2] CRAN (R 3.6.1)
## Biobase * 2.46.0 2019-10-29 [1] Bioconductor
## BiocFileCache * 1.10.0 2019-10-29 [1] Bioconductor
## BiocGenerics * 0.32.0 2019-10-29 [1] Bioconductor
## BiocManager 1.30.10 2019-11-16 [2] CRAN (R 3.6.1)
## BiocParallel * 1.20.0 2019-10-30 [1] Bioconductor
## BiocVersion 3.10.1 2019-06-06 [1] Bioconductor
## biomaRt 2.42.0 2019-10-29 [1] Bioconductor
## Biostrings 2.54.0 2019-10-29 [1] Bioconductor
## bit 1.1-14 2018-05-29 [2] CRAN (R 3.6.1)
## bit64 0.9-7 2017-05-08 [2] CRAN (R 3.6.1)
## bitops 1.0-6 2013-08-17 [2] CRAN (R 3.6.1)
## blob 1.2.0 2019-07-09 [2] CRAN (R 3.6.1)
## broom 0.5.2 2019-04-07 [2] CRAN (R 3.6.1)
## callr 3.4.0 2019-12-09 [2] CRAN (R 3.6.1)
## cli 2.0.0 2019-12-09 [2] CRAN (R 3.6.1)
## codetools 0.2-16 2018-12-24 [4] CRAN (R 3.6.0)
## colorspace 1.4-1 2019-03-18 [2] CRAN (R 3.6.1)
## crayon 1.3.4 2017-09-16 [2] CRAN (R 3.6.1)
## crosstalk 1.0.0 2016-12-21 [2] CRAN (R 3.6.1)
## curl 4.3 2019-12-02 [2] CRAN (R 3.6.1)
## data.table 1.12.6 2019-10-18 [1] CRAN (R 3.6.1)
## DBI 1.0.0 2018-05-02 [2] CRAN (R 3.6.1)
## dbplyr * 1.4.2 2019-06-17 [2] CRAN (R 3.6.1)
## DelayedArray * 0.12.0 2019-10-29 [1] Bioconductor
## desc 1.2.0 2018-05-01 [2] CRAN (R 3.6.1)
## DESeq 1.38.0 2019-10-29 [1] Bioconductor
## devtools 2.2.1 2019-09-24 [2] CRAN (R 3.6.1)
## digest 0.6.22 2019-10-21 [1] CRAN (R 3.6.1)
## DNAcopy 1.60.0 2019-10-29 [1] Bioconductor
## doParallel 1.0.15 2019-08-02 [2] CRAN (R 3.6.1)
## downloader 0.4 2015-07-09 [2] CRAN (R 3.6.1)
## dplyr * 0.8.3 2019-07-04 [2] CRAN (R 3.6.1)
## DT 0.10 2019-11-12 [2] CRAN (R 3.6.1)
## EDASeq 2.20.0 2019-10-29 [1] Bioconductor
## edgeR 3.28.0 2019-10-29 [1] Bioconductor
## ellipsis 0.3.0 2019-09-20 [2] CRAN (R 3.6.1)
## evaluate 0.14 2019-05-28 [2] CRAN (R 3.6.1)
## ExperimentHub * 1.12.0 2019-10-29 [1] Bioconductor
## fansi 0.4.0 2018-10-05 [2] CRAN (R 3.6.1)
## fastmap 1.0.1 2019-10-08 [2] CRAN (R 3.6.1)
## foreach 1.4.7 2019-07-27 [2] CRAN (R 3.6.1)
## fs 1.3.1 2019-05-06 [2] CRAN (R 3.6.1)
## genefilter 1.68.0 2019-10-29 [1] Bioconductor
## geneplotter 1.64.0 2019-10-29 [1] Bioconductor
## generics 0.0.2 2018-11-29 [2] CRAN (R 3.6.1)
## GenomeInfoDb * 1.22.0 2019-10-29 [1] Bioconductor
## GenomeInfoDbData 1.2.2 2019-11-06 [1] Bioconductor
## GenomicAlignments 1.22.0 2019-10-29 [1] Bioconductor
## GenomicFeatures 1.38.0 2019-10-29 [1] Bioconductor
## GenomicRanges * 1.38.0 2019-10-29 [1] Bioconductor
## ggplot2 3.2.1 2019-08-10 [2] CRAN (R 3.6.1)
## ggpubr 0.2.4 2019-11-14 [2] CRAN (R 3.6.1)
## ggrepel 0.8.1 2019-05-07 [2] CRAN (R 3.6.1)
## ggsignif 0.6.0 2019-08-08 [2] CRAN (R 3.6.1)
## ggthemes 4.2.0 2019-05-13 [2] CRAN (R 3.6.1)
## glue 1.3.1 2019-03-12 [2] CRAN (R 3.6.1)
## gridExtra 2.3 2017-09-09 [2] CRAN (R 3.6.1)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.1)
## hms 0.5.2 2019-10-30 [2] CRAN (R 3.6.1)
## htmltools 0.4.0 2019-10-04 [2] CRAN (R 3.6.1)
## htmlwidgets 1.5.1 2019-10-08 [2] CRAN (R 3.6.1)
## httpuv 1.5.2 2019-09-11 [2] CRAN (R 3.6.1)
## httr 1.4.1 2019-08-05 [2] CRAN (R 3.6.1)
## hwriter 1.3.2 2014-09-10 [2] CRAN (R 3.6.1)
## interactiveDisplayBase 1.24.0 2019-10-29 [1] Bioconductor
## IRanges * 2.20.0 2019-10-29 [1] Bioconductor
## iterators 1.0.12 2019-07-26 [2] CRAN (R 3.6.1)
## jsonlite 1.6 2018-12-07 [2] CRAN (R 3.6.1)
## km.ci 0.5-2 2009-08-30 [2] CRAN (R 3.6.1)
## KMsurv 0.1-5 2012-12-03 [2] CRAN (R 3.6.1)
## knitr 1.26 2019-11-12 [2] CRAN (R 3.6.1)
## later 1.0.0 2019-10-04 [2] CRAN (R 3.6.1)
## lattice 0.20-38 2018-11-04 [4] CRAN (R 3.6.0)
## latticeExtra 0.6-28 2016-02-09 [2] CRAN (R 3.6.1)
## lazyeval 0.2.2 2019-03-15 [2] CRAN (R 3.6.1)
## lifecycle 0.1.0 2019-08-01 [2] CRAN (R 3.6.1)
## limma 3.42.0 2019-10-29 [1] Bioconductor
## locfit 1.5-9.1 2013-04-20 [2] CRAN (R 3.6.1)
## magrittr 1.5 2014-11-22 [2] CRAN (R 3.6.1)
## Matrix 1.2-18 2019-11-27 [2] CRAN (R 3.6.1)
## matrixStats * 0.55.0 2019-09-07 [2] CRAN (R 3.6.1)
## memoise 1.1.0 2017-04-21 [2] CRAN (R 3.6.1)
## mgcv 1.8-31 2019-11-09 [4] CRAN (R 3.6.1)
## mime 0.7 2019-06-11 [2] CRAN (R 3.6.1)
## munsell 0.5.0 2018-06-12 [2] CRAN (R 3.6.1)
## nlme 3.1-142 2019-11-07 [4] CRAN (R 3.6.1)
## openssl 1.4.1 2019-07-18 [2] CRAN (R 3.6.1)
## parsetools 0.1.1 2019-07-11 [2] CRAN (R 3.6.1)
## pillar 1.4.2 2019-06-29 [2] CRAN (R 3.6.1)
## pkgbuild 1.0.6 2019-10-09 [2] CRAN (R 3.6.1)
## pkgcond 0.1.0 2018-12-03 [2] CRAN (R 3.6.1)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 3.6.1)
## pkgload 1.0.2 2018-10-29 [2] CRAN (R 3.6.1)
## plyr 1.8.4 2016-06-08 [2] CRAN (R 3.6.1)
## postlogic 0.1.0 2018-11-26 [2] CRAN (R 3.6.1)
## preprocessCore 1.48.0 2019-10-29 [1] Bioconductor
## prettyunits 1.0.2 2015-07-13 [2] CRAN (R 3.6.1)
## processx 3.4.1 2019-07-18 [2] CRAN (R 3.6.1)
## progress 1.2.2 2019-05-16 [2] CRAN (R 3.6.1)
## promises 1.1.0 2019-10-04 [2] CRAN (R 3.6.1)
## ps 1.3.0 2018-12-21 [2] CRAN (R 3.6.1)
## purrr 0.3.3 2019-10-18 [1] CRAN (R 3.6.1)
## purrrogress 0.1.1 2019-07-22 [2] CRAN (R 3.6.1)
## R.methodsS3 1.7.1 2016-02-16 [2] CRAN (R 3.6.1)
## R.oo 1.23.0 2019-11-03 [1] CRAN (R 3.6.1)
## R.utils 2.9.2 2019-12-08 [2] CRAN (R 3.6.1)
## R6 2.4.1 2019-11-12 [2] CRAN (R 3.6.1)
## randomForest 4.6-14 2018-03-25 [2] CRAN (R 3.6.1)
## rappdirs 0.3.1 2016-03-28 [2] CRAN (R 3.6.1)
## RColorBrewer 1.1-2 2014-12-07 [1] CRAN (R 3.6.1)
## Rcpp 1.0.3 2019-11-08 [2] CRAN (R 3.6.1)
## RCurl 1.95-4.12 2019-03-04 [2] CRAN (R 3.6.1)
## readr 1.3.1 2018-12-21 [2] CRAN (R 3.6.1)
## remotes 2.1.0 2019-06-24 [2] CRAN (R 3.6.1)
## rlang 0.4.2 2019-11-23 [2] CRAN (R 3.6.1)
## rmarkdown 1.16 2019-10-01 [1] CRAN (R 3.6.1)
## rprojroot 1.3-2 2018-01-03 [2] CRAN (R 3.6.1)
## Rsamtools 2.2.0 2019-10-29 [1] Bioconductor
## RSQLite 2.1.4 2019-12-04 [2] CRAN (R 3.6.1)
## rtracklayer 1.46.0 2019-10-29 [1] Bioconductor
## rvest 0.3.5 2019-11-08 [2] CRAN (R 3.6.1)
## S4Vectors * 0.24.0 2019-10-29 [1] Bioconductor
## scales 1.1.0 2019-11-18 [2] CRAN (R 3.6.1)
## selectr 0.4-2 2019-11-20 [2] CRAN (R 3.6.1)
## sesame * 1.4.0 2019-10-29 [1] Bioconductor
## sesameData * 1.4.0 2019-11-05 [1] Bioconductor
## sessioninfo 1.1.1 2018-11-05 [2] CRAN (R 3.6.1)
## shiny 1.4.0 2019-10-10 [2] CRAN (R 3.6.1)
## ShortRead 1.44.0 2019-10-29 [1] Bioconductor
## stringi 1.4.3 2019-03-12 [2] CRAN (R 3.6.1)
## stringr 1.4.0 2019-02-10 [2] CRAN (R 3.6.1)
## SummarizedExperiment * 1.16.0 2019-10-29 [1] Bioconductor
## survival 3.1-8 2019-12-03 [4] CRAN (R 3.6.1)
## survminer 0.4.6 2019-09-03 [2] CRAN (R 3.6.1)
## survMisc 0.5.5 2018-07-05 [2] CRAN (R 3.6.1)
## sva 3.34.0 2019-10-29 [1] Bioconductor
## R TCGAbiolinks * 2.15.2 <NA> [2] <NA>
## TCGAbiolinksGUI.data * 1.7.0 2019-12-04 [2] Bioconductor
## testextra 0.1.0 2019-01-18 [2] CRAN (R 3.6.1)
## testthat 2.3.1 2019-12-01 [2] CRAN (R 3.6.1)
## tibble 2.1.3 2019-06-06 [2] CRAN (R 3.6.1)
## tidyr 1.0.0 2019-09-11 [2] CRAN (R 3.6.1)
## tidyselect 0.2.5 2018-10-11 [2] CRAN (R 3.6.1)
## usethis 1.5.1 2019-07-04 [2] CRAN (R 3.6.1)
## vctrs 0.2.0 2019-07-05 [2] CRAN (R 3.6.1)
## wheatmap 0.1.0 2018-03-15 [2] CRAN (R 3.6.1)
## withr 2.1.2 2018-03-15 [2] CRAN (R 3.6.1)
## xfun 0.10 2019-10-01 [1] CRAN (R 3.6.1)
## XML 3.98-1.20 2019-06-06 [2] CRAN (R 3.6.1)
## xml2 1.2.2 2019-08-09 [2] CRAN (R 3.6.1)
## xtable 1.8-4 2019-04-21 [2] CRAN (R 3.6.1)
## XVector 0.26.0 2019-10-29 [1] Bioconductor
## yaml 2.2.0 2018-07-25 [2] CRAN (R 3.6.1)
## zeallot 0.1.0 2018-01-28 [2] CRAN (R 3.6.1)
## zlibbioc 1.32.0 2019-10-29 [1] Bioconductor
## zoo 1.8-6 2019-05-28 [2] CRAN (R 3.6.1)
##
## [1] /home/tiagochst/R/x86_64-pc-linux-gnu-library/3.6
## [2] /usr/local/lib/R/site-library
## [3] /usr/lib/R/site-library
## [4] /usr/lib/R/library
##
## R ── Package was removed from disk.