Load TCGA data

if(!require(RTCGAToolbox)){
  biocLite("devtools")
  biocLite(c("limma", "RCircos", "data.table", "RCurl", "RJSONIO"))
  biocLite("Link-NY/RTCGAToolbox")
}
## Warning: replacing previous import by 'IRanges::shift' when loading
## 'RTCGAToolbox'
library(DESeq2)
## Warning: package 'DESeq2' was built under R version 3.2.3
## Warning: package 'RcppArmadillo' was built under R version 3.2.3

Download TCGA PRAD data

library(RTCGAToolbox)
rundates <- getFirehoseRunningDates()
analysisdates <- getFirehoseAnalyzeDates()
prad <- getFirehoseData("PRAD", runDate=rundates[1],
                      gistic2_Date=analysisdates[1], RNAseq_Gene=TRUE, 
        miRNASeq_Gene=TRUE, RNAseq2_Gene_Norm=TRUE, CNA_SNP=TRUE,
        CNV_SNP=TRUE, CNA_Seq=TRUE, CNA_CGH=TRUE,  Methylation=TRUE,
        Mutation=TRUE, mRNA_Array=TRUE, miRNA_Array=TRUE, RPPA=TRUE)
## gdac.broadinstitute.org_PRAD.Clinical_Pick_Tier1.Level_4.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Clinical_Pick_Tier1.Level_4.2015040200.0.0
## 
## gdac.broadinstitute.org_PRAD.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2015040200.0.0
## RNAseq2 data will be imported! This may take a while!
## Start: 2015-12-22 13:16:19
## 
Read 48.7% of 20533 rows
Read 97.4% of 20533 rows
Read 20533 rows and 551 (of 551) columns from 0.090 GB file in 00:00:09
## Done: 2015-12-22 13:16:28
## gdac.broadinstitute.org_PRAD.Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3.2015040200.0.0
## miRNAseq data will be imported! This may take a while!
## Start: 2015-12-22 13:16:49
## Done: 2015-12-22 13:16:49
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_hg19__seg.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_hg19__seg.Level_3.2015040200.0.0
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015040200.0.0
## gdac.broadinstitute.org_PRAD.Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg.Level_3.2015040200.0.0
## 
## gdac.broadinstitute.org_PRAD.Merge_methylation__humanmethylation450__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.Level_3.2015040200.0.0.tar.gz
## http://gdac.broadinstitute.org/runs/stddata__2015_04_02/data/PRAD/20150402/gdac.broadinstitute.org_PRAD.Merge_methylation__humanmethylation450__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.Level_3.2015040200.0.0.tar.gz
## File Size: ~2066MB
## File above won't be downloaded due to data size, RTCGAToolbox will skip this data!
## 
## 
## 
## 
## 
## gdac.broadinstitute.org_PRAD.Mutation_Packager_Calls.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD-TP.CopyNumber_Gistic2.Level_4.2014101700.0.0.tar.gz

Extract available data types

choices <- tolower(gsub("_", "", c("RNAseq_Gene", "miRNASeq_Gene",
             "RNAseq2_Gene_Norm", "CNA_SNP", "CNV_SNP", "CNA_Seq",
             "CNA_CGH", "Methylation", "Mutation", "mRNA_Array",
             "miRNA_Array", "RPPA")))
dses <- lapply(choices, function(choice) try(extract(prad, choice, 
                                            clinic=TRUE),
                                             silent=TRUE))
names(dses) <- choices
dses
## $rnaseqgene
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $mirnaseqgene
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1046 features, 537 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: tcga-2a-a8vl-01 tcga-2a-a8vo-01 ... tcga-zg-a9ni-01
##     (537 total)
##   varLabels: vital_status days_to_death ... center (242 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:  
## 
## $rnaseq2genenorm
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 20501 features, 540 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: tcga-2a-a8vl-01 tcga-2a-a8vo-01 ... tcga-zg-a9ni-01
##     (540 total)
##   varLabels: vital_status days_to_death ... center (242 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:  
## 
## $cnasnp
## GRangesList object of length 1009:
## $tcga-2a-a8vl-01 
## GRanges object with 377 ranges and 2 metadata columns:
##         seqnames                ranges strand   | Num_Probes Segment_Mean
##            <Rle>             <IRanges>  <Rle>   |  <numeric>    <numeric>
##     [1]     chr1  [   61735, 16153497]      *   |       8417       0.0192
##     [2]     chr1  [16153536, 16155010]      *   |          9      -1.3997
##     [3]     chr1  [16165661, 25583291]      *   |       5733       0.0271
##     [4]     chr1  [25583341, 25646986]      *   |         29      -1.7924
##     [5]     chr1  [25661501, 35091594]      *   |       4967       0.0196
##     ...      ...                   ...    ... ...        ...          ...
##   [373]    chr22 [24401529,  51234455]      *   |      18918       0.0199
##   [374]    chr23 [  168477,  34044373]      *   |      21155       0.0011
##   [375]    chr23 [34046553,  34070287]      *   |         23      -3.3231
##   [376]    chr23 [34073425, 155182354]      *   |      62721       0.0039
##   [377]    chr24 [ 2650438,  59018259]      *   |       8631      -0.8552
## 
## ...
## <1008 more elements>
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
## 
## $cnvsnp
## GRangesList object of length 1003:
## $tcga-2a-a8vl-01 
## GRanges object with 117 ranges and 2 metadata columns:
##         seqnames                ranges strand   | Num_Probes Segment_Mean
##            <Rle>             <IRanges>  <Rle>   |  <numeric>    <numeric>
##     [1]     chr1 [ 3218610,  46599744]      *   |      22807       0.0218
##     [2]     chr1 [46601887,  53641875]      *   |       3572      -0.3873
##     [3]     chr1 [53641911,  58304399]      *   |       3399       0.0166
##     [4]     chr1 [58314187,  58497759]      *   |        168        -0.37
##     [5]     chr1 [58498264, 104381295]      *   |      28166       0.0157
##     ...      ...                   ...    ... ...        ...          ...
##   [113]    chr21 [42827478,  42864812]      *   |        101      -0.8952
##   [114]    chr21 [42867190,  42872410]      *   |         26      -0.3935
##   [115]    chr21 [42875404,  47678774]      *   |       2367       0.0223
##   [116]    chr22 [17423930,  49331012]      *   |      16920        0.024
##   [117]    chr23 [ 3157107, 154905589]      *   |      63256       0.0041
## 
## ...
## <1002 more elements>
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
## 
## $cnaseq
## GRangesList object of length 230:
## $tcga-ch-5741-01 
## GRanges object with 88 ranges and 2 metadata columns:
##        seqnames                 ranges strand   | Num_Probes
##           <Rle>              <IRanges>  <Rle>   |  <numeric>
##    [1]     chr1 [    10208, 170998749]      *   |       <NA>
##    [2]     chr1 [170998750, 171222653]      *   |       <NA>
##    [3]     chr1 [171222654, 249240606]      *   |       <NA>
##    [4]     chr2 [    10001, 243189359]      *   |       <NA>
##    [5]     chr3 [    60174,  70581515]      *   |       <NA>
##    ...      ...                    ...    ... ...        ...
##   [84]    chr23 [128529035, 128582748]      *   |       <NA>
##   [85]    chr23 [128582749, 154930285]      *   |       <NA>
##   [86]    chr24 [  2649474,   6711879]      *   |       <NA>
##   [87]    chr24 [  6711880,   6813216]      *   |       <NA>
##   [88]    chr24 [  6813217,  28809960]      *   |       <NA>
##              Segment_Mean
##                 <numeric>
##    [1] 0.0445519350183502
##    [2] -0.719244299367207
##    [3] 0.0416368338763181
##    [4] 0.0400665363289484
##    [5] 0.0357777450352256
##    ...                ...
##   [84]  -2.31507050198042
##   [85] 0.0277920429090032
##   [86] 0.0865462501641183
##   [87]  0.952415386548219
##   [88]  0.101459215941079
## 
## ...
## <229 more elements>
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
## 
## $cnacgh
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $methylation
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $mutation
## GRangesList object of length 420:
## $tcga-2a-a8vl-01 
## GRanges object with 39 ranges and 0 metadata columns:
##        seqnames                 ranges strand
##           <Rle>              <IRanges>  <Rle>
##    [1]    chr10 [ 29775061,  29775061]      +
##    [2]     chr2 [ 30480447,  30480447]      +
##    [3]     chr8 [ 39495171,  39495171]      +
##    [4]     chr1 [158262073, 158262073]      +
##    [5]     chr8 [ 41519413,  41519413]      +
##    ...      ...                    ...    ...
##   [35]     chr3 [138739004, 138739004]      +
##   [36]    chr20 [ 33440316,  33440316]      +
##   [37]    chr20 [ 33324502,  33324502]      +
##   [38]     chr3 [195713385, 195713386]      +
##   [39]     chr5 [180708776, 180708777]      +
## 
## ...
## <419 more elements>
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
## 
## $mrnaarray
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $mirnaarray
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $rppa
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
eset.rnaseq <- extract(prad, "rnaseq2genenorm")
write.csv(exprs(eset.rnaseq), file="prad_rnaseq.csv")
write.csv(pData(eset.rnaseq), file="prad_clinical.csv")
saveRDS(eset.rnaseq, file="prad_eset.rds")

To load the eset again:

eset.rnaseq <- readRDS("prad_eset.rds")

Make a histogram of PSA (KLK3) expression

hist(exprs(eset.rnaseq["KLK3", ]))

hist(log(exprs(eset.rnaseq["KLK3", ])))

See what clinical data are available by default:

summary(pData(eset.rnaseq))

Look for association between KLK3 expression and clinical PSA

psadat <- data.frame(psa=as.numeric(as.character(eset.rnaseq$patient.stage_event.psa.psa_value)),
                     klk3=t(exprs(eset.rnaseq["KLK3", ])))
psadat.complete <- psadat[complete.cases(psadat), ]
plot(KLK3 ~ psa, data=psadat.complete, xlab="clinical PSA", ylab="KLK3 tumor expression", log="xy")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 29 x values <= 0 omitted
## from logarithmic plot
fit <- lowess(x=psadat.complete$psa, y=psadat.complete$KLK3)
lines(fit, col="red", lw=3)

Is there an association between PSA in the urine and KLK3 in the tumor?

cor.test(x=psadat$KLK3,  y=psadat$psa, method="spearman")
## Warning in cor.test.default(x = psadat$KLK3, y = psadat$psa, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  psadat$KLK3 and psadat$psa
## S = 21183000, p-value = 0.0005893
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1564584

Clinical psa as a function of ethnicity

Need to get the better race variable with complete data from Tiffany:

table(eset.rnaseq$race)
## 
##                     asian black or african american 
##                         2                        11 
##                     white 
##                       189
boxplot(psadat$psa ~ eset.rnaseq$race, ylab="PSA")