Load TCGA data

if(!require(RTCGAToolbox)){
  biocLite("devtools")
  biocLite(c("limma", "RCircos", "data.table", "RCurl", "RJSONIO"))
  biocLite("Link-NY/RTCGAToolbox")
}
## Warning: replacing previous import by 'IRanges::shift' when loading
## 'RTCGAToolbox'
library(DESeq2)
## Warning: package 'DESeq2' was built under R version 3.2.3
## Warning: package 'RcppArmadillo' was built under R version 3.2.3

Download TCGA PRAD data

library(RTCGAToolbox)
rundates <- getFirehoseRunningDates()
analysisdates <- getFirehoseAnalyzeDates()
prad <- getFirehoseData("PRAD", runDate=rundates[1],
                      gistic2_Date=analysisdates[1], RNAseq_Gene=TRUE, 
        miRNASeq_Gene=TRUE, RNAseq2_Gene_Norm=TRUE, CNA_SNP=TRUE,
        CNV_SNP=TRUE, CNA_Seq=TRUE, CNA_CGH=TRUE,  Methylation=TRUE,
        Mutation=TRUE, mRNA_Array=TRUE, miRNA_Array=TRUE, RPPA=TRUE)
## gdac.broadinstitute.org_PRAD.Clinical_Pick_Tier1.Level_4.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Clinical_Pick_Tier1.Level_4.2015040200.0.0
## 
## gdac.broadinstitute.org_PRAD.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2015040200.0.0
## RNAseq2 data will be imported! This may take a while!
## Start: 2015-12-22 13:16:19
## 
Read 48.7% of 20533 rows
Read 97.4% of 20533 rows
Read 20533 rows and 551 (of 551) columns from 0.090 GB file in 00:00:09
## Done: 2015-12-22 13:16:28
## gdac.broadinstitute.org_PRAD.Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3.2015040200.0.0
## miRNAseq data will be imported! This may take a while!
## Start: 2015-12-22 13:16:49
## Done: 2015-12-22 13:16:49
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_hg19__seg.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_hg19__seg.Level_3.2015040200.0.0
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015040200.0.0
## gdac.broadinstitute.org_PRAD.Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg.Level_3.2015040200.0.0
## 
## gdac.broadinstitute.org_PRAD.Merge_methylation__humanmethylation450__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.Level_3.2015040200.0.0.tar.gz
## http://gdac.broadinstitute.org/runs/stddata__2015_04_02/data/PRAD/20150402/gdac.broadinstitute.org_PRAD.Merge_methylation__humanmethylation450__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.Level_3.2015040200.0.0.tar.gz
## File Size: ~2066MB
## File above won't be downloaded due to data size, RTCGAToolbox will skip this data!
## 
## 
## 
## 
## 
## gdac.broadinstitute.org_PRAD.Mutation_Packager_Calls.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD-TP.CopyNumber_Gistic2.Level_4.2014101700.0.0.tar.gz

Extract available data types

choices <- tolower(gsub("_", "", c("RNAseq_Gene", "miRNASeq_Gene",
             "RNAseq2_Gene_Norm", "CNA_SNP", "CNV_SNP", "CNA_Seq",
             "CNA_CGH", "Methylation", "Mutation", "mRNA_Array",
             "miRNA_Array", "RPPA")))
dses <- lapply(choices, function(choice) try(extract(prad, choice, 
                                            clinic=TRUE),
                                             silent=TRUE))
names(dses) <- choices
dses
## $rnaseqgene
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $mirnaseqgene
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1046 features, 537 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: tcga-2a-a8vl-01 tcga-2a-a8vo-01 ... tcga-zg-a9ni-01
##     (537 total)
##   varLabels: vital_status days_to_death ... center (242 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:  
## 
## $rnaseq2genenorm
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 20501 features, 540 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: tcga-2a-a8vl-01 tcga-2a-a8vo-01 ... tcga-zg-a9ni-01
##     (540 total)
##   varLabels: vital_status days_to_death ... center (242 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:  
## 
## $cnasnp
## GRangesList object of length 1009:
## $tcga-2a-a8vl-01 
## GRanges object with 377 ranges and 2 metadata columns:
##         seqnames                ranges strand   | Num_Probes Segment_Mean
##            <Rle>             <IRanges>  <Rle>   |  <numeric>    <numeric>
##     [1]     chr1  [   61735, 16153497]      *   |       8417       0.0192
##     [2]     chr1  [16153536, 16155010]      *   |          9      -1.3997
##     [3]     chr1  [16165661, 25583291]      *   |       5733       0.0271
##     [4]     chr1  [25583341, 25646986]      *   |         29      -1.7924
##     [5]     chr1  [25661501, 35091594]      *   |       4967       0.0196
##     ...      ...                   ...    ... ...        ...          ...
##   [373]    chr22 [24401529,  51234455]      *   |      18918       0.0199
##   [374]    chr23 [  168477,  34044373]      *   |      21155       0.0011
##   [375]    chr23 [34046553,  34070287]      *   |         23      -3.3231
##   [376]    chr23 [34073425, 155182354]      *   |      62721       0.0039
##   [377]    chr24 [ 2650438,  59018259]      *   |       8631      -0.8552
## 
## ...
## <1008 more elements>
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
## 
## $cnvsnp
## GRangesList object of length 1003:
## $tcga-2a-a8vl-01 
## GRanges object with 117 ranges and 2 metadata columns:
##         seqnames                ranges strand   | Num_Probes Segment_Mean
##            <Rle>             <IRanges>  <Rle>   |  <numeric>    <numeric>
##     [1]     chr1 [ 3218610,  46599744]      *   |      22807       0.0218
##     [2]     chr1 [46601887,  53641875]      *   |       3572      -0.3873
##     [3]     chr1 [53641911,  58304399]      *   |       3399       0.0166
##     [4]     chr1 [58314187,  58497759]      *   |        168        -0.37
##     [5]     chr1 [58498264, 104381295]      *   |      28166       0.0157
##     ...      ...                   ...    ... ...        ...          ...
##   [113]    chr21 [42827478,  42864812]      *   |        101      -0.8952
##   [114]    chr21 [42867190,  42872410]      *   |         26      -0.3935
##   [115]    chr21 [42875404,  47678774]      *   |       2367       0.0223
##   [116]    chr22 [17423930,  49331012]      *   |      16920        0.024
##   [117]    chr23 [ 3157107, 154905589]      *   |      63256       0.0041
## 
## ...
## <1002 more elements>
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
## 
## $cnaseq
## GRangesList object of length 230:
## $tcga-ch-5741-01 
## GRanges object with 88 ranges and 2 metadata columns:
##        seqnames                 ranges strand   | Num_Probes
##           <Rle>              <IRanges>  <Rle>   |  <numeric>
##    [1]     chr1 [    10208, 170998749]      *   |       <NA>
##    [2]     chr1 [170998750, 171222653]      *   |       <NA>
##    [3]     chr1 [171222654, 249240606]      *   |       <NA>
##    [4]     chr2 [    10001, 243189359]      *   |       <NA>
##    [5]     chr3 [    60174,  70581515]      *   |       <NA>
##    ...      ...                    ...    ... ...        ...
##   [84]    chr23 [128529035, 128582748]      *   |       <NA>
##   [85]    chr23 [128582749, 154930285]      *   |       <NA>
##   [86]    chr24 [  2649474,   6711879]      *   |       <NA>
##   [87]    chr24 [  6711880,   6813216]      *   |       <NA>
##   [88]    chr24 [  6813217,  28809960]      *   |       <NA>
##              Segment_Mean
##                 <numeric>
##    [1] 0.0445519350183502
##    [2] -0.719244299367207
##    [3] 0.0416368338763181
##    [4] 0.0400665363289484
##    [5] 0.0357777450352256
##    ...                ...
##   [84]  -2.31507050198042
##   [85] 0.0277920429090032
##   [86] 0.0865462501641183
##   [87]  0.952415386548219
##   [88]  0.101459215941079
## 
## ...
## <229 more elements>
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
## 
## $cnacgh
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $methylation
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $mutation
## GRangesList object of length 420:
## $tcga-2a-a8vl-01 
## GRanges object with 39 ranges and 0 metadata columns:
##        seqnames                 ranges strand
##           <Rle>              <IRanges>  <Rle>
##    [1]    chr10 [ 29775061,  29775061]      +
##    [2]     chr2 [ 30480447,  30480447]      +
##    [3]     chr8 [ 39495171,  39495171]      +
##    [4]     chr1 [158262073, 158262073]      +
##    [5]     chr8 [ 41519413,  41519413]      +
##    ...      ...                    ...    ...
##   [35]     chr3 [138739004, 138739004]      +
##   [36]    chr20 [ 33440316,  33440316]      +
##   [37]    chr20 [ 33324502,  33324502]      +
##   [38]     chr3 [195713385, 195713386]      +
##   [39]     chr5 [180708776, 180708777]      +
## 
## ...
## <419 more elements>
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
## 
## $mrnaarray
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $mirnaarray
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
## 
## $rppa
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n  There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
eset.rnaseq <- extract(prad, "rnaseq2genenorm")
write.csv(exprs(eset.rnaseq), file="prad_rnaseq.csv")
write.csv(pData(eset.rnaseq), file="prad_clinical.csv")
saveRDS(eset.rnaseq, file="prad_eset.rds")

To load the eset again:

eset.rnaseq <- readRDS("prad_eset.rds")

Make a histogram of PSA (KLK3) expression

hist(exprs(eset.rnaseq["KLK3", ]))

hist(log(exprs(eset.rnaseq["KLK3", ])))

See what clinical data are available by default:

summary(pData(eset.rnaseq))

Look for association between KLK3 expression and clinical PSA

psadat <- data.frame(psa=as.numeric(as.character(eset.rnaseq$patient.stage_event.psa.psa_value)),
                     klk3=t(exprs(eset.rnaseq["KLK3", ])))
psadat.complete <- psadat[complete.cases(psadat), ]
plot(KLK3 ~ psa, data=psadat.complete, xlab="clinical PSA", ylab="KLK3 tumor expression", log="xy")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 29 x values <= 0 omitted
## from logarithmic plot
fit <- lowess(x=psadat.complete$psa, y=psadat.complete$KLK3)
lines(fit, col="red", lw=3)

Is there an association between PSA in the urine and KLK3 in the tumor?

cor.test(x=psadat$KLK3,  y=psadat$psa, method="spearman")
## Warning in cor.test.default(x = psadat$KLK3, y = psadat$psa, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  psadat$KLK3 and psadat$psa
## S = 21183000, p-value = 0.0005893
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1564584

Clinical psa as a function of ethnicity

Need to get the better race variable with complete data from Tiffany:

table(eset.rnaseq$race)
## 
##                     asian black or african american 
##                         2                        11 
##                     white 
##                       189
boxplot(psadat$psa ~ eset.rnaseq$race, ylab="PSA")

MicroRNA dataset

Create and save to disk:

library(Biobase)
eset <- extract(prad, "miRNASeq_Gene")
saveRDS(eset, file="prad_mirna_eset.rds")
eset <- readRDS("prad_mirna_eset.rds")
mergeVecs <- function(x1, x2){
  ##x1 and x2 are vectors to be merged.
  ##x1 will be over-written by x2, and in case of conflict, x2 takes priority
  if(!identical(length(x1), length(x2))) stop("x1 and x2 must have the same length")
  if(!identical(class(x1), class(x2))) stop("x1 and x2 must have the same class")
  x1[is.na(x1)] = x2[is.na(x1)]
  mismatches <- which(x1 != x2)
  if(length(mismatches) > 0){
    warning(paste("There were mismatches in positions:", paste0(mismatches, collapse=", ")))
    x1[mismatches] = x2[mismatches]
  }
  return(x1)
}
mergeVecs(x1=c(1, 2, 3, NA), x2=c(2, 2, NA, 4))
## Warning in mergeVecs(x1 = c(1, 2, 3, NA), x2 = c(2, 2, NA, 4)): There were
## mismatches in positions: 1
## [1] 2 2 3 4
eset$racevar = factor(mergeVecs(as.character(eset$patient.clinical_cqcf.race), as.character(eset$patient.race)))
eset$batch <- factor(eset$batch)
eset = eset[, eset$racevar %in% c("white", "black or african american")]
eset$gleason <- as.numeric(eset$gleason_score)

DESeq2 correcting for batch and Gleason score

countData <- exprs(eset)
colData <- pData(eset)
dds <- DESeqDataSetFromMatrix(countData = countData,
                              colData = colData,
                              design = ~ gleason + batch_number + racevar)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## the design formula contains a numeric variable with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function
## factor levels were dropped which had no samples
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 847 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res.batch.gleason <- results(dds, contrast=c("racevar", "black or african american", "white"))
res.batch.gleason <- res.batch.gleason[order(res.batch.gleason$pvalue), ]
write.csv(res.batch.gleason, file="DESeq2_batch.gleason.csv")
(res.batch.gleason.sig <- res.batch.gleason[which(res.batch.gleason$padj < 0.05), ])
## log2 fold change (MAP): racevar black or african american vs white 
## Wald test p-value: racevar black or african american vs white 
## DataFrame with 20 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat       pvalue
##                 <numeric>      <numeric> <numeric> <numeric>    <numeric>
## hsa-mir-1304     3.376936      1.1532538 0.1272248  9.064691 1.249581e-19
## hsa-mir-767      2.082448      0.6921243 0.1159498  5.969172 2.384600e-09
## hsa-mir-105-2    1.494232      0.6365401 0.1150669  5.531912 3.167595e-08
## hsa-mir-105-1    1.366677      0.5715556 0.1146741  4.984175 6.222678e-07
## hsa-mir-3647    15.136337      0.4501900 0.1046114  4.303450 1.681586e-05
## ...                   ...            ...       ...       ...          ...
## hsa-mir-196a-1 16.3360090      0.3867707 0.1178358  3.282285  0.001029694
## hsa-mir-137    11.2896811      0.3737656 0.1158931  3.225090  0.001259330
## hsa-mir-346     0.4051674      0.4015446 0.1250443  3.211218  0.001321736
## hsa-mir-585     0.9648963     -0.4075468 0.1270859 -3.206862  0.001341914
## hsa-mir-1911    6.1340046     -0.4039616 0.1277449 -3.162252  0.001565538
##                        padj
##                   <numeric>
## hsa-mir-1304   7.035139e-17
## hsa-mir-767    6.712649e-07
## hsa-mir-105-2  5.944520e-06
## hsa-mir-105-1  8.758420e-05
## hsa-mir-3647   1.893466e-03
## ...                     ...
## hsa-mir-196a-1   0.03623235
## hsa-mir-137      0.03976303
## hsa-mir-346      0.03976303
## hsa-mir-585      0.03976303
## hsa-mir-1911     0.04406988

Heatmap

library(pheatmap)
## Warning: package 'pheatmap' was built under R version 3.2.3
sig.mir <- rownames(countData)[rownames(countData) %in% rownames(res.batch.gleason.sig)]
nt <- normTransform(dds) # defaults to log2(x+1)
log2.norm.counts <- assay(nt)[sig.mir, ]
df <- as.data.frame(colData(dds))[, c("gleason", "racevar"), drop=FALSE]
raceorder <- order(df$racevar, colSums(log2.norm.counts))
df <- df[raceorder, , drop=FALSE]
log2.norm.counts <- log2.norm.counts[, raceorder]
stopifnot(identical(rownames(df), colnames(log2.norm.counts)))
pheatmap(log2.norm.counts, cluster_rows=TRUE, show_colnames=FALSE, 
         cluster_cols=TRUE,
         clustering_distance_rows="correlation", 
         clustering_distance_cols="correlation", 
         scale="none", annotation_col=df["racevar"])

Boxplot of the top differentially expressed miRNA

topmir=rownames(res.batch.gleason.sig)[1]
boxplot(log2.norm.counts[topmir, ] ~ df$racevar,
        xlab="Race", ylab="log2 expression", main=topmir)
stripchart(log2.norm.counts[topmir, ] ~ df$racevar, vertical=TRUE, method="jitter",
            pch = 21, col = "maroon", bg = "bisque", 
            add = TRUE)

DESeq2 correcting for batch only

dds2 <- DESeqDataSetFromMatrix(countData = countData,
                              colData = colData,
                              design = ~ batch_number + racevar)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## factor levels were dropped which had no samples
dds2 <- DESeq(dds2)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 848 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res.batch <- results(dds2, contrast=c("racevar", "black or african american", "white"))
res.batch <- res.batch[order(res.batch$pvalue), ]
write.csv(res.batch, file="DESeq2_batch.csv")
(res.batch.sig <- res.batch[which(res.batch$padj < 0.05), ])
## log2 fold change (MAP): racevar black or african american vs white 
## Wald test p-value: racevar black or african american vs white 
## DataFrame with 16 rows and 6 columns
##                   baseMean log2FoldChange      lfcSE      stat
##                  <numeric>      <numeric>  <numeric> <numeric>
## hsa-mir-1304      3.376936      1.3193026  0.1413378  9.334391
## hsa-mir-767       2.082448      0.7612973  0.1422793  5.350726
## hsa-mir-105-2     1.494232      0.6719905  0.1407723  4.773600
## hsa-mir-3647     15.136337      0.4711682  0.1083900  4.346971
## hsa-mir-105-1     1.366677      0.6113479  0.1408223  4.341271
## ...                    ...            ...        ...       ...
## hsa-mir-585   9.648963e-01     -0.5062484 0.15085353 -3.355893
## hsa-let-7c    2.985206e+04      0.2686979 0.08076004  3.327114
## hsa-mir-1298  7.507258e+00     -0.5004508 0.15058351 -3.323410
## hsa-mir-202   2.887445e+00     -0.4813462 0.14614334 -3.293658
## hsa-mir-1224  3.135288e+00      0.4869132 0.14809472  3.287850
##                     pvalue         padj
##                  <numeric>    <numeric>
## hsa-mir-1304  1.015740e-20 6.196011e-18
## hsa-mir-767   8.760207e-08 2.671863e-05
## hsa-mir-105-2 1.809615e-06 3.679550e-04
## hsa-mir-3647  1.380305e-05 1.728259e-03
## hsa-mir-105-1 1.416606e-05 1.728259e-03
## ...                    ...          ...
## hsa-mir-585   0.0007910903   0.03848933
## hsa-let-7c    0.0008775036   0.03848933
## hsa-mir-1298  0.0008892402   0.03848933
## hsa-mir-202   0.0009889265   0.03848933
## hsa-mir-1224  0.0010095562   0.03848933

Heatmap

library(pheatmap)
sig.mir.batch <- rownames(countData)[rownames(countData) %in% rownames(res.batch.sig)]
nt <- normTransform(dds2) # defaults to log2(x+1)
log2.norm.counts <- assay(nt)[sig.mir.batch, ]
df <- as.data.frame(colData(dds2)[, c("gleason","racevar")])
raceorder <- order(df$racevar, colSums(log2.norm.counts))
df <- df[raceorder, , drop=FALSE]
log2.norm.counts <- log2.norm.counts[, raceorder]
stopifnot(identical(rownames(df), colnames(log2.norm.counts)))
pheatmap(log2.norm.counts, cluster_rows=TRUE, 
         show_colnames=FALSE, cluster_cols=FALSE,
         clustering_distance_rows="correlation", 
         clustering_distance_cols="correlation", 
         scale="row", annotation_col=df["racevar"])

Not needed, here for posterity. Loading & checking Tiffany’s racevar.

Load Tiffany’s file with the race variable and make its barcodes equivalent to eset:

racevar <- read.csv("racevariable.csv", stringsAsFactors = FALSE)
racevar[, 1] <- paste0(racevar[, 1], "-01")
racevar[, 1] <- gsub(".", "-", racevar[, 1], fixed=TRUE)

For interests’ sake, which patients are in one dataset but not the other?

racevar[, 1][!racevar[, 1] %in% sampleNames(eset)]
##  [1] "tcga-2a-a8vt-01" "tcga-2a-a8vv-01" "tcga-2a-a8w1-01"
##  [4] "tcga-ch-5751-01" "tcga-ej-5499-01" "tcga-ej-5502-01"
##  [7] "tcga-ej-a8fo-01" "tcga-ej-ab20-01" "tcga-fc-7708-01"
## [10] "tcga-fc-a4ji-01" "tcga-g9-6347-01" "tcga-g9-6354-01"
## [13] "tcga-hc-7741-01" "tcga-hc-8212-01" "tcga-j4-8200-01"
## [16] "tcga-j4-aatz-01" "tcga-j9-a8ck-01" "tcga-kc-a4bv-01"
## [19] "tcga-kk-a6e0-01" "tcga-kk-a6e6-01" "tcga-kk-a6e7-01"
## [22] "tcga-kk-a7az-01" "tcga-kk-a8i9-01" "tcga-kk-a8id-01"
## [25] "tcga-m7-a71y-01" "tcga-qu-a6il-01" "tcga-v1-a8ww-01"
## [28] "tcga-v1-a9ol-01" "tcga-v1-a9zk-01" "tcga-vn-a88o-01"
## [31] "tcga-xa-a8jr-01" "tcga-xj-a9dx-01" "tcga-xk-aaja-01"
## [34] "tcga-xq-a8tb-01" "tcga-y6-a8tl-01" "tcga-yj-a8sw-01"
## [37] "tcga-zg-a9l1-01"
sampleNames(eset)[!sampleNames(eset) %in% racevar[, 1]]
##  [1] "tcga-ch-5761-11" "tcga-ch-5767-11" "tcga-ch-5768-11"
##  [4] "tcga-ch-5769-11" "tcga-ej-7115-11" "tcga-ej-7123-11"
##  [7] "tcga-ej-7125-11" "tcga-ej-7314-11" "tcga-ej-7315-11"
## [10] "tcga-ej-7317-11" "tcga-ej-7321-11" "tcga-ej-7327-11"
## [13] "tcga-ej-7328-11" "tcga-ej-7330-11" "tcga-ej-7331-11"
## [16] "tcga-ej-7781-11" "tcga-ej-7782-11" "tcga-ej-7783-11"
## [19] "tcga-ej-7784-11" "tcga-ej-7785-11" "tcga-ej-7786-11"
## [22] "tcga-ej-7789-11" "tcga-ej-7792-11" "tcga-ej-7793-11"
## [25] "tcga-ej-7794-11" "tcga-ej-7797-11" "tcga-g9-6333-11"
## [28] "tcga-g9-6342-11" "tcga-g9-6348-11" "tcga-g9-6351-11"
## [31] "tcga-g9-6356-11" "tcga-g9-6362-11" "tcga-g9-6363-11"
## [34] "tcga-g9-6365-11" "tcga-g9-6384-11" "tcga-g9-6496-11"
## [37] "tcga-g9-6499-11" "tcga-hc-7211-11" "tcga-hc-7737-11"
## [40] "tcga-hc-7738-11" "tcga-hc-7740-11" "tcga-hc-7742-11"
## [43] "tcga-hc-7745-11" "tcga-hc-7747-11" "tcga-hc-7752-11"
## [46] "tcga-hc-7819-11" "tcga-hc-8258-11" "tcga-hc-8259-11"
## [49] "tcga-hc-8260-11" "tcga-hc-8262-11" "tcga-j4-a83j-11"
## [52] "tcga-v1-a9o5-06"

Keep only patients in both eset and racevar, and match up the rows:

eset2 <- eset[, sampleNames(eset) %in% racevar[, 1]]
racevar <- racevar[racevar[, 1] %in% sampleNames(eset2), ]
racevar <- racevar[match(sampleNames(eset2), racevar[, 1]), ]
all.equal(sampleNames(eset2), racevar[, 1])
## [1] TRUE

Add racevar[, 2] to eset2:

eset2$racevar <- racevar[, 2]
summary(eset2$racevar == eset2$race)
##    Mode    TRUE    NA's 
## logical     153     305