if(!require(RTCGAToolbox)){
biocLite("devtools")
biocLite(c("limma", "RCircos", "data.table", "RCurl", "RJSONIO"))
biocLite("Link-NY/RTCGAToolbox")
}
## Warning: replacing previous import by 'IRanges::shift' when loading
## 'RTCGAToolbox'
library(DESeq2)
## Warning: package 'DESeq2' was built under R version 3.2.3
## Warning: package 'RcppArmadillo' was built under R version 3.2.3
library(RTCGAToolbox)
rundates <- getFirehoseRunningDates()
analysisdates <- getFirehoseAnalyzeDates()
prad <- getFirehoseData("PRAD", runDate=rundates[1],
gistic2_Date=analysisdates[1], RNAseq_Gene=TRUE,
miRNASeq_Gene=TRUE, RNAseq2_Gene_Norm=TRUE, CNA_SNP=TRUE,
CNV_SNP=TRUE, CNA_Seq=TRUE, CNA_CGH=TRUE, Methylation=TRUE,
Mutation=TRUE, mRNA_Array=TRUE, miRNA_Array=TRUE, RPPA=TRUE)
## gdac.broadinstitute.org_PRAD.Clinical_Pick_Tier1.Level_4.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Clinical_Pick_Tier1.Level_4.2015040200.0.0
##
## gdac.broadinstitute.org_PRAD.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2015040200.0.0
## RNAseq2 data will be imported! This may take a while!
## Start: 2015-12-22 13:16:19
##
Read 48.7% of 20533 rows
Read 97.4% of 20533 rows
Read 20533 rows and 551 (of 551) columns from 0.090 GB file in 00:00:09
## Done: 2015-12-22 13:16:28
## gdac.broadinstitute.org_PRAD.Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3.2015040200.0.0
## miRNAseq data will be imported! This may take a while!
## Start: 2015-12-22 13:16:49
## Done: 2015-12-22 13:16:49
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_hg19__seg.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_hg19__seg.Level_3.2015040200.0.0
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2015040200.0.0
## gdac.broadinstitute.org_PRAD.Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD.Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg.Level_3.2015040200.0.0
##
## gdac.broadinstitute.org_PRAD.Merge_methylation__humanmethylation450__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.Level_3.2015040200.0.0.tar.gz
## http://gdac.broadinstitute.org/runs/stddata__2015_04_02/data/PRAD/20150402/gdac.broadinstitute.org_PRAD.Merge_methylation__humanmethylation450__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.Level_3.2015040200.0.0.tar.gz
## File Size: ~2066MB
## File above won't be downloaded due to data size, RTCGAToolbox will skip this data!
##
##
##
##
##
## gdac.broadinstitute.org_PRAD.Mutation_Packager_Calls.Level_3.2015040200.0.0.tar.gz
## gdac.broadinstitute.org_PRAD-TP.CopyNumber_Gistic2.Level_4.2014101700.0.0.tar.gz
choices <- tolower(gsub("_", "", c("RNAseq_Gene", "miRNASeq_Gene",
"RNAseq2_Gene_Norm", "CNA_SNP", "CNV_SNP", "CNA_Seq",
"CNA_CGH", "Methylation", "Mutation", "mRNA_Array",
"miRNA_Array", "RPPA")))
dses <- lapply(choices, function(choice) try(extract(prad, choice,
clinic=TRUE),
silent=TRUE))
names(dses) <- choices
dses
## $rnaseqgene
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
##
## $mirnaseqgene
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1046 features, 537 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: tcga-2a-a8vl-01 tcga-2a-a8vo-01 ... tcga-zg-a9ni-01
## (537 total)
## varLabels: vital_status days_to_death ... center (242 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
##
## $rnaseq2genenorm
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 20501 features, 540 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: tcga-2a-a8vl-01 tcga-2a-a8vo-01 ... tcga-zg-a9ni-01
## (540 total)
## varLabels: vital_status days_to_death ... center (242 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
##
## $cnasnp
## GRangesList object of length 1009:
## $tcga-2a-a8vl-01
## GRanges object with 377 ranges and 2 metadata columns:
## seqnames ranges strand | Num_Probes Segment_Mean
## <Rle> <IRanges> <Rle> | <numeric> <numeric>
## [1] chr1 [ 61735, 16153497] * | 8417 0.0192
## [2] chr1 [16153536, 16155010] * | 9 -1.3997
## [3] chr1 [16165661, 25583291] * | 5733 0.0271
## [4] chr1 [25583341, 25646986] * | 29 -1.7924
## [5] chr1 [25661501, 35091594] * | 4967 0.0196
## ... ... ... ... ... ... ...
## [373] chr22 [24401529, 51234455] * | 18918 0.0199
## [374] chr23 [ 168477, 34044373] * | 21155 0.0011
## [375] chr23 [34046553, 34070287] * | 23 -3.3231
## [376] chr23 [34073425, 155182354] * | 62721 0.0039
## [377] chr24 [ 2650438, 59018259] * | 8631 -0.8552
##
## ...
## <1008 more elements>
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
##
## $cnvsnp
## GRangesList object of length 1003:
## $tcga-2a-a8vl-01
## GRanges object with 117 ranges and 2 metadata columns:
## seqnames ranges strand | Num_Probes Segment_Mean
## <Rle> <IRanges> <Rle> | <numeric> <numeric>
## [1] chr1 [ 3218610, 46599744] * | 22807 0.0218
## [2] chr1 [46601887, 53641875] * | 3572 -0.3873
## [3] chr1 [53641911, 58304399] * | 3399 0.0166
## [4] chr1 [58314187, 58497759] * | 168 -0.37
## [5] chr1 [58498264, 104381295] * | 28166 0.0157
## ... ... ... ... ... ... ...
## [113] chr21 [42827478, 42864812] * | 101 -0.8952
## [114] chr21 [42867190, 42872410] * | 26 -0.3935
## [115] chr21 [42875404, 47678774] * | 2367 0.0223
## [116] chr22 [17423930, 49331012] * | 16920 0.024
## [117] chr23 [ 3157107, 154905589] * | 63256 0.0041
##
## ...
## <1002 more elements>
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
##
## $cnaseq
## GRangesList object of length 230:
## $tcga-ch-5741-01
## GRanges object with 88 ranges and 2 metadata columns:
## seqnames ranges strand | Num_Probes
## <Rle> <IRanges> <Rle> | <numeric>
## [1] chr1 [ 10208, 170998749] * | <NA>
## [2] chr1 [170998750, 171222653] * | <NA>
## [3] chr1 [171222654, 249240606] * | <NA>
## [4] chr2 [ 10001, 243189359] * | <NA>
## [5] chr3 [ 60174, 70581515] * | <NA>
## ... ... ... ... ... ...
## [84] chr23 [128529035, 128582748] * | <NA>
## [85] chr23 [128582749, 154930285] * | <NA>
## [86] chr24 [ 2649474, 6711879] * | <NA>
## [87] chr24 [ 6711880, 6813216] * | <NA>
## [88] chr24 [ 6813217, 28809960] * | <NA>
## Segment_Mean
## <numeric>
## [1] 0.0445519350183502
## [2] -0.719244299367207
## [3] 0.0416368338763181
## [4] 0.0400665363289484
## [5] 0.0357777450352256
## ... ...
## [84] -2.31507050198042
## [85] 0.0277920429090032
## [86] 0.0865462501641183
## [87] 0.952415386548219
## [88] 0.101459215941079
##
## ...
## <229 more elements>
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
##
## $cnacgh
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
##
## $methylation
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
##
## $mutation
## GRangesList object of length 420:
## $tcga-2a-a8vl-01
## GRanges object with 39 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr10 [ 29775061, 29775061] +
## [2] chr2 [ 30480447, 30480447] +
## [3] chr8 [ 39495171, 39495171] +
## [4] chr1 [158262073, 158262073] +
## [5] chr8 [ 41519413, 41519413] +
## ... ... ... ...
## [35] chr3 [138739004, 138739004] +
## [36] chr20 [ 33440316, 33440316] +
## [37] chr20 [ 33324502, 33324502] +
## [38] chr3 [195713385, 195713386] +
## [39] chr5 [180708776, 180708777] +
##
## ...
## <419 more elements>
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
##
## $mrnaarray
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
##
## $mirnaarray
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
##
## $rppa
## [1] "Error in extract(prad, choice, clinic = TRUE) : \n There is no data for that data type!\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in extract(prad, choice, clinic = TRUE): There is no data for that data type!>
eset.rnaseq <- extract(prad, "rnaseq2genenorm")
write.csv(exprs(eset.rnaseq), file="prad_rnaseq.csv")
write.csv(pData(eset.rnaseq), file="prad_clinical.csv")
saveRDS(eset.rnaseq, file="prad_eset.rds")
To load the eset again:
eset.rnaseq <- readRDS("prad_eset.rds")
hist(exprs(eset.rnaseq["KLK3", ]))
hist(log(exprs(eset.rnaseq["KLK3", ])))
summary(pData(eset.rnaseq))
psadat <- data.frame(psa=as.numeric(as.character(eset.rnaseq$patient.stage_event.psa.psa_value)),
klk3=t(exprs(eset.rnaseq["KLK3", ])))
psadat.complete <- psadat[complete.cases(psadat), ]
plot(KLK3 ~ psa, data=psadat.complete, xlab="clinical PSA", ylab="KLK3 tumor expression", log="xy")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 29 x values <= 0 omitted
## from logarithmic plot
fit <- lowess(x=psadat.complete$psa, y=psadat.complete$KLK3)
lines(fit, col="red", lw=3)
cor.test(x=psadat$KLK3, y=psadat$psa, method="spearman")
## Warning in cor.test.default(x = psadat$KLK3, y = psadat$psa, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: psadat$KLK3 and psadat$psa
## S = 21183000, p-value = 0.0005893
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1564584
Need to get the better race variable with complete data from Tiffany:
table(eset.rnaseq$race)
##
## asian black or african american
## 2 11
## white
## 189
boxplot(psadat$psa ~ eset.rnaseq$race, ylab="PSA")
Create and save to disk:
library(Biobase)
eset <- extract(prad, "miRNASeq_Gene")
saveRDS(eset, file="prad_mirna_eset.rds")
eset <- readRDS("prad_mirna_eset.rds")
mergeVecs <- function(x1, x2){
##x1 and x2 are vectors to be merged.
##x1 will be over-written by x2, and in case of conflict, x2 takes priority
if(!identical(length(x1), length(x2))) stop("x1 and x2 must have the same length")
if(!identical(class(x1), class(x2))) stop("x1 and x2 must have the same class")
x1[is.na(x1)] = x2[is.na(x1)]
mismatches <- which(x1 != x2)
if(length(mismatches) > 0){
warning(paste("There were mismatches in positions:", paste0(mismatches, collapse=", ")))
x1[mismatches] = x2[mismatches]
}
return(x1)
}
mergeVecs(x1=c(1, 2, 3, NA), x2=c(2, 2, NA, 4))
## Warning in mergeVecs(x1 = c(1, 2, 3, NA), x2 = c(2, 2, NA, 4)): There were
## mismatches in positions: 1
## [1] 2 2 3 4
eset$racevar = factor(mergeVecs(as.character(eset$patient.clinical_cqcf.race), as.character(eset$patient.race)))
eset$batch <- factor(eset$batch)
eset = eset[, eset$racevar %in% c("white", "black or african american")]
eset$gleason <- as.numeric(eset$gleason_score)
countData <- exprs(eset)
colData <- pData(eset)
dds <- DESeqDataSetFromMatrix(countData = countData,
colData = colData,
design = ~ gleason + batch_number + racevar)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## the design formula contains a numeric variable with integer values,
## specifying a model with increasing fold change for higher values.
## did you mean for this to be a factor? if so, first convert
## this variable to a factor using the factor() function
## factor levels were dropped which had no samples
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 847 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res.batch.gleason <- results(dds, contrast=c("racevar", "black or african american", "white"))
res.batch.gleason <- res.batch.gleason[order(res.batch.gleason$pvalue), ]
write.csv(res.batch.gleason, file="DESeq2_batch.gleason.csv")
(res.batch.gleason.sig <- res.batch.gleason[which(res.batch.gleason$padj < 0.05), ])
## log2 fold change (MAP): racevar black or african american vs white
## Wald test p-value: racevar black or african american vs white
## DataFrame with 20 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## hsa-mir-1304 3.376936 1.1532538 0.1272248 9.064691 1.249581e-19
## hsa-mir-767 2.082448 0.6921243 0.1159498 5.969172 2.384600e-09
## hsa-mir-105-2 1.494232 0.6365401 0.1150669 5.531912 3.167595e-08
## hsa-mir-105-1 1.366677 0.5715556 0.1146741 4.984175 6.222678e-07
## hsa-mir-3647 15.136337 0.4501900 0.1046114 4.303450 1.681586e-05
## ... ... ... ... ... ...
## hsa-mir-196a-1 16.3360090 0.3867707 0.1178358 3.282285 0.001029694
## hsa-mir-137 11.2896811 0.3737656 0.1158931 3.225090 0.001259330
## hsa-mir-346 0.4051674 0.4015446 0.1250443 3.211218 0.001321736
## hsa-mir-585 0.9648963 -0.4075468 0.1270859 -3.206862 0.001341914
## hsa-mir-1911 6.1340046 -0.4039616 0.1277449 -3.162252 0.001565538
## padj
## <numeric>
## hsa-mir-1304 7.035139e-17
## hsa-mir-767 6.712649e-07
## hsa-mir-105-2 5.944520e-06
## hsa-mir-105-1 8.758420e-05
## hsa-mir-3647 1.893466e-03
## ... ...
## hsa-mir-196a-1 0.03623235
## hsa-mir-137 0.03976303
## hsa-mir-346 0.03976303
## hsa-mir-585 0.03976303
## hsa-mir-1911 0.04406988
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 3.2.3
sig.mir <- rownames(countData)[rownames(countData) %in% rownames(res.batch.gleason.sig)]
nt <- normTransform(dds) # defaults to log2(x+1)
log2.norm.counts <- assay(nt)[sig.mir, ]
df <- as.data.frame(colData(dds))[, c("gleason", "racevar"), drop=FALSE]
raceorder <- order(df$racevar, colSums(log2.norm.counts))
df <- df[raceorder, , drop=FALSE]
log2.norm.counts <- log2.norm.counts[, raceorder]
stopifnot(identical(rownames(df), colnames(log2.norm.counts)))
pheatmap(log2.norm.counts, cluster_rows=TRUE, show_colnames=FALSE,
cluster_cols=TRUE,
clustering_distance_rows="correlation",
clustering_distance_cols="correlation",
scale="none", annotation_col=df["racevar"])
Boxplot of the top differentially expressed miRNA
topmir=rownames(res.batch.gleason.sig)[1]
boxplot(log2.norm.counts[topmir, ] ~ df$racevar,
xlab="Race", ylab="log2 expression", main=topmir)
stripchart(log2.norm.counts[topmir, ] ~ df$racevar, vertical=TRUE, method="jitter",
pch = 21, col = "maroon", bg = "bisque",
add = TRUE)
dds2 <- DESeqDataSetFromMatrix(countData = countData,
colData = colData,
design = ~ batch_number + racevar)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## factor levels were dropped which had no samples
dds2 <- DESeq(dds2)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 848 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res.batch <- results(dds2, contrast=c("racevar", "black or african american", "white"))
res.batch <- res.batch[order(res.batch$pvalue), ]
write.csv(res.batch, file="DESeq2_batch.csv")
(res.batch.sig <- res.batch[which(res.batch$padj < 0.05), ])
## log2 fold change (MAP): racevar black or african american vs white
## Wald test p-value: racevar black or african american vs white
## DataFrame with 16 rows and 6 columns
## baseMean log2FoldChange lfcSE stat
## <numeric> <numeric> <numeric> <numeric>
## hsa-mir-1304 3.376936 1.3193026 0.1413378 9.334391
## hsa-mir-767 2.082448 0.7612973 0.1422793 5.350726
## hsa-mir-105-2 1.494232 0.6719905 0.1407723 4.773600
## hsa-mir-3647 15.136337 0.4711682 0.1083900 4.346971
## hsa-mir-105-1 1.366677 0.6113479 0.1408223 4.341271
## ... ... ... ... ...
## hsa-mir-585 9.648963e-01 -0.5062484 0.15085353 -3.355893
## hsa-let-7c 2.985206e+04 0.2686979 0.08076004 3.327114
## hsa-mir-1298 7.507258e+00 -0.5004508 0.15058351 -3.323410
## hsa-mir-202 2.887445e+00 -0.4813462 0.14614334 -3.293658
## hsa-mir-1224 3.135288e+00 0.4869132 0.14809472 3.287850
## pvalue padj
## <numeric> <numeric>
## hsa-mir-1304 1.015740e-20 6.196011e-18
## hsa-mir-767 8.760207e-08 2.671863e-05
## hsa-mir-105-2 1.809615e-06 3.679550e-04
## hsa-mir-3647 1.380305e-05 1.728259e-03
## hsa-mir-105-1 1.416606e-05 1.728259e-03
## ... ... ...
## hsa-mir-585 0.0007910903 0.03848933
## hsa-let-7c 0.0008775036 0.03848933
## hsa-mir-1298 0.0008892402 0.03848933
## hsa-mir-202 0.0009889265 0.03848933
## hsa-mir-1224 0.0010095562 0.03848933
library(pheatmap)
sig.mir.batch <- rownames(countData)[rownames(countData) %in% rownames(res.batch.sig)]
nt <- normTransform(dds2) # defaults to log2(x+1)
log2.norm.counts <- assay(nt)[sig.mir.batch, ]
df <- as.data.frame(colData(dds2)[, c("gleason","racevar")])
raceorder <- order(df$racevar, colSums(log2.norm.counts))
df <- df[raceorder, , drop=FALSE]
log2.norm.counts <- log2.norm.counts[, raceorder]
stopifnot(identical(rownames(df), colnames(log2.norm.counts)))
pheatmap(log2.norm.counts, cluster_rows=TRUE,
show_colnames=FALSE, cluster_cols=FALSE,
clustering_distance_rows="correlation",
clustering_distance_cols="correlation",
scale="row", annotation_col=df["racevar"])
Load Tiffany’s file with the race variable and make its barcodes equivalent to eset:
racevar <- read.csv("racevariable.csv", stringsAsFactors = FALSE)
racevar[, 1] <- paste0(racevar[, 1], "-01")
racevar[, 1] <- gsub(".", "-", racevar[, 1], fixed=TRUE)
For interests’ sake, which patients are in one dataset but not the other?
racevar[, 1][!racevar[, 1] %in% sampleNames(eset)]
## [1] "tcga-2a-a8vt-01" "tcga-2a-a8vv-01" "tcga-2a-a8w1-01"
## [4] "tcga-ch-5751-01" "tcga-ej-5499-01" "tcga-ej-5502-01"
## [7] "tcga-ej-a8fo-01" "tcga-ej-ab20-01" "tcga-fc-7708-01"
## [10] "tcga-fc-a4ji-01" "tcga-g9-6347-01" "tcga-g9-6354-01"
## [13] "tcga-hc-7741-01" "tcga-hc-8212-01" "tcga-j4-8200-01"
## [16] "tcga-j4-aatz-01" "tcga-j9-a8ck-01" "tcga-kc-a4bv-01"
## [19] "tcga-kk-a6e0-01" "tcga-kk-a6e6-01" "tcga-kk-a6e7-01"
## [22] "tcga-kk-a7az-01" "tcga-kk-a8i9-01" "tcga-kk-a8id-01"
## [25] "tcga-m7-a71y-01" "tcga-qu-a6il-01" "tcga-v1-a8ww-01"
## [28] "tcga-v1-a9ol-01" "tcga-v1-a9zk-01" "tcga-vn-a88o-01"
## [31] "tcga-xa-a8jr-01" "tcga-xj-a9dx-01" "tcga-xk-aaja-01"
## [34] "tcga-xq-a8tb-01" "tcga-y6-a8tl-01" "tcga-yj-a8sw-01"
## [37] "tcga-zg-a9l1-01"
sampleNames(eset)[!sampleNames(eset) %in% racevar[, 1]]
## [1] "tcga-ch-5761-11" "tcga-ch-5767-11" "tcga-ch-5768-11"
## [4] "tcga-ch-5769-11" "tcga-ej-7115-11" "tcga-ej-7123-11"
## [7] "tcga-ej-7125-11" "tcga-ej-7314-11" "tcga-ej-7315-11"
## [10] "tcga-ej-7317-11" "tcga-ej-7321-11" "tcga-ej-7327-11"
## [13] "tcga-ej-7328-11" "tcga-ej-7330-11" "tcga-ej-7331-11"
## [16] "tcga-ej-7781-11" "tcga-ej-7782-11" "tcga-ej-7783-11"
## [19] "tcga-ej-7784-11" "tcga-ej-7785-11" "tcga-ej-7786-11"
## [22] "tcga-ej-7789-11" "tcga-ej-7792-11" "tcga-ej-7793-11"
## [25] "tcga-ej-7794-11" "tcga-ej-7797-11" "tcga-g9-6333-11"
## [28] "tcga-g9-6342-11" "tcga-g9-6348-11" "tcga-g9-6351-11"
## [31] "tcga-g9-6356-11" "tcga-g9-6362-11" "tcga-g9-6363-11"
## [34] "tcga-g9-6365-11" "tcga-g9-6384-11" "tcga-g9-6496-11"
## [37] "tcga-g9-6499-11" "tcga-hc-7211-11" "tcga-hc-7737-11"
## [40] "tcga-hc-7738-11" "tcga-hc-7740-11" "tcga-hc-7742-11"
## [43] "tcga-hc-7745-11" "tcga-hc-7747-11" "tcga-hc-7752-11"
## [46] "tcga-hc-7819-11" "tcga-hc-8258-11" "tcga-hc-8259-11"
## [49] "tcga-hc-8260-11" "tcga-hc-8262-11" "tcga-j4-a83j-11"
## [52] "tcga-v1-a9o5-06"
Keep only patients in both eset and racevar, and match up the rows:
eset2 <- eset[, sampleNames(eset) %in% racevar[, 1]]
racevar <- racevar[racevar[, 1] %in% sampleNames(eset2), ]
racevar <- racevar[match(sampleNames(eset2), racevar[, 1]), ]
all.equal(sampleNames(eset2), racevar[, 1])
## [1] TRUE
Add racevar[, 2] to eset2:
eset2$racevar <- racevar[, 2]
summary(eset2$racevar == eset2$race)
## Mode TRUE NA's
## logical 153 305