Load data

# Restore the saved objects from the two .RData files into the workspace.
# Presumably these provide `TCGA_BLCA` and `test_tcga_blca_se`, which are the
# objects inspected below -- TODO confirm.
# NOTE(review): both paths point into a user-specific Downloads folder, so they
# need to be adjusted before this can be re-run elsewhere.
load("~/Downloads/drive-download-20190403T144049Z-001/TCGA_BLCA.RData")
load("~/Downloads/drive-download-20190403T144049Z-001/test_tcga_blca_se.RData")


TCGA_BLCA
## class: RangedSummarizedExperiment 
## dim: 56830 433 
## metadata(1): data_release
## assays(1): HTSeq - Counts
## rownames(56830): ENSG00000000003 ENSG00000000005 ...
##   ENSG00000281912 ENSG00000281920
## rowData names(3): ensembl_gene_id external_gene_name
##   original_ensembl_gene_id
## colnames(433): TCGA-XF-A8HD-01A-11R-A36F-07
##   TCGA-DK-AA6W-01A-12R-A39I-07 ... TCGA-GD-A76B-01A-11R-A32O-07
##   TCGA-ZF-AA4T-01A-11R-A38B-07
## colData names(229): sample patient ... subtype_Fusion.in.TNFRSF21
##   subtype_Fusion.in.ASIP
test_tcga_blca_se
## class: RangedSummarizedExperiment 
## dim: 56925 433 
## metadata(1): data_release
## assays(1): HTSeq - Counts
## rownames(56925): ENSG00000000003 ENSG00000000005 ...
##   ENSG00000281912 ENSG00000281920
## rowData names(3): ensembl_gene_id external_gene_name
##   original_ensembl_gene_id
## colnames(433): TCGA-GD-A2C5-01A-12R-A180-07
##   TCGA-BT-A42F-01A-11R-A23W-07 ... TCGA-GU-A766-01A-11R-A32O-07
##   TCGA-GU-A42R-01A-11R-A23N-07
## colData names(230): sample patient ... subtype_Fusion.in.TNFRSF21
##   subtype_Fusion.in.ASIP
# The data release recorded in each object's metadata differs:
# TCGA_BLCA is the older one (12.0), test_tcga_blca_se the newer one (15.0).
metadata(TCGA_BLCA)
## $data_release
## [1] "Data Release 12.0 - June 13, 2018"
metadata(test_tcga_blca_se)
## $data_release
## [1] "Data Release 15.0 - February 20, 2019"

Compare objects

# Dimensions: both objects have 433 columns, but the number of rows differs.
dim(TCGA_BLCA)
## [1] 56830   433
dim(test_tcga_blca_se)
## [1] 56925   433
# The assay names are the same in both objects.
names(assays(TCGA_BLCA)) == names(assays(test_tcga_blca_se))
## [1] TRUE
# Tabulate how many rownames of TCGA_BLCA also occur in test_tcga_blca_se
# (FALSE = present only in TCGA_BLCA).
plyr::count(rownames(assay(TCGA_BLCA)) %in% rownames(assay(test_tcga_blca_se)))
##       x  freq
## 1 FALSE   114
## 2  TRUE 56716

Compare counts for the first 10 genes

# Sanity check: the first rowname is the same in both objects.
rownames(assay(TCGA_BLCA))[1] == rownames(assay(test_tcga_blca_se))[1]
## [1] TRUE
# Since the columns are in a different order in the two objects, the columns of
# TCGA_BLCA are reordered to follow test_tcga_blca_se before comparing.
# The count matrices and the column mapping are the same for every gene, so
# they are extracted once instead of on every iteration.
old_counts <- assay(TCGA_BLCA)
new_counts <- assay(test_tcga_blca_se)
col_order <- match(colnames(test_tcga_blca_se), colnames(TCGA_BLCA))
# Only compare rows whose rowname agrees at the same position in both objects.
same_gene <- which(rownames(old_counts)[1:10] == rownames(new_counts)[1:10])
for (i in same_gene) {
  # One TRUE/FALSE tally per gene: do the counts agree for every sample?
  print(plyr::count(old_counts[i, col_order] == new_counts[i, ]))
}
##      x freq
## 1 TRUE  433
##      x freq
## 1 TRUE  433
##      x freq
## 1 TRUE  433
##      x freq
## 1 TRUE  433
##      x freq
## 1 TRUE  433
##      x freq
## 1 TRUE  433
##      x freq
## 1 TRUE  433
##      x freq
## 1 TRUE  433
##      x freq
## 1 TRUE  433
##      x freq
## 1 TRUE  433

Compare genes

# All genes in the new data (test_tcga_blca_se) were already in the old one (TCGA_BLCA)
which(!rownames(test_tcga_blca_se) %in% rownames(TCGA_BLCA))
## integer(0)
# NOTE(review): the new object has more rows (56925) than the old one (56830)
# yet contains no rowname that is absent from the old one, which implies that
# some rownames are repeated in the new object -- TODO confirm with
# anyDuplicated(rownames(test_tcga_blca_se)).
# But some genes of the old data were removed from the new one
rownames(TCGA_BLCA)[which(!rownames(TCGA_BLCA) %in% rownames(test_tcga_blca_se))]
##   [1] "ENSG00000167945" "ENSG00000176075" "ENSG00000178193"
##   [4] "ENSG00000186354" "ENSG00000189393" "ENSG00000198384"
##   [7] "ENSG00000203386" "ENSG00000203819" "ENSG00000203886"
##  [10] "ENSG00000204187" "ENSG00000204717" "ENSG00000205662"
##  [13] "ENSG00000205663" "ENSG00000205664" "ENSG00000206483"
##  [16] "ENSG00000212290" "ENSG00000214215" "ENSG00000215365"
##  [19] "ENSG00000221972" "ENSG00000223414" "ENSG00000223535"
##  [22] "ENSG00000223929" "ENSG00000224710" "ENSG00000225541"
##  [25] "ENSG00000225704" "ENSG00000225860" "ENSG00000226434"
##  [28] "ENSG00000226738" "ENSG00000226785" "ENSG00000227603"
##  [31] "ENSG00000228614" "ENSG00000228651" "ENSG00000229477"
##  [34] "ENSG00000230439" "ENSG00000231200" "ENSG00000231429"
##  [37] "ENSG00000231435" "ENSG00000231656" "ENSG00000231842"
##  [40] "ENSG00000233265" "ENSG00000233895" "ENSG00000234165"
##  [43] "ENSG00000234449" "ENSG00000235475" "ENSG00000235825"
##  [46] "ENSG00000236082" "ENSG00000236392" "ENSG00000236660"
##  [49] "ENSG00000237122" "ENSG00000237534" "ENSG00000238033"
##  [52] "ENSG00000241737" "ENSG00000242349" "ENSG00000243012"
##  [55] "ENSG00000247732" "ENSG00000248478" "ENSG00000248686"
##  [58] "ENSG00000249734" "ENSG00000251085" "ENSG00000251628"
##  [61] "ENSG00000253248" "ENSG00000253371" "ENSG00000253426"
##  [64] "ENSG00000253839" "ENSG00000254018" "ENSG00000254235"
##  [67] "ENSG00000254267" "ENSG00000254281" "ENSG00000254597"
##  [70] "ENSG00000254671" "ENSG00000254869" "ENSG00000254981"
##  [73] "ENSG00000254998" "ENSG00000256304" "ENSG00000258297"
##  [76] "ENSG00000260114" "ENSG00000260940" "ENSG00000260977"
##  [79] "ENSG00000261013" "ENSG00000261176" "ENSG00000261643"
##  [82] "ENSG00000262251" "ENSG00000262888" "ENSG00000263553"
##  [85] "ENSG00000264242" "ENSG00000264263" "ENSG00000265114"
##  [88] "ENSG00000266411" "ENSG00000267747" "ENSG00000268036"
##  [91] "ENSG00000268439" "ENSG00000269916" "ENSG00000270028"
##  [94] "ENSG00000270058" "ENSG00000271020" "ENSG00000271324"
##  [97] "ENSG00000271762" "ENSG00000271840" "ENSG00000271941"
## [100] "ENSG00000272486" "ENSG00000272780" "ENSG00000272993"
## [103] "ENSG00000273071" "ENSG00000273237" "ENSG00000273478"
## [106] "ENSG00000274457" "ENSG00000275142" "ENSG00000275611"
## [109] "ENSG00000276911" "ENSG00000277420" "ENSG00000277539"
## [112] "ENSG00000278937" "ENSG00000279535" "ENSG00000281508"
# e.g. https://www.ensembl.org/Homo_sapiens/Gene/Idhistory?g=ENSG00000167945

Compare samples metadata

# Reorder the samples of TCGA_BLCA to the column order of test_tcga_blca_se
# once, so every field below is compared sample-by-sample without rebuilding
# the reordered object for each comparison.
old_matched <- TCGA_BLCA[, match(colnames(test_tcga_blca_se), colnames(TCGA_BLCA))]
# `==` yields NA when either value is missing, so the NA rows below count
# samples where the field is missing in at least one object, not mismatches.
plyr::count(test_tcga_blca_se$age_at_diagnosis == old_matched$age_at_diagnosis)
##      x freq
## 1 TRUE  432
## 2   NA    1
plyr::count(test_tcga_blca_se$vital_status == old_matched$vital_status)
##      x freq
## 1 TRUE  433
plyr::count(test_tcga_blca_se$days_to_death == old_matched$days_to_death)
##      x freq
## 1 TRUE  193
## 2   NA  240
plyr::count(test_tcga_blca_se$days_to_last_follow_up == old_matched$days_to_last_follow_up)
##      x freq
## 1 TRUE  314
## 2   NA  119