hg19 Legacy (GDC r1.0-3.0)

File Counts

df_legacy <- list(
    hm27=read_tsv(url('https://raw.githubusercontent.com/zwdzwd/GDC_DNA_methylation_QC/master/file_lists/20180410_GDC_manifest_legacy.tsv_HM27_lvl3.tsv')),
    hm450=read_tsv(url('https://raw.githubusercontent.com/zwdzwd/GDC_DNA_methylation_QC/master/file_lists/20180410_GDC_manifest_legacy.tsv_HM450_lvl3.tsv'))
)
sapply(df_legacy, nrow)
##  hm27 hm450 
##  2662  9813

Cases/patients (first 12 letters of TCGA barcode)

sapply(df_legacy, function(x) length(unique(substr(x$TCGA_barcode,1,12))))
##  hm27 hm450 
##  2280  8949

Data

dfdata_legacy <- list(
    hm27=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release1-3_legacy/07e1bc0f-b2d9-41cb-bf4d-1a9ac769654a/jhu-usc.edu_COAD.HumanMethylation27.4.lvl-3.TCGA-AA-A004-01A-01D-A00B-05.txt'),skip=1),
    hm450=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release1-3_legacy/3d4a102e-4a4c-48c3-ad51-8cc9f9b879e7/jhu-usc.edu_KIRC.HumanMethylation450.7.lvl-3.TCGA-DV-A4W0-05A-11D-A264-05.txt'),skip=1))

dfdata_legacy$hm27 %>% head
## # A tibble: 6 x 5
##   `Composite Element … Beta_value Gene_Symbol  Chromosome Genomic_Coordin…
##   <chr>                     <dbl> <chr>        <chr>                 <int>
## 1 cg00000292               0.738  ATP2A1       16                 28890100
## 2 cg00002426               0.274  SLMAP        3                  57743543
## 3 cg00003994               0.248  MEOX2        7                  15725862
## 4 cg00005847               0.605  HOXD3        2                 177029073
## 5 cg00006414              NA      ZNF425;ZNF3… 7                 148822837
## 6 cg00007981               0.0338 PANX1        11                 93862594
dfdata_legacy$hm450 %>% head
## # A tibble: 6 x 5
##   `Composite Element … Beta_value Gene_Symbol Chromosome Genomic_Coordina…
##   <chr>                     <dbl> <chr>       <chr>                  <int>
## 1 cg00000029                0.659 RBL2        16                  53468112
## 2 cg00000108               NA     C3orf35     3                   37459206
## 3 cg00000109               NA     FNDC3B      3                  171916037
## 4 cg00000165                0.125 <NA>        1                   91194674
## 5 cg00000236                0.900 VDAC3       8                   42263294
## 6 cg00000289                0.713 ACTN1       14                  69341139

hg38 LiftOver (GDC r4.0-12.0)

File Counts

df_R4 <- list(
    hm27=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/file_lists/20180410_GDC_manifest_liftOver_workflow.tsv_HM27_lvl3.tsv')),
    hm450=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/file_lists/20180410_GDC_manifest_liftOver_workflow.tsv_HM450_lvl3.tsv'))
)
sapply(df_R4, nrow)
##  hm27 hm450 
##  2603  9756

Cases/patients (first 12 letters of TCGA barcode)

sapply(df_R4, function(x) length(unique(substr(x$TCGA_barcode,1,12))))
##  hm27 hm450 
##  2223  8893

Data

dfdata_R4 <- list(
    hm27=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release4_plus_hg38/df4a53cd-6d07-41be-a7b7-c381a658e7ae/jhu-usc.edu_COAD.HumanMethylation27.4.lvl-3.TCGA-AA-A004-01A-01D-A00B-05.gdc_hg38.txt')),
    hm450=read_tsv(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release4_plus_hg38/ef164350-d911-4a3d-a53a-b2efed5e682c/jhu-usc.edu_KIRC.HumanMethylation450.7.lvl-3.TCGA-DV-A4W0-05A-11D-A264-05.gdc_hg38.txt.gz'))))

dfdata_R4$hm27 %>% head
## # A tibble: 6 x 11
##   `Composite Elem… Beta_value Chromosome  Start    End Gene_Symbol
##   <chr>                 <dbl> <chr>       <int>  <int> <chr>      
## 1 cg00000292           0.738  chr16      2.89e7 2.89e7 ATP2A1;ATP…
## 2 cg00002426           0.274  chr3       5.78e7 5.78e7 SLMAP;SLMA…
## 3 cg00003994           0.248  chr7       1.57e7 1.57e7 MEOX2      
## 4 cg00005847           0.605  chr2       1.76e8 1.76e8 AC009336.1…
## 5 cg00006414          NA      chr7       1.49e8 1.49e8 RN7SL521P;…
## 6 cg00007981           0.0338 chr11      9.41e7 9.41e7 PANX1;PANX1
## # ... with 5 more variables: Gene_Type <chr>, Transcript_ID <chr>,
## #   Position_to_TSS <chr>, CGI_Coordinate <chr>, Feature_Type <chr>
dfdata_R4$hm450 %>% head
## # A tibble: 6 x 11
##   `Composite Elem… Beta_value Chromosome  Start    End Gene_Symbol
##   <chr>                 <dbl> <chr>       <int>  <int> <chr>      
## 1 cg00000029            0.659 chr16      5.34e7 5.34e7 RBL2;RBL2;…
## 2 cg00000108           NA     chr3       3.74e7 3.74e7 C3orf35;C3…
## 3 cg00000109           NA     chr3       1.72e8 1.72e8 FNDC3B;FND…
## 4 cg00000165            0.125 chr1       9.07e7 9.07e7 .          
## 5 cg00000236            0.900 chr8       4.24e7 4.24e7 VDAC3      
## 6 cg00000289            0.713 chr14      6.89e7 6.89e7 ACTN1;ACTN…
## # ... with 5 more variables: Gene_Type <chr>, Transcript_ID <chr>,
## #   Position_to_TSS <chr>, CGI_Coordinate <chr>, Feature_Type <chr>

Compare hg19 and hg38

Measurements

Measurements are the same, only the coordinates and gene annotation have changed.

sapply(dfdata_R4, nrow)
##   hm27  hm450 
##  27578 485577
sapply(dfdata_legacy, nrow)
##   hm27  hm450 
##  27578 485577
all(dfdata_legacy$hm450$Beta_value == dfdata_R4$hm450$Beta_value, na.rm = TRUE)
## [1] TRUE

Coordinates

Unmapped probes on hg38 are annotated with “*“.

Fraction of probes unmapped in hg38

HM27

tb <- table(dfdata_R4$hm27$Chromosome)
kable(tb)
Var1 Freq
* 544
chr1 2849
chr10 1035
chr11 1716
chr12 1517
chr13 485
chr14 822
chr15 803
chr16 1163
chr17 1538
chr18 390
chr19 1880
chr2 1677
chr20 879
chr21 296
chr22 626
chr3 1514
chr4 1013
chr5 1140
chr6 1465
chr7 1227
chr8 921
chr9 1050
chrX 1024
chrY 4
(1-unname(tb['*'])/sum(tb)) * 100 # percentage of mapped probes
## [1] 98.02741

HM450

tb <- table(dfdata_R4$hm450$Chromosome)
kable(tb)
Var1 Freq
* 5120
chr1 46242
chr10 24109
chr11 28558
chr12 24363
chr13 12131
chr14 14972
chr15 15093
chr16 21767
chr17 27654
chr18 5862
chr19 25344
chr2 34496
chr20 10341
chr21 3868
chr22 8435
chr3 24967
chr4 20203
chr5 24117
chr6 36299
chr7 29651
chr8 20670
chr9 9791
chrX 11127
chrY 397
(1-unname(tb['*'])/sum(tb)) * 100 # percentage of mapped probes
## [1] 98.94558

Fraction of unmapped

kable(sapply(dfdata_R4, function(x) table(x$Chromosome=='*')))
hm27 hm450
FALSE 27034 480457
TRUE 544 5120
# legacy doesn't have any unmapped probes
sapply(dfdata_legacy, function(x) table(x$Chromosome=='*'))
##  hm27.FALSE hm450.FALSE 
##       25978      485512

Probe Mapping by Chromosome

Most probes are mapped to the same chromosome.

kable(table(dfdata_legacy$hm450$Chromosome, dfdata_R4$hm450$Chromosome))
* chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2 chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY
1 608 46233 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 8 0 0
10 281 0 24107 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 238 0 0 28556 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12 180 0 0 0 24359 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13 154 0 0 0 0 12131 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
14 106 0 0 0 0 0 14972 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
15 168 0 0 0 0 0 0 15091 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
16 203 0 0 0 0 0 0 0 21766 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
17 225 0 0 0 0 0 0 0 0 27654 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
18 61 0 0 0 0 0 0 0 0 0 5861 0 0 0 0 0 0 0 0 0 0 0 0 0 0
19 177 0 0 0 0 0 0 0 0 0 0 25344 0 0 0 0 0 0 0 0 0 0 0 0 0
2 319 0 0 0 0 0 0 0 0 0 0 0 34491 0 0 0 0 0 0 0 0 0 0 0 0
20 40 0 0 0 0 0 0 0 0 0 0 0 0 10339 0 0 0 0 0 0 0 0 0 0 0
21 377 0 0 0 0 0 0 0 0 0 0 0 0 0 3866 0 0 0 0 0 0 0 0 0 0
22 127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8422 0 1 0 0 0 2 0 0 0
3 196 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24963 0 0 0 0 0 0 0 0
4 266 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 20198 0 0 0 0 0 0 0
5 214 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24113 0 0 0 0 0 0
6 315 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 36296 0 0 0 0 0
7 365 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 29651 0 0 0 0
8 288 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 20662 0 0 0
9 80 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 9780 0 0
X 113 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11119 0
Y 19 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 397
kable(table(dfdata_legacy$hm27$Chromosome, dfdata_R4$hm27$Chromosome))
* chr1 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr2 chr20 chr21 chr22 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY
1 5 2745 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 2 0 1013 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 2 0 0 1652 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 1453 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13 1 0 0 0 0 462 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
14 0 0 0 0 0 0 797 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 0 0 772 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0 1128 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
17 5 0 0 0 0 0 0 0 0 1477 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
18 0 0 0 0 0 0 0 0 0 0 373 0 0 0 0 0 0 0 0 0 0 0 0 0 0
19 3 0 0 0 0 0 0 0 0 0 0 1821 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 0 0 1594 0 0 0 0 0 0 0 0 0 0 0 0
20 1 0 0 0 0 0 0 0 0 0 0 0 0 845 0 0 0 0 0 0 0 0 0 0 0
21 15 0 0 0 0 0 0 0 0 0 0 0 0 0 278 0 0 0 0 0 0 0 0 0 0
22 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 611 0 0 0 0 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1450 0 0 0 0 0 0 0 0
4 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 947 0 0 0 0 0 0 0
5 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1104 0 0 0 0 0 0
6 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1417 0 0 0 0 0
7 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1169 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 884 0 0 0
9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 978 0 0
X 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 959 0
Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3

CpGs Targeted by Multiple Probes

HM450

dfdata_R4$hm450 %>% filter(Chromosome!='*') %>% arrange(Chromosome, Start) -> dfdata_R4_bycoord
dup_index <- which(dfdata_R4_bycoord$Start[-1] == dfdata_R4_bycoord$Start[-nrow(dfdata_R4_bycoord)])
dup_probe_index <- sort(unique(c(dup_index, dup_index+1)))
dfdata_R4_bycoord[dup_probe_index,] %>% dplyr::select(Chromosome, Start, End, 'Composite Element REF')
## # A tibble: 42 x 4
##    Chromosome     Start       End `Composite Element REF`
##    <chr>          <int>     <int> <chr>                  
##  1 chr1       120851001 120851002 cg09137068             
##  2 chr1       120851001 120851002 cg26329816             
##  3 chr1       148152231 148152232 cg10127479             
##  4 chr1       148152231 148152232 cg20294457             
##  5 chr10       17844509  17844510 cg07386868             
##  6 chr10       17844509  17844510 cg19201533             
##  7 chr10       45972791  45972792 cg10432423             
##  8 chr10       45972791  45972792 cg20477318             
##  9 chr10       46397210  46397211 cg15247093             
## 10 chr10       46397210  46397211 cg17243957             
## # ... with 32 more rows

Under GRCh38, 21 CpGs become interrogated with multiple probes (42 probes). Note this is after excluding MASK_mapping. The full list is available at here.

HM27

No such CpGs exist for HM27

dfdata_R4$hm27 %>% filter(Chromosome!='*') %>% arrange(Chromosome, Start) -> dfdata_R4_bycoord
which(dfdata_R4_bycoord$Start[-1] == dfdata_R4_bycoord$Start[-nrow(dfdata_R4_bycoord)])
## integer(0)

Probe Mapping by Quality

HM450.hg38.manifest <- readRDS('~/Downloads/tmp/20180808/hm450.hg38.manifest.rds')
HM450.hg19.manifest <- readRDS('~/Downloads/tmp/20180808/hm450.hg19.manifest.rds')

All probes

da <- HM450.hg19.manifest
db <- HM450.hg38.manifest[names(da)]
a <- cut(da[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
tble['(59,60]','(59,60]'] / sum(tble) # Fraction of unique mapping
## [1] 0.8614918

Type-I

a <- cut(da[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
kable(tble, caption='Mapping quality cross comparison type cg Type-I A')
Mapping quality cross comparison type cg Type-I A
[0,10] (10,40] (40,59] (59,60]
[0,10] 7216 15 4 4
(10,40] 114 6949 5 4
(40,59] 15 28 4114 9
(59,60] 194 45 52 116733
a <- cut(da[da$designType=='I']$mapQ_B, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='I']$mapQ_B, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
kable(tble, caption='Mapping quality cross comparison type cg Type-I B')
Mapping quality cross comparison type cg Type-I B
[0,10] (10,40] (40,59] (59,60]
[0,10] 5693 20 2 9
(10,40] 115 6389 4 7
(40,59] 14 25 2709 7
(59,60] 204 55 36 120212

Type-II

a <- cut(da[da$designType=='II']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='II']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
kable(tble, caption='Mapping quality cross comparison type cg type II')
Mapping quality cross comparison type cg type II
[0,10] (10,40] (40,59] (59,60]
[0,10] 11295 33 4 32
(10,40] 249 14548 9 8
(40,59] 61 34 10003 15
(59,60] 520 84 68 313113

Probe Mapping by Mismatch

A-allele

tble <- table(da[da$probeType!='rs']$NM_A, db[db$probeType!='rs']$NM_A)
kable(tble)
0 1 2 3 4 5 6 8 12
0 484743 629 70 14 17 2 5 1 1
tble['0','0'] / sum(tble) # Fraction of perfect mapping
## [1] 0.9984778
tble_decoy <- table(da[da$probeType!='rs']$wDecoy_NM_A, db[db$probeType!='rs']$wDecoy_NM_A)
kable(tble_decoy)
0 1 2 3 4 5 6 8 12
0 484766 629 68 16 15 2 5 2 1
tble_decoy['0','0'] / sum(tble_decoy) # Fraction of perfect mapping with decoy
## [1] 0.9984799
tble_decoy['0','0'] - tble['0','0'] # Number of probes moved to decoy
## [1] 23

B-allele

tble <- table(da[da$probeType!='rs']$NM_B, db[db$probeType!='rs']$NM_B)
kable(tble)
0 1 2 3 4 5 6 7
0 135235 191 22 6 5 1 2 1
tble['0','0'] / sum(tble) # Fraction of perfect mapping
## [1] 0.9983169
tble_decoy <- table(da[da$probeType!='rs']$wDecoy_NM_B, db[db$probeType!='rs']$wDecoy_NM_B)
kable(tble_decoy)
0 1 2 3 4 5 6 8
0 135246 191 21 4 5 1 3 1
tble_decoy['0','0'] / sum(tble_decoy) # Fraction of perfect mapping with decoy
## [1] 0.9983318
tble_decoy['0','0'] - tble['0','0'] # Number of probes moved to decoy
## [1] 11

SNP probes

One probe switched the REF and ALT.

kable(table(da[da$designType=='I' & da$probeType=='rs']$NM_A, db[db$designType=='I' & db$probeType=='rs']$NM_A))
0 1
0 9 0
1 1 15
kable(table(da[da$designType=='I' & da$probeType=='rs']$NM_B, db[db$designType=='I' & db$probeType=='rs']$NM_B))
0 1
0 15 1
1 0 9
kable(table(da[da$designType=='II' & da$probeType=='rs']$NM_A, db[db$designType=='II' & db$probeType=='rs']$NM_A))
0
0 40

Gene Association

HM450

all(dfdata_R4$hm450$`Composite Element REF` == dfdata_legacy$hm450$`Composite Element REF`)
## [1] TRUE
dfdata_R4$hm450$geneUniq <- sapply(strsplit(dfdata_R4$hm450$Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))
dfdata_R4$hm450$geneType <- sapply(strsplit(dfdata_R4$hm450$Gene_Type, ';'), function(x) paste0(unique(x), collapse = ';'))
df <- dfdata_R4$hm450 %>% 
    dplyr::rename(probe='Composite Element REF') %>% 
    dplyr::select(probe, geneUniq, geneType) %>% 
    cbind(dfdata_legacy$hm450) %>% 
    mutate(geneUniq=replace(geneUniq, geneUniq=='.',NA)) %>% 
    dplyr::rename(gene_hg38=geneUniq) %>% 
    mutate(gene_hg19 = sapply(strsplit(Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))) %>% 
    mutate(gene_hg19 = replace(gene_hg19, gene_hg19=='',NA))

More probes are associated with gene in GRCh38.

tb <- table(!is.na(df$gene_hg38), !is.na(df$gene_hg19))
rownames(tb) <- c('Not Annotated in hg38','Annotated in hg38')
colnames(tb) <- c('Not Annotated in hg19','Annotated in hg19')
kable(tb)
Not Annotated in hg19 Annotated in hg19
Not Annotated in hg38 72026 3291
Annotated in hg38 47691 362569
kable(tb/sum(tb))
Not Annotated in hg19 Annotated in hg19
Not Annotated in hg38 0.1483307 0.0067775
Annotated in hg38 0.0982151 0.7466766

365860 in 485577 probes are associated with gene in hg19. 410260 in 485577 probes are associated with gene in hg38. 255082 probes are annotated with exactly the same genes.

df$hg19_in_hg38 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg19'],';')[[1]], function(xx) grepl(xx,x['gene_hg38']))))
df$hg19_in_hg38 <- ifelse(is.na(df$gene_hg19), TRUE, df$hg19_in_hg38)
df$hg38_in_hg19 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg38'],';')[[1]], function(xx) grepl(xx,x['gene_hg19']))))
df$hg38_in_hg19 <- ifelse(is.na(df$gene_hg38), TRUE, df$hg38_in_hg19)
df_identical <- (df %>% filter((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19)))
df_different <- (df %>% filter(!((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19))))

327145 (67.372425%) probes probes have identical gene annotations. 118767 (24.4589427%) probes annotated in hg19 is not included in genes annotated hg38. Only 39665 (8.1686324%) probes lost gene annotation from hg19 in hg38. 17952 probes are either with ‘orf’ or ‘LOC’ names, indicating an update of gene annotation from resolving these identifiers. 153082 (31.525793%) probes gain new annotation in hg38 compared to hg19. A substantial parts are due to the addition of non-coding genes.

df_different %>% filter(!hg38_in_hg19) %>% summarise(
    antisense=sum(grepl('antisense', geneType))/n()*100,
    lincRNA=sum(grepl('lincRNA', geneType))/n()*100,
    miRNA=sum(grepl('miRNA', geneType))/n()*100,
    processed_transcript=sum(grepl('processed_transcript', geneType))/n()*100,
    pseudogene=sum(grepl('pseudogene', geneType))/n()*100)
##   antisense  lincRNA    miRNA processed_transcript pseudogene
## 1  27.47874 18.02629 2.837042             5.000588   9.666715
df_different %>% head
##        probe      gene_hg38             geneType Composite Element REF
## 1 cg00000363          PGBD5       protein_coding            cg00000363
## 2 cg00000884           <NA>                    .            cg00000884
## 3 cg00001099 ATP6V0D2;PSKH2       protein_coding            cg00001099
## 4 cg00001249          LRRC9       protein_coding            cg00001249
## 5 cg00001261  LA16c-306E5.2       protein_coding            cg00001261
## 6 cg00001446 ELOVL1;MIR6734 protein_coding;miRNA            cg00001446
##   Beta_value Gene_Symbol Chromosome Genomic_Coordinate gene_hg19
## 1  0.1204661        <NA>          1          230560793      <NA>
## 2         NA        TLR2          4          154609857      TLR2
## 3         NA       PSKH2          8           87081553     PSKH2
## 4  0.8729648        <NA>         14           60389786      <NA>
## 5  0.2489936        <NA>         16            3463964      <NA>
## 6  0.8604751      ELOVL1          1           43831041    ELOVL1
##   hg19_in_hg38 hg38_in_hg19
## 1         TRUE        FALSE
## 2        FALSE         TRUE
## 3         TRUE        FALSE
## 4         TRUE        FALSE
## 5         TRUE        FALSE
## 6         TRUE        FALSE

The complete gene discordance table can be downloaded at https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM450_probes_with_different_gene_annotation_column.tsv

df %>% mutate(same=ifelse(hg19_in_hg38&hg38_in_hg19, 'CONCORDANT', ifelse(hg19_in_hg38, 'AUGMENTED', 'DISCORDANT')), geneType=sapply(strsplit(geneType,';'), function(x) paste0(sort(x), collapse=';'))) %>% mutate(geneType=replace(geneType, geneType=='.', 'NA')) -> df1
df1 %>% count(same) %>% mutate(prop=prop.table(n))
## # A tibble: 3 x 3
##   same            n   prop
##   <chr>       <int>  <dbl>
## 1 AUGMENTED  118767 0.245 
## 2 CONCORDANT 327145 0.674 
## 3 DISCORDANT  39665 0.0817
df1 %>% group_by(same, geneType) %>% summarise(freq=n()) %>% ungroup() %>% mutate(geneType = sprintf('%s (%1.2f%%)', sub(';',' ',geneType), freq/sum(freq)*100)) -> df2

# vocabulary_hg19 <- do.call(c,strsplit(df$gene_hg19,';'))
# pdf('~/gallery/20180819_gdc_qc_treemap_hm450.pdf')
treemap(df2, index=c('same','geneType'),vSize='freq',palette='Blues')

HM27

all(dfdata_R4$hm27$`Composite Element REF` == dfdata_legacy$hm27$`Composite Element REF`)
## [1] TRUE
dfdata_R4$hm27$geneUniq <- sapply(strsplit(dfdata_R4$hm27$Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))
dfdata_R4$hm27$geneType <- sapply(strsplit(dfdata_R4$hm27$Gene_Type, ';'), function(x) paste0(unique(x), collapse = ';'))
df <- dfdata_R4$hm27 %>% 
    dplyr::rename(probe='Composite Element REF') %>% 
    dplyr::select(probe, geneUniq, geneType) %>% 
    cbind(dfdata_legacy$hm27) %>% 
    mutate(geneUniq=replace(geneUniq, geneUniq=='.',NA)) %>% 
    dplyr::rename(gene_hg38=geneUniq) %>% 
    mutate(gene_hg19 = sapply(strsplit(Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))) %>% 
    mutate(gene_hg19 = replace(gene_hg19, gene_hg19=='',NA))

More probes are associated with gene in GRCh38.

tb <- table(!is.na(df$gene_hg38), !is.na(df$gene_hg19))
rownames(tb) <- c('Not Annotated in hg38','Annotated in hg38')
colnames(tb) <- c('Not Annotated in hg19','Annotated in hg19')
kable(tb)
Not Annotated in hg19 Annotated in hg19
Not Annotated in hg38 562 61
Annotated in hg38 1254 25701
kable(tb/sum(tb))
Not Annotated in hg19 Annotated in hg19
Not Annotated in hg38 0.0203786 0.0022119
Annotated in hg38 0.0454710 0.9319385

25762 in 27578 probes are associated with gene in hg19. 26955 in 27578 probes are associated with gene in hg38. 17174 probes are annotated with exactly the same genes.

df$hg19_in_hg38 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg19'],';')[[1]], function(xx) grepl(xx,x['gene_hg38']))))
df$hg19_in_hg38 <- ifelse(is.na(df$gene_hg19), TRUE, df$hg19_in_hg38)
df$hg38_in_hg19 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg38'],';')[[1]], function(xx) grepl(xx,x['gene_hg19']))))
df$hg38_in_hg19 <- ifelse(is.na(df$gene_hg38), TRUE, df$hg38_in_hg19)
df_identical <- (df %>% filter((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19)))
df_different <- (df %>% filter(!((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19))))

17740 (64.3266372%) probes have identical gene annotations. 7586 (27.5074335%) probes annotated in hg19 is not included in genes annotated hg38. Only 2252 (8.1659294%) probes lost gene annotation from hg19 in hg38. 1099 probes are either with ‘orf’ or ‘LOC’ names, indicating an update of gene annotation from resolving these identifiers. 9662 (35.035173%) probes gain new annotation in hg38 compared to hg19. A substantial parts are due to the addition of non-coding genes.

df_different %>% filter(!hg38_in_hg19) %>% summarise(
    antisense=sum(grepl('antisense', geneType))/n()*100,
    lincRNA=sum(grepl('lincRNA', geneType))/n()*100,
    miRNA=sum(grepl('miRNA', geneType))/n()*100,
    processed_transcript=sum(grepl('processed_transcript', geneType))/n()*100,
    pseudogene=sum(grepl('pseudogene', geneType))/n()*100)
##   antisense  lincRNA    miRNA processed_transcript pseudogene
## 1  31.63941 10.29807 3.405092              4.72987   3.415442
df_different %>% head
##        probe                             gene_hg38
## 1 cg00005847        AC009336.19;HOXD3;RP11-387A1.5
## 2 cg00006414               RN7SL521P;ZNF398;ZNF425
## 3 cg00008493                           COX8C;UNC79
## 4 cg00010193                    AC092535.3;TMED11P
## 5 cg00012792 BLOC1S5;BLOC1S5-TXNDC5;EEF1E1-BLOC1S5
## 6 cg00013618                      IGLVI-70;PRAMENP
##                                             geneType Composite Element REF
## 1                           protein_coding;antisense            cg00005847
## 2                            misc_RNA;protein_coding            cg00006414
## 3                                     protein_coding            cg00008493
## 4                       antisense;unitary_pseudogene            cg00010193
## 5                                     protein_coding            cg00012792
## 6 IG_V_pseudogene;transcribed_unprocessed_pseudogene            cg00013618
##   Beta_value    Gene_Symbol Chromosome Genomic_Coordinate      gene_hg19
## 1 0.60548056          HOXD3          2          177029073          HOXD3
## 2         NA  ZNF425;ZNF398          7          148822837  ZNF398;ZNF425
## 3 0.98931512 COX8C;KIAA1409         14           93813777 COX8C;KIAA1409
## 4 0.60833069           <NA>       <NA>                  0           <NA>
## 5 0.01845706          MUTED          6            8064493          MUTED
## 6 0.53753164           <NA>         22           22380839           <NA>
##   hg19_in_hg38 hg38_in_hg19
## 1         TRUE        FALSE
## 2         TRUE        FALSE
## 3        FALSE        FALSE
## 4         TRUE        FALSE
## 5        FALSE        FALSE
## 6         TRUE        FALSE

The complete gene discordance table can be downloaded at https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM27_probes_with_different_gene_annotation_column.tsv

df %>% mutate(same=ifelse(hg19_in_hg38&hg38_in_hg19, 'CONCORDANT', ifelse(hg19_in_hg38, 'AUGMENTED', 'DISCORDANT')), geneType=sapply(strsplit(geneType,';'), function(x) paste0(sort(x), collapse=';'))) %>% mutate(geneType=replace(geneType, geneType=='.', 'NA')) -> df1
df1 %>% count(same) %>% mutate(prop=prop.table(n))
## # A tibble: 3 x 3
##   same           n   prop
##   <chr>      <int>  <dbl>
## 1 AUGMENTED   7586 0.275 
## 2 CONCORDANT 17740 0.643 
## 3 DISCORDANT  2252 0.0817
df1 %>% group_by(same, geneType) %>% summarise(freq=n()) %>% ungroup() %>% mutate(geneType = sprintf('%s (%1.2f%%)', sub(';',' ',geneType), freq/sum(freq)*100)) -> df2

# vocabulary_hg19 <- do.call(c,strsplit(df$gene_hg19,';'))
# pdf('~/gallery/20180819_gdc_qc_treemap_hm450.pdf')
treemap(df2, index=c('same','geneType'),vSize='freq',palette='Blues')