hg19 Legacy (GDC r1.0-3.0)

File Counts

df_legacy <- list(
    hm27=read_tsv(url('https://raw.githubusercontent.com/zwdzwd/GDC_DNA_methylation_QC/master/file_lists/20180410_GDC_manifest_legacy.tsv_HM27_lvl3.tsv')),
    hm450=read_tsv(url('https://raw.githubusercontent.com/zwdzwd/GDC_DNA_methylation_QC/master/file_lists/20180410_GDC_manifest_legacy.tsv_HM450_lvl3.tsv'))
)
sapply(df_legacy, nrow)

##  hm27 hm450 
##  2662  9813

Cases/patients (first 12 letters of TCGA barcode)

sapply(df_legacy, function(x) length(unique(substr(x$TCGA_barcode,1,12))))

##  hm27 hm450 
##  2280  8949

Data

dfdata_legacy <- list(
    hm27=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release1-3_legacy/07e1bc0f-b2d9-41cb-bf4d-1a9ac769654a/jhu-usc.edu_COAD.HumanMethylation27.4.lvl-3.TCGA-AA-A004-01A-01D-A00B-05.txt'),skip=1),
    hm450=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release1-3_legacy/3d4a102e-4a4c-48c3-ad51-8cc9f9b879e7/jhu-usc.edu_KIRC.HumanMethylation450.7.lvl-3.TCGA-DV-A4W0-05A-11D-A264-05.txt'),skip=1))

dfdata_legacy$hm27 %>% head

## # A tibble: 6 x 5
##   `Composite Element … Beta_value Gene_Symbol  Chromosome Genomic_Coordin…
##   <chr>                     <dbl> <chr>        <chr>                 <int>
## 1 cg00000292               0.738  ATP2A1       16                 28890100
## 2 cg00002426               0.274  SLMAP        3                  57743543
## 3 cg00003994               0.248  MEOX2        7                  15725862
## 4 cg00005847               0.605  HOXD3        2                 177029073
## 5 cg00006414              NA      ZNF425;ZNF3… 7                 148822837
## 6 cg00007981               0.0338 PANX1        11                 93862594

dfdata_legacy$hm450 %>% head

## # A tibble: 6 x 5
##   `Composite Element … Beta_value Gene_Symbol Chromosome Genomic_Coordina…
##   <chr>                     <dbl> <chr>       <chr>                  <int>
## 1 cg00000029                0.659 RBL2        16                  53468112
## 2 cg00000108               NA     C3orf35     3                   37459206
## 3 cg00000109               NA     FNDC3B      3                  171916037
## 4 cg00000165                0.125 <NA>        1                   91194674
## 5 cg00000236                0.900 VDAC3       8                   42263294
## 6 cg00000289                0.713 ACTN1       14                  69341139

hg38 LiftOver (GDC r4.0-12.0)

File Counts

df_R4 <- list(
    hm27=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/file_lists/20180410_GDC_manifest_liftOver_workflow.tsv_HM27_lvl3.tsv')),
    hm450=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/file_lists/20180410_GDC_manifest_liftOver_workflow.tsv_HM450_lvl3.tsv'))
)
sapply(df_R4, nrow)

##  hm27 hm450 
##  2603  9756

Cases/patients (first 12 letters of TCGA barcode)

sapply(df_R4, function(x) length(unique(substr(x$TCGA_barcode,1,12))))

##  hm27 hm450 
##  2223  8893

Data

dfdata_R4 <- list(
    hm27=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release4_plus_hg38/df4a53cd-6d07-41be-a7b7-c381a658e7ae/jhu-usc.edu_COAD.HumanMethylation27.4.lvl-3.TCGA-AA-A004-01A-01D-A00B-05.gdc_hg38.txt')),
    hm450=read_tsv(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release4_plus_hg38/ef164350-d911-4a3d-a53a-b2efed5e682c/jhu-usc.edu_KIRC.HumanMethylation450.7.lvl-3.TCGA-DV-A4W0-05A-11D-A264-05.gdc_hg38.txt.gz'))))

dfdata_R4$hm27 %>% head

## # A tibble: 6 x 11
##   `Composite Elem… Beta_value Chromosome  Start    End Gene_Symbol
##   <chr>                 <dbl> <chr>       <int>  <int> <chr>      
## 1 cg00000292           0.738  chr16      2.89e7 2.89e7 ATP2A1;ATP…
## 2 cg00002426           0.274  chr3       5.78e7 5.78e7 SLMAP;SLMA…
## 3 cg00003994           0.248  chr7       1.57e7 1.57e7 MEOX2      
## 4 cg00005847           0.605  chr2       1.76e8 1.76e8 AC009336.1…
## 5 cg00006414          NA      chr7       1.49e8 1.49e8 RN7SL521P;…
## 6 cg00007981           0.0338 chr11      9.41e7 9.41e7 PANX1;PANX1
## # ... with 5 more variables: Gene_Type <chr>, Transcript_ID <chr>,
## #   Position_to_TSS <chr>, CGI_Coordinate <chr>, Feature_Type <chr>

dfdata_R4$hm450 %>% head

## # A tibble: 6 x 11
##   `Composite Elem… Beta_value Chromosome  Start    End Gene_Symbol
##   <chr>                 <dbl> <chr>       <int>  <int> <chr>      
## 1 cg00000029            0.659 chr16      5.34e7 5.34e7 RBL2;RBL2;…
## 2 cg00000108           NA     chr3       3.74e7 3.74e7 C3orf35;C3…
## 3 cg00000109           NA     chr3       1.72e8 1.72e8 FNDC3B;FND…
## 4 cg00000165            0.125 chr1       9.07e7 9.07e7 .          
## 5 cg00000236            0.900 chr8       4.24e7 4.24e7 VDAC3      
## 6 cg00000289            0.713 chr14      6.89e7 6.89e7 ACTN1;ACTN…
## # ... with 5 more variables: Gene_Type <chr>, Transcript_ID <chr>,
## #   Position_to_TSS <chr>, CGI_Coordinate <chr>, Feature_Type <chr>

Compare hg19 and hg38

Measurements

Measurements are the same, only the coordinates and gene annotation have changed.

sapply(dfdata_R4, nrow)

##   hm27  hm450 
##  27578 485577

sapply(dfdata_legacy, nrow)

##   hm27  hm450 
##  27578 485577

all(dfdata_legacy$hm450$Beta_value == dfdata_R4$hm450$Beta_value, na.rm = TRUE)

## [1] TRUE

Coordinates

Unmapped probes on hg38 are annotated with “*“.

Fraction of probes unmapped in hg38

HM27

tb <- table(dfdata_R4$hm27$Chromosome)
kable(tb)

Var1	Freq
*	544
chr1	2849
chr10	1035
chr11	1716
chr12	1517
chr13	485
chr14	822
chr15	803
chr16	1163
chr17	1538
chr18	390
chr19	1880
chr2	1677
chr20	879
chr21	296
chr22	626
chr3	1514
chr4	1013
chr5	1140
chr6	1465
chr7	1227
chr8	921
chr9	1050
chrX	1024
chrY	4

(1-unname(tb['*'])/sum(tb)) * 100 # percentage of mapped probes

## [1] 98.02741

HM450

tb <- table(dfdata_R4$hm450$Chromosome)
kable(tb)

Var1	Freq
*	5120
chr1	46242
chr10	24109
chr11	28558
chr12	24363
chr13	12131
chr14	14972
chr15	15093
chr16	21767
chr17	27654
chr18	5862
chr19	25344
chr2	34496
chr20	10341
chr21	3868
chr22	8435
chr3	24967
chr4	20203
chr5	24117
chr6	36299
chr7	29651
chr8	20670
chr9	9791
chrX	11127
chrY	397

(1-unname(tb['*'])/sum(tb)) * 100 # percentage of mapped probes

## [1] 98.94558

Fraction of unmapped

kable(sapply(dfdata_R4, function(x) table(x$Chromosome=='*')))

	hm27	hm450
FALSE	27034	480457
TRUE	544	5120

# legacy doesn't have any unmapped probes
sapply(dfdata_legacy, function(x) table(x$Chromosome=='*'))

##  hm27.FALSE hm450.FALSE 
##       25978      485512

Probe Mapping by Chromosome

Most probes are mapped to the same chromosome.

kable(table(dfdata_legacy$hm450$Chromosome, dfdata_R4$hm450$Chromosome))

	*	chr1	chr10	chr11	chr12	chr13	chr14	chr15	chr16	chr17	chr18	chr19	chr2	chr20	chr21	chr22	chr3	chr4	chr5	chr6	chr7	chr8	chr9	chrX	chrY
1	608	46233	0	0	0	0	0	0	0	0	0	0	0	0	0	8	0	0	0	0	0	0	8	0	0
10	281	0	24107	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
11	238	0	0	28556	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
12	180	0	0	0	24359	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
13	154	0	0	0	0	12131	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
14	106	0	0	0	0	0	14972	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
15	168	0	0	0	0	0	0	15091	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
16	203	0	0	0	0	0	0	0	21766	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
17	225	0	0	0	0	0	0	0	0	27654	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
18	61	0	0	0	0	0	0	0	0	0	5861	0	0	0	0	0	0	0	0	0	0	0	0	0	0
19	177	0	0	0	0	0	0	0	0	0	0	25344	0	0	0	0	0	0	0	0	0	0	0	0	0
2	319	0	0	0	0	0	0	0	0	0	0	0	34491	0	0	0	0	0	0	0	0	0	0	0	0
20	40	0	0	0	0	0	0	0	0	0	0	0	0	10339	0	0	0	0	0	0	0	0	0	0	0
21	377	0	0	0	0	0	0	0	0	0	0	0	0	0	3866	0	0	0	0	0	0	0	0	0	0
22	127	0	0	0	0	0	0	0	0	0	0	0	0	0	0	8422	0	1	0	0	0	2	0	0	0
3	196	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	24963	0	0	0	0	0	0	0	0
4	266	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	20198	0	0	0	0	0	0	0
5	214	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	24113	0	0	0	0	0	0
6	315	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	36296	0	0	0	0	0
7	365	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	29651	0	0	0	0
8	288	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	20662	0	0	0
9	80	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	9780	0	0
X	113	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	11119	0
Y	19	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	397

kable(table(dfdata_legacy$hm27$Chromosome, dfdata_R4$hm27$Chromosome))

	*	chr1	chr10	chr11	chr12	chr13	chr14	chr15	chr16	chr17	chr18	chr19	chr2	chr20	chr21	chr22	chr3	chr4	chr5	chr6	chr7	chr8	chr9	chrX	chrY
1	5	2745	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
10	2	0	1013	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
11	2	0	0	1652	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
12	0	0	0	0	1453	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
13	1	0	0	0	0	462	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
14	0	0	0	0	0	0	797	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
15	0	0	0	0	0	0	0	772	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
16	0	0	0	0	0	0	0	0	1128	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
17	5	0	0	0	0	0	0	0	0	1477	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
18	0	0	0	0	0	0	0	0	0	0	373	0	0	0	0	0	0	0	0	0	0	0	0	0	0
19	3	0	0	0	0	0	0	0	0	0	0	1821	0	0	0	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	0	0	1594	0	0	0	0	0	0	0	0	0	0	0	0
20	1	0	0	0	0	0	0	0	0	0	0	0	0	845	0	0	0	0	0	0	0	0	0	0	0
21	15	0	0	0	0	0	0	0	0	0	0	0	0	0	278	0	0	0	0	0	0	0	0	0	0
22	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	611	0	0	0	0	0	0	0	0	0
3	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1450	0	0	0	0	0	0	0	0
4	3	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	947	0	0	0	0	0	0	0
5	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1104	0	0	0	0	0	0
6	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1417	0	0	0	0	0
7	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1169	0	0	0	0
8	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	884	0	0	0
9	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	978	0	0
X	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	959	0
Y	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	3

CpGs Targeted by Multiple Probes

HM450

dfdata_R4$hm450 %>% filter(Chromosome!='*') %>% arrange(Chromosome, Start) -> dfdata_R4_bycoord
dup_index <- which(dfdata_R4_bycoord$Start[-1] == dfdata_R4_bycoord$Start[-nrow(dfdata_R4_bycoord)])
dup_probe_index <- sort(unique(c(dup_index, dup_index+1)))
dfdata_R4_bycoord[dup_probe_index,] %>% dplyr::select(Chromosome, Start, End, 'Composite Element REF')

## # A tibble: 42 x 4
##    Chromosome     Start       End `Composite Element REF`
##    <chr>          <int>     <int> <chr>                  
##  1 chr1       120851001 120851002 cg09137068             
##  2 chr1       120851001 120851002 cg26329816             
##  3 chr1       148152231 148152232 cg10127479             
##  4 chr1       148152231 148152232 cg20294457             
##  5 chr10       17844509  17844510 cg07386868             
##  6 chr10       17844509  17844510 cg19201533             
##  7 chr10       45972791  45972792 cg10432423             
##  8 chr10       45972791  45972792 cg20477318             
##  9 chr10       46397210  46397211 cg15247093             
## 10 chr10       46397210  46397211 cg17243957             
## # ... with 32 more rows

Under GRCh38, 21 CpGs become interrogated with multiple probes (42 probes). Note this is after excluding MASK_mapping. The full list is available at here.

HM27

No such CpGs exist for HM27

dfdata_R4$hm27 %>% filter(Chromosome!='*') %>% arrange(Chromosome, Start) -> dfdata_R4_bycoord
which(dfdata_R4_bycoord$Start[-1] == dfdata_R4_bycoord$Start[-nrow(dfdata_R4_bycoord)])

## integer(0)

Probe Mapping by Quality

HM450.hg38.manifest <- readRDS('~/Downloads/tmp/20180808/hm450.hg38.manifest.rds')
HM450.hg19.manifest <- readRDS('~/Downloads/tmp/20180808/hm450.hg19.manifest.rds')

All probes

da <- HM450.hg19.manifest
db <- HM450.hg38.manifest[names(da)]
a <- cut(da[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
tble['(59,60]','(59,60]'] / sum(tble) # Fraction of unique mapping

## [1] 0.8614918

Type-I

a <- cut(da[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
kable(tble, caption='Mapping quality cross comparison type cg Type-I A')

Mapping quality cross comparison type cg Type-I A
	[0,10]	(10,40]	(40,59]	(59,60]
[0,10]	7216	15	4	4
(10,40]	114	6949	5	4
(40,59]	15	28	4114	9
(59,60]	194	45	52	116733

a <- cut(da[da$designType=='I']$mapQ_B, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='I']$mapQ_B, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
kable(tble, caption='Mapping quality cross comparison type cg Type-I B')

Mapping quality cross comparison type cg Type-I B
	[0,10]	(10,40]	(40,59]	(59,60]
[0,10]	5693	20	2	9
(10,40]	115	6389	4	7
(40,59]	14	25	2709	7
(59,60]	204	55	36	120212

Type-II

a <- cut(da[da$designType=='II']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='II']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
kable(tble, caption='Mapping quality cross comparison type cg type II')

Mapping quality cross comparison type cg type II
	[0,10]	(10,40]	(40,59]	(59,60]
[0,10]	11295	33	4	32
(10,40]	249	14548	9	8
(40,59]	61	34	10003	15
(59,60]	520	84	68	313113

Probe Mapping by Mismatch

A-allele

tble <- table(da[da$probeType!='rs']$NM_A, db[db$probeType!='rs']$NM_A)
kable(tble)

	0	1	2	3	4	5	6	8	12
0	484743	629	70	14	17	2	5	1	1

tble['0','0'] / sum(tble) # Fraction of perfect mapping

## [1] 0.9984778

tble_decoy <- table(da[da$probeType!='rs']$wDecoy_NM_A, db[db$probeType!='rs']$wDecoy_NM_A)
kable(tble_decoy)

	0	1	2	3	4	5	6	8	12
0	484766	629	68	16	15	2	5	2	1

tble_decoy['0','0'] / sum(tble_decoy) # Fraction of perfect mapping with decoy

## [1] 0.9984799

tble_decoy['0','0'] - tble['0','0'] # Number of probes moved to decoy

## [1] 23

B-allele

tble <- table(da[da$probeType!='rs']$NM_B, db[db$probeType!='rs']$NM_B)
kable(tble)

	0	1	2	3	4	5	6	7
0	135235	191	22	6	5	1	2	1

tble['0','0'] / sum(tble) # Fraction of perfect mapping

## [1] 0.9983169

tble_decoy <- table(da[da$probeType!='rs']$wDecoy_NM_B, db[db$probeType!='rs']$wDecoy_NM_B)
kable(tble_decoy)

	0	1	2	3	4	5	6	8
0	135246	191	21	4	5	1	3	1

tble_decoy['0','0'] / sum(tble_decoy) # Fraction of perfect mapping with decoy

## [1] 0.9983318

tble_decoy['0','0'] - tble['0','0'] # Number of probes moved to decoy

## [1] 11

SNP probes

One probe switched the REF and ALT.

kable(table(da[da$designType=='I' & da$probeType=='rs']$NM_A, db[db$designType=='I' & db$probeType=='rs']$NM_A))

	0	1
0	9	0
1	1	15

kable(table(da[da$designType=='I' & da$probeType=='rs']$NM_B, db[db$designType=='I' & db$probeType=='rs']$NM_B))

	0	1
0	15	1
1	0	9

kable(table(da[da$designType=='II' & da$probeType=='rs']$NM_A, db[db$designType=='II' & db$probeType=='rs']$NM_A))

	0
0	40

Gene Association

HM450

all(dfdata_R4$hm450$`Composite Element REF` == dfdata_legacy$hm450$`Composite Element REF`)

## [1] TRUE

dfdata_R4$hm450$geneUniq <- sapply(strsplit(dfdata_R4$hm450$Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))
dfdata_R4$hm450$geneType <- sapply(strsplit(dfdata_R4$hm450$Gene_Type, ';'), function(x) paste0(unique(x), collapse = ';'))
df <- dfdata_R4$hm450 %>% 
    dplyr::rename(probe='Composite Element REF') %>% 
    dplyr::select(probe, geneUniq, geneType) %>% 
    cbind(dfdata_legacy$hm450) %>% 
    mutate(geneUniq=replace(geneUniq, geneUniq=='.',NA)) %>% 
    dplyr::rename(gene_hg38=geneUniq) %>% 
    mutate(gene_hg19 = sapply(strsplit(Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))) %>% 
    mutate(gene_hg19 = replace(gene_hg19, gene_hg19=='',NA))

More probes are associated with gene in GRCh38.

tb <- table(!is.na(df$gene_hg38), !is.na(df$gene_hg19))
rownames(tb) <- c('Not Annotated in hg38','Annotated in hg38')
colnames(tb) <- c('Not Annotated in hg19','Annotated in hg19')
kable(tb)

	Not Annotated in hg19	Annotated in hg19
Not Annotated in hg38	72026	3291
Annotated in hg38	47691	362569

kable(tb/sum(tb))

	Not Annotated in hg19	Annotated in hg19
Not Annotated in hg38	0.1483307	0.0067775
Annotated in hg38	0.0982151	0.7466766

365860 in 485577 probes are associated with gene in hg19. 410260 in 485577 probes are associated with gene in hg38. 255082 probes are annotated with exactly the same genes.

df$hg19_in_hg38 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg19'],';')[[1]], function(xx) grepl(xx,x['gene_hg38']))))
df$hg19_in_hg38 <- ifelse(is.na(df$gene_hg19), TRUE, df$hg19_in_hg38)
df$hg38_in_hg19 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg38'],';')[[1]], function(xx) grepl(xx,x['gene_hg19']))))
df$hg38_in_hg19 <- ifelse(is.na(df$gene_hg38), TRUE, df$hg38_in_hg19)

df_identical <- (df %>% filter((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19)))
df_different <- (df %>% filter(!((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19))))

327145 (67.372425%) probes probes have identical gene annotations. 118767 (24.4589427%) probes annotated in hg19 is not included in genes annotated hg38. Only 39665 (8.1686324%) probes lost gene annotation from hg19 in hg38. 17952 probes are either with ‘orf’ or ‘LOC’ names, indicating an update of gene annotation from resolving these identifiers. 153082 (31.525793%) probes gain new annotation in hg38 compared to hg19. A substantial parts are due to the addition of non-coding genes.

df_different %>% filter(!hg38_in_hg19) %>% summarise(
    antisense=sum(grepl('antisense', geneType))/n()*100,
    lincRNA=sum(grepl('lincRNA', geneType))/n()*100,
    miRNA=sum(grepl('miRNA', geneType))/n()*100,
    processed_transcript=sum(grepl('processed_transcript', geneType))/n()*100,
    pseudogene=sum(grepl('pseudogene', geneType))/n()*100)

##   antisense  lincRNA    miRNA processed_transcript pseudogene
## 1  27.47874 18.02629 2.837042             5.000588   9.666715

df_different %>% head

##        probe      gene_hg38             geneType Composite Element REF
## 1 cg00000363          PGBD5       protein_coding            cg00000363
## 2 cg00000884           <NA>                    .            cg00000884
## 3 cg00001099 ATP6V0D2;PSKH2       protein_coding            cg00001099
## 4 cg00001249          LRRC9       protein_coding            cg00001249
## 5 cg00001261  LA16c-306E5.2       protein_coding            cg00001261
## 6 cg00001446 ELOVL1;MIR6734 protein_coding;miRNA            cg00001446
##   Beta_value Gene_Symbol Chromosome Genomic_Coordinate gene_hg19
## 1  0.1204661        <NA>          1          230560793      <NA>
## 2         NA        TLR2          4          154609857      TLR2
## 3         NA       PSKH2          8           87081553     PSKH2
## 4  0.8729648        <NA>         14           60389786      <NA>
## 5  0.2489936        <NA>         16            3463964      <NA>
## 6  0.8604751      ELOVL1          1           43831041    ELOVL1
##   hg19_in_hg38 hg38_in_hg19
## 1         TRUE        FALSE
## 2        FALSE         TRUE
## 3         TRUE        FALSE
## 4         TRUE        FALSE
## 5         TRUE        FALSE
## 6         TRUE        FALSE

The complete gene discordance table can be downloaded at https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM450_probes_with_different_gene_annotation_column.tsv

df %>% mutate(same=ifelse(hg19_in_hg38&hg38_in_hg19, 'CONCORDANT', ifelse(hg19_in_hg38, 'AUGMENTED', 'DISCORDANT')), geneType=sapply(strsplit(geneType,';'), function(x) paste0(sort(x), collapse=';'))) %>% mutate(geneType=replace(geneType, geneType=='.', 'NA')) -> df1
df1 %>% count(same) %>% mutate(prop=prop.table(n))

## # A tibble: 3 x 3
##   same            n   prop
##   <chr>       <int>  <dbl>
## 1 AUGMENTED  118767 0.245 
## 2 CONCORDANT 327145 0.674 
## 3 DISCORDANT  39665 0.0817

df1 %>% group_by(same, geneType) %>% summarise(freq=n()) %>% ungroup() %>% mutate(geneType = sprintf('%s (%1.2f%%)', sub(';',' ',geneType), freq/sum(freq)*100)) -> df2

# vocabulary_hg19 <- do.call(c,strsplit(df$gene_hg19,';'))
# pdf('~/gallery/20180819_gdc_qc_treemap_hm450.pdf')
treemap(df2, index=c('same','geneType'),vSize='freq',palette='Blues')

HM27

all(dfdata_R4$hm27$`Composite Element REF` == dfdata_legacy$hm27$`Composite Element REF`)

## [1] TRUE

dfdata_R4$hm27$geneUniq <- sapply(strsplit(dfdata_R4$hm27$Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))
dfdata_R4$hm27$geneType <- sapply(strsplit(dfdata_R4$hm27$Gene_Type, ';'), function(x) paste0(unique(x), collapse = ';'))
df <- dfdata_R4$hm27 %>% 
    dplyr::rename(probe='Composite Element REF') %>% 
    dplyr::select(probe, geneUniq, geneType) %>% 
    cbind(dfdata_legacy$hm27) %>% 
    mutate(geneUniq=replace(geneUniq, geneUniq=='.',NA)) %>% 
    dplyr::rename(gene_hg38=geneUniq) %>% 
    mutate(gene_hg19 = sapply(strsplit(Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))) %>% 
    mutate(gene_hg19 = replace(gene_hg19, gene_hg19=='',NA))

More probes are associated with gene in GRCh38.

tb <- table(!is.na(df$gene_hg38), !is.na(df$gene_hg19))
rownames(tb) <- c('Not Annotated in hg38','Annotated in hg38')
colnames(tb) <- c('Not Annotated in hg19','Annotated in hg19')
kable(tb)

	Not Annotated in hg19	Annotated in hg19
Not Annotated in hg38	562	61
Annotated in hg38	1254	25701

kable(tb/sum(tb))

	Not Annotated in hg19	Annotated in hg19
Not Annotated in hg38	0.0203786	0.0022119
Annotated in hg38	0.0454710	0.9319385

25762 in 27578 probes are associated with gene in hg19. 26955 in 27578 probes are associated with gene in hg38. 17174 probes are annotated with exactly the same genes.

df$hg19_in_hg38 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg19'],';')[[1]], function(xx) grepl(xx,x['gene_hg38']))))
df$hg19_in_hg38 <- ifelse(is.na(df$gene_hg19), TRUE, df$hg19_in_hg38)
df$hg38_in_hg19 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg38'],';')[[1]], function(xx) grepl(xx,x['gene_hg19']))))
df$hg38_in_hg19 <- ifelse(is.na(df$gene_hg38), TRUE, df$hg38_in_hg19)

df_identical <- (df %>% filter((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19)))
df_different <- (df %>% filter(!((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19))))

17740 (64.3266372%) probes have identical gene annotations. 7586 (27.5074335%) probes annotated in hg19 is not included in genes annotated hg38. Only 2252 (8.1659294%) probes lost gene annotation from hg19 in hg38. 1099 probes are either with ‘orf’ or ‘LOC’ names, indicating an update of gene annotation from resolving these identifiers. 9662 (35.035173%) probes gain new annotation in hg38 compared to hg19. A substantial parts are due to the addition of non-coding genes.

df_different %>% filter(!hg38_in_hg19) %>% summarise(
    antisense=sum(grepl('antisense', geneType))/n()*100,
    lincRNA=sum(grepl('lincRNA', geneType))/n()*100,
    miRNA=sum(grepl('miRNA', geneType))/n()*100,
    processed_transcript=sum(grepl('processed_transcript', geneType))/n()*100,
    pseudogene=sum(grepl('pseudogene', geneType))/n()*100)

##   antisense  lincRNA    miRNA processed_transcript pseudogene
## 1  31.63941 10.29807 3.405092              4.72987   3.415442

df_different %>% head

##        probe                             gene_hg38
## 1 cg00005847        AC009336.19;HOXD3;RP11-387A1.5
## 2 cg00006414               RN7SL521P;ZNF398;ZNF425
## 3 cg00008493                           COX8C;UNC79
## 4 cg00010193                    AC092535.3;TMED11P
## 5 cg00012792 BLOC1S5;BLOC1S5-TXNDC5;EEF1E1-BLOC1S5
## 6 cg00013618                      IGLVI-70;PRAMENP
##                                             geneType Composite Element REF
## 1                           protein_coding;antisense            cg00005847
## 2                            misc_RNA;protein_coding            cg00006414
## 3                                     protein_coding            cg00008493
## 4                       antisense;unitary_pseudogene            cg00010193
## 5                                     protein_coding            cg00012792
## 6 IG_V_pseudogene;transcribed_unprocessed_pseudogene            cg00013618
##   Beta_value    Gene_Symbol Chromosome Genomic_Coordinate      gene_hg19
## 1 0.60548056          HOXD3          2          177029073          HOXD3
## 2         NA  ZNF425;ZNF398          7          148822837  ZNF398;ZNF425
## 3 0.98931512 COX8C;KIAA1409         14           93813777 COX8C;KIAA1409
## 4 0.60833069           <NA>       <NA>                  0           <NA>
## 5 0.01845706          MUTED          6            8064493          MUTED
## 6 0.53753164           <NA>         22           22380839           <NA>
##   hg19_in_hg38 hg38_in_hg19
## 1         TRUE        FALSE
## 2         TRUE        FALSE
## 3        FALSE        FALSE
## 4         TRUE        FALSE
## 5        FALSE        FALSE
## 6         TRUE        FALSE

The complete gene discordance table can be downloaded at https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM27_probes_with_different_gene_annotation_column.tsv

df %>% mutate(same=ifelse(hg19_in_hg38&hg38_in_hg19, 'CONCORDANT', ifelse(hg19_in_hg38, 'AUGMENTED', 'DISCORDANT')), geneType=sapply(strsplit(geneType,';'), function(x) paste0(sort(x), collapse=';'))) %>% mutate(geneType=replace(geneType, geneType=='.', 'NA')) -> df1
df1 %>% count(same) %>% mutate(prop=prop.table(n))

## # A tibble: 3 x 3
##   same           n   prop
##   <chr>      <int>  <dbl>
## 1 AUGMENTED   7586 0.275 
## 2 CONCORDANT 17740 0.643 
## 3 DISCORDANT  2252 0.0817

df1 %>% group_by(same, geneType) %>% summarise(freq=n()) %>% ungroup() %>% mutate(geneType = sprintf('%s (%1.2f%%)', sub(';',' ',geneType), freq/sum(freq)*100)) -> df2

# vocabulary_hg19 <- do.call(c,strsplit(df$gene_hg19,';'))
# pdf('~/gallery/20180819_gdc_qc_treemap_hm450.pdf')
treemap(df2, index=c('same','geneType'),vSize='freq',palette='Blues')

GDC QC DNA Methylation

hg19 Legacy (GDC r1.0-3.0)

File Counts

Data

hg38 LiftOver (GDC r4.0-12.0)

File Counts

Data

Compare hg19 and hg38

Measurements

Coordinates

Fraction of probes unmapped in hg38

HM27

HM450

Fraction of unmapped

Probe Mapping by Chromosome

CpGs Targeted by Multiple Probes

HM450

HM27

Probe Mapping by Quality

All probes

Type-I

Type-II

Probe Mapping by Mismatch

A-allele

B-allele

SNP probes

Gene Association

HM450

HM27