df_legacy <- list(
hm27=read_tsv(url('https://raw.githubusercontent.com/zwdzwd/GDC_DNA_methylation_QC/master/file_lists/20180410_GDC_manifest_legacy.tsv_HM27_lvl3.tsv')),
hm450=read_tsv(url('https://raw.githubusercontent.com/zwdzwd/GDC_DNA_methylation_QC/master/file_lists/20180410_GDC_manifest_legacy.tsv_HM450_lvl3.tsv'))
)
sapply(df_legacy, nrow)
## hm27 hm450
## 2662 9813
Cases/patients (first 12 letters of TCGA barcode)
sapply(df_legacy, function(x) length(unique(substr(x$TCGA_barcode,1,12))))
## hm27 hm450
## 2280 8949
dfdata_legacy <- list(
hm27=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release1-3_legacy/07e1bc0f-b2d9-41cb-bf4d-1a9ac769654a/jhu-usc.edu_COAD.HumanMethylation27.4.lvl-3.TCGA-AA-A004-01A-01D-A00B-05.txt'),skip=1),
hm450=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release1-3_legacy/3d4a102e-4a4c-48c3-ad51-8cc9f9b879e7/jhu-usc.edu_KIRC.HumanMethylation450.7.lvl-3.TCGA-DV-A4W0-05A-11D-A264-05.txt'),skip=1))
dfdata_legacy$hm27 %>% head
## # A tibble: 6 x 5
## `Composite Element … Beta_value Gene_Symbol Chromosome Genomic_Coordin…
## <chr> <dbl> <chr> <chr> <int>
## 1 cg00000292 0.738 ATP2A1 16 28890100
## 2 cg00002426 0.274 SLMAP 3 57743543
## 3 cg00003994 0.248 MEOX2 7 15725862
## 4 cg00005847 0.605 HOXD3 2 177029073
## 5 cg00006414 NA ZNF425;ZNF3… 7 148822837
## 6 cg00007981 0.0338 PANX1 11 93862594
dfdata_legacy$hm450 %>% head
## # A tibble: 6 x 5
## `Composite Element … Beta_value Gene_Symbol Chromosome Genomic_Coordina…
## <chr> <dbl> <chr> <chr> <int>
## 1 cg00000029 0.659 RBL2 16 53468112
## 2 cg00000108 NA C3orf35 3 37459206
## 3 cg00000109 NA FNDC3B 3 171916037
## 4 cg00000165 0.125 <NA> 1 91194674
## 5 cg00000236 0.900 VDAC3 8 42263294
## 6 cg00000289 0.713 ACTN1 14 69341139
df_R4 <- list(
hm27=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/file_lists/20180410_GDC_manifest_liftOver_workflow.tsv_HM27_lvl3.tsv')),
hm450=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/file_lists/20180410_GDC_manifest_liftOver_workflow.tsv_HM450_lvl3.tsv'))
)
sapply(df_R4, nrow)
## hm27 hm450
## 2603 9756
Cases/patients (first 12 letters of TCGA barcode)
sapply(df_R4, function(x) length(unique(substr(x$TCGA_barcode,1,12))))
## hm27 hm450
## 2223 8893
dfdata_R4 <- list(
hm27=read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release4_plus_hg38/df4a53cd-6d07-41be-a7b7-c381a658e7ae/jhu-usc.edu_COAD.HumanMethylation27.4.lvl-3.TCGA-AA-A004-01A-01D-A00B-05.gdc_hg38.txt')),
hm450=read_tsv(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/sample_data/release4_plus_hg38/ef164350-d911-4a3d-a53a-b2efed5e682c/jhu-usc.edu_KIRC.HumanMethylation450.7.lvl-3.TCGA-DV-A4W0-05A-11D-A264-05.gdc_hg38.txt.gz'))))
dfdata_R4$hm27 %>% head
## # A tibble: 6 x 11
## `Composite Elem… Beta_value Chromosome Start End Gene_Symbol
## <chr> <dbl> <chr> <int> <int> <chr>
## 1 cg00000292 0.738 chr16 2.89e7 2.89e7 ATP2A1;ATP…
## 2 cg00002426 0.274 chr3 5.78e7 5.78e7 SLMAP;SLMA…
## 3 cg00003994 0.248 chr7 1.57e7 1.57e7 MEOX2
## 4 cg00005847 0.605 chr2 1.76e8 1.76e8 AC009336.1…
## 5 cg00006414 NA chr7 1.49e8 1.49e8 RN7SL521P;…
## 6 cg00007981 0.0338 chr11 9.41e7 9.41e7 PANX1;PANX1
## # ... with 5 more variables: Gene_Type <chr>, Transcript_ID <chr>,
## # Position_to_TSS <chr>, CGI_Coordinate <chr>, Feature_Type <chr>
dfdata_R4$hm450 %>% head
## # A tibble: 6 x 11
## `Composite Elem… Beta_value Chromosome Start End Gene_Symbol
## <chr> <dbl> <chr> <int> <int> <chr>
## 1 cg00000029 0.659 chr16 5.34e7 5.34e7 RBL2;RBL2;…
## 2 cg00000108 NA chr3 3.74e7 3.74e7 C3orf35;C3…
## 3 cg00000109 NA chr3 1.72e8 1.72e8 FNDC3B;FND…
## 4 cg00000165 0.125 chr1 9.07e7 9.07e7 .
## 5 cg00000236 0.900 chr8 4.24e7 4.24e7 VDAC3
## 6 cg00000289 0.713 chr14 6.89e7 6.89e7 ACTN1;ACTN…
## # ... with 5 more variables: Gene_Type <chr>, Transcript_ID <chr>,
## # Position_to_TSS <chr>, CGI_Coordinate <chr>, Feature_Type <chr>
Measurements are the same, only the coordinates and gene annotation have changed.
sapply(dfdata_R4, nrow)
## hm27 hm450
## 27578 485577
sapply(dfdata_legacy, nrow)
## hm27 hm450
## 27578 485577
all(dfdata_legacy$hm450$Beta_value == dfdata_R4$hm450$Beta_value, na.rm = TRUE)
## [1] TRUE
Unmapped probes on hg38 are annotated with “*“.
tb <- table(dfdata_R4$hm27$Chromosome)
kable(tb)
| Var1 | Freq |
|---|---|
| * | 544 |
| chr1 | 2849 |
| chr10 | 1035 |
| chr11 | 1716 |
| chr12 | 1517 |
| chr13 | 485 |
| chr14 | 822 |
| chr15 | 803 |
| chr16 | 1163 |
| chr17 | 1538 |
| chr18 | 390 |
| chr19 | 1880 |
| chr2 | 1677 |
| chr20 | 879 |
| chr21 | 296 |
| chr22 | 626 |
| chr3 | 1514 |
| chr4 | 1013 |
| chr5 | 1140 |
| chr6 | 1465 |
| chr7 | 1227 |
| chr8 | 921 |
| chr9 | 1050 |
| chrX | 1024 |
| chrY | 4 |
(1-unname(tb['*'])/sum(tb)) * 100 # percentage of mapped probes
## [1] 98.02741
tb <- table(dfdata_R4$hm450$Chromosome)
kable(tb)
| Var1 | Freq |
|---|---|
| * | 5120 |
| chr1 | 46242 |
| chr10 | 24109 |
| chr11 | 28558 |
| chr12 | 24363 |
| chr13 | 12131 |
| chr14 | 14972 |
| chr15 | 15093 |
| chr16 | 21767 |
| chr17 | 27654 |
| chr18 | 5862 |
| chr19 | 25344 |
| chr2 | 34496 |
| chr20 | 10341 |
| chr21 | 3868 |
| chr22 | 8435 |
| chr3 | 24967 |
| chr4 | 20203 |
| chr5 | 24117 |
| chr6 | 36299 |
| chr7 | 29651 |
| chr8 | 20670 |
| chr9 | 9791 |
| chrX | 11127 |
| chrY | 397 |
(1-unname(tb['*'])/sum(tb)) * 100 # percentage of mapped probes
## [1] 98.94558
kable(sapply(dfdata_R4, function(x) table(x$Chromosome=='*')))
| hm27 | hm450 | |
|---|---|---|
| FALSE | 27034 | 480457 |
| TRUE | 544 | 5120 |
# legacy doesn't have any unmapped probes
sapply(dfdata_legacy, function(x) table(x$Chromosome=='*'))
## hm27.FALSE hm450.FALSE
## 25978 485512
Most probes are mapped to the same chromosome.
kable(table(dfdata_legacy$hm450$Chromosome, dfdata_R4$hm450$Chromosome))
| * | chr1 | chr10 | chr11 | chr12 | chr13 | chr14 | chr15 | chr16 | chr17 | chr18 | chr19 | chr2 | chr20 | chr21 | chr22 | chr3 | chr4 | chr5 | chr6 | chr7 | chr8 | chr9 | chrX | chrY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 608 | 46233 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 0 | 0 |
| 10 | 281 | 0 | 24107 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 11 | 238 | 0 | 0 | 28556 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 12 | 180 | 0 | 0 | 0 | 24359 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 13 | 154 | 0 | 0 | 0 | 0 | 12131 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14 | 106 | 0 | 0 | 0 | 0 | 0 | 14972 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 15 | 168 | 0 | 0 | 0 | 0 | 0 | 0 | 15091 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 16 | 203 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21766 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 17 | 225 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 27654 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18 | 61 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5861 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 19 | 177 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 25344 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 319 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 34491 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 20 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10339 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 21 | 377 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3866 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 22 | 127 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8422 | 0 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 0 |
| 3 | 196 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24963 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 266 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20198 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 214 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24113 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | 315 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36296 | 0 | 0 | 0 | 0 | 0 |
| 7 | 365 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 29651 | 0 | 0 | 0 | 0 |
| 8 | 288 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20662 | 0 | 0 | 0 |
| 9 | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9780 | 0 | 0 |
| X | 113 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11119 | 0 |
| Y | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 397 |
kable(table(dfdata_legacy$hm27$Chromosome, dfdata_R4$hm27$Chromosome))
| * | chr1 | chr10 | chr11 | chr12 | chr13 | chr14 | chr15 | chr16 | chr17 | chr18 | chr19 | chr2 | chr20 | chr21 | chr22 | chr3 | chr4 | chr5 | chr6 | chr7 | chr8 | chr9 | chrX | chrY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 5 | 2745 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10 | 2 | 0 | 1013 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 11 | 2 | 0 | 0 | 1652 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 12 | 0 | 0 | 0 | 0 | 1453 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 13 | 1 | 0 | 0 | 0 | 0 | 462 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14 | 0 | 0 | 0 | 0 | 0 | 0 | 797 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 772 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1128 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 17 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1477 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 373 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 19 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1821 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1594 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 845 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 21 | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 278 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 22 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 611 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1450 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 947 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1104 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1417 | 0 | 0 | 0 | 0 | 0 |
| 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1169 | 0 | 0 | 0 | 0 |
| 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 884 | 0 | 0 | 0 |
| 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 978 | 0 | 0 |
| X | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 959 | 0 |
| Y | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
dfdata_R4$hm450 %>% filter(Chromosome!='*') %>% arrange(Chromosome, Start) -> dfdata_R4_bycoord
dup_index <- which(dfdata_R4_bycoord$Start[-1] == dfdata_R4_bycoord$Start[-nrow(dfdata_R4_bycoord)])
dup_probe_index <- sort(unique(c(dup_index, dup_index+1)))
dfdata_R4_bycoord[dup_probe_index,] %>% dplyr::select(Chromosome, Start, End, 'Composite Element REF')
## # A tibble: 42 x 4
## Chromosome Start End `Composite Element REF`
## <chr> <int> <int> <chr>
## 1 chr1 120851001 120851002 cg09137068
## 2 chr1 120851001 120851002 cg26329816
## 3 chr1 148152231 148152232 cg10127479
## 4 chr1 148152231 148152232 cg20294457
## 5 chr10 17844509 17844510 cg07386868
## 6 chr10 17844509 17844510 cg19201533
## 7 chr10 45972791 45972792 cg10432423
## 8 chr10 45972791 45972792 cg20477318
## 9 chr10 46397210 46397211 cg15247093
## 10 chr10 46397210 46397211 cg17243957
## # ... with 32 more rows
Under GRCh38, 21 CpGs become interrogated with multiple probes (42 probes). Note this is after excluding MASK_mapping. The full list is available at here.
No such CpGs exist for HM27
dfdata_R4$hm27 %>% filter(Chromosome!='*') %>% arrange(Chromosome, Start) -> dfdata_R4_bycoord
which(dfdata_R4_bycoord$Start[-1] == dfdata_R4_bycoord$Start[-nrow(dfdata_R4_bycoord)])
## integer(0)
HM450.hg38.manifest <- readRDS('~/Downloads/tmp/20180808/hm450.hg38.manifest.rds')
HM450.hg19.manifest <- readRDS('~/Downloads/tmp/20180808/hm450.hg19.manifest.rds')
da <- HM450.hg19.manifest
db <- HM450.hg38.manifest[names(da)]
a <- cut(da[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
tble['(59,60]','(59,60]'] / sum(tble) # Fraction of unique mapping
## [1] 0.8614918
a <- cut(da[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='I']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
kable(tble, caption='Mapping quality cross comparison type cg Type-I A')
| [0,10] | (10,40] | (40,59] | (59,60] | |
|---|---|---|---|---|
| [0,10] | 7216 | 15 | 4 | 4 |
| (10,40] | 114 | 6949 | 5 | 4 |
| (40,59] | 15 | 28 | 4114 | 9 |
| (59,60] | 194 | 45 | 52 | 116733 |
a <- cut(da[da$designType=='I']$mapQ_B, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='I']$mapQ_B, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
kable(tble, caption='Mapping quality cross comparison type cg Type-I B')
| [0,10] | (10,40] | (40,59] | (59,60] | |
|---|---|---|---|---|
| [0,10] | 5693 | 20 | 2 | 9 |
| (10,40] | 115 | 6389 | 4 | 7 |
| (40,59] | 14 | 25 | 2709 | 7 |
| (59,60] | 204 | 55 | 36 | 120212 |
a <- cut(da[da$designType=='II']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
b <- cut(db[da$designType=='II']$mapQ_A, c(0,10,40,59,60), include.lowest = T)
tble <- table(a, b)
kable(tble, caption='Mapping quality cross comparison type cg type II')
| [0,10] | (10,40] | (40,59] | (59,60] | |
|---|---|---|---|---|
| [0,10] | 11295 | 33 | 4 | 32 |
| (10,40] | 249 | 14548 | 9 | 8 |
| (40,59] | 61 | 34 | 10003 | 15 |
| (59,60] | 520 | 84 | 68 | 313113 |
tble <- table(da[da$probeType!='rs']$NM_A, db[db$probeType!='rs']$NM_A)
kable(tble)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 8 | 12 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 484743 | 629 | 70 | 14 | 17 | 2 | 5 | 1 | 1 |
tble['0','0'] / sum(tble) # Fraction of perfect mapping
## [1] 0.9984778
tble_decoy <- table(da[da$probeType!='rs']$wDecoy_NM_A, db[db$probeType!='rs']$wDecoy_NM_A)
kable(tble_decoy)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 8 | 12 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 484766 | 629 | 68 | 16 | 15 | 2 | 5 | 2 | 1 |
tble_decoy['0','0'] / sum(tble_decoy) # Fraction of perfect mapping with decoy
## [1] 0.9984799
tble_decoy['0','0'] - tble['0','0'] # Number of probes moved to decoy
## [1] 23
tble <- table(da[da$probeType!='rs']$NM_B, db[db$probeType!='rs']$NM_B)
kable(tble)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 135235 | 191 | 22 | 6 | 5 | 1 | 2 | 1 |
tble['0','0'] / sum(tble) # Fraction of perfect mapping
## [1] 0.9983169
tble_decoy <- table(da[da$probeType!='rs']$wDecoy_NM_B, db[db$probeType!='rs']$wDecoy_NM_B)
kable(tble_decoy)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 8 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 135246 | 191 | 21 | 4 | 5 | 1 | 3 | 1 |
tble_decoy['0','0'] / sum(tble_decoy) # Fraction of perfect mapping with decoy
## [1] 0.9983318
tble_decoy['0','0'] - tble['0','0'] # Number of probes moved to decoy
## [1] 11
One probe switched the REF and ALT.
kable(table(da[da$designType=='I' & da$probeType=='rs']$NM_A, db[db$designType=='I' & db$probeType=='rs']$NM_A))
| 0 | 1 | |
|---|---|---|
| 0 | 9 | 0 |
| 1 | 1 | 15 |
kable(table(da[da$designType=='I' & da$probeType=='rs']$NM_B, db[db$designType=='I' & db$probeType=='rs']$NM_B))
| 0 | 1 | |
|---|---|---|
| 0 | 15 | 1 |
| 1 | 0 | 9 |
kable(table(da[da$designType=='II' & da$probeType=='rs']$NM_A, db[db$designType=='II' & db$probeType=='rs']$NM_A))
| 0 | |
|---|---|
| 0 | 40 |
all(dfdata_R4$hm450$`Composite Element REF` == dfdata_legacy$hm450$`Composite Element REF`)
## [1] TRUE
dfdata_R4$hm450$geneUniq <- sapply(strsplit(dfdata_R4$hm450$Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))
dfdata_R4$hm450$geneType <- sapply(strsplit(dfdata_R4$hm450$Gene_Type, ';'), function(x) paste0(unique(x), collapse = ';'))
df <- dfdata_R4$hm450 %>%
dplyr::rename(probe='Composite Element REF') %>%
dplyr::select(probe, geneUniq, geneType) %>%
cbind(dfdata_legacy$hm450) %>%
mutate(geneUniq=replace(geneUniq, geneUniq=='.',NA)) %>%
dplyr::rename(gene_hg38=geneUniq) %>%
mutate(gene_hg19 = sapply(strsplit(Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))) %>%
mutate(gene_hg19 = replace(gene_hg19, gene_hg19=='',NA))
More probes are associated with gene in GRCh38.
tb <- table(!is.na(df$gene_hg38), !is.na(df$gene_hg19))
rownames(tb) <- c('Not Annotated in hg38','Annotated in hg38')
colnames(tb) <- c('Not Annotated in hg19','Annotated in hg19')
kable(tb)
| Not Annotated in hg19 | Annotated in hg19 | |
|---|---|---|
| Not Annotated in hg38 | 72026 | 3291 |
| Annotated in hg38 | 47691 | 362569 |
kable(tb/sum(tb))
| Not Annotated in hg19 | Annotated in hg19 | |
|---|---|---|
| Not Annotated in hg38 | 0.1483307 | 0.0067775 |
| Annotated in hg38 | 0.0982151 | 0.7466766 |
365860 in 485577 probes are associated with gene in hg19. 410260 in 485577 probes are associated with gene in hg38. 255082 probes are annotated with exactly the same genes.
df$hg19_in_hg38 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg19'],';')[[1]], function(xx) grepl(xx,x['gene_hg38']))))
df$hg19_in_hg38 <- ifelse(is.na(df$gene_hg19), TRUE, df$hg19_in_hg38)
df$hg38_in_hg19 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg38'],';')[[1]], function(xx) grepl(xx,x['gene_hg19']))))
df$hg38_in_hg19 <- ifelse(is.na(df$gene_hg38), TRUE, df$hg38_in_hg19)
df_identical <- (df %>% filter((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19)))
df_different <- (df %>% filter(!((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19))))
327145 (67.372425%) probes probes have identical gene annotations. 118767 (24.4589427%) probes annotated in hg19 is not included in genes annotated hg38. Only 39665 (8.1686324%) probes lost gene annotation from hg19 in hg38. 17952 probes are either with ‘orf’ or ‘LOC’ names, indicating an update of gene annotation from resolving these identifiers. 153082 (31.525793%) probes gain new annotation in hg38 compared to hg19. A substantial parts are due to the addition of non-coding genes.
df_different %>% filter(!hg38_in_hg19) %>% summarise(
antisense=sum(grepl('antisense', geneType))/n()*100,
lincRNA=sum(grepl('lincRNA', geneType))/n()*100,
miRNA=sum(grepl('miRNA', geneType))/n()*100,
processed_transcript=sum(grepl('processed_transcript', geneType))/n()*100,
pseudogene=sum(grepl('pseudogene', geneType))/n()*100)
## antisense lincRNA miRNA processed_transcript pseudogene
## 1 27.47874 18.02629 2.837042 5.000588 9.666715
df_different %>% head
## probe gene_hg38 geneType Composite Element REF
## 1 cg00000363 PGBD5 protein_coding cg00000363
## 2 cg00000884 <NA> . cg00000884
## 3 cg00001099 ATP6V0D2;PSKH2 protein_coding cg00001099
## 4 cg00001249 LRRC9 protein_coding cg00001249
## 5 cg00001261 LA16c-306E5.2 protein_coding cg00001261
## 6 cg00001446 ELOVL1;MIR6734 protein_coding;miRNA cg00001446
## Beta_value Gene_Symbol Chromosome Genomic_Coordinate gene_hg19
## 1 0.1204661 <NA> 1 230560793 <NA>
## 2 NA TLR2 4 154609857 TLR2
## 3 NA PSKH2 8 87081553 PSKH2
## 4 0.8729648 <NA> 14 60389786 <NA>
## 5 0.2489936 <NA> 16 3463964 <NA>
## 6 0.8604751 ELOVL1 1 43831041 ELOVL1
## hg19_in_hg38 hg38_in_hg19
## 1 TRUE FALSE
## 2 FALSE TRUE
## 3 TRUE FALSE
## 4 TRUE FALSE
## 5 TRUE FALSE
## 6 TRUE FALSE
The complete gene discordance table can be downloaded at https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM450_probes_with_different_gene_annotation_column.tsv
df %>% mutate(same=ifelse(hg19_in_hg38&hg38_in_hg19, 'CONCORDANT', ifelse(hg19_in_hg38, 'AUGMENTED', 'DISCORDANT')), geneType=sapply(strsplit(geneType,';'), function(x) paste0(sort(x), collapse=';'))) %>% mutate(geneType=replace(geneType, geneType=='.', 'NA')) -> df1
df1 %>% count(same) %>% mutate(prop=prop.table(n))
## # A tibble: 3 x 3
## same n prop
## <chr> <int> <dbl>
## 1 AUGMENTED 118767 0.245
## 2 CONCORDANT 327145 0.674
## 3 DISCORDANT 39665 0.0817
df1 %>% group_by(same, geneType) %>% summarise(freq=n()) %>% ungroup() %>% mutate(geneType = sprintf('%s (%1.2f%%)', sub(';',' ',geneType), freq/sum(freq)*100)) -> df2
# vocabulary_hg19 <- do.call(c,strsplit(df$gene_hg19,';'))
# pdf('~/gallery/20180819_gdc_qc_treemap_hm450.pdf')
treemap(df2, index=c('same','geneType'),vSize='freq',palette='Blues')
all(dfdata_R4$hm27$`Composite Element REF` == dfdata_legacy$hm27$`Composite Element REF`)
## [1] TRUE
dfdata_R4$hm27$geneUniq <- sapply(strsplit(dfdata_R4$hm27$Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))
dfdata_R4$hm27$geneType <- sapply(strsplit(dfdata_R4$hm27$Gene_Type, ';'), function(x) paste0(unique(x), collapse = ';'))
df <- dfdata_R4$hm27 %>%
dplyr::rename(probe='Composite Element REF') %>%
dplyr::select(probe, geneUniq, geneType) %>%
cbind(dfdata_legacy$hm27) %>%
mutate(geneUniq=replace(geneUniq, geneUniq=='.',NA)) %>%
dplyr::rename(gene_hg38=geneUniq) %>%
mutate(gene_hg19 = sapply(strsplit(Gene_Symbol, ';'), function(x) paste0(sort(unique(x)), collapse = ';'))) %>%
mutate(gene_hg19 = replace(gene_hg19, gene_hg19=='',NA))
More probes are associated with gene in GRCh38.
tb <- table(!is.na(df$gene_hg38), !is.na(df$gene_hg19))
rownames(tb) <- c('Not Annotated in hg38','Annotated in hg38')
colnames(tb) <- c('Not Annotated in hg19','Annotated in hg19')
kable(tb)
| Not Annotated in hg19 | Annotated in hg19 | |
|---|---|---|
| Not Annotated in hg38 | 562 | 61 |
| Annotated in hg38 | 1254 | 25701 |
kable(tb/sum(tb))
| Not Annotated in hg19 | Annotated in hg19 | |
|---|---|---|
| Not Annotated in hg38 | 0.0203786 | 0.0022119 |
| Annotated in hg38 | 0.0454710 | 0.9319385 |
25762 in 27578 probes are associated with gene in hg19. 26955 in 27578 probes are associated with gene in hg38. 17174 probes are annotated with exactly the same genes.
df$hg19_in_hg38 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg19'],';')[[1]], function(xx) grepl(xx,x['gene_hg38']))))
df$hg19_in_hg38 <- ifelse(is.na(df$gene_hg19), TRUE, df$hg19_in_hg38)
df$hg38_in_hg19 <- apply(df, 1, function(x) all(sapply(strsplit(x['gene_hg38'],';')[[1]], function(xx) grepl(xx,x['gene_hg19']))))
df$hg38_in_hg19 <- ifelse(is.na(df$gene_hg38), TRUE, df$hg38_in_hg19)
df_identical <- (df %>% filter((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19)))
df_different <- (df %>% filter(!((is.na(gene_hg19) & is.na(gene_hg38)) | (hg19_in_hg38 & hg38_in_hg19))))
17740 (64.3266372%) probes have identical gene annotations. 7586 (27.5074335%) probes annotated in hg19 is not included in genes annotated hg38. Only 2252 (8.1659294%) probes lost gene annotation from hg19 in hg38. 1099 probes are either with ‘orf’ or ‘LOC’ names, indicating an update of gene annotation from resolving these identifiers. 9662 (35.035173%) probes gain new annotation in hg38 compared to hg19. A substantial parts are due to the addition of non-coding genes.
df_different %>% filter(!hg38_in_hg19) %>% summarise(
antisense=sum(grepl('antisense', geneType))/n()*100,
lincRNA=sum(grepl('lincRNA', geneType))/n()*100,
miRNA=sum(grepl('miRNA', geneType))/n()*100,
processed_transcript=sum(grepl('processed_transcript', geneType))/n()*100,
pseudogene=sum(grepl('pseudogene', geneType))/n()*100)
## antisense lincRNA miRNA processed_transcript pseudogene
## 1 31.63941 10.29807 3.405092 4.72987 3.415442
df_different %>% head
## probe gene_hg38
## 1 cg00005847 AC009336.19;HOXD3;RP11-387A1.5
## 2 cg00006414 RN7SL521P;ZNF398;ZNF425
## 3 cg00008493 COX8C;UNC79
## 4 cg00010193 AC092535.3;TMED11P
## 5 cg00012792 BLOC1S5;BLOC1S5-TXNDC5;EEF1E1-BLOC1S5
## 6 cg00013618 IGLVI-70;PRAMENP
## geneType Composite Element REF
## 1 protein_coding;antisense cg00005847
## 2 misc_RNA;protein_coding cg00006414
## 3 protein_coding cg00008493
## 4 antisense;unitary_pseudogene cg00010193
## 5 protein_coding cg00012792
## 6 IG_V_pseudogene;transcribed_unprocessed_pseudogene cg00013618
## Beta_value Gene_Symbol Chromosome Genomic_Coordinate gene_hg19
## 1 0.60548056 HOXD3 2 177029073 HOXD3
## 2 NA ZNF425;ZNF398 7 148822837 ZNF398;ZNF425
## 3 0.98931512 COX8C;KIAA1409 14 93813777 COX8C;KIAA1409
## 4 0.60833069 <NA> <NA> 0 <NA>
## 5 0.01845706 MUTED 6 8064493 MUTED
## 6 0.53753164 <NA> 22 22380839 <NA>
## hg19_in_hg38 hg38_in_hg19
## 1 TRUE FALSE
## 2 TRUE FALSE
## 3 FALSE FALSE
## 4 TRUE FALSE
## 5 FALSE FALSE
## 6 TRUE FALSE
The complete gene discordance table can be downloaded at https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM27_probes_with_different_gene_annotation_column.tsv
df %>% mutate(same=ifelse(hg19_in_hg38&hg38_in_hg19, 'CONCORDANT', ifelse(hg19_in_hg38, 'AUGMENTED', 'DISCORDANT')), geneType=sapply(strsplit(geneType,';'), function(x) paste0(sort(x), collapse=';'))) %>% mutate(geneType=replace(geneType, geneType=='.', 'NA')) -> df1
df1 %>% count(same) %>% mutate(prop=prop.table(n))
## # A tibble: 3 x 3
## same n prop
## <chr> <int> <dbl>
## 1 AUGMENTED 7586 0.275
## 2 CONCORDANT 17740 0.643
## 3 DISCORDANT 2252 0.0817
df1 %>% group_by(same, geneType) %>% summarise(freq=n()) %>% ungroup() %>% mutate(geneType = sprintf('%s (%1.2f%%)', sub(';',' ',geneType), freq/sum(freq)*100)) -> df2
# vocabulary_hg19 <- do.call(c,strsplit(df$gene_hg19,';'))
# pdf('~/gallery/20180819_gdc_qc_treemap_hm450.pdf')
treemap(df2, index=c('same','geneType'),vSize='freq',palette='Blues')