# Release being documented and the release it is compared against (YYYYMMDD).
release_date <- '20180909'
last_release_date <- '20180808'

# Download the manifests (3 platforms x 2 genome builds) of both releases into
# ~/Downloads/tmp/<release>/, current release first.
for (release in c(release_date, last_release_date)) {
  # recursive = TRUE: ~/Downloads/tmp itself may not exist yet, and because
  # warnings are suppressed a failed dir.create() would otherwise stay hidden
  # until download.file() cannot write its destination file.
  dir.create(sprintf('~/Downloads/tmp/%s/', release),
             showWarnings = FALSE, recursive = TRUE)
  for (refversion in c('hg19','hg38')) {
    for (platform in c('EPIC','hm450','hm27')) {
      download.file(
        sprintf('http://zwdzwd.io/InfiniumAnnotation/%s/%s/%s.%s.manifest.rds',
                release, platform, platform, refversion),
        sprintf('~/Downloads/tmp/%s/%s.%s.manifest.rds',
                release, platform, refversion),
        # .rds files are binary; request a binary transfer explicitly rather
        # than relying on platform-specific defaults.
        mode = 'wb')
    }
  }
}
# Read the previous release's manifests into a list named
# "<platform>.<refversion>", ordered EPIC.hg19, EPIC.hg38, hm450.hg19, ...
manifest.last <- local({
  # expand.grid varies its first factor fastest, so refversion is the inner
  # index and platform the outer one.
  grid <- expand.grid(refversion = c('hg19','hg38'),
                      platform = c('EPIC','hm450','hm27'),
                      stringsAsFactors = FALSE)
  rds_paths <- sprintf('~/Downloads/tmp/%s/%s.%s.manifest.rds',
                       last_release_date, grid$platform, grid$refversion)
  setNames(lapply(rds_paths, readRDS),
           paste0(grid$platform, '.', grid$refversion))
})
# Read the current release's manifests into a list named
# "<platform>.<refversion>". Each manifest is subset/reordered by the names of
# the matching previous-release manifest so the two can be compared row by row.
manifest.this <- local({
  grid <- expand.grid(refversion = c('hg19','hg38'),
                      platform = c('EPIC','hm450','hm27'),
                      stringsAsFactors = FALSE)
  ids <- paste0(grid$platform, '.', grid$refversion)
  rds_paths <- sprintf('~/Downloads/tmp/%s/%s.%s.manifest.rds',
                       release_date, grid$platform, grid$refversion)
  # Map() names its result after the first (character) argument, i.e. ids.
  Map(function(id, path) readRDS(path)[names(manifest.last[[id]])],
      ids, rds_paths)
})
An extra column gene_HGNC is added.
# Per manifest (rows): number of annotation columns and number of probes in
# the previous and the current release.
cbind(
  ncol.last = sapply(manifest.last, function(x) ncol(mcols(x))),
  ncol.this = sapply(manifest.this, function(x) ncol(mcols(x))),
  nrow.this = sapply(manifest.this, length),
  # Fixed: this column used to be computed from manifest.this, so it merely
  # repeated nrow.this instead of reporting the previous release.
  nrow.last = sapply(manifest.last, length))
## ncol.last ncol.this nrow.this nrow.last
## EPIC.hg19 51 52 865918 865918
## EPIC.hg38 51 52 865918 865918
## hm450.hg19 51 52 485577 485577
## hm450.hg38 51 52 485577 485577
## hm27.hg19 51 52 27578 27578
## hm27.hg38 51 52 27578 27578
All the mapping info remains the same as in the last version. The gene column is now sorted alphabetically.
# Columns to compare: the first 40 annotation columns of the hm450/hg19
# manifest, minus the two gene-name columns.
mapping_columns <- colnames(mcols(manifest.this[['hm450.hg19']]))[seq_len(40)]
mapping_columns <- mapping_columns[is.na(match(mapping_columns, c('gene','gene_HGNC')))]
# All "<platform>.<refversion>" identifiers, platform varying fastest
# (EPIC.hg19, hm450.hg19, hm27.hg19, EPIC.hg38, ...).
classes <- as.vector(outer(c('EPIC','hm450','hm27'), c('hg19','hg38'), paste, sep = '.'))
# For each manifest: TRUE when those columns are identical in both releases.
vapply(classes, function(id) {
  identical(mcols(manifest.this[[id]])[mapping_columns],
            mcols(manifest.last[[id]])[mapping_columns])
}, logical(1))
## EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
## TRUE TRUE TRUE TRUE TRUE TRUE
gene_HGNC has been added to reflect discrepancies between the gene model (GENCODE v22) and the HGNC recommendation. For each platform, the number of probes whose gene name differs from the HGNC name is shown below.
# Per manifest: number of probes whose `gene` value differs from `gene_HGNC`
# (comparisons that evaluate to NA are not counted).
sapply(classes, function(id) {
  annot <- mcols(manifest.this[[id]])
  differs <- annot[['gene']] != annot[['gene_HGNC']]
  sum(differs, na.rm = TRUE)
})
## EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
## 24796 14887 1021 24796 14887 1021
The biggest change in masking is on hg19: all the categories have been recomputed using hg19 annotations instead of being borrowed from hg38 as in previous versions.
# Names of all annotation columns containing 'MASK' (taken from hm450/hg19).
mask_columns <- local({
  annot_names <- colnames(mcols(manifest.this[['hm450.hg19']]))
  annot_names[grepl('MASK', annot_names)]
})
# Current release: number of masked probes per mask column (rows) and
# manifest (columns).
kable(sapply(classes, function(id) {
  annot <- mcols(manifest.this[[id]])
  sapply(mask_columns, function(maskcol) sum(annot[[maskcol]]))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| MASK_mapping | 63483 | 40490 | 1648 | 65537 | 41712 | 1674 |
| MASK_typeINextBaseSwitch | 306 | 305 | 29 | 365 | 365 | 28 |
| MASK_rmsk15 | 196448 | 82822 | 3095 | 200370 | 84332 | 3207 |
| MASK_sub40_copy | 6921 | 5343 | 608 | 8747 | 6419 | 619 |
| MASK_sub35_copy | 12701 | 9498 | 712 | 14502 | 10545 | 722 |
| MASK_sub30_copy | 23428 | 16546 | 882 | 25194 | 17568 | 893 |
| MASK_sub25_copy | 86533 | 59706 | 2699 | 88173 | 60654 | 2709 |
| MASK_snp5_common | 65844 | 32742 | 878 | 148496 | 74764 | 2643 |
| MASK_snp5_GMAF1p | 34554 | 19618 | 414 | 39446 | 22745 | 536 |
| MASK_extBase | 100 | 54 | 0 | 184 | 102 | 0 |
| MASK_general | 99360 | 60466 | 2136 | 105454 | 64144 | 2263 |
To contrast, this is what was in the last version. For some columns, hg19 and hg38 were merged. In the current release, they are separated.
# Previous release: number of masked probes per mask column (rows) and
# manifest (columns).
kable(sapply(classes, function(id) {
  annot <- mcols(manifest.last[[id]])
  sapply(mask_columns, function(maskcol) sum(annot[[maskcol]]))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| MASK_mapping | 63483 | 40490 | 1648 | 65537 | 41712 | 1674 |
| MASK_typeINextBaseSwitch | 1051 | 1030 | 195 | 1051 | 1030 | 195 |
| MASK_rmsk15 | 201251 | 84789 | 3207 | 201251 | 84789 | 3207 |
| MASK_sub40_copy | 8747 | 6419 | 619 | 8747 | 6419 | 619 |
| MASK_sub35_copy | 14502 | 10545 | 722 | 14502 | 10545 | 722 |
| MASK_sub30_copy | 25194 | 17568 | 893 | 25194 | 17568 | 893 |
| MASK_sub25_copy | 88173 | 60654 | 2709 | 88173 | 60654 | 2709 |
| MASK_snp5_common | 156200 | 78601 | 2640 | 156200 | 78601 | 2640 |
| MASK_snp5_GMAF1p | 42159 | 24008 | 546 | 42159 | 24008 | 546 |
| MASK_extBase | 171 | 105 | 9 | 171 | 105 | 9 |
| MASK_general | 107994 | 65574 | 2442 | 108716 | 66041 | 2449 |
The number of probes flagged by MASK_general has decreased a bit for both hg19 and hg38.
column <- 'MASK_general'
# Per manifest: probes masked in the last release, in this release, and in
# both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[[column]]
  after <- mcols(manifest.this[[id]])[[column]]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 107994 | 65574 | 2442 | 108716 | 66041 | 2449 |
| this | 99360 | 60466 | 2136 | 105454 | 64144 | 2263 |
| overlap | 97140 | 59404 | 2098 | 104914 | 63790 | 2251 |
by design, last release
column <- 'MASK_general'
# Previous release: fraction of probes of each probeType (cg/ch/rs) that are
# masked. Yields NaN (0/0) when a manifest has no probes of that type.
kable(sapply(classes, function(id) {
  annot <- mcols(manifest.last[[id]])
  sapply(c('cg','ch','rs'), function(probeType) {
    of_type <- annot[['probeType']] == probeType
    sum(annot[[column]] & of_type) / sum(of_type)
  })
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| cg | 0.1245042 | 0.1347101 | 0.0885488 | 0.1253408 | 0.1356782 | 0.0888027 |
| ch | 0.1896317 | 0.1899062 | NaN | 0.1896317 | 0.1899062 | NaN |
| rs | 0.0000000 | 0.0000000 | NaN | 0.0000000 | 0.0000000 | NaN |
by design, this release
column <- 'MASK_general'
# Current release: fraction of probes of each probeType (cg/ch/rs) that are
# masked. Yields NaN (0/0) when a manifest has no probes of that type.
kable(sapply(classes, function(id) {
  annot <- mcols(manifest.this[[id]])
  sapply(c('cg','ch','rs'), function(probeType) {
    of_type <- annot[['probeType']] == probeType
    sum(annot[[column]] & of_type) / sum(of_type)
  })
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| cg | 0.1145242 | 0.1241675 | 0.077453 | 0.1215676 | 0.1317584 | 0.0820582 |
| ch | 0.1821282 | 0.1827887 | NaN | 0.1875853 | 0.1879651 | NaN |
| rs | 0.0000000 | 0.0000000 | NaN | 0.0000000 | 0.0000000 | NaN |
SNP information has been updated using dbSNP build 151. As a consequence, fewer probes are masked in the current release, possibly due to correction of allele frequencies. hg19 sees a more dramatic reduction because in previous releases its masking was borrowed from hg38 (dbSNP b147). It is now recomputed using the native dbSNP b135 (a downgrade).
# MASK_snp5_common per manifest: probes masked in the last release, in this
# release, and in both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[['MASK_snp5_common']]
  after <- mcols(manifest.this[[id]])[['MASK_snp5_common']]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 156200 | 78601 | 2640 | 156200 | 78601 | 2640 |
| this | 65844 | 32742 | 878 | 148496 | 74764 | 2643 |
| overlap | 59920 | 29783 | 728 | 145465 | 72550 | 2401 |
# MASK_snp5_GMAF1p per manifest: probes masked in the last release, in this
# release, and in both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[['MASK_snp5_GMAF1p']]
  after <- mcols(manifest.this[[id]])[['MASK_snp5_GMAF1p']]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 42159 | 24008 | 546 | 42159 | 24008 | 546 |
| this | 34554 | 19618 | 414 | 39446 | 22745 | 536 |
| overlap | 31344 | 17781 | 341 | 38448 | 21964 | 493 |
Mapping remains the same
# MASK_mapping per manifest: probes masked in the last release, in this
# release, and in both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[['MASK_mapping']]
  after <- mcols(manifest.this[[id]])[['MASK_mapping']]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 63483 | 40490 | 1648 | 65537 | 41712 | 1674 |
| this | 63483 | 40490 | 1648 | 65537 | 41712 | 1674 |
| overlap | 63483 | 40490 | 1648 | 65537 | 41712 | 1674 |
Type-I Color Channel Switch probes have been reduced due to the use of new dbSNP (b151) and allele frequency (1%).
column <- 'MASK_typeINextBaseSwitch'
# Per manifest: probes masked in the last release, in this release, and in
# both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[[column]]
  after <- mcols(manifest.this[[id]])[[column]]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 1051 | 1030 | 195 | 1051 | 1030 | 195 |
| this | 306 | 305 | 29 | 365 | 365 | 28 |
| overlap | 288 | 288 | 28 | 359 | 361 | 28 |
Although not incorporated into ‘MASK_general’, MASK_rmsk15 has been updated for hg19. There are some minor updates in hg38 too, but their source is not entirely clear because there is no record describing the original rmsk masking.
column <- 'MASK_rmsk15'
# Per manifest: probes masked in the last release, in this release, and in
# both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[[column]]
  after <- mcols(manifest.this[[id]])[[column]]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 201251 | 84789 | 3207 | 201251 | 84789 | 3207 |
| this | 196448 | 82822 | 3095 | 200370 | 84332 | 3207 |
| overlap | 186599 | 76752 | 2822 | 199682 | 84069 | 3207 |
The MASK_sub*_copy columns have been recomputed for hg19; in previous versions, hg19 copied hg38. They remain the same for hg38.
column <- 'MASK_sub40_copy'
# Per manifest: probes masked in the last release, in this release, and in
# both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[[column]]
  after <- mcols(manifest.this[[id]])[[column]]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 8747 | 6419 | 619 | 8747 | 6419 | 619 |
| this | 6921 | 5343 | 608 | 8747 | 6419 | 619 |
| overlap | 6855 | 5289 | 595 | 8747 | 6419 | 619 |
column <- 'MASK_sub35_copy'
# Per manifest: probes masked in the last release, in this release, and in
# both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[[column]]
  after <- mcols(manifest.this[[id]])[[column]]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 14502 | 10545 | 722 | 14502 | 10545 | 722 |
| this | 12701 | 9498 | 712 | 14502 | 10545 | 722 |
| overlap | 12626 | 9437 | 699 | 14502 | 10545 | 722 |
column <- 'MASK_sub30_copy'
# Per manifest: probes masked in the last release, in this release, and in
# both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[[column]]
  after <- mcols(manifest.this[[id]])[[column]]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 25194 | 17568 | 893 | 25194 | 17568 | 893 |
| this | 23428 | 16546 | 882 | 25194 | 17568 | 893 |
| overlap | 23342 | 16472 | 870 | 25194 | 17568 | 893 |
column <- 'MASK_sub25_copy'
# Per manifest: probes masked in the last release, in this release, and in
# both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[[column]]
  after <- mcols(manifest.this[[id]])[[column]]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 88173 | 60654 | 2709 | 88173 | 60654 | 2709 |
| this | 86533 | 59706 | 2699 | 88173 | 60654 | 2709 |
| overlap | 86404 | 59611 | 2687 | 88173 | 60654 | 2709 |
Extension base masking has been reduced a bit as well. As above, hg19 no longer “borrows” masking from hg38; this causes most of the reduction.
column <- 'MASK_extBase'
# Per manifest: probes masked in the last release, in this release, and in
# both (overlap).
kable(sapply(classes, function(id) {
  before <- mcols(manifest.last[[id]])[[column]]
  after <- mcols(manifest.this[[id]])[[column]]
  c(last = sum(before), this = sum(after), overlap = sum(before & after))
}))
| | EPIC.hg19 | hm450.hg19 | hm27.hg19 | EPIC.hg38 | hm450.hg38 | hm27.hg38 |
|---|---|---|---|---|---|---|
| last | 171 | 105 | 9 | 171 | 105 | 9 |
| this | 100 | 54 | 0 | 184 | 102 | 0 |
| overlap | 100 | 54 | 0 | 158 | 85 | 0 |
# Per platform: number of probes flagged by `column` (set to 'MASK_extBase'
# earlier in the script) in the hg19 manifest, the hg38 manifest, or both.
#
# Fixed: the flags used to be combined with an element-wise `|`, which pairs
# row i of the hg19 manifest with row i of the hg38 manifest. Each manifest is
# ordered by the names of its own previous-release manifest, so the two builds
# are not guaranteed to list probes in the same order. Match on probe name
# instead. NOTE(review): the 280 / 156 / 0 output printed below predates this
# fix and should be regenerated.
sapply(c('EPIC','hm450','hm27'), function(x) {
  hg19 <- manifest.this[[paste0(x, '.hg19')]]
  hg38 <- manifest.this[[paste0(x, '.hg38')]]
  flagged_hg19 <- names(hg19[mcols(hg19)[[column]]])
  flagged_hg38 <- names(hg38[mcols(hg38)[[column]]])
  length(union(flagged_hg19, flagged_hg38))
})
## EPIC hm450 hm27
## 280 156 0
The 26 cases (EPIC.hg38) where MASK_extBase is turned on in this release but was off in the previous release were already masked by MASK_mapping in the previous release (redundant masking in some sense).
x <- 'EPIC.hg38'
column <- 'MASK_extBase'
# Probes flagged by `column` in this release but not in the last one.
xx <- mcols(manifest.this[[x]])[[column]] & !mcols(manifest.last[[x]])[[column]]
sum(xx)
## [1] 26
# How many of those carried MASK_mapping in the last release.
sum(mcols(manifest.last[[x]][xx])[['MASK_mapping']])
## [1] 26
The 13 cases where MASK_extBase is turned off in this release are also covered by MASK_mapping, so they remain masked in MASK_general.
# Probes flagged by `column` in the last release but no longer in this one
# (`x` and `column` are still 'EPIC.hg38' and 'MASK_extBase' from above).
xx <- !mcols(manifest.this[[x]])[[column]] & mcols(manifest.last[[x]])[[column]]
sum(xx)
## [1] 13
# How many of those carry MASK_mapping in this release.
sum(mcols(manifest.this[[x]][xx])[['MASK_mapping']])
## [1] 13