# Release identifiers (YYYYMMDD) of the two manifest versions being compared:
# the current release and the one immediately preceding it.
release_date <- '20180909'
last_release_date <- '20180808'

Download manifest files

# Download the manifests of both releases and load them into two named lists,
# `manifest.last` and `manifest.this`, keyed by '<platform>.<refversion>'
# (e.g. 'EPIC.hg19').
platforms <- c('EPIC','hm450','hm27')
refversions <- c('hg19','hg38')

for (rel in c(release_date, last_release_date)) {
    # recursive = TRUE so that a missing '~/Downloads/tmp' parent is created
    # as well; without it dir.create() fails, and because warnings are
    # suppressed the failure only surfaces later as a download error.
    dir.create(sprintf('~/Downloads/tmp/%s/', rel), showWarnings = FALSE, recursive = TRUE)
    for (refversion in refversions) {
        for (platform in platforms) {
            # The .rds files are binary, so request binary mode explicitly
            # instead of relying on platform-specific defaults.
            download.file(sprintf('http://zwdzwd.io/InfiniumAnnotation/%s/%s/%s.%s.manifest.rds', rel, platform, platform, refversion),
                          sprintf('~/Downloads/tmp/%s/%s.%s.manifest.rds', rel, platform, refversion),
                          mode = 'wb')
        }
    }
}

manifest.last <- list()
manifest.this <- list()
for (platform in platforms) {
    for (refversion in refversions) {
        id <- paste0(platform,'.',refversion)
        manifest.last[[id]] <- readRDS(sprintf('~/Downloads/tmp/%s/%s.%s.manifest.rds', last_release_date, platform, refversion))
        # Index the current manifest by the element names of the previous one
        # so that both lists hold the same probes in the same order; the
        # element-wise comparisons further down rely on this alignment.
        manifest.this[[id]] <- readRDS(sprintf('~/Downloads/tmp/%s/%s.%s.manifest.rds', release_date, platform, refversion))[names(manifest.last[[id]])]
    }
}

Summary

An extra column gene_HGNC is added.

# Shape of every manifest in both releases: number of annotation columns
# (ncol.*) and number of probes (nrow.*), one row per platform/build.
cbind(
    ncol.last=sapply(manifest.last, function(x) ncol(mcols(x))),
    ncol.this=sapply(manifest.this, function(x) ncol(mcols(x))),
    nrow.this=sapply(manifest.this, length),
    # Fixed: this column used to be computed from manifest.this as well, so
    # it could never reveal a difference in probe count between releases.
    nrow.last=sapply(manifest.last, length))
##            ncol.last ncol.this nrow.this nrow.last
## EPIC.hg19         51        52    865918    865918
## EPIC.hg38         51        52    865918    865918
## hm450.hg19        51        52    485577    485577
## hm450.hg38        51        52    485577    485577
## hm27.hg19         51        52     27578     27578
## hm27.hg38         51        52     27578     27578

Mapping

All the mapping info remains the same as the last version. gene column becomes sorted alphabetically.

# Mapping-related annotation columns: the first 40 columns of the manifest,
# excluding the two gene-name columns.
mapping_columns <- colnames(mcols(manifest.this$hm450.hg19))[1:40]
is_gene_column <- mapping_columns %in% c('gene','gene_HGNC')
mapping_columns <- mapping_columns[!is_gene_column]

# All '<platform>.<refversion>' identifiers, with the platform varying fastest.
class_grid <- expand.grid(c('EPIC','hm450','hm27'), c('hg19','hg38'))
classes <- paste(class_grid[[1]], class_grid[[2]], sep = '.')

# TRUE where the mapping columns are identical between the two releases.
sapply(classes, function(cls) {
    this_mapping <- mcols(manifest.this[[cls]])[mapping_columns]
    last_mapping <- mcols(manifest.last[[cls]])[mapping_columns]
    identical(this_mapping, last_mapping)
})
##  EPIC.hg19 hm450.hg19  hm27.hg19  EPIC.hg38 hm450.hg38  hm27.hg38 
##       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE

gene_HGNC has been added to reflect discrepancies between the gene model (GENCODE v22) and the HGNC recommendation. For each platform, the numbers below count the probes whose gene name differs from the HGNC name.

# Per manifest, count the probes whose 'gene' and 'gene_HGNC' values differ
# (comparisons that evaluate to NA are ignored).
sapply(classes, function(cls) {
    annot <- mcols(manifest.this[[cls]])
    differs <- annot[['gene']] != annot[['gene_HGNC']]
    sum(differs, na.rm = TRUE)
})
##  EPIC.hg19 hm450.hg19  hm27.hg19  EPIC.hg38 hm450.hg38  hm27.hg38 
##      24796      14887       1021      24796      14887       1021

Masking

The biggest change in masking is on hg19: all the categories have been recomputed using hg19 annotations instead of being borrowed from hg38 as in previous versions.

Current

# Names of all masking columns (those containing 'MASK') in the manifest.
annotation_names <- colnames(mcols(manifest.this$hm450.hg19))
mask_columns <- grep('MASK', annotation_names, value=TRUE)

# Number of masked probes per mask column (rows) and manifest (columns)
# in the current release.
kable(sapply(classes, function(cls){
    annot <- mcols(manifest.this[[cls]])
    sapply(mask_columns, function(maskcol) sum(annot[[maskcol]]))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
MASK_mapping 63483 40490 1648 65537 41712 1674
MASK_typeINextBaseSwitch 306 305 29 365 365 28
MASK_rmsk15 196448 82822 3095 200370 84332 3207
MASK_sub40_copy 6921 5343 608 8747 6419 619
MASK_sub35_copy 12701 9498 712 14502 10545 722
MASK_sub30_copy 23428 16546 882 25194 17568 893
MASK_sub25_copy 86533 59706 2699 88173 60654 2709
MASK_snp5_common 65844 32742 878 148496 74764 2643
MASK_snp5_GMAF1p 34554 19618 414 39446 22745 536
MASK_extBase 100 54 0 184 102 0
MASK_general 99360 60466 2136 105454 64144 2263

Last

To contrast, this is what was in the last version. For some columns, hg19 and hg38 were merged. In the current release, they are separated.

# Number of masked probes per mask column (rows) and manifest (columns)
# in the previous release.
kable(sapply(classes, function(cls){
    annot <- mcols(manifest.last[[cls]])
    sapply(mask_columns, function(maskcol) sum(annot[[maskcol]]))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
MASK_mapping 63483 40490 1648 65537 41712 1674
MASK_typeINextBaseSwitch 1051 1030 195 1051 1030 195
MASK_rmsk15 201251 84789 3207 201251 84789 3207
MASK_sub40_copy 8747 6419 619 8747 6419 619
MASK_sub35_copy 14502 10545 722 14502 10545 722
MASK_sub30_copy 25194 17568 893 25194 17568 893
MASK_sub25_copy 88173 60654 2709 88173 60654 2709
MASK_snp5_common 156200 78601 2640 156200 78601 2640
MASK_snp5_GMAF1p 42159 24008 546 42159 24008 546
MASK_extBase 171 105 9 171 105 9
MASK_general 107994 65574 2442 108716 66041 2449

MASK_general

The number has decreased a bit for both hg19 and hg38.

column <- 'MASK_general'
# Masked-probe counts in the previous release, the current release, and in
# both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[[column]]
    masked_this <- mcols(manifest.this[[cls]])[[column]]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 107994 65574 2442 108716 66041 2449
this 99360 60466 2136 105454 64144 2263
overlap 97140 59404 2098 104914 63790 2251

by design, last release

column <- 'MASK_general'
# Fraction of probes of each probe type that are masked in the previous
# release (NaN where a manifest has no probe of that type).
kable(sapply(classes, function(cls){
    annot <- mcols(manifest.last[[cls]])
    sapply(c('cg','ch','rs'), function(probeType) {
        of_type <- annot[['probeType']]==probeType
        sum(annot[[column]] & of_type) / sum(of_type)
    })
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
cg 0.1245042 0.1347101 0.0885488 0.1253408 0.1356782 0.0888027
ch 0.1896317 0.1899062 NaN 0.1896317 0.1899062 NaN
rs 0.0000000 0.0000000 NaN 0.0000000 0.0000000 NaN
by design, this release
column <- 'MASK_general'
# Fraction of probes of each probe type that are masked in the current
# release (NaN where a manifest has no probe of that type).
kable(sapply(classes, function(cls){
    annot <- mcols(manifest.this[[cls]])
    sapply(c('cg','ch','rs'), function(probeType) {
        of_type <- annot[['probeType']]==probeType
        sum(annot[[column]] & of_type) / sum(of_type)
    })
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
cg 0.1145242 0.1241675 0.077453 0.1215676 0.1317584 0.0820582
ch 0.1821282 0.1827887 NaN 0.1875853 0.1879651 NaN
rs 0.0000000 0.0000000 NaN 0.0000000 0.0000000 NaN

SNP

SNP information has been updated using dbSNP build 151. As a consequence, a smaller number of probes are masked in the current release, possibly due to corrections of allele frequencies. hg19 sees a more dramatic reduction because, in previous releases, its masking was borrowed from hg38 (dbSNP b147). It has now been recomputed using the native dbSNP b135 (a downgrade).

# MASK_snp5_common: masked-probe counts in the previous release, the current
# release, and in both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[['MASK_snp5_common']]
    masked_this <- mcols(manifest.this[[cls]])[['MASK_snp5_common']]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 156200 78601 2640 156200 78601 2640
this 65844 32742 878 148496 74764 2643
overlap 59920 29783 728 145465 72550 2401
# MASK_snp5_GMAF1p: masked-probe counts in the previous release, the current
# release, and in both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[['MASK_snp5_GMAF1p']]
    masked_this <- mcols(manifest.this[[cls]])[['MASK_snp5_GMAF1p']]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 42159 24008 546 42159 24008 546
this 34554 19618 414 39446 22745 536
overlap 31344 17781 341 38448 21964 493

Mapping

Mapping remains the same

# MASK_mapping: masked-probe counts in the previous release, the current
# release, and in both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[['MASK_mapping']]
    masked_this <- mcols(manifest.this[[cls]])[['MASK_mapping']]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 63483 40490 1648 65537 41712 1674
this 63483 40490 1648 65537 41712 1674
overlap 63483 40490 1648 65537 41712 1674

Type I Next Base Switch

Type-I Color Channel Switch probes have been reduced due to the use of new dbSNP (b151) and allele frequency (1%).

column <- 'MASK_typeINextBaseSwitch'
# Masked-probe counts in the previous release, the current release, and in
# both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[[column]]
    masked_this <- mcols(manifest.this[[cls]])[[column]]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 1051 1030 195 1051 1030 195
this 306 305 29 365 365 28
overlap 288 288 28 359 361 28

Repeat Masker

Although not incorporated into ‘MASK_general’, this has been updated for hg19. There are some minor updates in hg38 too, but their source is not entirely clear because no record describes how the original rmsk masking was produced.

column <- 'MASK_rmsk15'
# Masked-probe counts in the previous release, the current release, and in
# both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[[column]]
    masked_this <- mcols(manifest.this[[cls]])[[column]]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 201251 84789 3207 201251 84789 3207
this 196448 82822 3095 200370 84332 3207
overlap 186599 76752 2822 199682 84069 3207

Cross-hybridization from probe subsequence copy number

This has been recomputed for hg19. In the previous version, hg19 copied the hg38 masking. The hg38 masking itself is unchanged.

column <- 'MASK_sub40_copy'
# Masked-probe counts in the previous release, the current release, and in
# both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[[column]]
    masked_this <- mcols(manifest.this[[cls]])[[column]]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 8747 6419 619 8747 6419 619
this 6921 5343 608 8747 6419 619
overlap 6855 5289 595 8747 6419 619
column <- 'MASK_sub35_copy'
# Masked-probe counts in the previous release, the current release, and in
# both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[[column]]
    masked_this <- mcols(manifest.this[[cls]])[[column]]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 14502 10545 722 14502 10545 722
this 12701 9498 712 14502 10545 722
overlap 12626 9437 699 14502 10545 722
column <- 'MASK_sub30_copy'
# Masked-probe counts in the previous release, the current release, and in
# both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[[column]]
    masked_this <- mcols(manifest.this[[cls]])[[column]]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 25194 17568 893 25194 17568 893
this 23428 16546 882 25194 17568 893
overlap 23342 16472 870 25194 17568 893
column <- 'MASK_sub25_copy'
# Masked-probe counts in the previous release, the current release, and in
# both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[[column]]
    masked_this <- mcols(manifest.this[[cls]])[[column]]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 88173 60654 2709 88173 60654 2709
this 86533 59706 2699 88173 60654 2709
overlap 86404 59611 2687 88173 60654 2709

Extension Base

Extension base masking has been reduced a bit as well. As above, hg19 no longer “borrows” its masking from hg38, which accounts for most of the reduction.

column <- 'MASK_extBase'
# Masked-probe counts in the previous release, the current release, and in
# both at once.
kable(sapply(classes, function(cls){
    masked_last <- mcols(manifest.last[[cls]])[[column]]
    masked_this <- mcols(manifest.this[[cls]])[[column]]
    c(last=sum(masked_last),
      this=sum(masked_this),
      overlap=sum(masked_last & masked_this))
}))
EPIC.hg19 hm450.hg19 hm27.hg19 EPIC.hg38 hm450.hg38 hm27.hg38
last 171 105 9 171 105 9
this 100 54 0 184 102 0
overlap 100 54 0 158 85 0
# Per platform, number of probes flagged by `column` in the current release
# on at least one of the two genome builds.
sapply(c('EPIC','hm450','hm27'), function(platform){
    masked_hg19 <- mcols(manifest.this[[paste0(platform,'.hg19')]])[[column]]
    masked_hg38 <- mcols(manifest.this[[paste0(platform,'.hg38')]])[[column]]
    sum(masked_hg19 | masked_hg38)
})
##  EPIC hm450  hm27 
##   280   156     0

The 26 probes for which MASK_extBase is newly turned on in this release were already masked by MASK_mapping in the previous release (redundant masking in some sense).

x <- 'EPIC.hg38'
column <- 'MASK_extBase'
# Probes flagged by `column` in the current release but not in the previous one.
was_masked <- mcols(manifest.last[[x]])[[column]]
is_masked <- mcols(manifest.this[[x]])[[column]]
xx <- !was_masked & is_masked
sum(xx)
## [1] 26
# How many of those probes carried MASK_mapping in the previous release.
sum(mcols(manifest.last[[x]][xx])[['MASK_mapping']])
## [1] 26

The 13 probes for which MASK_extBase is turned off in this release are also covered by MASK_mapping, so they remain masked in MASK_general.

# Probes flagged by `column` in the previous release but no longer in the
# current one (`x` and `column` are still set from the preceding chunk).
was_masked <- mcols(manifest.last[[x]])[[column]]
is_masked <- mcols(manifest.this[[x]])[[column]]
xx <- was_masked & !is_masked
sum(xx)
## [1] 13
# How many of those probes carry MASK_mapping in the current release.
sum(mcols(manifest.this[[x]][xx])[['MASK_mapping']])
## [1] 13