Legacy vs new masking

This is evaluated using HM450-WGBS correlation. Note that the new masking has not been incorporated into GDC as of release 12.0 yet.

HM450

#library(sesameData)
#mask.tcga <- sesameDataGet('HM450.probeInfo')$mask.tcga
#data.frame(LegacyMask=mask.tcga) %>% write_tsv(file.path(outputdir, 'HM450_TCGA_Legacy_Experiment_Independent_Mask.tsv'))
#mask.tcga %>% length # 89512

The list of experiment-independent masking is available at https://raw.githubusercontent.com/zwdzwd/GDC_DNA_methylation_QC/master/output/HM450_TCGA_Legacy_Experiment_Independent_Mask.tsv

Load data and function to calculate HM450-WGBS correlation

betas.hm450 <- readRDS(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/WGBS_vs_HM450/betas.hm450.rds')))
betas.WGBS10 <- readRDS(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/WGBS_vs_HM450/betas.WGBS10.rds')))

hm450.manifest <- list(
    hg19 = readRDS(gzcon(url('http://zwdzwd.io/InfiniumAnnotation/current/hm450/hm450.hg19.manifest.rds'))),
    hg38 = readRDS(gzcon(url('http://zwdzwd.io/InfiniumAnnotation/current/hm450/hm450.hg38.manifest.rds'))))

msk_cor <- function(msk) {
    msk <- intersect(msk, rownames(betas.hm450))
    aa <- (!is.na(betas.hm450[msk,])) & (!is.na(betas.WGBS10[msk,]))
    cor(betas.hm450[msk,][aa], betas.WGBS10[msk,][aa])
}

By Masking Criterion

msk_snp_10_hg19 <- read_tsv('/secondary/projects/laird/projects/2016_05_13_hm450_annotation/SNPmasking_hg19_10bp.bed', col_names = 'probe')$probe
msk_snp_10_hg38 <- read_tsv('/secondary/projects/laird/projects/2016_05_13_hm450_annotation/SNPmasking_hg38_10bp.bed', col_names = 'probe')$probe

refversions <- c('hg19','hg38')
msk_names <- c('Not_masked','MASK_general','MASK_rmsk15','MASK_mapping','MASK_sub30_copy','MASK_snp5_GMAF1p','MASK_typeINextBaseSwitch','MASK_extBase', 'MASK_snp10_GMAF1p')
msks <- lapply(refversions, function(refversion) {
    mft <- hm450.manifest[[refversion]]
    msks1 <- lapply(msk_names, function(msk_name) {
        if (msk_name == 'Not_masked') {
            msk <- names(mft[mcols(mft)[['MASK_general']]])
            msk <- rownames(betas.hm450) %>% setdiff(msk)
        } else if (msk_name == 'MASK_snp10_GMAF1p') {
            if (refversion=='hg19') {
                msk <- msk_snp_10_hg19
            } else {
                msk <- msk_snp_10_hg38
            }
        } else {
            msk <- names(mft[mcols(mft)[[msk_name]]])
        }
        grep('cg',msk,value=TRUE) # CpG-only
    })
    names(msks1) <- msk_names
    msks1
})
names(msks) <- refversions

msks[['hg19&hg38']] <- list()
for (msk_name in msk_names) {
    msks[['hg19&hg38']][[msk_name]] <- intersect(msks[['hg19']][[msk_name]], msks[['hg38']][[msk_name]])
}

refversions <- c(refversions, 'hg19&hg38')
msk_correlations <- t(do.call(rbind, lapply(refversions, function(refversion) {
    sapply(msk_names, function(msk_name) {
        msk <- msks[[refversion]][[msk_name]]
        c(length(msk), msk_cor(msk))
    })
})))

colnames(msk_correlations) <- c('hg19 (count)', 'hg19 (correlation)', 'hg38 (count)', 'hg38 (correlation)', 'overlap (count)', 'overlap (correlation)')

# msk_legacy_snp <- read_tsv(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM450_TCGA_Legacy_Experiment_Independent_Mask_SNP.tsv.gz')), col_names = 'probe')$probe
# msk_legacy_snp_hg38 <- read_tsv(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM450_TCGA_Legacy_Experiment_Independent_Mask_SNP_hg38.tsv.gz')), col_names = 'probe')$probe

sum(mcols(hm450.manifest[['hg19']])[['MASK_general']] & mcols(hm450.manifest[['hg38']][names(hm450.manifest[['hg19']])])[['MASK_general']])
## [1] 59239
msk_correlations %>% as.data.frame %>% mutate(category=rownames(msk_correlations)) %>% arrange(`hg19 (correlation)`) %>% kable
hg19 (count) hg19 (correlation) hg38 (count) hg38 (correlation) overlap (count) overlap (correlation) category
16225 0.8029413 17246 0.8118368 16151 0.8027408 MASK_sub30_copy
53 0.8046336 101 0.8068102 46 0.8173936 MASK_extBase
40082 0.8576264 41302 0.8599037 40037 0.8576127 MASK_mapping
59901 0.8799790 63563 0.8837210 58676 0.8789631 MASK_general
81211 0.9117868 82705 0.9127543 75331 0.9096676 MASK_rmsk15
305 0.9200702 365 0.9243109 279 0.9188774 MASK_typeINextBaseSwitch
19563 0.9215415 22670 0.9220932 17840 0.9221812 MASK_snp5_GMAF1p
26257 0.9274129 30603 0.9278802 24001 0.9279800 MASK_snp10_GMAF1p
422520 0.9520003 418858 0.9520903 417633 0.9521496 Not_masked
27773 + 52268
## [1] 80041

Legacy Masking vs New Masking

msk_legacy_all <- read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM450_TCGA_Legacy_Experiment_Independent_Mask.tsv'))$LegacyMask
msk_new_all <- names(hm450.manifest$hg38[hm450.manifest$hg38$MASK_general])

length(msk_legacy_all)
## [1] 89512
length(msk_new_all)
## [1] 64144
## CpG only
msk_legacy <- grep('cg',msk_legacy_all,value=TRUE)
msk_new <- grep('cg',msk_new_all,value=TRUE)

length(msk_legacy)
## [1] 88058
length(msk_new)
## [1] 63563
msk_groups <- list(
    msk_neither = (rownames(betas.hm450) %>% setdiff(msk_legacy) %>% setdiff(msk_new)),
    msk_not_legacy = (rownames(betas.hm450) %>% setdiff(msk_legacy)),
    msk_not_new = (rownames(betas.hm450) %>% setdiff(msk_new)),
    msk_legacy = msk_legacy,
    msk_legacy_only = msk_legacy %>% setdiff(msk_new),
    msk_new = msk_new,
    msk_new_only = msk_new %>% setdiff(msk_legacy),
    msk_both = intersect(msk_legacy, msk_new))

df <- data.frame(
    group=factor(names(msk_groups), levels = names(msk_groups),
                 labels = c(
                     'Neither', 
                     'Not Legacy',
                     'Not New',
                     'Legacy',
                     'Legacy Only',
                     'New',
                     'New Only',
                     'Both')),
    count=sapply(msk_groups, length),
    corr=sapply(msk_groups, msk_cor),
    source <- c('not masked','not masked','not masked','legacy','legacy','new','new','both'))

colors <- c('not masked'='grey', legacy='#FFC000', new='#92D050', both='black')

# number of cg probes
hm450.manifest$hg38$probeType %>% table
## .
##     cg     ch     rs 
## 482421   3091     65
# fraction of changed probes
(52268+27773) / 482421
## [1] 0.1659152
# percentage of usable probe increase
((482421-63563) - (482421-88058)) / (482421-88058)
## [1] 0.06211282
ggplot(df) + geom_bar(aes(group, count, fill=source), color='black', stat='identity') + geom_text(aes(group, count, label=count), vjust=-0.5) + xlab('Masking Status') + ylab('Number of CpGs') + scale_fill_manual(values=colors) + theme_bw()

ggplot(df) + geom_bar(aes(group, corr, fill=source), color='black', stat='identity') + geom_text(aes(group, corr, label=sprintf('%1.3f',corr)), vjust=-0.5) + scale_y_continuous(limits=c(0.7,1.0), oob=rescale_none) + xlab('Masking Status') + ylab('Correlation - HM450 vs WGBS') + scale_fill_manual(values=colors) + theme_bw()