This is evaluated using HM450-WGBS correlation. Note that the new masking has not been incorporated into GDC as of release 12.0 yet.
#library(sesameData)
#mask.tcga <- sesameDataGet('HM450.probeInfo')$mask.tcga
#data.frame(LegacyMask=mask.tcga) %>% write_tsv(file.path(outputdir, 'HM450_TCGA_Legacy_Experiment_Independent_Mask.tsv'))
#mask.tcga %>% length # 89512
The list of experiment-independent masking is available at https://raw.githubusercontent.com/zwdzwd/GDC_DNA_methylation_QC/master/output/HM450_TCGA_Legacy_Experiment_Independent_Mask.tsv
betas.hm450 <- readRDS(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/WGBS_vs_HM450/betas.hm450.rds')))
betas.WGBS10 <- readRDS(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/WGBS_vs_HM450/betas.WGBS10.rds')))
hm450.manifest <- list(
hg19 = readRDS(gzcon(url('http://zwdzwd.io/InfiniumAnnotation/current/hm450/hm450.hg19.manifest.rds'))),
hg38 = readRDS(gzcon(url('http://zwdzwd.io/InfiniumAnnotation/current/hm450/hm450.hg38.manifest.rds'))))
msk_cor <- function(msk) {
msk <- intersect(msk, rownames(betas.hm450))
aa <- (!is.na(betas.hm450[msk,])) & (!is.na(betas.WGBS10[msk,]))
cor(betas.hm450[msk,][aa], betas.WGBS10[msk,][aa])
}
msk_snp_10_hg19 <- read_tsv('/secondary/projects/laird/projects/2016_05_13_hm450_annotation/SNPmasking_hg19_10bp.bed', col_names = 'probe')$probe
msk_snp_10_hg38 <- read_tsv('/secondary/projects/laird/projects/2016_05_13_hm450_annotation/SNPmasking_hg38_10bp.bed', col_names = 'probe')$probe
refversions <- c('hg19','hg38')
msk_names <- c('Not_masked','MASK_general','MASK_rmsk15','MASK_mapping','MASK_sub30_copy','MASK_snp5_GMAF1p','MASK_typeINextBaseSwitch','MASK_extBase', 'MASK_snp10_GMAF1p')
msks <- lapply(refversions, function(refversion) {
mft <- hm450.manifest[[refversion]]
msks1 <- lapply(msk_names, function(msk_name) {
if (msk_name == 'Not_masked') {
msk <- names(mft[mcols(mft)[['MASK_general']]])
msk <- rownames(betas.hm450) %>% setdiff(msk)
} else if (msk_name == 'MASK_snp10_GMAF1p') {
if (refversion=='hg19') {
msk <- msk_snp_10_hg19
} else {
msk <- msk_snp_10_hg38
}
} else {
msk <- names(mft[mcols(mft)[[msk_name]]])
}
grep('cg',msk,value=TRUE) # CpG-only
})
names(msks1) <- msk_names
msks1
})
names(msks) <- refversions
msks[['hg19&hg38']] <- list()
for (msk_name in msk_names) {
msks[['hg19&hg38']][[msk_name]] <- intersect(msks[['hg19']][[msk_name]], msks[['hg38']][[msk_name]])
}
refversions <- c(refversions, 'hg19&hg38')
msk_correlations <- t(do.call(rbind, lapply(refversions, function(refversion) {
sapply(msk_names, function(msk_name) {
msk <- msks[[refversion]][[msk_name]]
c(length(msk), msk_cor(msk))
})
})))
colnames(msk_correlations) <- c('hg19 (count)', 'hg19 (correlation)', 'hg38 (count)', 'hg38 (correlation)', 'overlap (count)', 'overlap (correlation)')
# msk_legacy_snp <- read_tsv(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM450_TCGA_Legacy_Experiment_Independent_Mask_SNP.tsv.gz')), col_names = 'probe')$probe
# msk_legacy_snp_hg38 <- read_tsv(gzcon(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM450_TCGA_Legacy_Experiment_Independent_Mask_SNP_hg38.tsv.gz')), col_names = 'probe')$probe
sum(mcols(hm450.manifest[['hg19']])[['MASK_general']] & mcols(hm450.manifest[['hg38']][names(hm450.manifest[['hg19']])])[['MASK_general']])
## [1] 59239
msk_correlations %>% as.data.frame %>% mutate(category=rownames(msk_correlations)) %>% arrange(`hg19 (correlation)`) %>% kable
| hg19 (count) | hg19 (correlation) | hg38 (count) | hg38 (correlation) | overlap (count) | overlap (correlation) | category |
|---|---|---|---|---|---|---|
| 16225 | 0.8029413 | 17246 | 0.8118368 | 16151 | 0.8027408 | MASK_sub30_copy |
| 53 | 0.8046336 | 101 | 0.8068102 | 46 | 0.8173936 | MASK_extBase |
| 40082 | 0.8576264 | 41302 | 0.8599037 | 40037 | 0.8576127 | MASK_mapping |
| 59901 | 0.8799790 | 63563 | 0.8837210 | 58676 | 0.8789631 | MASK_general |
| 81211 | 0.9117868 | 82705 | 0.9127543 | 75331 | 0.9096676 | MASK_rmsk15 |
| 305 | 0.9200702 | 365 | 0.9243109 | 279 | 0.9188774 | MASK_typeINextBaseSwitch |
| 19563 | 0.9215415 | 22670 | 0.9220932 | 17840 | 0.9221812 | MASK_snp5_GMAF1p |
| 26257 | 0.9274129 | 30603 | 0.9278802 | 24001 | 0.9279800 | MASK_snp10_GMAF1p |
| 422520 | 0.9520003 | 418858 | 0.9520903 | 417633 | 0.9521496 | Not_masked |
27773 + 52268
## [1] 80041
msk_legacy_all <- read_tsv(url('https://github.com/zwdzwd/GDC_DNA_methylation_QC/raw/master/output/HM450_TCGA_Legacy_Experiment_Independent_Mask.tsv'))$LegacyMask
msk_new_all <- names(hm450.manifest$hg38[hm450.manifest$hg38$MASK_general])
length(msk_legacy_all)
## [1] 89512
length(msk_new_all)
## [1] 64144
## CpG only
msk_legacy <- grep('cg',msk_legacy_all,value=TRUE)
msk_new <- grep('cg',msk_new_all,value=TRUE)
length(msk_legacy)
## [1] 88058
length(msk_new)
## [1] 63563
msk_groups <- list(
msk_neither = (rownames(betas.hm450) %>% setdiff(msk_legacy) %>% setdiff(msk_new)),
msk_not_legacy = (rownames(betas.hm450) %>% setdiff(msk_legacy)),
msk_not_new = (rownames(betas.hm450) %>% setdiff(msk_new)),
msk_legacy = msk_legacy,
msk_legacy_only = msk_legacy %>% setdiff(msk_new),
msk_new = msk_new,
msk_new_only = msk_new %>% setdiff(msk_legacy),
msk_both = intersect(msk_legacy, msk_new))
df <- data.frame(
group=factor(names(msk_groups), levels = names(msk_groups),
labels = c(
'Neither',
'Not Legacy',
'Not New',
'Legacy',
'Legacy Only',
'New',
'New Only',
'Both')),
count=sapply(msk_groups, length),
corr=sapply(msk_groups, msk_cor),
source <- c('not masked','not masked','not masked','legacy','legacy','new','new','both'))
colors <- c('not masked'='grey', legacy='#FFC000', new='#92D050', both='black')
# number of cg probes
hm450.manifest$hg38$probeType %>% table
## .
## cg ch rs
## 482421 3091 65
# fraction of changed probes
(52268+27773) / 482421
## [1] 0.1659152
# percentage of usable probe increase
((482421-63563) - (482421-88058)) / (482421-88058)
## [1] 0.06211282
ggplot(df) + geom_bar(aes(group, count, fill=source), color='black', stat='identity') + geom_text(aes(group, count, label=count), vjust=-0.5) + xlab('Masking Status') + ylab('Number of CpGs') + scale_fill_manual(values=colors) + theme_bw()
ggplot(df) + geom_bar(aes(group, corr, fill=source), color='black', stat='identity') + geom_text(aes(group, corr, label=sprintf('%1.3f',corr)), vjust=-0.5) + scale_y_continuous(limits=c(0.7,1.0), oob=rescale_none) + xlab('Masking Status') + ylab('Correlation - HM450 vs WGBS') + scale_fill_manual(values=colors) + theme_bw()