# Fetch the HM450 manifests (hg19 and hg38) of the 20180808 and 20180304
# releases into a local cache directory, then load them.
manifest_url <- 'http://zwdzwd.io/InfiniumAnnotation'
cache_dir <- '~/Downloads/tmp'
for (release in c('20180808', '20180304')) {
  # recursive = TRUE also creates ~/Downloads/tmp when it is missing; without
  # it dir.create() fails (silently, because of showWarnings = FALSE) and
  # every download below errors out.
  dir.create(file.path(cache_dir, release), showWarnings = FALSE,
             recursive = TRUE)
  for (build in c('hg38', 'hg19')) {
    rds <- sprintf('hm450.%s.manifest.rds', build)
    # .rds files are binary: request a binary transfer explicitly.
    download.file(paste(manifest_url, release, 'hm450', rds, sep = '/'),
                  file.path(cache_dir, release, rds), mode = 'wb')
  }
}

HM450.hg38.manifest.0304 <- readRDS(
  file.path(cache_dir, '20180304', 'hm450.hg38.manifest.rds'))
HM450.hg19.manifest.0304 <- readRDS(
  file.path(cache_dir, '20180304', 'hm450.hg19.manifest.rds'))
# Index the new release by the old release's probe names so that both
# releases list the probes in the same order and can be compared element-wise.
HM450.hg38.manifest.0808 <- readRDS(
  file.path(cache_dir, '20180808', 'hm450.hg38.manifest.rds')
)[names(HM450.hg38.manifest.0304)]
HM450.hg19.manifest.0808 <- readRDS(
  file.path(cache_dir, '20180808', 'hm450.hg19.manifest.rds')
)[names(HM450.hg19.manifest.0304)]
There are 46 columns (annotations) and 485577 rows (probes) in the old manifest. There are 51 columns (annotations) and 485577 rows (probes) in the new manifest.
The new manifest
# Cross-tabulate flag_A of the type-II probes in the 20180808 release:
# hg19 values on the rows, hg38 values on the columns.
is_type2_hg19 <- HM450.hg19.manifest.0808$designType == 'II'
is_type2_hg38 <- HM450.hg38.manifest.0808$designType == 'II'
strands <- table(
  HM450.hg19.manifest.0808[is_type2_hg19]$flag_A,
  HM450.hg38.manifest.0808[is_type2_hg38]$flag_A
)
# Swap the 2nd and 3rd columns so they line up with the labels set below.
strands <- strands[, c(1, 3, 2)]
colnames(strands) <- c('Watson GRCh38', 'Crick GRCh38', 'unmapped GRCh38')
rownames(strands) <- c('Watson GRCh37', 'Crick GRCh37')
kable(strands)
| | Watson GRCh38 | Crick GRCh38 | unmapped GRCh38 |
|---|---|---|---|
| Watson GRCh37 | 88115 | 87001 | 11 |
| Crick GRCh37 | 86987 | 87956 | 6 |
The old manifest
# Cross-tabulate flag_A of the type-II probes in the 20180304 release:
# hg19 values on the rows, hg38 values on the columns.
is_type2_hg19 <- HM450.hg19.manifest.0304$designType == 'II'
is_type2_hg38 <- HM450.hg38.manifest.0304$designType == 'II'
strands <- table(
  HM450.hg19.manifest.0304[is_type2_hg19]$flag_A,
  HM450.hg38.manifest.0304[is_type2_hg38]$flag_A
)
# Swap the 2nd and 3rd columns so they line up with the labels set below.
strands <- strands[, c(1, 3, 2)]
colnames(strands) <- c('Watson GRCh38', 'Crick GRCh38', 'unmapped GRCh38')
rownames(strands) <- c('Watson GRCh37', 'Crick GRCh37')
kable(strands)
| | Watson GRCh38 | Crick GRCh38 | unmapped GRCh38 |
|---|---|---|---|
| Watson GRCh37 | 88115 | 87001 | 11 |
| Crick GRCh37 | 86987 | 87956 | 6 |
The biggest change in this release is that MASK_general now differs between hg19 and hg38. In all prior releases, the MASK_general sets from hg19 and hg38 were merged, and the merged set was recorded in both the hg19 version and the hg38 version. In the current release, this merging is abandoned.
# Count, for each MASK_* annotation, how many probes are flagged in each
# genome build and release.
# The list of MASK_* columns is taken once (from the hg19 20180304 manifest)
# and reused for all four manifests, so rbind() always receives identical
# columns even if a release carries additional MASK_* annotations.
masking_cols <- grep('MASK_', colnames(mcols(HM450.hg19.manifest.0304)),
                     value = TRUE)
manifests <- list(
  hg19_0304 = HM450.hg19.manifest.0304,
  hg38_0304 = HM450.hg38.manifest.0304,
  hg19_0808 = HM450.hg19.manifest.0808,
  hg38_0808 = HM450.hg38.manifest.0808
)
df <- do.call(rbind, lapply(manifests, function(manifest) {
  as.data.frame(mcols(manifest))[, masking_cols, drop = FALSE] %>%
    summarise_all(sum)
}))
rownames(df) <- names(manifests)
# Transpose so that masks are rows and build/release combinations are columns.
kable(t(df))
| | hg19_0304 | hg38_0304 | hg19_0808 | hg38_0808 |
|---|---|---|---|---|
| MASK_mapping | 40517 | 41446 | 40490 | 41712 |
| MASK_typeINextBaseSwitch | 1030 | 1030 | 1030 | 1030 |
| MASK_rmsk15 | 84789 | 84789 | 84789 | 84789 |
| MASK_sub35_copy | 10545 | 10545 | 10545 | 10545 |
| MASK_sub30_copy | 17568 | 17568 | 17568 | 17568 |
| MASK_sub25_copy | 60654 | 60654 | 60654 | 60654 |
| MASK_sub40_copy | 6419 | 6419 | 6419 | 6419 |
| MASK_snp5_common | 78601 | 78601 | 78601 | 78601 |
| MASK_snp5_GMAF1p | 24008 | 24008 | 24008 | 24008 |
| MASK_extBase | 105 | 105 | 105 | 105 |
| MASK_general | 65894 | 65894 | 65574 | 66041 |
Another difference is that we now mask slightly more probes in MASK_mapping, based on the NM_A and NM_B tags.
# Probes flagged by MASK_mapping in the 20180808 hg38 manifest but not in the
# 20180304 hg38 manifest.
newly_masked <- HM450.hg38.manifest.0808$MASK_mapping &
  !HM450.hg38.manifest.0304$MASK_mapping
df <- mcols(HM450.hg38.manifest.0808[newly_masked])
df$probeID <- rownames(df)
# Show the NM columns of the ten probes with the largest NM_A.
df %>%
  as.data.frame %>%
  dplyr::select(probeID, contains('NM')) %>%
  arrange(desc(NM_A)) %>%
  head(10)
## probeID NM_A NM_B wDecoy_NM_A wDecoy_NM_B
## 1 cg04926385 4 NA 4 NA
## 2 cg03982074 3 NA 3 NA
## 3 cg10615091 3 NA 3 NA
## 4 cg10747483 3 3 3 3
## 5 cg22619763 2 2 2 2
## 6 cg10811045 2 2 2 2
## 7 cg05124441 2 2 2 2
## 8 cg20688289 2 NA 2 NA
## 9 cg05302531 2 2 2 2
## 10 cg10259889 2 2 2 2
# Tabulate, per probe, the larger of NM_A and NM_B; a missing NM_B counts as 0.
nm_b_filled <- ifelse(is.na(df$NM_B), 0, df$NM_B)
table(pmax(df$NM_A, nm_b_filled))
##
## 1 2 3 4
## 241 21 3 1
On hg19, all of the above probes were mapped without issue:
# Look up the same probes in the hg19 manifest of the 20180808 release and
# show the NM columns of the first ten.
hg19_annot <- mcols(HM450.hg19.manifest.0808)[df$probeID, ]
hg19_annot %>%
  as.data.frame %>%
  dplyr::select(contains('NM')) %>%
  head(10)
## NM_A NM_B wDecoy_NM_A wDecoy_NM_B
## cg24602020 0 NA 0 NA
## cg23887839 0 NA 0 NA
## cg16047567 0 NA 0 NA
## cg01097325 0 NA 0 NA
## cg12639866 0 NA 0 NA
## cg11526198 0 NA 0 NA
## cg19266779 0 0 0 0
## cg21873275 0 0 0 0
## cg15067806 0 0 0 0
## cg21404935 0 0 0 0
In fact, under hg38 many probes are mapped to interrogate the same CpG. Usually all but one of the probes sharing a CpG are mapped with mismatches (NM > 0).
# Find neighbouring manifest entries that target the same CpG, i.e. that have
# the same start coordinate AND lie on the same sequence. Comparing start()
# alone would also pair neighbours on different chromosomes that merely share
# a coordinate. idx holds the position of the first entry of each such pair.
cgBeg <- start(HM450.hg38.manifest.0808)
cgChr <- as.character(seqnames(HM450.hg38.manifest.0808))
n_probes <- length(cgBeg)
idx <- which(cgBeg[-1] == cgBeg[-n_probes] & cgChr[-1] == cgChr[-n_probes])
length(idx)
## [1] 63
# List every probe that shares its CpG with a neighbouring entry: each
# position in idx plus the entry right after it.
# In a run of three or more probes the same position occurs in both idx and
# idx + 1. unique() removes those repeats up front, rather than selecting the
# rows twice and then filtering on a '.1' pattern in the row names (presumably
# the suffix added to duplicated row names), which would also discard any
# genuine probe ID that contains '.1'.
df <- HM450.hg38.manifest.0808[sort(unique(c(idx, idx + 1)))] %>%
  mcols %>%
  as.data.frame %>%
  mutate(probeID = rownames(.)) %>%
  dplyr::select(probeID, chrm_A, probeBeg, probeEnd, NM_A, NM_B, designType)
nrow(df)
## [1] 97
# Print the probes selected above (entries whose start coordinate equals that
# of a neighbouring entry in the hg38 manifest).
df
## probeID chrm_A probeBeg probeEnd NM_A NM_B designType
## 1 cg16673286 chr1 1713347 1713396 2 NA II
## 2 cg23611477 chr1 1713347 1713396 0 NA II
## 3 cg20017108 chr1 13410326 13410375 1 NA II
## 4 cg16566605 chr1 13410326 13410375 0 NA II
## 5 cg10240587 chr1 13410761 13410810 2 NA II
## 6 cg21410132 chr1 13410761 13410810 0 NA II
## 7 cg26329816 chr1 120850952 120851001 1 NA II
## 8 cg09137068 chr1 120850952 120851001 0 NA II
## 9 cg06441514 chr1 144420861 144420910 1 2 I
## 10 cg13552272 chr1 144420862 144420911 0 NA II
## 11 cg20294457 chr1 148152232 148152281 1 NA II
## 12 cg10127479 chr1 148152231 148152280 0 0 I
## 13 cg03170665 chr2 113575870 113575919 0 NA II
## 14 cg26523196 chr2 113575870 113575919 1 NA II
## 15 cg13912721 chr3 75785760 75785809 1 NA II
## 16 cg04092883 chr3 75785760 75785809 1 NA II
## 17 cg11938051 chr3 75785760 75785809 0 NA II
## 18 cg22509113 chr4 4226917 4226966 1 1 I
## 19 cg05936219 chr4 4226917 4226966 0 0 I
## 20 cg02334660 chr4 74446767 74446816 0 NA II
## 21 cg26611070 chr4 74446767 74446816 1 NA II
## 22 cg25593776 chr8 144147800 144147849 0 0 I
## 23 cg13177830 chr8 144147800 144147849 1 1 I
## 24 cg26712673 chr8 144277781 144277830 0 0 I
## 25 cg04746089 chr8 144277830 144277879 0 NA II
## 26 cg10796141 chr9 40992232 40992281 1 NA II
## 27 cg25976845 chr9 40992232 40992281 0 NA II
## 28 cg06230664 chr9 41074402 41074451 1 NA II
## [ reached getOption("max.print") -- omitted 69 rows ]
# Fetch the EPIC manifests (hg19 and hg38) of the 20180808 and 20180304
# releases into a local cache directory, then load them.
manifest_url <- 'http://zwdzwd.io/InfiniumAnnotation'
cache_dir <- '~/Downloads/tmp'
for (release in c('20180808', '20180304')) {
  # recursive = TRUE also creates ~/Downloads/tmp when it is missing; without
  # it dir.create() fails (silently, because of showWarnings = FALSE) and
  # every download below errors out.
  dir.create(file.path(cache_dir, release), showWarnings = FALSE,
             recursive = TRUE)
  for (build in c('hg38', 'hg19')) {
    rds <- sprintf('EPIC.%s.manifest.rds', build)
    # .rds files are binary: request a binary transfer explicitly.
    download.file(paste(manifest_url, release, 'EPIC', rds, sep = '/'),
                  file.path(cache_dir, release, rds), mode = 'wb')
  }
}

EPIC.hg38.manifest.0304 <- readRDS(
  file.path(cache_dir, '20180304', 'EPIC.hg38.manifest.rds'))
EPIC.hg19.manifest.0304 <- readRDS(
  file.path(cache_dir, '20180304', 'EPIC.hg19.manifest.rds'))
# Index the new release by the old release's probe names so that both
# releases list the probes in the same order and can be compared element-wise.
EPIC.hg38.manifest.0808 <- readRDS(
  file.path(cache_dir, '20180808', 'EPIC.hg38.manifest.rds')
)[names(EPIC.hg38.manifest.0304)]
EPIC.hg19.manifest.0808 <- readRDS(
  file.path(cache_dir, '20180808', 'EPIC.hg19.manifest.rds')
)[names(EPIC.hg19.manifest.0304)]
There are 46 columns (annotations) and 865918 rows (probes) in the old manifest. There are 51 columns (annotations) and 865918 rows (probes) in the new manifest.
The new manifest
# Cross-tabulate flag_A of the type-II probes in the 20180808 release:
# hg19 values on the rows, hg38 values on the columns.
is_type2_hg19 <- EPIC.hg19.manifest.0808$designType == 'II'
is_type2_hg38 <- EPIC.hg38.manifest.0808$designType == 'II'
strands <- table(
  EPIC.hg19.manifest.0808[is_type2_hg19]$flag_A,
  EPIC.hg38.manifest.0808[is_type2_hg38]$flag_A
)
# Swap the 2nd and 3rd columns so they line up with the labels set below.
strands <- strands[, c(1, 3, 2)]
colnames(strands) <- c('Watson GRCh38', 'Crick GRCh38', 'unmapped GRCh38')
rownames(strands) <- c('Watson GRCh37', 'Crick GRCh37')
kable(strands)
| | Watson GRCh38 | Crick GRCh38 | unmapped GRCh38 |
|---|---|---|---|
| Watson GRCh37 | 181681 | 180554 | 16 |
| Crick GRCh37 | 180519 | 180980 | 10 |
The old manifest
# Cross-tabulate flag_A of the type-II probes in the 20180304 release:
# hg19 values on the rows, hg38 values on the columns.
is_type2_hg19 <- EPIC.hg19.manifest.0304$designType == 'II'
is_type2_hg38 <- EPIC.hg38.manifest.0304$designType == 'II'
strands <- table(
  EPIC.hg19.manifest.0304[is_type2_hg19]$flag_A,
  EPIC.hg38.manifest.0304[is_type2_hg38]$flag_A
)
# Swap the 2nd and 3rd columns so they line up with the labels set below.
strands <- strands[, c(1, 3, 2)]
colnames(strands) <- c('Watson GRCh38', 'Crick GRCh38', 'unmapped GRCh38')
rownames(strands) <- c('Watson GRCh37', 'Crick GRCh37')
kable(strands)
| | Watson GRCh38 | Crick GRCh38 | unmapped GRCh38 |
|---|---|---|---|
| Watson GRCh37 | 181681 | 180554 | 16 |
| Crick GRCh37 | 180519 | 180980 | 10 |
The biggest change in this release is that MASK_general now differs between hg19 and hg38. In all prior releases, the MASK_general sets from hg19 and hg38 were merged, and the merged set was recorded in both the hg19 version and the hg38 version. In the current release, this merging is abandoned.
# Count, for each MASK_* annotation, how many probes are flagged in each
# genome build and release.
# The list of MASK_* columns is taken once (from the hg19 20180304 manifest)
# and reused for all four manifests, so rbind() always receives identical
# columns even if a release carries additional MASK_* annotations.
masking_cols <- grep('MASK_', colnames(mcols(EPIC.hg19.manifest.0304)),
                     value = TRUE)
manifests <- list(
  hg19_0304 = EPIC.hg19.manifest.0304,
  hg38_0304 = EPIC.hg38.manifest.0304,
  hg19_0808 = EPIC.hg19.manifest.0808,
  hg38_0808 = EPIC.hg38.manifest.0808
)
df <- do.call(rbind, lapply(manifests, function(manifest) {
  as.data.frame(mcols(manifest))[, masking_cols, drop = FALSE] %>%
    summarise_all(sum)
}))
rownames(df) <- names(manifests)
# Transpose so that masks are rows and build/release combinations are columns.
kable(t(df))
| | hg19_0304 | hg38_0304 | hg19_0808 | hg38_0808 |
|---|---|---|---|---|
| MASK_mapping | 63508 | 65098 | 63483 | 65537 |
| MASK_typeINextBaseSwitch | 1051 | 1051 | 1051 | 1051 |
| MASK_rmsk15 | 201251 | 201251 | 201251 | 201251 |
| MASK_sub35_copy | 14502 | 14502 | 14502 | 14502 |
| MASK_sub30_copy | 25194 | 25194 | 25194 | 25194 |
| MASK_sub25_copy | 88173 | 88173 | 88173 | 88173 |
| MASK_sub40_copy | 8747 | 8747 | 8747 | 8747 |
| MASK_snp5_common | 156200 | 156200 | 156200 | 156200 |
| MASK_snp5_GMAF1p | 42159 | 42159 | 42159 | 42159 |
| MASK_extBase | 171 | 171 | 171 | 171 |
| MASK_general | 108452 | 108452 | 107994 | 108716 |
Another difference is that we now mask slightly more probes in MASK_mapping, based on the NM_A and NM_B tags.
# Probes flagged by MASK_mapping in the 20180808 hg38 manifest but not in the
# 20180304 hg38 manifest.
newly_masked <- EPIC.hg38.manifest.0808$MASK_mapping &
  !EPIC.hg38.manifest.0304$MASK_mapping
df <- mcols(EPIC.hg38.manifest.0808[newly_masked])
df$probeID <- rownames(df)
# Show the NM columns of the ten probes with the largest NM_A.
df %>%
  as.data.frame %>%
  dplyr::select(probeID, contains('NM')) %>%
  arrange(desc(NM_A)) %>%
  head(10)
## probeID NM_A NM_B wDecoy_NM_A wDecoy_NM_B
## 1 cg06449993 4 NA 4 NA
## 2 cg03982074 3 NA 3 NA
## 3 cg10615091 3 NA 3 NA
## 4 cg12381370 3 NA 3 NA
## 5 cg09761058 3 NA 3 NA
## 6 cg10747483 3 3 3 3
## 7 cg22619763 2 2 2 2
## 8 cg24771345 2 NA 2 NA
## 9 cg18041845 2 NA 2 NA
## 10 cg10811045 2 2 2 2
# Tabulate, per probe, the larger of NM_A and NM_B; a missing NM_B counts as 0.
nm_b_filled <- ifelse(is.na(df$NM_B), 0, df$NM_B)
table(pmax(df$NM_A, nm_b_filled))
##
## 1 2 3 4
## 411 22 5 1
On hg19, all of the above probes were mapped without issue:
# Look up the same probes in the hg19 manifest of the 20180808 release and
# show the NM columns of the first ten.
hg19_annot <- mcols(EPIC.hg19.manifest.0808)[df$probeID, ]
hg19_annot %>%
  as.data.frame %>%
  dplyr::select(contains('NM')) %>%
  head(10)
## NM_A NM_B wDecoy_NM_A wDecoy_NM_B
## cg24602020 0 NA 0 NA
## cg18667738 0 NA 0 NA
## cg16047567 0 NA 0 NA
## cg01097325 0 NA 0 NA
## cg18303314 0 NA 0 NA
## cg20967147 0 NA 0 NA
## cg07942324 0 NA 0 NA
## cg11526198 0 NA 0 NA
## cg19266779 0 0 0 0
## cg12746041 0 NA 0 NA
In fact, under hg38 many probes are mapped to interrogate the same CpG. Usually all but one of the probes sharing a CpG are mapped with mismatches (NM > 0).
# Find neighbouring manifest entries that target the same CpG, i.e. that have
# the same start coordinate AND lie on the same sequence. Comparing start()
# alone would also pair neighbours on different chromosomes that merely share
# a coordinate. idx holds the position of the first entry of each such pair.
cgBeg <- start(EPIC.hg38.manifest.0808)
cgChr <- as.character(seqnames(EPIC.hg38.manifest.0808))
n_probes <- length(cgBeg)
idx <- which(cgBeg[-1] == cgBeg[-n_probes] & cgChr[-1] == cgChr[-n_probes])
length(idx)
## [1] 77
# List every probe that shares its CpG with a neighbouring entry: each
# position in idx plus the entry right after it.
# In a run of three or more probes the same position occurs in both idx and
# idx + 1. unique() removes those repeats up front, rather than selecting the
# rows twice and then filtering on a '.1' pattern in the row names (presumably
# the suffix added to duplicated row names), which would also discard any
# genuine probe ID that contains '.1'.
df <- EPIC.hg38.manifest.0808[sort(unique(c(idx, idx + 1)))] %>%
  mcols %>%
  as.data.frame %>%
  mutate(probeID = rownames(.)) %>%
  dplyr::select(probeID, chrm_A, probeBeg, probeEnd, NM_A, NM_B, designType)
nrow(df)
## [1] 118
# Print the probes selected above (entries whose start coordinate equals that
# of a neighbouring entry in the hg38 manifest).
df
## probeID chrm_A probeBeg probeEnd NM_A NM_B designType
## 1 cg20017108 chr1 13410326 13410375 1 NA II
## 2 cg16566605 chr1 13410326 13410375 0 NA II
## 3 cg23664334 chr1 121010280 121010329 0 0 I
## 4 cg06741726 chr1 121010279 121010328 1 NA II
## 5 cg06441514 chr1 144420861 144420910 1 2 I
## 6 cg13552272 chr1 144420862 144420911 0 NA II
## 7 cg01893993 chr1 146307505 146307554 1 NA II
## 8 cg07100648 chr1 146307505 146307554 0 NA II
## 9 cg12512196 chr1 146472583 146472632 1 1 I
## 10 cg06616806 chr1 146472583 146472632 0 0 I
## 11 cg20294457 chr1 148152232 148152281 1 NA II
## 12 cg10127479 chr1 148152231 148152280 0 0 I
## 13 cg03170665 chr2 113575870 113575919 0 NA II
## 14 cg26523196 chr2 113575870 113575919 1 NA II
## 15 cg13912721 chr3 75785760 75785809 1 NA II
## 16 cg11938051 chr3 75785760 75785809 0 NA II
## 17 cg01972631 chr4 74441928 74441977 0 NA II
## 18 cg01231417 chr4 74441928 74441977 1 NA II
## 19 cg02334660 chr4 74446767 74446816 0 NA II
## 20 cg26611070 chr4 74446767 74446816 1 NA II
## 21 cg21205260 chr6 909186 909235 2 NA II
## 22 cg14475265 chr6 909186 909235 0 NA II
## 23 cg03481658 chr7 6011675 6011724 0 NA II
## 24 cg14344864 chr7 6011675 6011724 1 NA II
## 25 cg18779556 chr7 101681406 101681455 0 NA II
## 26 cg26670578 chr7 101681406 101681455 1 NA II
## 27 cg25593776 chr8 144147800 144147849 0 0 I
## 28 cg13177830 chr8 144147800 144147849 1 1 I
## [ reached getOption("max.print") -- omitted 90 rows ]
# Fetch the HM27 manifests (hg19 and hg38) of the 20180808 and 20180304
# releases into a local cache directory, then load them.
manifest_url <- 'http://zwdzwd.io/InfiniumAnnotation'
cache_dir <- '~/Downloads/tmp'
for (release in c('20180808', '20180304')) {
  # recursive = TRUE also creates ~/Downloads/tmp when it is missing; without
  # it dir.create() fails (silently, because of showWarnings = FALSE) and
  # every download below errors out.
  dir.create(file.path(cache_dir, release), showWarnings = FALSE,
             recursive = TRUE)
  for (build in c('hg38', 'hg19')) {
    rds <- sprintf('hm27.%s.manifest.rds', build)
    # .rds files are binary: request a binary transfer explicitly.
    download.file(paste(manifest_url, release, 'hm27', rds, sep = '/'),
                  file.path(cache_dir, release, rds), mode = 'wb')
  }
}

HM27.hg38.manifest.0304 <- readRDS(
  file.path(cache_dir, '20180304', 'hm27.hg38.manifest.rds'))
HM27.hg19.manifest.0304 <- readRDS(
  file.path(cache_dir, '20180304', 'hm27.hg19.manifest.rds'))
# Index the new release by the old release's probe names so that both
# releases list the probes in the same order and can be compared element-wise.
HM27.hg38.manifest.0808 <- readRDS(
  file.path(cache_dir, '20180808', 'hm27.hg38.manifest.rds')
)[names(HM27.hg38.manifest.0304)]
HM27.hg19.manifest.0808 <- readRDS(
  file.path(cache_dir, '20180808', 'hm27.hg19.manifest.rds')
)[names(HM27.hg19.manifest.0304)]
There are 46 columns (annotations) and 27578 rows (probes) in the old manifest. There are 51 columns (annotations) and 27578 rows (probes) in the new manifest.
The new manifest
# Cross-tabulate flag_A between the hg19 and hg38 manifests of the 20180808
# release. table() puts its first argument (the hg19 manifest) on the rows
# and its second (the hg38 manifest) on the columns, so the rows carry the
# GRCh37 labels and the columns the GRCh38 labels. The previous labels had
# the two builds swapped.
strands <- table(HM27.hg19.manifest.0808$flag_A, HM27.hg38.manifest.0808$flag_A)
# Swap the 2nd and 3rd rows so they line up with the labels set below.
strands <- strands[c(1, 3, 2), ]
rownames(strands) <- c('Watson GRCh37', 'Crick GRCh37', 'unmapped GRCh37')
colnames(strands) <- c('Watson GRCh38', 'Crick GRCh38')
kable(strands)
| | Watson GRCh37 | Crick GRCh37 |
|---|---|---|
| Watson GRCh38 | 8056 | 5777 |
| Crick GRCh38 | 5778 | 7964 |
| unmapped GRCh38 | 1 | 2 |
The old manifest
# Cross-tabulate flag_A between the hg19 and hg38 manifests of the 20180304
# release. table() puts its first argument (the hg19 manifest) on the rows
# and its second (the hg38 manifest) on the columns, so the rows carry the
# GRCh37 labels and the columns the GRCh38 labels. The previous labels had
# the two builds swapped.
strands <- table(HM27.hg19.manifest.0304$flag_A, HM27.hg38.manifest.0304$flag_A)
# Swap the 2nd and 3rd rows so they line up with the labels set below.
strands <- strands[c(1, 3, 2), ]
rownames(strands) <- c('Watson GRCh37', 'Crick GRCh37', 'unmapped GRCh37')
colnames(strands) <- c('Watson GRCh38', 'Crick GRCh38')
kable(strands)
| | Watson GRCh37 | Crick GRCh37 |
|---|---|---|
| Watson GRCh38 | 8056 | 5777 |
| Crick GRCh38 | 5778 | 7964 |
| unmapped GRCh38 | 1 | 2 |
The biggest change in this release is that MASK_general now differs between hg19 and hg38. In all prior releases, the MASK_general sets from hg19 and hg38 were merged, and the merged set was recorded in both the hg19 version and the hg38 version. In the current release, this merging is abandoned.
# Count, for each MASK_* annotation, how many probes are flagged in each
# genome build and release.
# The list of MASK_* columns is taken once (from the hg19 20180304 manifest)
# and reused for all four manifests, so rbind() always receives identical
# columns even if a release carries additional MASK_* annotations.
masking_cols <- grep('MASK_', colnames(mcols(HM27.hg19.manifest.0304)),
                     value = TRUE)
manifests <- list(
  hg19_0304 = HM27.hg19.manifest.0304,
  hg38_0304 = HM27.hg38.manifest.0304,
  hg19_0808 = HM27.hg19.manifest.0808,
  hg38_0808 = HM27.hg38.manifest.0808
)
df <- do.call(rbind, lapply(manifests, function(manifest) {
  as.data.frame(mcols(manifest))[, masking_cols, drop = FALSE] %>%
    summarise_all(sum)
}))
rownames(df) <- names(manifests)
# Transpose so that masks are rows and build/release combinations are columns.
kable(t(df))
| | hg19_0304 | hg38_0304 | hg19_0808 | hg38_0808 |
|---|---|---|---|---|
| MASK_mapping | 1645 | 1659 | 1648 | 1674 |
| MASK_typeINextBaseSwitch | 195 | 195 | 195 | 195 |
| MASK_rmsk15 | 3207 | 3207 | 3207 | 3207 |
| MASK_sub35_copy | 722 | 722 | 722 | 722 |
| MASK_sub30_copy | 893 | 893 | 893 | 893 |
| MASK_sub25_copy | 2709 | 2709 | 2709 | 2709 |
| MASK_sub40_copy | 619 | 619 | 619 | 619 |
| MASK_snp5_common | 2640 | 2640 | 2640 | 2640 |
| MASK_snp5_GMAF1p | 546 | 546 | 546 | 546 |
| MASK_extBase | 9 | 9 | 9 | 9 |
| MASK_general | 2443 | 2443 | 2442 | 2449 |
Another difference is that we now mask slightly more probes in MASK_mapping, based on the NM_A and NM_B tags.
# Probes flagged by MASK_mapping in the 20180808 hg38 manifest but not in the
# 20180304 hg38 manifest.
newly_masked <- HM27.hg38.manifest.0808$MASK_mapping &
  !HM27.hg38.manifest.0304$MASK_mapping
df <- mcols(HM27.hg38.manifest.0808[newly_masked])
df$probeID <- rownames(df)
# Show the NM columns of the ten probes with the largest NM_A.
df %>%
  as.data.frame %>%
  dplyr::select(probeID, contains('NM')) %>%
  arrange(desc(NM_A)) %>%
  head(10)
## probeID NM_A NM_B wDecoy_NM_A wDecoy_NM_B
## 1 cg10615091 3 3 3 3
## 2 cg23424962 2 2 2 2
## 3 cg13019092 1 1 1 1
## 4 cg08307963 1 1 1 1
## 5 cg04344347 1 1 1 1
## 6 cg00684178 1 1 1 1
## 7 cg04172348 1 1 1 1
## 8 cg06518271 1 1 1 1
## 9 cg19921492 1 1 1 1
## 10 cg17630833 1 1 1 1
# Tabulate, per probe, the larger of NM_A and NM_B; a missing NM_B counts as 0.
nm_b_filled <- ifelse(is.na(df$NM_B), 0, df$NM_B)
table(pmax(df$NM_A, nm_b_filled))
##
## 1 2 3
## 13 1 1
On hg19, most of the above probes were mapped without issue (NM = 0), although a few of them carry a mismatch there as well:
# Look up the same probes in the hg19 manifest of the 20180808 release and
# show the NM columns of the first ten.
hg19_annot <- mcols(HM27.hg19.manifest.0808)[df$probeID, ]
hg19_annot %>%
  as.data.frame %>%
  dplyr::select(contains('NM')) %>%
  head(10)
## NM_A NM_B wDecoy_NM_A wDecoy_NM_B
## cg13019092 0 0 0 0
## cg08307963 0 0 0 0
## cg04344347 1 1 1 1
## cg00684178 0 0 0 0
## cg04172348 0 0 0 0
## cg06518271 0 0 0 0
## cg10615091 0 0 0 0
## cg19921492 1 1 1 1
## cg17630833 1 1 1 1
## cg23727583 0 0 0 0
In fact, under hg38 many probes are mapped to interrogate the same CpG. Usually all but one of the probes sharing a CpG are mapped with mismatches (NM > 0).
# Find neighbouring manifest entries that target the same CpG, i.e. that have
# the same start coordinate AND lie on the same sequence. Comparing start()
# alone would also pair neighbours on different chromosomes that merely share
# a coordinate. idx holds the position of the first entry of each such pair.
cgBeg <- start(HM27.hg38.manifest.0808)
cgChr <- as.character(seqnames(HM27.hg38.manifest.0808))
n_probes <- length(cgBeg)
idx <- which(cgBeg[-1] == cgBeg[-n_probes] & cgChr[-1] == cgChr[-n_probes])
length(idx)
## [1] 12
# List every probe that shares its CpG with a neighbouring entry: each
# position in idx plus the entry right after it.
# In a run of three or more probes the same position occurs in both idx and
# idx + 1. unique() removes those repeats up front, rather than selecting the
# rows twice and then filtering on a '.1' pattern in the row names (presumably
# the suffix added to duplicated row names), which would also discard any
# genuine probe ID that contains '.1'.
df <- HM27.hg38.manifest.0808[sort(unique(c(idx, idx + 1)))] %>%
  mcols %>%
  as.data.frame %>%
  mutate(probeID = rownames(.)) %>%
  dplyr::select(probeID, chrm_A, probeBeg, probeEnd, NM_A, NM_B, designType)
nrow(df)
## [1] 24
# Print the probes selected above (entries whose start coordinate equals that
# of a neighbouring entry in the hg38 manifest).
df
## probeID chrm_A probeBeg probeEnd NM_A NM_B designType
## 1 cg11037148 chr6 27831434 27831483 0 0 I
## 2 cg00634577 chr6 27831434 27831483 0 0 I
## 3 cg03790787 chr6 170612200 170612249 0 0 I
## 4 cg00578575 chr6 170612200 170612249 0 0 I
## 5 cg08539093 chr9 135546158 135546207 0 0 I
## 6 cg15092802 chr9 135546158 135546207 0 0 I
## 7 cg11673803 chr10 87094414 87094463 0 0 I
## 8 cg11891583 chr10 87094414 87094463 0 0 I
## 9 cg15105987 chr13 112067857 112067906 0 0 I
## 10 cg11208483 chr13 112067857 112067906 0 0 I
## 11 cg15910079 chr14 20891256 20891305 0 0 I
## 12 cg26191951 chr14 20891256 20891305 0 0 I
## 13 cg23663653 chr14 105742822 105742871 0 0 I
## 14 cg23988567 chr14 105742822 105742871 0 0 I
## 15 cg20657421 chr17 36103077 36103126 0 0 I
## 16 cg00896220 chr17 36103077 36103126 1 1 I
## 17 cg02043477 chr19 40851247 40851296 0 0 I
## 18 cg20075229 chr19 40851247 40851296 0 0 I
## 19 cg15408454 chrX 152698701 152698750 0 0 I
## 20 cg07545232 chrX 152698701 152698750 0 0 I
## 21 cg06899808 chrX 152698855 152698904 0 0 I
## 22 cg16390856 chrX 152698855 152698904 0 0 I
## 23 cg23509027 chrX 152733885 152733934 0 0 I
## 24 cg08977028 chrX 152733885 152733934 0 0 I