suppressPackageStartupMessages({
library(foreach)
library(doParallel)
library(dplyr)
library(ggplot2)
library(FlowSorted.Blood.EPIC)
library(limma)
library(EpiDISH)
library(reshape2)
})
# library(IlluminaHumanMethylationEPICmanifest)
# library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
# library(data.table)
data_cpg <- read.csv("NTP-39.GENE.cg_genename_avg_annotation.csv")
data_chg <- read.csv("NTP-39.GENE.chg_genename_avg_annotation.csv")
data_chh <- read.csv("NTP-39.GENE.chh_genename_avg_annotation.csv")
Annotate the probe info to the RRBS data
load("C:/Users/suanni/Box/CODE/Reference/EPIC_fdat.RData")
anno <- fData_EPIC[, c("TargetID", "CHR", "MAPINFO", "INFINIUM_DESIGN_TYPE")]
anno$CHR <- as.character(anno$CHR)
anno$TargetID <- as.character(anno$TargetID)
data <- data_chh
data$Probe <- NA
cores = detectCores()
cl <- makeCluster(cores[1]-2) #not to overload your computer
registerDoParallel(cl)
# Mapping the start and end on RRBS files to microarray annotation - considered to be in the same region if the array probe is located somewhere between the start and end
anno.f <- foreach(i = 1:nrow(anno), .combine = rbind) %dopar% {
probe <- anno[i, ]
test <- sapply(1:nrow(data), function(j) {
if(data$Chromosome[j] == probe$CHR & data$Start[j] < probe$MAPINFO & data$End[j] > probe$MAPINFO){
data$Probe[j] <- probe$TargetID
out <- c(data$GeneID[j], probe$TargetID)
} else{
out <- c(data$GeneID[j], NA)
}
return(out)
})
test <- t(test)
test <- as.data.frame(test)
test <- test[!is.na(test[, 2]), ]
test
}
#stop cluster
stopCluster(cl)
out <- sapply(unique(anno.f$V2), function(x){
dat <- anno.f[anno.f$V2 == x, ]
out <- anno[x, ]
out$Read <- paste0(dat$V1, collapse = ", ")
return(out)
})
out <- t(out) %>% as.data.frame()
head(out)
anno.f_cpg <- anno.f
out_cpg <- out
#save(anno.f_cpg, out_cpg, file = "01-Mapped_to_Array.RData")
#save(anno.f_chg, out_chg, file = "01-Mapped_to_Array_chg.RData")
#save(anno.f_chh, out_chh, file = "01-Mapped_to_Array_chh.RData")
load("Islam-RRBS/01-Mapped_to_Array.RData")
head(out_cpg)
## TargetID CHR MAPINFO INFINIUM_DESIGN_TYPE
## cg00000029 cg00000029 16 53468112 2
## cg00000109 cg00000109 3 171916037 2
## cg00000155 cg00000155 7 2590565 2
## cg00000158 cg00000158 9 95010555 2
## cg00000221 cg00000221 17 54534248 2
## cg00000236 cg00000236 8 42263294 2
## Read
## cg00000029 ID=gene:ENSG00000103479
## cg00000109 ID=gene:ENSG00000075420
## cg00000155 ID=gene:ENSG00000106009
## cg00000158 ID=gene:ENSG00000196305
## cg00000221 ID=gene:ENSG00000153930
## cg00000236 ID=gene:ENSG00000078668
data <- data_cpg
out <- out_cpg
# Take average of average methylation level for each probe
bt <- sapply(1:nrow(out), function(x){
probes <- strsplit2(out$Read[x], ", ") %>% as.character()
samp <- data[data$GeneID %in% probes, ]
out1 <- mean(samp$AverageMethylation)
return(c(out$TargetID[x], out1))
})
bt <- t(bt) %>% as.data.frame()
rownames(bt) <- bt[, 1]
colnames(bt)[2] <- "Chh"
bt_cpg <- bt
bt <- cbind(as.numeric(bt_cpg[, 2]), as.numeric(bt_chg[, 2]), as.numeric(bt_chh[, 2])) %>% as.data.frame()
rownames(bt) <- rownames(bt_cpg)
colnames(bt) <- c("cpg", "chg", "chh")
save(bt, file = "02-All_betas_mapped.RData")
load("Islam-RRBS/02-All_betas_mapped.RData")
head(bt)
## cpg chg chh
## cg00000029 0.3926431 0.012475750 0.00481200
## cg00000109 0.8166536 0.138361176 0.07561470
## cg00000155 0.6150922 0.023909111 0.01226480
## cg00000158 0.4443894 0.059757333 0.06590147
## cg00000221 0.9052018 0.013217250 0.01623000
## cg00000236 0.3427634 0.007301167 0.01018150
# Predict cell type with in-house script
ct <- ECC9(bt, tissueType = "blood", arrayType = "EPIC", probelist = "auto")
## snapshotDate(): 2020-10-27
## see ?FlowSorted.Blood.EPIC and browseVignettes('FlowSorted.Blood.EPIC') for documentation
## loading from cache
## [estimateCellCounts] Starting Cell Type Estimation Based on the FlowSorted.Blood.EPIC Dataset.
## Loading required package: IlluminaHumanMethylationEPICmanifest
## [estimateCellCounts] Combining Data with Flow Sorted Data.
## [estimateCellCounts] Quantile Normalizing Data Together.
## [estimateCellCounts] Picking Probes for Composition Estimation Based on Probes That Best Differentiate Cell Types in the Reference Dataset.
## [estimateCellCounts] Estimating Composition.
ct$counts # no granulocytes - makes sense if sample is PBMC
## CD8T CD4T NK Bcell Mono Neu
## cpg 0.2243495 0.3162806 0.1800005 0.3229444 0 0
## chg 0.2089902 0.2512813 0.2223962 0.2847073 0 0
## chh 0.2742319 0.1822183 0.2066942 0.2944816 0 0
plt <- melt(ct$counts)
ggplot(plt, aes(Var2, value, fill = Var1, color = Var1)) +
geom_point(shape = 21, size = 4) +
theme_bw() +
xlab("Blood Cell Type") +
ylab("Cell Type Proportion")
# Predict cell type with an alternative method - EPIDISH
#data("centEpiFibIC.m")
#data("centBloodSub.m")
data("centDHSbloodDMC.m")
ct.epidish <- epidish(bt, centDHSbloodDMC.m, method = 'RPC')
ct.epidish$estF # Eosinophil percentage too high - probably not accurate
## B NK CD4T CD8T Mono Neutro Eosino
## cpg 0.00000000 0 0.0000000 0.6119805 0.09684498 0 0.2911745
## chg 0.03455842 0 0.2910818 0.3151630 0.22911480 0 0.1300820
## chh 0.06790552 0 0.5544949 0.0000000 0.16319521 0 0.2144043
plt <- melt(ct.epidish$estF)
ggplot(plt, aes(Var2, value, fill = Var1, color = Var1)) +
geom_point(shape = 21, size = 4) +
theme_bw() +
xlab("Blood Cell Type") +
ylab("Cell Type Proportion")
# Perform CellDMC
count <- ct$counts[, 1:4]
pheno <- c(1, 1, 0) # I don't have phenotype information, so I'll make something up here. Replace this variable with your actual group variable (e.g. disease vs control)
celldmc <- CellDMC(bt, pheno, count) # It gave me an error because there are not enough samples (fewer samples than the number of condition:cell type), but you can use this code once you have all the samples run to get an output as shown below
# CellDMC examples - not based on your data!
out.l <- epidish(DummyBeta.m, centEpiFibIC.m, method = 'RPC')
frac.m <- out.l$estF
pheno.v <- rep(c(0, 1), each = 5)
celldmc.o <- CellDMC(DummyBeta.m, pheno.v, frac.m)
## Binary phenotype detected. Predicted change will be 1 - 0.
head(celldmc.o$coe$Epi) # Which CpG's differential methylation is driven by epithelial cells
## Estimate SE t p adjP
## cg17506061 1.0000000 0.5263342 2.0836128 0.10559050 0.7152720
## cg09300980 0.1718010 0.3978319 0.4318432 0.68811432 0.9788255
## cg18886245 1.0000000 1.4106163 0.7101953 0.51679505 0.9620602
## cg17470327 1.0000000 0.3937913 2.6582283 0.05649463 0.5918364
## cg26082174 1.0000000 0.2780896 5.6517997 0.00482831 0.4334332
## cg14737131 0.9250249 0.5983209 1.5460348 0.19699413 0.8116733
write.csv(bt, file = "Betas_mapped_to_array.csv")
write.csv(ct$counts, file ="CT_predicted.csv")