suppressPackageStartupMessages({
   library(foreach)
   library(doParallel)
   library(dplyr)
   library(ggplot2)
   library(FlowSorted.Blood.EPIC)
   library(limma)
   library(EpiDISH)
   library(reshape2)
})
# library(IlluminaHumanMethylationEPICmanifest)
# library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
# library(data.table)

data_cpg <- read.csv("NTP-39.GENE.cg_genename_avg_annotation.csv")
data_chg <- read.csv("NTP-39.GENE.chg_genename_avg_annotation.csv")
data_chh <- read.csv("NTP-39.GENE.chh_genename_avg_annotation.csv")

Annotation

Annotate the probe info to the RRBS data

load("C:/Users/suanni/Box/CODE/Reference/EPIC_fdat.RData")
anno <- fData_EPIC[, c("TargetID", "CHR", "MAPINFO", "INFINIUM_DESIGN_TYPE")]
anno$CHR <- as.character(anno$CHR)
anno$TargetID <- as.character(anno$TargetID)

data <- data_chh
data$Probe <- NA
cores = detectCores()
cl <- makeCluster(cores[1]-2) #not to overload your computer
registerDoParallel(cl)

# Mapping the start and end on RRBS files to microarray annotation - considered to be in the same region if the array probe is located somewhere between the start and end
anno.f <- foreach(i = 1:nrow(anno), .combine = rbind) %dopar% {
   probe <- anno[i, ]
   test <- sapply(1:nrow(data), function(j) {
     if(data$Chromosome[j] == probe$CHR & data$Start[j] < probe$MAPINFO & data$End[j] > probe$MAPINFO){ 
       data$Probe[j] <- probe$TargetID
       out <- c(data$GeneID[j], probe$TargetID)
     } else{
       out <- c(data$GeneID[j], NA)
     }
     return(out)
   })
   test <- t(test) 
   test <- as.data.frame(test)
   test <- test[!is.na(test[, 2]), ]
   test
}

#stop cluster
stopCluster(cl)

out <- sapply(unique(anno.f$V2), function(x){
  dat <- anno.f[anno.f$V2 == x, ]
  out <- anno[x, ]
  out$Read <- paste0(dat$V1, collapse = ", ")
  return(out)
})
out <- t(out) %>% as.data.frame()
head(out)

anno.f_cpg <- anno.f
out_cpg <- out

#save(anno.f_cpg, out_cpg, file = "01-Mapped_to_Array.RData")
#save(anno.f_chg, out_chg, file = "01-Mapped_to_Array_chg.RData")
#save(anno.f_chh, out_chh, file = "01-Mapped_to_Array_chh.RData")
load("Islam-RRBS/01-Mapped_to_Array.RData")
head(out_cpg)
##              TargetID CHR   MAPINFO INFINIUM_DESIGN_TYPE
## cg00000029 cg00000029  16  53468112                    2
## cg00000109 cg00000109   3 171916037                    2
## cg00000155 cg00000155   7   2590565                    2
## cg00000158 cg00000158   9  95010555                    2
## cg00000221 cg00000221  17  54534248                    2
## cg00000236 cg00000236   8  42263294                    2
##                               Read
## cg00000029 ID=gene:ENSG00000103479
## cg00000109 ID=gene:ENSG00000075420
## cg00000155 ID=gene:ENSG00000106009
## cg00000158 ID=gene:ENSG00000196305
## cg00000221 ID=gene:ENSG00000153930
## cg00000236 ID=gene:ENSG00000078668

Calculate Beta values for each CpG

data <- data_cpg
out <- out_cpg

# Take average of average methylation level for each probe
bt <- sapply(1:nrow(out), function(x){
   probes <- strsplit2(out$Read[x], ", ") %>% as.character()
   samp <- data[data$GeneID %in% probes, ]
   out1 <- mean(samp$AverageMethylation)
   return(c(out$TargetID[x], out1))
})

bt <- t(bt) %>% as.data.frame()
rownames(bt) <- bt[, 1]
colnames(bt)[2] <- "Chh"
bt_cpg <- bt

bt <- cbind(as.numeric(bt_cpg[, 2]), as.numeric(bt_chg[, 2]), as.numeric(bt_chh[, 2])) %>% as.data.frame()
rownames(bt) <- rownames(bt_cpg)
colnames(bt) <- c("cpg", "chg", "chh")

save(bt, file = "02-All_betas_mapped.RData")
load("Islam-RRBS/02-All_betas_mapped.RData")
head(bt)
##                  cpg         chg        chh
## cg00000029 0.3926431 0.012475750 0.00481200
## cg00000109 0.8166536 0.138361176 0.07561470
## cg00000155 0.6150922 0.023909111 0.01226480
## cg00000158 0.4443894 0.059757333 0.06590147
## cg00000221 0.9052018 0.013217250 0.01623000
## cg00000236 0.3427634 0.007301167 0.01018150

Predict cell type proportion

# Predict cell type with in-house script
ct <- ECC9(bt, tissueType = "blood", arrayType = "EPIC", probelist = "auto")
## snapshotDate(): 2020-10-27
## see ?FlowSorted.Blood.EPIC and browseVignettes('FlowSorted.Blood.EPIC') for documentation
## loading from cache
## [estimateCellCounts] Starting Cell Type Estimation Based on the FlowSorted.Blood.EPIC Dataset.
## Loading required package: IlluminaHumanMethylationEPICmanifest
## [estimateCellCounts] Combining Data with Flow Sorted Data.
## [estimateCellCounts] Quantile Normalizing Data Together.
## [estimateCellCounts] Picking Probes for Composition Estimation Based on Probes That Best Differentiate Cell Types in the Reference Dataset.
## [estimateCellCounts] Estimating Composition.
ct$counts # no granulocytes - makes sense if sample is PBMC
##          CD8T      CD4T        NK     Bcell Mono Neu
## cpg 0.2243495 0.3162806 0.1800005 0.3229444    0   0
## chg 0.2089902 0.2512813 0.2223962 0.2847073    0   0
## chh 0.2742319 0.1822183 0.2066942 0.2944816    0   0
plt <- melt(ct$counts)
ggplot(plt, aes(Var2, value, fill = Var1, color = Var1)) +
   geom_point(shape = 21, size = 4) + 
   theme_bw() + 
   xlab("Blood Cell Type") + 
   ylab("Cell Type Proportion") 

# Predict cell type with an alternative method - EPIDISH
#data("centEpiFibIC.m")
#data("centBloodSub.m")
data("centDHSbloodDMC.m")
ct.epidish <- epidish(bt, centDHSbloodDMC.m, method = 'RPC') 
ct.epidish$estF # Eosinophil percentage too high - probably not accurate
##              B NK      CD4T      CD8T       Mono Neutro    Eosino
## cpg 0.00000000  0 0.0000000 0.6119805 0.09684498      0 0.2911745
## chg 0.03455842  0 0.2910818 0.3151630 0.22911480      0 0.1300820
## chh 0.06790552  0 0.5544949 0.0000000 0.16319521      0 0.2144043
plt <- melt(ct.epidish$estF)
ggplot(plt, aes(Var2, value, fill = Var1, color = Var1)) +
   geom_point(shape = 21, size = 4) + 
   theme_bw() + 
   xlab("Blood Cell Type") + 
   ylab("Cell Type Proportion") 

Use CellDMC to determine the cell type that drives a differential methyaltion signal

# Perform CellDMC
count <- ct$counts[, 1:4]
pheno <- c(1, 1, 0) # I don't have phenotype information, so I'll make something up here. Replace this variable with your actual group variable (e.g. disease vs control)
celldmc <- CellDMC(bt, pheno, count)  # It gave me an error because there are not enough samples (fewer samples than the number of condition:cell type), but you can use this code once you have all the samples run to get an output as shown below 
# CellDMC examples - not based on your data!
out.l <- epidish(DummyBeta.m, centEpiFibIC.m, method = 'RPC')
frac.m <- out.l$estF
pheno.v <- rep(c(0, 1), each = 5)
celldmc.o <- CellDMC(DummyBeta.m, pheno.v, frac.m) 
## Binary phenotype detected. Predicted change will be 1 - 0.
head(celldmc.o$coe$Epi) # Which CpG's differential methylation is driven by epithelial cells
##             Estimate        SE         t          p      adjP
## cg17506061 1.0000000 0.5263342 2.0836128 0.10559050 0.7152720
## cg09300980 0.1718010 0.3978319 0.4318432 0.68811432 0.9788255
## cg18886245 1.0000000 1.4106163 0.7101953 0.51679505 0.9620602
## cg17470327 1.0000000 0.3937913 2.6582283 0.05649463 0.5918364
## cg26082174 1.0000000 0.2780896 5.6517997 0.00482831 0.4334332
## cg14737131 0.9250249 0.5983209 1.5460348 0.19699413 0.8116733

Export output

write.csv(bt, file = "Betas_mapped_to_array.csv")
write.csv(ct$counts, file ="CT_predicted.csv")