# Install CRAN packages
install.packages("dplyr")
# Install Bioconductor packages
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("minfi")Accounting for ISCH - example code
Set up the analysis environment
In the code chunks throughout the document, the required R packages will be listed. They can be installed into your R environment with the following code, depending on whether they are CRAN or Bioconductor packages.
Example preprocessing pipeline
Start from raw IDAT files
library(minfi)
library(dplyr)
# Download the raw data at https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE112618&format=file
# untar with tar -xvf GSE112618_RAW.tar
# unzip the idat files within with gzip -d *.gz
yourRGChannelSet <- read.metharray.exp("GSE112618_RAW") ##################### For EPIC v2 only #####################
# Download EPIC v2 manifest
# BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2manifest")
# BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2anno.20a1.hg38")
# reassign the annotation so that the minfi functions getBeta(), preprocessRaw() and others like preprocessNoob() will work
library(IlluminaHumanMethylationEPICv2manifest)
library(IlluminaHumanMethylationEPICv2anno.20a1.hg38)
annotation(yourRGChannelSet)["array"] <- "IlluminaHumanMethylationEPICv2"
annotation(yourRGChannelSet)["annotation"] <- "20a1.hg38"
(manifest <- getManifest(yourRGChannelSet))Obtain beta matrix from RGChannelSet
# Minimal preprocessing - only background color correction and inter-sample normalization
yourBetaMatrix <- preprocessFunnorm(yourRGChannelSet) %>%
getBeta()
# See Table 1 for what input is required for each cell type prediction methodIntersample cellular heterogeneity (ISCH) estimation
Reference-based ISCH estimation
Example: Estimating cell type proportions in blood
We will continue to use the yourRGChannelSet dataset created in the previous section (Start from raw IDAT files)
library(FlowSorted.Blood.EPIC) # Bioconductor package
library(ExperimentHub) # Bioconductor package
hub <- ExperimentHub()
FlowSorted.Blood.EPIC <- hub[["EH1136"]] # Loading reference data set
ecc_output <- estimateCellCounts2(yourRGChannelSet,
processMethod = "preprocessNoob",
probeSelect = "auto",
cellTypes = c("CD8T", "CD4T", "NK", "Bcell", "Mono", "Neu"),
referencePlatform = "IlluminaHumanMethylationEPIC",
referenceset = "FlowSorted.Blood.EPIC", # Specify the reference data set.
IDOLOptimizedCpGs = NULL,
returnAll = T)
# This function is actually not compatible with EPICv2 dataset for the moment (Aug 2024)
blood_ct <- ecc_output$prop
head(blood_ct) CD8T CD4T NK Bcell Mono Neu
GSM3074480_202163530006_R02C01 0.0739 0.1900 0.0472 0.0370 0.0940 0.6099
GSM3074481_202163530006_R03C01 0.1729 0.1648 0.0911 0.0765 0.0909 0.4637
GSM3074482_202163530006_R04C01 0.0926 0.0929 0.0473 0.0581 0.0917 0.6753
GSM3074483_202163530006_R06C01 0.1655 0.2511 0.0725 0.0973 0.0918 0.3889
GSM3074484_202163530006_R07C01 0.0994 0.0786 0.0512 0.0579 0.0848 0.6882
GSM3074485_202163530006_R08C01 0.2050 0.1102 0.1058 0.0870 0.1266 0.4297
# visualization of blood cell type distribution
library(ggplot2)
blood_plt <- reshape2::melt(blood_ct)
ggplot(blood_plt, aes(Var2, value, color = Var2)) +
geom_boxplot() +
geom_point(position = position_jitter()) +
theme_bw() +
labs(x = "cell type",
y = "cell type proportion (%)",
title = "Blood cell type proportion") Example: Estimating cell type proportion in buccal swabs or saliva
# load example buccal and saliva datasets
library(GEOquery) # Bioconductor package
# Example buccal dataset GSE137495
GSE137495 <- getGEO("GSE137495")
GSE137495_beta <- exprs(GSE137495$GSE137495_series_matrix.txt.gz)
# Example saliva dataset GSE111631
GSE111631 <- getGEO("GSE111631")
GSE111631_beta <- exprs(GSE111631$GSE111631_series_matrix.txt.gz)library(EpiDISH) # Bioconductor package
data(centEpiFibIC.m) # Reference for fibroblasts, epithelial cells, and immune cells
data(centBloodSub.m) # Reference for immune cell subtypes
buccal_ct <- hepidish(beta.m = GSE137495_beta, # replace this with your beta matrix
ref1.m = centEpiFibIC.m,
ref2.m = centBloodSub.m,
h.CT.idx = 3, # Specify the column in ref1.m to which ref2.m applies
method = "RPC")
head(buccal_ct) Epi Fib B NK CD4T CD8T Mono
GSM4080206 0.9792600 0 0.005029510 0.004739591 0.002862252 0 0.003456092
GSM4080207 0.9786308 0 0.005229510 0.004365611 0.002950365 0 0.003858183
GSM4080208 0.9679868 0 0.007965030 0.007060661 0.004377309 0 0.004852714
GSM4080209 0.9676823 0 0.007781069 0.006783387 0.004897736 0 0.005166767
GSM4080210 0.9777751 0 0.005507455 0.005045903 0.003400369 0 0.003151569
GSM4080211 0.9717333 0 0.006719373 0.006479177 0.003711909 0 0.004261341
Neutro Eosino
GSM4080206 0.003187218 0.001465325
GSM4080207 0.003562419 0.001403089
GSM4080208 0.006183435 0.001574031
GSM4080209 0.005752227 0.001936477
GSM4080210 0.003168665 0.001950987
GSM4080211 0.005424045 0.001670821
saliva_ct <- hepidish(beta.m = GSE111631_beta, # replace this with your beta matrix
ref1.m = centEpiFibIC.m,
ref2.m = centBloodSub.m,
h.CT.idx = 3, # Specify the column in ref1.m to which ref2.m applies
method = "RPC")
head(saliva_ct) Epi Fib B NK CD4T CD8T
GSM3035989 0.09911421 0.01009859 0.0007254532 0.002567000 0.000000000 0
GSM3035990 0.11942556 0.01093509 0.0186068286 0.035274308 0.014675686 0
GSM3035991 0.73068579 0.00000000 0.0347779815 0.039622319 0.024404079 0
GSM3035992 0.12086098 0.01955833 0.0072313044 0.018155891 0.005109280 0
GSM3035993 0.13408641 0.02022015 0.0106236185 0.014948171 0.003894098 0
GSM3035994 0.11632987 0.01411160 0.0145979708 0.008708027 0.000000000 0
Mono Neutro Eosino
GSM3035989 0.005690285 0.8818045 0
GSM3035990 0.028107167 0.7729754 0
GSM3035991 0.048866402 0.1216434 0
GSM3035992 0.036033861 0.7930504 0
GSM3035993 0.024966963 0.7912606 0
GSM3035994 0.032935647 0.8133169 0
# visualization of buccal and saliva cell type distribution
library(ggplot2)
buccal_plt <- reshape2::melt(buccal_ct)
ggplot(buccal_plt, aes(Var2, value, color = Var2)) +
geom_boxplot() +
geom_point(position = position_jitter()) +
theme_bw() +
labs(x = "cell type",
y = "cell type proportion (%)",
title = "Buccal cell type proportion") # Buccal swab is mostly composed of epithelial cellssaliva_plt <- reshape2::melt(saliva_ct)
ggplot(saliva_plt, aes(Var2, value, color = Var2)) +
geom_boxplot() +
geom_point(position = position_jitter()) +
theme_bw() +
labs(x = "cell type",
y = "cell type proportion (%)",
title = "Saliva cell type proportion") # Saliva samples typically have a more heterogeneous distribution of epithelial cells and neutrophils compared to buccal swabsExample: Estimating cell type proportion in the brain
# load example brain datasets
library(GEOquery) # Bioconductor package
# Example brain dataset GSE168916
# Download the raw data at https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE168916&format=file
# untar compressed file and unzip the idat files within
GSE168916 <- read.metharray.exp("GSE168916_RAW/")library(HiBED) # install with devtools::install_github("SalasLab/HiBED")
library(minfi) # Bioconductor package
library(wateRmelon) # Bioconductor package
# Normalization
brain_Mset <- preprocessNoob(GSE168916) # noob normalization for color correction
brain_betas <- BMIQ(brain_Mset) # alternatively, do getBeta(brain_Mset) to extract beta matrix without BMIQ
# Brain ISCH prediction
brain_ct <- HiBED_deconvolution(brain_betas, h = 2) # 2 layers of deconvolution yields 7 brain cell types# visualization of brain cell type distribution
library(ggplot2)
brain_plt <- reshape2::melt(brain_ct)
ggplot(brain_plt, aes(variable, value, color = variable)) +
geom_boxplot() +
geom_point(position = position_jitter()) +
theme_bw() +
labs(x = "cell type",
y = "cell type proportion (%)",
title = "Brain cell type proportion") Reference-free ISCH estimation
Example 1: Buccal dataset GSE137495
library(GEOquery) # Bioconductor package
library(sva) # Bioconductor package
library(impute) # Bioconductor package
# Self-defined function to suppress unnecessary verbose output
quiet <- function(x) {
sink(tempfile())
on.exit(sink())
invisible(force(x))
}
# Using buccal dataset GSE137495 as example
GSE137495 <- getGEO("GSE137495")
yourBetaMatrix <- exprs(GSE137495$GSE137495_series_matrix.txt.gz)
yourPhenoData <- pData(GSE137495$GSE137495_series_matrix.txt.gz)
yourPhenoData <- yourPhenoData[, c(34, 36)]
colnames(yourPhenoData) <- c("Age", "Sex")
yourPhenoData$Age <- as.numeric(yourPhenoData$Age)
identical(colnames(yourBetaMatrix), rownames(yourPhenoData)) # TRUE[1] TRUE
# Running SVA
yourBetaMatrix <- quiet(impute.knn(yourBetaMatrix)$data)
mod <- model.matrix(~as.factor(Sex) + Age, data = yourPhenoData) # For mainVar, enter the column name in yourPhenoData.
mod0 <- model.matrix(~Age, data = yourPhenoData)
yourBetaMatrix <- yourBetaMatrix[, rownames(mod)] # The model.matrix function removes samples with missing values, so ensure that the same samples are included in both data frames
(n.sv <- num.sv(yourBetaMatrix, mod, method = "leek")) # No SV identified[1] 0
No relevant SV was identified! As we saw before, the buccal cell type proportion is actually quite homogenous in this dataset (i.e. mostly epithelial cells), and SVA result suggest that ISCH and other contributors to DNAme variability does not yield sufficient heterogeneity to drive a SV.
Example 2: Buccal dataset GSE80261
library(GEOquery) # Bioconductor package
# Using buccal dataset GSE80261 as example
GSE80261 <- getGEO("GSE80261")
yourBetaMatrix <- exprs(GSE80261$GSE80261_series_matrix.txt.gz)
yourPhenoData <- pData(GSE80261$GSE80261_series_matrix.txt.gz)
yourPhenoData <- yourPhenoData[, c(41, 44:45, 50)]
colnames(yourPhenoData) <- c("Age", "Disease_Status", "Disease_Subgroup", "Sex") # Just a few examples of potentially relevant variables
identical(colnames(yourBetaMatrix), rownames(yourPhenoData)) # TRUE[1] TRUE
# Recode some variables
yourPhenoData$Age <- as.numeric(yourPhenoData$Age)
yourPhenoData$Disease_Status <- as.factor(yourPhenoData$Disease_Status)
yourPhenoData$Disease_Subgroup <- as.factor(yourPhenoData$Disease_Subgroup)
yourPhenoData$Sex <- as.factor(yourPhenoData$Sex)library(sva) # Bioconductor package
library(impute) # Bioconductor package
library(dplyr) # CRAN package
library(EpiDISH)
# impute missing values
yourBetaMatrix <- quiet(impute.knn(yourBetaMatrix)$data)
# Running SVA
mod <- model.matrix(~Disease_Status + Sex + Age, data = yourPhenoData)
# Say if we are interested in the main effect of Disease Status
mod0 <- model.matrix(~Sex + Age, data = yourPhenoData) # Account for covariates that might contribute to DNAme variability but we are not interested in examining directly
yourBetaMatrix <- yourBetaMatrix[, rownames(mod)] # The model.matrix function removes samples with missing values, so ensure that the same samples are included in both data frames
(n.sv <- num.sv(yourBetaMatrix, mod, method = "leek")) # 1 SV estimated[1] 1
svaobj <- sva(yourBetaMatrix, mod, mod0, n.sv = n.sv)Number of significant surrogate variables is: 1
Iteration (out of 5 ):1 2 3 4 5
sv <- as.data.frame(svaobj$sv)
head(sv) V1
1 -0.01684039
2 -0.01895156
3 -0.02337255
4 -0.01471983
5 -0.02301552
6 0.01769994
# See if the SVs reflects estimated cell type proportions / other variables
yourCT <- hepidish(beta.m = yourBetaMatrix,
ref1.m = centEpiFibIC.m,
ref2.m = centBloodSub.m,
h.CT.idx = 3,
method = "RPC")
yourPhenoData <- cbind(yourPhenoData, yourCT)
# Fit a linear model of relevant meta data + estimated cell type proportion to see if any SV significantly
colnames(sv) <- paste0("SV", 1)
lm1 <- lm(sv$SV1 ~ ., data = yourPhenoData)
summary(lm1) # Significantly associated with FASD subgroup, row effect, and some cell types
Call:
lm(formula = sv$SV1 ~ ., data = yourPhenoData)
Residuals:
Min 1Q Median 3Q Max
-0.07227 -0.01543 -0.00383 0.01007 0.31947
Coefficients: (2 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -8.591e-01 1.037e+00 -0.829 0.408195
Age 5.647e-04 7.066e-04 0.799 0.425164
Disease_StatusFASD 4.349e-03 5.945e-03 0.731 0.465342
Disease_SubgroupControl NA NA NA NA
Disease_SubgroupFAS -1.100e-02 1.189e-02 -0.925 0.356171
Disease_SubgroupPAE 4.544e-03 8.303e-03 0.547 0.584835
Disease_SubgrouppFAS 1.553e-02 9.151e-03 1.697 0.091238 .
Disease_SubgrouppFASD 1.803e-01 3.483e-02 5.176 5.49e-07 ***
SexMale -6.040e-03 4.832e-03 -1.250 0.212776
Epi 8.453e-01 1.039e+00 0.813 0.417049
Fib 5.076e+00 1.334e+00 3.805 0.000188 ***
B -4.449e+00 1.552e+00 -2.867 0.004583 **
NK 4.865e-01 1.458e+00 0.334 0.738987
CD4T 6.703e+00 1.450e+00 4.622 6.80e-06 ***
CD8T 3.853e+02 7.288e+02 0.529 0.597650
Mono 3.721e+00 1.430e+00 2.603 0.009933 **
Neutro 6.723e-01 1.012e+00 0.664 0.507362
Eosino NA NA NA NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.03386 on 200 degrees of freedom
Multiple R-squared: 0.7707, Adjusted R-squared: 0.7535
F-statistic: 44.8 on 15 and 200 DF, p-value: < 2.2e-16
As one see in example 2, SV1 is significantly associated with multiple cell types. Certainly, for tissues where reference-free ISCH estimation algorithms are used, there is typically no applicable reference-based method. This example intents to show that SVs can be associated with a variety of factors, including FASD subgroups and ISCH in this example. As buccal swabs are mostly composed of epithelial and neutrophils, 1 SVs can imaginably capture the variability in ISCH.
Comparing DNAme variance explained by ISCH estimation methods
Example: Comparing SVA and MeDeCom output with dataset GSE80261
For this section, we will continue to use buccal dataset GSE80261 as an example, with the imputed yourBetaMatrix, recoded yourPhenoData, and SVA results from the code chuncks from the previous section ([Example 2: Buccal dataset GSE80261])
library(MeDeCom) # Install with devtools::install_github("lutsik/MeDeCom")
library(dplyr) # CRAN package
# Running MeDeCom - Takes very long
mdc_output <- runMeDeCom(yourBetaMatrix,
Ks = 2:4, # Potential number of unique cell types - just an example, buccal swabs can have more than 4 cell types
lambdas = c(0, 10e-4, 10e-2), # Regularization parameter, MeDeCom vignette recommended starting with a logarithmic grid of c(0, 10^(-5:-1))
NINIT = 5, # Reduced from default to save time
NFOLDS = 5, # Reduced from default to save time
ITERMAX = 20) # Reduced from default to save time[Main:] checking inputs
[Main:] preparing data
[Main:] preparing jobs
[Main:] 54 factorization runs in total
[Main:] finished all jobs. Creating the object
plotParameters(mdc_output)plotParameters(mdc_output, K = 4, lambdaScale = "log")mdc <- getProportions(mdc_output, K = 4, lambda = 0) %>%
t() %>%
as.data.frame() # Output
head(mdc) LMC1 LMC2 LMC3 LMC4
1 0.00000000 0.61039016 0.003036707 0.3865731
2 0.00000000 0.49633020 0.246828384 0.2568414
3 0.05157431 0.00000000 0.197102024 0.7513237
4 0.00000000 0.53611011 0.167376124 0.2965138
5 0.00000000 0.43653418 0.348261365 0.2152045
6 0.07745216 0.09254265 0.458591803 0.3714134
# Comparing SVA and MeDeCom output in variance explained
summaryR2 <- apply(yourBetaMatrix, 1, function(cpg){
mod1 <- lm(formula = cpg ~ SV1, data = sv) # predicted ISCH from method 1
mod2 <- lm(formula = cpg ~ LMC1 + LMC2 + LMC3 + LMC4, data = mdc) # predicted ISCH from method 2
return(c(sva_R2 = summary(mod1)$adj.r.squared, mdc_R2 = summary(mod2)$adj.r.squared))
}) %>% t() %>% as.data.frame()
(sva_R2_avg <- mean(summaryR2$sva_R2))[1] 0.2902967
(mdc_R2_avg <- mean(summaryR2$mdc_R2))[1] 0.3971034
Overall, The average R2 of the sva and mdc-estimated ISCH are 0.29 and 0.40, respectively, suggesting the MeDeCom estimation explained more variance in the DNAme data. This is unsurprising, as we have accounted for sex and age in the model when conducting SVA, thereby removing variability associated with those variables. This is simply one way to guide the readers in determining which reference-free method to employ.
Accounting for intersample differences in cellular composition
Calculating ISCH principal components (PCs)
We will use the blood_ct object created from the dataset GSE112618 in the previous section to use as an example ([Example: Estimating cell type proportions in blood]).
pca <- princomp(blood_ct) # replace with your data
pca_var <- cumsum(pca$sdev^2 / sum(pca$sdev^2)) # cumulative variance explained by PCs
names(pca_var)[pca_var > 0.9][1] # the PC at which cumulative variance exceeds 90%[1] "Comp.2"
yourISCHpcs <- as.data.frame(pca$scores) # the ISCH PCs
colnames(yourISCHpcs) <- paste0("PC", 1:ncol(yourISCHpcs))
head(yourISCHpcs) PC1 PC2 PC3
GSM3074480_202163530006_R02C01 -0.07312462 -0.07594768 -0.020262875
GSM3074481_202163530006_R03C01 0.09014307 0.01292521 0.003598693
GSM3074482_202163530006_R04C01 -0.15111674 0.01190744 0.005200124
GSM3074483_202163530006_R06C01 0.17945334 -0.06015121 0.013455390
GSM3074484_202163530006_R07C01 -0.16443379 0.02601105 0.011279134
GSM3074485_202163530006_R08C01 0.11907874 0.08525519 -0.013270467
PC4 PC5 PC6
GSM3074480_202163530006_R02C01 0.001351717 0.0001301589 -1.214306e-17
GSM3074481_202163530006_R03C01 0.014888436 -0.0003233259 7.632783e-17
GSM3074482_202163530006_R04C01 -0.006082752 -0.0007880339 -3.122502e-17
GSM3074483_202163530006_R06C01 -0.006630392 0.0001201341 -1.318390e-16
GSM3074484_202163530006_R07C01 0.001949168 0.0007213611 2.081668e-17
GSM3074485_202163530006_R08C01 -0.005476176 0.0001397057 -3.469447e-18
Use robust linear regression and include ISCH information as covariates
For this and the following sections, we will use the yourBetaMatrix object created from the dataset GSE112618 (Obtain beta matrix from RGChannelSet) and the yourISCHpcs calculated in the last section (Calculating ISCH principal components (PCs)). The goal is to account for ISCH in statistical models, such as in the context of EWAS.
library(MASS) # CRAN package
library(sfsmisc) # CRAN package
library(dplyr) # CRAN package
# Obtain metadata file for GSE112618
GSE112618 <- getGEO("GSE112618")
yourPhenoData <- pData(GSE112618$GSE112618_series_matrix.txt.gz)
yourPhenoData <- yourPhenoData[, c(46, 57)]
colnames(yourPhenoData) <- c("Age", "Sex")
# Recode rownames to make sure the two dataset have the same sample ID
yourISCHpcs$ID <- gsub("_.*", "", rownames(yourISCHpcs))
pd <- merge(yourPhenoData, yourISCHpcs, by.x = 0, by.y = "ID") # or yourestimatedISCH here
ewas_result <- apply(yourBetaMatrix, 1, function(y) {
mod <- rlm(formula = y ~ Sex + PC1 + PC2, data = pd)
out <- f.robftest(mod, var = 2) # The var = 2 argument refers to the variable you want to examine. In this case, we are looking at the effect of Sex (second variable in the model after the intercept, PC1 would be 3, and PC2 would be 4)
return(c(mod$coefficients[2], out$p.value))
}) %>% t()
colnames(ewas_result) <- c("coefficient", "pvalue")
head(ewas_result) coefficient pvalue
cg14817997 -0.022940507 0.6351950
cg26928153 0.003617166 0.1451366
cg16269199 -0.037336833 0.4149547
cg13869341 -0.035000389 0.7699616
cg14008030 -0.068738492 0.1366654
cg12045430 0.001736642 0.9681647
Adjusting for ISCH by residualization
library(dplyr) # CRAN package
residuals <- apply(yourBetaMatrix, 1, function(x){
x <- as.numeric(x)
mod <- lm(x ~ PC1 + PC2, data = yourISCHpcs) # or replace with yourestimatedISCH
return(residuals(summary(mod))) # The residual represents the variance not associated with ISCH
}) %>% t()
adj.residuals <- residuals + matrix(apply(yourBetaMatrix, 1, mean), nrow = nrow(residuals), ncol = ncol(residuals))
# Add the residuals back to the mean DNAmDNAme level of each methylation site to obtain an adjusted beta matrix.
adj.residuals[adj.residuals <= 0] <- 0.0001 # Adjust the “beta values” such that they are bounded by 0 and 1
adj.residuals[adj.residuals > 1] <- 0.9999
adj.residuals[1:5, 1:5] GSM3074480_202163530006_R02C01 GSM3074481_202163530006_R03C01
cg14817997 0.8071082 0.8507052
cg26928153 0.9198204 0.9237327
cg16269199 0.7915424 0.8274217
cg13869341 0.8120545 0.8310354
cg14008030 0.5931829 0.5996638
GSM3074482_202163530006_R04C01 GSM3074483_202163530006_R06C01
cg14817997 0.8483628 0.8542510
cg26928153 0.9215760 0.9183272
cg16269199 0.8262990 0.8427078
cg13869341 0.9201039 0.9389554
cg14008030 0.5459372 0.5873481
GSM3074484_202163530006_R07C01
cg14817997 0.8501595
cg26928153 0.9177988
cg16269199 0.8425308
cg13869341 0.8951252
cg14008030 0.6273289
Cell-type specific DNAme associations
We will use the hannum.chr22.RData dataset that is a part of the TCA vignette as an example.
# download.file("https://github.com/cozygene/TCA/raw/master/vignettes/hannum.chr22.RData",
# "hannum.chr22.RData")
load("hannum.chr22.RData")
yourBetaMatrix <- hannum$X
yourEstimatedISCH <- hannum$W
yourPhenoData <- as.data.frame(hannum$cov)
covariatesassociatedwithISCH <- hannum$cov[,c("gender","age")]
covariatesnotassociatedwithISCH <- hannum$cov[,3:ncol(hannum$cov)]CellDMC
library(EpiDISH) # Bioconductor package
mod <- model.matrix(~., data = yourPhenoData[, -2]) # Create a model matrix of relevant covariates that are not the estimated ISCH or the main variable of interest.
system.time(dmc_output <- CellDMC(yourBetaMatrix,
yourPhenoData$gender, # mainVar
yourEstimatedISCH,
cov.mod = mod)) user system elapsed
34.09 0.22 38.45
# user system elapsed #5584 CpGs, 656 samples, 6 cell types, 20 covarites
# 32.36 0.14 34.73
# user system elapsed #2750 CpGs, 656 samples, 6 cell types, 20 covarites
# 15.76 0.19 17.04
# user system elapsed #5584 CpGs, 656 samples, 3 cell types, 20 covarites
# 25.72 0.17 27.29
dmc_hits <- dmc_output$dmct[rowSums(dmc_output$dmct) != 0, ] # Number of differentially methylation sites in each cell type
head(dmc_output$coe$Gran) # Differential methylation in each cell type Estimate SE t p adjP
cg00004775 -0.0149047410 0.007093100 -2.10130151 0.03601430 0.5027596
cg00012194 -0.0116518568 0.006230175 -1.87022938 0.06191917 0.5585730
cg00013618 -0.0133000781 0.006895574 -1.92878474 0.05421010 0.5449048
cg00014104 -0.0007350321 0.011508978 -0.06386597 0.94909737 0.9886329
cg00014733 -0.0050587269 0.004133598 -1.22380706 0.22148617 0.7466424
cg00021762 -0.0028660806 0.005670997 -0.50539273 0.61346120 0.9171972
CEDAR
library(TOAST) # Bioconductor package
system.time(cedar_output <- cedar(Y_raw = yourBetaMatrix,
prop = yourEstimatedISCH,
design.1 = yourPhenoData,
factor.to.test = 'gender',
cutoff.tree = c('pval', 0.01),
cutoff.prior.prob = c('pval', 0.01))) user system elapsed
6.30 0.35 249.14
# user system elapsed #5584 CpGs, 656 samples, 6 cell types, 20 covarites
# 6.39 0.53 244.70
# user system elapsed #2750 CpGs, 656 samples, 6 cell types, 20 covarites
# 5.23 0.17 116.27
# user system elapsed #5584 CpGs, 656 samples, 3 cell types, 20 covarites
# 1.39 0.12 42.92
cedar_csDM <- cedar_output$toast_res
head(cedar_csDM$Gran) # average csDNAme pattern (e.g. Granulocytes) and significance of their association with the main variable of interest beta beta_var mu effect_size f_statistics
cg00004775 -0.023328311 5.898761e-05 2.379476e-04 2.0416494 9.2258373
cg00012194 -0.011865475 4.869023e-05 2.730965e-03 3.7059087 2.8915351
cg00013618 -0.014048728 5.430235e-05 -2.509204e-04 1.9310212 3.6345897
cg00014104 0.004848781 1.752127e-04 1.391375e-03 1.2707230 0.1341836
cg00014733 -0.005585691 2.173714e-05 -5.556374e-05 1.9609861 1.4353288
cg00021762 -0.003272513 3.875428e-05 -3.721372e-03 0.6108138 0.2763396
p_value fdr
cg00004775 0.002503756 0.2367091
cg00012194 0.089632681 0.5700557
cg00013618 0.057131086 0.5184443
cg00014104 0.714279370 0.9425295
cg00014733 0.231432218 0.7321912
cg00021762 0.599330348 0.9079761
TCA
library(TCA) # CRAN package
system.time(tca_output <- tca(X = yourBetaMatrix,
W = yourEstimatedISCH,
C1 = covariatesassociatedwithISCH,
C2 = covariatesnotassociatedwithISCH))INFO [2024-09-09 03:36:16] Starting tca...
INFO [2024-09-09 03:36:16] Validating input...
INFO [2024-09-09 03:36:16] Fitting the TCA model...
INFO [2024-09-09 03:36:16] Fitting means and variances...
INFO [2024-09-09 03:36:16] Iteration 1 out of 10 internal iterations...
INFO [2024-09-09 03:36:29] Iteration 2 out of 10 internal iterations...
INFO [2024-09-09 03:37:25] Internal loop converged.
INFO [2024-09-09 03:37:25] Calculate p-values for deltas and gammas.
INFO [2024-09-09 03:39:33] Finished tca.
user system elapsed
146.60 13.95 197.09
# user system elapsed #5584 CpGs, 656 samples, 6 cell types, 20 covarites
# 125.93 10.11 141.41
# user system elapsed #2750 CpGs, 656 samples, 6 cell types, 20 covarites
# 60.25 5.04 70.15
# user system elapsed #5584 CpGs, 656 samples, 3 cell types, 20 covarites
# 105.45 8.83 119.55
tca_csDM_pvals <- tca_output$gammas_hat_pvals
head(tca_csDM_pvals) # Cell type specific association for variables in C1 Gran.gender Gran.age CD4T.gender CD4T.age CD8T.gender
cg00004775 0.01552160 0.10275372 0.0195148291 0.02920576 0.5569767
cg00012194 0.07422961 0.65399233 0.1434474604 0.69579298 0.2122048
cg00013618 0.01660231 0.63997707 0.0608209658 0.92497303 0.1859385
cg00014104 0.99785537 0.03982474 0.0001170947 0.06019540 0.1023940
cg00014733 0.25631255 0.48753551 0.3724006480 0.72632986 0.3381321
cg00021762 0.53972884 0.18113787 0.9476537316 0.88073282 0.3553992
CD8T.age Mono.gender Mono.age B.gender B.age NK.gender
cg00004775 0.99997695 0.1010739 0.41377483 0.12673818 0.00164447 0.59677977
cg00012194 0.58781425 0.1792314 0.95562662 0.10699493 0.68870392 0.69561928
cg00013618 0.04436382 0.9338547 0.05160316 0.07080298 0.14862531 0.03458348
cg00014104 0.48249905 0.2226695 0.01339042 0.45911802 0.66604635 0.61364472
cg00014733 0.16615101 0.4085354 0.12571285 0.89589376 0.32641612 0.26822161
cg00021762 0.89933647 0.4043427 0.18057989 0.41242861 0.60200087 0.75996115
NK.age
cg00004775 0.6333288
cg00012194 0.1016116
cg00013618 0.7210515
cg00014104 0.2708199
cg00014733 0.2456464
cg00021762 0.1764015