Accounting for ISCH - example code

Set up the analysis environment

In the code chunks throughout the document, the required R packages will be listed. They can be installed into your R environment with the following code, depending on whether they are CRAN or Bioconductor packages.

# Install CRAN packages
install.packages("dplyr")

# Install Bioconductor packages
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("minfi")

Example preprocessing pipeline

Start from raw IDAT files

library(minfi)
library(dplyr)

# Download the raw data at https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE112618&format=file
# untar with tar -xvf GSE112618_RAW.tar
# unzip the idat files within with gzip -d *.gz
yourRGChannelSet <- read.metharray.exp("GSE112618_RAW") 
##################### For EPIC v2 only #####################
# Download EPIC v2 manifest
# BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2manifest")
# BiocManager::install("jokergoo/IlluminaHumanMethylationEPICv2anno.20a1.hg38")
# reassign the annotation so that the minfi functions getBeta(), preprocessRaw() and others like preprocessNoob() will work
library(IlluminaHumanMethylationEPICv2manifest)
library(IlluminaHumanMethylationEPICv2anno.20a1.hg38)
annotation(yourRGChannelSet)["array"] <- "IlluminaHumanMethylationEPICv2"
annotation(yourRGChannelSet)["annotation"] <- "20a1.hg38"
(manifest <- getManifest(yourRGChannelSet))

Obtain beta matrix from RGChannelSet

# Minimal preprocessing - only background color correction and inter-sample normalization
yourBetaMatrix <- preprocessFunnorm(yourRGChannelSet) %>% 
    getBeta()
# See Table 1 for what input is required for each cell type prediction method

Intersample cellular heterogeneity (ISCH) estimation

Reference-based ISCH estimation

Example: Estimating cell type proportions in blood

We will continue to use the yourRGChannelSet dataset created in the previous section (Start from raw IDAT files)

library(FlowSorted.Blood.EPIC)              # Bioconductor package
library(ExperimentHub)                      # Bioconductor package
hub <- ExperimentHub()
FlowSorted.Blood.EPIC <- hub[["EH1136"]]    # Loading reference data set

ecc_output <- estimateCellCounts2(yourRGChannelSet,
                                  processMethod = "preprocessNoob",
                                  probeSelect = "auto",
                                  cellTypes = c("CD8T", "CD4T", "NK", "Bcell", "Mono", "Neu"),
                                  referencePlatform = "IlluminaHumanMethylationEPIC",
                                  referenceset = "FlowSorted.Blood.EPIC",  # Specify the reference data set.
                                  IDOLOptimizedCpGs = NULL,
                                  returnAll = T)
# This function is actually not compatible with EPICv2 dataset for the moment (Aug 2024)

blood_ct <- ecc_output$prop
head(blood_ct)
                                 CD8T   CD4T     NK  Bcell   Mono    Neu
GSM3074480_202163530006_R02C01 0.0739 0.1900 0.0472 0.0370 0.0940 0.6099
GSM3074481_202163530006_R03C01 0.1729 0.1648 0.0911 0.0765 0.0909 0.4637
GSM3074482_202163530006_R04C01 0.0926 0.0929 0.0473 0.0581 0.0917 0.6753
GSM3074483_202163530006_R06C01 0.1655 0.2511 0.0725 0.0973 0.0918 0.3889
GSM3074484_202163530006_R07C01 0.0994 0.0786 0.0512 0.0579 0.0848 0.6882
GSM3074485_202163530006_R08C01 0.2050 0.1102 0.1058 0.0870 0.1266 0.4297
# visualization of blood cell type distribution
library(ggplot2)
blood_plt <- reshape2::melt(blood_ct)
ggplot(blood_plt, aes(Var2, value, color = Var2)) + 
    geom_boxplot() + 
    geom_point(position = position_jitter()) + 
    theme_bw() + 
    labs(x = "cell type", 
         y = "cell type proportion (%)", 
         title = "Blood cell type proportion") 

Example: Estimating cell type proportion in buccal swabs or saliva

# load example buccal and saliva datasets
library(GEOquery)       # Bioconductor package

# Example buccal dataset GSE137495
GSE137495 <- getGEO("GSE137495")
GSE137495_beta <- exprs(GSE137495$GSE137495_series_matrix.txt.gz)

# Example saliva dataset GSE111631
GSE111631 <- getGEO("GSE111631")
GSE111631_beta <- exprs(GSE111631$GSE111631_series_matrix.txt.gz)
library(EpiDISH)        # Bioconductor package
data(centEpiFibIC.m)    # Reference for fibroblasts, epithelial cells, and immune cells
data(centBloodSub.m)    # Reference for immune cell subtypes
buccal_ct <- hepidish(beta.m = GSE137495_beta, # replace this with your beta matrix
                      ref1.m = centEpiFibIC.m,
                      ref2.m = centBloodSub.m,
                      h.CT.idx = 3, # Specify the column in ref1.m to which ref2.m applies
                      method = "RPC")
head(buccal_ct)
                 Epi Fib           B          NK        CD4T CD8T        Mono
GSM4080206 0.9792600   0 0.005029510 0.004739591 0.002862252    0 0.003456092
GSM4080207 0.9786308   0 0.005229510 0.004365611 0.002950365    0 0.003858183
GSM4080208 0.9679868   0 0.007965030 0.007060661 0.004377309    0 0.004852714
GSM4080209 0.9676823   0 0.007781069 0.006783387 0.004897736    0 0.005166767
GSM4080210 0.9777751   0 0.005507455 0.005045903 0.003400369    0 0.003151569
GSM4080211 0.9717333   0 0.006719373 0.006479177 0.003711909    0 0.004261341
                Neutro      Eosino
GSM4080206 0.003187218 0.001465325
GSM4080207 0.003562419 0.001403089
GSM4080208 0.006183435 0.001574031
GSM4080209 0.005752227 0.001936477
GSM4080210 0.003168665 0.001950987
GSM4080211 0.005424045 0.001670821
saliva_ct <- hepidish(beta.m = GSE111631_beta, # replace this with your beta matrix
                      ref1.m = centEpiFibIC.m,
                      ref2.m = centBloodSub.m,
                      h.CT.idx = 3, # Specify the column in ref1.m to which ref2.m applies
                      method = "RPC")
head(saliva_ct)
                  Epi        Fib            B          NK        CD4T CD8T
GSM3035989 0.09911421 0.01009859 0.0007254532 0.002567000 0.000000000    0
GSM3035990 0.11942556 0.01093509 0.0186068286 0.035274308 0.014675686    0
GSM3035991 0.73068579 0.00000000 0.0347779815 0.039622319 0.024404079    0
GSM3035992 0.12086098 0.01955833 0.0072313044 0.018155891 0.005109280    0
GSM3035993 0.13408641 0.02022015 0.0106236185 0.014948171 0.003894098    0
GSM3035994 0.11632987 0.01411160 0.0145979708 0.008708027 0.000000000    0
                  Mono    Neutro Eosino
GSM3035989 0.005690285 0.8818045      0
GSM3035990 0.028107167 0.7729754      0
GSM3035991 0.048866402 0.1216434      0
GSM3035992 0.036033861 0.7930504      0
GSM3035993 0.024966963 0.7912606      0
GSM3035994 0.032935647 0.8133169      0
# visualization of buccal and saliva cell type distribution
library(ggplot2)
buccal_plt <- reshape2::melt(buccal_ct)
ggplot(buccal_plt, aes(Var2, value, color = Var2)) + 
    geom_boxplot() + 
    geom_point(position = position_jitter()) + 
    theme_bw() + 
    labs(x = "cell type", 
         y = "cell type proportion (%)", 
         title = "Buccal cell type proportion") # Buccal swab is mostly composed of epithelial cells

saliva_plt <- reshape2::melt(saliva_ct)
ggplot(saliva_plt, aes(Var2, value, color = Var2)) + 
    geom_boxplot() + 
    geom_point(position = position_jitter()) + 
    theme_bw() + 
    labs(x = "cell type", 
         y = "cell type proportion (%)", 
         title = "Saliva cell type proportion") # Saliva samples typically have a more heterogeneous distribution of epithelial cells and neutrophils compared to buccal swabs

Example: Estimating cell type proportion in the brain

# load example brain datasets
library(GEOquery)       # Bioconductor package

# Example brain dataset GSE168916
# Download the raw data at https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE168916&format=file
# untar compressed file and unzip the idat files within
GSE168916 <- read.metharray.exp("GSE168916_RAW/")
library(HiBED)                  # install with devtools::install_github("SalasLab/HiBED")
library(minfi)                  # Bioconductor package
library(wateRmelon)             # Bioconductor package

# Normalization
brain_Mset <- preprocessNoob(GSE168916)   # noob normalization for color correction
brain_betas <- BMIQ(brain_Mset)           # alternatively, do getBeta(brain_Mset) to extract beta matrix without BMIQ
# Brain ISCH prediction
brain_ct <- HiBED_deconvolution(brain_betas, h = 2) # 2 layers of deconvolution yields 7 brain cell types
# visualization of brain cell type distribution
library(ggplot2)
brain_plt <- reshape2::melt(brain_ct)
ggplot(brain_plt, aes(variable, value, color = variable)) + 
    geom_boxplot() + 
    geom_point(position = position_jitter()) + 
    theme_bw() + 
    labs(x = "cell type", 
         y = "cell type proportion (%)", 
         title = "Brain cell type proportion") 

Reference-free ISCH estimation

Example 1: Buccal dataset GSE137495

library(GEOquery)               # Bioconductor package
library(sva)                    # Bioconductor package
library(impute)                 # Bioconductor package

# Self-defined function to suppress unnecessary verbose output
quiet <- function(x) { 
  sink(tempfile()) 
  on.exit(sink()) 
  invisible(force(x)) 
} 

# Using buccal dataset GSE137495 as example
GSE137495 <- getGEO("GSE137495")
yourBetaMatrix <- exprs(GSE137495$GSE137495_series_matrix.txt.gz)
yourPhenoData <- pData(GSE137495$GSE137495_series_matrix.txt.gz)
yourPhenoData <- yourPhenoData[, c(34, 36)]
colnames(yourPhenoData) <- c("Age", "Sex")
yourPhenoData$Age <- as.numeric(yourPhenoData$Age)
identical(colnames(yourBetaMatrix), rownames(yourPhenoData)) # TRUE
[1] TRUE
# Running SVA
yourBetaMatrix <- quiet(impute.knn(yourBetaMatrix)$data)
mod <- model.matrix(~as.factor(Sex) + Age, data = yourPhenoData)    # For mainVar,  enter the column name in yourPhenoData. 
mod0 <- model.matrix(~Age, data = yourPhenoData)
yourBetaMatrix <- yourBetaMatrix[, rownames(mod)]   # The model.matrix function removes samples with missing values, so ensure that the same samples are included in both data frames
(n.sv <- num.sv(yourBetaMatrix, mod, method = "leek"))  # No SV identified
[1] 0

No relevant SV was identified! As we saw before, the buccal cell type proportion is actually quite homogenous in this dataset (i.e. mostly epithelial cells), and SVA result suggest that ISCH and other contributors to DNAme variability does not yield sufficient heterogeneity to drive a SV.

Example 2: Buccal dataset GSE80261

library(GEOquery)               # Bioconductor package

# Using buccal dataset GSE80261 as example
GSE80261 <- getGEO("GSE80261")
yourBetaMatrix <- exprs(GSE80261$GSE80261_series_matrix.txt.gz)
yourPhenoData <- pData(GSE80261$GSE80261_series_matrix.txt.gz)
yourPhenoData <- yourPhenoData[, c(41, 44:45, 50)]
colnames(yourPhenoData) <- c("Age", "Disease_Status", "Disease_Subgroup", "Sex") # Just a few examples of potentially relevant variables
identical(colnames(yourBetaMatrix), rownames(yourPhenoData)) # TRUE
[1] TRUE
# Recode some variables
yourPhenoData$Age <- as.numeric(yourPhenoData$Age)
yourPhenoData$Disease_Status <- as.factor(yourPhenoData$Disease_Status)
yourPhenoData$Disease_Subgroup <- as.factor(yourPhenoData$Disease_Subgroup)
yourPhenoData$Sex <- as.factor(yourPhenoData$Sex)
library(sva)                    # Bioconductor package
library(impute)                 # Bioconductor package
library(dplyr)                  # CRAN package    
library(EpiDISH)

# impute missing values
yourBetaMatrix <- quiet(impute.knn(yourBetaMatrix)$data)

# Running SVA
mod <- model.matrix(~Disease_Status + Sex + Age, data = yourPhenoData)  
# Say if we are interested in the main effect of Disease Status
mod0 <- model.matrix(~Sex + Age, data = yourPhenoData) # Account for covariates that might contribute to DNAme variability but we are not interested in examining directly
yourBetaMatrix <- yourBetaMatrix[, rownames(mod)]   # The model.matrix function removes samples with missing values, so ensure that the same samples are included in both data frames
(n.sv <- num.sv(yourBetaMatrix, mod, method = "leek"))  # 1 SV estimated
[1] 1
svaobj <- sva(yourBetaMatrix, mod, mod0, n.sv = n.sv)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
sv <- as.data.frame(svaobj$sv)

head(sv)
           V1
1 -0.01684039
2 -0.01895156
3 -0.02337255
4 -0.01471983
5 -0.02301552
6  0.01769994
# See if the SVs reflects estimated cell type proportions / other variables
yourCT <- hepidish(beta.m = yourBetaMatrix,
                   ref1.m = centEpiFibIC.m,
                   ref2.m = centBloodSub.m,
                   h.CT.idx = 3,
                   method = "RPC")
yourPhenoData <- cbind(yourPhenoData, yourCT)

# Fit a linear model of relevant meta data + estimated cell type proportion to see if any SV significantly 
colnames(sv) <- paste0("SV", 1)
lm1 <- lm(sv$SV1 ~ ., data = yourPhenoData) 
summary(lm1) # Significantly associated with FASD subgroup, row effect, and some cell types

Call:
lm(formula = sv$SV1 ~ ., data = yourPhenoData)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.07227 -0.01543 -0.00383  0.01007  0.31947 

Coefficients: (2 not defined because of singularities)
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -8.591e-01  1.037e+00  -0.829 0.408195    
Age                      5.647e-04  7.066e-04   0.799 0.425164    
Disease_StatusFASD       4.349e-03  5.945e-03   0.731 0.465342    
Disease_SubgroupControl         NA         NA      NA       NA    
Disease_SubgroupFAS     -1.100e-02  1.189e-02  -0.925 0.356171    
Disease_SubgroupPAE      4.544e-03  8.303e-03   0.547 0.584835    
Disease_SubgrouppFAS     1.553e-02  9.151e-03   1.697 0.091238 .  
Disease_SubgrouppFASD    1.803e-01  3.483e-02   5.176 5.49e-07 ***
SexMale                 -6.040e-03  4.832e-03  -1.250 0.212776    
Epi                      8.453e-01  1.039e+00   0.813 0.417049    
Fib                      5.076e+00  1.334e+00   3.805 0.000188 ***
B                       -4.449e+00  1.552e+00  -2.867 0.004583 ** 
NK                       4.865e-01  1.458e+00   0.334 0.738987    
CD4T                     6.703e+00  1.450e+00   4.622 6.80e-06 ***
CD8T                     3.853e+02  7.288e+02   0.529 0.597650    
Mono                     3.721e+00  1.430e+00   2.603 0.009933 ** 
Neutro                   6.723e-01  1.012e+00   0.664 0.507362    
Eosino                          NA         NA      NA       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.03386 on 200 degrees of freedom
Multiple R-squared:  0.7707,    Adjusted R-squared:  0.7535 
F-statistic:  44.8 on 15 and 200 DF,  p-value: < 2.2e-16

As one see in example 2, SV1 is significantly associated with multiple cell types. Certainly, for tissues where reference-free ISCH estimation algorithms are used, there is typically no applicable reference-based method. This example intents to show that SVs can be associated with a variety of factors, including FASD subgroups and ISCH in this example. As buccal swabs are mostly composed of epithelial and neutrophils, 1 SVs can imaginably capture the variability in ISCH.

Comparing DNAme variance explained by ISCH estimation methods

Example: Comparing SVA and MeDeCom output with dataset GSE80261

For this section, we will continue to use buccal dataset GSE80261 as an example, with the imputed yourBetaMatrix, recoded yourPhenoData, and SVA results from the code chuncks from the previous section ([Example 2: Buccal dataset GSE80261])

library(MeDeCom)    # Install with devtools::install_github("lutsik/MeDeCom")
library(dplyr)      # CRAN package

# Running MeDeCom - Takes very long
mdc_output <- runMeDeCom(yourBetaMatrix,
                         Ks = 2:4,                     # Potential number of unique cell types - just an example, buccal swabs can have more than 4 cell types
                         lambdas = c(0, 10e-4, 10e-2), # Regularization parameter, MeDeCom vignette recommended starting with a logarithmic grid of c(0, 10^(-5:-1))
                         NINIT = 5,                    # Reduced from default to save time
                         NFOLDS = 5,                   # Reduced from default to save time
                         ITERMAX = 20)                 # Reduced from default to save time
[Main:] checking inputs
[Main:] preparing data
[Main:] preparing jobs
[Main:] 54 factorization runs in total
[Main:] finished all jobs. Creating the object
plotParameters(mdc_output)

plotParameters(mdc_output, K = 4, lambdaScale = "log")

mdc <- getProportions(mdc_output, K = 4, lambda = 0) %>%
    t() %>% 
    as.data.frame()                                    # Output 

head(mdc)
        LMC1       LMC2        LMC3      LMC4
1 0.00000000 0.61039016 0.003036707 0.3865731
2 0.00000000 0.49633020 0.246828384 0.2568414
3 0.05157431 0.00000000 0.197102024 0.7513237
4 0.00000000 0.53611011 0.167376124 0.2965138
5 0.00000000 0.43653418 0.348261365 0.2152045
6 0.07745216 0.09254265 0.458591803 0.3714134
# Comparing SVA and MeDeCom output in variance explained
summaryR2 <- apply(yourBetaMatrix, 1, function(cpg){
  mod1 <- lm(formula = cpg ~ SV1, data = sv) # predicted ISCH from method 1 
  mod2 <- lm(formula = cpg ~ LMC1 + LMC2 + LMC3 + LMC4, data = mdc) # predicted ISCH from method 2
  return(c(sva_R2 = summary(mod1)$adj.r.squared, mdc_R2 = summary(mod2)$adj.r.squared))
}) %>% t() %>% as.data.frame()
(sva_R2_avg <- mean(summaryR2$sva_R2))
[1] 0.2902967
(mdc_R2_avg <- mean(summaryR2$mdc_R2))
[1] 0.3971034

Overall, The average R2 of the sva and mdc-estimated ISCH are 0.29 and 0.40, respectively, suggesting the MeDeCom estimation explained more variance in the DNAme data. This is unsurprising, as we have accounted for sex and age in the model when conducting SVA, thereby removing variability associated with those variables. This is simply one way to guide the readers in determining which reference-free method to employ.

Accounting for intersample differences in cellular composition

Calculating ISCH principal components (PCs)

We will use the blood_ct object created from the dataset GSE112618 in the previous section to use as an example ([Example: Estimating cell type proportions in blood]).

pca <- princomp(blood_ct)                       # replace with your data
pca_var <- cumsum(pca$sdev^2 / sum(pca$sdev^2)) # cumulative variance explained by PCs
names(pca_var)[pca_var > 0.9][1]                # the PC at which cumulative variance exceeds 90%
[1] "Comp.2"
yourISCHpcs <- as.data.frame(pca$scores)        # the ISCH PCs
colnames(yourISCHpcs) <- paste0("PC", 1:ncol(yourISCHpcs))

head(yourISCHpcs)
                                       PC1         PC2          PC3
GSM3074480_202163530006_R02C01 -0.07312462 -0.07594768 -0.020262875
GSM3074481_202163530006_R03C01  0.09014307  0.01292521  0.003598693
GSM3074482_202163530006_R04C01 -0.15111674  0.01190744  0.005200124
GSM3074483_202163530006_R06C01  0.17945334 -0.06015121  0.013455390
GSM3074484_202163530006_R07C01 -0.16443379  0.02601105  0.011279134
GSM3074485_202163530006_R08C01  0.11907874  0.08525519 -0.013270467
                                        PC4           PC5           PC6
GSM3074480_202163530006_R02C01  0.001351717  0.0001301589 -1.214306e-17
GSM3074481_202163530006_R03C01  0.014888436 -0.0003233259  7.632783e-17
GSM3074482_202163530006_R04C01 -0.006082752 -0.0007880339 -3.122502e-17
GSM3074483_202163530006_R06C01 -0.006630392  0.0001201341 -1.318390e-16
GSM3074484_202163530006_R07C01  0.001949168  0.0007213611  2.081668e-17
GSM3074485_202163530006_R08C01 -0.005476176  0.0001397057 -3.469447e-18

Use robust linear regression and include ISCH information as covariates

For this and the following sections, we will use the yourBetaMatrix object created from the dataset GSE112618 (Obtain beta matrix from RGChannelSet) and the yourISCHpcs calculated in the last section (Calculating ISCH principal components (PCs)). The goal is to account for ISCH in statistical models, such as in the context of EWAS.

library(MASS)                                   # CRAN package
library(sfsmisc)                                # CRAN package
library(dplyr)                                  # CRAN package

# Obtain metadata file for GSE112618
GSE112618 <- getGEO("GSE112618")
yourPhenoData <- pData(GSE112618$GSE112618_series_matrix.txt.gz)
yourPhenoData <- yourPhenoData[, c(46, 57)]
colnames(yourPhenoData) <- c("Age", "Sex")

# Recode rownames to make sure the two dataset have the same sample ID
yourISCHpcs$ID <- gsub("_.*", "", rownames(yourISCHpcs))

pd <- merge(yourPhenoData, yourISCHpcs, by.x = 0, by.y = "ID") # or yourestimatedISCH here
ewas_result <- apply(yourBetaMatrix, 1, function(y) {
    mod <- rlm(formula = y ~ Sex + PC1 + PC2, data = pd) 
    out <- f.robftest(mod, var = 2) # The var = 2 argument refers to the variable you want to examine. In this case, we are looking at the effect of Sex (second variable in the model after the intercept, PC1 would be 3, and PC2 would be 4)
    return(c(mod$coefficients[2], out$p.value))
}) %>% t()
colnames(ewas_result) <- c("coefficient", "pvalue")

head(ewas_result)
            coefficient    pvalue
cg14817997 -0.022940507 0.6351950
cg26928153  0.003617166 0.1451366
cg16269199 -0.037336833 0.4149547
cg13869341 -0.035000389 0.7699616
cg14008030 -0.068738492 0.1366654
cg12045430  0.001736642 0.9681647

Adjusting for ISCH by residualization

library(dplyr)                          # CRAN package
residuals <- apply(yourBetaMatrix, 1, function(x){
    x <- as.numeric(x)
    mod <- lm(x ~ PC1 + PC2, data = yourISCHpcs)    # or replace with yourestimatedISCH
    return(residuals(summary(mod))) # The residual represents the variance not associated with ISCH
}) %>% t()
adj.residuals <- residuals + matrix(apply(yourBetaMatrix, 1, mean), nrow = nrow(residuals), ncol = ncol(residuals))
# Add the residuals back to the mean DNAmDNAme level of each methylation site to obtain an adjusted beta matrix.
adj.residuals[adj.residuals <= 0] <- 0.0001     # Adjust the “beta values” such  that they are bounded by 0 and 1
adj.residuals[adj.residuals > 1] <- 0.9999

adj.residuals[1:5, 1:5]
           GSM3074480_202163530006_R02C01 GSM3074481_202163530006_R03C01
cg14817997                      0.8071082                      0.8507052
cg26928153                      0.9198204                      0.9237327
cg16269199                      0.7915424                      0.8274217
cg13869341                      0.8120545                      0.8310354
cg14008030                      0.5931829                      0.5996638
           GSM3074482_202163530006_R04C01 GSM3074483_202163530006_R06C01
cg14817997                      0.8483628                      0.8542510
cg26928153                      0.9215760                      0.9183272
cg16269199                      0.8262990                      0.8427078
cg13869341                      0.9201039                      0.9389554
cg14008030                      0.5459372                      0.5873481
           GSM3074484_202163530006_R07C01
cg14817997                      0.8501595
cg26928153                      0.9177988
cg16269199                      0.8425308
cg13869341                      0.8951252
cg14008030                      0.6273289

Cell-type specific DNAme associations

We will use the hannum.chr22.RData dataset that is a part of the TCA vignette as an example.

# download.file("https://github.com/cozygene/TCA/raw/master/vignettes/hannum.chr22.RData",
#               "hannum.chr22.RData")
load("hannum.chr22.RData")
yourBetaMatrix <- hannum$X
yourEstimatedISCH <- hannum$W
yourPhenoData <- as.data.frame(hannum$cov)
covariatesassociatedwithISCH <- hannum$cov[,c("gender","age")]
covariatesnotassociatedwithISCH <- hannum$cov[,3:ncol(hannum$cov)]

CellDMC

library(EpiDISH)        # Bioconductor package
mod <- model.matrix(~., data = yourPhenoData[, -2]) # Create a model matrix of relevant covariates that are not the estimated ISCH or the main variable of interest.
system.time(dmc_output <- CellDMC(yourBetaMatrix, 
                                  yourPhenoData$gender, # mainVar
                                  yourEstimatedISCH, 
                                  cov.mod = mod))
   user  system elapsed 
  34.09    0.22   38.45 
  #  user  system elapsed  #5584 CpGs, 656 samples, 6 cell types, 20 covarites
  # 32.36    0.14   34.73 
  #  user  system elapsed  #2750 CpGs, 656 samples, 6 cell types, 20 covarites
  # 15.76    0.19   17.04 
  #  user  system elapsed  #5584 CpGs, 656 samples, 3 cell types, 20 covarites
  # 25.72    0.17   27.29 
dmc_hits <- dmc_output$dmct[rowSums(dmc_output$dmct) != 0, ] # Number of differentially methylation sites in each cell type
head(dmc_output$coe$Gran)                                    # Differential methylation in each cell type
                Estimate          SE           t          p      adjP
cg00004775 -0.0149047410 0.007093100 -2.10130151 0.03601430 0.5027596
cg00012194 -0.0116518568 0.006230175 -1.87022938 0.06191917 0.5585730
cg00013618 -0.0133000781 0.006895574 -1.92878474 0.05421010 0.5449048
cg00014104 -0.0007350321 0.011508978 -0.06386597 0.94909737 0.9886329
cg00014733 -0.0050587269 0.004133598 -1.22380706 0.22148617 0.7466424
cg00021762 -0.0028660806 0.005670997 -0.50539273 0.61346120 0.9171972

CEDAR

library(TOAST)                          # Bioconductor package
system.time(cedar_output <- cedar(Y_raw = yourBetaMatrix,
                                  prop = yourEstimatedISCH,
                                  design.1 = yourPhenoData,
                                  factor.to.test = 'gender',
                                  cutoff.tree = c('pval', 0.01),
                                  cutoff.prior.prob = c('pval', 0.01)))
   user  system elapsed 
   6.30    0.35  249.14 
   # user  system elapsed #5584 CpGs, 656 samples, 6 cell types, 20 covarites
   # 6.39    0.53  244.70 
   # user  system elapsed #2750 CpGs, 656 samples, 6 cell types, 20 covarites
   # 5.23    0.17  116.27 
   # user  system elapsed #5584 CpGs, 656 samples, 3 cell types, 20 covarites
   # 1.39    0.12   42.92 
cedar_csDM <- cedar_output$toast_res
head(cedar_csDM$Gran)           # average csDNAme pattern (e.g. Granulocytes) and significance of their association with the main variable of interest 
                   beta     beta_var            mu effect_size f_statistics
cg00004775 -0.023328311 5.898761e-05  2.379476e-04   2.0416494    9.2258373
cg00012194 -0.011865475 4.869023e-05  2.730965e-03   3.7059087    2.8915351
cg00013618 -0.014048728 5.430235e-05 -2.509204e-04   1.9310212    3.6345897
cg00014104  0.004848781 1.752127e-04  1.391375e-03   1.2707230    0.1341836
cg00014733 -0.005585691 2.173714e-05 -5.556374e-05   1.9609861    1.4353288
cg00021762 -0.003272513 3.875428e-05 -3.721372e-03   0.6108138    0.2763396
               p_value       fdr
cg00004775 0.002503756 0.2367091
cg00012194 0.089632681 0.5700557
cg00013618 0.057131086 0.5184443
cg00014104 0.714279370 0.9425295
cg00014733 0.231432218 0.7321912
cg00021762 0.599330348 0.9079761

TCA

library(TCA)                            # CRAN package

system.time(tca_output <- tca(X = yourBetaMatrix,
                              W = yourEstimatedISCH,
                              C1 = covariatesassociatedwithISCH,
                              C2 = covariatesnotassociatedwithISCH))
INFO [2024-09-09 03:36:16] Starting tca...
INFO [2024-09-09 03:36:16] Validating input...
INFO [2024-09-09 03:36:16] Fitting the TCA model...
INFO [2024-09-09 03:36:16] Fitting means and variances...
INFO [2024-09-09 03:36:16] Iteration 1 out of 10 internal iterations...
INFO [2024-09-09 03:36:29] Iteration 2 out of 10 internal iterations...
INFO [2024-09-09 03:37:25] Internal loop converged.
INFO [2024-09-09 03:37:25] Calculate p-values for deltas and gammas.
INFO [2024-09-09 03:39:33] Finished tca.
   user  system elapsed 
 146.60   13.95  197.09 
   #   user  system elapsed  #5584 CpGs, 656 samples, 6 cell types, 20 covarites
   # 125.93   10.11  141.41 
   #   user  system elapsed  #2750 CpGs, 656 samples, 6 cell types, 20 covarites
   #  60.25    5.04   70.15 
   #   user  system elapsed  #5584 CpGs, 656 samples, 3 cell types, 20 covarites
   # 105.45    8.83  119.55 
tca_csDM_pvals <- tca_output$gammas_hat_pvals
head(tca_csDM_pvals)            # Cell type specific association for variables in C1
           Gran.gender   Gran.age  CD4T.gender   CD4T.age CD8T.gender
cg00004775  0.01552160 0.10275372 0.0195148291 0.02920576   0.5569767
cg00012194  0.07422961 0.65399233 0.1434474604 0.69579298   0.2122048
cg00013618  0.01660231 0.63997707 0.0608209658 0.92497303   0.1859385
cg00014104  0.99785537 0.03982474 0.0001170947 0.06019540   0.1023940
cg00014733  0.25631255 0.48753551 0.3724006480 0.72632986   0.3381321
cg00021762  0.53972884 0.18113787 0.9476537316 0.88073282   0.3553992
             CD8T.age Mono.gender   Mono.age   B.gender      B.age  NK.gender
cg00004775 0.99997695   0.1010739 0.41377483 0.12673818 0.00164447 0.59677977
cg00012194 0.58781425   0.1792314 0.95562662 0.10699493 0.68870392 0.69561928
cg00013618 0.04436382   0.9338547 0.05160316 0.07080298 0.14862531 0.03458348
cg00014104 0.48249905   0.2226695 0.01339042 0.45911802 0.66604635 0.61364472
cg00014733 0.16615101   0.4085354 0.12571285 0.89589376 0.32641612 0.26822161
cg00021762 0.89933647   0.4043427 0.18057989 0.41242861 0.60200087 0.75996115
              NK.age
cg00004775 0.6333288
cg00012194 0.1016116
cg00013618 0.7210515
cg00014104 0.2708199
cg00014733 0.2456464
cg00021762 0.1764015