Objective: Build clusters from highly correlated adjacent epigenetic marks from sequencing data. This applies to studies examining DNA methylation, ChIP-seq histone marks, and ATAC-seq open chromatin regions. The strength of these clusters can then be compared between control and treatment groups. This is an example using ATAC-seq data.

Before beginning in R: If you are working from an HPC cluster at a university, other modules may be necessary to load into your bash environment prior to opening a session in R.

module load udunits proj gdal geos R

Start R session and prepare epigenetic mark counts and metadata

# Load in the necessary packages

library(decorate)
suppressPackagesStartupMessages(library(EnsDb.Hsapiens.v86))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(data.table)) 
suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(qvalue))
# Load in epignetic mark counts and metadata. Here I am loading in open chromatin peaks (note the rownames)

glu.counts <- read.table("path/to/file",header=T,row.names=1)
glu.meta <- read.table("path/to/file",header=T,row.names=1)

head(glu.counts)
#                    GLU.255 GLU.257 
#chr1-761945-763298      76     187     
#chr1-784555-786034      55     118    

head(glu.meta)
#celltype batch   GROUP SEX AGE
#GLU.255      GLU     1  HEROIN   F  32
#GLU.257      GLU     1 CONTROL   M  46
#GLU.260      GLU     1 CONTROL   M  65

# Make sure subjects match between counts and metadata files
glu.counts <- glu.counts[,rownames(glu.meta)]


summary(rownames(glu.meta)==colnames(glu.counts))
# Mode    TRUE   # They match!
#logical      60 


# Filter peaks with low counts -- here using <10 avg. count

glu.counts <- glu.counts[rowMeans(glu.counts)>10,]

str(glu.counts)  # We have 43,781 peaks for 60 subjects
#data.frame':   43781 obs. of  60 variables:
# $ GLU.255     : int  76 55 43 895 368 226 473 137 577 928


# Normalize counts with vst transformation
norm.glu.counts <- vst(as.matrix(glu.counts))

# Get peak regions from rownames of your counts file

head(norm.glu.counts)
#                    GLU.255 GLU.257 
#chr1-761945-763298      76.7     187     
#chr1-784555-786034      55     118  

peaks <- data.frame(rownames(glu.counts))
peaks <- peaks %>% separate(glu.counts,c("chr","start","end"))
Data.Range <- as(peaks[,c(1:3)],"GRanges")
names(Data.Range) <- rownames(glu.counts)

head(Data.Range)
#GRanges object with 6 ranges and 0 metadata columns:
#  seqnames        ranges strand
#<Rle>     <IRanges>  <Rle>
#  chr1-761945-763298     chr1 761945-763298      *
#  chr1-784555-786034     chr1 784555-786034      *


# Convert normalized peak counts to covariate-adjusted residuals

design = model.matrix(~batch+SEX+GROUP, glu.meta) # batch, SEX, and GROUP are my covariate
fit = lmFit(norm.glu.counts, design)
residValues = residuals(fit,norm.glu.counts) # Compute residuals that will have regressed out covariates

Perform correlations, generate and filter for robust clusters

treeList = runOrderedClusteringGenome( residValues, Data.Range,
           adjacentCount=500,method.corr="spearman")  
           # Broad correlations computed, Spearman limts outlier effect
           # adjacentCount sets the max number of features to correlate                  

treeListClusters = createClusters(treeList, method='meanClusterSize', 
                   meanClusterSize=c( 10, 20, 30, 40, 50, 100, 200)) 
                   # Clusters made at different cut-offs for size of clusters 

clstScore = scoreClusters(treeList, treeListClusters, BPPARAM=SerialParam() )
            # LEF scores ascribed to each cluster


# Need to apply filter to eliminate 
# LEF is the fraction of variance explained by the first eigen-value
# If lead eigen value fraction (LEF) > 40% then keep clusters

clustInclude = retainClusters( clstScore, "LEF", 0.1 )
treeListClusters_filter = filterClusters( treeListClusters, clustInclude )

# Collapse similar clusters
treeListClusters_collapse = collapseClusters( treeListClusters_filter, Data.Range )  

#Print amount of total clusters 
n_clusters = countClusters( treeListClusters_collapse )
n_clusters
#6091

Carry out differential correlation analysis on clusters, recover results, and write to file

Here we were contrasting by GROUP – CONTROL vs. HEROIN

diffCor = evalDiffCorr(residValues, glu.meta$GROUP, Data.Range, 
          treeListClusters_collapse, method="Box.permute",
          method.corr = "spearman") 
          # check decorate documentation for alternative statistical methods
                         
df_results = combineResults(diffCor, clstScore, treeListClusters,Data.Range)

write.csv(df_results,file="GLU.ATAC.diffCor.csv")

Visualization of a differentially correlated region

# Get coordinates of top differentially correlated peak from results -- 
#df_results should be sorted by significance (lowest FDR adj. p)

peakIDs = getFeaturesInCluster( treeListClusters_collapse, 
          df_results$chrom[1],df_results$cluster[1], df_results$id[1])
query = range(Data.Range[names(Data.Range) %in% peakIDs])

# Can be helpful to zoom out of region by setting a window extension

window = 100
start(query) = start(query) - window
end(query) = end(query) + window

# For contrast, you make plotDecorate objects for both treatment groups

fig1 = plotDecorate(EnsDb.Hsapiens.v86, treeList, treeListClusters_collapse, Data.Range, query, data=residValues[,glu.meta$GROUP=="CONTROL"])

fig2 = plotDecorate( EnsDb.Hsapiens.v86, treeList, treeListClusters_collapse, Data.Range, query, data=residValues[,glu.meta$GROUP=="HEROIN"])

# Check it out!

plot_grid( fig1, fig2, ncol=2, labels=c('CONTROL', 'HEROIN') )