Epigenetic Correlation Analysis with decorate in R

Objective: Build clusters from highly correlated adjacent epigenetic marks from sequencing data. This applies to studies examining DNA methylation, ChIP-seq histone marks, and ATAC-seq open chromatin regions. The strength of these clusters can then be compared between control and treatment groups. This is an example using ATAC-seq data.

Before beginning in R: If you are working from an HPC cluster at a university, other modules may be necessary to load into your bash environment prior to opening a session in R.

module load udunits proj gdal geos R

Start R session and prepare epigenetic mark counts and metadata

# Load in the necessary packages

library(decorate)
suppressPackagesStartupMessages(library(EnsDb.Hsapiens.v86))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(data.table)) 
suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(qvalue))

# Load in epignetic mark counts and metadata. Here I am loading in open chromatin peaks (note the rownames)

glu.counts <- read.table("path/to/file",header=T,row.names=1)
glu.meta <- read.table("path/to/file",header=T,row.names=1)

head(glu.counts)
#                    GLU.255 GLU.257 
#chr1-761945-763298      76     187     
#chr1-784555-786034      55     118    

head(glu.meta)
#celltype batch   GROUP SEX AGE
#GLU.255      GLU     1  HEROIN   F  32
#GLU.257      GLU     1 CONTROL   M  46
#GLU.260      GLU     1 CONTROL   M  65

# Make sure subjects match between counts and metadata files
glu.counts <- glu.counts[,rownames(glu.meta)]


summary(rownames(glu.meta)==colnames(glu.counts))
# Mode    TRUE   # They match!
#logical      60 


# Filter peaks with low counts -- here using <10 avg. count

glu.counts <- glu.counts[rowMeans(glu.counts)>10,]

str(glu.counts)  # We have 43,781 peaks for 60 subjects
#data.frame':   43781 obs. of  60 variables:
# $ GLU.255     : int  76 55 43 895 368 226 473 137 577 928


# Normalize counts with vst transformation
norm.glu.counts <- vst(as.matrix(glu.counts))

# Get peak regions from rownames of your counts file

head(norm.glu.counts)
#                    GLU.255 GLU.257 
#chr1-761945-763298      76.7     187     
#chr1-784555-786034      55     118  

peaks <- data.frame(rownames(glu.counts))
peaks <- peaks %>% separate(glu.counts,c("chr","start","end"))
Data.Range <- as(peaks[,c(1:3)],"GRanges")
names(Data.Range) <- rownames(glu.counts)

head(Data.Range)
#GRanges object with 6 ranges and 0 metadata columns:
#  seqnames        ranges strand
#<Rle>     <IRanges>  <Rle>
#  chr1-761945-763298     chr1 761945-763298      *
#  chr1-784555-786034     chr1 784555-786034      *


# Convert normalized peak counts to covariate-adjusted residuals

design = model.matrix(~batch+SEX+GROUP, glu.meta) # batch, SEX, and GROUP are my covariate
fit = lmFit(norm.glu.counts, design)
residValues = residuals(fit,norm.glu.counts) # Compute residuals that will have regressed out covariates

Perform correlations, generate and filter for robust clusters

treeList = runOrderedClusteringGenome( residValues, Data.Range,
           adjacentCount=500,method.corr="spearman")  
           # Broad correlations computed, Spearman limts outlier effect
           # adjacentCount sets the max number of features to correlate                  

treeListClusters = createClusters(treeList, method='meanClusterSize', 
                   meanClusterSize=c( 10, 20, 30, 40, 50, 100, 200)) 
                   # Clusters made at different cut-offs for size of clusters 

clstScore = scoreClusters(treeList, treeListClusters, BPPARAM=SerialParam() )
            # LEF scores ascribed to each cluster


# Need to apply filter to eliminate 
# LEF is the fraction of variance explained by the first eigen-value
# If lead eigen value fraction (LEF) > 40% then keep clusters

clustInclude = retainClusters( clstScore, "LEF", 0.1 )
treeListClusters_filter = filterClusters( treeListClusters, clustInclude )

# Collapse similar clusters
treeListClusters_collapse = collapseClusters( treeListClusters_filter, Data.Range )  

#Print amount of total clusters 
n_clusters = countClusters( treeListClusters_collapse )
n_clusters
#6091

Carry out differential correlation analysis on clusters, recover results, and write to file

Here we were contrasting by GROUP – CONTROL vs. HEROIN

diffCor = evalDiffCorr(residValues, glu.meta$GROUP, Data.Range, 
          treeListClusters_collapse, method="Box.permute",
          method.corr = "spearman") 
          # check decorate documentation for alternative statistical methods
                         
df_results = combineResults(diffCor, clstScore, treeListClusters,Data.Range)

write.csv(df_results,file="GLU.ATAC.diffCor.csv")

Visualization of a differentially correlated region

# Get coordinates of top differentially correlated peak from results -- 
#df_results should be sorted by significance (lowest FDR adj. p)

peakIDs = getFeaturesInCluster( treeListClusters_collapse, 
          df_results$chrom[1],df_results$cluster[1], df_results$id[1])
query = range(Data.Range[names(Data.Range) %in% peakIDs])

# Can be helpful to zoom out of region by setting a window extension

window = 100
start(query) = start(query) - window
end(query) = end(query) + window

# For contrast, you make plotDecorate objects for both treatment groups

fig1 = plotDecorate(EnsDb.Hsapiens.v86, treeList, treeListClusters_collapse, Data.Range, query, data=residValues[,glu.meta$GROUP=="CONTROL"])

fig2 = plotDecorate( EnsDb.Hsapiens.v86, treeList, treeListClusters_collapse, Data.Range, query, data=residValues[,glu.meta$GROUP=="HEROIN"])

# Check it out!

plot_grid( fig1, fig2, ncol=2, labels=c('CONTROL', 'HEROIN') )

Epigenetic Correlation Analysis with decorate in R

Gregory Rompala

9/5/2020

Start R session and prepare epigenetic mark counts and metadata

Perform correlations, generate and filter for robust clusters

Carry out differential correlation analysis on clusters, recover results, and write to file

Visualization of a differentially correlated region