Objective: Build clusters from highly correlated adjacent epigenetic marks from sequencing data. This applies to studies examining DNA methylation, ChIP-seq histone marks, and ATAC-seq open chromatin regions. The strength of these clusters can then be compared between control and treatment groups. This is an example using ATAC-seq data.
Before beginning in R: If you are working from an HPC cluster at a university, other modules may be necessary to load into your bash environment prior to opening a session in R.
module load udunits proj gdal geos R
# Load in the necessary packages
library(decorate)
suppressPackagesStartupMessages(library(EnsDb.Hsapiens.v86))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(qvalue))
# Load in epignetic mark counts and metadata. Here I am loading in open chromatin peaks (note the rownames)
glu.counts <- read.table("path/to/file",header=T,row.names=1)
glu.meta <- read.table("path/to/file",header=T,row.names=1)
head(glu.counts)
# GLU.255 GLU.257
#chr1-761945-763298 76 187
#chr1-784555-786034 55 118
head(glu.meta)
#celltype batch GROUP SEX AGE
#GLU.255 GLU 1 HEROIN F 32
#GLU.257 GLU 1 CONTROL M 46
#GLU.260 GLU 1 CONTROL M 65
# Make sure subjects match between counts and metadata files
glu.counts <- glu.counts[,rownames(glu.meta)]
summary(rownames(glu.meta)==colnames(glu.counts))
# Mode TRUE # They match!
#logical 60
# Filter peaks with low counts -- here using <10 avg. count
glu.counts <- glu.counts[rowMeans(glu.counts)>10,]
str(glu.counts) # We have 43,781 peaks for 60 subjects
#data.frame': 43781 obs. of 60 variables:
# $ GLU.255 : int 76 55 43 895 368 226 473 137 577 928
# Normalize counts with vst transformation
norm.glu.counts <- vst(as.matrix(glu.counts))
# Get peak regions from rownames of your counts file
head(norm.glu.counts)
# GLU.255 GLU.257
#chr1-761945-763298 76.7 187
#chr1-784555-786034 55 118
peaks <- data.frame(rownames(glu.counts))
peaks <- peaks %>% separate(glu.counts,c("chr","start","end"))
Data.Range <- as(peaks[,c(1:3)],"GRanges")
names(Data.Range) <- rownames(glu.counts)
head(Data.Range)
#GRanges object with 6 ranges and 0 metadata columns:
# seqnames ranges strand
#<Rle> <IRanges> <Rle>
# chr1-761945-763298 chr1 761945-763298 *
# chr1-784555-786034 chr1 784555-786034 *
# Convert normalized peak counts to covariate-adjusted residuals
design = model.matrix(~batch+SEX+GROUP, glu.meta) # batch, SEX, and GROUP are my covariate
fit = lmFit(norm.glu.counts, design)
residValues = residuals(fit,norm.glu.counts) # Compute residuals that will have regressed out covariates
treeList = runOrderedClusteringGenome( residValues, Data.Range,
adjacentCount=500,method.corr="spearman")
# Broad correlations computed, Spearman limts outlier effect
# adjacentCount sets the max number of features to correlate
treeListClusters = createClusters(treeList, method='meanClusterSize',
meanClusterSize=c( 10, 20, 30, 40, 50, 100, 200))
# Clusters made at different cut-offs for size of clusters
clstScore = scoreClusters(treeList, treeListClusters, BPPARAM=SerialParam() )
# LEF scores ascribed to each cluster
# Need to apply filter to eliminate
# LEF is the fraction of variance explained by the first eigen-value
# If lead eigen value fraction (LEF) > 40% then keep clusters
clustInclude = retainClusters( clstScore, "LEF", 0.1 )
treeListClusters_filter = filterClusters( treeListClusters, clustInclude )
# Collapse similar clusters
treeListClusters_collapse = collapseClusters( treeListClusters_filter, Data.Range )
#Print amount of total clusters
n_clusters = countClusters( treeListClusters_collapse )
n_clusters
#6091
Here we were contrasting by GROUP – CONTROL vs. HEROIN
diffCor = evalDiffCorr(residValues, glu.meta$GROUP, Data.Range,
treeListClusters_collapse, method="Box.permute",
method.corr = "spearman")
# check decorate documentation for alternative statistical methods
df_results = combineResults(diffCor, clstScore, treeListClusters,Data.Range)
write.csv(df_results,file="GLU.ATAC.diffCor.csv")