DMR cluster

library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(gbdmr)

Function

Load the functiotions in the R package gbdmr

source("src_samebeta_ver2.R")
## Loading required package: miscTools
## 
## Attaching package: 'miscTools'
## The following object is masked from 'package:Biobase':
## 
##     rowMedians
## The following objects are masked from 'package:MatrixGenerics':
## 
##     colMedians, rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     colMedians, rowMedians
## 
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
## 
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/
# load your data and the data is a matrix where columns are samples and rows correspond to the sites
load('beta.Rdata')
beta <- as.matrix(beta)  
cor.threshold=0.5 # the threshold to define the adjacent correlation 

get the cluster information

#construct an annotation data frame
data(list="IlluminaHumanMethylation450kanno.ilmn12.hg19")
data(Locations)
data(Other)

annotation <- cbind(as.data.frame(Locations), as.data.frame(Other))

# add the annotation to the stats object 
common <- intersect(rownames(beta), rownames(annotation))
length(common)
## [1] 1000
dim(beta)
## [1] 1000  296
annotation <- annotation[match(common, rownames(annotation)),]
beta <- beta[match(common, rownames(beta)),]
t.beta <- t(beta)

chr=annotation$chr
pos=annotation$pos
Indexes <- split(1:length(chr),chr)
clusterIDs=rep(NA,length(chr))
LAST=0
for (i in seq(along=Indexes)){
  Index <- Indexes[[i]]
  temp <- pos[Index]
  Index <- Index[order(temp)]
  x=colCors(t.beta[,Index[-length(Index)]],t.beta[,Index[-1]])
  y <- as.numeric(x <= cor.threshold)
  z <- cumsum(c(1, y))
  clusterIDs[Index] <- z + LAST
  LAST <- max(z) + LAST
}
#clusterIDs # which cluster the cpg belongs
sum(table(clusterIDs))
## [1] 1000
table(table(clusterIDs))
## 
##   1   2   3   4   5   6   7  12 
## 743  54  26   9   2   1   1   1
# Data
blocksize <- c(1, 2, 3, 4, 5, 6, 7, 12)
counts <- c(743, 54, 26, 9, 2, 1, 1, 1)

# Create a data frame
df <- data.frame(value = rep(blocksize, counts))

# Plot histogram
hist(df$value, breaks = seq(0.5, max(df$value) + 0.5, by = 1), col = "skyblue", main = "Histogram Plot", xlab = "Size", ylab = "Frequency")