library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
d.14sGFPpos = select(read.table("14sGFPpos_Rep1_mCRF.LMRs",header=FALSE),4:6)
d.14sGFPneg = select(read.table("14sGFPneg_Rep1_mCRF.LMRs",header=FALSE),4:6)
d.24hGFPpos = select(read.table("24hGFPpos_Rep2_mCRF.LMRs",header=FALSE),4:6)
d.24hGFPneg = select(read.table("24hGFPneg_Rep2_mCRF.LMRs",header=FALSE),4:6)
# Check missing values
sum(is.na(d.14sGFPpos))
## [1] 0
sum(is.na(d.14sGFPneg))
## [1] 0
sum(is.na(d.24hGFPpos))
## [1] 0
sum(is.na(d.24hGFPneg))
## [1] 0
colnames(d.14sGFPpos) <- c("avg_methylation","CpG_count","LMR_length")
colnames(d.14sGFPneg) <- c("avg_methylation","CpG_count","LMR_length")
colnames(d.24hGFPpos) <- c("avg_methylation","CpG_count","LMR_length")
colnames(d.24hGFPneg) <- c("avg_methylation","CpG_count","LMR_length")
# Summarize
lapply(d.14sGFPpos,summary)
## $avg_methylation
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02000 0.09583 0.10000 0.11280 0.10000 0.30000
##
## $CpG_count
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 5.00 11.00 17.46 21.00 2676.00
##
## $LMR_length
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 81.0 215.0 410.6 483.0 64820.0
lapply(d.14sGFPneg,summary)
## $avg_methylation
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02000 0.09608 0.10000 0.11270 0.10000 0.30000
##
## $CpG_count
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 5.00 10.00 17.01 20.00 2676.00
##
## $LMR_length
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 76 207 404 471 64170
lapply(d.24hGFPpos,summary)
## $avg_methylation
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02000 0.09471 0.10000 0.11290 0.10000 0.30000
##
## $CpG_count
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 5.00 11.00 17.47 21.00 2676.00
##
## $LMR_length
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 82.0 220.0 415.8 494.0 55560.0
lapply(d.24hGFPneg,summary)
## $avg_methylation
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0200 0.0952 0.1000 0.1121 0.1000 0.3000
##
## $CpG_count
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 5.00 11.00 17.39 21.00 2676.00
##
## $LMR_length
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 82.0 217.0 414.9 490.0 64060.0
lmr_num <- c(
length(d.14sGFPpos[,1]),
length(d.14sGFPneg[,1]),
length(d.24hGFPpos[,1]),
length(d.24hGFPneg[,1])
)
barplot(lmr_num, names.arg = c("14sGFPpos","14sGFPneg","24hGFPpos","24hGFPneg"),
ylim = c(0,150000), main="Number of LMRs for each sample")
# Combine
d.14sGFPpos$name = "14sGFPpos"
d.14sGFPneg$name = "14sGFPneg"
d.24hGFPpos$name = "24hGFPpos"
d.24hGFPneg$name = "24hGFPneg"
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.1.2
d <- rbind(d.14sGFPpos,d.14sGFPneg,d.24hGFPpos,d.24hGFPneg)
dm <- melt(d)
## Using name as id variables
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
ggplot(filter(dm,dm$variable=="avg_methylation"), aes(value, color=name)) +
geom_density() +
ggtitle(label = "Average methylation of LMRs") + xlab("samples")
Why are there so many average values ~0.10??
ggplot(filter(dm,dm$variable=="CpG_count"), aes(name, value, color=name)) +
geom_boxplot() + scale_y_log10() +
ggtitle(label = "CpG counts per LMR") + xlab("samples") + ylab("CpG counts / LMR (log10 scale)")
ggplot(filter(dm,dm$variable=="LMR_length"), aes(name, value, color=name)) +
geom_boxplot() + scale_y_log10() +
ggtitle(label = "LMR lengths") + xlab("samples") + ylab("lengths (bp, log10 scale)")
#ggplot(filter(dm,dm$variable=="LMR_length" & dm$name=="14sGFPpos"), aes(value)) + geom_histogram(binwidth=0.05) + scale_x_log10()
# Histogram grid
ggplot(filter(dm,dm$variable=="LMR_length"), aes(value, color=name)) + geom_histogram(binwidth=0.05) +
scale_x_log10() + facet_grid(.~name) +
xlab("LMR length (log10 scale)")
In general features of LMRs look very similar across all samples, and there are approximately the same number of LMRs across samples.
What is the peak of LMR lengths?
statmod <- function(x) {
z <- table(as.vector(x))
names(z)[z == max(z)]
}
statmod(filter(dm,dm$name=="14sGFPpos" & dm$variable=="LMR_length")[,3])
## [1] "2"
statmod(filter(dm,dm$name=="14sGFPneg" & dm$variable=="LMR_length")[,3])
## [1] "2"
statmod(filter(dm,dm$name=="24hGFPpos" & dm$variable=="LMR_length")[,3])
## [1] "2"
statmod(filter(dm,dm$name=="24hGFPneg" & dm$variable=="LMR_length")[,3])
## [1] "2"
All have mode of 2. So there must be many single CpGs that are <= 0.30 methylation by mCRF.
a <- (filter(d,d$name=="24hGFPneg" & d$LMR_length==2))
statmod(a$CpG_count)
## [1] "1"
The mode of CpGs for the subset of LMRs with a length of 2 is 1.