Read in data

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
d.14sGFPpos = select(read.table("14sGFPpos_Rep1_mCRF.LMRs",header=FALSE),4:6)
d.14sGFPneg = select(read.table("14sGFPneg_Rep1_mCRF.LMRs",header=FALSE),4:6)
d.24hGFPpos = select(read.table("24hGFPpos_Rep2_mCRF.LMRs",header=FALSE),4:6)
d.24hGFPneg = select(read.table("24hGFPneg_Rep2_mCRF.LMRs",header=FALSE),4:6)

Clean, summarize, and combine data.

# Check missing values
sum(is.na(d.14sGFPpos))
## [1] 0
sum(is.na(d.14sGFPneg))
## [1] 0
sum(is.na(d.24hGFPpos))
## [1] 0
sum(is.na(d.24hGFPneg))
## [1] 0
colnames(d.14sGFPpos) <- c("avg_methylation","CpG_count","LMR_length")
colnames(d.14sGFPneg) <- c("avg_methylation","CpG_count","LMR_length")
colnames(d.24hGFPpos) <- c("avg_methylation","CpG_count","LMR_length")
colnames(d.24hGFPneg) <- c("avg_methylation","CpG_count","LMR_length")

# Summarize
lapply(d.14sGFPpos,summary)
## $avg_methylation
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02000 0.09583 0.10000 0.11280 0.10000 0.30000 
## 
## $CpG_count
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    5.00   11.00   17.46   21.00 2676.00 
## 
## $LMR_length
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0    81.0   215.0   410.6   483.0 64820.0
lapply(d.14sGFPneg,summary)
## $avg_methylation
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02000 0.09608 0.10000 0.11270 0.10000 0.30000 
## 
## $CpG_count
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    5.00   10.00   17.01   20.00 2676.00 
## 
## $LMR_length
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2      76     207     404     471   64170
lapply(d.24hGFPpos,summary)
## $avg_methylation
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02000 0.09471 0.10000 0.11290 0.10000 0.30000 
## 
## $CpG_count
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    5.00   11.00   17.47   21.00 2676.00 
## 
## $LMR_length
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0    82.0   220.0   415.8   494.0 55560.0
lapply(d.24hGFPneg,summary)
## $avg_methylation
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0200  0.0952  0.1000  0.1121  0.1000  0.3000 
## 
## $CpG_count
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    5.00   11.00   17.39   21.00 2676.00 
## 
## $LMR_length
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0    82.0   217.0   414.9   490.0 64060.0
lmr_num <- c(
    length(d.14sGFPpos[,1]),
    length(d.14sGFPneg[,1]),
    length(d.24hGFPpos[,1]),
    length(d.24hGFPneg[,1])
)

barplot(lmr_num, names.arg = c("14sGFPpos","14sGFPneg","24hGFPpos","24hGFPneg"),
        ylim = c(0,150000), main="Number of LMRs for each sample") 

# Combine
d.14sGFPpos$name = "14sGFPpos"
d.14sGFPneg$name = "14sGFPneg"
d.24hGFPpos$name = "24hGFPpos"
d.24hGFPneg$name = "24hGFPneg"

library(reshape2)
## Warning: package 'reshape2' was built under R version 3.1.2
d <- rbind(d.14sGFPpos,d.14sGFPneg,d.24hGFPpos,d.24hGFPneg)
dm <- melt(d)
## Using name as id variables

Plot data

Density plot of average methylation values for each sample.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
ggplot(filter(dm,dm$variable=="avg_methylation"), aes(value, color=name)) + 
    geom_density() + 
    ggtitle(label = "Average methylation of LMRs") + xlab("samples") 

Why are there so many average values ~0.10??

Distributions of CpG counts in LMRs

ggplot(filter(dm,dm$variable=="CpG_count"), aes(name, value, color=name)) + 
    geom_boxplot() + scale_y_log10() +
    ggtitle(label = "CpG counts per LMR") + xlab("samples") + ylab("CpG counts / LMR (log10 scale)")

Distribution of LMR lengths

ggplot(filter(dm,dm$variable=="LMR_length"), aes(name, value, color=name)) + 
    geom_boxplot() + scale_y_log10() +
    ggtitle(label = "LMR lengths") + xlab("samples") + ylab("lengths (bp, log10 scale)")

#ggplot(filter(dm,dm$variable=="LMR_length" & dm$name=="14sGFPpos"), aes(value)) + geom_histogram(binwidth=0.05) + scale_x_log10()

# Histogram grid
ggplot(filter(dm,dm$variable=="LMR_length"), aes(value, color=name)) + geom_histogram(binwidth=0.05) +
    scale_x_log10() + facet_grid(.~name) +
    xlab("LMR length (log10 scale)")

In general features of LMRs look very similar across all samples, and there are approximately the same number of LMRs across samples.

What is the peak of LMR lengths?

statmod <- function(x) {
   z <- table(as.vector(x))
   names(z)[z == max(z)]
}

statmod(filter(dm,dm$name=="14sGFPpos" & dm$variable=="LMR_length")[,3])
## [1] "2"
statmod(filter(dm,dm$name=="14sGFPneg" & dm$variable=="LMR_length")[,3])
## [1] "2"
statmod(filter(dm,dm$name=="24hGFPpos" & dm$variable=="LMR_length")[,3])
## [1] "2"
statmod(filter(dm,dm$name=="24hGFPneg" & dm$variable=="LMR_length")[,3])
## [1] "2"

All have mode of 2. So there must be many single CpGs that are <= 0.30 methylation by mCRF.

a <- (filter(d,d$name=="24hGFPneg" & d$LMR_length==2))
statmod(a$CpG_count)
## [1] "1"

The mode of CpGs for the subset of LMRs with a length of 2 is 1.