Exploration of Differential Methylation Analysis Results from Linear Mixed-Effects

library(plyr)
library(reshape2)
library(latticeExtra)
library(VennDiagram)
library(xtable)

Load top-tables

From linear-mixed effect fitting of islands with random effect on probe. Two tables of group effects (~27k):

ML estimation from package nlme
REML estimation from package lme4

### top-tables from MLE
topML.all <- read.table("lme.t_ml_all.tab")
topML.plp <- read.table("lme.t_ml_plp.tab")
topML.apl <- read.table("lme.t_ml_apl.tab")

### top-tables from REML (lmer, no p-values)
topRE.all <- read.table("lme.t_reml_all.tab")
topRE.apl <- read.table("lme.t_reml_apl.tab")

Compare MLE and REML

APL vs. control (smallest q-values) is used for comparison

Merge top-tables

topBoth.apl <- merge(topML.apl, topRE.apl, by = "cgi")
head(topBoth.apl)

##                        cgi  Value t.value   p.value     q.val  Estim
## 1   chr1:10003165-10003585 1.5926  14.475 8.359e-36 8.120e-35 1.5926
## 2     chr1:1002663-1005318 0.4576   4.748 3.043e-06 4.281e-06 0.4576
## 3 chr1:100315420-100316009 1.3369  12.677 1.092e-30 7.203e-30 1.3369
## 4 chr1:100435297-100436070 0.9277  11.920 5.014e-28 2.718e-27 0.9277
## 5 chr1:100503482-100504404 1.9240  14.664 4.967e-37 5.294e-36 1.9240
## 6   chr1:10057121-10058108 0.5405   5.906 9.535e-09 1.524e-08 0.5405
##        t
## 1 14.499
## 2  4.754
## 3 12.693
## 4 11.934
## 5 14.687
## 6  5.915

Estim and t are from REML only

Difference between estimates (density plot)

with(topBoth.apl, plot(density(Value - Estim), xlab = "MLE - REML estimates", 
    main = "Density of diff (Value-Estim)"))

plot of chunk EstimDensity

Difference in t-statistics (density plot)

with(topBoth.apl, plot(density(t.value - t), main = "Density of diff (t.value = t)", 
    xlab = "MLE-REML t-statistics"))

plot of chunk TdiffDensity

Just as warned, the t-statistics are slightly biased from the MLE side, but not by much in general.

Relationship between t-difference and q-value:

par(mfrow = c(2, 1))
with(topBoth.apl, plot(x = t.value - t, y = q.val, main = "t diff vs. q", col = rgb(0, 
    0, 1, 0.1), pch = 16))
with(topBoth.apl, plot(x = t.value - t, y = q.val, main = "t diff vs. q", col = rgb(0, 
    0, 1, 0.1), pch = 16, log = "y"))

plot of chunk TdiffVq

The bias of t-statistics is only big with “large” with enormous values of \( t \)

Correlation of t-statistics:

with(topBoth.apl, plot(t.value, t, xlab = "t (MLE)", ylab = "t(REML)"))
t.cor <- with(topBoth.apl, cor(t.value, t, method = "spearman"))
t.maxdiff <- with(topBoth.apl, max(abs(t.value - t)))  ## 0.223889
text(c(-20, 20), c(20, -20), paste(c("Spearman cor:", "Max diff:"), c(format(c(t.cor, 
    t.maxdiff), digits = 5))))
abline(0, 1)

plot of chunk tcor

Compare PLP and APL

Merge into on data frame

topPL <- cbind(topML.plp, topML.apl[-1])
names(topPL)[2:5] <- jPaste(names(topPL)[2:5], "PLP")
names(topPL)[6:9] <- jPaste(names(topPL)[6:9], "APL")

Overlap of “significant” hits

…with approximately equal-sized sets:

venn.list <- with(topPL, list(APL = which(q.valAPL < 1e-25), PLP = which(q.valPLP < 
    1e-18)))
plot.new()
grid.draw(venn.diagram(venn.list, filename = NULL))

plot of chunk venn1

… With the same q-cutoff:

plot.new()
grid.draw(venn.diagram(lapply(topPL[c("q.valPLP", "q.valAPL")], function(x) which(x < 
    1e-25)), filename = NULL))

plot of chunk venn2

Let's go with just PLP then!

Strip plot of top hits

Get number of probes per island

load('CPGI2Probe_MList.Rdata')
cpgi.lth <- cpgi.probes.M[[1]]

## Number Of Probes in Island (NOPI)
nopi <- with(cpgi.lth, tapply(ALL_956761, cpgi, length))

### What's the distribution of NOPI?
tab <- table(nopi)
barplot(tab, xlab = '# CpG sites in Island', ylab = 'Frequency (islands)')

plot of chunk stripPrep

Pick the best islands with 5 CpG probes

### Merge results from PLP and ALL analyses
topBoth <- cbind(topML.plp, topML.all[-1])
names(topBoth)[2:5] <- jPaste(names(topBoth)[2:5], "PLP")
names(topBoth)[6:9] <- jPaste(names(topBoth)[6:9], "ALL")
topBoth$nopi <- nopi[as.character(topBoth$cgi)]

### Now choose those islands with 5 probes
chooseExample <- subset(topBoth, nopi == 5)

### And choose interesting islands WARNING: not reproducible -- magic
### numbers live here: (these cutoffs were finagled to get the most
### exemplary gene per group)
whichIx <- with(chooseExample, c(which.min(q.valPLP * q.valALL), which(q.valPLP > 
    0.05 & q.valALL < 1e-15), which(q.valPLP < 1e-24 & q.valALL > 0.05), which.max(q.valPLP * 
    q.valALL)))

cgi.examples <- as.character(chooseExample[whichIx, "cgi"])

## good.. now take subset of M values for these islands and convert to
## tall:
cpgi.probes.M.dat <- with(cpgi.probes.M, cbind(ALL[-ncol(ALL)], APL[-ncol(APL)], 
    CTRL))
data <- subset(cpgi.probes.M.dat, cpgi %in% cgi.examples, select = c(grepl("HBC|ALL|PLP|cpgi", 
    names(cpgi.probes.M.dat))))
data$probe <- rownames(data)
tall <- melt(data, id.vars = c("cpgi", "probe"), value.name = "M", variable.name = "Sample")
tall$Group <- substr(tall$Sample, 1, 3)
tall <- droplevels(tall)

tall$probe <- factor(tall$probe)
tall$Group <- factor(tall$Group)

And the plots:

par(mfrow = c(2, 2))
for (i in 1:length(levels(tall$cpgi))) {
    CPGI <- levels(tall$cpgi)[i]

    tall.sub <- subset(tall, cpgi == CPGI)
    tall.sub <- droplevels(tall.sub)
    title <- paste(CPGI, paste(c("q.PLP", "q.ALL"), "=", format(subset(chooseExample, 
        cgi == CPGI)[1, c("q.valPLP", "q.valALL")], digits = 4), collapse = ", "))
    print(stripplot(M ~ Group | probe, tall.sub, col = rgb(0, 0, 1, 0.6), pch = 16, 
        type = c("p", "a"), jitter = TRUE, na.rm = T, layout = c(5, 1), main = title))
}

plot of chunk stripPlot_4genes