In this seminar, we are going to perform differential methylation analysis (run on the Illumina Infinium HumanMethylation 450K array) for Acute lymphoblastic leukemia (ALL) cases and healthy B cells as control group. The datasets we are going to use are: GSE39141 which contains 29 ALL bone marrow samples and 4 healthy B cell samples, and to make up for the small sample size for healthy cells, we will also use GSE42865 which contains 9 healthy B cell samples and 7 samples with other conditions.
First, let's retrieve our datasets from GEO with getGEO from GEOquery package.
if (file.exists("methyl_ALL.Rdata")) {
# if previously downloaded
load("methyl_ALL.Rdata")
} else {
# if downloading for the first time
GSE39141 <- getGEO("GSE39141")
show(GSE39141) ## 33 samples (29 ALL and 4 healthy B cells)
GSE42865 <- getGEO("GSE42865") # took ~2 mins for JB
show(GSE42865) ## 16 samples (9 healthy cells B cells and 7 other cells)
# Extract expression matrices (turn into data frames at once)
ALL.dat <- as.data.frame(exprs(GSE39141[[1]]))
CTRL.dat <- as.data.frame(exprs(GSE42865[[1]]))
# Obtain the meta-data for the samples and rename them perhaps?
ALL.meta <- pData(phenoData(GSE39141[[1]]))
CTRL.meta <- pData(phenoData(GSE42865[[1]]))
# create some labels
ALL.meta$Group <- c(rep("ALL", 29), rep("HBC", 4))
## ALL: Case; HBC: Healthy B Cells
# Subset both meta-data and data for control (healthy) donors
CTRL.meta <- droplevels(subset(CTRL.meta, grepl("Healthy donor", characteristics_ch1.1)))
CTRL.dat <- subset(CTRL.dat, select = as.character(CTRL.meta$geo_accession))
# Rename variables
names(ALL.dat) <- paste(ALL.meta$Group, gsub("GSM", "", names(ALL.dat)),
sep = "_")
names(CTRL.dat) <- paste("HBC", gsub("GSM", "", names(CTRL.dat)), sep = "_")
# save the data to avoid future re-downloading
save(ALL.dat, CTRL.dat, ALL.meta, CTRL.meta, file = "methyl_ALL.Rdata")
}
Let's plot the average beta distribution of the ALL samples and the CTRL samples separately.
library(ggplot2)
avg.betas <- c(rowMeans(ALL.dat, na.rm = T), rowMeans(CTRL.dat, na.rm = T))
plotDat <- data.frame(Beta = avg.betas, Dataset = rep(c("ALL", "CTRL"), each = nrow(ALL.dat)))
(probeAvg <- ggplot(data = plotDat, aes(x = Beta, col = Dataset)) + geom_density() +
ggtitle("Average Beta distribution of ALL vs CTRL samples") + xlab("Beta values") +
ylab("Density") + theme_bw())
Since the distribution of average beta values between the two experiments are not the same, we will need to normalize the data.
# combine data from two experiments into one matrix, each column represents
# beta values of one sample
beta.matrix <- as.matrix(cbind(ALL.dat, CTRL.dat))
str(beta.matrix, max.level = 0)
## num [1:485577, 1:42] 0.512 0.911 0.857 0.149 0.729 ...
## - attr(*, "dimnames")=List of 2
system.time(beta.norm <- betaqn(beta.matrix))
## user system elapsed
## 59.01 13.47 109.89
Let's compare the beta value distributions of the two experiments before and after normalization.
dat.probeMeans <- c(rowMeans(beta.norm[, 1:ncol(ALL.dat)], na.rm = TRUE), rowMeans(beta.norm[,
ncol(ALL.dat):ncol(beta.norm)], na.rm = TRUE))
plotNorm <- rbind(data.frame(plotDat, Norm = "Before"), data.frame(Beta = dat.probeMeans,
Dataset = rep(c("ALL", "CTRL"), each = nrow(ALL.dat)), Norm = "After"))
plotNorm$Norm <- factor(plotNorm$Norm, levels = c("Before", "After"))
(probeAvgNorm <- ggplot(data = plotNorm, aes(x = Beta, col = Dataset)) + geom_density() +
facet_grid(Norm ~ .) + ggtitle("Density of Beta values before and after normalization") +
xlab("Beta") + ylab("Density") + theme_bw())
We can see above that the beta value distribution is predominantly bimodal on the interval [0,1]. This type of distribution is not very compatible with the typical assumptions of linear models. To circumvent this problem, we will compute M-values by applying a logit transformation on the data to convert it to a continuous variable that spans (−∞,∞).
M.norm <- beta2m(beta.norm)
We will aggregate our data to focus on probes that reside in CpG islands (CGIs) and then detect differentially methylated CGIs.
# Extracting probe ID to CpG islands association
cginame <- as.data.frame(IlluminaHumanMethylation450kCPGINAME)
names(cginame) <- c("Probe_ID", "cginame")
rownames(cginame) <- cginame$Probe_ID
length(levels(factor(cginame$cginame))) # No. of CGIs
## [1] 27176
# restrict probes to those within CGIs
beta.inCGI <- beta.norm[cginame$Probe_ID, ]
M.inCGI <- M.norm[cginame$Probe_ID, ]
nrow(M.inCGI) # No. of probes within CGIs
## [1] 309465
# aggregate probes to CGIs
beta.CGI <- aggregate(beta.inCGI, by = list(cginame$cginame), mean, na.rm = T)
rownames(beta.CGI) <- beta.CGI[, "Group.1"]
beta.CGI <- subset(beta.CGI, select = -Group.1)
str(beta.CGI, max.level = 0)
## 'data.frame': 27176 obs. of 42 variables:
M.CGI <- aggregate(M.inCGI, by = list(cginame$cginame), mean, na.rm = T)
rownames(M.CGI) <- M.CGI[, "Group.1"]
M.CGI <- subset(M.CGI, select = -Group.1)
str(M.CGI, max.level = 0)
## 'data.frame': 27176 obs. of 42 variables:
Checking the relative distribution of CGI M-values:
# check the distribution of CGI M values with boxplot
library(reshape2)
M.CGI.tall <- melt(t(M.CGI), value.name = "M", varnames = c("Sample", "CGI"))
M.CGI.tall$Group <- substring(M.CGI.tall$Sample, 1, 3)
(M.boxplot <- ggplot(data = M.CGI.tall, aes(Sample, M, colour = Group)) + geom_boxplot() +
ggtitle("Distribution of CGI M values") + xlab("Samples") + ylab("M values") +
theme_bw() + scale_x_discrete(labels = NULL))
We willuse a linear model to identify differentially methylated CGIs with limma.
library(limma)
design <- data.frame(Group = relevel(factor(gsub("_[0-9]+", "", colnames(M.CGI))),
ref = "HBC"), row.names = colnames(M.CGI))
str(design)
## 'data.frame': 42 obs. of 1 variable:
## $ Group: Factor w/ 2 levels "HBC","ALL": 2 2 2 2 2 2 2 2 2 2 ...
(DesMat <- model.matrix(~Group, design))
## (Intercept) GroupALL
## ALL_956761 1 1
## ALL_956762 1 1
## ALL_956763 1 1
## ALL_956764 1 1
## ALL_956765 1 1
## ALL_956766 1 1
## ALL_956767 1 1
## ALL_956768 1 1
## ALL_956769 1 1
## ALL_956770 1 1
## ALL_956771 1 1
## ALL_956772 1 1
## ALL_956773 1 1
## ALL_956774 1 1
## ALL_956775 1 1
## ALL_956776 1 1
## ALL_956777 1 1
## ALL_956778 1 1
## ALL_956779 1 1
## ALL_956780 1 1
## ALL_956781 1 1
## ALL_956782 1 1
## ALL_956783 1 1
## ALL_956784 1 1
## ALL_956785 1 1
## ALL_956786 1 1
## ALL_956787 1 1
## ALL_956788 1 1
## ALL_956789 1 1
## HBC_956790 1 0
## HBC_956791 1 0
## HBC_956792 1 0
## HBC_956793 1 0
## HBC_1052420 1 0
## HBC_1052421 1 0
## HBC_1052422 1 0
## HBC_1052423 1 0
## HBC_1052424 1 0
## HBC_1052425 1 0
## HBC_1052426 1 0
## HBC_1052427 1 0
## HBC_1052428 1 0
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$Group
## [1] "contr.treatment"
DMRfit <- lmFit(M.CGI, DesMat)
DMRfitEb <- eBayes(DMRfit)
cutoff <- 0.01
DMR <- topTable(DMRfitEb, coef = "GroupALL", number = Inf, p.value = cutoff)
head(DMR) # top hits
## logFC AveExpr t P.Value adj.P.Val B
## chr19:49828412-49828668 1.3084 2.283 12.05 2.798e-15 7.604e-11 24.31
## chr4:156680095-156681386 -1.1115 -2.521 -10.95 6.245e-14 8.485e-10 21.39
## chr20:11871374-11872207 -1.2639 -2.284 -10.49 2.370e-13 1.803e-09 20.13
## chr19:33726654-33726946 0.9429 1.887 10.39 3.172e-13 1.803e-09 19.85
## chr18:77166704-77167043 0.8104 3.199 10.36 3.429e-13 1.803e-09 19.78
## chr18:46447718-46448083 -0.8990 2.034 -10.31 3.980e-13 1.803e-09 19.64
Using a FDR cutoff of 0.01, we identified 4115 CGIs that are differentially methylated between ALL and control samples.
Create a heatmap of beta values of top 100 hits:
library(gplots)
DMR100 <- topTable(DMRfitEb, coef = "GroupALL", number = 100)
DMR.CGI <- t(as.matrix(subset(beta.CGI, rownames(beta.CGI) %in% rownames(DMR100))))
str(DMR.CGI, max.level = 0)
## num [1:42, 1:100] 0.707 0.688 0.69 0.728 0.692 ...
## - attr(*, "dimnames")=List of 2
col <- c(rep("darkgoldenrod1", times = nrow(DMR.CGI)))
col[grepl("HBC", rownames(DMR.CGI))] <- "forestgreen"
op <- par(mai = rep(0.5, 4))
heatmap.2(DMR.CGI, col = redblue(256), RowSideColors = col, density.info = "none",
trace = "none", Rowv = TRUE, Colv = TRUE, labCol = FALSE, labRow = FALSE,
dendrogram = "row", margins = c(1, 5))
legend("topright", c("ALL", "HBC"), col = c("darkgoldenrod1", "forestgreen"),
pch = 15)
Create a stripplot of the beta values of the probes within the top CGI hits:
DMR5 <- topTable(DMRfitEb, coef = "GroupALL", number = 5)
beta.DMR5probe <- beta.inCGI[cginame[rownames(beta.inCGI), ]$cginame %in% rownames(DMR5),
]
beta.DMR5probe.tall <- melt(beta.DMR5probe, value.name = "M", varnames = c("Probe_ID",
"Sample"))
beta.DMR5probe.tall$Group <- factor(gsub("_[0-9]+", "", beta.DMR5probe.tall$Sample))
beta.DMR5probe.tall$CGI <- factor(cginame[as.character(beta.DMR5probe.tall$Probe_ID),
]$cginame)
(beta.DMR5.stripplot <- ggplot(data = beta.DMR5probe.tall, aes(x = Group, y = M,
color = Group)) + geom_point(position = position_jitter(width = 0.05), na.rm = T) +
stat_summary(fun.y = mean, aes(group = 1), geom = "line", color = "black") +
facet_grid(. ~ CGI) + ggtitle("Probe beta values within top 5 DM CGIs") +
xlab("Group") + ylab("beta") + theme_bw())
## Warning: Removed 3 rows containing missing values (stat_summary).
## Warning: Removed 1 rows containing missing values (stat_summary).
## Warning: Removed 2 rows containing missing values (stat_summary).
## Warning: Removed 3 rows containing missing values (stat_summary).
Plot location of differential methylated probes along each chromosome:
# get the length of chromosome 1-22 and X
chrlen <- unlist(as.list(IlluminaHumanMethylation450kCHRLENGTHS)[c(as.character(1:22),
"X")])
chrlen <- data.frame(chr = factor(names(chrlen)), length = chrlen)
chr <- IlluminaHumanMethylation450kCHR # get the chromosome of each probe
# get the probe identifiers that are mapped to chromosome
chr <- unlist(as.list(chr[mappedkeys(chr)]))
# get chromosome coordinate of each probe
coord <- IlluminaHumanMethylation450kCPGCOORDINATE
# get the probe identifiers that are mapped to coordinate
coord <- unlist(as.list(coord[mappedkeys(coord)]))
coord <- data.frame(chr = chr[intersect(names(chr), names(coord))], coord = coord[intersect(names(chr),
names(coord))])
# coordinates of probes in DM CGIs
coordDMRprobe <- droplevels(na.omit(coord[cginame[cginame$cginame %in% rownames(DMR),
]$Probe_ID, ]))
(coord.plot <- ggplot(data = coordDMRprobe) + geom_linerange(aes(factor(chr,
levels = c("X", as.character(22:1))), ymin = 0, ymax = length), data = chrlen,
alpha = 0.5) + geom_point(aes(x = factor(chr, levels = c("X", as.character(22:1))),
y = coord), position = position_jitter(width = 0.03), na.rm = T) + ggtitle("DMR positions on chromosomes") +
ylab("Position of DMRs") + xlab("chr") + coord_flip() + theme_bw())