by Erica Acton
Retrieve datasets.
if (file.exists("methyl_ALL.Rdata")) {
load("methyl_ALL.Rdata")
} else {
GSE39141 <- getGEO("GSE39141") #29 ALL and 4 healthy B cells
show(GSE39141)
GSE42865 <- getGEO("GSE42865") #9 heathy and 7 other conditions
show(GSE42865)
# Extract expression matrices and convert to data frames
ALL.dat <- as.data.frame(exprs(GSE39141[[1]]))
CTRL.dat <- as.data.frame(exprs(GSE42865[[1]]))
# Obtain meta-data
ALL.meta <- pData(phenoData(GSE39141[[1]]))
CTRL.meta <- pData(phenoData(GSE42865[[1]]))
# Create labels
ALL.meta$Group <- c(rep("ALL", 29), rep("HBC", 4))
# subset meta-data and data for healthy donors
CTRL.meta <- droplevels(subset(CTRL.meta, grepl("Healthy donor", characteristics_ch1.1)))
CTRL.dat <- subset(CTRL.dat, select = as.character(CTRL.meta$geo_accession))
# Rename variables
names(ALL.dat) <- paste(ALL.meta$Group, gsub("GSM", "", names(ALL.dat)),
sep = "_")
names(CTRL.dat) <- paste("HBC", gsub("GSM", "", names(CTRL.dat)), sep = "_")
save(ALL.dat, CTRL.dat, ALL.meta, CTRL.meta, file = "methyl_ALL.Rdata")
}
Exploratory analysis looking at the density of Beta values for probes in the two datasets.
ALL.mean <- rowMeans(ALL.dat, na.rm = T)
CTRL.mean <- rowMeans(CTRL.dat, na.rm = T)
both.mean <- data.frame(Beta = c(ALL.mean, CTRL.mean), type = c(rep("ALL", length(ALL.mean)),
rep("Control", length(CTRL.mean))))
densityplot(~Beta, both.mean, groups = type, grid = TRUE, plot.points = FALSE,
auto.key = TRUE, main = "Density of Beta values before normalization")
Combine data from both experiments into one matrix.
beta.matrix <- as.matrix(cbind(ALL.dat, CTRL.dat))
str(beta.matrix, max.level = 0)
## num [1:485577, 1:42] 0.512 0.911 0.857 0.149 0.729 ...
## - attr(*, "dimnames")=List of 2
Perform quantile normalization of beta values.
system.time(beta.norm <- betaqn(beta.matrix))
## user system elapsed
## 45.987 0.376 46.467
# takes longer than Gloria...poor computer...
Plot density of Beta values after normalization.
beta.qn <- data.frame(Beta = c(rowMeans(beta.norm[, colnames(ALL.dat)], na.rm = T),
rowMeans(beta.norm[, colnames(CTRL.dat)], na.rm = T)), group = c(rep("ALL",
nrow(ALL.dat)), rep("Control", nrow(CTRL.dat))))
densityplot(~Beta, beta.qn, groups = group, grid = TRUE, plot.points = FALSE,
auto.key = TRUE, main = "Density of Beta values after normalization")
Apply transformation ie. compute M value.
M.norm <- beta2m(beta.norm)
Extract probe ID to CpG islands association.
cginame <- as.data.frame(IlluminaHumanMethylation450kCPGINAME)
names(cginame) <- c("Probe_ID", "cginame")
rownames(cginame) <- cginame$Probe_ID
length(levels(factor(cginame$cginame)))
## [1] 27176
Restrict probes to within CGIs.
beta.inCGI <- beta.norm[cginame$Probe_ID, ]
M.inCGI <- M.norm[cginame$Probe_ID, ]
nrow(M.inCGI)
## [1] 309465
Aggregate probes to CGIs (B-values).
beta.CGI <- aggregate(beta.inCGI, by = list(cginame$cginame), mean, na.rm = T)
rownames(beta.CGI) <- beta.CGI[, "Group.1"]
beta.CGI <- subset(beta.CGI, select = -Group.1)
str(beta.CGI, max.level = 0)
## 'data.frame': 27176 obs. of 42 variables:
Aggregate probes to CGIs (M-values).
M.CGI <- aggregate(M.inCGI, by = list(cginame$cginame), mean, na.rm = T)
rownames(M.CGI) <- M.CGI[, "Group.1"]
M.CGI <- subset(M.CGI, select = -Group.1)
str(M.CGI, max.level = 0)
## 'data.frame': 27176 obs. of 42 variables:
Boxplot of distribution of CGI M-values.
boxplot(M.CGI, xlab = "Samples", ylab = "M values", main = "Distribution of CGI M values")
## Warning: Outlier (-Inf) in boxplot 2 is not drawn
## Warning: Outlier (-Inf) in boxplot 3 is not drawn
## Warning: Outlier (-Inf) in boxplot 6 is not drawn
## Warning: Outlier (-Inf) in boxplot 7 is not drawn
## Warning: Outlier (-Inf) in boxplot 8 is not drawn
## Warning: Outlier (-Inf) in boxplot 11 is not drawn
## Warning: Outlier (-Inf) in boxplot 12 is not drawn
## Warning: Outlier (-Inf) in boxplot 14 is not drawn
## Warning: Outlier (-Inf) in boxplot 16 is not drawn
## Warning: Outlier (-Inf) in boxplot 18 is not drawn
## Warning: Outlier (-Inf) in boxplot 19 is not drawn
## Warning: Outlier (-Inf) in boxplot 20 is not drawn
## Warning: Outlier (-Inf) in boxplot 21 is not drawn
## Warning: Outlier (-Inf) in boxplot 22 is not drawn
## Warning: Outlier (-Inf) in boxplot 25 is not drawn
## Warning: Outlier (-Inf) in boxplot 26 is not drawn
## Warning: Outlier (-Inf) in boxplot 27 is not drawn
## Warning: Outlier (-Inf) in boxplot 28 is not drawn
## Warning: Outlier (-Inf) in boxplot 29 is not drawn
Use a linear model to identify differentially methylated CGIs. Design matrix, lmFit, eBayes, topTable…
design <- data.frame(Group = relevel(factor(gsub("_[0-9]+", "", colnames(M.CGI))),
ref = "HBC"), row.names = colnames(M.CGI))
str(design)
## 'data.frame': 42 obs. of 1 variable:
## $ Group: Factor w/ 2 levels "HBC","ALL": 2 2 2 2 2 2 2 2 2 2 ...
(DesMat <- model.matrix(~Group, design))
## (Intercept) GroupALL
## ALL_956761 1 1
## ALL_956762 1 1
## ALL_956763 1 1
## ALL_956764 1 1
## ALL_956765 1 1
## ALL_956766 1 1
## ALL_956767 1 1
## ALL_956768 1 1
## ALL_956769 1 1
## ALL_956770 1 1
## ALL_956771 1 1
## ALL_956772 1 1
## ALL_956773 1 1
## ALL_956774 1 1
## ALL_956775 1 1
## ALL_956776 1 1
## ALL_956777 1 1
## ALL_956778 1 1
## ALL_956779 1 1
## ALL_956780 1 1
## ALL_956781 1 1
## ALL_956782 1 1
## ALL_956783 1 1
## ALL_956784 1 1
## ALL_956785 1 1
## ALL_956786 1 1
## ALL_956787 1 1
## ALL_956788 1 1
## ALL_956789 1 1
## HBC_956790 1 0
## HBC_956791 1 0
## HBC_956792 1 0
## HBC_956793 1 0
## HBC_1052420 1 0
## HBC_1052421 1 0
## HBC_1052422 1 0
## HBC_1052423 1 0
## HBC_1052424 1 0
## HBC_1052425 1 0
## HBC_1052426 1 0
## HBC_1052427 1 0
## HBC_1052428 1 0
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$Group
## [1] "contr.treatment"
DMRfit <- lmFit(M.CGI, DesMat)
DMRfitEb <- eBayes(DMRfit)
cutoff <- 0.01
DMR <- topTable(DMRfitEb, coef = "GroupALL", number = Inf, p.value = cutoff)
head(DMR)
## logFC AveExpr t P.Value adj.P.Val B
## chr19:49828412-49828668 1.3084 2.283 12.05 2.798e-15 7.604e-11 24.31
## chr4:156680095-156681386 -1.1115 -2.521 -10.95 6.245e-14 8.485e-10 21.39
## chr20:11871374-11872207 -1.2639 -2.284 -10.49 2.370e-13 1.803e-09 20.13
## chr19:33726654-33726946 0.9429 1.887 10.39 3.172e-13 1.803e-09 19.85
## chr18:77166704-77167043 0.8104 3.199 10.36 3.429e-13 1.803e-09 19.78
## chr18:46447718-46448083 -0.8990 2.034 -10.31 3.980e-13 1.803e-09 19.64
# Number of CGIs differentially methylated between ALL and Control group at
# a cutoff of FDR=0.01
nrow(DMR)
## [1] 4115
Make a heatmap of the top 100 hits.
DMR100 <- topTable(DMRfitEb, coef = "GroupALL", number = 100)
DMR.CGI <- t(as.matrix(subset(beta.CGI, rownames(beta.CGI) %in% rownames(DMR100))))
str(DMR.CGI, max.level = 0)
## num [1:42, 1:100] 0.751 0.735 0.72 0.702 0.737 ...
## - attr(*, "dimnames")=List of 2
col <- c(rep("darkgoldenrod1", times = nrow(DMR.CGI)))
col[grepl("HBC", rownames(DMR.CGI))] <- "forestgreen"
op <- par(mai = rep(0.5, 4))
heatmap.2(DMR.CGI, col = redblue(256), RowSideColors = col, density.info = "none",
trace = "none", Rowv = TRUE, Colv = TRUE, labCol = FALSE, labRow = FALSE,
dendrogram = "row", margins = c(1, 5))
legend("topright", c("ALL", "HBC"), col = c("darkgoldenrod1", "forestgreen"),
pch = 15)
par(op)
Make a stripplot of beta values of probes within top 5 CGI hits.
DMR5 <- topTable(DMRfitEb, coef = "GroupALL", number = 5)
beta.DMR5probe <- beta.inCGI[cginame[rownames(beta.inCGI), ]$cginame %in% rownames(DMR5),
]
beta.DMR5probe.tall <- melt(beta.DMR5probe, value.name = "M", varnames = c("Probe_ID",
"Sample"))
beta.DMR5probe.tall$Group <- factor(gsub("_[0-9]+", "", beta.DMR5probe.tall$Sample))
beta.DMR5probe.tall$CGI <- factor(cginame[as.character(beta.DMR5probe.tall$Probe_ID),
]$cginame)
(beta.DMR5.stripplot <- ggplot(data = beta.DMR5probe.tall, aes(x = Group, y = M,
color = Group)) + geom_point(position = position_jitter(width = 0.05), na.rm = T) +
stat_summary(fun.y = mean, aes(group = 1), geom = "line", color = "black") +
facet_grid(. ~ CGI) + ggtitle("Probe beta values within top 5 DM CGIs") +
xlab("Group") + ylab("Beta") + theme_bw())
## Warning: Removed 3 rows containing missing values (stat_summary).
## Warning: Removed 1 rows containing missing values (stat_summary).
## Warning: Removed 2 rows containing missing values (stat_summary).
## Warning: Removed 3 rows containing missing values (stat_summary).
Plot the location of DM probes along each chromosome.
# chromosome lengths 1-22 and X
chrlen <- unlist(as.list(IlluminaHumanMethylation450kCHRLENGTHS)[c(as.character(1:22),
"X")])
chrlen <- data.frame(chr = factor(names(chrlen)), length = chrlen)
chr <- IlluminaHumanMethylation450kCHR
# probe identifiers mapped to chromosome
chr <- unlist(as.list(chr[mappedkeys(chr)]))
# coordinate of probe
coord <- IlluminaHumanMethylation450kCPGCOORDINATE
# probe identifiers mapped to coordinate
coord <- unlist(as.list(coord[mappedkeys(coord)]))
coord <- data.frame(chr = chr[intersect(names(chr), names(coord))], coord = coord[intersect(names(chr),
names(coord))])
# coordinates of probes in DM CGIs
coordDMRprobe <- droplevels(na.omit(coord[cginame[cginame$cginame %in% rownames(DMR),
]$Probe_ID, ]))
# plot
(coord.plot <- ggplot(data = coordDMRprobe) + geom_linerange(aes(factor(chr,
levels = c("X", as.character(22:1))), ymin = 0, ymax = length), data = chrlen,
alpha = 0.5) + geom_point(aes(x = factor(chr, levels = c("X", as.character(22:1))),
y = coord), position = position_jitter(width = 0.03), na.rm = T) + ggtitle("DMR positions on chromosomes") +
ylab("Position of DMRs") + xlab("chr") + coord_flip() + theme_bw())
Associate DMRs with biological features/functions by associating them with genes. Get annotation information ie. probes associated with different genomic regions or gene IDs. Below is a list of objects available in the package to acheive this task.
ls("package:IlluminaHumanMethylation450k.db")
## [1] "IlluminaHumanMethylation450k"
## [2] "IlluminaHumanMethylation450kACCNUM"
## [3] "IlluminaHumanMethylation450kALIAS2PROBE"
## [4] "IlluminaHumanMethylation450kBLAME"
## [5] "IlluminaHumanMethylation450kBUILD"
## [6] "IlluminaHumanMethylation450kCHR"
## [7] "IlluminaHumanMethylation450kCHR36"
## [8] "IlluminaHumanMethylation450kCHR37"
## [9] "IlluminaHumanMethylation450kCHRLENGTHS"
## [10] "IlluminaHumanMethylation450kCHRLOC"
## [11] "IlluminaHumanMethylation450kCHRLOCEND"
## [12] "IlluminaHumanMethylation450kCOLORCHANNEL"
## [13] "IlluminaHumanMethylation450kCPG36"
## [14] "IlluminaHumanMethylation450kCPG37"
## [15] "IlluminaHumanMethylation450kCPGCOORDINATE"
## [16] "IlluminaHumanMethylation450kCPGILOCATION"
## [17] "IlluminaHumanMethylation450kCPGINAME"
## [18] "IlluminaHumanMethylation450kCPGIRELATION"
## [19] "IlluminaHumanMethylation450kCPGS"
## [20] "IlluminaHumanMethylation450k_dbconn"
## [21] "IlluminaHumanMethylation450k_dbfile"
## [22] "IlluminaHumanMethylation450k_dbInfo"
## [23] "IlluminaHumanMethylation450k_dbschema"
## [24] "IlluminaHumanMethylation450kDESIGN"
## [25] "IlluminaHumanMethylation450kDHS"
## [26] "IlluminaHumanMethylation450kDMR"
## [27] "IlluminaHumanMethylation450kENHANCER"
## [28] "IlluminaHumanMethylation450kENSEMBL"
## [29] "IlluminaHumanMethylation450kENSEMBL2PROBE"
## [30] "IlluminaHumanMethylation450kENTREZID"
## [31] "IlluminaHumanMethylation450kENZYME"
## [32] "IlluminaHumanMethylation450kENZYME2PROBE"
## [33] "IlluminaHumanMethylation450kFANTOM"
## [34] "IlluminaHumanMethylation450kGENEBODY"
## [35] "IlluminaHumanMethylation450kGENENAME"
## [36] "IlluminaHumanMethylation450k_get27k"
## [37] "IlluminaHumanMethylation450k_getControls"
## [38] "IlluminaHumanMethylation450k_getProbeOrdering"
## [39] "IlluminaHumanMethylation450k_getProbes"
## [40] "IlluminaHumanMethylation450kGO"
## [41] "IlluminaHumanMethylation450kGO2ALLPROBES"
## [42] "IlluminaHumanMethylation450kGO2PROBE"
## [43] "IlluminaHumanMethylation450kISCPGISLAND"
## [44] "IlluminaHumanMethylation450kMAP"
## [45] "IlluminaHumanMethylation450kMAPCOUNTS"
## [46] "IlluminaHumanMethylation450kMETHYL27"
## [47] "IlluminaHumanMethylation450kNUID"
## [48] "IlluminaHumanMethylation450kOMIM"
## [49] "IlluminaHumanMethylation450kORGANISM"
## [50] "IlluminaHumanMethylation450kORGPKG"
## [51] "IlluminaHumanMethylation450kPATH"
## [52] "IlluminaHumanMethylation450kPATH2PROBE"
## [53] "IlluminaHumanMethylation450kPFAM"
## [54] "IlluminaHumanMethylation450kPMID"
## [55] "IlluminaHumanMethylation450kPMID2PROBE"
## [56] "IlluminaHumanMethylation450kPROBELOCATION"
## [57] "IlluminaHumanMethylation450kPROSITE"
## [58] "IlluminaHumanMethylation450kRANDOM"
## [59] "IlluminaHumanMethylation450kREFSEQ"
## [60] "IlluminaHumanMethylation450kREGULATORYGROUP"
## [61] "IlluminaHumanMethylation450kREGULATORYLOCATION"
## [62] "IlluminaHumanMethylation450kREGULATORYNAME"
## [63] "IlluminaHumanMethylation450kSVNID"
## [64] "IlluminaHumanMethylation450kSYMBOL"
## [65] "IlluminaHumanMethylation450kUNIGENE"
## [66] "IlluminaHumanMethylation450kUNIPROT"