Seminar : DNA Methylation Analysis (Array)

by Erica Acton

Explore 450K methylation

Retrieve datasets.

if (file.exists("methyl_ALL.Rdata")) {
    load("methyl_ALL.Rdata")
} else {
    GSE39141 <- getGEO("GSE39141")  #29 ALL and 4 healthy B cells
    show(GSE39141)
    GSE42865 <- getGEO("GSE42865")  #9 heathy and 7 other conditions
    show(GSE42865)
    # Extract expression matrices and convert to data frames
    ALL.dat <- as.data.frame(exprs(GSE39141[[1]]))
    CTRL.dat <- as.data.frame(exprs(GSE42865[[1]]))
    # Obtain meta-data
    ALL.meta <- pData(phenoData(GSE39141[[1]]))
    CTRL.meta <- pData(phenoData(GSE42865[[1]]))
    # Create labels
    ALL.meta$Group <- c(rep("ALL", 29), rep("HBC", 4))
    # subset meta-data and data for healthy donors
    CTRL.meta <- droplevels(subset(CTRL.meta, grepl("Healthy donor", characteristics_ch1.1)))
    CTRL.dat <- subset(CTRL.dat, select = as.character(CTRL.meta$geo_accession))
    # Rename variables
    names(ALL.dat) <- paste(ALL.meta$Group, gsub("GSM", "", names(ALL.dat)), 
        sep = "_")
    names(CTRL.dat) <- paste("HBC", gsub("GSM", "", names(CTRL.dat)), sep = "_")

    save(ALL.dat, CTRL.dat, ALL.meta, CTRL.meta, file = "methyl_ALL.Rdata")
}

Exploratory analysis looking at the density of Beta values for probes in the two datasets.

ALL.mean <- rowMeans(ALL.dat, na.rm = T)
CTRL.mean <- rowMeans(CTRL.dat, na.rm = T)
both.mean <- data.frame(Beta = c(ALL.mean, CTRL.mean), type = c(rep("ALL", length(ALL.mean)), 
    rep("Control", length(CTRL.mean))))

densityplot(~Beta, both.mean, groups = type, grid = TRUE, plot.points = FALSE, 
    auto.key = TRUE, main = "Density of Beta values before normalization")

plot of chunk unnamed-chunk-3

Normalization

Combine data from both experiments into one matrix.

beta.matrix <- as.matrix(cbind(ALL.dat, CTRL.dat))
str(beta.matrix, max.level = 0)
##  num [1:485577, 1:42] 0.512 0.911 0.857 0.149 0.729 ...
##  - attr(*, "dimnames")=List of 2

Perform quantile normalization of beta values.

system.time(beta.norm <- betaqn(beta.matrix))
##    user  system elapsed 
##  45.987   0.376  46.467
# takes longer than Gloria...poor computer...

Plot density of Beta values after normalization.

beta.qn <- data.frame(Beta = c(rowMeans(beta.norm[, colnames(ALL.dat)], na.rm = T), 
    rowMeans(beta.norm[, colnames(CTRL.dat)], na.rm = T)), group = c(rep("ALL", 
    nrow(ALL.dat)), rep("Control", nrow(CTRL.dat))))

densityplot(~Beta, beta.qn, groups = group, grid = TRUE, plot.points = FALSE, 
    auto.key = TRUE, main = "Density of Beta values after normalization")

plot of chunk unnamed-chunk-6

M Values

Apply transformation ie. compute M value.

M.norm <- beta2m(beta.norm)

CpG Islands

Extract probe ID to CpG islands association.

cginame <- as.data.frame(IlluminaHumanMethylation450kCPGINAME)
names(cginame) <- c("Probe_ID", "cginame")
rownames(cginame) <- cginame$Probe_ID
length(levels(factor(cginame$cginame)))
## [1] 27176

Restrict probes to within CGIs.

beta.inCGI <- beta.norm[cginame$Probe_ID, ]
M.inCGI <- M.norm[cginame$Probe_ID, ]
nrow(M.inCGI)
## [1] 309465

Aggregate probes to CGIs (B-values).

beta.CGI <- aggregate(beta.inCGI, by = list(cginame$cginame), mean, na.rm = T)
rownames(beta.CGI) <- beta.CGI[, "Group.1"]
beta.CGI <- subset(beta.CGI, select = -Group.1)
str(beta.CGI, max.level = 0)
## 'data.frame':    27176 obs. of  42 variables:

Aggregate probes to CGIs (M-values).

M.CGI <- aggregate(M.inCGI, by = list(cginame$cginame), mean, na.rm = T)
rownames(M.CGI) <- M.CGI[, "Group.1"]
M.CGI <- subset(M.CGI, select = -Group.1)
str(M.CGI, max.level = 0)
## 'data.frame':    27176 obs. of  42 variables:

Boxplot of distribution of CGI M-values.

boxplot(M.CGI, xlab = "Samples", ylab = "M values", main = "Distribution of CGI M values")
## Warning: Outlier (-Inf) in boxplot 2 is not drawn
## Warning: Outlier (-Inf) in boxplot 3 is not drawn
## Warning: Outlier (-Inf) in boxplot 6 is not drawn
## Warning: Outlier (-Inf) in boxplot 7 is not drawn
## Warning: Outlier (-Inf) in boxplot 8 is not drawn
## Warning: Outlier (-Inf) in boxplot 11 is not drawn
## Warning: Outlier (-Inf) in boxplot 12 is not drawn
## Warning: Outlier (-Inf) in boxplot 14 is not drawn
## Warning: Outlier (-Inf) in boxplot 16 is not drawn
## Warning: Outlier (-Inf) in boxplot 18 is not drawn
## Warning: Outlier (-Inf) in boxplot 19 is not drawn
## Warning: Outlier (-Inf) in boxplot 20 is not drawn
## Warning: Outlier (-Inf) in boxplot 21 is not drawn
## Warning: Outlier (-Inf) in boxplot 22 is not drawn
## Warning: Outlier (-Inf) in boxplot 25 is not drawn
## Warning: Outlier (-Inf) in boxplot 26 is not drawn
## Warning: Outlier (-Inf) in boxplot 27 is not drawn
## Warning: Outlier (-Inf) in boxplot 28 is not drawn
## Warning: Outlier (-Inf) in boxplot 29 is not drawn

plot of chunk unnamed-chunk-12

Differential methylation analysis with limma

Use a linear model to identify differentially methylated CGIs. Design matrix, lmFit, eBayes, topTable…

design <- data.frame(Group = relevel(factor(gsub("_[0-9]+", "", colnames(M.CGI))), 
    ref = "HBC"), row.names = colnames(M.CGI))
str(design)
## 'data.frame':    42 obs. of  1 variable:
##  $ Group: Factor w/ 2 levels "HBC","ALL": 2 2 2 2 2 2 2 2 2 2 ...

(DesMat <- model.matrix(~Group, design))
##             (Intercept) GroupALL
## ALL_956761            1        1
## ALL_956762            1        1
## ALL_956763            1        1
## ALL_956764            1        1
## ALL_956765            1        1
## ALL_956766            1        1
## ALL_956767            1        1
## ALL_956768            1        1
## ALL_956769            1        1
## ALL_956770            1        1
## ALL_956771            1        1
## ALL_956772            1        1
## ALL_956773            1        1
## ALL_956774            1        1
## ALL_956775            1        1
## ALL_956776            1        1
## ALL_956777            1        1
## ALL_956778            1        1
## ALL_956779            1        1
## ALL_956780            1        1
## ALL_956781            1        1
## ALL_956782            1        1
## ALL_956783            1        1
## ALL_956784            1        1
## ALL_956785            1        1
## ALL_956786            1        1
## ALL_956787            1        1
## ALL_956788            1        1
## ALL_956789            1        1
## HBC_956790            1        0
## HBC_956791            1        0
## HBC_956792            1        0
## HBC_956793            1        0
## HBC_1052420           1        0
## HBC_1052421           1        0
## HBC_1052422           1        0
## HBC_1052423           1        0
## HBC_1052424           1        0
## HBC_1052425           1        0
## HBC_1052426           1        0
## HBC_1052427           1        0
## HBC_1052428           1        0
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$Group
## [1] "contr.treatment"

DMRfit <- lmFit(M.CGI, DesMat)
DMRfitEb <- eBayes(DMRfit)
cutoff <- 0.01
DMR <- topTable(DMRfitEb, coef = "GroupALL", number = Inf, p.value = cutoff)
head(DMR)
##                            logFC AveExpr      t   P.Value adj.P.Val     B
## chr19:49828412-49828668   1.3084   2.283  12.05 2.798e-15 7.604e-11 24.31
## chr4:156680095-156681386 -1.1115  -2.521 -10.95 6.245e-14 8.485e-10 21.39
## chr20:11871374-11872207  -1.2639  -2.284 -10.49 2.370e-13 1.803e-09 20.13
## chr19:33726654-33726946   0.9429   1.887  10.39 3.172e-13 1.803e-09 19.85
## chr18:77166704-77167043   0.8104   3.199  10.36 3.429e-13 1.803e-09 19.78
## chr18:46447718-46448083  -0.8990   2.034 -10.31 3.980e-13 1.803e-09 19.64
# Number of CGIs differentially methylated between ALL and Control group at
# a cutoff of FDR=0.01
nrow(DMR)
## [1] 4115

Make a heatmap of the top 100 hits.

DMR100 <- topTable(DMRfitEb, coef = "GroupALL", number = 100)
DMR.CGI <- t(as.matrix(subset(beta.CGI, rownames(beta.CGI) %in% rownames(DMR100))))
str(DMR.CGI, max.level = 0)
##  num [1:42, 1:100] 0.751 0.735 0.72 0.702 0.737 ...
##  - attr(*, "dimnames")=List of 2

col <- c(rep("darkgoldenrod1", times = nrow(DMR.CGI)))
col[grepl("HBC", rownames(DMR.CGI))] <- "forestgreen"
op <- par(mai = rep(0.5, 4))
heatmap.2(DMR.CGI, col = redblue(256), RowSideColors = col, density.info = "none", 
    trace = "none", Rowv = TRUE, Colv = TRUE, labCol = FALSE, labRow = FALSE, 
    dendrogram = "row", margins = c(1, 5))
legend("topright", c("ALL", "HBC"), col = c("darkgoldenrod1", "forestgreen"), 
    pch = 15)

plot of chunk unnamed-chunk-14

par(op)

Make a stripplot of beta values of probes within top 5 CGI hits.

DMR5 <- topTable(DMRfitEb, coef = "GroupALL", number = 5)
beta.DMR5probe <- beta.inCGI[cginame[rownames(beta.inCGI), ]$cginame %in% rownames(DMR5), 
    ]
beta.DMR5probe.tall <- melt(beta.DMR5probe, value.name = "M", varnames = c("Probe_ID", 
    "Sample"))
beta.DMR5probe.tall$Group <- factor(gsub("_[0-9]+", "", beta.DMR5probe.tall$Sample))
beta.DMR5probe.tall$CGI <- factor(cginame[as.character(beta.DMR5probe.tall$Probe_ID), 
    ]$cginame)
(beta.DMR5.stripplot <- ggplot(data = beta.DMR5probe.tall, aes(x = Group, y = M, 
    color = Group)) + geom_point(position = position_jitter(width = 0.05), na.rm = T) + 
    stat_summary(fun.y = mean, aes(group = 1), geom = "line", color = "black") + 
    facet_grid(. ~ CGI) + ggtitle("Probe beta values within top 5 DM CGIs") + 
    xlab("Group") + ylab("Beta") + theme_bw())
## Warning: Removed 3 rows containing missing values (stat_summary).
## Warning: Removed 1 rows containing missing values (stat_summary).
## Warning: Removed 2 rows containing missing values (stat_summary).
## Warning: Removed 3 rows containing missing values (stat_summary).

plot of chunk unnamed-chunk-15

Plot the location of DM probes along each chromosome.

# chromosome lengths 1-22 and X
chrlen <- unlist(as.list(IlluminaHumanMethylation450kCHRLENGTHS)[c(as.character(1:22), 
    "X")])
chrlen <- data.frame(chr = factor(names(chrlen)), length = chrlen)
chr <- IlluminaHumanMethylation450kCHR
# probe identifiers mapped to chromosome
chr <- unlist(as.list(chr[mappedkeys(chr)]))
# coordinate of probe
coord <- IlluminaHumanMethylation450kCPGCOORDINATE
# probe identifiers mapped to coordinate
coord <- unlist(as.list(coord[mappedkeys(coord)]))
coord <- data.frame(chr = chr[intersect(names(chr), names(coord))], coord = coord[intersect(names(chr), 
    names(coord))])
# coordinates of probes in DM CGIs
coordDMRprobe <- droplevels(na.omit(coord[cginame[cginame$cginame %in% rownames(DMR), 
    ]$Probe_ID, ]))
# plot
(coord.plot <- ggplot(data = coordDMRprobe) + geom_linerange(aes(factor(chr, 
    levels = c("X", as.character(22:1))), ymin = 0, ymax = length), data = chrlen, 
    alpha = 0.5) + geom_point(aes(x = factor(chr, levels = c("X", as.character(22:1))), 
    y = coord), position = position_jitter(width = 0.03), na.rm = T) + ggtitle("DMR positions on chromosomes") + 
    ylab("Position of DMRs") + xlab("chr") + coord_flip() + theme_bw())

plot of chunk unnamed-chunk-16

Interpretation and functional enrichment analysis

Associate DMRs with biological features/functions by associating them with genes. Get annotation information ie. probes associated with different genomic regions or gene IDs. Below is a list of objects available in the package to acheive this task.

ls("package:IlluminaHumanMethylation450k.db")
##  [1] "IlluminaHumanMethylation450k"                  
##  [2] "IlluminaHumanMethylation450kACCNUM"            
##  [3] "IlluminaHumanMethylation450kALIAS2PROBE"       
##  [4] "IlluminaHumanMethylation450kBLAME"             
##  [5] "IlluminaHumanMethylation450kBUILD"             
##  [6] "IlluminaHumanMethylation450kCHR"               
##  [7] "IlluminaHumanMethylation450kCHR36"             
##  [8] "IlluminaHumanMethylation450kCHR37"             
##  [9] "IlluminaHumanMethylation450kCHRLENGTHS"        
## [10] "IlluminaHumanMethylation450kCHRLOC"            
## [11] "IlluminaHumanMethylation450kCHRLOCEND"         
## [12] "IlluminaHumanMethylation450kCOLORCHANNEL"      
## [13] "IlluminaHumanMethylation450kCPG36"             
## [14] "IlluminaHumanMethylation450kCPG37"             
## [15] "IlluminaHumanMethylation450kCPGCOORDINATE"     
## [16] "IlluminaHumanMethylation450kCPGILOCATION"      
## [17] "IlluminaHumanMethylation450kCPGINAME"          
## [18] "IlluminaHumanMethylation450kCPGIRELATION"      
## [19] "IlluminaHumanMethylation450kCPGS"              
## [20] "IlluminaHumanMethylation450k_dbconn"           
## [21] "IlluminaHumanMethylation450k_dbfile"           
## [22] "IlluminaHumanMethylation450k_dbInfo"           
## [23] "IlluminaHumanMethylation450k_dbschema"         
## [24] "IlluminaHumanMethylation450kDESIGN"            
## [25] "IlluminaHumanMethylation450kDHS"               
## [26] "IlluminaHumanMethylation450kDMR"               
## [27] "IlluminaHumanMethylation450kENHANCER"          
## [28] "IlluminaHumanMethylation450kENSEMBL"           
## [29] "IlluminaHumanMethylation450kENSEMBL2PROBE"     
## [30] "IlluminaHumanMethylation450kENTREZID"          
## [31] "IlluminaHumanMethylation450kENZYME"            
## [32] "IlluminaHumanMethylation450kENZYME2PROBE"      
## [33] "IlluminaHumanMethylation450kFANTOM"            
## [34] "IlluminaHumanMethylation450kGENEBODY"          
## [35] "IlluminaHumanMethylation450kGENENAME"          
## [36] "IlluminaHumanMethylation450k_get27k"           
## [37] "IlluminaHumanMethylation450k_getControls"      
## [38] "IlluminaHumanMethylation450k_getProbeOrdering" 
## [39] "IlluminaHumanMethylation450k_getProbes"        
## [40] "IlluminaHumanMethylation450kGO"                
## [41] "IlluminaHumanMethylation450kGO2ALLPROBES"      
## [42] "IlluminaHumanMethylation450kGO2PROBE"          
## [43] "IlluminaHumanMethylation450kISCPGISLAND"       
## [44] "IlluminaHumanMethylation450kMAP"               
## [45] "IlluminaHumanMethylation450kMAPCOUNTS"         
## [46] "IlluminaHumanMethylation450kMETHYL27"          
## [47] "IlluminaHumanMethylation450kNUID"              
## [48] "IlluminaHumanMethylation450kOMIM"              
## [49] "IlluminaHumanMethylation450kORGANISM"          
## [50] "IlluminaHumanMethylation450kORGPKG"            
## [51] "IlluminaHumanMethylation450kPATH"              
## [52] "IlluminaHumanMethylation450kPATH2PROBE"        
## [53] "IlluminaHumanMethylation450kPFAM"              
## [54] "IlluminaHumanMethylation450kPMID"              
## [55] "IlluminaHumanMethylation450kPMID2PROBE"        
## [56] "IlluminaHumanMethylation450kPROBELOCATION"     
## [57] "IlluminaHumanMethylation450kPROSITE"           
## [58] "IlluminaHumanMethylation450kRANDOM"            
## [59] "IlluminaHumanMethylation450kREFSEQ"            
## [60] "IlluminaHumanMethylation450kREGULATORYGROUP"   
## [61] "IlluminaHumanMethylation450kREGULATORYLOCATION"
## [62] "IlluminaHumanMethylation450kREGULATORYNAME"    
## [63] "IlluminaHumanMethylation450kSVNID"             
## [64] "IlluminaHumanMethylation450kSYMBOL"            
## [65] "IlluminaHumanMethylation450kUNIGENE"           
## [66] "IlluminaHumanMethylation450kUNIPROT"