This file contains clustering procedure and key results for: (A)pre-normalized, i.e. raw (B)post-normalized ©post-normalized and filtered by pairwise ctrl group comparison
Clustering for pre-normalized raw data: 1.) Probe-based (belta values from 485577 probes for the CPG sites)
First, load previously compiled data
setwd("C://STAT540/project/")
library(RColorBrewer)
##### Load raw data (modify the path accordingly)
load("Data/All_3_metasets.Rdata")
load("Data/All_3_sets.Rdata")
# merge 3 data sets and design
merged.dat <- cbind(APL.dat, ALL.dat, CTRL.dat)
merged.des <- data.frame(Samples = colnames(merged.dat), Group = c(APL.meta$Group,
ALL.meta$Group, rep("HBC", 9)))
Try different methods for clustering, i.e. average, complete, ward, single
# compute pairwise distances
pr.dis <- dist(t(merged.dat), method = "euclidean")
# sanity check between the sample name and group assigned
pr.nms <- with(merged.des, paste(Samples, Group, sep = "_"))
all(unlist(lapply(pr.nms, function(x) substr(x, 1, 3) == substr(x, nchar(x) -
2, nchar(x))))) # TRUE
## [1] TRUE
# Explore which clustering method performs the best
# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
(I) overview of the 4 clustering methods w/o sample labels:
plot the dendrograms for the 4 methods and compare them
par(mar = c(0, 4, 4, 2))
par(mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")
(II) plot for each individual method with sample labels to judge which one groups the samples the best:
plot the dendrogram for ward
# ward --> FINAL CHOICE
par(mfrow = c(1, 1))
# decide how many clusters we should cut
unique(merged.des$Group) # 5 categories: ALL HBC HBM PLP PLR
## [1] PLR PLP HBM ALL HBC
## Levels: ALL HBC HBM PLP PLR
# label the leaf nodes by coloring them according to their groups
ward.dendro <- as.dendrogram(pr.hc.w)
# color code each node of the dendrogram according to their group
colors <- brewer.pal(5, "Set1")
names(colors) <- c("ALL", "HBC", "HBM", "PLP", "PLR")
colLab <- function(n) {
if (is.leaf(n)) {
a <- attributes(n)
# find group name
a.group <- substr(a$label, 1, 3)
# retrieve the corresponding color
attr(n, "nodePar") <- c(a$nodePar, list(lab.col = colors[a.group], lab.bg = "grey50",
pch = sample(19:25, 1)))
attr(n, "frame.plot") <- TRUE
}
n
}
clusDendro <- dendrapply(ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "Ward showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors,
title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.w, k = 5, border = c("cyan"))
plot the dendrogram for single
# single
par(mfrow = c(1, 1))
plot(pr.hc.s, labels = merged.des$Samples, cex = 0.6, main = "single showing 5 clusters")
rect.hclust(pr.hc.s, k = 5)
plot the dendrogram for complete
# complete --> Not as good as Ward, tho better than single and average
par(mfrow = c(1, 1))
# color leaf node of the dendrogram
comp.dendro <- as.dendrogram(pr.hc.c)
clusDendro <- dendrapply(comp.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "complete showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors,
title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.c, k = 5, border = c("cyan"))
plot the dendrogram for Average
# Average
par(mfrow = c(1, 1))
plot(pr.hc.a, labels = merged.des$Samples, cex = 0.6, main = "Average showing 5 clusters")
rect.hclust(pr.hc.a, k = 5)
(III) headmap with ward dendrogram Please note that the heatmap line is commented out due to this expansive computatin time The corresponding heatmap could be downloaded from our digital supplementary file
# scale
sMerged.Dat <- t(scale(t(merged.dat)))
# heatmap(as.matrix(sMerged.Dat), Rowv = NA, Colv = NULL, hclustfun =
# function(x) hclust(x, method = 'ward'), distfun = function(x) dist(x,
# method = 'euclidean'), scale = 'none', labCol = pr.nms, labRow = NA,
# margin = c(8, 1), ColSideColor = brewer.pal(11,
#'RdGy')[c(4, 7)][unclass(merged.des$Group)])
(IV) another overview plot for ward vs. complete using the scaled data
pdf("Figures/clusterOverviewByProbe_wad_com_scaledRaw.pdf")
par(mar = c(0, 4, 4, 2))
par(mfrow = c(2, 1))
# recalculate pairwise distance
pr.dis.s <- dist(t(sMerged.Dat), method = "euclidean")
pr.hc.c.s <- hclust(pr.dis.s, method = "complete")
pr.hc.w.s <- hclust(pr.dis.s, method = "ward")
plot(pr.hc.c.s, labels = merged.des$Samples, cex = 0.6, main = "complete showing 5 clusters (scaled)")
rect.hclust(pr.hc.c.s, k = 5)
plot(pr.hc.w.s, labels = merged.des$Samples, cex = 0.6, main = "Ward showing 5 clusters (scaled)")
rect.hclust(pr.hc.w.s, k = 5)
(v) another ward plot using the scaled data: the ALL samples have a better clustering here pdf(“Figures/clusterByProbe_ward_scaledRaw.pdf”)
par(mfrow = c(1, 1))
# label the leaf nodes by coloring them according to their groups
ward.dendro.s <- as.dendrogram(pr.hc.w.s)
clusDendro <- dendrapply(ward.dendro.s, colLab)
plot(clusDendro, cex = 0.6, main = "Scaled Ward showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors,
title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.w.s, k = 5, border = c("cyan"))
2.) CPG-island-based (aggregated as median or mean of the belta values of the probes from one island)
Load the previous calculated data (the R script for this calculation could be retrieved from aggregate_raw_norm_filter.R)
##### Load the raw probe data load('Data/CPGI2Probe_betaList_raw.Rdata')
##### construct a data frame with rows = cpgi IDs, columns = mean belta
##### value for each sample
load("Data/CPGI_betaMeanList__raw.Rdata")
#### Transpose to place the cpgi features in rows
cpgi.Beta.mean <- lapply(cpgi.Beta.mean, t)
cpgi.Beta.mean.whole <- do.call(cbind, cpgi.Beta.mean)
dim(cpgi.Beta.mean.whole)
## [1] 27176 52
# [1] 27176 52
(I) ward plot for the mean cpgi belta values
# ward cluster based on the mean cpgi belta values compute pairwise
# distances
pr.dis.mean <- dist(t(cpgi.Beta.mean.whole), method = "euclidean")
pr.hc.w.mean <- hclust(pr.dis.mean, method = "ward")
par(mfrow = c(1, 1))
# label the leaf nodes by coloring them according to their groups
mean.ward.dendro <- as.dendrogram(pr.hc.w.mean)
clusDendro <- dendrapply(mean.ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "CPGI-based Ward(mean) Clustering showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors,
title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.w.mean, k = 5, border = c("cyan"))
(II) ward plot for the median cpgi belta values
load("Data/CPGI_betaMedianList_raw.Rdata")
#### Transpose to place the cpgi features in rows
cpgi.Beta.median <- lapply(cpgi.Beta.median, t)
cpgi.Beta.median.whole <- do.call(cbind, cpgi.Beta.median)
dim(cpgi.Beta.median.whole)
## [1] 27176 52
# [1] 27176 52
# ward cluster based on the median cpgi belta values compute pairwise
# distances
pr.dis.median <- dist(t(cpgi.Beta.median.whole), method = "euclidean")
pr.hc.w.median <- hclust(pr.dis.median, method = "ward")
pdf("Figures/clusterByMedianCPGI_ward_Raw.pdf")
par(mfrow = c(1, 1))
# label the leaf nodes by coloring them according to their groups
median.ward.dendro <- as.dendrogram(pr.hc.w.median)
clusDendro <- dendrapply(median.ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "CPGI-based Ward(median) Clustering showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors,
title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.w.median, k = 5, border = c("cyan"))
From all the comparison above, we conclude that clustering using ward works the best among all the methods, which is the method we will use to proceed for the normalized and norm&filtered data. Then we can compare whether the normalization and filteration procedures have an effect on the grouping result.
1.) Probe-based (485577 probes for the CPG sites)
load data and pre-process the data for plotting
# load the data
load("Data/All_3_sets_normalized.Rdata")
dim(ALL.norm)
## [1] 485577 33
# [1] 485577 33
##### (1) probe level: clustering by Belta values ##### compute pairwise
##### distances
merged.dat <- cbind(APL.norm, ALL.norm, CTRL.norm)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
## Error: cannot allocate vector of size 3.7 Mb
pr.dis <- dist(t(merged.dat), method = "euclidean")
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
# sanity check between the sample name and group assigned
merged.des <- data.frame(Samples = colnames(merged.dat), Group = c(APL.meta$Group,
ALL.meta$Group, rep("HBC", 9)))
pr.nms <- with(merged.des, paste(Samples, Group, sep = "_"))
all(unlist(lapply(pr.nms, function(x) substr(x, 1, 3) == substr(x, nchar(x) -
2, nchar(x))))) # TRUE
## [1] TRUE
# compute hierarchical clustering using ward linkage type
pr.hc.w <- hclust(pr.dis, method = "ward")
Plot the ward dendrogram
# ward
par(mfrow = c(1, 1))
# label the leaf nodes by coloring them according to their groups
ward.dendro <- as.dendrogram(pr.hc.w)
# color code each node of the dendrogram according to their group
clusDendro <- dendrapply(ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "Ward showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors,
title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.w, k = 5, border = c("cyan"))
(2) cpgi level: clustering by Belta values
load previously calculated data (the R script for this calculation could be retrieved from aggregate_raw_norm_filter.R)
##### Load the norm probe data
### (I) ward plot for the mean cpgi belta values
# construct a data frame with rows = cpgi IDs, columns = mean belta value
# for each sample
load("Data/CPGI_betaMeanList__norm.Rdata")
dim(cpgi.Beta.mean$ALL)
## [1] 33 27176
# [1] 33 27176
#### Transpose to place the cpgi features in rows
cpgi.Beta.mean <- lapply(cpgi.Beta.mean, t)
cpgi.Beta.mean.whole <- do.call(cbind, cpgi.Beta.mean)
dim(cpgi.Beta.mean.whole)
## [1] 27176 52
# [1] 27176 52
Plot the ward cluster based on the mean cpgi belta values
# compute pairwise distances
pr.dis.mean <- dist(t(cpgi.Beta.mean.whole), method = "euclidean")
pr.hc.w.mean <- hclust(pr.dis.mean, method = "ward")
par(mfrow = c(1, 1))
# label the leaf nodes by coloring them according to their groups
mean.ward.dendro <- as.dendrogram(pr.hc.w.mean)
clusDendro <- dendrapply(mean.ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "CPGI-based Ward(mean) Clustering showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors,
title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.w.mean, k = 5, border = c("cyan"))
(II) ward plot for the median cpgi belta values
Load data
load("Data/CPGI_betaMedianList_norm.Rdata")
#### Transpose to place the cpgi features in rows
cpgi.Beta.median <- lapply(cpgi.Beta.median, t)
cpgi.Beta.median.whole <- do.call(cbind, cpgi.Beta.median)
dim(cpgi.Beta.median.whole)
## [1] 27176 52
# [1] 27176 52
Plot ward cluster based on the median cpgi belta values
# compute pairwise distances
pr.dis.median <- dist(t(cpgi.Beta.median.whole), method = "euclidean")
pr.hc.w.median <- hclust(pr.dis.median, method = "ward")
par(mfrow = c(1, 1))
# label the leaf nodes by coloring them according to their groups
median.ward.dendro <- as.dendrogram(pr.hc.w.median)
clusDendro <- dendrapply(median.ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "CPGI-based Ward(median) Clustering showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors,
title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.w.median, k = 5, border = c("cyan"))
Clustering for post-normalized and filtered data:
1.) Probe-based (435739 probes for the CPG sites)
Load data and calculate the distance, then cluster by ward
# load the data
load("Data/All_3_sets_normAndFilt.Rdata")
dim(ALL.dat)
## [1] 435739 33
# [1] 435739 33
##### (1) probe level: clustering by Belta values #####
# compute pairwise distances
merged.dat <- cbind(APL.dat, ALL.dat, CTRL.dat)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
pr.dis <- dist(t(merged.dat), method = "euclidean")
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
## Warning: Reached total allocation of 1535Mb: see help(memory.size)
# sanity check between the sample name and group assigned
merged.des <- data.frame(Samples = colnames(merged.dat), Group = c(APL.meta$Group,
ALL.meta$Group, rep("HBC", 9)))
pr.nms <- with(merged.des, paste(Samples, Group, sep = "_"))
all(unlist(lapply(pr.nms, function(x) substr(x, 1, 3) == substr(x, nchar(x) -
2, nchar(x))))) # TRUE
## [1] TRUE
# compute hierarchical clustering using ward linkage type
pr.hc.w <- hclust(pr.dis, method = "ward")
Plot the dendrogram with ward
par(mfrow = c(1, 1))
# label the leaf nodes by coloring them according to their groups
ward.dendro <- as.dendrogram(pr.hc.w)
# color code each node of the dendrogram according to their group
clusDendro <- dendrapply(ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "Ward showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors,
title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.w, k = 5, border = c("cyan"))