Exploration with hierarchical clustering

This file contains clustering procedure and key results for: (A)pre-normalized, i.e. raw (B)post-normalized ©post-normalized and filtered by pairwise ctrl group comparison

Analysis on Raw Data

Clustering for pre-normalized raw data: 1.) Probe-based (belta values from 485577 probes for the CPG sites)

First, load previously compiled data

setwd("C://STAT540/project/")
library(RColorBrewer)

##### Load raw data (modify the path accordingly)
load("Data/All_3_metasets.Rdata")
load("Data/All_3_sets.Rdata")
# merge 3 data sets and design
merged.dat <- cbind(APL.dat, ALL.dat, CTRL.dat)
merged.des <- data.frame(Samples = colnames(merged.dat), Group = c(APL.meta$Group, 
    ALL.meta$Group, rep("HBC", 9)))

Try different methods for clustering, i.e. average, complete, ward, single

# compute pairwise distances
pr.dis <- dist(t(merged.dat), method = "euclidean")

# sanity check between the sample name and group assigned
pr.nms <- with(merged.des, paste(Samples, Group, sep = "_"))
all(unlist(lapply(pr.nms, function(x) substr(x, 1, 3) == substr(x, nchar(x) - 
    2, nchar(x)))))  # TRUE

## [1] TRUE



# Explore which clustering method performs the best

# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")

(I) overview of the 4 clustering methods w/o sample labels:

plot the dendrograms for the 4 methods and compare them

par(mar = c(0, 4, 4, 2))
par(mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")

plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")

plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")

plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")

plot of chunk dendrogram_I

(II) plot for each individual method with sample labels to judge which one groups the samples the best:

plot the dendrogram for ward

# ward --> FINAL CHOICE
par(mfrow = c(1, 1))
# decide how many clusters we should cut
unique(merged.des$Group)  # 5 categories: ALL HBC HBM PLP PLR

## [1] PLR PLP HBM ALL HBC
## Levels: ALL HBC HBM PLP PLR


# label the leaf nodes by coloring them according to their groups
ward.dendro <- as.dendrogram(pr.hc.w)
# color code each node of the dendrogram according to their group
colors <- brewer.pal(5, "Set1")
names(colors) <- c("ALL", "HBC", "HBM", "PLP", "PLR")

colLab <- function(n) {
    if (is.leaf(n)) {
        a <- attributes(n)
        # find group name
        a.group <- substr(a$label, 1, 3)
        # retrieve the corresponding color
        attr(n, "nodePar") <- c(a$nodePar, list(lab.col = colors[a.group], lab.bg = "grey50", 
            pch = sample(19:25, 1)))
        attr(n, "frame.plot") <- TRUE
    }
    n
}
clusDendro <- dendrapply(ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "Ward showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors, 
    title = "Sample Groups", box.col = "transparent")

rect.hclust(pr.hc.w, k = 5, border = c("cyan"))

plot of chunk dendrogram_II_ward

plot the dendrogram for single

# single
par(mfrow = c(1, 1))
plot(pr.hc.s, labels = merged.des$Samples, cex = 0.6, main = "single showing 5 clusters")
rect.hclust(pr.hc.s, k = 5)

plot of chunk dendrogram_II_single

plot the dendrogram for complete

# complete --> Not as good as Ward, tho better than single and average
par(mfrow = c(1, 1))

# color leaf node of the dendrogram
comp.dendro <- as.dendrogram(pr.hc.c)
clusDendro <- dendrapply(comp.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "complete showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors, 
    title = "Sample Groups", box.col = "transparent")
rect.hclust(pr.hc.c, k = 5, border = c("cyan"))

plot of chunk dendrogram_II_complete

plot the dendrogram for Average

# Average
par(mfrow = c(1, 1))
plot(pr.hc.a, labels = merged.des$Samples, cex = 0.6, main = "Average showing 5 clusters")
rect.hclust(pr.hc.a, k = 5)

plot of chunk dendrogram_II_average

(III) headmap with ward dendrogram Please note that the heatmap line is commented out due to this expansive computatin time The corresponding heatmap could be downloaded from our digital supplementary file

# scale
sMerged.Dat <- t(scale(t(merged.dat)))
# heatmap(as.matrix(sMerged.Dat), Rowv = NA, Colv = NULL, hclustfun =
# function(x) hclust(x, method = 'ward'), distfun = function(x) dist(x,
# method = 'euclidean'), scale = 'none', labCol = pr.nms, labRow = NA,
# margin = c(8, 1), ColSideColor = brewer.pal(11,
#'RdGy')[c(4, 7)][unclass(merged.des$Group)])

(IV) another overview plot for ward vs. complete using the scaled data

pdf("Figures/clusterOverviewByProbe_wad_com_scaledRaw.pdf")
par(mar = c(0, 4, 4, 2))
par(mfrow = c(2, 1))

# recalculate pairwise distance
pr.dis.s <- dist(t(sMerged.Dat), method = "euclidean")
pr.hc.c.s <- hclust(pr.dis.s, method = "complete")
pr.hc.w.s <- hclust(pr.dis.s, method = "ward")

plot(pr.hc.c.s, labels = merged.des$Samples, cex = 0.6, main = "complete showing 5 clusters (scaled)")
rect.hclust(pr.hc.c.s, k = 5)

plot(pr.hc.w.s, labels = merged.des$Samples, cex = 0.6, main = "Ward showing 5 clusters (scaled)")
rect.hclust(pr.hc.w.s, k = 5)

(v) another ward plot using the scaled data: the ALL samples have a better clustering here pdf(“Figures/clusterByProbe_ward_scaledRaw.pdf”)

par(mfrow = c(1, 1))

# label the leaf nodes by coloring them according to their groups
ward.dendro.s <- as.dendrogram(pr.hc.w.s)
clusDendro <- dendrapply(ward.dendro.s, colLab)
plot(clusDendro, cex = 0.6, main = "Scaled Ward showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors, 
    title = "Sample Groups", box.col = "transparent")

rect.hclust(pr.hc.w.s, k = 5, border = c("cyan"))

plot of chunk dendrogram_II_ward_scaledRaw

2.) CPG-island-based (aggregated as median or mean of the belta values of the probes from one island)

Load the previous calculated data (the R script for this calculation could be retrieved from aggregate_raw_norm_filter.R)

##### Load the raw probe data load('Data/CPGI2Probe_betaList_raw.Rdata')
##### construct a data frame with rows = cpgi IDs, columns = mean belta
##### value for each sample
load("Data/CPGI_betaMeanList__raw.Rdata")
#### Transpose to place the cpgi features in rows
cpgi.Beta.mean <- lapply(cpgi.Beta.mean, t)
cpgi.Beta.mean.whole <- do.call(cbind, cpgi.Beta.mean)
dim(cpgi.Beta.mean.whole)

## [1] 27176    52

# [1] 27176 52

(I) ward plot for the mean cpgi belta values

# ward cluster based on the mean cpgi belta values compute pairwise
# distances
pr.dis.mean <- dist(t(cpgi.Beta.mean.whole), method = "euclidean")
pr.hc.w.mean <- hclust(pr.dis.mean, method = "ward")

par(mfrow = c(1, 1))

# label the leaf nodes by coloring them according to their groups
mean.ward.dendro <- as.dendrogram(pr.hc.w.mean)
clusDendro <- dendrapply(mean.ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "CPGI-based Ward(mean) Clustering showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors, 
    title = "Sample Groups", box.col = "transparent")

rect.hclust(pr.hc.w.mean, k = 5, border = c("cyan"))

plot of chunk dendrogram_clusterByMeanCPGI_ward_Raw

(II) ward plot for the median cpgi belta values

load("Data/CPGI_betaMedianList_raw.Rdata")
#### Transpose to place the cpgi features in rows
cpgi.Beta.median <- lapply(cpgi.Beta.median, t)
cpgi.Beta.median.whole <- do.call(cbind, cpgi.Beta.median)
dim(cpgi.Beta.median.whole)

## [1] 27176    52

# [1] 27176 52


# ward cluster based on the median cpgi belta values compute pairwise
# distances
pr.dis.median <- dist(t(cpgi.Beta.median.whole), method = "euclidean")
pr.hc.w.median <- hclust(pr.dis.median, method = "ward")

pdf("Figures/clusterByMedianCPGI_ward_Raw.pdf")
par(mfrow = c(1, 1))

# label the leaf nodes by coloring them according to their groups
median.ward.dendro <- as.dendrogram(pr.hc.w.median)
clusDendro <- dendrapply(median.ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "CPGI-based Ward(median) Clustering showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors, 
    title = "Sample Groups", box.col = "transparent")

rect.hclust(pr.hc.w.median, k = 5, border = c("cyan"))

From all the comparison above, we conclude that clustering using ward works the best among all the methods, which is the method we will use to proceed for the normalized and norm&filtered data. Then we can compare whether the normalization and filteration procedures have an effect on the grouping result.

Analysis on normalized Data

1.) Probe-based (485577 probes for the CPG sites)

load data and pre-process the data for plotting

# load the data
load("Data/All_3_sets_normalized.Rdata")
dim(ALL.norm)

## [1] 485577     33

# [1] 485577 33


##### (1) probe level: clustering by Belta values ##### compute pairwise
##### distances
merged.dat <- cbind(APL.norm, ALL.norm, CTRL.norm)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

## Error: cannot allocate vector of size 3.7 Mb

pr.dis <- dist(t(merged.dat), method = "euclidean")

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)


# sanity check between the sample name and group assigned
merged.des <- data.frame(Samples = colnames(merged.dat), Group = c(APL.meta$Group, 
    ALL.meta$Group, rep("HBC", 9)))
pr.nms <- with(merged.des, paste(Samples, Group, sep = "_"))
all(unlist(lapply(pr.nms, function(x) substr(x, 1, 3) == substr(x, nchar(x) - 
    2, nchar(x)))))  # TRUE

## [1] TRUE


# compute hierarchical clustering using ward linkage type
pr.hc.w <- hclust(pr.dis, method = "ward")

Plot the ward dendrogram

# ward
par(mfrow = c(1, 1))

# label the leaf nodes by coloring them according to their groups
ward.dendro <- as.dendrogram(pr.hc.w)
# color code each node of the dendrogram according to their group
clusDendro <- dendrapply(ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "Ward showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors, 
    title = "Sample Groups", box.col = "transparent")

rect.hclust(pr.hc.w, k = 5, border = c("cyan"))

plot of chunk dendrogram_clusterByProbe_ward_norm

(2) cpgi level: clustering by Belta values

load previously calculated data (the R script for this calculation could be retrieved from aggregate_raw_norm_filter.R)

##### Load the norm probe data

### (I) ward plot for the mean cpgi belta values

# construct a data frame with rows = cpgi IDs, columns = mean belta value
# for each sample
load("Data/CPGI_betaMeanList__norm.Rdata")
dim(cpgi.Beta.mean$ALL)

## [1]    33 27176

# [1] 33 27176



#### Transpose to place the cpgi features in rows
cpgi.Beta.mean <- lapply(cpgi.Beta.mean, t)
cpgi.Beta.mean.whole <- do.call(cbind, cpgi.Beta.mean)
dim(cpgi.Beta.mean.whole)

## [1] 27176    52

# [1] 27176 52

Plot the ward cluster based on the mean cpgi belta values


# compute pairwise distances
pr.dis.mean <- dist(t(cpgi.Beta.mean.whole), method = "euclidean")
pr.hc.w.mean <- hclust(pr.dis.mean, method = "ward")

par(mfrow = c(1, 1))

# label the leaf nodes by coloring them according to their groups
mean.ward.dendro <- as.dendrogram(pr.hc.w.mean)
clusDendro <- dendrapply(mean.ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "CPGI-based Ward(mean) Clustering showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors, 
    title = "Sample Groups", box.col = "transparent")

rect.hclust(pr.hc.w.mean, k = 5, border = c("cyan"))

plot of chunk dendrogram_clusterByMeanCPGI_ward_Norm

(II) ward plot for the median cpgi belta values

Load data

load("Data/CPGI_betaMedianList_norm.Rdata")
#### Transpose to place the cpgi features in rows
cpgi.Beta.median <- lapply(cpgi.Beta.median, t)
cpgi.Beta.median.whole <- do.call(cbind, cpgi.Beta.median)
dim(cpgi.Beta.median.whole)

## [1] 27176    52

# [1] 27176 52

Plot ward cluster based on the median cpgi belta values

# compute pairwise distances
pr.dis.median <- dist(t(cpgi.Beta.median.whole), method = "euclidean")
pr.hc.w.median <- hclust(pr.dis.median, method = "ward")

par(mfrow = c(1, 1))

# label the leaf nodes by coloring them according to their groups
median.ward.dendro <- as.dendrogram(pr.hc.w.median)
clusDendro <- dendrapply(median.ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "CPGI-based Ward(median) Clustering showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors, 
    title = "Sample Groups", box.col = "transparent")

rect.hclust(pr.hc.w.median, k = 5, border = c("cyan"))

Analysis on norm&filtered Data

Clustering for post-normalized and filtered data:

1.) Probe-based (435739 probes for the CPG sites)

Load data and calculate the distance, then cluster by ward

# load the data
load("Data/All_3_sets_normAndFilt.Rdata")
dim(ALL.dat)

## [1] 435739     33

# [1] 435739 33

##### (1) probe level: clustering by Belta values #####

# compute pairwise distances
merged.dat <- cbind(APL.dat, ALL.dat, CTRL.dat)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

pr.dis <- dist(t(merged.dat), method = "euclidean")

## Warning: Reached total allocation of 1535Mb: see help(memory.size)

## Warning: Reached total allocation of 1535Mb: see help(memory.size)


# sanity check between the sample name and group assigned
merged.des <- data.frame(Samples = colnames(merged.dat), Group = c(APL.meta$Group, 
    ALL.meta$Group, rep("HBC", 9)))
pr.nms <- with(merged.des, paste(Samples, Group, sep = "_"))
all(unlist(lapply(pr.nms, function(x) substr(x, 1, 3) == substr(x, nchar(x) - 
    2, nchar(x)))))  # TRUE

## [1] TRUE


# compute hierarchical clustering using ward linkage type
pr.hc.w <- hclust(pr.dis, method = "ward")

Plot the dendrogram with ward

par(mfrow = c(1, 1))

# label the leaf nodes by coloring them according to their groups
ward.dendro <- as.dendrogram(pr.hc.w)
# color code each node of the dendrogram according to their group
clusDendro <- dendrapply(ward.dendro, colLab)
plot(clusDendro, cex = 0.6, main = "Ward showing 5 clusters")
legend("topright", legend = c("ALL", "HBC", "HBM", "PLP", "PLR"), fill = colors, 
    title = "Sample Groups", box.col = "transparent")

rect.hclust(pr.hc.w, k = 5, border = c("cyan"))

plot of chunk dendrogram_clusterByProbe_ward_normFilter