Generate a sample dateset

y <- matrix(rnorm(50), 10, 5, dimnames=list(paste("g", 1:10, sep=""), paste("t", 1:5, sep="")))

Center or scales matrices column-wise. t() allows to scale by rows and then t() back.

yscaled <- t(scale(t(y)))

Obtain a distance matrix of euclidean using dist(). To obtain a correlation-based distances, cor() together with t() computes correlations in row. as.dist() makes it a matrix.

eu <- dist(y, method = "euclidean")
c <- cor(t(y), method = "spearman")
d <- as.dist(1-c)

Hierarchical clustering (hc) functions in R: hclust and agnes perform agglomerative hc, flashClust is a 50-100 faster version of hclust. diana perform divisive hc.

Package pvclust can assess uncertainty in hc analyses. It provides approximately unbiased (au) or bootstrap p-values. Seven cluster joining methods can be chosen: ward, single, complete, average, mcquitty, median and centroid. unclass() prints the full content of hclust object

hr <- hclust(d, method = "complete", members = NULL)
unclass(hr)
## $merge
##       [,1] [,2]
##  [1,]   -3   -8
##  [2,]   -4   -7
##  [3,]   -1   -2
##  [4,]   -5   -6
##  [5,]  -10    1
##  [6,]   -9    4
##  [7,]    3    5
##  [8,]    2    6
##  [9,]    7    8
## 
## $height
## [1] 0.1 0.1 0.4 0.4 0.5 0.7 1.1 1.5 2.0
## 
## $order
##  [1]  1  2 10  3  8  4  7  9  5  6
## 
## $labels
##  [1] "g1"  "g2"  "g3"  "g4"  "g5"  "g6"  "g7"  "g8"  "g9"  "g10"
## 
## $method
## [1] "complete"
## 
## $call
## hclust(d = d, method = "complete", members = NULL)
## 
## $dist.method
## NULL

str(as.dendrogram()) describes dendrogram structure.

par(mfrow = c(2,2))
plot(hr, hang = 0.1)
plot(hr, hang = -1)
plot(as.dendrogram(hr), edgePar = list(col=3, lwd=4, horiz=T))
str(as.dendrogram(hr))
## --[dendrogram w/ 2 branches and 10 members at h = 2]
##   |--[dendrogram w/ 2 branches and 5 members at h = 1.1]
##   |  |--[dendrogram w/ 2 branches and 2 members at h = 0.4]
##   |  |  |--leaf "g1" 
##   |  |  `--leaf "g2" 
##   |  `--[dendrogram w/ 2 branches and 3 members at h = 0.5]
##   |     |--leaf "g10" 
##   |     `--[dendrogram w/ 2 branches and 2 members at h = 0.1]
##   |        |--leaf "g3" 
##   |        `--leaf "g8" 
##   `--[dendrogram w/ 2 branches and 5 members at h = 1.5]
##      |--[dendrogram w/ 2 branches and 2 members at h = 0.1]
##      |  |--leaf "g4" 
##      |  `--leaf "g7" 
##      `--[dendrogram w/ 2 branches and 3 members at h = 0.7]
##         |--leaf "g9" 
##         `--[dendrogram w/ 2 branches and 2 members at h = 0.4]
##            |--leaf "g5" 
##            `--leaf "g6"

Print the row label orderly as they appear in the tree.

hr$labels[hr$order]
##  [1] "g1"  "g2"  "g10" "g3"  "g8"  "g4"  "g7"  "g9"  "g5"  "g6"

Reorder a dendrogram and print out its labels

par(mfrow=c(2,1))
hrd1 <- as.dendrogram(hr)
plot(hrd1)
hrd2 <- reorder(hrd1, sample(1:10))
plot(hrd2)

labels(hrd1)
##  [1] "g1"  "g2"  "g10" "g3"  "g8"  "g4"  "g7"  "g9"  "g5"  "g6"
labels(hrd2)
##  [1] "g7"  "g4"  "g6"  "g5"  "g9"  "g10" "g8"  "g3"  "g2"  "g1"

Cut tree intp bins (discrete cluster groups) at specified height in ‘h’ argument. If cut based on number of clusters, specify in ‘k’ argument.

mycl <- cutree(hr, h=max(hr$height)/2)
mycl[hr$labels[hr$order]]
##  g1  g2 g10  g3  g8  g4  g7  g9  g5  g6 
##   1   1   2   2   2   3   3   4   4   4

Cut dendrogram and draw red rectangles around the resulting clusters

plot(hr)
rect.hclust(hr, k=5, border = "red")

Subset a tree by cutting it at a given branch point. The identified row IDs are then used for distance matrix and re-cluster. Print out the indices of the cluster members of the two main branches and members of the next two descendent clusters of the first main branch.

subcl <- names(mycl[mycl==3])
subd <- as.dist(as.matrix(d)[subcl,subcl])
subhr <- hclust(subd, method = "complete")
par(mfrow=c(1,2))
plot(hr)
#plot(subhr)
lapply(as.dendrogram(hr), unlist)
## [[1]]
## [1]  1  2 10  3  8
## 
## [[2]]
## [1] 4 7 9 5 6
lapply(as.dendrogram(hr)[[1]], unlist)
## [[1]]
## [1] 1 2
## 
## [[2]]
## [1] 10  3  8

Color the tree. Zoom into the tree with “xlim” and “ylim”.

source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/dendroCol.R")
dend_colored <- dendrapply(as.dendrogram(hr), dendroCol, keys=subcl, xPar="edgePar", bgr = "red", fgr="blue", lwd=2, pch=20)
par(mfrow = c(1,3))
plot(dend_colored, horiz=T)
plot(dend_colored, horiz=T, type = "tr")
plot(dend_colored, horiz=T, edgePar=list(lwd=2),xlim = c(3,0),ylim = c(1,3))

To manually color tree elements

z <- as.dendrogram(hr)
attr(z[[2]][[2]],"edgePar") <- list(col="blue", lwd=4, pch=NA)
plot(z,horiz=T)

Heatmaps. Row colors are set to correspond to cluster bins from cutree().

source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/my.colorFct.R")
hr <- hclust(as.dist(1-cor(t(y), method = "pearson")), method = "complete")
hc <- hclust(as.dist(1-cor(y, method = "spearman")), method = "complete")
heatmap(y, Rowv = as.dendrogram(hr), Colv = as.dendrogram(hc), col=my.colorFct(), scale="row")

mycl <- cutree(hr, h=max(hr$height)/1.5)
mycol <- sample(rainbow(256))
mycol <- mycol[as.vector(mycl)]
heatmap(y, Rowv = as.dendrogram(hr), Colv = as.dendrogram(hc), col=my.colorFct(), scale = "row", ColSideColors = heat.colors(length(hc$labels)),RowSideColors = mycol)

Color labels by a colored dendrogram object like previous “dend_colored”. Custom labels using “labRow” and “labCol”.

dend_colored <- dendrapply(as.dendrogram(hr), dendroCol, keys=c("g1", "g2"), xPar="edgePar", bgr="black", fgr="red", pch=20); heatmap(y, Rowv=dend_colored, Colv=as.dendrogram(hc), col=my.colorFct(), scale="row", ColSideColors=heat.colors(length(hc$labels)), RowSideColors=mycol) 

heatmap(y, Rowv=as.dendrogram(hr), Colv = as.dendrogram(hc), col=my.colorFct(), scale="row", labRow = "", labCol = "")

If “Rowv” or “Colv” is set to NA, then row or column clustering is turned off.

heatmap(y, Rowv = as.dendrogram(hr), Colv = NA, col= my.colorFct())

Sort rows and columns of a heatmap by hclust trees.

ysort <- y[hr$labels[hr$order], hc$labels[hc$order]]
heatmap(ysort, Rowv = NA, Colv = NA, col=my.colorFct())

Color legend.

#x11(height=4, width=4)
heatmap(matrix(rep(seq(min(as.vector(y)), max(as.vector(y)), length.out = 10),2),2, byrow = T, dimnames = list(1:2, round(seq(min(as.vector(y)), max(as.vector(y)), length.out = 10), 1))), col=my.colorFct(), Colv=NA, Rowv = NA, labRow="", main="Color Legend")

Generate heetmap using image()

image(scale(t(ysort)), col = my.colorFct())

par(mfrow=c(1,2))
plot(as.dendrogram(hr), horiz = T, yaxt = "n", edgePar = list(col="grey", lwd=4))
image(scale(t(ysort)), col = my.colorFct(), xaxt="n", yaxt="n")

Generate heatmap using heatmap.2() in gplots. “key” allows to add a color key within the same plot. Numeric values can be added to each cell with “cellnote” argument.

library(gplots)
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
heatmap.2(y, Rowv = as.dendrogram(hr), Colv = as.dendrogram(hc), col = redgreen(75), scale = "row", ColSideColors = heat.colors(length(hc$labels)), RowSideColors = mycol, trace = "none", key = T, cellnote = round(t(scale(t(y))),1))

Select a more appropriate representing color scheme using RColorBrewer library or other in demo.col(20). The row or column reordering in heatmap.2() is turned off using “dendrogram=‘row’, Colv=F” so that no column tree will be shown.

library(RColorBrewer)
mypalette <- rev(brewer.pal(9, "Blues")[-1])
heatmap.2(y, Rowv = as.dendrogram(hr), dendrogram = "row", Colv = F, col = mypalette, scale = "row", trace = "none", key = T, cellnote = round(t(scale(t(y))),1))

Turn off the re-ordering of both dimensions.

heatmap.2(y, Rowv = as.dendrogram(hr), dendrogram = "none", Rowv=F, Colv = F, col = mypalette, scale = "row", trace = "none", key = T, cellnote = round(t(scale(t(y))),1))

Zooming in heatmaps by sub-clustering selected tree nodes.

y <- matrix(rnorm(500), 100, 5, dimnames = list(paste("g", 1:100, sep = ""), paste("t", 1:5, sep = "")))
hr <- hclust(as.dist(1-cor(t(y), method = "pearson")), method = "complete")
hc <- hclust(as.dist(1-cor(y, method = "spearman")), method = "complete")
mycl <- cutree(hr, h=max(hr$height)/1.5)
mycolhc <- rainbow(length(unique(mycl)), start = 0.1, end = 0.9)
mycolhc <- mycolhc[as.vector(mycl)]
myheatcol <- redgreen(75)
heatmap.2(y, Rowv = as.dendrogram(hr), Colv = as.dendrogram(hc), col = myheatcol, scale = "row", density.info = "none", trace = "none", RowSideColors = mycolhc)

Create heatmaps for entire dateset where the obtained clusters are indicated in the color bar. Print out color key for cluster assignment.

names(mycolhc) <- names(mycl)
barplot(rep(10, max(mycl)), col = unique(mycolhc[hc$labels[hr$order]]), horiz = T, names=unique(mycl[hr$order]))

clid <- c(1,2) # sub-cluster number
ysub <- y[names(mycl[mycl%in%clid]),]
hrsub <- hclust(as.dist(1-cor(t(ysub), method = "pearson")), method = "complete")
heatmap.2(ysub, Rowv = as.dendrogram(hrsub), Colv = as.dendrogram(hc), col = myheatcol, scale = "row", density.info = "none", trace = "none", RowSideColors = mycolhc[mycl%in%clid])

data.frame(Labels= rev(hrsub$labels[hrsub$order]))
##    Labels
## 1     g96
## 2     g81
## 3      g2
## 4     g94
## 5     g88
## 6     g16
## 7     g99
## 8     g14
## 9     g58
## 10    g10
## 11    g63
## 12    g27
## 13    g48
## 14    g62
## 15    g45
## 16    g75
## 17    g37
## 18    g30
## 19    g12
## 20   g100
## 21    g95
## 22    g73
## 23    g20
## 24    g13
## 25    g84
## 26    g93
## 27    g56
## 28    g26
## 29    g22
## 30    g65
## 31     g3
## 32    g51
## 33    g34
## 34     g7
## 35     g5
## 36    g44
## 37     g8
## 38    g71
## 39    g92
## 40    g89
## 41    g98
## 42    g17
## 43    g11
## 44    g72
## 45     g1