Generate a sample dateset
y <- matrix(rnorm(50), 10, 5, dimnames=list(paste("g", 1:10, sep=""), paste("t", 1:5, sep="")))
Center or scales matrices column-wise. t() allows to scale by rows and then t() back.
yscaled <- t(scale(t(y)))
Obtain a distance matrix of euclidean using dist(). To obtain a correlation-based distances, cor() together with t() computes correlations in row. as.dist() makes it a matrix.
eu <- dist(y, method = "euclidean")
c <- cor(t(y), method = "spearman")
d <- as.dist(1-c)
Hierarchical clustering (hc) functions in R: hclust and agnes perform agglomerative hc, flashClust is a 50-100 faster version of hclust. diana perform divisive hc.
Package pvclust can assess uncertainty in hc analyses. It provides approximately unbiased (au) or bootstrap p-values. Seven cluster joining methods can be chosen: ward, single, complete, average, mcquitty, median and centroid. unclass() prints the full content of hclust object
hr <- hclust(d, method = "complete", members = NULL)
unclass(hr)
## $merge
## [,1] [,2]
## [1,] -3 -8
## [2,] -4 -7
## [3,] -1 -2
## [4,] -5 -6
## [5,] -10 1
## [6,] -9 4
## [7,] 3 5
## [8,] 2 6
## [9,] 7 8
##
## $height
## [1] 0.1 0.1 0.4 0.4 0.5 0.7 1.1 1.5 2.0
##
## $order
## [1] 1 2 10 3 8 4 7 9 5 6
##
## $labels
## [1] "g1" "g2" "g3" "g4" "g5" "g6" "g7" "g8" "g9" "g10"
##
## $method
## [1] "complete"
##
## $call
## hclust(d = d, method = "complete", members = NULL)
##
## $dist.method
## NULL
str(as.dendrogram()) describes dendrogram structure.
par(mfrow = c(2,2))
plot(hr, hang = 0.1)
plot(hr, hang = -1)
plot(as.dendrogram(hr), edgePar = list(col=3, lwd=4, horiz=T))
str(as.dendrogram(hr))
## --[dendrogram w/ 2 branches and 10 members at h = 2]
## |--[dendrogram w/ 2 branches and 5 members at h = 1.1]
## | |--[dendrogram w/ 2 branches and 2 members at h = 0.4]
## | | |--leaf "g1"
## | | `--leaf "g2"
## | `--[dendrogram w/ 2 branches and 3 members at h = 0.5]
## | |--leaf "g10"
## | `--[dendrogram w/ 2 branches and 2 members at h = 0.1]
## | |--leaf "g3"
## | `--leaf "g8"
## `--[dendrogram w/ 2 branches and 5 members at h = 1.5]
## |--[dendrogram w/ 2 branches and 2 members at h = 0.1]
## | |--leaf "g4"
## | `--leaf "g7"
## `--[dendrogram w/ 2 branches and 3 members at h = 0.7]
## |--leaf "g9"
## `--[dendrogram w/ 2 branches and 2 members at h = 0.4]
## |--leaf "g5"
## `--leaf "g6"
Print the row label orderly as they appear in the tree.
hr$labels[hr$order]
## [1] "g1" "g2" "g10" "g3" "g8" "g4" "g7" "g9" "g5" "g6"
Reorder a dendrogram and print out its labels
par(mfrow=c(2,1))
hrd1 <- as.dendrogram(hr)
plot(hrd1)
hrd2 <- reorder(hrd1, sample(1:10))
plot(hrd2)
labels(hrd1)
## [1] "g1" "g2" "g10" "g3" "g8" "g4" "g7" "g9" "g5" "g6"
labels(hrd2)
## [1] "g7" "g4" "g6" "g5" "g9" "g10" "g8" "g3" "g2" "g1"
Cut tree intp bins (discrete cluster groups) at specified height in ‘h’ argument. If cut based on number of clusters, specify in ‘k’ argument.
mycl <- cutree(hr, h=max(hr$height)/2)
mycl[hr$labels[hr$order]]
## g1 g2 g10 g3 g8 g4 g7 g9 g5 g6
## 1 1 2 2 2 3 3 4 4 4
Cut dendrogram and draw red rectangles around the resulting clusters
plot(hr)
rect.hclust(hr, k=5, border = "red")
Subset a tree by cutting it at a given branch point. The identified row IDs are then used for distance matrix and re-cluster. Print out the indices of the cluster members of the two main branches and members of the next two descendent clusters of the first main branch.
subcl <- names(mycl[mycl==3])
subd <- as.dist(as.matrix(d)[subcl,subcl])
subhr <- hclust(subd, method = "complete")
par(mfrow=c(1,2))
plot(hr)
#plot(subhr)
lapply(as.dendrogram(hr), unlist)
## [[1]]
## [1] 1 2 10 3 8
##
## [[2]]
## [1] 4 7 9 5 6
lapply(as.dendrogram(hr)[[1]], unlist)
## [[1]]
## [1] 1 2
##
## [[2]]
## [1] 10 3 8
Color the tree. Zoom into the tree with “xlim” and “ylim”.
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/dendroCol.R")
dend_colored <- dendrapply(as.dendrogram(hr), dendroCol, keys=subcl, xPar="edgePar", bgr = "red", fgr="blue", lwd=2, pch=20)
par(mfrow = c(1,3))
plot(dend_colored, horiz=T)
plot(dend_colored, horiz=T, type = "tr")
plot(dend_colored, horiz=T, edgePar=list(lwd=2),xlim = c(3,0),ylim = c(1,3))
To manually color tree elements
z <- as.dendrogram(hr)
attr(z[[2]][[2]],"edgePar") <- list(col="blue", lwd=4, pch=NA)
plot(z,horiz=T)
Heatmaps. Row colors are set to correspond to cluster bins from cutree().
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/my.colorFct.R")
hr <- hclust(as.dist(1-cor(t(y), method = "pearson")), method = "complete")
hc <- hclust(as.dist(1-cor(y, method = "spearman")), method = "complete")
heatmap(y, Rowv = as.dendrogram(hr), Colv = as.dendrogram(hc), col=my.colorFct(), scale="row")
mycl <- cutree(hr, h=max(hr$height)/1.5)
mycol <- sample(rainbow(256))
mycol <- mycol[as.vector(mycl)]
heatmap(y, Rowv = as.dendrogram(hr), Colv = as.dendrogram(hc), col=my.colorFct(), scale = "row", ColSideColors = heat.colors(length(hc$labels)),RowSideColors = mycol)
Color labels by a colored dendrogram object like previous “dend_colored”. Custom labels using “labRow” and “labCol”.
dend_colored <- dendrapply(as.dendrogram(hr), dendroCol, keys=c("g1", "g2"), xPar="edgePar", bgr="black", fgr="red", pch=20); heatmap(y, Rowv=dend_colored, Colv=as.dendrogram(hc), col=my.colorFct(), scale="row", ColSideColors=heat.colors(length(hc$labels)), RowSideColors=mycol)
heatmap(y, Rowv=as.dendrogram(hr), Colv = as.dendrogram(hc), col=my.colorFct(), scale="row", labRow = "", labCol = "")
If “Rowv” or “Colv” is set to NA, then row or column clustering is turned off.
heatmap(y, Rowv = as.dendrogram(hr), Colv = NA, col= my.colorFct())
Sort rows and columns of a heatmap by hclust trees.
ysort <- y[hr$labels[hr$order], hc$labels[hc$order]]
heatmap(ysort, Rowv = NA, Colv = NA, col=my.colorFct())
Color legend.
#x11(height=4, width=4)
heatmap(matrix(rep(seq(min(as.vector(y)), max(as.vector(y)), length.out = 10),2),2, byrow = T, dimnames = list(1:2, round(seq(min(as.vector(y)), max(as.vector(y)), length.out = 10), 1))), col=my.colorFct(), Colv=NA, Rowv = NA, labRow="", main="Color Legend")
Generate heetmap using image()
image(scale(t(ysort)), col = my.colorFct())
par(mfrow=c(1,2))
plot(as.dendrogram(hr), horiz = T, yaxt = "n", edgePar = list(col="grey", lwd=4))
image(scale(t(ysort)), col = my.colorFct(), xaxt="n", yaxt="n")
Generate heatmap using heatmap.2() in gplots. “key” allows to add a color key within the same plot. Numeric values can be added to each cell with “cellnote” argument.
library(gplots)
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(y, Rowv = as.dendrogram(hr), Colv = as.dendrogram(hc), col = redgreen(75), scale = "row", ColSideColors = heat.colors(length(hc$labels)), RowSideColors = mycol, trace = "none", key = T, cellnote = round(t(scale(t(y))),1))
Select a more appropriate representing color scheme using RColorBrewer library or other in demo.col(20). The row or column reordering in heatmap.2() is turned off using “dendrogram=‘row’, Colv=F” so that no column tree will be shown.
library(RColorBrewer)
mypalette <- rev(brewer.pal(9, "Blues")[-1])
heatmap.2(y, Rowv = as.dendrogram(hr), dendrogram = "row", Colv = F, col = mypalette, scale = "row", trace = "none", key = T, cellnote = round(t(scale(t(y))),1))
Turn off the re-ordering of both dimensions.
heatmap.2(y, Rowv = as.dendrogram(hr), dendrogram = "none", Rowv=F, Colv = F, col = mypalette, scale = "row", trace = "none", key = T, cellnote = round(t(scale(t(y))),1))
Zooming in heatmaps by sub-clustering selected tree nodes.
y <- matrix(rnorm(500), 100, 5, dimnames = list(paste("g", 1:100, sep = ""), paste("t", 1:5, sep = "")))
hr <- hclust(as.dist(1-cor(t(y), method = "pearson")), method = "complete")
hc <- hclust(as.dist(1-cor(y, method = "spearman")), method = "complete")
mycl <- cutree(hr, h=max(hr$height)/1.5)
mycolhc <- rainbow(length(unique(mycl)), start = 0.1, end = 0.9)
mycolhc <- mycolhc[as.vector(mycl)]
myheatcol <- redgreen(75)
heatmap.2(y, Rowv = as.dendrogram(hr), Colv = as.dendrogram(hc), col = myheatcol, scale = "row", density.info = "none", trace = "none", RowSideColors = mycolhc)
Create heatmaps for entire dateset where the obtained clusters are indicated in the color bar. Print out color key for cluster assignment.
names(mycolhc) <- names(mycl)
barplot(rep(10, max(mycl)), col = unique(mycolhc[hc$labels[hr$order]]), horiz = T, names=unique(mycl[hr$order]))
clid <- c(1,2) # sub-cluster number
ysub <- y[names(mycl[mycl%in%clid]),]
hrsub <- hclust(as.dist(1-cor(t(ysub), method = "pearson")), method = "complete")
heatmap.2(ysub, Rowv = as.dendrogram(hrsub), Colv = as.dendrogram(hc), col = myheatcol, scale = "row", density.info = "none", trace = "none", RowSideColors = mycolhc[mycl%in%clid])
data.frame(Labels= rev(hrsub$labels[hrsub$order]))
## Labels
## 1 g96
## 2 g81
## 3 g2
## 4 g94
## 5 g88
## 6 g16
## 7 g99
## 8 g14
## 9 g58
## 10 g10
## 11 g63
## 12 g27
## 13 g48
## 14 g62
## 15 g45
## 16 g75
## 17 g37
## 18 g30
## 19 g12
## 20 g100
## 21 g95
## 22 g73
## 23 g20
## 24 g13
## 25 g84
## 26 g93
## 27 g56
## 28 g26
## 29 g22
## 30 g65
## 31 g3
## 32 g51
## 33 g34
## 34 g7
## 35 g5
## 36 g44
## 37 g8
## 38 g71
## 39 g92
## 40 g89
## 41 g98
## 42 g17
## 43 g11
## 44 g72
## 45 g1