4.1 Dendograms
# Read data, compute and add features
gemIdGene <- read.csv(file = "gemIdGene.csv", row.names = 1)
genes <- gemIdGene[, 1:9]
augMat <- t(apply(genes[, 1:9], 1, diff))
geneMat <- cbind(genes[, 1:9], augMat)
# Sample the data to show smaller dendrograms
set.seed(137)
subs <- sample(1:nrow(geneMat), 60, replace = FALSE)
geneMat60 <- geneMat[subs, ]
# Distance matrix and cluster trees
eucDist <- dist(geneMat60)
library(cluster)
clus1 <- hclust(eucDist, method = "complete")
clus2 <- hclust(eucDist, method = "single")
clus3 <- hclust(eucDist, method = "ward")
clus4 <- hclust(eucDist, method = "average")
clus5 <- as.hclust(diana(eucDist))
source("panelFunctions.r")
# Dendrograms
par(cex = 0.5, las = 1)
pan <- panelLayout(nrow = 3, ncol = 1, topMar = 0.6, rowSep = c(0, 0.4, 0.4,
0))
panelSelect(pan, mar = "top")
panelScale()
## $rx
## [1] 0 1
##
## $ry
## [1] 0 1
text(0.5, 0.85, "Dendrograms for Different Clustering Choices Set2", adj = 0.5,
cex = 1.2)
panelSelect(pan, 1, 1)
panelScale(c(-2, 64), c(-3.5, 3.5))
## $rx
## [1] -2 64
##
## $ry
## [1] -3.5 3.5
par(new = TRUE)
plot(clus3, ann = FALSE)
mtext(side = 3, "Ward", cex = 1, font = 2)
panelSelect(pan, 2, 1)
panelScale(c(-2, 64), c(-3.5, 3.5))
## $rx
## [1] -2 64
##
## $ry
## [1] -3.5 3.5
par(new = TRUE)
plot(clus4, ann = FALSE)
mtext(side = 3, "Average", cex = 1, font = 2)
panelSelect(pan, 3, 1)
panelScale(c(-2, 64), c(-3.5, 3.5))
## $rx
## [1] -2 64
##
## $ry
## [1] -3.5 3.5
par(new = TRUE)
plot(clus5, ann = FALSE)
mtext(side = 3, "Devisive", font = 2, cex = 1)
4.2 Horizontal layout with parabolic lines
source("ParabolaTree.r")
parabolaTree(clus3, lab = clus3$labels, cex = 0.75, title = "Rat Gene Expression for 60 Genes",
labelF = 0.12, xGrid = seq(2, 12, by = 2))
parabolaTree(clus3, lab = clus3$labels, cex = 0.75, title = "Rat Gene Expression for 60 Genes",
labelF = 0.12, xGrid = seq(0, 12, by = 2), parabolaF = 0.06, nameGapF = 0.015)
5. Look for clusters, and thinking about the number of clusters
# Look at the distance matrix.
library(RColorBrewer)
hmcol <- colorRampPalette(brewer.pal(6, "RdBu"))(236)
heatmap(as.matrix(eucDist), sym = TRUE, col = hmcol, distfun = function(x) as.dist(x))
*5.2 Look for clusters in a 2-D scaled view *
eucDistAll <- dist(geneMat)
ans <- cmdscale(eucDistAll, 3)
plot(ans[, 1], ans[, 2], main = "Multidimensional Scaling Plot", xlab = "x",
ylab = "y", pty = "sq")
# identify(ans[, 1], ans[, 2], row.names(ans))
5.3 Look for a clusters in a 3-D scaled view.
library(rgl)
open3d()
plot3d(ans, type = "s", radius = 0.05, col = "green", zlab = "z", xlab = "y",
ylab = "z")

6. K-means discussion
km2 <- kmeans(geneMat, centers = 2, nstart = 10)
km3 <- kmeans(geneMat, centers = 3, nstart = 10)
km4 <- kmeans(geneMat, centers = 4, nstart = 10)
km5 <- kmeans(geneMat, centers = 5, nstart = 10)
km6 <- kmeans(geneMat, centers = 6, nstart = 10)
km7 <- kmeans(geneMat, centers = 7, nstart = 10)
km8 <- kmeans(geneMat, centers = 8, nstart = 10)
km9 <- kmeans(geneMat, centers = 9, nstart = 10)
plot(2:9, c(km2[6], km3[6], km4[6], km5[6], km6[6], km7[6], km8[6], km9[6]),
main = "K Means: How many clusters?", las = 1, xlab = "Number of Clusters",
ylab = "Between to Total Sums of Squares")
# Compute PAM clusters, compare with k-mean clusters
pam2 <- pam(eucDistAll, k = 2, diss = TRUE)
pam3 <- pam(eucDistAll, k = 3, diss = TRUE)
pam6 <- pam(eucDistAll, k = 6, diss = TRUE)
# all(names(km2$cluster) == names(pam2$clustering))
# Agglomerative trees and cluster number selection
hclResult <- hclust(eucDistAll, method = "ward")
agglom2 <- cutree(hclResult, k = 2) # creates two clusters
# all(names(agglom2) == names(pam2$clustering))
## Cluster separation and cluster number selection cbind(hclResult$merge,
## hclResult$height)
silpam6 <- silhouette(pam6)
plot(silpam6, main = "")