x = c(rnorm(20), rnorm(20, 3), seq(from = 1, to = 4, length.out = 20))
y = c(rnorm(20), rnorm(20, 3), seq(from = 0, to = 0.5, length.out = 20))
col = c(rep("red", 20), rep("blue", 20), rep("green", 20))


colLab = function(n, cols) {
    leafs = order.dendrogram(n)
    leaf.col = unique(cols[leafs])
    if (length(leaf.col) == 1) {
        attr(n, "edgePar") = list(col = leaf.col)
    }
    n
}

par(mfrow = c(2, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(2, 3, 1, 0), 
    oma = c(0, 0, 0, 1))
plot(x, y, col = col, pch = 19)

d = dist(cbind(x, y), method = "euclidean")
h = hclust(d)
plot(h)
dg = as.dendrogram(h)
dg = dendrapply(dg, colLab, col)
plot(dg, leaflab = "none", main = "Complete")

dg = dendrapply(as.dendrogram(hclust(d, method = "average")), colLab, col)
plot(dg, leaflab = "none", main = "average")
dg = dendrapply(as.dendrogram(hclust(d^2, method = "ward")), colLab, col)
plot(dg, leaflab = "none", main = "ward")
dg = dendrapply(as.dendrogram(hclust(d, method = "single")), colLab, col)
plot(dg, leaflab = "none", main = "Single")

plot of chunk unnamed-chunk-1


par(mfrow = c(1, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(2, 3, 1, 0), 
    oma = c(0, 0, 0, 1))
h = hclust(d)
dg = dendrapply(as.dendrogram(h), colLab, col)
plot(dg, leaflab = "none", main = "Complete")
t = 4
abline(h = t, col = "red")
cl = cutree(h, h = t)
plot(x, y, col = cl, pch = 19, main = "by level")
cl = cutree(h, k = 6)
plot(x, y, col = cl, pch = 19, main = "by cluster count")


renameCl = function(x) {
    ux = unique(x)
    cl = ux
    for (i in ux) cl[i] = mean(which(x == i))
    cl = rank(cl)
    r = x
    for (i in 1:length(ux)) r[x == i] = cl[i]
    r
}
# k-means
par(mfrow = c(2, 3), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(2, 3, 1, 0), 
    oma = c(0, 0, 0, 1))

plot of chunk unnamed-chunk-1

for (i in 1:6) {
    cl = kmeans(cbind(x, y), centers = 4, nstart = 20)
    plot(x, y, col = renameCl(cl$cluster), pch = 19)
    cl$size
}

plot of chunk unnamed-chunk-1


# k-medoids
library(cluster)
for (i in 1:6) {
    cl = pam(d, k = 4)
    plot(x, y, col = renameCl(cl$cluster), pch = 19)
}

plot of chunk unnamed-chunk-1


par(mfrow = c(1, 1), tck = -0.02, mgp = c(1.1, 0.2, 0), mar = c(2, 3, 1, 0), 
    oma = c(0, 0, 0, 1))

cl = pam(d, k = 4)
slh = silhouette(cl$clustering, d)
plot(slh)

plot of chunk unnamed-chunk-1


slh.m = matrix(ncol = 4, nrow = 15)
colnames(slh.m) = c("h.compl", "h.singl", "kmean", "kmed")
for (i in 1:15) {
    d = dist(cbind(x, y))
    slh.m[i, 1] = mean(silhouette(cutree(hclust(d, meth = "compl"), k = i + 
        1), d)[, 3])
    slh.m[i, 2] = mean(silhouette(cutree(hclust(d, meth = "singl"), k = i + 
        1), d)[, 3])
    slh.m[i, 3] = mean(silhouette(kmeans(cbind(x, y), i + 1, nstart = 50)$cluster, 
        d)[, 3])
    slh.m[i, 4] = mean(silhouette(pam(d, i + 1)$clustering, d)[, 3])
}
## Warning: did not converge in 10 iterations

plot(2:16, slh.m[, 1], t = "l", ylim = range(slh.m), xlab = "# of clusters", 
    ylab = "mean silhouette")
lines(2:16, slh.m[, 2], t = "l", col = 2)
lines(2:16, slh.m[, 3], t = "l", col = 3)
lines(2:16, slh.m[, 4], t = "l", col = 4)
legend(10, 0.38, col = 1:4, lwd = 1, legend = colnames(slh.m))

plot of chunk unnamed-chunk-1