In this analysis we will see how to create a hierarchical clustering model. The objectives are to explore whether similarity groups exist in our data and to see how they behave.
As an example, we will use the Doubs dataset, which contains fish species abundances recorded at sampling sites along the Doubs River in France, together with environmental variables and spatial coordinates. The objective is to see how the sites behave and how the data can be grouped.
We need to remove the row containing only zeros (site 8, where no fish were recorded) and any rows with NA values, otherwise they become problematic when we compute distances and build dendrograms. Then we normalize the species data so that each row has unit norm and compute the Euclidean distance on the normalized data, which is equivalent to the chord distance. Other distance measures could also be useful.
library(ade4)    # provides the doubs dataset
library(vegan)   # decostand() and vegdist()
data(doubs)
spe <- doubs$fish[-8, ]   # fish abundances, empty site 8 removed
env <- doubs$env[-8, ]    # environmental variables
spa <- doubs$xy[-8, ]     # spatial coordinates of the sites
spe.norm <- decostand(spe, "normalize")  # rows scaled to unit norm (chord transformation)
spe.ch <- vegdist(spe.norm, "euc")       # Euclidean distance on normalized data = chord distance
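As a quick optional check (not part of the original workflow), the normalized rows should all have unit norm, so the chord distances should lie between 0 and sqrt(2):
# Optional sanity check: every row of spe.norm should have norm 1,
# and chord distances are bounded by sqrt(2)
max(abs(sqrt(rowSums(spe.norm^2)) - 1))  # should be ~0
range(spe.ch)                            # should fall within [0, sqrt(2)]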
We will fit all the common linkage methods and then select the best one with a proper validation analysis. The methods are single, complete, average (UPGMA), centroid and Ward.
par(mfrow = c(2, 3))
spe.ch.single <- hclust(spe.ch, method = "single")
plot(spe.ch.single, sub = "Single method")
spe.ch.complete <- hclust(spe.ch, method = "complete")
plot(spe.ch.complete, sub = "Complete method")
spe.ch.UPGMA <- hclust(spe.ch, method = "average")
plot(spe.ch.UPGMA, sub = "Average method")
spe.ch.centroid <- hclust(spe.ch, method = "centroid")
plot(spe.ch.centroid, sub = "Centroid method")
spe.ch.ward <- hclust(spe.ch, method = "ward.D")
plot(spe.ch.ward, sub = "Ward method")
The structure of the centroid dendrogram proved to be problematic: it shows reversals (branches that cross below earlier fusion levels), so this method is discarded. For the remaining methods we compute two goodness-of-fit measures: - Cophenetic correlation. - Gower distance.
spe.ch.single.coph <- cophenetic(spe.ch.single) # Single
paste("Single cophenetic:", cor(spe.ch, spe.ch.single.coph))
## [1] "Single cophenetic: 0.599192957534406"
spe.ch.comp.coph <- cophenetic(spe.ch.complete) # Complete
paste("Complete cophenetic:", cor(spe.ch, spe.ch.comp.coph))
## [1] "Complete cophenetic: 0.765562801324477"
spe.ch.UPGMA.coph <- cophenetic(spe.ch.UPGMA) # Average
paste("UPGMA cophenetic:", cor(spe.ch, spe.ch.UPGMA.coph))
## [1] "UPGMA cophenetic: 0.860832629864453"
spe.ch.ward.coph <- cophenetic(spe.ch.ward) # Ward
paste("Ward cophenetic:", cor(spe.ch, spe.ch.ward.coph))
## [1] "Ward cophenetic: 0.759393401189188"
cor(spe.ch, spe.ch.ward.coph, method = "spearman")
## [1] 0.7661171
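For convenience (this summary is not in the original output), the four cophenetic correlations can be collected into one named vector for a direct comparison:
# Optional: gather the cophenetic correlations in one place
coph.cor <- c(single   = cor(spe.ch, spe.ch.single.coph),
              complete = cor(spe.ch, spe.ch.comp.coph),
              UPGMA    = cor(spe.ch, spe.ch.UPGMA.coph),
              ward     = cor(spe.ch, spe.ch.ward.coph))
round(coph.cor, 3)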
Graphical representation of the cophenetic correlation (Shepard-like diagrams): the closer the points lie to the diagonal line, the better the fit.
par(mfrow = c(2, 2))
plot(spe.ch, spe.ch.single.coph, xlab = "Chord distance",
ylab = "Cophenetic distance", asp = 1, xlim = c(0, sqrt(2)), ylim = c(0, sqrt(2)),
main = c("Single linkage", paste("Cophenetic correlation =",
round(cor(spe.ch, spe.ch.single.coph), 3))))
abline(0, 1); lines(lowess(spe.ch, spe.ch.single.coph), col = "red")
plot(spe.ch, spe.ch.comp.coph, xlab = "Chord distance",
ylab = "Cophenetic distance", asp = 1, xlim = c(0,sqrt(2)), ylim = c(0,sqrt(2)),
main = c("Complete linkage", paste("Cophenetic correlation =",
round(cor(spe.ch, spe.ch.comp.coph), 3))))
abline(0, 1); lines(lowess(spe.ch, spe.ch.comp.coph), col = "red")
plot(spe.ch, spe.ch.UPGMA.coph, xlab = "Chord distance",
ylab = "Cophenetic distance", asp = 1, xlim = c(0,sqrt(2)), ylim = c(0,sqrt(2)),
main = c("UPGMA", paste("Cophenetic correlation =",
round(cor(spe.ch, spe.ch.UPGMA.coph), 3))))
abline(0, 1); lines(lowess(spe.ch, spe.ch.UPGMA.coph), col = "red")
plot(spe.ch, spe.ch.ward.coph, xlab = "Chord distance",
ylab = "Cophenetic distance", asp = 1, xlim = c(0,sqrt(2)),
ylim = c(0,max(spe.ch.ward$height)),
main = c("Ward clustering", paste("Cophenetic correlation =",
round(cor(spe.ch, spe.ch.ward.coph), 3))))
abline(0, 1); lines(lowess(spe.ch, spe.ch.ward.coph), col = "red")
The best-fitting method is UPGMA, with a cophenetic correlation of 0.861, which is quite high. We can also compute the Gower distance, i.e. the sum of squared differences between the original and the cophenetic distances; the smaller the value, the better the fit.
(gow.dist.single <- sum((spe.ch - spe.ch.single.coph) ^ 2))
## [1] 95.41391
(gow.dist.comp <- sum((spe.ch - spe.ch.comp.coph) ^ 2))
## [1] 40.48897
(gow.dist.UPGMA <- sum((spe.ch - spe.ch.UPGMA.coph) ^ 2))
## [1] 11.6746
(gow.dist.ward <- sum((spe.ch - spe.ch.ward.coph) ^ 2))
## [1] 8001.85
The best method is still UPGMA (average linkage), which yields by far the smallest Gower distance.
The next step is to decide how many clusters to keep. For this we need three different tools: - a- Fusion level plot - b- Silhouette plot - c- Mantel correlation
dev.off()
## null device
## 1
plot(spe.ch.ward$height, nrow(spe) : 2, type = "S",
main = "Fusion levels - Chord - Ward",
ylab = "k (number of clusters)", xlab = "h (node height)", col = "grey")
text(spe.ch.ward$height, nrow(spe) : 2, nrow(spe) : 2, col = "red", cex = 0.8)
Judging by the lengths of the fusion levels, the best number of groups k should be 2 or 4.
library(cluster)  # silhouette() and, later, daisy() and clusplot()
asw <- numeric(nrow(spe))
for(k in 2:(nrow(spe) - 1)){
  sil <- silhouette(cutree(spe.ch.ward, k = k), spe.ch)
  asw[k] <- summary(sil)$avg.width}
k.best <- which.max(asw)
plot(1: nrow(spe), asw, type="h",
main = "Silhouette-optimal number of clusters",
xlab = "k (number of groups)", ylab = "Average silhouette width")
axis(1, k.best, paste("optimum", k.best, sep = "\n"), col = "red", font = 2,
col.axis = "red")
points(k.best, max(asw), pch = 16, col = "red", cex = 1.5)
cat("", "Silhouette-optimal number of clusters k =", k.best, "\n",
"with an average silhouette width of", max(asw), "\n")
## Silhouette-optimal number of clusters k = 2
## with an average silhouette width of 0.3658319
# grpdist(): turns a vector of group memberships into a dissimilarity object
# (Gower dissimilarity on a single factor: 0 within a group, 1 between groups)
grpdist <- function(X){
  require(cluster)
  gr <- as.data.frame(as.factor(X))
  distgr <- daisy(gr, "gower")
  distgr}
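For illustration (this example is not in the original script), applying grpdist() to a group vector produced by cutree() returns dissimilarities that are 0 for sites in the same group and 1 otherwise:
# Hypothetical usage example of grpdist(), shown for the first five sites
as.matrix(grpdist(cutree(spe.ch.ward, 4)))[1:5, 1:5]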
kt <- data.frame(k = 1:nrow(spe), r = 0)
for(i in 2:(nrow(spe) - 1)){
gr <- cutree(spe.ch.ward, i)
distgr <- grpdist(gr)
mt <- cor(spe.ch, distgr, method = "pearson")
kt[i, 2] <- mt}
k.best <- which.max(kt$r)
plot(kt$k, kt$r, type = "h", main = "Mantel-optimal number of clusters",
xlab = "k (number of groups)", ylab = "Pearson's correlation")
axis(1, k.best, paste("optimum", k.best, sep = "\n"), col = "red", font = 2,
col.axis = "red")
points(k.best, max(kt$r), pch = 16, col = "red", cex = 1.5)
cat("", "Mantel-optimal number of clusters k =", k.best, "\n",
"with a matrix linear correlation of", max(kt$r), "\n")
## Mantel-optimal number of clusters k = 4
## with a matrix linear correlation of 0.7154912
The first two methods indicate that the best number of clusters is k = 2, while the Mantel correlation indicates k = 4. Since prior ecological knowledge of the Doubs river suggests four groups of sites, we will validate our model for k = 4.
k <- 4
cutg <- cutree(spe.ch.ward, k = k)
sil <- silhouette(cutg, spe.ch)
rownames(sil) <- row.names(spe)
plot(sil, main = "Silhouette plot",
cex.names = 0.8, col = 2:(k + 1), nmax = 100)
The groups are coherent, but the second group contains two sites with negative silhouette widths, i.e. they are probably misclassified.
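To identify those sites, one can inspect the silhouette matrix directly (an optional check, relying on the standard sil_width column of a silhouette object):
# Optional: list the sites whose silhouette width is negative
sil[sil[, "sil_width"] < 0, , drop = FALSE]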
So far, we have grouped our data into 4 clusters; although UPGMA showed the best cophenetic fit, the partition above was computed on the Ward dendrogram, which we keep for the remaining steps. Now we will use the hcoplot() function created by Francois Gillet (2012) to see how the reordered dendrogram behaves.
hcoplot <- function(tree, diss, k,
title = paste("Reordered dendrogram from", deparse(tree$call), sep = "\n"))
{
require(gclus)
gr <- cutree(tree, k = k)
tor <- reorder.hclust(tree, diss)
plot(tor, hang = -1, xlab = paste(length(gr),"sites"), sub = paste(k, "clusters"),
main = title)
so <- gr[tor$order]
gro <- numeric(k)
for (i in 1:k)
{
gro[i] <- so[1]
if (i<k) so <- so[so != gro[i]]
}
rect.hclust(tor, k = k, border = gro + 1, cluster = gr)
legend("topright", paste("Cluster", 1:k), pch = 22, col = 2:(k + 1), bty = "n")
}
hcoplot(spe.ch.ward, spe.ch, k = 4)
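Since the spatial coordinates (spa) were prepared at the beginning but not used so far, an optional way to visualise the final partition is to map the clusters along the river. This is only a sketch, assuming doubs$xy holds the x/y coordinates of the sampling sites:
# Sketch: map the 4-cluster partition (cutg) onto the site coordinates
plot(spa, asp = 1, type = "n", main = "Ward clusters along the Doubs river",
     xlab = "x coordinate", ylab = "y coordinate")
lines(spa, col = "lightblue")                     # rough course of the river
points(spa, pch = 21, bg = cutg + 1, cex = 1.5)   # one colour per cluster
text(spa, labels = row.names(spe), cex = 0.6, pos = 3)
legend("topleft", paste("Cluster", 1:4), pch = 21, pt.bg = 2:5, bty = "n")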
This time we will build a k-means clustering model.
The data preparation (removal of the empty site and normalization) was already done in the previous analysis, so we can work directly with spe.norm.
set.seed(1)
spe.kmeans <- kmeans(spe.norm, centers = 4, nstart = 100)
We created our model with 4 groups, the same number as in the previous hierarchical clustering. The resulting kmeans object contains the centroid of each cluster, the group assignments and the within-cluster sums of squares. The model explains 66.7% of the total variance.
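As a quick check of that figure (optional; not in the original code), the proportion of variance explained can be read from the between-group and total sums of squares stored in the kmeans object:
# Proportion of total variance explained by the 4-group k-means partition
spe.kmeans$betweenss / spe.kmeans$totss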
To select the best number of clusters for k-means, we will use the following criteria: * Calinski-Harabasz index * Simple structure index (SSI) * Sum of squared errors (SSE) * Silhouette plot
spe.KM.cascade <- cascadeKM(spe.norm, inf.gr = 2, sup.gr = 10, iter = 100, criterion = "ssi")
spe.KM.cascade$results
## 2 groups 3 groups 4 groups 5 groups 6 groups 7 groups 8 groups
## SSE 8.2149405 6.4768108 5.0719796 4.3015573 3.5856120 2.9523667 2.48405487
## ssi 0.1312111 0.1675852 0.1244603 0.1149501 0.1281785 0.1306085 0.07275788
## 9 groups 10 groups
## SSE 2.052189 1.75992916
## ssi 0.126707 0.07094594
plot(spe.KM.cascade, sortg = TRUE)
The statistics are not conclusive: the SSE curve suggests 2 groups, while the SSI points to 3 groups.
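The Calinski-Harabasz criterion was listed above but not computed; a possible additional check, assuming the same cascadeKM() call with criterion = "calinski", would be:
# Optional: repeat the cascade with the Calinski-Harabasz criterion
spe.KM.cascade.cal <- cascadeKM(spe.norm, inf.gr = 2, sup.gr = 10,
                                iter = 100, criterion = "calinski")
spe.KM.cascade.cal$results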
We will now draw a silhouette plot for 3 groups.
spe.kmeans <- kmeans(spe.norm, centers = 3, nstart = 100)
dissE <- daisy(spe.norm)
sk <- silhouette(spe.kmeans$cluster, dissE)
plot(sk)
Now we compare the k-means classification (3 groups) with the classification obtained earlier from the Ward dendrogram (4 groups).
spebc.ward.g <- cutree(spe.ch.ward,k = 4)
table(spe.kmeans$cluster, spebc.ward.g)
## spebc.ward.g
## 1 2 3 4
## 1 0 6 0 0
## 2 11 1 0 0
## 3 0 0 8 3
Apart from Ward groups 3 and 4 being merged into a single k-means group, the two classifications differ in the assignment of only one site.
clusplot(spe.norm, spe.kmeans$cluster, color = TRUE, shade = TRUE,
labels = 2, lines = 0)
There is some overlap between the groups, but the two components shown in the plot account for 61.75% of the point variability, which is a good percentage.