library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(ggplot2)
dist_mat <- as.dist(matrix(
c(0, 0.3, 0.4, 0.7,
0.3, 0, 0.5, 0.8,
0.4, 0.5, 0, 0.45,
0.7, 0.8, 0.45, 0),
nrow = 4,
byrow = TRUE))
hc_complete <- hclust(dist_mat, method = "complete")
plot(hc_complete, main = "Complete Linkage Dendrogram",
xlab = "Observations", sub = "", hang = -1)
Interpretation: - First, observations 1 and 2 fuse at
height 0.3. - Then, observations 3 and 4 fuse at height 0.45. - Finally,
both clusters fuse at height ~0.7.
hc_single <- hclust(dist_mat, method = "single")
plot(hc_single, main = "Single Linkage Dendrogram",
xlab = "Observations", sub = "", hang = -1)
Interpretation: - First, observations 1 and 2 fuse at
height 0.3. - Next, 3 joins them at height 0.4 (min of 0.4 and 0.5). -
Then 4 joins at height 0.45.
cutree(hc_complete, k = 2)
## [1] 1 1 2 2
Result: - Cluster 1: Obs 1, 2 - Cluster 2: Obs 3, 4
cutree(hc_single, k = 2)
## [1] 1 1 1 2
Result: - Cluster 1: Obs 1, 2, 3 - Cluster 2: Obs 4
plot(hc_complete, main = "Reordered Dendrogram (Complete Linkage)",
xlab = "Observations", sub = "", hang = -1)
Interpretation: - Leaf positions may shift, but the
branching structure remains unchanged.
k_df <- data.frame(
Obs = 1:6,
X1 = c(1, 1, 0, 5, 6, 4),
X2 = c(4, 3, 4, 1, 2, 0)
)
plot(k_df$X1, k_df$X2, pch = 19, col = "black",
xlab = "X1", ylab = "X2", main = "Initial Observations")
text(k_df$X1, k_df$X2, labels = k_df$Obs, pos = 3)
set.seed(123)
k_df$Cluster <- sample(1:2, 6, replace = TRUE)
k_df
## Obs X1 X2 Cluster
## 1 1 1 4 1
## 2 2 1 3 1
## 3 3 0 4 1
## 4 4 5 1 2
## 5 5 6 2 1
## 6 6 4 0 2
centroids <- k_df %>%
group_by(Cluster) %>%
summarise(across(c(X1, X2), mean))
centroids
## # A tibble: 2 Ă— 3
## Cluster X1 X2
## <int> <dbl> <dbl>
## 1 1 2 3.25
## 2 2 4.5 0.5
repeat {
old_clusters <- k_df$Cluster
# Compute centroids
centroids <- k_df %>%
group_by(Cluster) %>%
summarise(across(c(X1, X2), mean))
# Assign new clusters
k_df$Cluster <- apply(k_df[, c("X1", "X2")], 1, function(row) {
dists <- apply(centroids[, c("X1", "X2")], 1, function(centroid) {
sqrt(sum((row - centroid)^2))
})
which.min(dists)
})
if (all(old_clusters == k_df$Cluster)) break
}
k_df
## Obs X1 X2 Cluster
## 1 1 1 4 1
## 2 2 1 3 1
## 3 3 0 4 1
## 4 4 5 1 2
## 5 5 6 2 2
## 6 6 4 0 2
plot(k_df$X1, k_df$X2, col = k_df$Cluster,
pch = 19, xlab = "X1", ylab = "X2",
main = "Final Cluster Assignments")
text(k_df$X1, k_df$X2, labels = k_df$Obs, pos = 3)
Interpretation: - Final cluster 1: Obs 1, 2, 3
(upper-left) - Final cluster 2: Obs 4, 5, 6 (lower-right) - Clustering
converges after centroid and assignment iteration.
At a certain point on the single linkage dendrogram, the clusters {1, 2, 3} and {4, 5} fuse. On the complete linkage dendrogram, they also fuse. However, complete linkage considers the farthest distance between clusters. Therefore, there is not enough information to tell which will fuse higher since actual pairwise distances aren’t given.
For the fusion of clusters {5} and {6}, we again lack exact distances. But since both linkage methods agree on the fusion: Again, not enough information is provided to determine which fusion is higher on the tree.