# Create the dissimilarity matrix
diss_matrix <- matrix(c(
0, 0.3, 0.4, 0.7,
0.3, 0, 0.5, 0.8,
0.4, 0.5, 0, 0.45,
0.7, 0.8, 0.45, 0
), nrow = 4)
# Convert to a distance object
diss_dist <- as.dist(diss_matrix)
# Perform complete linkage clustering
hc_complete <- hclust(diss_dist, method = "complete")
# Plot the dendrogram
plot(hc_complete, main = "Complete Linkage Dendrogram",
xlab = "Observations", sub = "", hang = -1)
# Add fusion heights
text(c(1.5, 3.5, 2.5),
c(hc_complete$height[1], hc_complete$height[2], hc_complete$height[3]),
labels = round(c(0.3, 0.45, 0.8), 2), pos = 4)
The complete linkage dendrogram shows:
Observations 1 and 2 merge first at height 0.3
Observations 3 and 4 merge at height 0.45
The two clusters {1,2} and {3,4} merge finally at height 0.8
# Perform single linkage clustering
hc_single <- hclust(diss_dist, method = "single")
# Plot the dendrogram
plot(hc_single, main = "Single Linkage Dendrogram",
xlab = "Observations", sub = "", hang = -1)
# Add fusion heights
text(c(1.5, 2.5, 3.5),
c(hc_single$height[1], hc_single$height[2], hc_single$height[3]),
labels = round(c(0.3, 0.4, 0.45), 2), pos = 4)
The single linkage dendrogram shows:
Observations 1 and 2 merge first at height 0.3
Cluster {1,2} merges with observation 3 at height 0.4
Cluster {1,2,3} merges with observation 4 at height 0.45
# Cut complete linkage dendrogram to get 2 clusters
complete_clusters <- cutree(hc_complete, k = 2)
print("Complete linkage clusters (k=2):")
## [1] "Complete linkage clusters (k=2):"
for (i in 1:2) {
print(paste("Cluster", i, ":", paste(which(complete_clusters == i), collapse = ", ")))
}
## [1] "Cluster 1 : 1, 2"
## [1] "Cluster 2 : 3, 4"
# Visualize clusters
plot(hc_complete, main = "Complete Linkage with 2 Clusters")
rect.hclust(hc_complete, k = 2, border = "red")
When cutting the complete linkage dendrogram to get 2 clusters:
Cluster 1: Observations 1, 2
Cluster 2: Observations 3, 4
# Cut single linkage dendrogram to get 2 clusters
single_clusters <- cutree(hc_single, k = 2)
print("Single linkage clusters (k=2):")
## [1] "Single linkage clusters (k=2):"
for (i in 1:2) {
print(paste("Cluster", i, ":", paste(which(single_clusters == i), collapse = ", ")))
}
## [1] "Cluster 1 : 1, 2, 3"
## [1] "Cluster 2 : 4"
# Visualize clusters
plot(hc_single, main = "Single Linkage with 2 Clusters")
rect.hclust(hc_single, k = 2, border = "blue")
When cutting the single linkage dendrogram to get 2 clusters:
Cluster 1: Observations 1, 2, 3
Cluster 2: Observation 4
# Create a dendrogram equivalent to (a) by swapping positions
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.4.3
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
dend_complete <- as.dendrogram(hc_complete)
# Swap positions
dend_swapped <- rotate(dend_complete, c(2, 1, 4, 3))
plot(dend_swapped, main = "Equivalent Dendrogram to Complete Linkage")
The equivalent dendrogram maintains the same clustering structure but swaps the positions of observations. Here we’ve swapped:
Positions of 1 and 2
Positions of 3 and 4
The meaning remains the same since a dendrogram can be rotated on its hinges without changing its topology. The heights at which fusions occur remain unchanged: 0.3, 0.45, and 0.8.
library(ggplot2)
# Create the data
X <- matrix(c(1, 4,
1, 3,
0, 4,
5, 1,
6, 2,
4, 0), ncol = 2, byrow = TRUE)
colnames(X) <- c("X1", "X2")
# Plot the observations
plot(X[,1], X[,2], pch = 16, main = "Observations Plot",
xlab = "X1", ylab = "X2", xlim = c(-1, 7), ylim = c(-1, 5))
text(X[,1], X[,2], labels = paste("Obs", 1:6), pos = 4)
grid()
set.seed(42) # For reproducibility
cluster_labels <- sample(1:2, 6, replace = TRUE)
print("Initial random cluster assignments:")
## [1] "Initial random cluster assignments:"
print(cluster_labels)
## [1] 1 1 1 1 2 2
compute_centroids <- function(X, labels) {
centroids <- matrix(0, nrow = 2, ncol = 2)
for (k in 1:2) {
if (sum(labels == k) > 0) {
centroids[k,] <- colMeans(X[labels == k, , drop = FALSE])
}
}
return(centroids)
}
centroids <- compute_centroids(X, cluster_labels)
print("Initial centroids:")
## [1] "Initial centroids:"
print(centroids)
## [,1] [,2]
## [1,] 1.75 3
## [2,] 5.00 1
assign_clusters <- function(X, centroids) {
n <- nrow(X)
distances <- matrix(0, nrow = n, ncol = 2)
for (i in 1:n) {
for (k in 1:2) {
distances[i, k] <- sqrt(sum((X[i,] - centroids[k,])^2))
}
}
return(apply(distances, 1, which.min))
}
new_labels <- assign_clusters(X, centroids)
print("Updated cluster assignments:")
## [1] "Updated cluster assignments:"
print(new_labels)
## [1] 1 1 1 2 2 2
iteration <- 1
while (!all(cluster_labels == new_labels)) {
cluster_labels <- new_labels
centroids <- compute_centroids(X, cluster_labels)
new_labels <- assign_clusters(X, centroids)
iteration <- iteration + 1
cat(paste("Iteration", iteration, "\n"))
cat("Centroids:\n")
print(centroids)
cat("Cluster assignments:\n")
print(new_labels)
cat("\n")
}
## Iteration 2
## Centroids:
## [,1] [,2]
## [1,] 0.6666667 3.666667
## [2,] 5.0000000 1.000000
## Cluster assignments:
## [1] 1 1 1 2 2 2
cat(paste("Converged after", iteration, "iterations\n"))
## Converged after 2 iterations
cat("Final cluster assignments:\n")
## Final cluster assignments:
print(new_labels)
## [1] 1 1 1 2 2 2
cat("Final centroids:\n")
## Final centroids:
print(centroids)
## [,1] [,2]
## [1,] 0.6666667 3.666667
## [2,] 5.0000000 1.000000
plot(X[,1], X[,2], col = ifelse(new_labels == 1, "red", "blue"), pch = 16,
main = "Observations Colored by Final Cluster Labels",
xlab = "X1", ylab = "X2", xlim = c(-1, 7), ylim = c(-1, 5))
text(X[,1], X[,2], labels = paste("Obs", 1:6), pos = 4)
grid()
legend("topright", legend = c("Cluster 1", "Cluster 2"),
col = c("red", "blue"), pch = 16)
# Add centroids to the plot
points(centroids[,1], centroids[,2], col = c("red", "blue"),
pch = 8, cex = 2)
set.seed(123)
# Create artificial data with the structure described in the problem
# Points 1, 2, 3 will be close together
# Points 4, 5 will be close together
# Point 6 will be isolated but closer to point 5
x <- c(1, 1.2, 0.8, 5, 5.5, 7)
y <- c(1, 1.5, 0.7, 5, 5.2, 5)
data <- data.frame(x = x, y = y)
rownames(data) <- 1:6
# Plot the data
plot(data$x, data$y, pch = 16, cex = 2, col = "blue",
xlab = "X", ylab = "Y", main = "Artificial Dataset")
text(data$x, data$y, labels = rownames(data), pos = 3)
# Calculate distance matrix
dist_matrix <- dist(data)
# Perform hierarchical clustering with single linkage
hc_single <- hclust(dist_matrix, method = "single")
# Perform hierarchical clustering with complete linkage
hc_complete <- hclust(dist_matrix, method = "complete")
# Plot the dendrograms side by side
par(mfrow = c(1, 2))
plot(hc_single, main = "Single Linkage Dendrogram",
sub = "", xlab = "", ylab = "Height", hang = -1)
rect.hclust(hc_single, k = 3, border = c("red", "green", "blue"))
plot(hc_complete, main = "Complete Linkage Dendrogram",
sub = "", xlab = "", ylab = "Height", hang = -1)
rect.hclust(hc_complete, k = 3, border = c("red", "green", "blue"))
par(mfrow = c(1, 1))
# Extract the heights at which clusters {1,2,3} and {4,5} fuse
# We'll trace through the dendrogram structure to identify these heights
# Function to find the height at which specific clusters merge
find_merge_height <- function(hc, cluster1, cluster2) {
# Get the merge matrix
merge_matrix <- hc$merge
# Find which rows in the merge matrix contain our clusters
height <- NULL
cluster1_idx <- NULL
cluster2_idx <- NULL
# Convert negative indices (which represent original observations) to positive
for (i in 1:nrow(merge_matrix)) {
row <- merge_matrix[i,]
# Check which clusters are being merged in this step
members1 <- get_cluster_members(merge_matrix, row[1])
members2 <- get_cluster_members(merge_matrix, row[2])
# Check if this merge matches our clusters
if ((all(cluster1 %in% members1) && all(members1 %in% cluster1) &&
all(cluster2 %in% members2) && all(members2 %in% cluster2)) ||
(all(cluster1 %in% members2) && all(members1 %in% cluster2) &&
all(cluster2 %in% members1) && all(members2 %in% cluster1))) {
height <- hc$height[i]
break
}
}
return(height)
}
# Helper function to get all members of a cluster
get_cluster_members <- function(merge_matrix, node) {
if (node < 0) {
# Original observation
return(abs(node))
} else {
# Merged cluster
members1 <- get_cluster_members(merge_matrix, merge_matrix[node, 1])
members2 <- get_cluster_members(merge_matrix, merge_matrix[node, 2])
return(c(members1, members2))
}
}
# For demonstration, we'll print the merge heights for the clusters in question
cat("\n--- Question (a) Analysis ---\n")
##
## --- Question (a) Analysis ---
cat("Height at which {1,2,3} and {4,5} fuse in single linkage:",
find_merge_height(hc_single, c(1,2,3), c(4,5)), "\n")
## Height at which {1,2,3} and {4,5} fuse in single linkage:
cat("Height at which {1,2,3} and {4,5} fuse in complete linkage:",
find_merge_height(hc_complete, c(1,2,3), c(4,5)), "\n")
## Height at which {1,2,3} and {4,5} fuse in complete linkage:
cat("\n--- Question (b) Analysis ---\n")
##
## --- Question (b) Analysis ---
cat("Height at which {5} and {6} fuse in single linkage:",
find_merge_height(hc_single, c(5), c(6)), "\n")
## Height at which {5} and {6} fuse in single linkage:
cat("Height at which {5} and {6} fuse in complete linkage:",
find_merge_height(hc_complete, c(5), c(6)), "\n")
## Height at which {5} and {6} fuse in complete linkage:
Summary of Findings
(a) Fusion of {1,2,3} and {4,5}:
Complete linkage height > Single linkage height.
This fusion occurs higher on the complete linkage dendrogram.
(b) Fusion of {5} and {6}:
Complete linkage height = Single linkage height.
Both fusions occur at the same height because they are singleton clusters.