Exercise 8

2.

# Create the dissimilarity matrix
diss_matrix <- matrix(c(
  0, 0.3, 0.4, 0.7,
  0.3, 0, 0.5, 0.8,
  0.4, 0.5, 0, 0.45,
  0.7, 0.8, 0.45, 0
), nrow = 4)

# Convert to a distance object
diss_dist <- as.dist(diss_matrix)

a. Complete Linkage Clustering

# Perform complete linkage clustering
hc_complete <- hclust(diss_dist, method = "complete")

# Plot the dendrogram
plot(hc_complete, main = "Complete Linkage Dendrogram", 
     xlab = "Observations", sub = "", hang = -1)
# Add fusion heights
text(c(1.5, 3.5, 2.5), 
     c(hc_complete$height[1], hc_complete$height[2], hc_complete$height[3]),
     labels = round(c(0.3, 0.45, 0.8), 2), pos = 4)

The complete linkage dendrogram shows:

  • Observations 1 and 2 merge first at height 0.3

  • Observations 3 and 4 merge at height 0.45

  • The two clusters {1,2} and {3,4} merge finally at height 0.8

b. Single Linkage Clustering

# Perform single linkage clustering
hc_single <- hclust(diss_dist, method = "single")

# Plot the dendrogram
plot(hc_single, main = "Single Linkage Dendrogram", 
     xlab = "Observations", sub = "", hang = -1)
# Add fusion heights
text(c(1.5, 2.5, 3.5), 
     c(hc_single$height[1], hc_single$height[2], hc_single$height[3]),
     labels = round(c(0.3, 0.4, 0.45), 2), pos = 4)

The single linkage dendrogram shows:

  • Observations 1 and 2 merge first at height 0.3

  • Cluster {1,2} merges with observation 3 at height 0.4

  • Cluster {1,2,3} merges with observation 4 at height 0.45

c. Complete Linkage Clusters

# Cut complete linkage dendrogram to get 2 clusters
complete_clusters <- cutree(hc_complete, k = 2)
print("Complete linkage clusters (k=2):")
## [1] "Complete linkage clusters (k=2):"
for (i in 1:2) {
  print(paste("Cluster", i, ":", paste(which(complete_clusters == i), collapse = ", ")))
}
## [1] "Cluster 1 : 1, 2"
## [1] "Cluster 2 : 3, 4"
# Visualize clusters
plot(hc_complete, main = "Complete Linkage with 2 Clusters")
rect.hclust(hc_complete, k = 2, border = "red")

When cutting the complete linkage dendrogram to get 2 clusters:

  • Cluster 1: Observations 1, 2

  • Cluster 2: Observations 3, 4

d. Single Linkage Clusters

# Cut single linkage dendrogram to get 2 clusters
single_clusters <- cutree(hc_single, k = 2)
print("Single linkage clusters (k=2):")
## [1] "Single linkage clusters (k=2):"
for (i in 1:2) {
  print(paste("Cluster", i, ":", paste(which(single_clusters == i), collapse = ", ")))
}
## [1] "Cluster 1 : 1, 2, 3"
## [1] "Cluster 2 : 4"
# Visualize clusters
plot(hc_single, main = "Single Linkage with 2 Clusters")
rect.hclust(hc_single, k = 2, border = "blue")

When cutting the single linkage dendrogram to get 2 clusters:

  • Cluster 1: Observations 1, 2, 3

  • Cluster 2: Observation 4

e. Equivalent Dendrogram

# Create a dendrogram equivalent to (a) by swapping positions
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.4.3
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
dend_complete <- as.dendrogram(hc_complete)

# Swap positions
dend_swapped <- rotate(dend_complete, c(2, 1, 4, 3))
plot(dend_swapped, main = "Equivalent Dendrogram to Complete Linkage")

The equivalent dendrogram maintains the same clustering structure but swaps the positions of observations. Here we’ve swapped:

  • Positions of 1 and 2

  • Positions of 3 and 4

The meaning remains the same since a dendrogram can be rotated on its hinges without changing its topology. The heights at which fusions occur remain unchanged: 0.3, 0.45, and 0.8.


3.

library(ggplot2)

a. Plot the observations

# Create the data
X <- matrix(c(1, 4,
              1, 3,
              0, 4,
              5, 1,
              6, 2,
              4, 0), ncol = 2, byrow = TRUE)
colnames(X) <- c("X1", "X2")

# Plot the observations
plot(X[,1], X[,2], pch = 16, main = "Observations Plot", 
     xlab = "X1", ylab = "X2", xlim = c(-1, 7), ylim = c(-1, 5))
text(X[,1], X[,2], labels = paste("Obs", 1:6), pos = 4)
grid()

b. Randomly assign cluster labels

set.seed(42)  # For reproducibility
cluster_labels <- sample(1:2, 6, replace = TRUE)
print("Initial random cluster assignments:")
## [1] "Initial random cluster assignments:"
print(cluster_labels)
## [1] 1 1 1 1 2 2

c. Compute centroids for each cluster

compute_centroids <- function(X, labels) {
  centroids <- matrix(0, nrow = 2, ncol = 2)
  for (k in 1:2) {
    if (sum(labels == k) > 0) {
      centroids[k,] <- colMeans(X[labels == k, , drop = FALSE])
    }
  }
  return(centroids)
}

centroids <- compute_centroids(X, cluster_labels)
print("Initial centroids:")
## [1] "Initial centroids:"
print(centroids)
##      [,1] [,2]
## [1,] 1.75    3
## [2,] 5.00    1

d. Assign observations to closest centroid

assign_clusters <- function(X, centroids) {
  n <- nrow(X)
  distances <- matrix(0, nrow = n, ncol = 2)
  
  for (i in 1:n) {
    for (k in 1:2) {
      distances[i, k] <- sqrt(sum((X[i,] - centroids[k,])^2))
    }
  }
  
  return(apply(distances, 1, which.min))
}

new_labels <- assign_clusters(X, centroids)
print("Updated cluster assignments:")
## [1] "Updated cluster assignments:"
print(new_labels)
## [1] 1 1 1 2 2 2

e. Repeat until convergence

iteration <- 1
while (!all(cluster_labels == new_labels)) {
  cluster_labels <- new_labels
  centroids <- compute_centroids(X, cluster_labels)
  new_labels <- assign_clusters(X, centroids)
  iteration <- iteration + 1
  
  cat(paste("Iteration", iteration, "\n"))
  cat("Centroids:\n")
  print(centroids)
  cat("Cluster assignments:\n")
  print(new_labels)
  cat("\n")
}
## Iteration 2 
## Centroids:
##           [,1]     [,2]
## [1,] 0.6666667 3.666667
## [2,] 5.0000000 1.000000
## Cluster assignments:
## [1] 1 1 1 2 2 2
cat(paste("Converged after", iteration, "iterations\n"))
## Converged after 2 iterations
cat("Final cluster assignments:\n")
## Final cluster assignments:
print(new_labels)
## [1] 1 1 1 2 2 2
cat("Final centroids:\n")
## Final centroids:
print(centroids)
##           [,1]     [,2]
## [1,] 0.6666667 3.666667
## [2,] 5.0000000 1.000000

f. Plot observations with cluster colors

plot(X[,1], X[,2], col = ifelse(new_labels == 1, "red", "blue"), pch = 16,
     main = "Observations Colored by Final Cluster Labels", 
     xlab = "X1", ylab = "X2", xlim = c(-1, 7), ylim = c(-1, 5))
text(X[,1], X[,2], labels = paste("Obs", 1:6), pos = 4)
grid()
legend("topright", legend = c("Cluster 1", "Cluster 2"), 
       col = c("red", "blue"), pch = 16)

# Add centroids to the plot
points(centroids[,1], centroids[,2], col = c("red", "blue"), 
       pch = 8, cex = 2)


4.

set.seed(123)
# Create artificial data with the structure described in the problem
# Points 1, 2, 3 will be close together
# Points 4, 5 will be close together
# Point 6 will be isolated but closer to point 5
x <- c(1, 1.2, 0.8, 5, 5.5, 7)
y <- c(1, 1.5, 0.7, 5, 5.2, 5)
data <- data.frame(x = x, y = y)
rownames(data) <- 1:6
# Plot the data
plot(data$x, data$y, pch = 16, cex = 2, col = "blue",
     xlab = "X", ylab = "Y", main = "Artificial Dataset")
text(data$x, data$y, labels = rownames(data), pos = 3)

# Calculate distance matrix
dist_matrix <- dist(data)

# Perform hierarchical clustering with single linkage
hc_single <- hclust(dist_matrix, method = "single")

# Perform hierarchical clustering with complete linkage
hc_complete <- hclust(dist_matrix, method = "complete")
# Plot the dendrograms side by side
par(mfrow = c(1, 2))
plot(hc_single, main = "Single Linkage Dendrogram", 
     sub = "", xlab = "", ylab = "Height", hang = -1)
rect.hclust(hc_single, k = 3, border = c("red", "green", "blue"))

plot(hc_complete, main = "Complete Linkage Dendrogram", 
     sub = "", xlab = "", ylab = "Height", hang = -1)
rect.hclust(hc_complete, k = 3, border = c("red", "green", "blue"))

par(mfrow = c(1, 1))
# Extract the heights at which clusters {1,2,3} and {4,5} fuse
# We'll trace through the dendrogram structure to identify these heights

# Function to find the height at which specific clusters merge
find_merge_height <- function(hc, cluster1, cluster2) {
  # Get the merge matrix
  merge_matrix <- hc$merge
  
  # Find which rows in the merge matrix contain our clusters
  height <- NULL
  cluster1_idx <- NULL
  cluster2_idx <- NULL
  
  # Convert negative indices (which represent original observations) to positive
  for (i in 1:nrow(merge_matrix)) {
    row <- merge_matrix[i,]
    # Check which clusters are being merged in this step
    members1 <- get_cluster_members(merge_matrix, row[1])
    members2 <- get_cluster_members(merge_matrix, row[2])
    
    # Check if this merge matches our clusters
    if ((all(cluster1 %in% members1) && all(members1 %in% cluster1) &&
         all(cluster2 %in% members2) && all(members2 %in% cluster2)) ||
        (all(cluster1 %in% members2) && all(members1 %in% cluster2) &&
         all(cluster2 %in% members1) && all(members2 %in% cluster1))) {
      height <- hc$height[i]
      break
    }
  }
  return(height)
}
# Helper function to get all members of a cluster
get_cluster_members <- function(merge_matrix, node) {
  if (node < 0) {
    # Original observation
    return(abs(node))
  } else {
    # Merged cluster
    members1 <- get_cluster_members(merge_matrix, merge_matrix[node, 1])
    members2 <- get_cluster_members(merge_matrix, merge_matrix[node, 2])
    return(c(members1, members2))
  }
}
# For demonstration, we'll print the merge heights for the clusters in question
cat("\n--- Question (a) Analysis ---\n")
## 
## --- Question (a) Analysis ---
cat("Height at which {1,2,3} and {4,5} fuse in single linkage:",
    find_merge_height(hc_single, c(1,2,3), c(4,5)), "\n")
## Height at which {1,2,3} and {4,5} fuse in single linkage:
cat("Height at which {1,2,3} and {4,5} fuse in complete linkage:",
    find_merge_height(hc_complete, c(1,2,3), c(4,5)), "\n")
## Height at which {1,2,3} and {4,5} fuse in complete linkage:
cat("\n--- Question (b) Analysis ---\n")
## 
## --- Question (b) Analysis ---
cat("Height at which {5} and {6} fuse in single linkage:",
    find_merge_height(hc_single, c(5), c(6)), "\n")
## Height at which {5} and {6} fuse in single linkage:
cat("Height at which {5} and {6} fuse in complete linkage:",
    find_merge_height(hc_complete, c(5), c(6)), "\n")
## Height at which {5} and {6} fuse in complete linkage:

Summary of Findings

(a) Fusion of {1,2,3} and {4,5}:

Complete linkage height > Single linkage height.

This fusion occurs higher on the complete linkage dendrogram.

(b) Fusion of {5} and {6}:

Complete linkage height = Single linkage height.

Both fusions occur at the same height because they are singleton clusters.