exercise 8

# Pairwise dissimilarities between the four observations, one row per
# observation (symmetric, zero diagonal)
dissimilarity <- rbind(
  c(0,   0.3, 0.4,  0.7),
  c(0.3, 0,   0.5,  0.8),
  c(0.4, 0.5, 0,    0.45),
  c(0.7, 0.8, 0.45, 0)
)

# hclust() needs a "dist" object rather than a plain matrix
diss_dist <- as.dist(dissimilarity)

# Agglomerative clustering, COMPLETE linkage (largest pairwise dissimilarity)
hc_complete <- hclust(diss_dist, method = "complete")

# Draw the resulting dendrogram
plot(
  hc_complete,
  main = "Complete Linkage Dendrogram",
  xlab = "Observations",
  sub = "",
  ylab = "Height"
)

# Same four-observation dissimilarity matrix as before, built row by row
dissimilarity <- rbind(
  c(0,   0.3, 0.4,  0.7),
  c(0.3, 0,   0.5,  0.8),
  c(0.4, 0.5, 0,    0.45),
  c(0.7, 0.8, 0.45, 0)
)

# Wrap the matrix as a "dist" object for hclust()
diss_dist <- as.dist(dissimilarity)

# Agglomerative clustering, SINGLE linkage (smallest pairwise dissimilarity)
hc_single <- hclust(diss_dist, method = "single")

# Draw the resulting dendrogram
plot(
  hc_single,
  main = "Single Linkage Dendrogram",
  xlab = "Observations",
  sub = "",
  ylab = "Height"
)

# Requires hc_complete (the complete-linkage hclust fit) to exist already

# Split the complete-linkage tree into two groups
clusters_complete <- cutree(hc_complete, k = 2)

# Show which group each observation landed in
print(clusters_complete)

## [1] 1 1 2 2

# Redraw the dendrogram ...
plot(hc_complete, main = "Complete Linkage Dendrogram with 2 Clusters")

# ... and box the two clusters in red
rect.hclust(hc_complete, k = 2, border = "red")

After cutting the complete linkage dendrogram into two clusters, Observations 1 and 2 are in one cluster, and Observations 3 and 4 are in the other cluster.

#d

# Requires hc_single (the single-linkage hclust fit) to exist already

# Split the single-linkage tree into two groups
clusters_single <- cutree(hc_single, k = 2)

# Show which group each observation landed in
print(clusters_single)

## [1] 1 1 1 2

After cutting the single linkage dendrogram into two clusters, Observations 1, 2, and 3 are grouped into one cluster, while Observation 4 forms its own separate cluster.

# Work from the complete-linkage fit; hc_complete must already exist.

# Dendrogram representation of the hclust result (nested list of branches)
dend <- as.dendrogram(object = hc_complete)

# Mirror a dendrogram by swapping the left and right branch at every merge.
#
# The result is an equivalent dendrogram: same merges, same heights, same
# leaves, only the left-to-right drawing order is reversed.
#
# Args:
#   d: a "dendrogram" object (e.g. from as.dendrogram(hclust(...))).
# Returns:
#   A "dendrogram" whose children are reversed at every internal node.
#
# Note: `d[[c(1, 2)]]` is *recursive* indexing (it means d[[1]][[2]]), so it
# cannot be used to exchange the two children of a node; doing so overwrites
# a grandchild and corrupts the tree. The dendrogram method of rev() performs
# the swap recursively and also refreshes the cached plotting midpoints.
swap_leaves <- function(d) {
  stopifnot(inherits(d, "dendrogram"))
  if (is.leaf(d)) {
    return(d)
  }
  rev(d)
}

# Mirror the dendrogram (requires swap_leaves() and dend to exist already)
swapped_dend <- swap_leaves(d = dend)

# Draw the mirrored tree
plot(
  swapped_dend,
  main = "Equivalent Dendrogram (Leaves Swapped)"
)

# Six observations with two features each
obs <- seq_len(6)
X1 <- c(1, 1, 0, 5, 6, 4)
X2 <- c(4, 3, 4, 1, 2, 0)

# Scatterplot of the raw observations
plot(
  X1, X2,
  xlab = "X1",
  ylab = "X2",
  main = "Scatterplot of Observations",
  pch = 19,
  col = "blue"
)
# Label each point with its observation number, placed just above it
text(X1, X2, labels = obs, pos = 3, cex = 0.8)

# Observation indices
obs <- seq_len(6)

# Draw a random starting label (1 or 2) for every observation;
# the fixed seed makes the draw reproducible
set.seed(1234)
initial_clusters <- sample(c(1L, 2L), size = 6, replace = TRUE)

# Show the starting assignment as a table
print(data.frame(Observation = obs, Cluster = initial_clusters))

##   Observation Cluster
## 1           1       2
## 2           2       2
## 3           3       2
## 4           4       2
## 5           5       1
## 6           6       2

Observations 1, 2, 3, 4, and 6 → Cluster 2; Observation 5 → Cluster 1.

# Coordinates and the current cluster label of each observation
X1 <- c(1, 1, 0, 5, 6, 4)
X2 <- c(4, 3, 4, 1, 2, 0)
clusters <- c(2, 2, 2, 2, 1, 2)

# Collect everything in one table
data <- data.frame(
  Observation = 1:6,
  X1 = X1,
  X2 = X2,
  Cluster = clusters
)

# Centroid of a cluster = per-cluster mean of each coordinate
centroids <- aggregate(
  data[c("X1", "X2")],
  by = list(Cluster = data$Cluster),
  FUN = mean
)

# Show the centroids
print(centroids)

##   Cluster  X1  X2
## 1       1 6.0 2.0
## 2       2 2.2 2.4

The centroid of Cluster 1 is at (6.0, 2.0), and the centroid of Cluster 2 is at (2.2, 2.4).

# Observation coordinates
X1 <- c(1, 1, 0, 5, 6, 4)
X2 <- c(4, 3, 4, 1, 2, 0)

# Current centroids as (X1, X2) pairs
centroid1 <- c(6, 2)
centroid2 <- c(2.2, 2.4)

# Euclidean distance from every observation to a given centroid
distance_to <- function(centroid) {
  sqrt((X1 - centroid[1])^2 + (X2 - centroid[2])^2)
}
dist_to_c1 <- distance_to(centroid1)
dist_to_c2 <- distance_to(centroid2)

# Each observation joins whichever centroid is nearer (cluster 2 unless
# centroid 1 is strictly closer)
new_clusters <- ifelse(dist_to_c2 > dist_to_c1, 1, 2)

# Show the reassignment
print(data.frame(
  Observation = 1:6,
  X1 = X1,
  X2 = X2,
  Cluster_Assignment = new_clusters
))

##   Observation X1 X2 Cluster_Assignment
## 1           1  1  4                  2
## 2           2  1  3                  2
## 3           3  0  4                  2
## 4           4  5  1                  1
## 5           5  6  2                  1
## 6           6  4  0                  1

Observations 1, 2, and 3 → Cluster 2; Observations 4, 5, and 6 → Cluster 1.

# Observation coordinates
X1 <- c(1, 1, 0, 5, 6, 4)
X2 <- c(4, 3, 4, 1, 2, 0)

# Cluster labels carried over from the previous reassignment step
clusters <- c(2, 2, 2, 1, 1, 1)

# Recompute each centroid as the mean of its current members
in_cluster1 <- clusters == 1
in_cluster2 <- clusters == 2
centroid1 <- c(mean(X1[in_cluster1]), mean(X2[in_cluster1]))
centroid2 <- c(mean(X1[in_cluster2]), mean(X2[in_cluster2]))

# Euclidean distance from every observation to each updated centroid
dist_to_c1 <- sqrt((X1 - centroid1[1])^2 + (X2 - centroid1[2])^2)
dist_to_c2 <- sqrt((X1 - centroid2[1])^2 + (X2 - centroid2[2])^2)

# Reassign: cluster 1 only when centroid 1 is strictly closer
new_clusters <- ifelse(dist_to_c2 > dist_to_c1, 1, 2)

# Old vs. new labels side by side
print(data.frame(
  Observation = 1:6,
  Old_Cluster = clusters,
  New_Cluster = new_clusters
))

##   Observation Old_Cluster New_Cluster
## 1           1           2           2
## 2           2           2           2
## 3           3           2           2
## 4           4           1           1
## 5           5           1           1
## 6           6           1           1

After recomputing the centroids and reassigning the observations, no changes in cluster assignments were observed. Thus, the K-means algorithm has converged.

# Observation coordinates and their converged K-means labels
X1 <- c(1, 1, 0, 5, 6, 4)
X2 <- c(4, 3, 4, 1, 2, 0)
final_clusters <- c(2, 2, 2, 1, 1, 1)

# Scatterplot where the cluster label picks the point colour from the palette
plot(
  X1, X2,
  col = final_clusters,
  pch = 19,
  xlab = "X1",
  ylab = "X2",
  main = "Final Clusters after K-means"
)
# Write the observation number just above each point
text(X1, X2, labels = 1:6, pos = 3, cex = 0.8)

Observations 1, 2, 3 (cluster 2) are shown in reddish color. Observations 4, 5, 6 (cluster 1) are shown in black. Each observation is labeled with its number (1–6).

# Toy data set with 5 observations in 2-D space:
#   Group 1 (points 1-3): close together
#   Group 2 (points 4-5): close together, but far from Group 1
# The data below are deterministic; set.seed() is kept only so that the
# session's RNG state after this chunk is the same as before this change.
set.seed(1)
x <- matrix(c(
  1, 1,      # point 1
  1.1, 1,    # point 2
  1.05, 0.9, # point 3
  4, 4,      # point 4
  4.1, 4.1   # point 5
), ncol = 2, byrow = TRUE)

rownames(x) <- c("1", "2", "3", "4", "5")

# Euclidean distance matrix between the observations
d <- dist(x)

# Single linkage: inter-cluster distance = smallest pairwise distance
hc_single <- hclust(d, method = "single")

# Complete linkage: inter-cluster distance = largest pairwise distance
hc_complete <- hclust(d, method = "complete")

# Plot the two dendrograms side by side (1 row, 2 panels). par() returns the
# previous settings, which are restored afterwards so the two-panel layout
# does not leak into later plots on the same device.
old_par <- par(mfrow = c(1, 2))
plot(hc_single, main = "Single Linkage", xlab = "", sub = "", hang = -1)
plot(hc_complete, main = "Complete Linkage", xlab = "", sub = "", hang = -1)
par(old_par)

The dendrograms show that clusters {1,2,3} and {4,5} merge at a lower height in the single linkage dendrogram than in the complete linkage dendrogram, confirming that single linkage uses minimum inter-cluster distances, while complete linkage uses maximum.

#b When clusters {5} and {6} are both singleton observations, the distance between them is the same regardless of linkage method. Therefore, they will fuse at the same height in both the single linkage and complete linkage dendrograms.

# Six observations in 2-D, one per row, labelled "1" .. "6"
x <- matrix(
  c(
    1,    1,
    1.1,  1,
    0.9,  1.1,
    4,    4,
    10,   10,
    10.1, 10.1
  ),
  ncol = 2,
  byrow = TRUE,
  dimnames = list(as.character(1:6), NULL)
)

# Pairwise Euclidean distances
d <- dist(x)

# Fit both linkage variants on the same distances
hc_single <- hclust(d, method = "single")
hc_complete <- hclust(d, method = "complete")

# Height at which two observations first end up in the same cluster.
#
# Args:
#   hc:     an "hclust" object.
#   label1: index (1-based position in the original data) of the first
#           observation.
#   label2: index of the second observation.
# Returns:
#   The dendrogram height of the merge that first joins the two observations,
#   or NA if either index is missing/out of range or both refer to the same
#   observation.
#
# The height is read from the cophenetic distance matrix, so it is correct
# both when the two observations are merged directly as singletons and when
# they only meet later as members of larger clusters (scanning hc$merge for a
# row containing both negative indices only covers the first case).
get_fusion_height <- function(hc, label1, label2) {
  n_obs <- length(hc$order)
  idx <- as.integer(c(label1, label2))

  # Nothing sensible to report for invalid or identical observations
  if (length(idx) != 2L || anyNA(idx) || any(idx < 1L) || any(idx > n_obs) ||
      idx[1] == idx[2]) {
    return(NA)
  }

  # Entry [i, j] is the height at which observations i and j are first fused
  fusion_heights <- as.matrix(cophenetic(hc))
  fusion_heights[idx[1], idx[2]]
}

# Fusion height of observations 5 and 6 under each linkage
# (requires get_fusion_height(), hc_single and hc_complete to exist already)
height_single_56 <- get_fusion_height(hc_single, label1 = 5, label2 = 6)
height_complete_56 <- get_fusion_height(hc_complete, label1 = 5, label2 = 6)

# Report the results
cat(
  "Height at which {5,6} are fused:\n"
)

## Height at which {5,6} are fused:

cat(
  "Single linkage:", height_single_56, "\n"
)

## Single linkage: 0.1414214

cat(
  "Complete linkage:", height_complete_56, "\n"
)

## Complete linkage: 0.1414214

Since clusters {5} and {6} each consist of a single observation, the distance between them is the same regardless of the linkage method. Therefore, they will fuse at the same height in both the single linkage and complete linkage dendrograms.

exercise 8

2025-04-29