library(cluster)
# Pairwise dissimilarities between four observations
# (symmetric matrix with a zero diagonal).
dissim_values <- matrix(c(
  0,   0.3, 0.4,  0.7,
  0.3, 0,   0.5,  0.8,
  0.4, 0.5, 0,    0.45,
  0.7, 0.8, 0.45, 0
), nrow = 4, byrow = TRUE)
rownames(dissim_values) <- colnames(dissim_values) <-
  c("Obs1", "Obs2", "Obs3", "Obs4")
diss_matrix <- as.dist(dissim_values)

# Complete linkage: clusters fuse at their largest pairwise dissimilarity.
hc_complete <- hclust(diss_matrix, method = "complete")
plot(hc_complete, main = "Dendrogram - Complete Linkage", ylab = "Height")

# Single linkage: clusters fuse at their smallest pairwise dissimilarity.
hc_single <- hclust(diss_matrix, method = "single")
plot(hc_single, main = "Dendrogram - Single Linkage", ylab = "Height")

# Cut each tree into two clusters.
cutree(hc_complete, k = 2)
## Obs1 Obs2 Obs3 Obs4
## 1 1 2 2
cutree(hc_single, k = 2)
## Obs1 Obs2 Obs3 Obs4
## 1 1 1 2

# Draw an equivalent complete-linkage dendrogram with the leaves in a different
# left-to-right order. The `labels` argument of plot.hclust() only renames the
# observations in their ORIGINAL order -- it does not move leaves -- so passing
# a permuted label vector would attach the wrong names to the fusion heights.
# Instead, permute the `order` component on a copy: the merge structure and
# heights are untouched and every label stays with its own observation.
# 'hang = -1' makes all leaves hang down to height 0.
hc_reordered <- hc_complete
hc_reordered$order <- match(
  c("Obs4", "Obs3", "Obs1", "Obs2"),
  hc_complete$labels
)
plot(hc_reordered, hang = -1,
     main = "Equivalent Dendrogram (Leaf Order Changed)")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Toy data set: six observations measured on two features.
# The coordinate vectors are kept as separate objects so they can be reused.
x_vals <- c(1, 1, 0, 5, 6, 4)
y_vals <- c(4, 3, 4, 1, 2, 0)
obs_df <- data.frame(
  ID = seq_along(x_vals),
  Feature1 = x_vals,
  Feature2 = y_vals
)
# Scatterplot of the raw observations, before any cluster labels exist
ggplot(data = obs_df, mapping = aes(x = Feature1, y = Feature2)) +
  geom_point(size = 3) +
  ggtitle("Initial Scatterplot") +
  xlab("X1") +
  ylab("X2")
# Randomly assign every observation to one of two clusters.
# The seed is fixed so the starting assignment is reproducible.
set.seed(42)
obs_df[["Group"]] <- sample(1:2, size = nrow(obs_df), replace = TRUE)
obs_df
## ID Feature1 Feature2 Group
## 1 1 1 4 1
## 2 2 1 3 1
## 3 3 0 4 1
## 4 4 5 1 1
## 5 5 6 2 2
## 6 6 4 0 2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Centroid of each cluster: the mean of both features within each Group
centroids <- summarise(
  group_by(obs_df, Group),
  C1 = mean(Feature1),
  C2 = mean(Feature2)
)
centroids
## # A tibble: 2 × 3
## Group C1 C2
## <int> <dbl> <dbl>
## 1 1 1.75 3
## 2 2 5 1
#' Assign each observation to its nearest centroid
#'
#' Computes the Euclidean distance from every row of `data` to every row of
#' `centers` and records which centroid is closest.
#'
#' The coordinate columns are read directly instead of looping with `apply()`
#' over the whole data frame: `apply()` first coerces all columns to a single
#' type, so a factor or character column (for example a `Group` label) would
#' turn the coordinates into strings and break the arithmetic.
#'
#' @param data A data frame with numeric columns `Feature1` and `Feature2`.
#' @param centers A data frame or tibble with numeric columns `C1` and `C2`,
#'   one row per centroid. Other columns are ignored.
#' @return `data` with an integer column `NewGroup` added (or overwritten)
#'   giving, for each observation, the row index in `centers` of the nearest
#'   centroid. Ties go to the first such row. Note that this is a row
#'   position: it equals the cluster label only when the rows of `centers`
#'   are labelled 1, 2, ... in order.
assign_cluster <- function(data, centers) {
  px <- data[["Feature1"]]
  py <- data[["Feature2"]]
  cx <- centers[["C1"]]
  cy <- centers[["C2"]]
  stopifnot(
    is.numeric(px), is.numeric(py),
    is.numeric(cx), is.numeric(cy),
    length(cx) > 0
  )

  # Observation-by-centroid matrix of Euclidean distances.
  distances <- sqrt(outer(px, cx, "-")^2 + outer(py, cy, "-")^2)

  # which.min() picks the first minimum in each row, matching the tie rule of
  # the per-row search it replaces.
  data$NewGroup <- vapply(
    seq_len(nrow(distances)),
    function(i) which.min(distances[i, ]),
    integer(1)
  )
  data
}
# First reassignment: move every observation to its nearest centroid
obs_df <- assign_cluster(obs_df, centroids)
obs_df
## ID Feature1 Feature2 Group NewGroup
## 1 1 1 4 1 1
## 2 2 1 3 1 1
## 3 3 0 4 1 1
## 4 4 5 1 1 2
## 5 5 6 2 2 2
## 6 6 4 0 2 2
# Alternate between recomputing centroids and reassigning observations until
# a reassignment leaves every cluster label unchanged.
converged <- FALSE
while (!converged) {
  # Adopt the latest assignment as the current clustering
  obs_df$Group <- obs_df$NewGroup
  centroids <- summarise(
    group_by(obs_df, Group),
    C1 = mean(Feature1),
    C2 = mean(Feature2)
  )
  updated_df <- assign_cluster(obs_df, centroids)
  converged <- all(updated_df$NewGroup == obs_df$Group)
  if (!converged) {
    obs_df <- updated_df
  }
}
obs_df
## ID Feature1 Feature2 Group NewGroup
## 1 1 1 4 1 1
## 2 2 1 3 1 1
## 3 3 0 4 1 1
## 4 4 5 1 2 2
## 5 5 6 2 2 2
## 6 6 4 0 2 2
# Treat the cluster label as categorical so each cluster gets a discrete colour
obs_df[["Group"]] <- as.factor(obs_df[["Group"]])
ggplot(
  data = obs_df,
  mapping = aes(x = Feature1, y = Feature2, colour = Group)
) +
  geom_point(size = 3) +
  ggtitle("Final Clustering Result") +
  xlab("X1") +
  ylab("X2")
# Example dissimilarities between five observations: {1, 2, 3} are mutually
# close, {4, 5} are close to each other, and the two groups are far apart.
d_matrix <- matrix(c(
  0, 1, 1, 4, 5,
  1, 0, 1, 4, 5,
  1, 1, 0, 4, 5,
  4, 4, 4, 0, 1,
  5, 5, 5, 1, 0
), nrow = 5, byrow = TRUE)
colnames(d_matrix) <- rownames(d_matrix) <- c("1", "2", "3", "4", "5")
d <- as.dist(d_matrix)
# Perform both clusterings
hc_single <- hclust(d, method = "single")
hc_complete <- hclust(d, method = "complete")
# Plot both side by side. par() returns the previous settings, which are
# restored afterwards so the two-panel layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
plot(hc_single, main = "Single Linkage Dendrogram")
plot(hc_complete, main = "Complete Linkage Dendrogram")
par(old_par)
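Single linkage fuses two clusters at the minimum dissimilarity between any pair of points taken one from each cluster, while complete linkage uses the maximum. Because a maximum is never smaller than the corresponding minimum, the same fusion occurs under complete linkage at a height greater than or equal to its height under single linkage; the two heights coincide only when all the dissimilarities between the two clusters are identical (which is uncommon).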
# Simulated data: 5 and 6 are close, 7 is farther
data_three <- matrix(c(
  1, 1,   # obs 5
  4, 4,   # obs 6
  10, 10  # extra obs 7 (to allow plotting)
), ncol = 2, byrow = TRUE)
rownames(data_three) <- c("5", "6", "7")
# Compute the Euclidean distance matrix
d3 <- dist(data_three)
# Perform clustering
hc_s3 <- hclust(d3, method = "single")
hc_c3 <- hclust(d3, method = "complete")
# Plot dendrograms side by side. par() returns the previous settings, which
# are restored afterwards so the two-panel layout does not leak into later
# plots.
old_par <- par(mfrow = c(1, 2))
plot(hc_s3, main = "Single Linkage (5,6,7)")
plot(hc_c3, main = "Complete Linkage (5,6,7)")
par(old_par)
The leaves {5} and {6} will fuse at the same height in both dendrograms.
Since the fusion involves just two single points, there’s only one distance value to consider. Both single and complete linkage rely on that same distance when merging individual observations, resulting in the same fusion height regardless of the method used.