# The six observations, written one (X1, X2) pair per row, then wrapped in a
# data frame together with an observation number.
obs_coords <- rbind(
  c(1, 4),
  c(1, 3),
  c(0, 4),
  c(5, 1),
  c(6, 2),
  c(4, 0)
)
data <- data.frame(
  Obs = seq_len(nrow(obs_coords)),
  X1 = obs_coords[, 1],
  X2 = obs_coords[, 2]
)
# Scatter plot of the raw observations. The axis limits are padded beyond the
# data range so the labels drawn above each point stay inside the plot region.
plot(
  x = data$X1, y = data$X2,
  xlim = c(-1, 7), ylim = c(-1, 5),
  xlab = "X1", ylab = "X2",
  main = "Initial Observations",
  pch = 19
)
# pos = 3 places each observation number just above its point.
text(x = data$X1, y = data$X2, labels = data$Obs, pos = 3)
# Randomly assign every observation to one of the two clusters.
set.seed(123) # For reproducibility
# Draw one label per row of `data` rather than hard-coding the row count.
initial_clusters <- sample(1:2, size = nrow(data), replace = TRUE)
# Both labels must occur at least once: an empty cluster has no centroid, and
# the later steps look the two centroids up by position.
stopifnot(all(1:2 %in% initial_clusters))
data$Cluster <- initial_clusters
print(data[, c("Obs", "Cluster")])
# NOTE(review): the labels echoed below presumably come from the default
# sampling algorithm of R >= 3.6; older versions may draw differently - confirm.
## Obs Cluster
## 1 1 1
## 2 2 1
## 3 3 1
## 4 4 2
## 5 5 1
## 6 6 2
# Centroid of each cluster: the mean of X1 and of X2 over the observations
# currently carrying that cluster label.
centroids <- aggregate(cbind(X1, X2) ~ Cluster, data = data, FUN = mean)
print(centroids)
## Cluster X1 X2
## 1 1 2.0 3.25
## 2 2 4.5 0.50
# Euclidean distance from every observation to each of the two centroids
# (row 1 and row 2 of `centroids`).
centroid_1 <- centroids[1, ]
centroid_2 <- centroids[2, ]
dist_to_c1 <- sqrt(
  (data$X1 - centroid_1$X1)^2 + (data$X2 - centroid_1$X2)^2
)
dist_to_c2 <- sqrt(
  (data$X1 - centroid_2$X1)^2 + (data$X2 - centroid_2$X2)^2
)
# An observation goes to cluster 1 only when it is strictly closer to the
# first centroid; otherwise (including ties) it goes to cluster 2.
data$NewCluster <- ifelse(dist_to_c2 > dist_to_c1, 1, 2)
print(data[, c("Obs", "NewCluster")])
## Obs NewCluster
## 1 1 1
## 2 2 1
## 3 3 1
## 4 4 2
## 5 5 2
## 6 6 2
# TRUE only when no observation switched cluster between the previous
# assignment (Cluster) and the updated one (NewCluster).
!any(data$Cluster != data$NewCluster)
## [1] FALSE
Repeat (c) and (d) until cluster assignments stop changing. Typically converges in 2-3 iterations for this small dataset.
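After the 1st iteration, check whether the clusters changed: observation 5 moved from cluster 1 to cluster 2, so (c) and (d) are repeated with the new assignments.
After the 2nd iteration the cluster assignments have not changed.
Thus we can conclude that the clusters have stabilized.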
Final centroids are:
Cluster 1 centroid = (0.67, 3.67)
Cluster 2 centroid = (5, 1)
The clusters stabilize at observations {1, 2, 3} and {4, 5, 6}.
The final centroids are the means of the member observations:
Cluster 1: (0.67, 3.67) - mean of obs 1, 2, 3
Cluster 2: (5, 1) - mean of obs 4, 5, 6
# Final clustering: colour every observation by the cluster it belongs to.
# The cluster ids (1 and 2) are used directly as colour indices, which is why
# the legend below uses col = 1:2.
final_clusters <- c(1, 1, 1, 2, 2, 2)
# Use the same padded axis limits as the initial scatter plot so both figures
# share one scale and the labels drawn above the top points are not clipped.
plot(
  data$X1, data$X2,
  pch = 19, col = final_clusters,
  xlab = "X1", ylab = "X2", main = "Final Clustering",
  xlim = c(-1, 7), ylim = c(-1, 5)
)
# pos = 3 places each observation number just above its point.
text(data$X1, data$X2, labels = data$Obs, pos = 3)
legend(
  "topright",
  legend = c("Cluster 1", "Cluster 2"),
  col = 1:2, pch = 19
)