DATA 502 HW 3

Final clusters: {(1,1), (1,2), (2,1)} with centroid (1.33333, 1.33333); {(2,3), (3,3)} with centroid (2.5, 3); {(4,5), (5,4), (6,5)} with centroid (5, 4.66667). SSE: 226.6667

This function carries out the k-means algorithm on the given data and starting centroids. I comment on each step of the code to explain what it does.

# Eight two-dimensional observations (columns x and y) to be clustered.
data <- data.frame(
  x = c(1, 1, 2, 2, 3, 4, 5, 6),
  y = c(1, 2, 1, 3, 3, 5, 4, 5)
)


# Three-cluster k-means on two-dimensional points.
#
# Arguments:
#   c1, c2, c3    Numeric vectors of length 2 giving the starting (x, y)
#                 centroids.
#   observations  Data frame whose first two columns hold the numeric x and y
#                 coordinates. Defaults to the `data` object visible from the
#                 environment in which this function is defined.
#
# On every pass the function assigns each observation to its nearest centroid,
# recomputes the centroids as the mean of their members, and prints the
# clusters, the updated centroids and the SSE. It stops once the centroids no
# longer change between passes.
#
# Returns a list with elements cluster1, centroid1, cluster2, centroid2,
# cluster3, centroid3 and SSE, where SSE is the sum of squared distances from
# each observation to the centroid of the cluster it belongs to.
k.means <- function(c1, c2, c3, observations = data) {
  stopifnot(
    is.numeric(c1), length(c1) == 2,
    is.numeric(c2), length(c2) == 2,
    is.numeric(c3), length(c3) == 2,
    is.data.frame(observations),
    ncol(observations) >= 2,
    nrow(observations) >= 1
  )

  # Work on a plain numeric matrix so distances can be computed for all
  # observations at once instead of row by row.
  coords <- as.matrix(observations[, c(1, 2)])
  stopifnot(is.numeric(coords), !anyNA(coords), !anyNA(c(c1, c2, c3)))

  # Drop any names so the centroid vectors compare cleanly between passes.
  c1 <- as.numeric(c1)
  c2 <- as.numeric(c2)
  c3 <- as.numeric(c3)

  # Squared Euclidean distance from every observation to one centroid. The
  # square root is skipped because it does not change which centroid is
  # nearest.
  sq_dist_to <- function(center) {
    (coords[, 1] - center[1])^2 + (coords[, 2] - center[2])^2
  }

  # Mean of the member observations; a cluster that received no observations
  # keeps its previous centroid rather than becoming NaN.
  new_centroid <- function(members, previous) {
    if (!any(members)) {
      return(previous)
    }
    unname(colMeans(coords[members, , drop = FALSE]))
  }

  repeat {
    # Assignment step: index (1, 2 or 3) of the nearest centroid for each
    # observation. which.min() returns the first minimum, so a tie goes to the
    # lower-numbered of the tied centroids, never to a farther one.
    sq_dist <- cbind(sq_dist_to(c1), sq_dist_to(c2), sq_dist_to(c3))
    assignment <- apply(sq_dist, 1, which.min)

    # Update step: remember the old centroids, then recompute each one.
    prev_centroids <- c(c1, c2, c3)
    c1 <- new_centroid(assignment == 1, c1)
    c2 <- new_centroid(assignment == 2, c2)
    c3 <- new_centroid(assignment == 3, c3)
    centroids <- c(c1, c2, c3)

    # SSE: squared distance from each observation to its own (updated)
    # centroid only, summed over all observations.
    centers <- rbind(c1, c2, c3)
    SSE <- sum((coords - centers[assignment, , drop = FALSE])^2)

    result <- list(
      cluster1 = observations[assignment == 1, c(1, 2), drop = FALSE],
      centroid1 = c1,
      cluster2 = observations[assignment == 2, c(1, 2), drop = FALSE],
      centroid2 = c2,
      cluster3 = observations[assignment == 3, c(1, 2), drop = FALSE],
      centroid3 = c3,
      SSE = SSE
    )

    # The clusters, new centroids, and SSE are printed for every pass.
    print(result)

    # Stop once the centroids are numerically unchanged from the last pass.
    if (isTRUE(all.equal(prev_centroids, centroids))) break
  }

  result
}

# Run the clustering from the starting centroids (2, 3), (3, 3) and (5, 4).
k.means(c1 = c(2, 3), c2 = c(3, 3), c3 = c(5, 4))

## $cluster1
##   x y
## 1 1 1
## 2 1 2
## 3 2 1
## 4 2 3
## 
## $centroid1
## [1] 1.50 1.75
## 
## $cluster2
##   x y
## 5 3 3
## 
## $centroid2
## [1] 3 3
## 
## $cluster3
##   x y
## 6 4 5
## 7 5 4
## 8 6 5
## 
## $centroid3
## [1] 5.000000 4.666667
## 
## $SSE
## [1] 174
## 
## $cluster1
##   x y
## 1 1 1
## 2 1 2
## 3 2 1
## 
## $centroid1
## [1] 1.333333 1.333333
## 
## $cluster2
##   x y
## 4 2 3
## 5 3 3
## 
## $centroid2
## [1] 2.5 3.0
## 
## $cluster3
##   x y
## 6 4 5
## 7 5 4
## 8 6 5
## 
## $centroid3
## [1] 5.000000 4.666667
## 
## $SSE
## [1] 210.7222
## 
## $cluster1
##   x y
## 1 1 1
## 2 1 2
## 3 2 1
## 
## $centroid1
## [1] 1.333333 1.333333
## 
## $cluster2
##   x y
## 4 2 3
## 5 3 3
## 
## $centroid2
## [1] 2.5 3.0
## 
## $cluster3
##   x y
## 6 4 5
## 7 5 4
## 8 6 5
## 
## $centroid3
## [1] 5.000000 4.666667
## 
## $SSE
## [1] 226.6667

## $cluster1
##   x y
## 1 1 1
## 2 1 2
## 3 2 1
## 
## $centroid1
## [1] 1.333333 1.333333
## 
## $cluster2
##   x y
## 4 2 3
## 5 3 3
## 
## $centroid2
## [1] 2.5 3.0
## 
## $cluster3
##   x y
## 6 4 5
## 7 5 4
## 8 6 5
## 
## $centroid3
## [1] 5.000000 4.666667
## 
## $SSE
## [1] 226.6667

DATA 502 HW 3

2025-11-10