Summary

K-means implemented in R. The code comes from the Machine Learning course by Andrew Ng on Coursera. It was originally written in Matlab/Octave as part of the weekly exercises; this is a port to R.

The algorithm repeatedly carries out two steps:

  1. Assigning each training example x(i) to its closest centroid

  2. Recomputing the mean of each centroid using the points assigned to it

The K-means algorithm will always converge to some final set of means for the centroids. Note that the converged solution may not always be ideal and depends on the initial setting of the centroids. Therefore, in practice the K-means algorithm is usually run a few times with different random initializations. One way to choose between these different solutions is to pick the one with the lowest cost function value (distortion); this is what the kmeanLocalOptima function below does.

The runkMeans function

The function runs the K-means algorithm on data matrix X, where each row of X is a single example, using initial_centroids as the initial centroids.

max_iters specifies the total number of iterations of K-means to execute.

runkMeans returns centroids, a K x n matrix of the computed centroids, and leaves idx, an m x 1 vector of centroid assignments (each entry in the range [1..K]), in the global environment.

runkMeans <- function(X, initial_centroids, max_iters){
  
  # Initialize values
  K <- nrow(initial_centroids)
  centroids <- initial_centroids
  m <- nrow(X)
  J_matrix <- matrix(0, max_iters, 2)
  J_matrix[ ,1] <- 1:max_iters
  colnames(J_matrix) <- c("iterations", "J")
  
  # Run K-Means
  for (i in 1 : max_iters){

    # For each example in X, assign it to the closest centroid
    idx <- findClosestCentroids(X, centroids)
    # Given the memberships, compute new centroids
    centroids <- computeCentroids(X, idx, K)
    
    # Compute the cost to check that everything is working correctly
    J <- calculateCost(X, centroids, idx)
    J_matrix[i,2] <- J
  }
  assign("centroids", centroids, .GlobalEnv)
  assign("J_matrix", J_matrix, .GlobalEnv)
  return(centroids)
}

Initialize random centroids

Initialize the centroids to K randomly chosen examples from X. K should be less than m, the number of examples.

kMeansInitCentroids <- function(X, K){
  randidx <- X[sample(1:nrow(X)), ]   # Randomly reorder the rows of X
  initial_centroids <- randidx[1:K, ] # Take the first K examples as centroids
  rownames(initial_centroids) <- NULL
  assign("initial_centroids", initial_centroids, .GlobalEnv)
  return(initial_centroids)
}
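The initialization is random, so the resulting clusters can differ between runs; setting a seed makes a run reproducible. A minimal usage sketch (the seed value is arbitrary):

set.seed(123)             # any fixed seed makes the sampling reproducible
X <- as.matrix(iris[,1:4])
kMeansInitCentroids(X, 3) # leaves a 3 x 4 initial_centroids matrix in the global environment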

Find closest centroids

Go over every example, find its closest centroid, and store the index inside idx at the appropriate location

findClosestCentroids <- function(X, centroids){

  K <- nrow(centroids) # Set K
  m <- nrow(X) # set m
  idx <- vector(mode = "numeric", length = m)
  
  for (i in 1 : m){
      # Euclidean distance between example i and each of the K centroids
      Compare <- vector(mode = "numeric", length = K)
      
      for (j in 1 : K){
          Compare[j] <- sqrt(sum((X[i,] - centroids[j,])^2))
      }
  
      idx[i] <- which.min(Compare)
  }
  assign("idx", idx, .GlobalEnv)
  return(idx)
}
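For large m the double loop gets slow. The same assignment step can be vectorized with the expansion ‖x − c‖² = ‖x‖² − 2 x·c + ‖c‖²; this is my own rewrite, not part of the course code (findClosestCentroidsVec is a hypothetical name):

findClosestCentroidsVec <- function(X, centroids){
  # m x K matrix of squared distances, built without explicit loops
  d2 <- outer(rowSums(X^2), rep(1, nrow(centroids))) -
        2 * X %*% t(centroids) +
        outer(rep(1, nrow(X)), rowSums(centroids^2))
  max.col(-d2, ties.method = "first") # row-wise argmin
}

It returns the same idx vector (up to ties) as the loop version above.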

Compute centroids

computeCentroids returns the new centroids by computing the means of the data points assigned to each centroid.

computeCentroids <- function(X, idx, K){
  
  centroids <- matrix(0, K, ncol(X))
  
  for (i in 1 : K){
    
    for (j in 1 : ncol(X)){
        A <- sum(X[idx == i, j]) # sum of column j over the assigned points
        B <- sum(idx == i)       # number of points assigned to centroid i
        centroids[i,j] <- A / B
    }
  }
  return(centroids)
}
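The inner column loop can be collapsed with colMeans; a sketch of the same update (my own rewrite, computeCentroidsVec is a hypothetical name; note that in both versions an empty cluster produces NaN, since we divide by a zero count):

computeCentroidsVec <- function(X, idx, K){
  centroids <- matrix(0, K, ncol(X))
  for (i in 1:K){
    # drop = FALSE keeps the subset a matrix even when a single row matches
    centroids[i, ] <- colMeans(X[idx == i, , drop = FALSE])
  }
  return(centroids)
}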

Optimization objective

This function calculates the cost J at each iteration. J should decrease (or stay constant) at every iteration; if it does not, there is a bug!
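Concretely, the distortion is the mean squared distance between each example and its assigned centroid,

J = (1/m) * Σ_{i=1..m} ‖x(i) − μ_idx(i)‖²

where μ_idx(i) is the centroid that example x(i) is currently assigned to. This is exactly what the function below computes.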

calculateCost <- function(X, centroids, idx){
  
  m <- nrow(X) # set m
  m_vect <- vector(mode = "numeric", length = m) # squared distance per example
  
  for (l in 1 : m){
    m_vect[l] <- sum((X[l,] - centroids[idx[l],])^2)
  }
  J <- (1/m) * sum(m_vect)
  return(J)
}
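For reference, the same cost fits in one vectorized line (my own rewrite; centroids[idx, ] expands the centroid matrix to one row per example):

calculateCostVec <- function(X, centroids, idx){
  mean(rowSums((X - centroids[idx, ])^2))
}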

Test with the Iris dataset

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
X <- as.matrix(iris[,1:4]) # as.matrix for speed
X <- X[sample(1:nrow(X)), ] # shuffle the rows

kMeansInitCentroids(X,3) # initialize the centroids

runkMeans(X, initial_centroids, max_iters = 8) # run K-means
##          [,1]     [,2]     [,3]     [,4]
## [1,] 5.006000 3.428000 1.462000 0.246000
## [2,] 6.853846 3.076923 5.715385 2.053846
## [3,] 5.883607 2.740984 4.388525 1.434426
library(ggplot2)

ggplot(as.data.frame(J_matrix), aes(x = iterations, y = J)) +
  geom_point() +
  ggtitle("Cost Reduction")

Local optima

K-means can get stuck in a poor local optimum. To avoid this, we run K-means many times with different random initializations (usually between 50 and 1000) and keep the best run. This matters most when the number of clusters is small (between 1 and 10).

kmeanLocalOptima <- function(X, K, Outer_ite = 10, in_ite = 10){
  
  J_comp <- vector(mode="numeric", length= Outer_ite)
  centroids_comp <- vector("list", Outer_ite) 
  
  for (z in 1 : Outer_ite){
    # Initialize centroids
    centroids <- kMeansInitCentroids(X, K)
    # Run K-means and keep the converged centroids, not the initial ones
    centroids <- runkMeans(X, centroids, in_ite)
    # Save the final J and the centroids of this run
    J_comp[z] <- J_matrix[nrow(J_matrix),2]
    centroids_comp[[z]] <- centroids
  }
  # Keep the run with the lowest final cost
  centroids_comp <- centroids_comp[[which.min(J_comp)]]
  J_comp <- min(J_comp)
  
  assign("J_comp", J_comp, .GlobalEnv)
  assign("centroids_comp", centroids_comp, .GlobalEnv)
  print(paste("min J", J_comp))
  return(centroids_comp)
}

Test on Iris vs the kmeans function from R

X <- as.matrix(iris[,1:4])

kmeanLocalOptima(X, 3, Outer_ite = 10, in_ite = 10)
## [1] "min J 0.525676276174307"
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]          6.0         2.2          5.0         1.5
## [2,]          4.4         2.9          1.4         0.2
## [3,]          7.2         3.6          6.1         2.5
# our guess (idx from the last run)
table(idx, iris$Species)
##    
## idx setosa versicolor virginica
##   1     50          0         0
##   2      0          2        36
##   3      0         48        14
kmean_R <- kmeans(X, 3, nstart = 20)

# the kmean function from R 
table(kmean_R$cluster, iris$Species)
##    
##     setosa versicolor virginica
##   1     50          0         0
##   2      0          2        36
##   3      0         48        14

Pretty close!

Guessing the number of clusters: the Elbow method

If we plot the number of clusters against the corresponding cost J, we can see an “elbow” which indicates the right number of clusters to pick.

elbow <- matrix(0, 4, 2)
elbow[,1] <- 2:5

for (h in 1 : 4){
  kmeanLocalOptima(X, h+1, Outer_ite = 10, in_ite = 10)
  elbow[h,2] <- J_comp
}
## [1] "min J 1.01565301173572"
## [1] "min J 0.525676276174307"
## [1] "min J 0.381706728771454"
## [1] "min J 0.309814867724868"
elbow <- as.data.frame(elbow)
colnames(elbow) <- c("K", "J")

ggplot(elbow, aes(x = K, y = J)) +
  geom_line() +
  geom_point(size = 3) +
  xlab("K, number of clusters") +
  ylab("Cost function J") +
  ggtitle("Elbow method")

3 is the number we wanted!