K-means implemented in R. The code comes from the Machine Learning course by Andrew Ng on Coursera. It was originally written in Matlab/Octave (as part of the weekly exercises); this is a transposition into R.
The algorithm repeatedly carries out two steps:
Assigning each training example x(i) to its closest centroid
Recomputing the mean of each centroid using the points assigned to it
The K-means algorithm will always converge to some final set of means for the centroids. Note that the converged solution may not always be ideal and depends on the initial setting of the centroids. Therefore, in practice, the K-means algorithm is usually run a few times with different random initializations. One way to choose between these different solutions from different random initializations is to pick the one with the lowest cost function value (distortion).
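Concretely, the cost (distortion) being minimized is J = (1/m) * sum over i of ||x(i) - mu_c(i)||^2, where mu_c(i) is the centroid that example x(i) is currently assigned to; in other words, the average squared distance between each example and its centroid. This is what the calculateCost function below computes.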
The function runs the K-means algorithm on the data matrix X, where each row of X is a single example, starting from initial_centroids.
max_iters specifies the total number of iterations of K-means to execute.
runkMeans returns centroids, a K x n matrix of the computed centroids, and idx, an m x 1 vector of centroid assignments (i.e. each entry is in the range [1..K]).
runkMeans <- function(X, initial_centroids, max_iters){
  # Initialize values
  K <- nrow(initial_centroids)
  centroids <- initial_centroids
  m <- nrow(X)
  J_matrix <- matrix(0, max_iters, 2)
  J_matrix[, 1] <- 1:max_iters
  colnames(J_matrix) <- c("iterations", "J")
  # Run K-Means
  for (i in 1:max_iters){
    # For each example in X, assign it to the closest centroid
    idx <- findClosestCentroids(X, centroids)
    # Given the memberships, compute new centroids
    centroids <- computeCentroids(X, idx, K)
    # Compute the cost to check if everything is working correctly
    J <- calculateCost(X, centroids, idx)
    J_matrix[i, 2] <- J
  }
  assign("centroids", centroids, .GlobalEnv)
  assign("J_matrix", J_matrix, .GlobalEnv)
  return(centroids)
}
Initialize the centroids to be K randomly chosen examples (K should be smaller than m).
kMeansInitCentroids <- function(X, K){
  randidx <- X[sample(1:nrow(X)), ]      # Randomly reorder the rows of X
  initial_centroids <- randidx[1:K, ]    # Take the first K examples as centroids
  rownames(initial_centroids) <- NULL
  assign("initial_centroids", initial_centroids, .GlobalEnv)
  return(initial_centroids)
}
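A quick sanity check (a minimal sketch, not part of the original exercise; the seed and the name X_check are arbitrary): since the initial centroids are sampled rows of X, each one should be an actual example.
X_check <- as.matrix(iris[, 1:4])
set.seed(1)                               # arbitrary seed, only for reproducibility
init <- kMeansInitCentroids(X_check, 3)   # also creates initial_centroids in the global environment
init                                      # a 3 x 4 matrix whose rows are actual rows of X_check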
Go over every example, find its closest centroid, and store the index inside idx at the appropriate location
findClosestCentroids <- function(X, centroids){
  K <- nrow(centroids)  # set K
  m <- nrow(X)          # set m
  idx <- vector(mode = "numeric", length = m)
  for (i in 1:m){
    # Euclidean distance to each centroid: sqrt(sum((x - centroid)^2))
    Compare <- vector(mode = "numeric", length = K)
    for (j in 1:K){
      Compare[j] <- sqrt(sum((X[i, ] - centroids[j, ])^2))
    }
    idx[i] <- which.min(Compare)
  }
  assign("idx", idx, .GlobalEnv)
  return(idx)
}
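As a side note, the inner loop can be vectorized. The sketch below is not the course code, just an equivalent base-R idiom that produces the same assignments:
findClosestCentroidsVec <- function(X, centroids){
  # m x K matrix of squared distances from every example to every centroid
  # (the square root is omitted because it does not change the argmin)
  d2 <- sapply(seq_len(nrow(centroids)),
               function(j) rowSums(sweep(X, 2, centroids[j, ])^2))
  max.col(-d2, ties.method = "first")  # column index of the smallest distance in each row
}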
computeCentroids returns the new centroids by computing the means of the data points assigned to each centroid
computeCentroids <- function(X, idx, K){
  centroids <- matrix(0, K, ncol(X))
  for (i in 1:K){
    for (j in 1:ncol(X)){
      A <- sum(X[idx == i, j])      # sum the column
      B <- length(idx[idx == i])    # count
      centroids[i, j] <- A / B
    }
  }
  return(centroids)
}
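The same update can be written more compactly with colMeans; a minimal equivalent sketch (like the loop above, a cluster that ends up with no points would produce NaN):
computeCentroidsVec <- function(X, idx, K){
  # For each cluster, take the column-wise mean of its assigned points
  t(sapply(1:K, function(i) colMeans(X[idx == i, , drop = FALSE])))
}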
This function calculates the cost J for each iteration. J should decrease at every iteration; if it does not, there is a bug!
calculateCost <- function(X, centroids, idx){
  J <- 0
  m <- nrow(X)  # set m
  m_vect <- vector(mode = "numeric", length = m)
  for (l in 1:m){
    m_vect[l] <- sum((X[l, ] - centroids[idx[l], ])^2)
  }
  J <- (1/m) * sum(m_vect)
  return(J)
}
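Because centroids[idx, ] expands to the centroid assigned to each example, the loop above can also be collapsed into one vectorized line (an equivalent sketch):
calculateCostVec <- function(X, centroids, idx){
  mean(rowSums((X - centroids[idx, , drop = FALSE])^2))
}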
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
X <- as.matrix(iris[, 1:4])  # as.matrix for speed
X <- X[sample(1:nrow(X)), ]
kMeansInitCentroids(X, 3)    # initialize the centroids
runkMeans(X, initial_centroids, max_iters = 8)  # run K-means
## [,1] [,2] [,3] [,4]
## [1,] 5.006000 3.428000 1.462000 0.246000
## [2,] 6.853846 3.076923 5.715385 2.053846
## [3,] 5.883607 2.740984 4.388525 1.434426
library(ggplot2)
ggplot(as.data.frame(J_matrix), aes(x = iterations, y = J)) +
  geom_point() +
  ggtitle("Cost Reduction")
K-means can get stuck in a bad local optimum. To avoid this, we initialize K-means many times (usually between 50 and 1000 times) and keep the best run. This works well when the number of clusters is between 1 and 10.
kmeanLocalOptima <- function(X, K, Outer_ite = 10, in_ite = 10){
  J_comp <- vector(mode = "numeric", length = Outer_ite)
  centroids_comp <- vector("list", Outer_ite)
  for (z in 1:Outer_ite){
    # Initialize centroids
    centroids <- kMeansInitCentroids(X, K)
    # Run K-means and keep the converged centroids (not the initial ones)
    centroids <- runkMeans(X, centroids, in_ite)
    # Save J and the centroids of this run
    J_comp[z] <- J_matrix[nrow(J_matrix), 2]
    centroids_comp[[z]] <- centroids
  }
  # Keep the run with the lowest cost
  centroids_comp <- centroids_comp[[which.min(J_comp)]]
  J_comp <- min(J_comp)
  assign("J_comp", J_comp, .GlobalEnv)
  assign("centroids_comp", centroids_comp, .GlobalEnv)
  print(paste("min J", J_comp))
  return(centroids_comp)
}
X <- as.matrix(iris[,1:4])
kmeanLocalOptima(X, 3, Outer_ite = 10, in_ite = 10)
## [1] "min J 0.525676276174307"
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 6.0 2.2 5.0 1.5
## [2,] 4.4 2.9 1.4 0.2
## [3,] 7.2 3.6 6.1 2.5
# our guess
table(idx, iris$Species)
##
## idx setosa versicolor virginica
## 1 50 0 0
## 2 0 2 36
## 3 0 48 14
kmean_R <- kmeans(X, 3, nstart = 20)
# the built-in kmeans function from R
table(kmean_R$cluster, iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 2 36
## 3 0 48 14
Pretty close!
If we plot the number of clusters against the corresponding cost function J, we can see an “elbow”, which indicates a sensible number of clusters to use.
elbow <- matrix(0, 4, 2)
elbow[, 1] <- 2:5
for (h in 1:4){
  kmeanLocalOptima(X, h + 1, Outer_ite = 10, in_ite = 10)
  elbow[h, 2] <- J_comp
}
## [1] "min J 1.01565301173572"
## [1] "min J 0.525676276174307"
## [1] "min J 0.381706728771454"
## [1] "min J 0.309814867724868"
elbow <- as.data.frame(elbow)
colnames(elbow) <- c("K", "J")
ggplot(elbow, aes(x = K, y = J)) +
  geom_line() +
  geom_point(size = 3) +
  xlab("K number of clusters") +
  ylab("Cost function J") +
  ggtitle("Elbow method")
3 is the number we wanted!