K-means.R

# preparing the data
# simulated data set containing 50 observations, in which there are two clusters;
# the first 25 observations have a mean shift relative to the next 25 observations
set.seed(123)
x = matrix(rnorm(50*2), ncol = 2)
x[1:25, 1] = x[1:25,1] + 3
x[1:25, 2] = x[1:25,2] - 4

#================================================================================================

# performing k-means clustering with k = 2
km.out = kmeans(x,2,nstart = 20)
km.out$cluster

##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

km.out$withinss

## [1] 44.20574 36.81422

km.out$tot.withinss

## [1] 81.01995

#81.01995
plot(x,col = (km.out$cluster + 1), main = "K-Means Clustering Results with K=2",
     xlab = "", ylab = "", pch = 20, cex = 2)

# The k-means clustering perfectly separated the observations into two clusters.
# In this example, we knew there were really two clusters because we generated the data.
# However, in real data we do not know the true number of clusters.
# So how about performing k-means clustering with k = 3?
set.seed(456)
km.out = kmeans(x,3,nstart = 20)
km.out$cluster

##  [1] 1 1 2 2 1 2 1 1 1 1 2 1 1 1 1 2 2 1 2 2 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3

km.out$withinss

## [1] 17.222531  8.396336 36.814218

km.out$tot.withinss

## [1] 62.43309

#62.43309
#======================================================================================================
# kmeans() function can run with multiple initial configurations. If the value of nstart >1
# k-means will be performed using multiple random assignments. It is strongly recommended running
# k-means with large value of nstart, otherwise undesirable optimum will be obtained

set.seed(789)
km.out = kmeans(x,3,nstart = 1)
km.out$tot.withinss

## [1] 66.6404

km.out = kmeans(x,3,nstart = 20)
km.out$tot.withinss

## [1] 62.43309

km.out = kmeans(x,3,nstart = 50)
km.out$tot.withinss

## [1] 62.43309

#======================================================================================================
# Within Sum of Squares (WSS)

# Compute clustering algorithm (e.g., k-means clustering) for different values of k.
# For instance, by varying k from 1 to 10 clusters, for each k, calculate the total 
# within-cluster sum of square (wss). # Plot the curve of wss according to the number
# of clusters k.# The location of a bend (knee) in the plot is generally considered as 
# an indicator of the appropriate number of clusters.

# Compute and plot wss for k = 2 to k = 10

k.max <- 10 # Maximal number of clusters
data <- x
wss <- sapply(1:k.max, 
              function(k){kmeans(data, k, nstart=50 )$tot.withinss})
plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")
abline(v = 2, lty =2)

#==================================================================================================
#applying Silhoutte statistics

# Compute clustering algorithm (e.g., k-means clustering) for different values of k. 
# For instance, by varying k from 1 to 15 clusters
# For each k, calculate the average silhouette of observations (avg.sil)
# Plot the curve of avg.sil according to the number of clusters k.
# The location of the maximum is considered as the appropriate number of clusters.

# Compute the average silhouette width for 
# k = 2 to k = 15
library(cluster)
k.max = 10
sil <- rep(0, k.max)

for(i in 2:k.max){
  km.res <- kmeans(data, i, nstart = 50)
  
  ss <- silhouette(km.res$cluster, dist(data))
  sil[i] <- mean(ss[, 3])
}

# Plot the  average silhouette width
plot(1:k.max, sil, type = "b", pch = 19, 
     frame = FALSE, xlab = "Number of clusters k")
abline(v = which.max(sil), lty = 2)

#================================================================================================

K-means.R

aksha

Sat Oct 07 22:43:29 2017