Clustering

#CLUSTERING 1-0-1
# k-means clustering with kmeans()
# k-medoids clustering with pam(), pamk()
# hierarchical clustering
# density based clustering with DBSCAN
set.seed(8953)
iris2 <- iris
head(iris2)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

iris2$Species <- NULL
(kmeans.result <- kmeans(iris2, 3))

## K-means clustering with 3 clusters of sizes 38, 50, 62
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     6.850000    3.073684     5.742105    2.071053
## 2     5.006000    3.428000     1.462000    0.246000
## 3     5.901613    2.748387     4.393548    1.433871
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1
## [106] 1 3 1 1 1 1 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1
## [141] 1 1 3 1 1 1 3 1 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 23.87947 15.15100 39.82097
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

table(iris$Species, kmeans.result$cluster)

##             
##               1  2  3
##   setosa      0 50  0
##   versicolor  2  0 48
##   virginica  36  0 14

# Class “setosa” can be easily separated from the other clusters
#􏰀 Classes “versicolor” and “virginica” are to a small degree overlapped with each other.
plot(iris2[c("Sepal.Length", "Sepal.Width")], 
     col=kmeans.result$cluster)
points(kmeans.result$centers[, c("Sepal.Length", "Sepal.Width")],
       col=1:3, pch=8, cex=2)
#Clustering with pamk()
        #install.packages("fpc")
library(fpc)

## Warning: package 'fpc' was built under R version 3.1.1

pamk.result <- pamk(iris2) # number of clusters pamk.result$nc
# check clustering against actual species
table(pamk.result$pamobject$clustering, iris$Species)

##    
##     setosa versicolor virginica
##   1     50          1         0
##   2      0         49        50

#2 clusters formed - setosa, and a mixture of vesicolro and virginica
layout(matrix(c(1, 2), 1, 2)) # 2 graphs per page 
plot(pamk.result$pamobject)

# layout(matrix(1)) # change back to one graph per page

#The left chart is a 2-dimensional “clusplot” (clustering plot)
#of the two clusters and the lines show the distance between clusters.

#The right chart shows their silhouettes. A large si (almost 1) suggests that 
#the corresponding observations are very well clustered, a small si (around 0) 
#means that the observation lies between two clusters, and observations with a 
#negative si are probably placed in the wrong cluster.

#Since the average Si are respectively 0.81 and 0.62 in the above silhouette, 
#the identified two clusters are well clustered.

#clustering with pam()
library(cluster)

## Warning: package 'cluster' was built under R version 3.1.2

# group into 3 clusters
pam.result <- pam(iris2, 3) 
table(pam.result$clustering, iris$Species)

##    
##     setosa versicolor virginica
##   1     50          0         0
##   2      0         48        14
##   3      0          2        36

layout(matrix(c(1, 2), 1, 2)) # 2 graphs per page 
plot(pam.result)

#In this example, the result of pam() seems better, because it identifies 
#three clusters, corresponding to three species.

#Hierarchical Clustering 
set.seed(2835)
# draw a sample of 40 records from the iris data, so that the # clustering plot will not be over crowded
idx <- sample(1:dim(iris)[1], 40)
irisSample <- iris[idx, ]
# remove class label
irisSample$Species <- NULL
# hierarchical clustering
hc <- hclust(dist(irisSample), method = "ave")
# plot clusters
layout(matrix(1))
plot(hc, hang = -1, labels = iris$Species[idx])
# cut tree into 3 clusters
rect.hclust(hc, k = 3)

# get cluster IDs
groups <- cutree(hc, k = 3)
        #dist(irisSample) 

#Density Based Clustering
#Group objects into one cluster if they are connected 
#to one another by densely populated area
#Two key parameters in DBSCAN:
#eps: reachability distance which defines the size of neighborhood
#MinPts: minimum number of points

#If the number of points in the neighborhood of point α is no less than 
#MinPts, then α is a dense point. All the points in its neighborhood are 
#density-reachable from α and are put into the same cluster as α.

#numeric data only
library(fpc)
iris2 <- iris[-5] # remove class tags
tail(iris2)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width
## 145          6.7         3.3          5.7         2.5
## 146          6.7         3.0          5.2         2.3
## 147          6.3         2.5          5.0         1.9
## 148          6.5         3.0          5.2         2.0
## 149          6.2         3.4          5.4         2.3
## 150          5.9         3.0          5.1         1.8

ds <- dbscan(iris2, eps = 0.42, MinPts = 5)
# compare clusters with original class labels 
table(ds$cluster, iris$Species)

##    
##     setosa versicolor virginica
##   0      2         10        17
##   1     48          0         0
##   2      0         37         0
##   3      0          3        33

#1 to 3: identified clusters
# 0: noises or outliers, i.e., objects that are not assigned to any clusters

plot(ds, iris2)

plot(ds, iris2[c(1,4)])

plotcluster(iris2, ds$cluster)

#Prediction with Clustering Model
#Label new data, based on their similarity with the clusters
#􏰀 Draw a sample of 10 objects from iris and add small noises
#to them to make a new dataset for labeling
#􏰀 Random noises are generated with a uniform distribution using function runif().

# create a new dataset for labeling
set.seed(435)
idx <- sample(1:nrow(iris), 10)
# remove class labels
new.data <- iris[idx,-5]
# add random noise
new.data <- new.data + matrix(runif(10*4, min=0, max=0.2),
                              nrow=10, ncol=4)
pred <- predict(ds, iris2, new.data)
table(pred, iris$Species[idx]) # check cluster labels # results

##     
## pred setosa versicolor virginica
##    0      0          0         1
##    1      3          0         0
##    2      0          3         0
##    3      0          1         2

#Eight(=3+3+2) out of 10 objects are assigned with correct class labels.
plot(iris2[c(1, 4)], col = 1 + ds$cluster)
points(new.data[c(1, 4)], pch = "+", col = 1 + pred, cex = 3)

Clustering

SA