#CLUSTERING 1-0-1
# k-means clustering with kmeans()
# k-medoids clustering with pam(), pamk()
# hierarchical clustering
# density based clustering with DBSCAN
set.seed(8953)
iris2 <- iris
head(iris2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
iris2$Species <- NULL
(kmeans.result <- kmeans(iris2, 3))
## K-means clustering with 3 clusters of sizes 38, 50, 62
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 6.850000 3.073684 5.742105 2.071053
## 2 5.006000 3.428000 1.462000 0.246000
## 3 5.901613 2.748387 4.393548 1.433871
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1
## [106] 1 3 1 1 1 1 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1
## [141] 1 1 3 1 1 1 3 1 1 3
##
## Within cluster sum of squares by cluster:
## [1] 23.87947 15.15100 39.82097
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(iris$Species, kmeans.result$cluster)
##
## 1 2 3
## setosa 0 50 0
## versicolor 2 0 48
## virginica 36 0 14
# Class “setosa” can be easily separated from the other clusters
# Classes “versicolor” and “virginica” are to a small degree overlapped with each other.
plot(iris2[c("Sepal.Length", "Sepal.Width")],
col=kmeans.result$cluster)
points(kmeans.result$centers[, c("Sepal.Length", "Sepal.Width")],
col=1:3, pch=8, cex=2)
#Clustering with pamk()
#install.packages("fpc")
library(fpc)
## Warning: package 'fpc' was built under R version 3.1.1

pamk.result <- pamk(iris2) # number of clusters pamk.result$nc
# check clustering against actual species
table(pamk.result$pamobject$clustering, iris$Species)
##
## setosa versicolor virginica
## 1 50 1 0
## 2 0 49 50
#2 clusters formed - setosa, and a mixture of vesicolro and virginica
layout(matrix(c(1, 2), 1, 2)) # 2 graphs per page
plot(pamk.result$pamobject)

# layout(matrix(1)) # change back to one graph per page
#The left chart is a 2-dimensional “clusplot” (clustering plot)
#of the two clusters and the lines show the distance between clusters.
#The right chart shows their silhouettes. A large si (almost 1) suggests that
#the corresponding observations are very well clustered, a small si (around 0)
#means that the observation lies between two clusters, and observations with a
#negative si are probably placed in the wrong cluster.
#Since the average Si are respectively 0.81 and 0.62 in the above silhouette,
#the identified two clusters are well clustered.
#clustering with pam()
library(cluster)
## Warning: package 'cluster' was built under R version 3.1.2
# group into 3 clusters
pam.result <- pam(iris2, 3)
table(pam.result$clustering, iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 48 14
## 3 0 2 36
layout(matrix(c(1, 2), 1, 2)) # 2 graphs per page
plot(pam.result)

#In this example, the result of pam() seems better, because it identifies
#three clusters, corresponding to three species.
#Hierarchical Clustering
set.seed(2835)
# draw a sample of 40 records from the iris data, so that the # clustering plot will not be over crowded
idx <- sample(1:dim(iris)[1], 40)
irisSample <- iris[idx, ]
# remove class label
irisSample$Species <- NULL
# hierarchical clustering
hc <- hclust(dist(irisSample), method = "ave")
# plot clusters
layout(matrix(1))
plot(hc, hang = -1, labels = iris$Species[idx])
# cut tree into 3 clusters
rect.hclust(hc, k = 3)

# get cluster IDs
groups <- cutree(hc, k = 3)
#dist(irisSample)
#Density Based Clustering
#Group objects into one cluster if they are connected
#to one another by densely populated area
#Two key parameters in DBSCAN:
#eps: reachability distance which defines the size of neighborhood
#MinPts: minimum number of points
#If the number of points in the neighborhood of point α is no less than
#MinPts, then α is a dense point. All the points in its neighborhood are
#density-reachable from α and are put into the same cluster as α.
#numeric data only
library(fpc)
iris2 <- iris[-5] # remove class tags
tail(iris2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 145 6.7 3.3 5.7 2.5
## 146 6.7 3.0 5.2 2.3
## 147 6.3 2.5 5.0 1.9
## 148 6.5 3.0 5.2 2.0
## 149 6.2 3.4 5.4 2.3
## 150 5.9 3.0 5.1 1.8
ds <- dbscan(iris2, eps = 0.42, MinPts = 5)
# compare clusters with original class labels
table(ds$cluster, iris$Species)
##
## setosa versicolor virginica
## 0 2 10 17
## 1 48 0 0
## 2 0 37 0
## 3 0 3 33
#1 to 3: identified clusters
# 0: noises or outliers, i.e., objects that are not assigned to any clusters
plot(ds, iris2)

plot(ds, iris2[c(1,4)])

plotcluster(iris2, ds$cluster)

#Prediction with Clustering Model
#Label new data, based on their similarity with the clusters
# Draw a sample of 10 objects from iris and add small noises
#to them to make a new dataset for labeling
# Random noises are generated with a uniform distribution using function runif().
# create a new dataset for labeling
set.seed(435)
idx <- sample(1:nrow(iris), 10)
# remove class labels
new.data <- iris[idx,-5]
# add random noise
new.data <- new.data + matrix(runif(10*4, min=0, max=0.2),
nrow=10, ncol=4)
pred <- predict(ds, iris2, new.data)
table(pred, iris$Species[idx]) # check cluster labels # results
##
## pred setosa versicolor virginica
## 0 0 0 1
## 1 3 0 0
## 2 0 3 0
## 3 0 1 2
#Eight(=3+3+2) out of 10 objects are assigned with correct class labels.
plot(iris2[c(1, 4)], col = 1 + ds$cluster)
points(new.data[c(1, 4)], pch = "+", col = 1 + pred, cex = 3)
