# CLUSTERING 101
# - k-means clustering with kmeans()
# - k-medoids clustering with pam() and pamk()
# - hierarchical clustering with hclust()
# - density-based clustering with DBSCAN

# k-means clustering
set.seed(8953)
iris2 <- iris
head(iris2)
# remove class labels so that only the numeric attributes are clustered
iris2$Species <- NULL
(kmeans.result <- kmeans(iris2, 3))
# compare the clusters with the actual species
table(iris$Species, kmeans.result$cluster)
# Class "setosa" can be easily separated from the other clusters, while
# classes "versicolor" and "virginica" overlap with each other to a
# small degree.
# plot(iris2[c("Sepal.Length", "Sepal.Width")],
# col=kmeans.result$cluster)
# points(kmeans.result$centers[, c("Sepal.Length", "Sepal.Width")],
# col=1:3, pch=8, cex=2)
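
# Here k = 3 comes from the known number of species. When the number of
# clusters is unknown, a common heuristic (not part of the original
# example) is the "elbow" method: plot the total within-cluster sum of
# squares over a range of k and look for the bend where it flattens.
wss <- sapply(1:8, function(k)
  kmeans(iris2, centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b",
     xlab = "number of clusters k",
     ylab = "total within-cluster sum of squares")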

# Clustering with pamk()
# install.packages("fpc")
library(fpc)
pamk.result <- pamk(iris2)
pamk.result$nc  # number of clusters estimated by pamk()
# check clustering against actual species
table(pamk.result$pamobject$clustering, iris$Species)
# Two clusters are formed: "setosa", and a mixture of "versicolor" and
# "virginica".
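
# pamk() estimates the number of clusters by optimum average silhouette
# width. A sketch of restricting its search range (pamk2 is an
# illustrative name; krange is fpc's argument):
pamk2 <- pamk(iris2, krange = 2:5)
pamk2$nc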

layout(matrix(c(1, 2), 1, 2))  # 2 graphs per page
plot(pamk.result$pamobject)
# layout(matrix(1))  # change back to one graph per page

# The left chart is a 2-dimensional "clusplot" (clustering plot) of the
# two clusters, and the lines show the distance between the clusters.

# The right chart shows their silhouettes. A large silhouette width s_i
# (close to 1) suggests that the corresponding observation is very well
# clustered, a small s_i (around 0) means that the observation lies
# between two clusters, and observations with a negative s_i are
# probably placed in the wrong cluster.

# Since the average s_i values are 0.81 and 0.62 respectively in the
# above silhouette plot, the two identified clusters are well clustered.
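
# Those averages can also be read straight from the pam object's
# silhouette information (silinfo is a documented component of objects
# returned by cluster::pam()):
pamk.result$pamobject$silinfo$clus.avg.widths  # per-cluster averages
pamk.result$pamobject$silinfo$avg.width        # overall average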

# Clustering with pam()
library(cluster)
# group into 3 clusters
pam.result <- pam(iris2, 3)
table(pam.result$clustering, iris$Species)

layout(matrix(c(1, 2), 1, 2))  # 2 graphs per page
plot(pam.result)
# In this example the result of pam() seems better, because it
# identifies three clusters corresponding to the three species.
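
# Why did pamk() settle on two clusters when three match the species
# better? Its default criterion is the average silhouette width, and it
# chose k = 2, so that value should come out higher than for k = 3. A
# quick check (a sketch, not in the original code):
pam(iris2, 2)$silinfo$avg.width
pam(iris2, 3)$silinfo$avg.width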

# Hierarchical Clustering
set.seed(2835)
# draw a sample of 40 records from the iris data, so that the
# clustering plot will not be overcrowded
idx <- sample(1:dim(iris)[1], 40)
irisSample <- iris[idx, ]
# remove class labels
irisSample$Species <- NULL
# hierarchical clustering with average linkage
hc <- hclust(dist(irisSample), method = "average")
# plot the dendrogram, labelling each leaf with its species
layout(matrix(1))
plot(hc, hang = -1, labels = iris$Species[idx])
# cut the tree into 3 clusters
rect.hclust(hc, k = 3)
# get cluster IDs
groups <- cutree(hc, k = 3)
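# check the clusters against the actual species of the sampled records
table(groups, iris$Species[idx])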

# Density-Based Clustering
# Group objects into one cluster if they are connected to one another
# by a densely populated area. There are two key parameters in DBSCAN:
# - eps: reachability distance, which defines the size of the neighborhood
# - MinPts: minimum number of points in an eps-neighborhood

# If the number of points in the neighborhood of a point α is no less
# than MinPts, then α is a dense point. All the points in its
# neighborhood are density-reachable from α and are put into the same
# cluster as α.
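
# A common heuristic for choosing eps (not part of the original
# example): plot each point's distance to its MinPts-th nearest
# neighbor in ascending order and look for a knee. A sketch in base R:
d <- as.matrix(dist(iris[-5]))
# take the 6th smallest distance in each row because the sorted
# distances start with the point's zero distance to itself
kdist <- apply(d, 1, function(row) sort(row)[6])
plot(sort(kdist), type = "l",
     ylab = "distance to 5th nearest neighbor")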

# DBSCAN works on numeric data only
library(fpc)
iris2 <- iris[-5]  # remove class labels
tail(iris2)
ds <- dbscan(iris2, eps = 0.42, MinPts = 5)
# compare clusters with original class labels
table(ds$cluster, iris$Species)
# 1 to 3: identified clusters
# 0: noise or outliers, i.e., objects not assigned to any cluster
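# cluster sizes, including the noise points grouped under "cluster" 0
table(ds$cluster)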

# plot the clusters: as a scatter-plot matrix, on two selected
# dimensions, and in a discriminant projection with plotcluster()
plot(ds, iris2)
plot(ds, iris2[c(1, 4)])
plotcluster(iris2, ds$cluster)

# Prediction with a Clustering Model
# Label new data based on their similarity to the existing clusters.
# Draw a sample of 10 objects from iris and add small noise to them to
# make a new dataset for labeling. The random noise is generated with a
# uniform distribution using runif().

# create a new dataset for labeling
set.seed(435)
idx <- sample(1:nrow(iris), 10)
# remove class labels
new.data <- iris[idx, -5]
# add random noise
new.data <- new.data + matrix(runif(10 * 4, min = 0, max = 0.2),
                              nrow = 10, ncol = 4)
# label the new data with fpc's predict() method for dbscan objects
pred <- predict(ds, iris2, new.data)
# check the predicted cluster labels against the actual species
table(pred, iris$Species[idx])
# Eight (3 + 3 + 2) out of the 10 objects are assigned the correct
# class labels.

# plot the clusters and overlay the newly labeled points as "+"
plot(iris2[c(1, 4)], col = 1 + ds$cluster)
points(new.data[c(1, 4)], pch = "+", col = 1 + pred, cex = 3)
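
# Conceptually, similarity-based labeling can be as simple as giving
# each new point the cluster of its nearest neighbor in the training
# data. A simplified sketch for illustration (not necessarily fpc's
# exact algorithm):
nearest <- apply(new.data, 1, function(p) {
  which.min(colSums((t(iris2) - p)^2))  # index of the closest training point
})
ds$cluster[nearest]  # cluster labels inherited from nearest neighbors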