# CLUSTERING 101
# - k-means clustering with kmeans()
# - k-medoids clustering with pam() and pamk()
# - hierarchical clustering with hclust()
# - density-based clustering with DBSCAN

# k-means clustering
set.seed(8953)
iris2 <- iris
head(iris2)
# remove class labels so that only the numeric attributes are clustered
iris2$Species <- NULL
(kmeans.result <- kmeans(iris2, 3))
# compare the clusters with the actual species
table(iris$Species, kmeans.result$cluster)
# Class "setosa" can be easily separated from the other clusters, while
# classes "versicolor" and "virginica" overlap with each other to a
# small degree.
# plot(iris2[c("Sepal.Length", "Sepal.Width")],
# col=kmeans.result$cluster)
# points(kmeans.result$centers[, c("Sepal.Length", "Sepal.Width")],
# col=1:3, pch=8, cex=2)
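
# Here k = 3 comes from the known number of species. When the number of
# clusters is unknown, a common heuristic (not part of the original
# example) is the "elbow" method: plot the total within-cluster sum of
# squares over a range of k and look for the bend where it flattens.
wss <- sapply(1:8, function(k)
  kmeans(iris2, centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b",
     xlab = "number of clusters k",
     ylab = "total within-cluster sum of squares")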

# Clustering with pamk()
# install.packages("fpc")
library(fpc)
pamk.result <- pamk(iris2)
pamk.result$nc  # number of clusters estimated by pamk()
# check clustering against actual species
table(pamk.result$pamobject$clustering, iris$Species)
# Two clusters are formed: "setosa", and a mixture of "versicolor" and
# "virginica".
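
# pamk() estimates the number of clusters by optimum average silhouette
# width. A sketch of restricting its search range (pamk2 is an
# illustrative name; krange is fpc's argument):
pamk2 <- pamk(iris2, krange = 2:5)
pamk2$nc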

layout(matrix(c(1, 2), 1, 2))  # 2 graphs per page
plot(pamk.result$pamobject)
# layout(matrix(1))  # change back to one graph per page

# The left chart is a 2-dimensional "clusplot" (clustering plot) of the
# two clusters, and the lines show the distance between the clusters.

# The right chart shows their silhouettes. A large silhouette width s_i
# (close to 1) suggests that the corresponding observation is very well
# clustered, a small s_i (around 0) means that the observation lies
# between two clusters, and observations with a negative s_i are
# probably placed in the wrong cluster.

# Since the average s_i values are 0.81 and 0.62 respectively in the
# above silhouette plot, the two identified clusters are well clustered.
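
# Those averages can also be read straight from the pam object's
# silhouette information (silinfo is a documented component of objects
# returned by cluster::pam()):
pamk.result$pamobject$silinfo$clus.avg.widths  # per-cluster averages
pamk.result$pamobject$silinfo$avg.width        # overall average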

# Clustering with pam()
library(cluster)
# group into 3 clusters
pam.result <- pam(iris2, 3)
table(pam.result$clustering, iris$Species)

layout(matrix(c(1, 2), 1, 2))  # 2 graphs per page
plot(pam.result)
# In this example the result of pam() seems better, because it
# identifies three clusters corresponding to the three species.
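
# Why did pamk() settle on two clusters when three match the species
# better? Its default criterion is the average silhouette width, and it
# chose k = 2, so that value should come out higher than for k = 3. A
# quick check (a sketch, not in the original code):
pam(iris2, 2)$silinfo$avg.width
pam(iris2, 3)$silinfo$avg.width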

# Hierarchical Clustering
set.seed(2835)
# draw a sample of 40 records from the iris data, so that the
# clustering plot will not be overcrowded
idx <- sample(1:dim(iris)[1], 40)
irisSample <- iris[idx, ]
# remove class labels
irisSample$Species <- NULL
# hierarchical clustering with average linkage
hc <- hclust(dist(irisSample), method = "average")
# plot the dendrogram, labelling each leaf with its species
layout(matrix(1))
plot(hc, hang = -1, labels = iris$Species[idx])
# cut the tree into 3 clusters
rect.hclust(hc, k = 3)
# get cluster IDs
groups <- cutree(hc, k = 3)
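# check the clusters against the actual species of the sampled records
table(groups, iris$Species[idx])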

# Density-Based Clustering
# Group objects into one cluster if they are connected to one another
# by a densely populated area. There are two key parameters in DBSCAN:
# - eps: reachability distance, which defines the size of the neighborhood
# - MinPts: minimum number of points in an eps-neighborhood

# If the number of points in the neighborhood of a point α is no less
# than MinPts, then α is a dense point. All the points in its
# neighborhood are density-reachable from α and are put into the same
# cluster as α.
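
# A common heuristic for choosing eps (not part of the original
# example): plot each point's distance to its MinPts-th nearest
# neighbor in ascending order and look for a knee. A sketch in base R:
d <- as.matrix(dist(iris[-5]))
# take the 6th smallest distance in each row because the sorted
# distances start with the point's zero distance to itself
kdist <- apply(d, 1, function(row) sort(row)[6])
plot(sort(kdist), type = "l",
     ylab = "distance to 5th nearest neighbor")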

# DBSCAN works on numeric data only
library(fpc)
iris2 <- iris[-5]  # remove class labels
tail(iris2)
ds <- dbscan(iris2, eps = 0.42, MinPts = 5)
# compare clusters with original class labels
table(ds$cluster, iris$Species)
# 1 to 3: identified clusters
# 0: noise or outliers, i.e., objects not assigned to any cluster
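# cluster sizes, including the noise points grouped under "cluster" 0
table(ds$cluster)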

# plot the clusters: as a scatter-plot matrix, on two selected
# dimensions, and in a discriminant projection with plotcluster()
plot(ds, iris2)
plot(ds, iris2[c(1, 4)])
plotcluster(iris2, ds$cluster)

# Prediction with a Clustering Model
# Label new data based on their similarity to the existing clusters.
# Draw a sample of 10 objects from iris and add small noise to them to
# make a new dataset for labeling. The random noise is generated with a
# uniform distribution using runif().

# create a new dataset for labeling
set.seed(435)
idx <- sample(1:nrow(iris), 10)
# remove class labels
new.data <- iris[idx, -5]
# add random noise
new.data <- new.data + matrix(runif(10 * 4, min = 0, max = 0.2),
                              nrow = 10, ncol = 4)
# label the new data with fpc's predict() method for dbscan objects
pred <- predict(ds, iris2, new.data)
# check the predicted cluster labels against the actual species
table(pred, iris$Species[idx])
# Eight (3 + 3 + 2) out of the 10 objects are assigned the correct
# class labels.

# plot the clusters and overlay the newly labeled points as "+"
plot(iris2[c(1, 4)], col = 1 + ds$cluster)
points(new.data[c(1, 4)], pch = "+", col = 1 + pred, cex = 3)
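
# Conceptually, similarity-based labeling can be as simple as giving
# each new point the cluster of its nearest neighbor in the training
# data. A simplified sketch for illustration (not necessarily fpc's
# exact algorithm):
nearest <- apply(new.data, 1, function(p) {
  which.min(colSums((t(iris2) - p)^2))  # index of the closest training point
})
ds$cluster[nearest]  # cluster labels inherited from nearest neighbors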