Clustering Practice - Hierchical & K Means

require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(cluster)
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.5.1
require(factoextra)
## Loading required package: factoextra
## Warning: package 'factoextra' was built under R version 3.5.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
require(clusterSim)
## Loading required package: clusterSim
## Warning: package 'clusterSim' was built under R version 3.5.1
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## This is package 'modeest' written by P. PONCET.
## For a complete list of functions, use 'library(help = "modeest")' or 'help.start()'.
protein <- read.csv("http://www.biz.uiowa.edu/faculty/jledolter/DataMining/protein.csv") 

names(protein)
##  [1] "Country"   "RedMeat"   "WhiteMeat" "Eggs"      "Milk"     
##  [6] "Fish"      "Cereals"   "Starch"    "Nuts"      "Fr.Veg"
table(protein$Country)
## 
##        Albania        Austria        Belgium       Bulgaria Czechoslovakia 
##              1              1              1              1              1 
##        Denmark      E Germany        Finland         France         Greece 
##              1              1              1              1              1 
##        Hungary        Ireland          Italy    Netherlands         Norway 
##              1              1              1              1              1 
##         Poland       Portugal        Romania          Spain         Sweden 
##              1              1              1              1              1 
##    Switzerland             UK           USSR      W Germany     Yugoslavia 
##              1              1              1              1              1
p.Country <- protein$Country
protein[1:6,]
##          Country RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts
## 1        Albania    10.1       1.4  0.5  8.9  0.2    42.3    0.6  5.5
## 2        Austria     8.9      14.0  4.3 19.9  2.1    28.0    3.6  1.3
## 3        Belgium    13.5       9.3  4.1 17.5  4.5    26.6    5.7  2.1
## 4       Bulgaria     7.8       6.0  1.6  8.3  1.2    56.7    1.1  3.7
## 5 Czechoslovakia     9.7      11.4  2.8 12.5  2.0    34.3    5.0  1.1
## 6        Denmark    10.6      10.8  3.7 25.0  9.9    21.9    4.8  0.7
##   Fr.Veg
## 1    1.7
## 2    4.3
## 3    4.0
## 4    4.2
## 5    4.0
## 6    2.4
sc.protein = scale(protein[,2:10], center = TRUE, scale = TRUE)

head(sc.protein)
##          RedMeat  WhiteMeat       Eggs        Milk        Fish    Cereals
## [1,]  0.08126490 -1.7584889 -2.1796385 -1.15573814 -1.20028213  0.9159176
## [2,] -0.27725673  1.6523731  1.2204544  0.39237676 -0.64187467 -0.3870690
## [3,]  1.09707621  0.3800675  1.0415022  0.05460623  0.06348211 -0.5146342
## [4,] -0.60590157 -0.5132535 -1.1954011 -1.24018077 -0.90638347  2.2280161
## [5,] -0.03824231  0.9485445 -0.1216875 -0.64908235 -0.67126454  0.1869740
## [6,]  0.23064892  0.7861225  0.6835976  1.11013912  1.65053488 -0.9428885
##          Starch       Nuts      Fr.Veg
## [1,] -2.2495772  1.2227536 -1.35040507
## [2,] -0.4136872 -0.8923886  0.09091397
## [3,]  0.8714358 -0.4895043 -0.07539207
## [4,] -1.9435955  0.3162641  0.03547862
## [5,]  0.4430614 -0.9931096 -0.07539207
## [6,]  0.3206688 -1.1945517 -0.96235764
par(mfrow =c(1,3))

data.dist=dist(sc.protein[,1:9])
x = hclust(data.dist, method = "complete", members = NULL)

plot(x, labels = p.Country, hang = 0.1, check = TRUE,
     axes = TRUE, frame.plot = FALSE, ann = TRUE,
     main = "Complete Linkage - Cluster Dendrogram",
     sub = NULL, xlab = NULL, ylab = "Height")

plot(hclust(data.dist ,method ="average"), labels =p.Country,main=" Average Linkage - Cluster Dendrogram", xlab ="", sub ="", ylab ="")

hc.out =hclust(data.dist)
hc.clusters =cutree(hc.out ,4)
table(hc.clusters,p.Country)
##            p.Country
## hc.clusters Albania Austria Belgium Bulgaria Czechoslovakia Denmark
##           1       1       0       0        1              0       0
##           2       0       1       1        0              0       1
##           3       0       0       0        0              1       0
##           4       0       0       0        0              0       0
##            p.Country
## hc.clusters E Germany Finland France Greece Hungary Ireland Italy
##           1         0       0      0      1       0       0     1
##           2         0       1      1      0       0       1     0
##           3         1       0      0      0       1       0     0
##           4         0       0      0      0       0       0     0
##            p.Country
## hc.clusters Netherlands Norway Poland Portugal Romania Spain Sweden
##           1           0      0      0        0       1     0      0
##           2           1      1      0        0       0     0      1
##           3           0      0      1        0       0     0      0
##           4           0      0      0        1       0     1      0
##            p.Country
## hc.clusters Switzerland UK USSR W Germany Yugoslavia
##           1           0  0    0         0          1
##           2           1  1    0         1          0
##           3           0  0    1         0          0
##           4           0  0    0         0          0
par(mfrow =c(1,1))
plot(hc.out , labels =p.Country);abline(h=4, col ="red")

set.seed (2)
km.out =kmeans(sc.protein, 4, nstart =20)
km.clusters =km.out$cluster
table(km.clusters,hc.clusters )
##            hc.clusters
## km.clusters 1 2 3 4
##           1 4 0 2 0
##           2 0 4 0 0
##           3 2 0 0 2
##           4 0 8 3 0

GAP Index

library(clusterSim)

cl1 = pam(sc.protein[,1:9],4)
cl2 = pam(sc.protein[,1:9],5)

clall = cbind(cl1$clustering,cl2$clustering)

g1 <-index.Gap(sc.protein[,1:9], clall, reference.distribution="unif", B=10,method="pam")

g1
## $gap
## [1] 0.7776443
## 
## $diffu
## [1] 0.1149815

Davies-Bouldin’s index

clu2 <- pam(sc.protein[,1:9], 3)
print(index.DB(sc.protein[,1:9], clu2$clustering, centrotypes="centroids"))
## $DB
## [1] 1.173295
## 
## $r
## [1] 1.224779 1.070328 1.224779
## 
## $R
##           [,1]      [,2]     [,3]
## [1,]       Inf 0.9808626 1.224779
## [2,] 0.9808626       Inf 1.070328
## [3,] 1.2247788 1.0703278      Inf
## 
## $d
##          1        2        3
## 1 0.000000 4.131756 3.412034
## 2 4.131756 0.000000 3.946534
## 3 3.412034 3.946534 0.000000
## 
## $S
## [1] 2.003793 2.048891 2.175194
## 
## $centers
##            [,1]       [,2]       [,3]       [,4]       [,5]       [,6]
## [1,] -0.7901419 -0.5267887 -1.1655757 -0.9047559 -0.9504683  1.4383272
## [2,]  0.4517373  0.5063957  0.5762263  0.5837801  0.1183432 -0.6100043
## [3,] -0.5088020 -1.1088009 -0.4124850 -0.8320414  0.9819154  0.1300253
##            [,7]       [,8]       [,9]
## [1,] -0.7604664  0.8870168 -0.5373533
## [2,]  0.3533068 -0.7043759 -0.2195240
## [3,] -0.1842010  1.3108846  1.6292449

Calinski-Harabasz pseudo F-statistic

cl <- pam(sc.protein[,1:9],10)
index.G1(sc.protein[,1:9],cl$clustering)
## [1] 8.582957

K-means clustering of iris data

str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
df_iris = scale(iris[,-5], center = T, scale = T)
fviz_nbclust(df_iris, kmeans, method = "wss") + geom_vline(xintercept = 3, linetype = 2)

km_iris = kmeans(df_iris, centers = 3, nstart = 10)
fviz_cluster(km_iris, data = df_iris, 
  ellipse.type = "convex",
  palette = "jco",
  repel = TRUE)

fviz_cluster(km_iris, data = df_iris,
             palette = "jco", 
             ellipse.type = "euclid", # Concentration ellipse
             star.plot = TRUE, # Add segments from centroids to items
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_minimal()
             )

GAP Index

library(clusterSim)

clus0 = pam(df_iris,3)
clus1 = pam(df_iris,4)


clall1 = cbind(clus0$clustering,clus1$clustering)

g12 <-index.Gap(df_iris, clall1)

g12
## $gap
## [1] 1.272558
## 
## $diffu
## [1] 0.02258823

Davies-Bouldin’s index

clu2 <- pam(df_iris, 3)
print(index.DB(df_iris, clu2$clustering, centrotypes="centroids"))
## $DB
## [1] 0.9283444
## 
## $r
## [1] 0.6416233 1.0717050 1.0717050
## 
## $R
##           [,1]      [,2]      [,3]
## [1,]       Inf 0.4934825 0.6416233
## [2,] 0.4934825       Inf 1.0717050
## [3,] 0.6416233 1.0717050       Inf
## 
## $d
##          1        2        3
## 1 0.000000 4.004320 2.967705
## 2 4.004320 0.000000 1.804525
## 3 2.967705 1.804525 0.000000
## 
## $S
## [1] 0.9731456 1.0029160 0.9310027
## 
## $centers
##             [,1]        [,2]       [,3]       [,4]
## [1,] -1.01119138  0.85041372 -1.3006301 -1.2507035
## [2,]  1.16872040  0.05710212  1.0170142  1.0475006
## [3,] -0.03696089 -0.81982329  0.3502885  0.2799572

Calinski-Harabasz pseudo F-statistic

cl <- pam(df_iris,10)
index.G1(df_iris,cl$clustering)
## [1] 178.8087