Clustering Practice - Hierchical & K Means
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(cluster)
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.5.1
require(factoextra)
## Loading required package: factoextra
## Warning: package 'factoextra' was built under R version 3.5.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
require(clusterSim)
## Loading required package: clusterSim
## Warning: package 'clusterSim' was built under R version 3.5.1
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## This is package 'modeest' written by P. PONCET.
## For a complete list of functions, use 'library(help = "modeest")' or 'help.start()'.
protein <- read.csv("http://www.biz.uiowa.edu/faculty/jledolter/DataMining/protein.csv")
names(protein)
## [1] "Country" "RedMeat" "WhiteMeat" "Eggs" "Milk"
## [6] "Fish" "Cereals" "Starch" "Nuts" "Fr.Veg"
table(protein$Country)
##
## Albania Austria Belgium Bulgaria Czechoslovakia
## 1 1 1 1 1
## Denmark E Germany Finland France Greece
## 1 1 1 1 1
## Hungary Ireland Italy Netherlands Norway
## 1 1 1 1 1
## Poland Portugal Romania Spain Sweden
## 1 1 1 1 1
## Switzerland UK USSR W Germany Yugoslavia
## 1 1 1 1 1
p.Country <- protein$Country
protein[1:6,]
## Country RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts
## 1 Albania 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5
## 2 Austria 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3
## 3 Belgium 13.5 9.3 4.1 17.5 4.5 26.6 5.7 2.1
## 4 Bulgaria 7.8 6.0 1.6 8.3 1.2 56.7 1.1 3.7
## 5 Czechoslovakia 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1
## 6 Denmark 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7
## Fr.Veg
## 1 1.7
## 2 4.3
## 3 4.0
## 4 4.2
## 5 4.0
## 6 2.4
sc.protein = scale(protein[,2:10], center = TRUE, scale = TRUE)
head(sc.protein)
## RedMeat WhiteMeat Eggs Milk Fish Cereals
## [1,] 0.08126490 -1.7584889 -2.1796385 -1.15573814 -1.20028213 0.9159176
## [2,] -0.27725673 1.6523731 1.2204544 0.39237676 -0.64187467 -0.3870690
## [3,] 1.09707621 0.3800675 1.0415022 0.05460623 0.06348211 -0.5146342
## [4,] -0.60590157 -0.5132535 -1.1954011 -1.24018077 -0.90638347 2.2280161
## [5,] -0.03824231 0.9485445 -0.1216875 -0.64908235 -0.67126454 0.1869740
## [6,] 0.23064892 0.7861225 0.6835976 1.11013912 1.65053488 -0.9428885
## Starch Nuts Fr.Veg
## [1,] -2.2495772 1.2227536 -1.35040507
## [2,] -0.4136872 -0.8923886 0.09091397
## [3,] 0.8714358 -0.4895043 -0.07539207
## [4,] -1.9435955 0.3162641 0.03547862
## [5,] 0.4430614 -0.9931096 -0.07539207
## [6,] 0.3206688 -1.1945517 -0.96235764
par(mfrow =c(1,3))
data.dist=dist(sc.protein[,1:9])
x = hclust(data.dist, method = "complete", members = NULL)
plot(x, labels = p.Country, hang = 0.1, check = TRUE,
axes = TRUE, frame.plot = FALSE, ann = TRUE,
main = "Complete Linkage - Cluster Dendrogram",
sub = NULL, xlab = NULL, ylab = "Height")

plot(hclust(data.dist ,method ="average"), labels =p.Country,main=" Average Linkage - Cluster Dendrogram", xlab ="", sub ="", ylab ="")

hc.out =hclust(data.dist)
hc.clusters =cutree(hc.out ,4)
table(hc.clusters,p.Country)
## p.Country
## hc.clusters Albania Austria Belgium Bulgaria Czechoslovakia Denmark
## 1 1 0 0 1 0 0
## 2 0 1 1 0 0 1
## 3 0 0 0 0 1 0
## 4 0 0 0 0 0 0
## p.Country
## hc.clusters E Germany Finland France Greece Hungary Ireland Italy
## 1 0 0 0 1 0 0 1
## 2 0 1 1 0 0 1 0
## 3 1 0 0 0 1 0 0
## 4 0 0 0 0 0 0 0
## p.Country
## hc.clusters Netherlands Norway Poland Portugal Romania Spain Sweden
## 1 0 0 0 0 1 0 0
## 2 1 1 0 0 0 0 1
## 3 0 0 1 0 0 0 0
## 4 0 0 0 1 0 1 0
## p.Country
## hc.clusters Switzerland UK USSR W Germany Yugoslavia
## 1 0 0 0 0 1
## 2 1 1 0 1 0
## 3 0 0 1 0 0
## 4 0 0 0 0 0
par(mfrow =c(1,1))
plot(hc.out , labels =p.Country);abline(h=4, col ="red")

set.seed (2)
km.out =kmeans(sc.protein, 4, nstart =20)
km.clusters =km.out$cluster
table(km.clusters,hc.clusters )
## hc.clusters
## km.clusters 1 2 3 4
## 1 4 0 2 0
## 2 0 4 0 0
## 3 2 0 0 2
## 4 0 8 3 0
GAP Index
library(clusterSim)
cl1 = pam(sc.protein[,1:9],4)
cl2 = pam(sc.protein[,1:9],5)
clall = cbind(cl1$clustering,cl2$clustering)
g1 <-index.Gap(sc.protein[,1:9], clall, reference.distribution="unif", B=10,method="pam")
g1
## $gap
## [1] 0.7776443
##
## $diffu
## [1] 0.1149815
Davies-Bouldin’s index
clu2 <- pam(sc.protein[,1:9], 3)
print(index.DB(sc.protein[,1:9], clu2$clustering, centrotypes="centroids"))
## $DB
## [1] 1.173295
##
## $r
## [1] 1.224779 1.070328 1.224779
##
## $R
## [,1] [,2] [,3]
## [1,] Inf 0.9808626 1.224779
## [2,] 0.9808626 Inf 1.070328
## [3,] 1.2247788 1.0703278 Inf
##
## $d
## 1 2 3
## 1 0.000000 4.131756 3.412034
## 2 4.131756 0.000000 3.946534
## 3 3.412034 3.946534 0.000000
##
## $S
## [1] 2.003793 2.048891 2.175194
##
## $centers
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.7901419 -0.5267887 -1.1655757 -0.9047559 -0.9504683 1.4383272
## [2,] 0.4517373 0.5063957 0.5762263 0.5837801 0.1183432 -0.6100043
## [3,] -0.5088020 -1.1088009 -0.4124850 -0.8320414 0.9819154 0.1300253
## [,7] [,8] [,9]
## [1,] -0.7604664 0.8870168 -0.5373533
## [2,] 0.3533068 -0.7043759 -0.2195240
## [3,] -0.1842010 1.3108846 1.6292449
Calinski-Harabasz pseudo F-statistic
cl <- pam(sc.protein[,1:9],10)
index.G1(sc.protein[,1:9],cl$clustering)
## [1] 8.582957
K-means clustering of iris data
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
df_iris = scale(iris[,-5], center = T, scale = T)
fviz_nbclust(df_iris, kmeans, method = "wss") + geom_vline(xintercept = 3, linetype = 2)

km_iris = kmeans(df_iris, centers = 3, nstart = 10)
fviz_cluster(km_iris, data = df_iris,
ellipse.type = "convex",
palette = "jco",
repel = TRUE)

fviz_cluster(km_iris, data = df_iris,
palette = "jco",
ellipse.type = "euclid", # Concentration ellipse
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE, # Avoid label overplotting (slow)
ggtheme = theme_minimal()
)

GAP Index
library(clusterSim)
clus0 = pam(df_iris,3)
clus1 = pam(df_iris,4)
clall1 = cbind(clus0$clustering,clus1$clustering)
g12 <-index.Gap(df_iris, clall1)
g12
## $gap
## [1] 1.272558
##
## $diffu
## [1] 0.02258823
Davies-Bouldin’s index
clu2 <- pam(df_iris, 3)
print(index.DB(df_iris, clu2$clustering, centrotypes="centroids"))
## $DB
## [1] 0.9283444
##
## $r
## [1] 0.6416233 1.0717050 1.0717050
##
## $R
## [,1] [,2] [,3]
## [1,] Inf 0.4934825 0.6416233
## [2,] 0.4934825 Inf 1.0717050
## [3,] 0.6416233 1.0717050 Inf
##
## $d
## 1 2 3
## 1 0.000000 4.004320 2.967705
## 2 4.004320 0.000000 1.804525
## 3 2.967705 1.804525 0.000000
##
## $S
## [1] 0.9731456 1.0029160 0.9310027
##
## $centers
## [,1] [,2] [,3] [,4]
## [1,] -1.01119138 0.85041372 -1.3006301 -1.2507035
## [2,] 1.16872040 0.05710212 1.0170142 1.0475006
## [3,] -0.03696089 -0.81982329 0.3502885 0.2799572
Calinski-Harabasz pseudo F-statistic
cl <- pam(df_iris,10)
index.G1(df_iris,cl$clustering)
## [1] 178.8087