Data Generation
library(MASS)
library(fpc)
library(cluster)
library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.16.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
library(mclust)
## Package 'mclust' version 5.4.10
## Type 'citation("mclust")' for citing this R package in publications.
library(EMCluster)
## Loading required package: Matrix
 D1<- rbind(cbind(rnorm(15,0,0.5), rnorm(15,0,0.5)),
           cbind(rnorm(15,5,0.5), rnorm(15,5,0.5)), 
           cbind(rnorm(15,5,0.5), rnorm(15,0,0.5)),
           cbind(rnorm(15,0,0.5), rnorm(15,5,0.5)))

D1full=rbind(cbind(rnorm(15,0,0.5), rnorm(15,0,0.5), rep(1,15)),
           cbind(rnorm(15,5,0.5), rnorm(15,5,0.5),rep(2,15)), 
           cbind(rnorm(15,5,0.5), rnorm(15,0,0.5),rep(3,15)),
           cbind(rnorm(15,0,0.5), rnorm(15,5,0.5),rep(4,15)))
 D2<- rbind(cbind(rnorm(10,0,2), rnorm(10,0,2)),
           cbind(rnorm(15,5,2), rnorm(15,5,2)), 
           cbind(rnorm(15,5,2), rnorm(15,0,2)),
           cbind(rnorm(15,0,2), rnorm(15,5,2)))

D2full=rbind(cbind(rnorm(15,0,2), rnorm(15,0,2), rep(1,15)),
           cbind(rnorm(15,5,2), rnorm(15,5,2),rep(2,15)), 
           cbind(rnorm(15,5,2), rnorm(15,0,2),rep(3,15)),
           cbind(rnorm(15,0,2), rnorm(15,5,2),rep(4,15)))
K Mean Clustering
kmD2 <- kmeans(D2full[,1:2], 4)
plot(D2full[,1:2], col = kmD2$cluster, cex=2)
points(D2full[,1:2], pch=4, cex=0.3, col=D2full[,3])

table(D2full[,3], kmD2$cluster)
##    
##      1  2  3  4
##   1  1  0  4 10
##   2 14  1  0  0
##   3  2  0 13  0
##   4  0 14  0  1
K-Medioid Clustering
pamD1 <- pam(D1, 4)
pamD1
## Medoids:
##      ID                        
## [1,] 11 -0.11479072  0.08974015
## [2,] 19  5.19069752  4.90067324
## [3,] 31  4.74114505 -0.11205244
## [4,] 58  0.04494387  4.81189654
## Clustering vector:
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
## [39] 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## Objective function:
##     build      swap 
## 1.0025105 0.6523814 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
pamD2 <- pam(D2full[,1:2], 4)
pamD2
## Medoids:
##      ID                     
## [1,]  9 0.5627189 -0.1360652
## [2,] 20 6.2662297  5.6887306
## [3,] 51 0.1766644  5.3909433
## [4,] 42 4.9811221 -2.1597856
## Clustering vector:
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 2 3 2 2 2 2 2 2 2 3 2 2 4 4 4 4 1 1 4
## [39] 4 4 2 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## Objective function:
##    build     swap 
## 2.239968 2.025144 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
summary(pamD1)
## Medoids:
##      ID                        
## [1,] 11 -0.11479072  0.08974015
## [2,] 19  5.19069752  4.90067324
## [3,] 31  4.74114505 -0.11205244
## [4,] 58  0.04494387  4.81189654
## Clustering vector:
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
## [39] 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## Objective function:
##     build      swap 
## 1.0025105 0.6523814 
## 
## Numerical information per cluster:
##      size max_diss   av_diss diameter separation
## [1,]   15 1.281663 0.6747804 2.261776   2.871446
## [2,]   15 1.800909 0.7197072 2.833866   2.806656
## [3,]   15 1.430770 0.5933567 1.994104   2.806656
## [4,]   15 1.338423 0.6216813 2.070500   2.835471
## 
## Isolated clusters:
##  L-clusters: [1] 2
##  L*-clusters: [1] 1 3 4
## 
## Silhouette plot information:
##    cluster neighbor sil_width
## 1        1        4 0.8507908
## 11       1        4 0.8503591
## 4        1        4 0.8420419
## 10       1        3 0.8349749
## 12       1        4 0.8190246
## 6        1        4 0.8124547
## 15       1        3 0.7980802
## 9        1        4 0.7898401
## 5        1        4 0.7802310
## 8        1        3 0.7557605
## 3        1        3 0.7404718
## 7        1        4 0.7327528
## 2        1        3 0.7306409
## 14       1        3 0.6714613
## 13       1        4 0.6341489
## 24       2        3 0.8455856
## 19       2        3 0.8445278
## 29       2        4 0.8355563
## 25       2        3 0.8355097
## 20       2        3 0.8301493
## 16       2        3 0.8223442
## 26       2        4 0.8176193
## 18       2        3 0.8146331
## 22       2        4 0.7831433
## 30       2        3 0.7830590
## 23       2        3 0.7673720
## 17       2        3 0.7579896
## 27       2        4 0.7103154
## 21       2        3 0.5245342
## 28       2        4 0.4837699
## 44       3        1 0.8680662
## 43       3        1 0.8679866
## 31       3        1 0.8656572
## 42       3        1 0.8594783
## 37       3        1 0.8544116
## 35       3        1 0.8460447
## 32       3        2 0.8444881
## 45       3        1 0.8096819
## 36       3        2 0.8019857
## 33       3        2 0.7731579
## 38       3        1 0.7640664
## 40       3        2 0.7613132
## 34       3        1 0.7587327
## 39       3        2 0.7555163
## 41       3        2 0.7469703
## 54       4        1 0.8627075
## 58       4        1 0.8609427
## 60       4        2 0.8523907
## 55       4        1 0.8446875
## 57       4        1 0.8319548
## 51       4        1 0.8268495
## 46       4        1 0.8251184
## 52       4        2 0.8248433
## 47       4        2 0.8235835
## 59       4        2 0.8095711
## 49       4        1 0.8005677
## 56       4        1 0.7959372
## 48       4        2 0.7823767
## 53       4        2 0.7687029
## 50       4        1 0.6321738
## Average silhouette width per cluster:
## [1] 0.7762022 0.7637406 0.8118371 0.8094938
## Average silhouette width of total data set:
## [1] 0.7903184
## 
## 1770 dissimilarities, summarized :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.05605 3.54800 5.07020 4.59570 6.20620 9.24430 
## Metric :  euclidean 
## Number of objects : 60
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
summary(pamD2)
## Medoids:
##      ID                     
## [1,]  9 0.5627189 -0.1360652
## [2,] 20 6.2662297  5.6887306
## [3,] 51 0.1766644  5.3909433
## [4,] 42 4.9811221 -2.1597856
## Clustering vector:
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 2 3 2 2 2 2 2 2 2 3 2 2 4 4 4 4 1 1 4
## [39] 4 4 2 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## Objective function:
##    build     swap 
## 2.239968 2.025144 
## 
## Numerical information per cluster:
##      size max_diss  av_diss diameter separation
## [1,]   16 4.434664 1.919371 6.846250  0.4675720
## [2,]   16 4.880355 2.150056 7.868150  0.9400962
## [3,]   17 4.008736 2.103055 7.246507  0.9400962
## [4,]   11 3.693022 1.876899 5.860279  0.4675720
## 
## Isolated clusters:
##  L-clusters: character(0)
##  L*-clusters: character(0)
## 
## Silhouette plot information:
##    cluster neighbor   sil_width
## 4        1        3  0.62493949
## 15       1        4  0.62319259
## 12       1        3  0.61804646
## 3        1        4  0.59948639
## 9        1        4  0.59471075
## 1        1        3  0.57842738
## 13       1        3  0.50138542
## 8        1        4  0.39980511
## 5        1        4  0.34500052
## 10       1        4  0.29064358
## 37       1        4  0.25234036
## 7        1        4  0.24755335
## 6        1        3  0.22991495
## 11       1        3  0.19835608
## 2        1        4  0.14158198
## 36       1        4 -0.05599211
## 20       2        3  0.64437271
## 26       2        3  0.63868345
## 19       2        3  0.62999255
## 24       2        3  0.62151018
## 25       2        3  0.60492529
## 16       2        3  0.56700186
## 23       2        3  0.50635572
## 18       2        3  0.50306908
## 28       2        3  0.49312709
## 30       2        3  0.44687110
## 22       2        4  0.42030601
## 27       2        3  0.37906766
## 14       2        3  0.26902379
## 41       2        3  0.26189858
## 17       2        3  0.25217888
## 31       2        4  0.24460127
## 59       3        2  0.60736306
## 51       3        2  0.59259495
## 53       3        1  0.56501123
## 57       3        2  0.55165181
## 49       3        1  0.54913064
## 54       3        1  0.54272364
## 60       3        1  0.52288945
## 48       3        2  0.51956540
## 56       3        1  0.47154532
## 50       3        1  0.34544079
## 21       3        2  0.29315596
## 58       3        2  0.28481801
## 46       3        2  0.26217857
## 47       3        2  0.18943647
## 55       3        2  0.18262066
## 52       3        1  0.04908075
## 29       3        2 -0.14236469
## 40       4        1  0.62007218
## 42       4        1  0.60821068
## 35       4        1  0.55535410
## 33       4        1  0.54088336
## 32       4        1  0.53145719
## 34       4        1  0.45168524
## 38       4        1  0.44088294
## 39       4        1  0.41655237
## 43       4        1  0.36896746
## 45       4        2  0.34418443
## 44       4        1  0.12390263
## Average silhouette width per cluster:
## [1] 0.3868370 0.4676866 0.3756966 0.4547411
## Average silhouette width of total data set:
## [1] 0.4176895
## 
## 1770 dissimilarities, summarized :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1815  3.7752  5.8956  5.9844  7.9648 15.0370 
## Metric :  euclidean 
## Number of objects : 60
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
plot(pamD1)

plot(pamD2)

table(D2full[,3], pamD2$clustering)
##    
##      1  2  3  4
##   1 14  1  0  0
##   2  0 13  2  0
##   3  2  2  0 11
##   4  0  0 15  0
Model Based Clustering using EM Algorithm
modD2 <- Mclust(D2full[,1:2], G = 4)
summary(modD2, parameters = TRUE)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EII (spherical, equal volume) model with 4 components: 
## 
##  log-likelihood  n df       BIC       ICL
##       -300.5249 60 12 -650.1819 -664.4548
## 
## Clustering table:
##  1  2  3  4 
## 11 17 17 15 
## 
## Mixing probabilities:
##         1         2         3         4 
## 0.1990388 0.2710637 0.2845334 0.2453641 
## 
## Means:
##            [,1]      [,2]     [,3]       [,4]
## [1,] -0.5868774  4.096657 5.051400 0.02478723
## [2,]  0.5165496 -1.352826 5.647688 5.53166212
## 
## Variances:
## [,,1]
##          [,1]     [,2]
## [1,] 2.901474 0.000000
## [2,] 0.000000 2.901474
## [,,2]
##          [,1]     [,2]
## [1,] 2.901474 0.000000
## [2,] 0.000000 2.901474
## [,,3]
##          [,1]     [,2]
## [1,] 2.901474 0.000000
## [2,] 0.000000 2.901474
## [,,4]
##          [,1]     [,2]
## [1,] 2.901474 0.000000
## [2,] 0.000000 2.901474
table(D2full[,3], modD2$classification)
##    
##      1  2  3  4
##   1 10  4  1  0
##   2  0  0 14  1
##   3  0 13  2  0
##   4  1  0  0 14
plot(modD2, what=c("classification"), main = "Model based Clustering using EM")

plot(modD2, "density")

Hiararchical Clustering
D2H <- as.data.frame(scale(D2full[,1:2]))
#summary(D2H)
dist_m <- dist(D2H, method = 'euclidean')
hcl_avg <- hclust(dist_m, method = 'average')
plot(hcl_avg)

dist_m <- dist(D2H, method = 'euclidean')
hcl_mcquitty <- hclust(dist_m, method = 'mcquitty')
plot(hcl_mcquitty)

c_avg <- cutree(hcl_avg, k = 4)
plot(hcl_avg)
rect.hclust(hcl_avg , k = 4, border = 2:6)
abline(h = 4, col = 'red')

c_avg <- cutree(hcl_avg, k = 4)
plot(hcl_mcquitty)
rect.hclust(hcl_mcquitty , k = 4, border = 2:6)
abline(h = 4, col = 'red')