Tài liệu phục vụ Seminar TKUD26, chi tiết tại [tại đây] (https://sites.google.com/view/tkud/home?authuser=1)

require(tidyverse)
library(fpc)
library(cluster)
data(iris)

1.The k-Means Clustering

Code - k-means clustering of iris data

irisB <- iris
names(irisB)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"
summary(irisB$Species)
##     setosa versicolor  virginica 
##         50         50         50
str(irisB)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
irisB$Species <- NULL
kmeans.result <- kmeans(irisB, 3)
kmeans.result
## K-means clustering with 3 clusters of sizes 38, 50, 62
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     6.850000    3.073684     5.742105    2.071053
## 2     5.006000    3.428000     1.462000    0.246000
## 3     5.901613    2.748387     4.393548    1.433871
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [75] 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1 1 3 1 1 1 1
## [112] 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1 1 1 3 1 1 1 3 1
## [149] 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 23.87947 15.15100 39.82097
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

2.clustering result is then compared with the class label (Species)

table(iris$Species, kmeans.result$cluster)
##             
##               1  2  3
##   setosa      0 50  0
##   versicolor  2  0 48
##   virginica  36  0 14

3.clusters and their centers are plotted & plot cluster centers

plot(irisB[c("Sepal.Length", "Sepal.Width")], col = kmeans.result$cluster)
points(kmeans.result$centers[,c("Sepal.Length", "Sepal.Width")], col = 1:3, pch = 8, cex=2)

plot(irisB[c("Petal.Length", "Petal.Width")], col = kmeans.result$cluster)
points(kmeans.result$centers[,c("Petal.Length", "Petal.Width")], col = 1:3, pch = 8, cex=2)

4.The k-Medoids Clustering

library(fpc)
library(cluster)
pam.result <- pam(irisB, 3)
table(pam.result$clustering, iris$Species)
##    
##     setosa versicolor virginica
##   1     50          0         0
##   2      0         48        14
##   3      0          2        36
pam.result
## Medoids:
##       ID Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   8          5.0         3.4          1.5         0.2
## [2,]  79          6.0         2.9          4.5         1.5
## [3,] 113          6.8         3.0          5.5         2.1
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
## Objective function:
##     build      swap 
## 0.6709391 0.6542077 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
layout(matrix(c(1,2),1,2)) # 2 graphs per page
plot(pam.result)

#par(mfrow=c(2,2))
pam.result <- pam(irisB, 3)
table(pam.result$clustering, iris$Species)
##    
##     setosa versicolor virginica
##   1     50          0         0
##   2      0         48        14
##   3      0          2        36
pamk.result <- fpc::pamk(irisB)

pamk.result
## $pamobject
## Medoids:
##       ID Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   8          5.0         3.4          1.5         0.2
## [2,] 127          6.2         2.8          4.8         1.8
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2
## Objective function:
##     build      swap 
## 0.9901187 0.8622026 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"      
## 
## $nc
## [1] 2
## 
## $crit
##  [1] 0.0000000 0.6857882 0.5528190 0.4896972 0.4867481 0.4703951 0.3390116
##  [8] 0.3318516 0.2918520 0.2918482
str(pamk.result)
## List of 3
##  $ pamobject:List of 10
##   ..$ medoids   : num [1:2, 1:4] 5 6.2 3.4 2.8 1.5 4.8 0.2 1.8
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##   ..$ id.med    : int [1:2] 8 127
##   ..$ clustering: int [1:150] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ objective : Named num [1:2] 0.99 0.862
##   .. ..- attr(*, "names")= chr [1:2] "build" "swap"
##   ..$ isolation : Factor w/ 3 levels "no","L","L*": 1 1
##   .. ..- attr(*, "names")= chr [1:2] "1" "2"
##   ..$ clusinfo  : num [1:2, 1:5] 51 99 1.97 2.65 0.514 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:5] "size" "max_diss" "av_diss" "diameter" ...
##   ..$ silinfo   :List of 3
##   .. ..$ widths         : num [1:150, 1:3] 1 1 1 1 1 1 1 1 1 1 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:150] "8" "50" "1" "40" ...
##   .. .. .. ..$ : chr [1:3] "cluster" "neighbor" "sil_width"
##   .. ..$ clus.avg.widths: num [1:2] 0.81 0.622
##   .. ..$ avg.width      : num 0.686
##   ..$ diss      : NULL
##   ..$ call      : language pam(x = sdata, k = k, diss = diss)
##   ..$ data      : num [1:150, 1:4] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##   ..- attr(*, "class")= chr [1:2] "pam" "partition"
##  $ nc       : int 2
##  $ crit     : num [1:10] 0 0.686 0.553 0.49 0.487 ...
pamk.result$pamobject$medoids
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]          5.0         3.4          1.5         0.2
## [2,]          6.2         2.8          4.8         1.8

5.Check clustering against actual species

table(pamk.result$pamobject$clustering, iris$Species)
##    
##     setosa versicolor virginica
##   1     50          1         0
##   2      0         49        50
layout(matrix(c(1,2),1,2)) # 2 graphs per page
plot(pamk.result$pamobject)

DQotLS0NCnRpdGxlOiAiQ2x1c3RlciBrLW1lYW4gay1tZWRvaWQiDQphdXRob3I6ICJEYW8gVGhpIFRoYW5oIEJpbmggLSBIQU5VIg0KZGF0ZTogIjI0IE5PViAyMDIxIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIGNvZGVfZG93bmxvYWQ6IHllcw0KICAgIGNvZGVfZm9sZGluZzogaGlkZQ0KICAgIGhpZ2hsaWdodDogcHlnbWVudHMNCiAgICB0aGVtZTogZmxhdGx5DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZmxvYXQ6IHllcw0KICB3b3JkX2RvY3VtZW50Og0KICAgIHRvYzogeWVzDQotLS0NCg0KDQpgYGB7ciBzZXR1cCxpbmNsdWRlPUZBTFNFfQ0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSkNCmBgYA0KDQpUw6BpIGxp4buHdSBwaOG7pWMgduG7pSBTZW1pbmFyIFRLVUQyNiwgY2hpIHRp4bq/dCB04bqhaSBbdOG6oWkgxJHDonldDQooaHR0cHM6Ly9zaXRlcy5nb29nbGUuY29tL3ZpZXcvdGt1ZC9ob21lP2F1dGh1c2VyPTEpDQoNCmBgYHtyfQ0KcmVxdWlyZSh0aWR5dmVyc2UpDQpsaWJyYXJ5KGZwYykNCmxpYnJhcnkoY2x1c3RlcikNCmBgYA0KDQpgYGB7cn0NCmRhdGEoaXJpcykNCg0KDQpgYGANCg0KDQojIyAxLlRoZSBrLU1lYW5zIENsdXN0ZXJpbmcNCg0KQ29kZSAtIGstbWVhbnMgY2x1c3RlcmluZyBvZiBpcmlzIGRhdGENCmBgYHtyfQ0KaXJpc0IgPC0gaXJpcw0KbmFtZXMoaXJpc0IpDQoNCnN1bW1hcnkoaXJpc0IkU3BlY2llcykNCnN0cihpcmlzQikNCmlyaXNCJFNwZWNpZXMgPC0gTlVMTA0KDQpgYGANCg0KYGBge3J9DQoNCmttZWFucy5yZXN1bHQgPC0ga21lYW5zKGlyaXNCLCAzKQ0Ka21lYW5zLnJlc3VsdA0KYGBgDQoNCiMjIDIuY2x1c3RlcmluZyByZXN1bHQgaXMgdGhlbiBjb21wYXJlZCB3aXRoIHRoZSBjbGFzcyBsYWJlbCAoU3BlY2llcykNCmBgYHtyfQ0KdGFibGUoaXJpcyRTcGVjaWVzLCBrbWVhbnMucmVzdWx0JGNsdXN0ZXIpDQoNCmBgYA0KDQoNCiMjIDMuY2x1c3RlcnMgYW5kIHRoZWlyIGNlbnRlcnMgYXJlIHBsb3R0ZWQgJiBwbG90IGNsdXN0ZXIgY2VudGVycw0KYGBge3J9DQpwbG90KGlyaXNCW2MoIlNlcGFsLkxlbmd0aCIsICJTZXBhbC5XaWR0aCIpXSwgY29sID0ga21lYW5zLnJlc3VsdCRjbHVzdGVyKQ0KcG9pbnRzKGttZWFucy5yZXN1bHQkY2VudGVyc1ssYygiU2VwYWwuTGVuZ3RoIiwgIlNlcGFsLldpZHRoIildLCBjb2wgPSAxOjMsIHBjaCA9IDgsIGNleD0yKQ0KYGBgDQoNCmBgYHtyfQ0KcGxvdChpcmlzQltjKCJQZXRhbC5MZW5ndGgiLCAiUGV0YWwuV2lkdGgiKV0sIGNvbCA9IGttZWFucy5yZXN1bHQkY2x1c3RlcikNCnBvaW50cyhrbWVhbnMucmVzdWx0JGNlbnRlcnNbLGMoIlBldGFsLkxlbmd0aCIsICJQZXRhbC5XaWR0aCIpXSwgY29sID0gMTozLCBwY2ggPSA4LCBjZXg9MikNCmBgYA0KDQoNCiMjIDQuVGhlIGstTWVkb2lkcyBDbHVzdGVyaW5nDQpgYGB7cn0NCmxpYnJhcnkoZnBjKQ0KbGlicmFyeShjbHVzdGVyKQ0KYGBgDQoNCg0KYGBge3J9DQpwYW0ucmVzdWx0IDwtIHBhbShpcmlzQiwgMykNCnRhYmxlKHBhbS5yZXN1bHQkY2x1c3RlcmluZywgaXJpcyRTcGVjaWVzKQ0KcGFtLnJlc3VsdA0KYGBgDQoNCmBgYHtyfQ0KbGF5b3V0KG1hdHJpeChjKDEsMiksMSwyKSkgIyAyIGdyYXBocyBwZXIgcGFnZQ0KcGxvdChwYW0ucmVzdWx0KQ0KYGBgDQoNCmBgYHtyfQ0KI3BhcihtZnJvdz1jKDIsMikpDQpwYW0ucmVzdWx0IDwtIHBhbShpcmlzQiwgMykNCnRhYmxlKHBhbS5yZXN1bHQkY2x1c3RlcmluZywgaXJpcyRTcGVjaWVzKQ0KYGBgDQoNCmBgYHtyfQ0KDQpwYW1rLnJlc3VsdCA8LSBmcGM6OnBhbWsoaXJpc0IpDQoNCnBhbWsucmVzdWx0DQpzdHIocGFtay5yZXN1bHQpDQoNCmBgYA0KDQpgYGB7cn0NCnBhbWsucmVzdWx0JHBhbW9iamVjdCRtZWRvaWRzDQoNCmBgYA0KDQojIyA1LkNoZWNrIGNsdXN0ZXJpbmcgYWdhaW5zdCBhY3R1YWwgc3BlY2llcw0KYGBge3J9DQp0YWJsZShwYW1rLnJlc3VsdCRwYW1vYmplY3QkY2x1c3RlcmluZywgaXJpcyRTcGVjaWVzKQ0KDQpgYGANCg0KDQoNCg0KYGBge3J9DQpsYXlvdXQobWF0cml4KGMoMSwyKSwxLDIpKSAjIDIgZ3JhcGhzIHBlciBwYWdlDQpwbG90KHBhbWsucmVzdWx0JHBhbW9iamVjdCkNCg0KYGBgDQoNCg0KDQo=