Tà i liệu phục vụ Seminar TKUD26, chi tiết tại [tại đây] (https://sites.google.com/view/tkud/home?authuser=1)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(cluster) # clustering algorithms
library(factoextra) # clustering algorithms & visualization## Warning: package 'factoextra' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
## [1] 2.703754 2.293520 1.289810 3.263110 2.651067 3.215297
## List of 9
## $ cluster : Named int [1:50] 2 2 2 1 2 2 1 1 2 2 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ centers : num [1:2, 1:4] -0.67 1.005 -0.676 1.014 -0.132 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## $ totss : num 196
## $ withinss : num [1:2] 56.1 46.7
## $ tot.withinss: num 103
## $ betweenss : num 93.1
## $ size : int [1:2] 30 20
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
## K-means clustering with 2 clusters of sizes 30, 20
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 -0.669956 -0.6758849 -0.1317235 -0.5646433
## 2 1.004934 1.0138274 0.1975853 0.8469650
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 2 2 2 1 2
## Colorado Connecticut Delaware Florida Georgia
## 2 1 1 2 2
## Hawaii Idaho Illinois Indiana Iowa
## 1 1 2 1 1
## Kansas Kentucky Louisiana Maine Maryland
## 1 1 2 1 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 1 2 1 2 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 2 1 1
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 2 1 1
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 1 1 1 1 2
## South Dakota Tennessee Texas Utah Vermont
## 1 2 2 1 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 56.11445 46.74796
## (between_SS / total_SS = 47.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
## K-means clustering with 3 clusters of sizes 13, 20, 17
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 -0.9615407 -1.1066010 -0.9301069 -0.9667633
## 2 1.0049340 1.0138274 0.1975853 0.8469650
## 3 -0.4469795 -0.3465138 0.4788049 -0.2571398
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 2 2 2 3 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 2
## Hawaii Idaho Illinois Indiana Iowa
## 3 1 2 3 1
## Kansas Kentucky Louisiana Maine Maryland
## 3 1 2 1 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 1 2 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 2 1 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 2 1 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 2
## South Dakota Tennessee Texas Utah Vermont
## 1 2 2 3 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 1 1 3
##
## Within cluster sum of squares by cluster:
## [1] 11.95246 46.74796 19.62285
## (between_SS / total_SS = 60.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Arrest %>%
as_tibble() %>%
mutate(cluster = k2$cluster,
state = row.names(USArrests)) %>%
ggplot(aes(UrbanPop, Murder, color = factor(cluster), label = state)) +
geom_text()k3 <- kmeans(Arrest, centers = 3, nstart = 25)
k4 <- kmeans(Arrest, centers = 4, nstart = 25)
k5 <- kmeans(Arrest, centers = 5, nstart = 25)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = Arrest) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = Arrest) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = Arrest) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = Arrest) + ggtitle("k = 5")
library(gridExtra)##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(100)
# function to compute total within-cluster sum of square (Wss)
wss <- function(k) {
kmeans(Arrest, k, nstart = 10 )$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15
# extract wss for 2-15 clusters
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")## Warning: package 'nFactors' was built under R version 4.0.5
## Loading required package: lattice
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
## eigen() decomposition
## $values
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
##
## $vectors
## [,1] [,2] [,3] [,4]
## [1,] -0.5358995 0.4181809 -0.3412327 0.64922780
## [2,] -0.5831836 0.1879856 -0.2681484 -0.74340748
## [3,] -0.2781909 -0.8728062 -0.3780158 0.13387773
## [4,] -0.5434321 -0.1673186 0.8177779 0.08902432
ap <- parallel(subject=nrow(Arrest),var=ncol(Arrest),
rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)## num [1:50, 1:4] 1.2426 0.5079 0.0716 0.2323 0.2783 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## - attr(*, "scaled:center")= Named num [1:4] 7.79 170.76 65.54 21.23
## ..- attr(*, "names")= chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## - attr(*, "scaled:scale")= Named num [1:4] 4.36 83.34 14.47 9.37
## ..- attr(*, "names")= chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
# function to compute average silhouette for k clusters
avg_sil <- function(k) {
km.res <- kmeans(Arrest, centers = k, nstart = 25)
ss <- silhouette(km.res$cluster, dist(Arrest))
mean(ss[, 3])
}
# Compute and plot wss for k = 2 to k = 15
k.values <- 2:15
# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouettes")# compute gap statistic
set.seed(100)
gap_stat <- clusGap(Arrest, FUN = kmeans, nstart = 25,
K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = Arrest, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 4
## logW E.logW gap SE.sim
## [1,] 3.458369 3.630860 0.1724909 0.03625722
## [2,] 3.135112 3.359114 0.2240025 0.03543637
## [3,] 2.977727 3.224930 0.2472034 0.03342621
## [4,] 2.826221 3.109910 0.2836891 0.03615581
## [5,] 2.738868 3.011982 0.2731140 0.03645021
## [6,] 2.666967 2.926076 0.2591085 0.03602029
## [7,] 2.612957 2.846616 0.2336583 0.03447295
## [8,] 2.545027 2.774382 0.2293546 0.03397513
## [9,] 2.468162 2.706808 0.2386457 0.03383600
## [10,] 2.397200 2.640321 0.2431213 0.03479945
## Warning: package 'fpc' was built under R version 4.0.5
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
## setosa versicolor virginica
## 50 50 50
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## K-means clustering with 3 clusters of sizes 50, 62, 38
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
##
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
## K-means clustering with 4 clusters of sizes 8, 16, 13, 13
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.4118898 0.8743346 -0.8145211 0.01927104
## 2 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 3 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 4 0.6950701 1.0394414 0.7226370 1.27693964
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 4 4 1 4
## Colorado Connecticut Delaware Florida Georgia
## 4 2 2 4 1
## Hawaii Idaho Illinois Indiana Iowa
## 2 3 4 2 3
## Kansas Kentucky Louisiana Maine Maryland
## 2 3 1 3 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 4 3 1 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 3 3 4 3 2
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 1 3 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 1
## South Dakota Tennessee Texas Utah Vermont
## 3 1 4 2 3
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 8.316061 16.212213 11.952463 19.922437
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
## # A tibble: 4 x 5
## Cluster Murder Assault UrbanPop Rape
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 13.9 244. 53.8 21.4
## 2 2 5.66 139. 73.9 18.8
## 3 3 3.6 78.5 52.1 12.2
## 4 4 10.8 257. 76 33.2
##K-Medoids with US Arrest
## Medoids:
## ID Murder Assault UrbanPop Rape
## New Mexico 31 0.8292944 1.3708088 0.3081225 1.1603196
## Nebraska 27 -0.8008247 -0.8250772 -0.2445636 -0.5052109
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 1 1 2 1
## Colorado Connecticut Delaware Florida Georgia
## 1 2 2 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 2 2 1 2 2
## Kansas Kentucky Louisiana Maine Maryland
## 2 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 2
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 2 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 2 2 2
## Objective function:
## build swap
## 1.441358 1.368969
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
## Murder Assault UrbanPop Rape cluster
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473 1
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941 1
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388 1
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602 2
## California 0.27826823 1.2628144 1.7589234 2.067820292 1
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207 1
## Connecticut -1.03041900 -0.7290821 0.7917228 -1.081740768 2
## Delaware -0.43347395 0.8068381 0.4462940 -0.579946294 2
## Florida 1.74767144 1.9707777 0.9989801 1.138966691 1
## Georgia 2.20685994 0.4828549 -0.3827351 0.487701523 1
## Murder Assault UrbanPop Rape
## New Mexico 0.8292944 1.3708088 0.3081225 1.1603196
## Nebraska -0.8008247 -0.8250772 -0.2445636 -0.5052109
## Alabama Alaska Arizona Arkansas California Colorado
## 1 1 1 2 1 1
## Medoids:
## ID Murder Assault UrbanPop Rape
## Alabama 1 1.2425641 0.7828393 -0.5209066 -0.003416473
## Michigan 22 0.9900104 1.0108275 0.5844655 1.480613993
## Oklahoma 36 -0.2727580 -0.2371077 0.1699510 -0.131534211
## New Hampshire 29 -1.3059321 -1.3650491 -0.6590781 -1.252564419
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 2 2 1 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 4 2 3 4
## Kansas Kentucky Louisiana Maine Maryland
## 3 3 1 4 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 4 1 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 3 3 2 4 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 1 4 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 4 1 2 3 4
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 4 4 3
## Objective function:
## build swap
## 1.035116 1.027102
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
## Murder Assault UrbanPop Rape
## Alabama 1.2425641 0.7828393 -0.5209066 -0.003416473
## Michigan 0.9900104 1.0108275 0.5844655 1.480613993
## Oklahoma -0.2727580 -0.2371077 0.1699510 -0.131534211
## New Hampshire -1.3059321 -1.3650491 -0.6590781 -1.252564419
## Alabama Alaska Arizona Arkansas California Colorado
## 1 2 2 1 2 2
## import data
USA_clara <- USArrests
## run CLARA
clarax <- clara(USA_clara[1:4], 3)
## print components of clarax
print(clarax)## Call: clara(x = USA_clara[1:4], k = 3)
## Medoids:
## Murder Assault UrbanPop Rape
## Michigan 12.1 255 74 35.1
## Missouri 9.0 178 70 28.2
## Nebraska 4.3 102 62 16.5
## Objective function: 29.31019
## Clustering vector: Named int [1:50] 1 1 1 2 1 2 3 1 1 2 3 3 1 3 3 3 3 1 ...
## - attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" "California" "Colorado" "Connecticut" ...
## Cluster sizes: 16 14 20
## Best sample:
## [1] Alabama Alaska Arizona Arkansas California
## [6] Colorado Delaware Florida Georgia Idaho
## [11] Illinois Indiana Iowa Kansas Kentucky
## [16] Louisiana Maine Maryland Massachusetts Michigan
## [21] Minnesota Mississippi Missouri Montana Nebraska
## [26] Nevada New Hampshire New York North Carolina North Dakota
## [31] Ohio Oklahoma Oregon Pennsylvania Rhode Island
## [36] South Carolina South Dakota Tennessee Texas Utah
## [41] Vermont Virginia Washington West Virginia Wisconsin
## [46] Wyoming
##
## Available components:
## [1] "sample" "medoids" "i.med" "clustering" "objective"
## [6] "clusinfo" "diss" "call" "silinfo" "data"
## plot clusters
plot(USA_clara, col = clarax$cluster)
## plot centers
points(clarax$centers, col = 1:2, pch = 8)