K-Meaning Analysis in R
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.1
## Warning: package 'forcats' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# checkign the dataset
df <- USArrests
head(df)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
# remove missing value
df <- na.omit(df)
# scaling/standardizing the dataset
df <- scale(df)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
# calculate distance
distance <- get_dist(df)
# visualizing
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
k2 <- kmeans(df, centers = 2, nstart = 25)
str(k2)
## List of 9
## $ cluster : Named int [1:50] 1 1 1 2 1 1 2 2 1 1 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ centers : num [1:2, 1:4] 1.005 -0.67 1.014 -0.676 0.198 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## $ totss : num 196
## $ withinss : num [1:2] 46.7 56.1
## $ tot.withinss: num 103
## $ betweenss : num 93.1
## $ size : int [1:2] 20 30
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
k2
## K-means clustering with 2 clusters of sizes 20, 30
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.004934 1.0138274 0.1975853 0.8469650
## 2 -0.669956 -0.6758849 -0.1317235 -0.5646433
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 1 1 2 1
## Colorado Connecticut Delaware Florida Georgia
## 1 2 2 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 2 2 1 2 2
## Kansas Kentucky Louisiana Maine Maryland
## 2 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 2
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 2 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 46.74796 56.11445
## (between_SS / total_SS = 47.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(k2, data = df)
df %>%
as_tibble() %>%
mutate(cluster = k2$cluster,
state = row.names(USArrests)) %>%
ggplot(aes(UrbanPop, Murder, color=factor(cluster), label = state)) +
geom_text()
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3, p4, nrow = 2)
# Elbow Method
set.seed(123)
# function to compute total within-cluster sum of square
wss <- function(k) {
kmeans(df, k, nstart = 10 )$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15
# extract wss for 2-15 clusters
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
# fviz_nbclust function
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")
# silhouette function
# function to compute average silhouette for k clusters
avg_sil <- function(k) {
km.res <- kmeans(df, centers = k, nstart = 25)
ss <- silhouette(km.res$cluster, dist(df))
mean(ss[, 3])
}
# Compute and plot wss for k = 2 to k = 15
k.values <- 2:15
# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouettes")
# fviz_nbclust function
fviz_nbclust(df, kmeans, method = "silhouette")
# Gap Statistic Method
# compute gap statistic
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25,
K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 4
## logW E.logW gap SE.sim
## [1,] 3.458369 3.640154 0.1817845 0.04422857
## [2,] 3.135112 3.372283 0.2371717 0.03559601
## [3,] 2.977727 3.233771 0.2560446 0.03749193
## [4,] 2.826221 3.119172 0.2929511 0.04067348
## [5,] 2.738868 3.019965 0.2810969 0.04185469
## [6,] 2.666967 2.930002 0.2630347 0.04105040
## [7,] 2.609895 2.852152 0.2422572 0.04184725
## [8,] 2.539156 2.778562 0.2394054 0.04292750
## [9,] 2.468162 2.711752 0.2435901 0.04344197
## [10,] 2.407265 2.647595 0.2403307 0.04548446
# visualization
fviz_gap_stat((gap_stat))
# extracting results
set.seed(123)
final <- kmeans(df,4,nstart = 25)
print(final)
## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.4118898 0.8743346 -0.8145211 0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 0.6950701 1.0394414 0.7226370 1.27693964
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 4 4 1 4
## Colorado Connecticut Delaware Florida Georgia
## 4 3 3 4 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 4 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 4
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 4 2 1 4
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 4 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 4 4 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 4 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 8.316061 11.952463 16.212213 19.922437
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# visualization
fviz_cluster(final, data = df)
USArrests %>%
mutate(Cluster = final$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
## # A tibble: 4 × 5
## Cluster Murder Assault UrbanPop Rape
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 13.9 244. 53.8 21.4
## 2 2 3.6 78.5 52.1 12.2
## 3 3 5.66 139. 73.9 18.8
## 4 4 10.8 257. 76 33.2
Hierarchical Analysis
library(tidyverse)
library(cluster)
library(factoextra)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
# check the data
df <- USArrests
# remove missing value
df <- na.omit(df)
# scaling
df <- scale(df)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
# agglomerative hierarchical clustering
# dissimilarity matrix
d <- dist(df, method = "euclidean")
# hierarchical clustering using complete linkage
hc1 <- hclust(d, method = "complete")
# plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)
# compute with agnes
hc2 <- agnes(df, method = "complete")
# agglomerative coefficient
hc2$ac
## [1] 0.8531583
# methods to assess
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
# function to compute coefficient
ac <- function(x) {
agnes(df, method = x) $ ac
}
map_dbl(m,ac)
## average single complete ward
## 0.7379371 0.6276128 0.8531583 0.9346210
# visualize the dendrogram (agnes)
hc3 <- agnes(df, method = "ward")
pltree(hc3, cex = 0.6, hang=-1, main = "Dendrogram of agnes")
# Divisive
# diana function
# compute divisive hierarchical clustering
hc4 <- diana(df)
# Divise coefficient; amount of clustering structure found
hc4$dc
## [1] 0.8514345
# plot dendrogram
pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")
# cut the dendrogram
# ward's method
hc5 <- hclust(d, method = "ward.D2")
# cut tree into 4 groups
sub_grp <- cutree(hc5, k=4)
# number of members in each cluster
table(sub_grp)
## sub_grp
## 1 2 3 4
## 7 12 19 12
USArrests %>%
mutate(cluster = sub_grp) %>%
head
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 1
## Alaska 10.0 263 48 44.5 2
## Arizona 8.1 294 80 31.0 2
## Arkansas 8.8 190 50 19.5 3
## California 9.0 276 91 40.6 2
## Colorado 7.9 204 78 38.7 2
plot(hc5, cex = 0.6)
rect.hclust(hc5, k = 4, border = 2:5)
# visualization with fviz_cluster function
fviz_cluster(list(data = df, cluster = sub_grp))
# Cut agnes() tree into 4 groups
hc_a <- agnes(df, method = "ward")
cutree(as.hclust(hc_a), k = 4)
## Alabama Alaska Arizona Arkansas California
## 1 2 2 3 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 4 2 3 4
## Kansas Kentucky Louisiana Maine Maryland
## 3 3 1 4 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 4 1 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 4 4 2 4 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 1 4 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 4 1 2 3 4
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 4 4 3
cat("\n\n")
# cut diana() tree into 4groups
hc_d <- diana(df)
cutree(as.hclust(hc_d), k = 4)
## Alabama Alaska Arizona Arkansas California
## 1 2 2 3 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 4 2 3 4
## Kansas Kentucky Louisiana Maine Maryland
## 3 4 1 4 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 4 1 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 4 4 2 4 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 1 4 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 4 1 2 3 4
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 4 4 3
# Compute distance matrix
res.dist <- dist(df, method = "euclidean")
# Compute 2 hierarchical clusterings
hc1 <- hclust(res.dist, method = "complete")
hc2 <- hclust(res.dist, method = "ward.D2")
# Create two dendrograms
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)
tanglegram(dend1, dend2)
## Loading required namespace: colorspace
dend_list <- dendlist(dend1, dend2)
tanglegram(dend1, dend2,
highlight_distinct_edges = FALSE, # Turn-off dashed lines
common_subtrees_color_lines = FALSE, # Turn-off line colors
common_subtrees_color_branches = TRUE, # Color common branches
main = paste("entanglement =", round(entanglement(dend_list), 2))
)
# optimal clusters
# elbow method
fviz_nbclust(df, FUN = hcut, method = "wss")
# average silhouette method
fviz_nbclust(df, FUN = hcut, method = "silhouette")
# gap statistic method
gap_stat <- clusGap(df, FUN=hcut, nstart = 25, K.max = 10, B=50)
fviz_gap_stat(gap_stat)