This analysis applies hierarchical clustering to a simulated dataset of 50 observations with 5 variables using Ward’s method and Euclidean distance, visualizes the results with a dendrogram, and extracts cluster assignments.
To install these packages, use the following command in R: install.packages(‘package_name’).
library(questionr)
library(factoextra)
library(NbClust)
require(clValid)
## Loading required package: clValid
require(kableExtra)
## Loading required package: kableExtra
library(optpart)
## Loading required package: labdsv
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## This is labdsv 2.1-0
## convert existing ordinations with as.dsvord()
##
## Attaching package: 'labdsv'
## The following objects are masked from 'package:stats':
##
## density, loadings
## Loading required package: MASS
## Loading required package: plotrix
library(cluster)
library(FactoMineR)
library(devtools)
## Loading required package: usethis
library(WeightedCluster)
## Loading required package: TraMineR
##
## TraMineR stable version 2.2-7 (Built: 2023-04-17)
## Website: http://traminer.unige.ch
## Please type 'citation("TraMineR")' for citation information.
## This is WeightedCluster stable version 1.6-4 (Built: 2023-07-26)
##
## To get the manuals, please run:
## vignette("WeightedCluster") ## Complete manual in English
## vignette("WeightedClusterFR") ## Complete manual in French
## vignette("WeightedClusterPreview") ## Short preview in English
##
## To cite WeightedCluster in publications please use:
## Studer, Matthias (2013). WeightedCluster Library Manual: A practical
## guide to creating typologies of trajectories in the social sciences
## with R. LIVES Working Papers, 24. doi:
## 10.12682/lives.2296-1658.2013.24
library(dendextend)
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(ggplot2)
library(ggdendro)
##
## Attaching package: 'ggdendro'
## The following object is masked from 'package:dendextend':
##
## theme_dendro
require(fastcluster)
## Loading required package: fastcluster
##
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
##
## hclust
Generate a dataset with 50 observations and 5 variables.
set.seed(123)
n <- 50
p <- 5
data <- data.frame(
Var1 = rnorm(n, mean = 10, sd = 2),
Var2 = rnorm(n, mean = 15, sd = 3),
Var3 = rnorm(n, mean = 20, sd = 4),
Var4 = rnorm(n, mean = 25, sd = 2.5),
Var5 = rnorm(n, mean = 30, sd = 3.5)
)
rownames(data) <- paste("Ind", 1:n, sep = "_")
str(data)
## 'data.frame': 50 obs. of 5 variables:
## $ Var1: num 8.88 9.54 13.12 10.14 10.26 ...
## $ Var2: num 15.8 14.9 14.9 19.1 14.3 ...
## $ Var3: num 17.2 21 19 18.6 16.2 ...
## $ Var4: num 27 26.9 25.8 22.5 24.7 ...
## $ Var5: num 37.7 34.6 29.1 31.9 28.5 ...
Compute Euclidean distance and apply Ward’s method to create and visualize a dendrogram.
dd <- dist(scale(data), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
plot(hc, main = "Dendrogram", cex = 0.5, xlab = "", sub = "", hang = -1, las = 1)
rect.hclust(hc, k = 2, border = 1:3)
Determine the optimal number of clusters using NbClust and visualize.
diss_matrix <- dist(data, method = "euclidean", diag = FALSE)
res <- NbClust(data, diss = diss_matrix, distance = NULL, min.nc = 2, max.nc = 6,
method = "ward.D", index = "ch")
table(res$Best.partition)
##
## 1 2 3
## 8 23 19
library(factoextra)
my_data <- scale(data)
x11(); fviz_nbclust(my_data, kmeans, method = "gap_stat")
# Commented-out custom function (ensure it’s not executed unintentionally)
# best.cutree <-
# function(hc, min = 3, max = 20, loss = FALSE, graph = FALSE, ...) {
# if (!inherits(hc, "hclust")) hc <- as.hclust(hc)
# max <- min(max, length(hc$height))
# inert.gain <- rev(hc$height)
# intra <- rev(cumsum(rev(inert.gain)))
# relative.loss <- intra[min:(max)] / intra[(min - 1):(max - 1)]
# best <- which.min(relative.loss)
# names(relative.loss) <- min:max
# if (graph) {
# temp <- relative.loss
# temp[best] <- NA
# best2 <- which.min(temp)
# pch <- rep(1, max - min + 1)
# pch[best] <- 16
# pch[best2] <- 21
# plot(min:max, relative.loss, pch = pch, bg = "grey75", ...)
# }
# if (loss) {
# relative.loss
# } else {
# best + min - 1
# }
# }
# source(url("https://raw.githubusercontent.com/larmarange/JLutils/master/R/clustering.R"))
# best.cutree(hclust(dd, method = "ward.D2"))
Visualize the dendrogram with 2 clusters using factoextra.
library(factoextra)
fviz_dend(hc, k = 2, rect = TRUE, cex = 0.35,
k_colors = c("red", "blue"), # "green"
labels_track_height = 2,
ggtheme = ggplot2::theme_light()) +
ggplot2::theme(axis.text.x = element_text(face = "bold", color = "black", size = 11),
axis.text.y = element_text(face = "bold", color = "black", size = 11))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Perform PCA and hierarchical clustering to summarize cluster assignments.
kk <- 2
acp2 <- FactoMineR::PCA(data[complete.cases(data), ], ncp = kk, graph = FALSE)
cah <- FactoMineR::HCPC(acp2, graph = FALSE)
table(cah$data.clust$clust)
##
## 1 2 3
## 18 21 11
questionr::freq(cah$data.clust$clust, valid = TRUE)
## n % val%
## 1 18 36 36
## 2 21 42 42
## 3 11 22 22
Extract and save the third cluster to a file.
groups <- cutree(hc, k = kk)
table(groups)
## groups
## 1 2
## 9 41
G <- subset(groups, groups == 2)
write.table(G, "G1.txt", col.names = FALSE, row.names = TRUE)
getwd()
## [1] "C:/Abdi-Basid ADAN/22.Doc.Rpubs/1 Rpubs HCA/English"