Hierarchical Cluster Analysis with Dendrogram

Objective Description

This analysis applies hierarchical clustering to a simulated dataset of 50 observations with 5 variables using Ward’s method and Euclidean distance, visualizes the results with a dendrogram, and extracts cluster assignments.

Packages

To install these packages, use the following command in R: install.packages(‘package_name’).

library(questionr)
library(factoextra)
library(NbClust)
require(clValid)

## Loading required package: clValid

require(kableExtra)

## Loading required package: kableExtra

library(optpart)

## Loading required package: labdsv

## Loading required package: mgcv

## Loading required package: nlme

## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.

## This is labdsv 2.1-0
## convert existing ordinations with as.dsvord()

## 
## Attaching package: 'labdsv'

## The following objects are masked from 'package:stats':
## 
##     density, loadings

## Loading required package: MASS

## Loading required package: plotrix

library(cluster)
library(FactoMineR)
library(devtools)

## Loading required package: usethis

library(WeightedCluster)

## Loading required package: TraMineR

## 
## TraMineR stable version 2.2-7 (Built: 2023-04-17)

## Website: http://traminer.unige.ch

## Please type 'citation("TraMineR")' for citation information.

## This is WeightedCluster stable version 1.6-4 (Built: 2023-07-26)

## 
## To get the manuals, please run:

##    vignette("WeightedCluster") ## Complete manual in English

##    vignette("WeightedClusterFR") ## Complete manual in French

##    vignette("WeightedClusterPreview") ## Short preview in English

## 
## To cite WeightedCluster in publications please use:

## Studer, Matthias (2013). WeightedCluster Library Manual: A practical

##    guide to creating typologies of trajectories in the social sciences

##    with R. LIVES Working Papers, 24. doi:

##    10.12682/lives.2296-1658.2013.24

library(dendextend)

## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan

## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library(ggplot2)
library(ggdendro)

## 
## Attaching package: 'ggdendro'

## The following object is masked from 'package:dendextend':
## 
##     theme_dendro

require(fastcluster)

## Loading required package: fastcluster

## 
## Attaching package: 'fastcluster'

## The following object is masked from 'package:stats':
## 
##     hclust

Data

Generate a dataset with 50 observations and 5 variables.

set.seed(123)
n <- 50
p <- 5
data <- data.frame(
  Var1 = rnorm(n, mean = 10, sd = 2),
  Var2 = rnorm(n, mean = 15, sd = 3),
  Var3 = rnorm(n, mean = 20, sd = 4),
  Var4 = rnorm(n, mean = 25, sd = 2.5),
  Var5 = rnorm(n, mean = 30, sd = 3.5)
)
rownames(data) <- paste("Ind", 1:n, sep = "_")
str(data)

## 'data.frame':    50 obs. of  5 variables:
##  $ Var1: num  8.88 9.54 13.12 10.14 10.26 ...
##  $ Var2: num  15.8 14.9 14.9 19.1 14.3 ...
##  $ Var3: num  17.2 21 19 18.6 16.2 ...
##  $ Var4: num  27 26.9 25.8 22.5 24.7 ...
##  $ Var5: num  37.7 34.6 29.1 31.9 28.5 ...

Hierarchical Clustering

Compute Euclidean distance and apply Ward’s method to create and visualize a dendrogram.

dd <- dist(scale(data), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
plot(hc, main = "Dendrogram", cex = 0.5, xlab = "", sub = "", hang = -1, las = 1)
rect.hclust(hc, k = 2, border = 1:3)

Optimal Number of Clusters

Determine the optimal number of clusters using NbClust and visualize.

diss_matrix <- dist(data, method = "euclidean", diag = FALSE)
res <- NbClust(data, diss = diss_matrix, distance = NULL, min.nc = 2, max.nc = 6, 
               method = "ward.D", index = "ch") 
table(res$Best.partition)

## 
##  1  2  3 
##  8 23 19

library(factoextra)
my_data <- scale(data)
x11(); fviz_nbclust(my_data, kmeans, method = "gap_stat")

# Commented-out custom function (ensure it’s not executed unintentionally)
# best.cutree <-
# function(hc, min = 3, max = 20, loss = FALSE, graph = FALSE, ...) {
#   if (!inherits(hc, "hclust")) hc <- as.hclust(hc)
#   max <- min(max, length(hc$height))
#   inert.gain <- rev(hc$height)
#   intra <- rev(cumsum(rev(inert.gain)))
#   relative.loss <- intra[min:(max)] / intra[(min - 1):(max - 1)]
#   best <- which.min(relative.loss)
#   names(relative.loss) <- min:max
#   if (graph) {
#     temp <- relative.loss
#     temp[best] <- NA
#     best2 <- which.min(temp)
#     pch <- rep(1, max - min + 1)
#     pch[best] <- 16
#     pch[best2] <- 21
#     plot(min:max, relative.loss, pch = pch, bg = "grey75", ...)
#   }
#   if (loss) {
#     relative.loss
#   } else {
#     best + min - 1
#   }
# }
# source(url("https://raw.githubusercontent.com/larmarange/JLutils/master/R/clustering.R"))
# best.cutree(hclust(dd, method = "ward.D2"))

Enhanced Dendrogram

Visualize the dendrogram with 2 clusters using factoextra.

library(factoextra)
fviz_dend(hc, k = 2, rect = TRUE, cex = 0.35,
          k_colors = c("red", "blue"),  # "green"
          labels_track_height = 2,
          ggtheme = ggplot2::theme_light()) +
  ggplot2::theme(axis.text.x = element_text(face = "bold", color = "black", size = 11),
                 axis.text.y = element_text(face = "bold", color = "black", size = 11))

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Results

Perform PCA and hierarchical clustering to summarize cluster assignments.

kk <- 2
acp2 <- FactoMineR::PCA(data[complete.cases(data), ], ncp = kk, graph = FALSE)
cah <- FactoMineR::HCPC(acp2, graph = FALSE)
table(cah$data.clust$clust)

## 
##  1  2  3 
## 18 21 11

questionr::freq(cah$data.clust$clust, valid = TRUE)

##    n  % val%
## 1 18 36   36
## 2 21 42   42
## 3 11 22   22

Cluster Extraction

Extract and save the third cluster to a file.

groups <- cutree(hc, k = kk)
table(groups)

## groups
##  1  2 
##  9 41

G <- subset(groups, groups == 2)
write.table(G, "G1.txt", col.names = FALSE, row.names = TRUE)
getwd()

## [1] "C:/Abdi-Basid ADAN/22.Doc.Rpubs/1 Rpubs HCA/English"