Hierarchical Cluster Analysis with Dendrogram

Données

set.seed(123)
n <- 50
p <- 5
data <- data.frame(
  Var1 = rnorm(n, mean = 10, sd = 2),
  Var2 = rnorm(n, mean = 15, sd = 3),
  Var3 = rnorm(n, mean = 20, sd = 4),
  Var4 = rnorm(n, mean = 25, sd = 2.5),
  Var5 = rnorm(n, mean = 30, sd = 3.5)
)
rownames(data) <- paste("Ind", 1:n, sep = "_")
str(data)

## 'data.frame':    50 obs. of  5 variables:
##  $ Var1: num  8.88 9.54 13.12 10.14 10.26 ...
##  $ Var2: num  15.8 14.9 14.9 19.1 14.3 ...
##  $ Var3: num  17.2 21 19 18.6 16.2 ...
##  $ Var4: num  27 26.9 25.8 22.5 24.7 ...
##  $ Var5: num  37.7 34.6 29.1 31.9 28.5 ...

Clustering hiérarchique

dd <- dist(scale(data), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
plot(hc, main = "Dendrogramme", cex = 0.5, xlab = "", sub = "", hang = -1, las = 1)
rect.hclust(hc, k = 3, border = 1:3)

Nombre optimal de clusters

#opt_nc <- NbClust(data, diss = NULL, distance = "euclidean",
#                  min.nc = 2, max.nc = 6, method = "ward.D2",
#                  index = c("silhouette", "gap"), alphaBeale = 0.1)
#opt_nc <- NbClust(data, diss = NULL, distance = "euclidean",
#                  min.nc = 2, max.nc = 10, method = "ward.D2",
#                  index = "all", alphaBeale = 0.1)

Visualisation avancée du dendrogramme

library(factoextra)
fviz_dend(hc, k = 3, rect = TRUE, cex = 0.35,
          k_colors = c("red", "blue", "green"),
          labels_track_height = 2,
          ggtheme = ggplot2::theme_light()) +
  ggplot2::theme(axis.text.x = element_text(face = "bold", color = "black", size = 11),
                 axis.text.y = element_text(face = "bold", color = "black", size = 11))

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Résultats

kk <- 3
acp2 <- FactoMineR::PCA(data[complete.cases(data), ], ncp = kk, graph = FALSE)
cah <- FactoMineR::HCPC(acp2, graph = FALSE)
table(cah$data.clust$clust)

## 
##  1  2  3  4 
## 18 10 13  9

questionr::freq(cah$data.clust$clust, valid = TRUE)

##    n  % val%
## 1 18 36   36
## 2 10 20   20
## 3 13 26   26
## 4  9 18   18

Extraction des groupes

groups <- cutree(hc, k = kk)
table(groups)

## groups
##  1  2  3 
##  9 25 16

G <- subset(groups, groups == 3)
write.table(G, "G1.txt", col.names = FALSE, row.names = TRUE)
getwd()

## [1] "C:/Users/hp/Downloads"

Hierarchical Cluster Analysis with Dendrogram

Abdi-Basid ADAN

Données

Clustering hiérarchique

Nombre optimal de clusters

Visualisation avancée du dendrogramme

Résultats

Extraction des groupes