Données
set.seed(123)
n <- 50
p <- 5
data <- data.frame(
Var1 = rnorm(n, mean = 10, sd = 2),
Var2 = rnorm(n, mean = 15, sd = 3),
Var3 = rnorm(n, mean = 20, sd = 4),
Var4 = rnorm(n, mean = 25, sd = 2.5),
Var5 = rnorm(n, mean = 30, sd = 3.5)
)
rownames(data) <- paste("Ind", 1:n, sep = "_")
str(data)
## 'data.frame': 50 obs. of 5 variables:
## $ Var1: num 8.88 9.54 13.12 10.14 10.26 ...
## $ Var2: num 15.8 14.9 14.9 19.1 14.3 ...
## $ Var3: num 17.2 21 19 18.6 16.2 ...
## $ Var4: num 27 26.9 25.8 22.5 24.7 ...
## $ Var5: num 37.7 34.6 29.1 31.9 28.5 ...
Clustering hiérarchique
dd <- dist(scale(data), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
plot(hc, main = "Dendrogramme", cex = 0.5, xlab = "", sub = "", hang = -1, las = 1)
rect.hclust(hc, k = 3, border = 1:3)

Nombre optimal de clusters
#opt_nc <- NbClust(data, diss = NULL, distance = "euclidean",
# min.nc = 2, max.nc = 6, method = "ward.D2",
# index = c("silhouette", "gap"), alphaBeale = 0.1)
#opt_nc <- NbClust(data, diss = NULL, distance = "euclidean",
# min.nc = 2, max.nc = 10, method = "ward.D2",
# index = "all", alphaBeale = 0.1)
Visualisation avancée du dendrogramme
library(factoextra)
fviz_dend(hc, k = 3, rect = TRUE, cex = 0.35,
k_colors = c("red", "blue", "green"),
labels_track_height = 2,
ggtheme = ggplot2::theme_light()) +
ggplot2::theme(axis.text.x = element_text(face = "bold", color = "black", size = 11),
axis.text.y = element_text(face = "bold", color = "black", size = 11))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Résultats
kk <- 3
acp2 <- FactoMineR::PCA(data[complete.cases(data), ], ncp = kk, graph = FALSE)
cah <- FactoMineR::HCPC(acp2, graph = FALSE)
table(cah$data.clust$clust)
##
## 1 2 3 4
## 18 10 13 9
questionr::freq(cah$data.clust$clust, valid = TRUE)
## n % val%
## 1 18 36 36
## 2 10 20 20
## 3 13 26 26
## 4 9 18 18