library(ggplot2)
## Warning: le package 'ggplot2' a été compilé avec la version R 4.4.3
library(cluster)
library(tidyverse) # Pour la manipulation des données
## Warning: le package 'tidyverse' a été compilé avec la version R 4.4.3
## Warning: le package 'readr' a été compilé avec la version R 4.4.3
## Warning: le package 'dplyr' a été compilé avec la version R 4.4.3
## Warning: le package 'stringr' a été compilé avec la version R 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.6.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(formattable) # Pour la visualisation du tableau coloré
## Warning: le package 'formattable' a été compilé avec la version R 4.4.3
library(factoextra) # Pour la fonction fviz_cluster
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(formattable)
Context: Piranhas (Serrasalmus sp.) are known to exhibit a variety of behaviours. Although they are often associated with aggressive behaviour, they can also show fear or flight responses. Behavioural traits were measured in 150 individuals to identify personality traits/syndromes.
Aim:
Clustering on individuals to separate them into groups to identify personality traits
Determine how many personality traits can be identified
# Load the behavioural measurements. read_csv2() is the reader for
# semicolon-delimited files that use a comma as the decimal mark.
fish_behaviour <- read_csv2(file = "fish_behaviour.csv")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 150 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (7): Fish_ID, Average_activity, Hiding_time, Social_interactions, Observ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style data viewer only when running interactively:
# View() needs a GUI and serves no purpose when the script is rendered or run
# in batch mode.
if (interactive()) {
  View(fish_behaviour)
}
# Per-column summary statistics (min, quartiles, mean, max) to check the
# ranges of the behavioural variables before clustering.
summary(fish_behaviour)
## Fish_ID Average_activity Hiding_time Social_interactions
## Min. : 1.00 Min. : 1.355 Min. :-0.4965 Min. :-0.03569
## 1st Qu.: 38.25 1st Qu.: 3.993 1st Qu.: 1.5204 1st Qu.: 1.73644
## Median : 75.50 Median : 5.708 Median : 2.1575 Median : 3.73136
## Mean : 75.50 Mean : 5.619 Mean : 3.3049 Mean : 4.24929
## 3rd Qu.:112.75 3rd Qu.: 7.353 3rd Qu.: 4.3177 3rd Qu.: 5.94535
## Max. :150.00 Max. :10.287 Max. : 9.5205 Max. :11.22353
## Observed_attacks Exploration Stimuli_reaction
## Min. :-0.509 Min. :0.07505 Min. : 2.698
## 1st Qu.: 1.075 1st Qu.:2.57534 1st Qu.: 4.522
## Median : 2.045 Median :4.20172 Median : 5.934
## Mean : 3.351 Mean :4.50436 Mean : 6.191
## 3rd Qu.: 4.616 3rd Qu.:6.63353 3rd Qu.: 7.563
## Max. :11.702 Max. :9.80938 Max. :12.229
# Fish_ID identifies an individual and is not a measurement: store it as a
# factor so it is no longer treated as a numeric variable.
fish_behaviour <- transform(
  fish_behaviour,
  Fish_ID = as.factor(Fish_ID)
)
Standardize data
# Keep only the six behavioural variables (columns 2 to 7); column 1 is Fish_ID.
fish_num <- fish_behaviour[, 2:7]
# Standardize every variable: centre to mean 0 and rescale to standard
# deviation 1, so that all variables weigh equally in the distance computation.
fish_std <- scale(fish_num, center = TRUE, scale = TRUE)
Optimal number of clusters determination: WSS or another method
# Elbow method: plot the total within-cluster sum of squares (WSS) against the
# number of clusters and look for the bend in the curve.
# k-means starts from randomly chosen centres, so the seed is fixed to make
# the curve reproducible, and nstart = 25 (forwarded to kmeans()) matches the
# number of random restarts used for the final k-means fit.
set.seed(123)
fviz_nbclust(fish_std,
             kmeans,
             method = "wss",
             nstart = 25) +
  labs(subtitle = "Méthode du coude")
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Silhouette method: average silhouette width for each candidate number of
# clusters. The seed is fixed and nstart = 25 is forwarded to kmeans() so the
# result is reproducible and consistent with the final k-means fit.
set.seed(123)
fviz_nbclust(fish_std,
             kmeans,
             method = "silhouette",
             nstart = 25) +
  labs(subtitle = "Méthode du coefficient de silhouette")

# Gap statistic: compares the clustering of the data with that of randomly
# generated reference data sets, so it depends on the random number generator
# twice (reference sets and k-means starts). Seed it separately so this plot
# can be re-run on its own.
set.seed(123)
fviz_nbclust(fish_std,
             kmeans,
             method = "gap_stat",
             nboot = 50,  # number of reference data sets simulated
             nstart = 25) +
  labs(subtitle = "Statistique du Gap")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Choose which clustering method to apply
# Final partition: k-means with k = 5 clusters on the standardized data.
# k-means begins by placing the centres at random, so the seed is fixed to
# make the result reproducible.
set.seed(123)
# nstart = 25 runs the algorithm from 25 different random sets of initial
# centres and keeps the best of those solutions.
km.fish <- kmeans(x = fish_std, centers = 5, nstart = 25)
# Store each individual's cluster label next to its raw measurements.
fish_behaviour[["cluster"]] <- factor(km.fish$cluster)
Clustering on scaled/standardized data giving a specific number of clusters
# Show the clusters on a two-dimensional map (Dim1 and Dim2): the six
# standardized variables are projected with a PCA, which fviz_cluster() uses
# by default.
fviz_cluster(
  object = km.fish,
  data = fish_std,
  geom = "point",
  ellipse.type = "convex",
  star.plot = TRUE,
  palette = "jco",
  ggtheme = theme_minimal(),
  main = "Partitionnement des personnalités (Plan ACP),
Clustering K-means des Piranhas (k=5)"
)
Compare cluster results with different numbers of clusters
# Behavioural profile of each cluster: mean of every numeric variable,
# computed separately for each cluster label.
analyse_profils <- summarise(
  group_by(fish_behaviour, cluster),
  across(where(is.numeric), mean),
  .groups = "drop"
)
# Display the profile table with every column after `cluster` shaded from
# white to orange, so high and low means stand out.
formattable(
  analyse_profils,
  list(area(col = 2:ncol(analyse_profils)) ~ color_tile("white", "orange"))
)
| cluster | Average_activity | Hiding_time | Social_interactions | Observed_attacks | Exploration | Stimuli_reaction |
|---|---|---|---|---|---|---|
| 1 | 8.068587 | 0.9390457 | 2.190769 | 8.9811603 | 2.737690 | 5.983181 |
| 2 | 6.858792 | 1.9964016 | 4.988406 | 2.1027304 | 7.979879 | 6.889457 |
| 3 | 6.087402 | 1.9233174 | 9.159670 | 0.9031698 | 5.987675 | 4.626971 |
| 4 | 3.910136 | 3.8211131 | 3.893799 | 3.7799962 | 4.026532 | 4.339162 |
| 5 | 3.169023 | 7.8447460 | 1.013823 | 0.9877609 | 1.790026 | 9.114561 |
# Cluster 1: aggressive; cluster 2: very mobile and exploratory (curious); cluster 3: social; cluster 4: neutral; cluster 5: scared/shy