library(ggplot2)
## Warning: le package 'ggplot2' a été compilé avec la version R 4.4.3
library(cluster)
library(tidyverse)    # Pour la manipulation des données 
## Warning: le package 'tidyverse' a été compilé avec la version R 4.4.3
## Warning: le package 'readr' a été compilé avec la version R 4.4.3
## Warning: le package 'dplyr' a été compilé avec la version R 4.4.3
## Warning: le package 'stringr' a été compilé avec la version R 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.6.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(formattable)  # Pour la visualisation du tableau coloré
## Warning: le package 'formattable' a été compilé avec la version R 4.4.3
library(factoextra)   # Pour la fonction fviz_cluster
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(formattable)

Context: Piranhas (Serrasalmus sp.) are known to exhibit a variety of behaviours. Although often associated with aggression, they can also show fear or flight responses. Behavioural traits were measured in 150 individuals to identify personality traits/syndromes.

Aim:

  1. Clustering on individuals to separate them into groups to identify personality traits

  2. Determine how many personality traits can be identified

# Load the behavioural measurements (semicolon-delimited file with "," as the
# decimal mark, which is the format read_csv2() expects)
fish_behaviour <- readr::read_csv2(file = "fish_behaviour.csv")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 150 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (7): Fish_ID, Average_activity, Hiding_time, Social_interactions, Observ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a data viewer and is only meaningful in an interactive
# session; skip it when the script is sourced or knitted non-interactively.
if (interactive()) {
  View(fish_behaviour)
}

# Quartiles, mean and range of every column
summary(fish_behaviour)
##     Fish_ID       Average_activity  Hiding_time      Social_interactions
##  Min.   :  1.00   Min.   : 1.355   Min.   :-0.4965   Min.   :-0.03569   
##  1st Qu.: 38.25   1st Qu.: 3.993   1st Qu.: 1.5204   1st Qu.: 1.73644   
##  Median : 75.50   Median : 5.708   Median : 2.1575   Median : 3.73136   
##  Mean   : 75.50   Mean   : 5.619   Mean   : 3.3049   Mean   : 4.24929   
##  3rd Qu.:112.75   3rd Qu.: 7.353   3rd Qu.: 4.3177   3rd Qu.: 5.94535   
##  Max.   :150.00   Max.   :10.287   Max.   : 9.5205   Max.   :11.22353   
##  Observed_attacks  Exploration      Stimuli_reaction
##  Min.   :-0.509   Min.   :0.07505   Min.   : 2.698  
##  1st Qu.: 1.075   1st Qu.:2.57534   1st Qu.: 4.522  
##  Median : 2.045   Median :4.20172   Median : 5.934  
##  Mean   : 3.351   Mean   :4.50436   Mean   : 6.191  
##  3rd Qu.: 4.616   3rd Qu.:6.63353   3rd Qu.: 7.563  
##  Max.   :11.702   Max.   :9.80938   Max.   :12.229
# Store the fish identifier as a factor so it is handled as a label rather
# than as a numeric measurement
fish_behaviour <- transform(fish_behaviour, Fish_ID = factor(Fish_ID))

Standardize data

# Keep the six behavioural measurements (columns 2 to 7; column 1 is Fish_ID)
fish_num <- fish_behaviour[2:7]

# Centre each variable on a mean of 0 and rescale it to a standard deviation
# of 1 (both are the scale() defaults, spelled out here)
fish_std <- scale(fish_num, center = TRUE, scale = TRUE)

Optimal number of clusters determination: WSS or another method

# Elbow method: total within-cluster sum of squares (WSS) for each candidate k.
# kmeans() starts from randomly chosen centres, so the seed is fixed to make
# the curve reproducible from one run to the next.
set.seed(123)
fviz_nbclust(fish_std,
             kmeans,
             method = "wss") +
  labs(subtitle = "Méthode du coude")
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Silhouette method: average silhouette width for each candidate k.
# kmeans() starts from randomly chosen centres, so the seed is fixed to make
# the suggested number of clusters reproducible.
set.seed(123)
fviz_nbclust(fish_std,
             kmeans,
             method = "silhouette") +
  labs(subtitle = "Méthode du coefficient de silhouette")

# Gap statistic: compares the observed within-cluster dispersion with that of
# simulated reference data sets.
# Both the simulated reference sets and the kmeans() starts are random, so the
# seed is fixed to make the suggested number of clusters reproducible.
set.seed(123)
fviz_nbclust(fish_std,
             kmeans,
             method = "gap_stat",
             nboot = 50) + # number of simulated reference data sets
  labs(subtitle = "Statistique du Gap")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Choose which clustering method to apply

# K-means clustering of the standardized data with k = 5.
# K-means begins by placing centres at random, so the seed is fixed to make
# the partition reproducible.
set.seed(123)
# nstart = 25 runs the algorithm from 25 different random sets of initial
# centres and keeps the best solution found.
km.fish <- kmeans(x = fish_std, centers = 5, nstart = 25)

# Attach each fish's cluster label to the original data as a factor
fish_behaviour[["cluster"]] <- as.factor(km.fish[["cluster"]])

Clustering on scaled/standardized data giving a specific number of clusters

# Plot the clusters on a PCA plane (Dim1, Dim2) so that the 6 behavioural
# variables can be viewed in 2 dimensions.
# The title uses an explicit "\n": breaking the string literal across two
# source lines had embedded the code indentation in the rendered title.
fviz_cluster(km.fish,
             data = fish_std, # projected onto a PCA plane by default
             palette = "jco",
             geom = "point",
             star.plot = TRUE,
             ellipse.type = "convex",
             ggtheme = theme_minimal(),
             main = paste0("Partitionnement des personnalités (Plan ACP),\n",
                           "Clustering K-means des Piranhas (k=5)"))

Compare cluster results with different numbers of clusters

# Mean of every numeric behavioural variable within each cluster
analyse_profils <- fish_behaviour |>
  group_by(cluster) |>
  summarise(across(where(is.numeric), mean))

# Colour-coded profile table: every column except the cluster label
# (column 1) is shaded from white to orange
formattable(
  analyse_profils,
  list(area(col = 2:ncol(analyse_profils)) ~ color_tile("white", "orange"))
)
cluster Average_activity Hiding_time Social_interactions Observed_attacks Exploration Stimuli_reaction
1 8.068587 0.9390457 2.190769 8.9811603 2.737690 5.983181
2 6.858792 1.9964016 4.988406 2.1027304 7.979879 6.889457
3 6.087402 1.9233174 9.159670 0.9031698 5.987675 4.626971
4 3.910136 3.8211131 3.893799 3.7799962 4.026532 4.339162
5 3.169023 7.8447460 1.013823 0.9877609 1.790026 9.114561
# Cluster 1: aggressive; 2: very mobile and exploratory (curious); 3: social; 4: neutral; 5: scared/shy