Import Libraries

library(dplyr)
library(GGally)     # Correlation
library(FactoMineR) # PCA
library(factoextra) # Elbow Method
library(tidyr)

Data Input

We want to cluster penguin species based on their features.

Data Source: https://www.kaggle.com/datasets/youssefaboelwafa/clustering-penguins-species/

penguin <- read.csv("penguins.csv")
glimpse(penguin)
#> Rows: 344
#> Columns: 5
#> $ culmen_length_mm  <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
#> $ culmen_depth_mm   <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
#> $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 5000, 18…
#> $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
#> $ sex               <chr> "MALE", "FEMALE", "FEMALE", NA, "FEMALE", "MALE", "F…

Data Wrangling & EDA

Check unique values in sex column.

unique(penguin %>% filter(complete.cases(.)) %>% select(sex))
#>        sex
#> 1     MALE
#> 2   FEMALE
#> 329      .

There is data where sex = “.” and we will turn these data to NA values.

# Change "." to NA
penguin <- penguin %>% mutate(sex = na_if(sex, "."))

Clean data

  1. Convert sex column to numerical
  2. Filter out data with missing values (NA)
penguin_clean <- penguin %>%
  filter(complete.cases(.)) %>%
  mutate(
    is_female = if_else(sex == "FEMALE", 1, 0),
    is_male = if_else(sex == "MALE", 1, 0)
  ) %>%
  select(-sex) 

penguin_clean %>% glimpse
#> Rows: 334
#> Columns: 6
#> $ culmen_length_mm  <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 42.0, 41.1…
#> $ culmen_depth_mm   <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 20.2, 17.6…
#> $ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 5000, 182, 191, -…
#> $ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 4250, 3200…
#> $ is_female         <dbl> 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0…
#> $ is_male           <dbl> 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1…

Check for outliers

View boxplot for all features

penguin_clean %>% boxplot()

Seems like there are outliers for the flipper length.

penguin_clean %>% select(flipper_length_mm) %>% max()
#> [1] 5000
penguin_clean %>% select(flipper_length_mm) %>% min()
#> [1] -132

Penguins that have 5 meters or negative flippers length seems unlikely, so we will remove these data.

penguin_clean <- penguin_clean %>% filter(flipper_length_mm < 1000 & flipper_length_mm > 0)

Check correlation

ggcorr(penguin_clean %>% select(is.numeric), 
       low = "blue", high = "red", label = T, hjust = 0.8)

Scaling

All our data are already in the numeric data type. However, they are not in the same scale so we must do scaling.

penguin_scale <- penguin_clean %>% scale
summary(penguin_scale)
#>  culmen_length_mm  culmen_depth_mm    flipper_length_mm  body_mass_g     
#>  Min.   :-2.1864   Min.   :-2.06757   Min.   :-2.0644   Min.   :-1.8682  
#>  1st Qu.:-0.8292   1st Qu.:-0.79224   1st Qu.:-0.7820   1st Qu.:-0.8141  
#>  Median : 0.1245   Median : 0.07498   Median :-0.2833   Median :-0.2251  
#>  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
#>  3rd Qu.: 0.8444   3rd Qu.: 0.78917   3rd Qu.: 0.8567   3rd Qu.: 0.7128  
#>  Max.   : 2.8572   Max.   : 2.21754   Max.   : 2.1391   Max.   : 2.5963  
#>    is_female          is_male       
#>  Min.   :-0.9925   Min.   :-1.0045  
#>  1st Qu.:-0.9925   1st Qu.:-1.0045  
#>  Median :-0.9925   Median : 0.9925  
#>  Mean   : 0.0000   Mean   : 0.0000  
#>  3rd Qu.: 1.0045   3rd Qu.: 0.9925  
#>  Max.   : 1.0045   Max.   : 0.9925

K-means Clustering

Optimum K: Elbow method

We will try to use elbow method to determine the optimum k for our data.

fviz_nbclust(x = penguin_scale, FUNcluster = kmeans, method = "wss")

Based on the graph, we will choose k = 5 since there is no significant decline in the graph for larger k numbers after 5.

Create cluster

RNGkind(sample.kind = "Rounding")
set.seed(123)

penguin_kmeans <- kmeans(x = penguin_scale, centers = 5)
penguin_kmeans
#> K-means clustering with 5 clusters of sizes 107, 29, 106, 61, 29
#> 
#> Cluster means:
#>   culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g  is_female
#> 1      -0.69754311       0.2337441        -0.8499002  -0.9763825  1.0045261
#> 2       0.01637068      -1.6541896         0.7706770   0.4024977  1.0045261
#> 3      -0.02618678       0.9989967        -0.4425595  -0.2478692 -0.9924958
#> 4       1.00004110      -0.7320294         1.4651697   1.5853442 -0.9924958
#> 5       0.54950534      -1.3199649         0.9008843   0.7713321  1.0045261
#>      is_male
#> 1 -1.0045261
#> 2 -1.0045261
#> 3  0.9924958
#> 4  0.9924958
#> 5 -1.0045261
#> 
#> Clustering vector:
#>   [1] 3 1 1 1 3 1 3 1 3 1 1 3 1 3 1 3 1 3 3 1 3 1 1 3 1 3 1 3 1 3 3 1 1 3 1 3 1
#>  [38] 3 1 3 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1
#>  [75] 3 1 3 1 3 3 1 3 1 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3
#> [112] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 3 1 1 3 1 3 1 3 3
#> [149] 1 3 1 1 3 1 3 1 3 1 3 1 3 3 1 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 3 1 1 3 1 3
#> [186] 3 1 3 1 1 3 1 3 3 1 1 3 1 3 1 3 1 3 3 1 3 1 1 3 1 3 3 1 2 4 5 4 4 2 5 4 2
#> [223] 4 2 4 2 4 2 4 2 4 5 4 4 5 5 4 2 4 4 5 4 4 2 5 4 5 4 2 4 2 4 2 4 2 4 4 2 2
#> [260] 4 2 4 5 4 2 4 5 4 2 4 2 4 2 4 2 4 5 4 4 2 4 5 4 4 5 5 4 5 4 2 4 5 4 5 4 2
#> [297] 4 2 4 5 4 5 4 5 4 5 4 4 5 5 4 5 4 5 4 4 2 4 2 4 5 4 2 4 2 4 4 5 5 4 5 4
#> 
#> Within cluster sum of squares by cluster:
#> [1] 142.197850   8.608391 174.003894  42.364600   7.175067
#>  (between_SS / total_SS =  81.2 %)
#> 
#> Available components:
#> 
#> [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
#> [6] "betweenss"    "size"         "iter"         "ifault"

Join cluster with dataset.

penguin_clean_new <- penguin_clean %>% mutate(cluster = as.factor(penguin_kmeans$cluster)) 
head(penguin_clean_new)
#>   culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g is_female
#> 1             39.1            18.7               181        3750         0
#> 2             39.5            17.4               186        3800         1
#> 3             40.3            18.0               195        3250         1
#> 4             36.7            19.3               193        3450         1
#> 5             39.3            20.6               190        3650         0
#> 6             38.9            17.8               181        3625         1
#>   is_male cluster
#> 1       1       3
#> 2       0       1
#> 3       0       1
#> 4       0       1
#> 5       1       3
#> 6       0       1

Cluster Analysis

# Profiling by summarising data
penguin_clean_new %>% 
  group_by(cluster) %>% 
  summarise_all(mean) %>% 
  
  pivot_longer(-cluster) %>% 
  group_by(name) %>% 
  summarize(
    min_cluster = which.min(value),
    max_cluster = which.max(value))
#> # A tibble: 6 × 3
#>   name              min_cluster max_cluster
#>   <chr>                   <int>       <int>
#> 1 body_mass_g                 1           4
#> 2 culmen_depth_mm             2           3
#> 3 culmen_length_mm            1           4
#> 4 flipper_length_mm           1           4
#> 5 is_female                   3           1
#> 6 is_male                     1           3
  • Cluster 1 has the least body mass, culmen length, flipper length. It has the most female and the least male.
  • Cluster 2 has the least culmen depth.
  • Cluster 3 has the most culmen depth, least female and most male.
  • Cluster 4 has the most body mass, culmen length, and flipper length.
  • Cluster 5 has no features that is less or more compared to all other clusters.

PCA

We will make PCA from the data set.

penguin_pca <- PCA(penguin_scale, scale.unit = F, graph = F)
penguin_clean_new %>% glimpse
#> Rows: 332
#> Columns: 7
#> $ culmen_length_mm  <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
#> $ culmen_depth_mm   <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
#> $ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 185, 19…
#> $ body_mass_g       <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
#> $ is_female         <dbl> 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0…
#> $ is_male           <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1…
#> $ cluster           <fct> 3, 1, 1, 1, 3, 1, 3, 1, 3, 1, 1, 3, 1, 3, 1, 3, 1, 3…

Dimensionality Reduction

Let’s visualize the percentage of variance captured by each dimensions and observe the eigenvalues

# Visualizing % variance captured by each dimension
fviz_eig(penguin_pca, addlabels = T, main = "Variance explained by each dimensions")

# Eigenvalue summary
penguin_pca$eig
#>                                     eigenvalue
#> comp 1 3.1126399491037544109417467552702873945
#> comp 2 2.0615584365038532865810338989831507206
#> comp 3 0.5135825088388661363580922625260427594
#> comp 4 0.1931635997386259573360689500987064093
#> comp 5 0.1009832166582729923165473451263096649
#> comp 6 0.0000000000000000000000000000001783971
#>                         percentage of variance
#> comp 1 52.034061586225909934455557959154248238
#> comp 2 34.463111828765327970813814317807555199
#> comp 3  8.585568627114984252557405852712690830
#> comp 4  3.229119592810867356291737451101653278
#> comp 5  1.688138365082912262238323819474317133
#> comp 6  0.000000000000000000000000000002982267
#>        cumulative percentage of variance
#> comp 1                          52.03406
#> comp 2                          86.49717
#> comp 3                          95.08274
#> comp 4                          98.31186
#> comp 5                         100.00000
#> comp 6                         100.00000

From both the eigenvalue summary and the graph above, we can see that 86% of the variances can be explained by only using the first 2 dimensions. So we can reduce the number of features on our data set from 6 to 2 features.

We will extract the values from PC1 to PC2 and put it in a dataframe, which later on can be used for supervised learning.

penguin_pca_keep <- data.frame(penguin_pca$ind$coord[, 1:2]) %>% 
  bind_cols(cluster = as.factor(penguin_kmeans$cluster)) 

penguin_pca_keep %>% head()
#>        Dim.1      Dim.2 cluster
#> 1 -0.7169639  2.0939647       3
#> 2 -1.9241582 -0.4075447       1
#> 3 -1.9476043 -0.3044176       1
#> 4 -2.2797068  0.1628807       1
#> 5 -0.6076547  2.5090384       3
#> 6 -2.2847400 -0.1210956       1

K-means & PCA Combined

PCA can be integrated with K-means Clustering result to help visualize our data in a fewer dimensions than the original features.

fviz_cluster(object = penguin_kmeans, data = penguin_clean_new %>% select(-cluster))

# Create PCA model with cluster
new_pca <- PCA(X = penguin_clean_new, 
               scale.unit = T, 
               quali.sup = "cluster", 
               graph = F)

fviz_pca_biplot(X = new_pca,
                habillage = "cluster",
                geom.ind = "point",
                addEllipses = T ,
                col.var = "navy"
                )