library(dplyr)
library(GGally) # Correlation
library(FactoMineR) # PCA
library(factoextra) # Elbow Method
library(tidyr)
We want to cluster penguin species based on their features.
Data Source: https://www.kaggle.com/datasets/youssefaboelwafa/clustering-penguins-species/
penguin <- read.csv("penguins.csv")
glimpse(penguin)
#> Rows: 344
#> Columns: 5
#> $ culmen_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
#> $ culmen_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
#> $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 5000, 18…
#> $ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
#> $ sex <chr> "MALE", "FEMALE", "FEMALE", NA, "FEMALE", "MALE", "F…
Check unique values in sex column.
unique(penguin %>% filter(complete.cases(.)) %>% select(sex))
#> sex
#> 1 MALE
#> 2 FEMALE
#> 329 .
There is data where sex = “.” and we will turn these data to NA values.
# Change "." to NA
penguin <- penguin %>% mutate(sex = na_if(sex, "."))
sex column to numericalpenguin_clean <- penguin %>%
filter(complete.cases(.)) %>%
mutate(
is_female = if_else(sex == "FEMALE", 1, 0),
is_male = if_else(sex == "MALE", 1, 0)
) %>%
select(-sex)
penguin_clean %>% glimpse
#> Rows: 334
#> Columns: 6
#> $ culmen_length_mm <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 42.0, 41.1…
#> $ culmen_depth_mm <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 20.2, 17.6…
#> $ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 5000, 182, 191, -…
#> $ body_mass_g <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 4250, 3200…
#> $ is_female <dbl> 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0…
#> $ is_male <dbl> 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1…
View boxplot for all features
penguin_clean %>% boxplot()
Seems like there are outliers for the flipper length.
penguin_clean %>% select(flipper_length_mm) %>% max()
#> [1] 5000
penguin_clean %>% select(flipper_length_mm) %>% min()
#> [1] -132
Penguins that have 5 meters or negative flippers length seems unlikely, so we will remove these data.
penguin_clean <- penguin_clean %>% filter(flipper_length_mm < 1000 & flipper_length_mm > 0)
ggcorr(penguin_clean %>% select(is.numeric),
low = "blue", high = "red", label = T, hjust = 0.8)
All our data are already in the numeric data type. However, they are not in the same scale so we must do scaling.
penguin_scale <- penguin_clean %>% scale
summary(penguin_scale)
#> culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
#> Min. :-2.1864 Min. :-2.06757 Min. :-2.0644 Min. :-1.8682
#> 1st Qu.:-0.8292 1st Qu.:-0.79224 1st Qu.:-0.7820 1st Qu.:-0.8141
#> Median : 0.1245 Median : 0.07498 Median :-0.2833 Median :-0.2251
#> Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
#> 3rd Qu.: 0.8444 3rd Qu.: 0.78917 3rd Qu.: 0.8567 3rd Qu.: 0.7128
#> Max. : 2.8572 Max. : 2.21754 Max. : 2.1391 Max. : 2.5963
#> is_female is_male
#> Min. :-0.9925 Min. :-1.0045
#> 1st Qu.:-0.9925 1st Qu.:-1.0045
#> Median :-0.9925 Median : 0.9925
#> Mean : 0.0000 Mean : 0.0000
#> 3rd Qu.: 1.0045 3rd Qu.: 0.9925
#> Max. : 1.0045 Max. : 0.9925
We will try to use elbow method to determine the optimum k for our data.
fviz_nbclust(x = penguin_scale, FUNcluster = kmeans, method = "wss")
Based on the graph, we will choose k = 5 since there is no significant decline in the graph for larger k numbers after 5.
RNGkind(sample.kind = "Rounding")
set.seed(123)
penguin_kmeans <- kmeans(x = penguin_scale, centers = 5)
penguin_kmeans
#> K-means clustering with 5 clusters of sizes 107, 29, 106, 61, 29
#>
#> Cluster means:
#> culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g is_female
#> 1 -0.69754311 0.2337441 -0.8499002 -0.9763825 1.0045261
#> 2 0.01637068 -1.6541896 0.7706770 0.4024977 1.0045261
#> 3 -0.02618678 0.9989967 -0.4425595 -0.2478692 -0.9924958
#> 4 1.00004110 -0.7320294 1.4651697 1.5853442 -0.9924958
#> 5 0.54950534 -1.3199649 0.9008843 0.7713321 1.0045261
#> is_male
#> 1 -1.0045261
#> 2 -1.0045261
#> 3 0.9924958
#> 4 0.9924958
#> 5 -1.0045261
#>
#> Clustering vector:
#> [1] 3 1 1 1 3 1 3 1 3 1 1 3 1 3 1 3 1 3 3 1 3 1 1 3 1 3 1 3 1 3 3 1 1 3 1 3 1
#> [38] 3 1 3 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1
#> [75] 3 1 3 1 3 3 1 3 1 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3
#> [112] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 3 1 1 3 1 3 1 3 3
#> [149] 1 3 1 1 3 1 3 1 3 1 3 1 3 3 1 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 3 1 1 3 1 3
#> [186] 3 1 3 1 1 3 1 3 3 1 1 3 1 3 1 3 1 3 3 1 3 1 1 3 1 3 3 1 2 4 5 4 4 2 5 4 2
#> [223] 4 2 4 2 4 2 4 2 4 5 4 4 5 5 4 2 4 4 5 4 4 2 5 4 5 4 2 4 2 4 2 4 2 4 4 2 2
#> [260] 4 2 4 5 4 2 4 5 4 2 4 2 4 2 4 2 4 5 4 4 2 4 5 4 4 5 5 4 5 4 2 4 5 4 5 4 2
#> [297] 4 2 4 5 4 5 4 5 4 5 4 4 5 5 4 5 4 5 4 4 2 4 2 4 5 4 2 4 2 4 4 5 5 4 5 4
#>
#> Within cluster sum of squares by cluster:
#> [1] 142.197850 8.608391 174.003894 42.364600 7.175067
#> (between_SS / total_SS = 81.2 %)
#>
#> Available components:
#>
#> [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
#> [6] "betweenss" "size" "iter" "ifault"
Join cluster with dataset.
penguin_clean_new <- penguin_clean %>% mutate(cluster = as.factor(penguin_kmeans$cluster))
head(penguin_clean_new)
#> culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g is_female
#> 1 39.1 18.7 181 3750 0
#> 2 39.5 17.4 186 3800 1
#> 3 40.3 18.0 195 3250 1
#> 4 36.7 19.3 193 3450 1
#> 5 39.3 20.6 190 3650 0
#> 6 38.9 17.8 181 3625 1
#> is_male cluster
#> 1 1 3
#> 2 0 1
#> 3 0 1
#> 4 0 1
#> 5 1 3
#> 6 0 1
# Profiling by summarising data
penguin_clean_new %>%
group_by(cluster) %>%
summarise_all(mean) %>%
pivot_longer(-cluster) %>%
group_by(name) %>%
summarize(
min_cluster = which.min(value),
max_cluster = which.max(value))
#> # A tibble: 6 × 3
#> name min_cluster max_cluster
#> <chr> <int> <int>
#> 1 body_mass_g 1 4
#> 2 culmen_depth_mm 2 3
#> 3 culmen_length_mm 1 4
#> 4 flipper_length_mm 1 4
#> 5 is_female 3 1
#> 6 is_male 1 3
We will make PCA from the data set.
penguin_pca <- PCA(penguin_scale, scale.unit = F, graph = F)
penguin_clean_new %>% glimpse
#> Rows: 332
#> Columns: 7
#> $ culmen_length_mm <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
#> $ culmen_depth_mm <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
#> $ flipper_length_mm <int> 181, 186, 195, 193, 190, 181, 195, 182, 191, 185, 19…
#> $ body_mass_g <int> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
#> $ is_female <dbl> 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0…
#> $ is_male <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1…
#> $ cluster <fct> 3, 1, 1, 1, 3, 1, 3, 1, 3, 1, 1, 3, 1, 3, 1, 3, 1, 3…
Let’s visualize the percentage of variance captured by each dimensions and observe the eigenvalues
# Visualizing % variance captured by each dimension
fviz_eig(penguin_pca, addlabels = T, main = "Variance explained by each dimensions")
# Eigenvalue summary
penguin_pca$eig
#> eigenvalue
#> comp 1 3.1126399491037544109417467552702873945
#> comp 2 2.0615584365038532865810338989831507206
#> comp 3 0.5135825088388661363580922625260427594
#> comp 4 0.1931635997386259573360689500987064093
#> comp 5 0.1009832166582729923165473451263096649
#> comp 6 0.0000000000000000000000000000001783971
#> percentage of variance
#> comp 1 52.034061586225909934455557959154248238
#> comp 2 34.463111828765327970813814317807555199
#> comp 3 8.585568627114984252557405852712690830
#> comp 4 3.229119592810867356291737451101653278
#> comp 5 1.688138365082912262238323819474317133
#> comp 6 0.000000000000000000000000000002982267
#> cumulative percentage of variance
#> comp 1 52.03406
#> comp 2 86.49717
#> comp 3 95.08274
#> comp 4 98.31186
#> comp 5 100.00000
#> comp 6 100.00000
From both the eigenvalue summary and the graph above, we can see that 86% of the variances can be explained by only using the first 2 dimensions. So we can reduce the number of features on our data set from 6 to 2 features.
We will extract the values from PC1 to PC2 and put it in a dataframe, which later on can be used for supervised learning.
penguin_pca_keep <- data.frame(penguin_pca$ind$coord[, 1:2]) %>%
bind_cols(cluster = as.factor(penguin_kmeans$cluster))
penguin_pca_keep %>% head()
#> Dim.1 Dim.2 cluster
#> 1 -0.7169639 2.0939647 3
#> 2 -1.9241582 -0.4075447 1
#> 3 -1.9476043 -0.3044176 1
#> 4 -2.2797068 0.1628807 1
#> 5 -0.6076547 2.5090384 3
#> 6 -2.2847400 -0.1210956 1
PCA can be integrated with K-means Clustering result to help visualize our data in a fewer dimensions than the original features.
fviz_cluster(object = penguin_kmeans, data = penguin_clean_new %>% select(-cluster))
# Create PCA model with cluster
new_pca <- PCA(X = penguin_clean_new,
scale.unit = T,
quali.sup = "cluster",
graph = F)
fviz_pca_biplot(X = new_pca,
habillage = "cluster",
geom.ind = "point",
addEllipses = T ,
col.var = "navy"
)