Hierarchical Clustering

#Praktikum 2 - Hierarchical Clustering

#===Packages
library("tidyverse")

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library("factoextra")

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library("DataExplorer")
library("easystats")

## # Attaching packages: easystats 0.7.5 (red = needs update)
## ✔ bayestestR  0.17.0   ✔ correlation 0.8.8 
## ✔ datawizard  1.2.0    ✔ effectsize  1.0.1 
## ✔ insight     1.4.2    ✔ modelbased  0.13.0
## ✖ performance 0.15.1   ✖ parameters  0.28.1
## ✔ report      0.6.1    ✖ see         0.11.0
## 
## Restart the R-Session and update packages with `easystats::easystats_update()`.

library("umap")
library("ggpubr")

## 
## Attaching package: 'ggpubr'
## 
## The following objects are masked from 'package:datawizard':
## 
##     mean_sd, median_mad

#===IMport Data
df <- read.csv("D:/Kuliah/IPB 2025 Semester 3/Pemodelan Klasifikasi/Praktikum/Praktikum 2/women_track_records.csv", stringsAsFactors = T)
glimpse(df)

## Rows: 55
## Columns: 8
## $ lOOm     <dbl> 11.61, 11.20, 11.43, 11.41, 11.46, 11.31, 12.14, 11.00, 12.00…
## $ X200m    <dbl> 22.94, 22.35, 23.09, 23.04, 23.05, 23.17, 24.47, 22.25, 24.52…
## $ X400m    <dbl> 54.50, 51.08, 50.62, 52.00, 53.30, 52.80, 55.00, 50.06, 54.90…
## $ X800m    <dbl> 2.15, 1.98, 1.99, 2.00, 2.16, 2.10, 2.18, 2.00, 2.05, 2.08, 2…
## $ X1500m   <dbl> 4.43, 4.13, 4.22, 4.14, 4.58, 4.49, 4.45, 4.06, 4.23, 4.33, 4…
## $ X3000m   <dbl> 9.79, 9.08, 9.34, 8.88, 9.81, 9.77, 9.51, 8.81, 9.37, 9.31, 9…
## $ Marathon <dbl> 178.52, 152.37, 159.37, 157.85, 169.98, 168.75, 191.02, 149.4…
## $ country  <fct> argentina, australia, austria, belgium, bermuda, brazil, burm…

#===Eksplorasi Data
#1. PLot
plot_intro(data = df,ggtheme = theme_minimal())

#2. HIstogram
plot_histogram(data = df,
               ncol = 2,nrow = 2,
               geom_histogram_args = list(fill="steelblue",col="black"),
               ggtheme = theme_minimal())

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

#3. Korelasi
correlation(data = df,method = "spearman",
            include_factors = FALSE) %>% 
  as.matrix() %>% 
  plot(text=list(size=2))+theme(text = element_text(size = 6),
                                axis.text.x = element_text(angle = 45,
                                                           hjust = 1))

#4. Standarisasi Peubah 
df_std <- standardize(df)
plot_histogram(data = df_std,
               ncol = 2,nrow = 2,
               geom_histogram_args = list(fill="steelblue",col="black"),
               ggtheme = theme_minimal())

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

#=== Menentukan Banykanya Cluster Terbaik
#1. Visualisasi PCA
pca0 <- prcomp(x = df_std %>% select(-country))
#menambahkan nama objek (baris)
rownames(pca0$x) <- df_std$country
fviz_pca_ind(pca0,
             geom.ind = c("point","text"),
             repel = TRUE,
             labelsize=3
)

pca0$rotation %>% 
  as.data.frame()

##                PC1        PC2         PC3         PC4         PC5          PC6
## lOOm     0.3683561  0.4900597 -0.28601157  0.31938631  0.23116950  0.619825234
## X200m    0.3653642  0.5365800 -0.22981913 -0.08330196  0.04145457 -0.710764580
## X400m    0.3816103  0.2465377  0.51536655 -0.34737748 -0.57217791  0.190945970
## X800m    0.3845592 -0.1554023  0.58452608 -0.04207636  0.62032379 -0.019089032
## X1500m   0.3891040 -0.3604093  0.01291198  0.42953873  0.03026144 -0.231248381
## X3000m   0.3888661 -0.3475394 -0.15272772  0.36311995 -0.46335476  0.009277159
## Marathon 0.3670038 -0.3692076 -0.48437037 -0.67249685  0.13053590  0.142280558
##                  PC7
## lOOm      0.05217655
## X200m    -0.10922503
## X400m     0.20849691
## X800m    -0.31520972
## X1500m    0.69256151
## X3000m   -0.59835943
## Marathon  0.06959828

#2. UMAP
umap0 <- umap(d = df_std %>% select(-country))
data_umap <- data.frame(x=umap0$layout[,1],y=umap0$layout[,2]) 

ggscatter(data_umap, x = "x", y = "y",
          label = df_std$country,repel = TRUE,
          font.label = c(9, "plain", "grey50"))

#3. Dendogram
metode_agg <- c("single","complete",
                "average","ward.D","median","centroid",
                "mcquitty")
# memberikan label negara setiap amatan
df_std2 <- df_std %>% select(-country)
rownames(df_std2) <- df$country
map(metode_agg,function(i){
  
  
  hc_res0 <- hcut(x=df_std2,hc_method = i,hc_func = "hclust")
  fviz_dend(x = hc_res0,
            type = "rectangle",
            k_colors ="black",
            main = str_c("Dendrogram of HC with ",i," linkage"),
            # besar kecil label default 0.8
            cex = 0.5)
}
)

## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

#4. Matrik WSS (Within Sum of Squared)
set.seed(123)
map(metode_agg, function(i)
  
  fviz_nbclust(
    x = df_std2,
    FUNcluster = hcut,
    method = "wss",
    hc_method = i,
    hc_fun = "hclust",
    k.max = 25
  )+
    ggtitle(str_c("Optimal number of clusters based on HC with ",i," linkage"))
  
)

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

#5. Metric Koefisien Silhouette
set.seed(123)
sil_plot <- map(metode_agg, function(i)
  
  fviz_nbclust(
    x = df_std2,
    FUNcluster = hcut,
    method = "silhouette",
    hc_method = i,
    hc_fun = "hclust",
    k.max = 25
  )+
    ggtitle(str_c("Optimal number of clusters based on HC with ",i," linkage"))
  
)
sil_plot

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

av_sil_max <- map_dbl(sil_plot,function(plot){
  max(plot$data$y)
})

data.frame(metode=metode_agg,av_sil_max=av_sil_max)

##     metode av_sil_max
## 1   single  0.6353285
## 2 complete  0.6056807
## 3  average  0.6056807
## 4   ward.D  0.4281800
## 5   median  0.5272862
## 6 centroid  0.6353285
## 7 mcquitty  0.5272862

#===Penerapan Metode Terbaik
set.seed(123)
hc_res <- hcut(x=df_std2,
               k = 2,hc_method = "single",
               hc_func = "hclust")
table(hc_res$cluster)

## 
##  1  2 
## 54  1

#===Validasi Hasil Clustering
sil <- cluster::silhouette(x = hc_res$cluster,
                           dist = dist(x =df_std2,
                                       method = "euclidean"))
fviz_silhouette(sil,print.summary = FALSE)+
  scale_color_manual(values = get_palette("jco",k=7))+
  # get_palette berasal dari package ggpubr
  scale_fill_manual(values = get_palette("jco",k=7))+
  theme_minimal()+
  theme(legend.position = "top")

sil %>%  
  as.data.frame() %>% 
  mutate(obs=1:nrow(sil)) %>%  
  relocate(obs) %>%  
  filter(sil_width<0) %>%  
  arrange(sil_width)

##   obs cluster neighbor   sil_width
## 1  12       1        2 -0.29836541
## 2  36       1        2 -0.04906492

corrected_cluster <- sil %>%  
  as.data.frame() %>%  
  mutate(cluster=if_else(sil_width<0,neighbor,cluster))
corrected_cluster %>%  
  filter(sil_width<0) %>%  
  arrange(sil_width)

##   cluster neighbor   sil_width
## 1       2        2 -0.29836541
## 2       2        2 -0.04906492

#===Interpretasi hasil clustering
#1. Menggunakan center
df |>  
  select(-country) %>% 
  mutate(cluster=corrected_cluster$cluster) %>%  
  group_by(cluster) %>%  
  summarise(across(everything(),mean))

## # A tibble: 2 × 8
##   cluster  lOOm X200m X400m X800m X1500m X3000m Marathon
##     <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>    <dbl>
## 1       1  11.6  23.5  53.1  2.06   4.28   9.32     168.
## 2       2  12.5  26.0  59.1  2.3    5.15  11.7      267.

#2. Menggunakan visualisasi 
#2.1 UMAP
data_umap_labeled <- data_umap %>% 
  mutate(cluster=as.factor(corrected_cluster$cluster))

ggscatter(data_umap_labeled, x = "x",
          y = "y",
          color = "cluster",
          palette = "jco",
          label = df_std$country,repel = TRUE,
          title = "Cluster Plot")

#2.2 PCA
fviz_cluster(object = list(data=df_std2,
                           cluster=corrected_cluster$cluster),
             geom.ind = c("point","text"),
             repel = TRUE,
             show.clust.cent = FALSE,
             ellipse.alpha = 0.05,
             labelsize=9)+
  theme_minimal()

Hierarchical Clustering

Felasofa Rahmatanti - M0501241033

2025-10-09