#Praktikum 2 - Hierarchical Clustering
#===Packages
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("factoextra")
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("DataExplorer")
library("easystats")
## # Attaching packages: easystats 0.7.5 (red = needs update)
## ✔ bayestestR 0.17.0 ✔ correlation 0.8.8
## ✔ datawizard 1.2.0 ✔ effectsize 1.0.1
## ✔ insight 1.4.2 ✔ modelbased 0.13.0
## ✖ performance 0.15.1 ✖ parameters 0.28.1
## ✔ report 0.6.1 ✖ see 0.11.0
##
## Restart the R-Session and update packages with `easystats::easystats_update()`.
library("umap")
library("ggpubr")
##
## Attaching package: 'ggpubr'
##
## The following objects are masked from 'package:datawizard':
##
## mean_sd, median_mad
#===IMport Data
df <- read.csv("D:/Kuliah/IPB 2025 Semester 3/Pemodelan Klasifikasi/Praktikum/Praktikum 2/women_track_records.csv", stringsAsFactors = T)
glimpse(df)
## Rows: 55
## Columns: 8
## $ lOOm <dbl> 11.61, 11.20, 11.43, 11.41, 11.46, 11.31, 12.14, 11.00, 12.00…
## $ X200m <dbl> 22.94, 22.35, 23.09, 23.04, 23.05, 23.17, 24.47, 22.25, 24.52…
## $ X400m <dbl> 54.50, 51.08, 50.62, 52.00, 53.30, 52.80, 55.00, 50.06, 54.90…
## $ X800m <dbl> 2.15, 1.98, 1.99, 2.00, 2.16, 2.10, 2.18, 2.00, 2.05, 2.08, 2…
## $ X1500m <dbl> 4.43, 4.13, 4.22, 4.14, 4.58, 4.49, 4.45, 4.06, 4.23, 4.33, 4…
## $ X3000m <dbl> 9.79, 9.08, 9.34, 8.88, 9.81, 9.77, 9.51, 8.81, 9.37, 9.31, 9…
## $ Marathon <dbl> 178.52, 152.37, 159.37, 157.85, 169.98, 168.75, 191.02, 149.4…
## $ country <fct> argentina, australia, austria, belgium, bermuda, brazil, burm…
#===Eksplorasi Data
#1. PLot
plot_intro(data = df,ggtheme = theme_minimal())

#2. HIstogram
plot_histogram(data = df,
ncol = 2,nrow = 2,
geom_histogram_args = list(fill="steelblue",col="black"),
ggtheme = theme_minimal())
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

#3. Korelasi
correlation(data = df,method = "spearman",
include_factors = FALSE) %>%
as.matrix() %>%
plot(text=list(size=2))+theme(text = element_text(size = 6),
axis.text.x = element_text(angle = 45,
hjust = 1))

#4. Standarisasi Peubah
df_std <- standardize(df)
plot_histogram(data = df_std,
ncol = 2,nrow = 2,
geom_histogram_args = list(fill="steelblue",col="black"),
ggtheme = theme_minimal())
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

#=== Menentukan Banykanya Cluster Terbaik
#1. Visualisasi PCA
pca0 <- prcomp(x = df_std %>% select(-country))
#menambahkan nama objek (baris)
rownames(pca0$x) <- df_std$country
fviz_pca_ind(pca0,
geom.ind = c("point","text"),
repel = TRUE,
labelsize=3
)

pca0$rotation %>%
as.data.frame()
## PC1 PC2 PC3 PC4 PC5 PC6
## lOOm 0.3683561 0.4900597 -0.28601157 0.31938631 0.23116950 0.619825234
## X200m 0.3653642 0.5365800 -0.22981913 -0.08330196 0.04145457 -0.710764580
## X400m 0.3816103 0.2465377 0.51536655 -0.34737748 -0.57217791 0.190945970
## X800m 0.3845592 -0.1554023 0.58452608 -0.04207636 0.62032379 -0.019089032
## X1500m 0.3891040 -0.3604093 0.01291198 0.42953873 0.03026144 -0.231248381
## X3000m 0.3888661 -0.3475394 -0.15272772 0.36311995 -0.46335476 0.009277159
## Marathon 0.3670038 -0.3692076 -0.48437037 -0.67249685 0.13053590 0.142280558
## PC7
## lOOm 0.05217655
## X200m -0.10922503
## X400m 0.20849691
## X800m -0.31520972
## X1500m 0.69256151
## X3000m -0.59835943
## Marathon 0.06959828
#2. UMAP
umap0 <- umap(d = df_std %>% select(-country))
data_umap <- data.frame(x=umap0$layout[,1],y=umap0$layout[,2])
ggscatter(data_umap, x = "x", y = "y",
label = df_std$country,repel = TRUE,
font.label = c(9, "plain", "grey50"))

#3. Dendogram
metode_agg <- c("single","complete",
"average","ward.D","median","centroid",
"mcquitty")
# memberikan label negara setiap amatan
df_std2 <- df_std %>% select(-country)
rownames(df_std2) <- df$country
map(metode_agg,function(i){
hc_res0 <- hcut(x=df_std2,hc_method = i,hc_func = "hclust")
fviz_dend(x = hc_res0,
type = "rectangle",
k_colors ="black",
main = str_c("Dendrogram of HC with ",i," linkage"),
# besar kecil label default 0.8
cex = 0.5)
}
)
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

#4. Matrik WSS (Within Sum of Squared)
set.seed(123)
map(metode_agg, function(i)
fviz_nbclust(
x = df_std2,
FUNcluster = hcut,
method = "wss",
hc_method = i,
hc_fun = "hclust",
k.max = 25
)+
ggtitle(str_c("Optimal number of clusters based on HC with ",i," linkage"))
)
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

#5. Metric Koefisien Silhouette
set.seed(123)
sil_plot <- map(metode_agg, function(i)
fviz_nbclust(
x = df_std2,
FUNcluster = hcut,
method = "silhouette",
hc_method = i,
hc_fun = "hclust",
k.max = 25
)+
ggtitle(str_c("Optimal number of clusters based on HC with ",i," linkage"))
)
sil_plot
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

av_sil_max <- map_dbl(sil_plot,function(plot){
max(plot$data$y)
})
data.frame(metode=metode_agg,av_sil_max=av_sil_max)
## metode av_sil_max
## 1 single 0.6353285
## 2 complete 0.6056807
## 3 average 0.6056807
## 4 ward.D 0.4281800
## 5 median 0.5272862
## 6 centroid 0.6353285
## 7 mcquitty 0.5272862
#===Penerapan Metode Terbaik
set.seed(123)
hc_res <- hcut(x=df_std2,
k = 2,hc_method = "single",
hc_func = "hclust")
table(hc_res$cluster)
##
## 1 2
## 54 1
#===Validasi Hasil Clustering
sil <- cluster::silhouette(x = hc_res$cluster,
dist = dist(x =df_std2,
method = "euclidean"))
fviz_silhouette(sil,print.summary = FALSE)+
scale_color_manual(values = get_palette("jco",k=7))+
# get_palette berasal dari package ggpubr
scale_fill_manual(values = get_palette("jco",k=7))+
theme_minimal()+
theme(legend.position = "top")

sil %>%
as.data.frame() %>%
mutate(obs=1:nrow(sil)) %>%
relocate(obs) %>%
filter(sil_width<0) %>%
arrange(sil_width)
## obs cluster neighbor sil_width
## 1 12 1 2 -0.29836541
## 2 36 1 2 -0.04906492
corrected_cluster <- sil %>%
as.data.frame() %>%
mutate(cluster=if_else(sil_width<0,neighbor,cluster))
corrected_cluster %>%
filter(sil_width<0) %>%
arrange(sil_width)
## cluster neighbor sil_width
## 1 2 2 -0.29836541
## 2 2 2 -0.04906492
#===Interpretasi hasil clustering
#1. Menggunakan center
df |>
select(-country) %>%
mutate(cluster=corrected_cluster$cluster) %>%
group_by(cluster) %>%
summarise(across(everything(),mean))
## # A tibble: 2 × 8
## cluster lOOm X200m X400m X800m X1500m X3000m Marathon
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 11.6 23.5 53.1 2.06 4.28 9.32 168.
## 2 2 12.5 26.0 59.1 2.3 5.15 11.7 267.
#2. Menggunakan visualisasi
#2.1 UMAP
data_umap_labeled <- data_umap %>%
mutate(cluster=as.factor(corrected_cluster$cluster))
ggscatter(data_umap_labeled, x = "x",
y = "y",
color = "cluster",
palette = "jco",
label = df_std$country,repel = TRUE,
title = "Cluster Plot")

#2.2 PCA
fviz_cluster(object = list(data=df_std2,
cluster=corrected_cluster$cluster),
geom.ind = c("point","text"),
repel = TRUE,
show.clust.cent = FALSE,
ellipse.alpha = 0.05,
labelsize=9)+
theme_minimal()
