library(tidyverse)
library(factoextra)
library(DataExplorer)
library(easystats)
library(umap)
library(ggpubr)
library(readxl)
library(purrr)
library(ggplot2)
library(tidyr)
library(corrplot)
library(gridExtra)
library(dplyr)
library(forcats)
Clustering Analysis: K-Means dan Hierarchical Clustering
Persiapan dan Import data
# Aggregated data (one row per province)
df <- read_excel("Data_clean.xlsx")
glimpse(df)
Rows: 34
Columns: 5
$ Provinsi <chr> "ACEH", "BALI", "BANTEN", "BENGKULU", "DI YOGYAK…
$ `Mutu Lulusan` <dbl> 85.04377, 93.39481, 91.29524, 93.38393, 91.76484…
$ `Proses Pembelajaran` <dbl> 84.45532, 91.85758, 90.38333, 88.51042, 91.96667…
$ `Mutu Guru` <dbl> 82.08983, 91.89646, 88.26620, 87.03125, 87.13248…
$ `Manajemen S/M` <dbl> 85.40780, 95.51010, 92.14583, 91.19097, 93.72222…
# Original data, read from the "SMA" sheet of the workbook
df_asli <- read_excel("DATA TUGAS-1_Final.xlsx", sheet = "SMA")
glimpse(df_asli)
Rows: 1,144
Columns: 11
$ No <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
$ TAHUN <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2…
$ NPSN <dbl> 20501870, 10303885, 69974471, 50105492, 10600905,…
$ `Status Sekolah` <chr> "Swasta", "Swasta", "Swasta", "Negeri", "Negeri",…
$ Provinsi <chr> "JAWA TIMUR", "SUMATERA BARAT", "JAWA BARAT", "BA…
$ `Kab/Kota` <chr> "KABUPATEN SIDOARJO", "KOTA PAYAKUMBUH", "KABUPAT…
$ Mutu_Lulusan <dbl> 88.57143, 82.85714, 74.28571, 94.28571, 97.14286,…
$ Proses_Pembelajaran <dbl> 90.00000, 86.66667, 83.33333, 96.66667, 96.66667,…
$ Mutu_Guru <dbl> 88.88889, 83.33333, 77.77778, 100.00000, 94.44444…
$ `Manajemen_S/M` <dbl> 94.44444, 88.88889, 88.88889, 100.00000, 100.0000…
$ Peringkat_Akreditasi <chr> "A", "B", "B", "A", "A", "B", "C", "A", "B", "A",…
Eksplorasi Data
Korelasi data agregat
# Pearson correlation, computed on the numeric columns only
cor_mat <- cor(df %>% select(where(is.numeric)), method = "pearson")

# Plot the correlation matrix as a heatmap
corrplot(cor_mat, method = "color",
         type = "upper",                 # show the upper triangle
         addCoef.col = "black",          # print the correlation values
         tl.col = "black", tl.srt = 45)  # slanted labels
Histogram data agregat
# Introductory overview plot of the aggregated data
plot_intro(df, ggtheme = theme_minimal())

# Histograms of the aggregated data, laid out on a 2 x 2 grid
plot_histogram(
  df,
  geom_histogram_args = list(fill = "steelblue", col = "black"),
  ggtheme = theme_minimal(),
  nrow = 2,
  ncol = 2
)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Boxplot data agregat
# Reshape the data to long format (one row per variable/value pair)
df_long <- df %>%
  select(-Provinsi) %>% # drop Provinsi, which is only a label column
  pivot_longer(cols = everything(),
               names_to = "Variable",
               values_to = "Value")

# Exploratory boxplot of each variable
ggplot(df_long, aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "steelblue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Distribusi Variabel (Boxplot Eksplorasi)",
       x = "Variabel",
       y = "Nilai") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Boxplot data asli
# Reshape to long format, keeping Provinsi as the identifier
df_box <- df_asli %>%
  select(Provinsi, Mutu_Lulusan, Mutu_Guru, Proses_Pembelajaran, `Manajemen_S/M`) %>%
  pivot_longer(cols = -Provinsi,
               names_to = "Variable",
               values_to = "Value")

# Mean value per province and per variable
prov_means <- df_box %>%
  group_by(Variable, Provinsi) %>%
  summarise(mean_value = mean(Value, na.rm = TRUE), .groups = "drop")

# Join the means back in and reorder the Provinsi factor by mean value.
# NOTE(review): Provinsi ends up as a single factor column, so the ordering
# is presumably shared across facets rather than truly per variable - confirm.
df_box <- df_box %>%
  left_join(prov_means, by = c("Variable", "Provinsi")) %>%
  group_by(Variable) %>%
  mutate(Provinsi = fct_reorder(Provinsi, mean_value, .desc = TRUE)) %>%
  ungroup()

# Plot: one boxplot per province, one facet per variable
ggplot(df_box, aes(x = Provinsi, y = Value, fill = Provinsi)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ Variable, scales = "free_y") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
        legend.position = "none") +
  labs(title = "Distribusi Variabel per Provinsi",
       x = "Provinsi",
       y = "Rataan Skor")
Reduksi Dimensi untuk eksplorasi
Standarisasi
# Standardize every variable except the Provinsi label column
df_std <- df %>%
  select(-Provinsi) %>%
  standardize()
# Keep the province names as row names so later plots can label the points.
# NOTE(review): df_std is still a tibble here, which is why the
# "Setting row names on a tibble is deprecated" warning is raised.
rownames(df_std) <- df$Provinsi
Warning: Setting row names on a tibble is deprecated.
PCA
# Principal component analysis on the standardized data
pca0 <- prcomp(df_std)

# Show the loadings (rotation matrix) as a data frame
pca0$rotation %>%
  as.data.frame()
PC1 PC2 PC3 PC4
Mutu Lulusan 0.5016089 -0.1508693 -0.7655252 0.3736284
Proses Pembelajaran 0.5033937 0.1186185 -0.1079213 -0.8490450
Mutu Guru 0.4974482 0.7098739 0.3562985 0.3488205
Manajemen S/M 0.4975224 -0.6776778 0.5247624 0.1335990
# Plot the PCA individuals as labelled points
pca0 %>%
  fviz_pca_ind(
    labelsize = 3,
    repel = TRUE,
    geom.ind = c("point", "text")
  )
UMAP
# UMAP embedding of the standardized data
umap0 <- umap(df_std)

# Collect the first two layout columns together with the province names
data_umap <- data.frame(x = umap0$layout[, 1],
                        y = umap0$layout[, 2],
                        Provinsi = rownames(df_std))

# Scatter plot of the embedding, labelled by province
ggscatter(data_umap, x = "x", y = "y",
          label = data_umap$Provinsi,
          repel = TRUE,
          font.label = c(8, "plain", "grey50"))
Menentukan Jumlah Cluster Optimal
K-Means: Elbow & Silhouette
# Elbow plot (method = "wss") for k-means, up to 25 clusters
set.seed(123)
df_std %>%
  fviz_nbclust(FUNcluster = kmeans, k.max = 25, method = "wss")

# Silhouette plot for k-means, same seed and same range of k
set.seed(123)
df_std %>%
  fviz_nbclust(FUNcluster = kmeans, k.max = 25, method = "silhouette")
Hierarchical Clustering: Dendrogram dan Silhouette
# Linkage methods to compare
metode_agg <- c("single","complete","average","ward.D","ward.D2","median","centroid","mcquitty")

# Dendrogram for each linkage method
walk(metode_agg, ~ {
  hc_res0 <- hcut(x = df_std, hc_method = .x, hc_func = "hclust")
  print(fviz_dend(hc_res0,
                  type = "rectangle",
                  k_colors = "black",
                  main = str_c("Dendrogram of HC with ", .x, " linkage"),
                  cex = 0.5))
})
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
ℹ The deprecated feature was likely used in the factoextra package.
Please report the issue at <https://github.com/kassambara/factoextra/issues>.
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
# Silhouette plot of the candidate cluster counts, one per linkage method
walk(metode_agg, function(linkage_name) {
  sil_title <- paste("Silhouette Optimal Cluster -", linkage_name, "Linkage")
  sil_plot <- fviz_nbclust(
    df_std,
    FUNcluster = hcut,
    method = "silhouette",
    hc_method = linkage_name
  )
  print(sil_plot + ggtitle(sil_title))
})
# Menyimpan k optimal
library(factoextra)
library(tidyverse)
library(purrr)
# Find the silhouette-optimal number of clusters for one linkage method.
#
# X       : data passed on to fviz_nbclust()
# linkage : linkage method name, forwarded to hcut() as hc_method
#
# Returns a one-row tibble with columns linkage, k_opt and sil_max.
get_kopt_from_fviz <- function(X, linkage) {
  p <- fviz_nbclust(X, FUNcluster = hcut, method = "silhouette", hc_method = linkage)
  # Read the plotted points back from the first layer: x = k, y = silhouette
  dd <- layer_data(p, 1) %>% distinct(x, y)
  # NOTE(review): k_opt keeps the class of the plot's x values (it prints as
  # <mppd_dsc> in the result table); wrap in as.integer() if a plain integer
  # is needed downstream.
  k_opt <- dd$x[which.max(dd$y)]
  silmax <- max(dd$y, na.rm = TRUE)
  tibble(linkage = linkage, k_opt = k_opt, sil_max = silmax)
}
# Optimal k for every linkage method, highest silhouette first
kopt_tbl_fviz <- map_dfr(metode_agg, ~ get_kopt_from_fviz(df_std, .x)) %>%
  arrange(desc(sil_max))
kopt_tbl_fviz
# A tibble: 8 × 3
linkage k_opt sil_max
<chr> <mppd_dsc> <dbl>
1 average 2 0.565
2 ward.D 2 0.565
3 ward.D2 2 0.565
4 centroid 2 0.565
5 mcquitty 2 0.565
6 single 2 0.541
7 complete 2 0.455
8 median 4 0.277
# List of hierarchical clustering results, one entry per linkage method
hc_all <- pmap(
  list(kopt_tbl_fviz$linkage, kopt_tbl_fviz$k_opt),
  function(meth, k) {
    hc <- hclust(dist(df_std), method = meth)
    cl <- cutree(hc, k = k)
    list(linkage = meth, k_opt = k, hc = hc, clusters = cl)
  }
)
# Name each entry after its linkage method
names(hc_all) <- kopt_tbl_fviz$linkage
Clustering
Hierarchical Clustering
# Linkage methods to run
metode_agg <- c("single","complete","average","ward.D","ward.D2",
                "median","centroid","mcquitty")

# Loop over every linkage method
for (meth in metode_agg) {
  # Build the hierarchical clustering
  hc_res <- hclust(dist(df_std), method = meth)

  # Look up the optimal k for this method
  k_opt <- kopt_tbl_fviz$k_opt[kopt_tbl_fviz$linkage == meth]

  # Cut the tree into k_opt clusters
  clusters_hc <- cutree(hc_res, k = k_opt)

  # Draw the dendrogram with the clusters outlined
  p <- fviz_dend(hc_res, k = k_opt, rect = TRUE,
                 show_labels = TRUE,
                 cex = 0.5) +
    ggtitle(paste0("HC - ", meth, " (k = ", k_opt, ")"))
  print(p) # show the plots one at a time
}
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
Warning in data.frame(xmin = unlist(xleft), ymin = unlist(ybottom), xmax =
unlist(xright), : row names were found from a short variable and have been
discarded
Warning in get_col(col, k): Length of color vector was shorter than the number
of clusters - color vector was recycled
K-Means dengan jumlah cluster optimal
k_opt <- 2 # can be changed to match the elbow/silhouette results
set.seed(123) # fix the seed before kmeans() is called
kmean_res <- kmeans(df_std, centers = k_opt, iter.max = 100)
# Cluster centres, on the standardized scale of df_std
kmean_res$centers
Mutu Lulusan Proses Pembelajaran Mutu Guru Manajemen S/M
1 -1.1927301 -1.2067627 -1.2214300 -1.2190815
2 0.4969709 0.5028178 0.5089292 0.5079506
Visualisasi Hasil klaster
# Plot the K-Means clusters
plot_kmeans <- fviz_cluster(list(data = df_std,
                                 cluster = kmean_res$cluster),
                            geom.ind = c("point","text"),
                            repel = TRUE,
                            show.clust.cent = TRUE,
                            ellipse.alpha = 0.1,
                            labelsize = 6) +
  theme_minimal() +
  ggtitle("K-Means Clustering")
print(plot_kmeans)
# Hierarchical clusters at each method's optimal k
metode_agg <- kopt_tbl_fviz$linkage

walk(metode_agg, ~{
  meth <- .x
  k_opt <- kopt_tbl_fviz$k_opt[kopt_tbl_fviz$linkage == meth]

  hc_res <- hclust(dist(df_std), method = meth)
  clusters_hc <- cutree(hc_res, k = k_opt)

  plot_hc <- fviz_cluster(list(data = df_std,
                               cluster = clusters_hc),
                          geom.ind = c("point","text"),
                          repel = TRUE,
                          show.clust.cent = TRUE,
                          ellipse.alpha = 0.1,
                          labelsize = 6) +
    theme_minimal() +
    ggtitle(paste0("Hierarchical Clustering – ", meth, " (k = ", k_opt, ")"))
  print(plot_hc)
})
Simpan Hasil Klaster ke Data Asli
# Attach the K-Means cluster labels to df
df$cluster_kmeans <- kmean_res$cluster

# Add one cluster_hc_<linkage> column per linkage method
metode_agg <- kopt_tbl_fviz$linkage

for (meth in metode_agg) {
  k_opt <- kopt_tbl_fviz$k_opt[kopt_tbl_fviz$linkage == meth] # optimal number of clusters
  hc <- hclust(dist(df_std), method = meth) # hierarchical clustering tree
  df[[paste0("cluster_hc_", meth)]] <- cutree(hc, k = k_opt) # attach to df
}
# Show a summary of the cluster sizes
cat("\n== Ukuran cluster K-Means ==\n")
== Ukuran cluster K-Means ==
# Count how many rows of df fall in each K-Means cluster
print(table(df$cluster_kmeans))
1 2
10 24
# Print the cluster sizes for every linkage method
walk(metode_agg, ~{
  nm <- paste0("cluster_hc_", .x) # name of the column holding this method's labels
  cat("\n== Ukuran cluster HC -", .x, "==\n")
  print(table(df[[nm]]))
})
== Ukuran cluster HC - average ==
1 2
7 27
== Ukuran cluster HC - ward.D ==
1 2
7 27
== Ukuran cluster HC - ward.D2 ==
1 2
7 27
== Ukuran cluster HC - centroid ==
1 2
7 27
== Ukuran cluster HC - mcquitty ==
1 2
7 27
== Ukuran cluster HC - single ==
1 2
31 3
== Ukuran cluster HC - complete ==
1 2
14 20
== Ukuran cluster HC - median ==
1 2 3 4
7 1 23 3
# Cross-tabulate the K-Means labels against each linkage method's labels
walk(metode_agg, ~{
  nm <- paste0("cluster_hc_", .x) # name of the column holding this method's labels
  cat("\n== Crosstab KMeans vs HC -", .x, "==\n")
  print(table(KMeans = df$cluster_kmeans, HC = df[[nm]]))
})
== Crosstab KMeans vs HC - average ==
HC
KMeans 1 2
1 7 3
2 0 24
== Crosstab KMeans vs HC - ward.D ==
HC
KMeans 1 2
1 7 3
2 0 24
== Crosstab KMeans vs HC - ward.D2 ==
HC
KMeans 1 2
1 7 3
2 0 24
== Crosstab KMeans vs HC - centroid ==
HC
KMeans 1 2
1 7 3
2 0 24
== Crosstab KMeans vs HC - mcquitty ==
HC
KMeans 1 2
1 7 3
2 0 24
== Crosstab KMeans vs HC - single ==
HC
KMeans 1 2
1 7 3
2 24 0
== Crosstab KMeans vs HC - complete ==
HC
KMeans 1 2
1 10 0
2 4 20
== Crosstab KMeans vs HC - median ==
HC
KMeans 1 2 3 4
1 7 0 0 3
2 0 1 23 0
# Show the first rows of df, now including the cluster columns
head(df)
# A tibble: 6 × 14
Provinsi `Mutu Lulusan` `Proses Pembelajaran` `Mutu Guru` `Manajemen S/M`
<chr> <dbl> <dbl> <dbl> <dbl>
1 ACEH 85.0 84.5 82.1 85.4
2 BALI 93.4 91.9 91.9 95.5
3 BANTEN 91.3 90.4 88.3 92.1
4 BENGKULU 93.4 88.5 87.0 91.2
5 DI YOGYAKARTA 91.8 92.0 87.1 93.7
6 DKI JAKARTA 94.5 91.1 88.8 92.7
# ℹ 9 more variables: cluster_kmeans <int>, cluster_hc_average <int>,
# cluster_hc_ward.D <int>, cluster_hc_ward.D2 <int>,
# cluster_hc_centroid <int>, cluster_hc_mcquitty <int>,
# cluster_hc_single <int>, cluster_hc_complete <int>, cluster_hc_median <int>
Interpretasi
# --- K-Means ---
# Mean of every numeric column within each K-Means cluster.
# NOTE(review): where(is.numeric) also averages the other cluster-label
# columns (the mean_cluster_hc_* outputs), which are not meaningful.
df %>%
  group_by(cluster_kmeans) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{.col}"))
# A tibble: 2 × 13
cluster_kmeans `mean_Mutu Lulusan` `mean_Proses Pembelajaran` `mean_Mutu Guru`
<int> <dbl> <dbl> <dbl>
1 1 85.1 83.9 81.0
2 2 90.5 89.0 87.3
# ℹ 9 more variables: `mean_Manajemen S/M` <dbl>,
# mean_cluster_hc_average <dbl>, mean_cluster_hc_ward.D <dbl>,
# mean_cluster_hc_ward.D2 <dbl>, mean_cluster_hc_centroid <dbl>,
# mean_cluster_hc_mcquitty <dbl>, mean_cluster_hc_single <dbl>,
# mean_cluster_hc_complete <dbl>, mean_cluster_hc_median <dbl>
# --- HC: Single linkage ---
# Mean of every numeric column within each single-linkage cluster.
# NOTE(review): the other cluster-label columns are averaged as well.
df %>%
  group_by(cluster_hc_single) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{.col}"))
# A tibble: 2 × 13
cluster_hc_single `mean_Mutu Lulusan` mean_Proses Pembelaja…¹ `mean_Mutu Guru`
<int> <dbl> <dbl> <dbl>
1 1 89.6 88.1 86.2
2 2 82.5 81.3 77.5
# ℹ abbreviated name: ¹`mean_Proses Pembelajaran`
# ℹ 9 more variables: `mean_Manajemen S/M` <dbl>, mean_cluster_kmeans <dbl>,
# mean_cluster_hc_average <dbl>, mean_cluster_hc_ward.D <dbl>,
# mean_cluster_hc_ward.D2 <dbl>, mean_cluster_hc_centroid <dbl>,
# mean_cluster_hc_mcquitty <dbl>, mean_cluster_hc_complete <dbl>,
# mean_cluster_hc_median <dbl>
# --- HC: Complete linkage ---
# Mean of every numeric column within each complete-linkage cluster.
# NOTE(review): the other cluster-label columns are averaged as well.
df %>%
  group_by(cluster_hc_complete) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{.col}"))
# A tibble: 2 × 13
cluster_hc_complete `mean_Mutu Lulusan` `mean_Proses Pembelajaran`
<int> <dbl> <dbl>
1 1 85.9 84.9
2 2 91.1 89.3
# ℹ 10 more variables: `mean_Mutu Guru` <dbl>, `mean_Manajemen S/M` <dbl>,
# mean_cluster_kmeans <dbl>, mean_cluster_hc_average <dbl>,
# mean_cluster_hc_ward.D <dbl>, mean_cluster_hc_ward.D2 <dbl>,
# mean_cluster_hc_centroid <dbl>, mean_cluster_hc_mcquitty <dbl>,
# mean_cluster_hc_single <dbl>, mean_cluster_hc_median <dbl>
# --- HC: Average linkage ---
# Mean of every numeric column within each average-linkage cluster.
# NOTE(review): the other cluster-label columns are averaged as well.
df %>%
  group_by(cluster_hc_average) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{.col}"))
# A tibble: 2 × 13
cluster_hc_average `mean_Mutu Lulusan` mean_Proses Pembelaj…¹ `mean_Mutu Guru`
<int> <dbl> <dbl> <dbl>
1 1 84.4 82.9 79.9
2 2 90.1 88.7 86.9
# ℹ abbreviated name: ¹`mean_Proses Pembelajaran`
# ℹ 9 more variables: `mean_Manajemen S/M` <dbl>, mean_cluster_kmeans <dbl>,
# mean_cluster_hc_ward.D <dbl>, mean_cluster_hc_ward.D2 <dbl>,
# mean_cluster_hc_centroid <dbl>, mean_cluster_hc_mcquitty <dbl>,
# mean_cluster_hc_single <dbl>, mean_cluster_hc_complete <dbl>,
# mean_cluster_hc_median <dbl>
# --- HC: Ward.D ---
# Mean of every numeric column within each ward.D cluster.
# NOTE(review): the other cluster-label columns are averaged as well.
df %>%
  group_by(cluster_hc_ward.D) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{.col}"))
# A tibble: 2 × 13
cluster_hc_ward.D `mean_Mutu Lulusan` mean_Proses Pembelaja…¹ `mean_Mutu Guru`
<int> <dbl> <dbl> <dbl>
1 1 84.4 82.9 79.9
2 2 90.1 88.7 86.9
# ℹ abbreviated name: ¹`mean_Proses Pembelajaran`
# ℹ 9 more variables: `mean_Manajemen S/M` <dbl>, mean_cluster_kmeans <dbl>,
# mean_cluster_hc_average <dbl>, mean_cluster_hc_ward.D2 <dbl>,
# mean_cluster_hc_centroid <dbl>, mean_cluster_hc_mcquitty <dbl>,
# mean_cluster_hc_single <dbl>, mean_cluster_hc_complete <dbl>,
# mean_cluster_hc_median <dbl>
# --- HC: Ward.D2 ---
# Mean of every numeric column within each ward.D2 cluster.
# NOTE(review): the other cluster-label columns are averaged as well.
df %>%
  group_by(cluster_hc_ward.D2) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{.col}"))
# A tibble: 2 × 13
cluster_hc_ward.D2 `mean_Mutu Lulusan` mean_Proses Pembelaj…¹ `mean_Mutu Guru`
<int> <dbl> <dbl> <dbl>
1 1 84.4 82.9 79.9
2 2 90.1 88.7 86.9
# ℹ abbreviated name: ¹`mean_Proses Pembelajaran`
# ℹ 9 more variables: `mean_Manajemen S/M` <dbl>, mean_cluster_kmeans <dbl>,
# mean_cluster_hc_average <dbl>, mean_cluster_hc_ward.D <dbl>,
# mean_cluster_hc_centroid <dbl>, mean_cluster_hc_mcquitty <dbl>,
# mean_cluster_hc_single <dbl>, mean_cluster_hc_complete <dbl>,
# mean_cluster_hc_median <dbl>
# --- HC: Median linkage ---
# Mean of every numeric column within each median-linkage cluster.
# NOTE(review): the other cluster-label columns are averaged as well.
df %>%
  group_by(cluster_hc_median) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{.col}"))
# A tibble: 4 × 13
cluster_hc_median `mean_Mutu Lulusan` mean_Proses Pembelaja…¹ `mean_Mutu Guru`
<int> <dbl> <dbl> <dbl>
1 1 86.2 85.0 82.6
2 2 93.4 91.9 91.9
3 3 90.4 88.8 87.1
4 4 82.5 81.3 77.5
# ℹ abbreviated name: ¹`mean_Proses Pembelajaran`
# ℹ 9 more variables: `mean_Manajemen S/M` <dbl>, mean_cluster_kmeans <dbl>,
# mean_cluster_hc_average <dbl>, mean_cluster_hc_ward.D <dbl>,
# mean_cluster_hc_ward.D2 <dbl>, mean_cluster_hc_centroid <dbl>,
# mean_cluster_hc_mcquitty <dbl>, mean_cluster_hc_single <dbl>,
# mean_cluster_hc_complete <dbl>
# --- HC: Centroid linkage ---
# Mean of every numeric column within each centroid-linkage cluster.
# NOTE(review): the other cluster-label columns are averaged as well.
df %>%
  group_by(cluster_hc_centroid) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{.col}"))
# A tibble: 2 × 13
cluster_hc_centroid `mean_Mutu Lulusan` `mean_Proses Pembelajaran`
<int> <dbl> <dbl>
1 1 84.4 82.9
2 2 90.1 88.7
# ℹ 10 more variables: `mean_Mutu Guru` <dbl>, `mean_Manajemen S/M` <dbl>,
# mean_cluster_kmeans <dbl>, mean_cluster_hc_average <dbl>,
# mean_cluster_hc_ward.D <dbl>, mean_cluster_hc_ward.D2 <dbl>,
# mean_cluster_hc_mcquitty <dbl>, mean_cluster_hc_single <dbl>,
# mean_cluster_hc_complete <dbl>, mean_cluster_hc_median <dbl>
# --- HC: Mcquitty linkage ---
# Mean of every numeric column within each mcquitty-linkage cluster.
# NOTE(review): the other cluster-label columns are averaged as well.
df %>%
  group_by(cluster_hc_mcquitty) %>%
  summarise(across(where(is.numeric), mean, .names = "mean_{.col}"))
# A tibble: 2 × 13
cluster_hc_mcquitty `mean_Mutu Lulusan` `mean_Proses Pembelajaran`
<int> <dbl> <dbl>
1 1 84.4 82.9
2 2 90.1 88.7
# ℹ 10 more variables: `mean_Mutu Guru` <dbl>, `mean_Manajemen S/M` <dbl>,
# mean_cluster_kmeans <dbl>, mean_cluster_hc_average <dbl>,
# mean_cluster_hc_ward.D <dbl>, mean_cluster_hc_ward.D2 <dbl>,
# mean_cluster_hc_centroid <dbl>, mean_cluster_hc_single <dbl>,
# mean_cluster_hc_complete <dbl>, mean_cluster_hc_median <dbl>