# HIERARCHICAL CLUSTERING (DATA PKL BANTUL)
# 1. Load library
library(readxl)
library(dplyr)
library(factoextra)
# 2. Import data
df <- read_excel("C:/Users/farah/OneDrive/Documents/FARAH/PKL/data/datapkl9var.xlsx")
# 3. Ambil variabel numerik (hapus nama wilayah)
df_num <- df[, -1]
# 4. Handle missing value
df_num <- na.omit(df_num)
# 5. Standardisasi data
dfs <- scale(df_num)
# 6. Hitung jarak (Euclidean distance)
d <- dist(dfs, method = "euclidean")
d
## 1 2 3 4 5 6 7
## 2 1.1840080
## 3 0.8833494 1.1932492
## 4 2.3221266 1.9042079 2.3399343
## 5 1.1374664 0.6845471 0.8147995 1.9828179
## 6 2.5083146 1.8353860 2.8114723 1.4194803 2.2302217
## 7 3.4680931 3.0353498 4.0127990 2.8504816 3.4455074 1.5663505
## 8 3.5053586 2.7660652 3.5771470 4.0241858 2.9089817 3.1696771 3.3818493
## 9 3.4367460 3.0133905 3.9304845 3.7502083 3.3130012 2.5447326 1.8610716
## 10 4.9831022 4.3994924 5.4443830 3.8181694 4.8734730 2.6884242 1.7569095
## 11 3.7161179 3.9002661 4.2065124 2.7759355 4.1788939 2.9966113 3.3540775
## 12 5.2398854 4.2803911 5.1532234 5.3737744 4.4892345 4.4983347 4.6973080
## 13 4.5702474 3.9162167 5.0365849 3.9021793 4.4049515 2.5253174 1.4081934
## 14 1.9173604 1.9249009 2.2805169 3.2934295 1.8860090 2.7974271 3.1508797
## 15 3.4737901 2.4133545 3.2475831 3.3574612 2.5253783 2.8298000 3.6301815
## 16 3.8984778 3.1687407 4.0456922 4.4379164 3.4042991 3.4381991 3.4369582
## 17 1.7696936 1.2201076 1.5200955 2.5314805 0.8605896 2.4962754 3.5101376
## 8 9 10 11 12 13 14
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 2.0073790
## 10 4.6369728 3.2968437
## 11 5.9934754 4.8765076 3.7831739
## 12 2.0238780 3.5756505 5.3916748 7.4318307
## 13 3.5414188 2.2161442 1.3238112 4.4111764 4.2882701
## 14 2.0695264 2.1360877 4.8408720 4.8937609 4.0609028 4.0172264
## 15 1.5655099 3.0038030 4.6191055 5.7432703 2.1684620 3.8132553 2.8050651
## 16 0.6417895 1.9610959 4.5769183 6.2186182 1.8176402 3.3828722 2.4490581
## 17 2.2329529 2.9826407 4.9868506 4.8346598 3.8677452 4.3438303 1.5149150
## 15 16
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16 1.9763606
## 17 1.9955484 2.7967466
# 7. CLUSTERING (BANDINGKAN METODE)
hc_single <- hclust(d, method = "single")
hc_complete <- hclust(d, method = "complete")
hc_average <- hclust(d, method = "average")
# 8. COPHENETIC CORRELATION (PILIH METODE TERBAIK)
cor_single <- cor(d, cophenetic(hc_single))
cor_complete <- cor(d, cophenetic(hc_complete))
cor_average <- cor(d, cophenetic(hc_average))
data.frame(single = cor_single,
complete = cor_complete,
average = cor_average)
## single complete average
## 1 0.6655329 0.656295 0.7666164
# 9. METODE TERBAIK (AVERAGE)
hc <- hc_average
# 10. DENDROGRAM FINAL
plot(hc, cex = 0.7, hang = -1)
# Coba beberapa jumlah cluster
rect.hclust(hc, k = 2, border = "red")
rect.hclust(hc, k = 3, border = "blue")
rect.hclust(hc, k = 4, border = "green")

# Sillhoutte Score
fviz_nbclust(dfs, FUN = hcut, method = "silhouette")

# 11. Menentukan Cluster
grp <- cutree(hc, k = 3)
df_cl <- mutate(df, cluster = grp)
print(df_cl, width = Inf)
## # A tibble: 17 × 8
## kapanewon pend_rendah pend_menengah pend_tinggi agraris formal informal
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SRANDAKAN 28.1 59.9 12.0 35.2 23.7 41.0
## 2 SANDEN 29.0 56.9 14.1 32.6 26.2 41.2
## 3 KRETEK 27.4 59.3 13.3 38.7 27.2 34.1
## 4 PUNDONG 35.5 55.3 9.21 38.3 26.0 35.8
## 5 BAMBANGLIPURO 28.7 57.9 13.4 33.2 29.7 37.1
## 6 PANDAK 35.0 54.4 10.7 27.7 24.5 47.8
## 7 PAJANGAN 36.5 54.5 9.05 16.3 22.5 61.3
## 8 BANTUL 27.1 55.4 17.5 8.37 36.7 54.9
## 9 JETIS 31.4 56.3 12.3 5.84 29.1 65.1
## 10 IMOGIRI 40.9 50.5 8.67 16.2 18.3 65.5
## 11 DLINGO 39.4 56.6 4.00 44.5 9.53 46.0
## 12 BANGUNTAPAN 26.1 51.0 22.9 3.31 40.5 56.2
## 13 PLERET 36.9 51.5 11.6 9.70 21.9 68.4
## 14 PIYUNGAN 27.1 59.8 13.1 17.4 31.3 51.3
## 15 SEWON 28.8 53.0 18.2 19.0 37.5 43.4
## 16 KASIHAN 26.7 54.8 18.5 5.06 34.9 60.0
## 17 SEDAYU 28.0 57.8 14.3 26.2 34.4 39.4
## cluster
## <int>
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 2
## 8 1
## 9 2
## 10 2
## 11 3
## 12 1
## 13 2
## 14 1
## 15 1
## 16 1
## 17 1
# 12. JUMLAH ANGGOTA CLUSTER
count(df_cl, cluster)
## # A tibble: 3 × 2
## cluster n
## <int> <int>
## 1 1 12
## 2 2 4
## 3 3 1
# 13. INTERPRETASI CLUSTER (RATA-RATA)
df_cl %>%
group_by(cluster) %>%
summarise(across(-kapanewon, mean))
## # A tibble: 3 × 7
## cluster pend_rendah pend_menengah pend_tinggi agraris formal informal
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 28.9 56.3 14.8 23.8 31.1 45.2
## 2 2 36.4 53.2 10.4 12.0 22.9 65.0
## 3 3 39.4 56.6 4.00 44.5 9.53 46.0
# 14. VISUALISASI CLUSTER
fviz_cluster(list(data = df_num, cluster = grp),
ellipse.type = "convex",
repel = TRUE,
ggtheme = theme_minimal())

library(tidyr)
df_long <- df_cl %>%
pivot_longer(cols = -c(kapanewon, cluster),
names_to = "variabel",
values_to = "nilai")
ggplot(df_long, aes(x = factor(cluster), y = nilai, fill = factor(cluster))) +
geom_boxplot() +
facet_wrap(~variabel, scales = "free") +
theme_minimal()
