# HIERARCHICAL CLUSTERING (DATA PKL BANTUL)
# 1. Load library
library(readxl)
library(dplyr)
library(factoextra)
library(cluster)
# 2. Import data
df <- read_excel("C:/Data/datapklpdpk.xlsx")
datapkl <- data.frame(df)
datapkl
## kapanewon pend_rendah pend_menengah pend_tinggi agraris formal
## 1 SRANDAKAN 28.12208 59.87621 12.001713 35.216882 23.745604
## 2 SANDEN 29.00309 56.93075 14.066163 32.637405 26.209655
## 3 KRETEK 27.36911 59.32716 13.303735 38.738993 27.150598
## 4 PUNDONG 35.45224 55.33728 9.210482 38.265051 25.969148
## 5 BAMBANGLIPURO 28.71355 57.90674 13.379719 33.177508 29.700370
## 6 PANDAK 34.97497 54.35637 10.668663 27.707747 24.523388
## 7 PAJANGAN 36.49559 54.45912 9.045290 16.266916 22.482501
## 8 BANTUL 27.09640 55.38568 17.517921 8.370620 36.745308
## 9 JETIS 31.38802 56.26517 12.346812 5.838568 29.076633
## 10 IMOGIRI 40.85698 50.47098 8.672038 16.234252 18.308182
## 11 DLINGO 39.40542 56.59581 3.998768 44.494771 9.532241
## 12 BANGUNTAPAN 26.08650 51.04045 22.873053 3.307092 40.503283
## 13 PLERET 36.86429 51.53485 11.600863 9.695519 21.917916
## 14 PIYUNGAN 27.06750 59.78891 13.143592 17.362055 31.292493
## 15 SEWON 28.79035 53.04883 18.160818 19.017699 37.549515
## 16 KASIHAN 26.68839 54.82107 18.490541 5.063062 34.936576
## 17 SEDAYU 27.96183 57.76364 14.274529 26.230606 34.386278
## informal
## 1 41.03751
## 2 41.15294
## 3 34.11041
## 4 35.76580
## 5 37.12212
## 6 47.76886
## 7 61.25058
## 8 54.88407
## 9 65.08480
## 10 65.45757
## 11 45.97299
## 12 56.18962
## 13 68.38657
## 14 51.34545
## 15 43.43279
## 16 60.00036
## 17 39.38312
# Statistika Deskriptif
statdes <- summary(datapkl)
statdes
## kapanewon pend_rendah pend_menengah pend_tinggi
## Length:17 Min. :26.09 Min. :50.47 Min. : 3.999
## Class :character 1st Qu.:27.37 1st Qu.:54.36 1st Qu.:10.669
## Mode :character Median :28.79 Median :55.39 Median :13.144
## Mean :31.31 Mean :55.58 Mean :13.103
## 3rd Qu.:35.45 3rd Qu.:57.76 3rd Qu.:14.275
## Max. :40.86 Max. :59.88 Max. :22.873
## agraris formal informal
## Min. : 3.307 Min. : 9.532 Min. :34.11
## 1st Qu.: 9.696 1st Qu.:23.746 1st Qu.:41.04
## Median :19.018 Median :27.151 Median :47.77
## Mean :22.213 Mean :27.884 Mean :49.90
## 3rd Qu.:33.178 3rd Qu.:34.386 3rd Qu.:60.00
## Max. :44.495 Max. :40.503 Max. :68.39
# 3. Ambil variabel numerik (hapus nama wilayah)
df_num <- df[, -1]
# 4. Standardisasi data
dfs <- scale(df_num)
dfs
## pend_rendah pend_menengah pend_tinggi agraris formal informal
## [1,] -0.65090876 1.46921426 -0.247828327 0.9752831 -0.53200172 -0.7829048
## [2,] -0.47124447 0.46125185 0.216653772 0.7818207 -0.21524901 -0.7727113
## [3,] -0.80446307 1.28132502 0.045114650 1.2394438 -0.09429122 -1.3946547
## [4,] 0.84393299 -0.08404802 -0.875829356 1.2038978 -0.24616617 -1.2484629
## [5,] -0.53029108 0.79524353 0.062210179 0.8223288 0.23348071 -1.1286828
## [6,] 0.74660235 -0.41972334 -0.547752119 0.4120931 -0.43201792 -0.1884426
## [7,] 1.05670271 -0.38456003 -0.912995987 -0.4459768 -0.69437298 1.0021615
## [8,] -0.86007590 -0.06748513 0.993267245 -1.0382041 1.13910430 0.4399191
## [9,] 0.01511579 0.23348354 -0.170184263 -1.2281097 0.15329969 1.3407706
## [10,] 1.94612380 -1.74933918 -0.996974246 -0.4484266 -1.23097978 1.3736906
## [11,] 1.65010662 0.34663301 -2.048416570 1.6711309 -2.35912305 -0.3470409
## [12,] -1.06602500 -1.55446284 2.198122138 -1.4179720 1.62219032 0.5552157
## [13,] 1.13189245 -1.38527375 -0.338015757 -0.9388358 -0.76695018 1.6323578
## [14,] -0.86596974 1.43934006 0.009083796 -0.3638406 0.43814741 0.1274147
## [15,] -0.51462824 -0.86717512 1.137913118 -0.2396662 1.24248480 -0.5713724
## [16,] -0.94328182 -0.26069901 1.212097558 -1.2862731 0.90659267 0.8917513
## [17,] -0.68358862 0.74627515 0.263534169 0.3013066 0.83585214 -0.9290089
## attr(,"scaled:center")
## pend_rendah pend_menengah pend_tinggi agraris formal
## 31.31390 55.58288 13.10322 22.21322 27.88410
## informal
## 49.90268
## attr(,"scaled:scale")
## pend_rendah pend_menengah pend_tinggi agraris formal
## 4.903636 2.922191 4.444628 13.333216 7.779102
## informal
## 11.323427
# 5. Hitung jarak (Euclidean distance)
d <- dist(dfs, method = "euclidean")
d
## 1 2 3 4 5 6 7
## 2 1.1840080
## 3 0.8833494 1.1932492
## 4 2.3221266 1.9042079 2.3399343
## 5 1.1374664 0.6845471 0.8147995 1.9828179
## 6 2.5083146 1.8353860 2.8114723 1.4194803 2.2302217
## 7 3.4680931 3.0353498 4.0127990 2.8504816 3.4455074 1.5663505
## 8 3.5053586 2.7660652 3.5771470 4.0241858 2.9089817 3.1696771 3.3818493
## 9 3.4367460 3.0133905 3.9304845 3.7502083 3.3130012 2.5447326 1.8610716
## 10 4.9831022 4.3994924 5.4443830 3.8181694 4.8734730 2.6884242 1.7569095
## 11 3.7161179 3.9002661 4.2065124 2.7759355 4.1788939 2.9966113 3.3540775
## 12 5.2398854 4.2803911 5.1532234 5.3737744 4.4892345 4.4983347 4.6973080
## 13 4.5702474 3.9162167 5.0365849 3.9021793 4.4049515 2.5253174 1.4081934
## 14 1.9173604 1.9249009 2.2805169 3.2934295 1.8860090 2.7974271 3.1508797
## 15 3.4737901 2.4133545 3.2475831 3.3574612 2.5253783 2.8298000 3.6301815
## 16 3.8984778 3.1687407 4.0456922 4.4379164 3.4042991 3.4381991 3.4369582
## 17 1.7696936 1.2201076 1.5200955 2.5314805 0.8605896 2.4962754 3.5101376
## 8 9 10 11 12 13 14
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 2.0073790
## 10 4.6369728 3.2968437
## 11 5.9934754 4.8765076 3.7831739
## 12 2.0238780 3.5756505 5.3916748 7.4318307
## 13 3.5414188 2.2161442 1.3238112 4.4111764 4.2882701
## 14 2.0695264 2.1360877 4.8408720 4.8937609 4.0609028 4.0172264
## 15 1.5655099 3.0038030 4.6191055 5.7432703 2.1684620 3.8132553 2.8050651
## 16 0.6417895 1.9610959 4.5769183 6.2186182 1.8176402 3.3828722 2.4490581
## 17 2.2329529 2.9826407 4.9868506 4.8346598 3.8677452 4.3438303 1.5149150
## 15 16
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16 1.9763606
## 17 1.9955484 2.7967466
# 6. Koefisien Korelasi Cophenetic
## Single Linkage
hc_single <- hclust(d, method = "single")
cor_single <- cor(d, cophenetic(hc_single))
## Complete Linkage
hc_complete <- hclust(d, method = "complete")
cor_complete <- cor(d, cophenetic(hc_complete))
## Average Linkage
hc_average <- hclust(d, method = "average")
cor_average <- cor(d, cophenetic(hc_average))
data.frame(single = cor_single,
complete = cor_complete,
average = cor_average)
## single complete average
## 1 0.6655329 0.656295 0.7666164
Koefisien cophenetic digunakan untuk menentukan metode linkage terbaik. Hasil menunjukkan bahwa average linkage memiliki nilai cophenetic terbesar, sehingga dipilih sebagai metode pengelompokan.
# 7. METODE TERBAIK (AVERAGE)
hc <- hc_average
print(hc)
##
## Call:
## hclust(d = d, method = "average")
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 17
# 8. DENDROGRAM
# Dendrogram
plot(hc,
labels = datapkl$kapanewon,
main="Cluster Dendrogram",
xlab="Kapanewon",
ylab="Jarak")
# Indeks Validitas (SilhouEtte)
sil_score <- data.frame(k = 2:16, silhouette = NA)
for(i in 2:16){
cl <- cutree(hc, k = i)
sil <- silhouette(cl, d)
sil_score[sil_score$k == i, "silhouette"] <- mean(sil[, "sil_width"])
}
sil_score
## k silhouette
## 1 2 0.28012263
## 2 3 0.27594955
## 3 4 0.38602403
## 4 5 0.29861083
## 5 6 0.31904135
## 6 7 0.29455345
## 7 8 0.27352386
## 8 9 0.29256122
## 9 10 0.23188402
## 10 11 0.20459151
## 11 12 0.18656599
## 12 13 0.15051152
## 13 14 0.13086875
## 14 15 0.10697955
## 15 16 0.07276206
fviz_nbclust(dfs, FUNcluster = hcut,
method = "silhouette",
hc_method = "average",
k.max = 16)
Berdasarkan grafik silhouette, nilai rata-rata silhouette tertinggi diperoleh pada k = 4, sehingga jumlah cluster optimal yang digunakan dalam penelitian ini adalah 4.
# 9. Menentukan Cluster
idclus <- cutree(hc, k=4)
hasil_cluster <- data.frame(kapanewon = datapkl$kapanewon,
cluster = idclus)
hasil_cluster
## kapanewon cluster
## 1 SRANDAKAN 1
## 2 SANDEN 1
## 3 KRETEK 1
## 4 PUNDONG 1
## 5 BAMBANGLIPURO 1
## 6 PANDAK 1
## 7 PAJANGAN 2
## 8 BANTUL 3
## 9 JETIS 2
## 10 IMOGIRI 2
## 11 DLINGO 4
## 12 BANGUNTAPAN 3
## 13 PLERET 2
## 14 PIYUNGAN 1
## 15 SEWON 3
## 16 KASIHAN 3
## 17 SEDAYU 1
split(datapkl$kapanewon, idclus)
## $`1`
## [1] "SRANDAKAN" "SANDEN" "KRETEK" "PUNDONG"
## [5] "BAMBANGLIPURO" "PANDAK" "PIYUNGAN" "SEDAYU"
##
## $`2`
## [1] "PAJANGAN" "JETIS" "IMOGIRI" "PLERET"
##
## $`3`
## [1] "BANTUL" "BANGUNTAPAN" "SEWON" "KASIHAN"
##
## $`4`
## [1] "DLINGO"
# Rata-rata tiap cluster
aggregate(datapkl[,-1], list(idclus), mean)
## Group.1 pend_rendah pend_menengah pend_tinggi agraris formal informal
## 1 1 29.83304 57.66088 12.506074 31.167031 27.872192 40.96078
## 2 2 36.40122 53.18253 10.416251 12.008814 22.946308 65.04488
## 3 3 27.16541 53.57401 19.260583 8.939618 37.433670 53.62671
## 4 4 39.40542 56.59581 3.998768 44.494771 9.532241 45.97299
# 10. VISUALISASI CLUSTER
fviz_cluster(list(data = dfs, cluster = idclus),
ellipse.type = "convex",
repel = TRUE,
ggtheme = theme_minimal())
library(tidyr)
library(ggplot2)
df_cl <- data.frame(datapkl, cluster = idclus)
df_long <- df_cl %>%
pivot_longer(cols = -c(kapanewon, cluster),
names_to = "variabel",
values_to = "nilai")
# Boxplot
ggplot(df_long, aes(x = factor(cluster), y = nilai, fill = factor(cluster))) +
geom_boxplot() +
facet_wrap(~variabel, scales = "free") +
labs(x = "Cluster", y = "Nilai", fill = "Cluster",
title = "Perbandingan Karakteristik Antar Cluster") +
theme_minimal()