# HIERARCHICAL CLUSTERING (DATA PKL BANTUL)
# 1. Load library
library(readxl)
library(dplyr)
library(factoextra)
library(cluster)

# 2. Import data
df <- read_excel("C:/Data/datapklpdpk.xlsx")
datapkl <- data.frame(df)
datapkl
##        kapanewon pend_rendah pend_menengah pend_tinggi   agraris    formal
## 1      SRANDAKAN    28.12208      59.87621   12.001713 35.216882 23.745604
## 2         SANDEN    29.00309      56.93075   14.066163 32.637405 26.209655
## 3         KRETEK    27.36911      59.32716   13.303735 38.738993 27.150598
## 4        PUNDONG    35.45224      55.33728    9.210482 38.265051 25.969148
## 5  BAMBANGLIPURO    28.71355      57.90674   13.379719 33.177508 29.700370
## 6         PANDAK    34.97497      54.35637   10.668663 27.707747 24.523388
## 7       PAJANGAN    36.49559      54.45912    9.045290 16.266916 22.482501
## 8         BANTUL    27.09640      55.38568   17.517921  8.370620 36.745308
## 9          JETIS    31.38802      56.26517   12.346812  5.838568 29.076633
## 10       IMOGIRI    40.85698      50.47098    8.672038 16.234252 18.308182
## 11        DLINGO    39.40542      56.59581    3.998768 44.494771  9.532241
## 12   BANGUNTAPAN    26.08650      51.04045   22.873053  3.307092 40.503283
## 13        PLERET    36.86429      51.53485   11.600863  9.695519 21.917916
## 14      PIYUNGAN    27.06750      59.78891   13.143592 17.362055 31.292493
## 15         SEWON    28.79035      53.04883   18.160818 19.017699 37.549515
## 16       KASIHAN    26.68839      54.82107   18.490541  5.063062 34.936576
## 17        SEDAYU    27.96183      57.76364   14.274529 26.230606 34.386278
##    informal
## 1  41.03751
## 2  41.15294
## 3  34.11041
## 4  35.76580
## 5  37.12212
## 6  47.76886
## 7  61.25058
## 8  54.88407
## 9  65.08480
## 10 65.45757
## 11 45.97299
## 12 56.18962
## 13 68.38657
## 14 51.34545
## 15 43.43279
## 16 60.00036
## 17 39.38312
# Statistika Deskriptif
statdes <- summary(datapkl)
statdes
##   kapanewon          pend_rendah    pend_menengah    pend_tinggi    
##  Length:17          Min.   :26.09   Min.   :50.47   Min.   : 3.999  
##  Class :character   1st Qu.:27.37   1st Qu.:54.36   1st Qu.:10.669  
##  Mode  :character   Median :28.79   Median :55.39   Median :13.144  
##                     Mean   :31.31   Mean   :55.58   Mean   :13.103  
##                     3rd Qu.:35.45   3rd Qu.:57.76   3rd Qu.:14.275  
##                     Max.   :40.86   Max.   :59.88   Max.   :22.873  
##     agraris           formal          informal    
##  Min.   : 3.307   Min.   : 9.532   Min.   :34.11  
##  1st Qu.: 9.696   1st Qu.:23.746   1st Qu.:41.04  
##  Median :19.018   Median :27.151   Median :47.77  
##  Mean   :22.213   Mean   :27.884   Mean   :49.90  
##  3rd Qu.:33.178   3rd Qu.:34.386   3rd Qu.:60.00  
##  Max.   :44.495   Max.   :40.503   Max.   :68.39
# 3. Ambil variabel numerik (hapus nama wilayah)
df_num <- df[, -1]

# 4. Standardisasi data
dfs <- scale(df_num)
dfs
##       pend_rendah pend_menengah  pend_tinggi    agraris      formal   informal
##  [1,] -0.65090876    1.46921426 -0.247828327  0.9752831 -0.53200172 -0.7829048
##  [2,] -0.47124447    0.46125185  0.216653772  0.7818207 -0.21524901 -0.7727113
##  [3,] -0.80446307    1.28132502  0.045114650  1.2394438 -0.09429122 -1.3946547
##  [4,]  0.84393299   -0.08404802 -0.875829356  1.2038978 -0.24616617 -1.2484629
##  [5,] -0.53029108    0.79524353  0.062210179  0.8223288  0.23348071 -1.1286828
##  [6,]  0.74660235   -0.41972334 -0.547752119  0.4120931 -0.43201792 -0.1884426
##  [7,]  1.05670271   -0.38456003 -0.912995987 -0.4459768 -0.69437298  1.0021615
##  [8,] -0.86007590   -0.06748513  0.993267245 -1.0382041  1.13910430  0.4399191
##  [9,]  0.01511579    0.23348354 -0.170184263 -1.2281097  0.15329969  1.3407706
## [10,]  1.94612380   -1.74933918 -0.996974246 -0.4484266 -1.23097978  1.3736906
## [11,]  1.65010662    0.34663301 -2.048416570  1.6711309 -2.35912305 -0.3470409
## [12,] -1.06602500   -1.55446284  2.198122138 -1.4179720  1.62219032  0.5552157
## [13,]  1.13189245   -1.38527375 -0.338015757 -0.9388358 -0.76695018  1.6323578
## [14,] -0.86596974    1.43934006  0.009083796 -0.3638406  0.43814741  0.1274147
## [15,] -0.51462824   -0.86717512  1.137913118 -0.2396662  1.24248480 -0.5713724
## [16,] -0.94328182   -0.26069901  1.212097558 -1.2862731  0.90659267  0.8917513
## [17,] -0.68358862    0.74627515  0.263534169  0.3013066  0.83585214 -0.9290089
## attr(,"scaled:center")
##   pend_rendah pend_menengah   pend_tinggi       agraris        formal 
##      31.31390      55.58288      13.10322      22.21322      27.88410 
##      informal 
##      49.90268 
## attr(,"scaled:scale")
##   pend_rendah pend_menengah   pend_tinggi       agraris        formal 
##      4.903636      2.922191      4.444628     13.333216      7.779102 
##      informal 
##     11.323427
# 5. Hitung jarak (Euclidean distance)
d <- dist(dfs, method = "euclidean")
d
##            1         2         3         4         5         6         7
## 2  1.1840080                                                            
## 3  0.8833494 1.1932492                                                  
## 4  2.3221266 1.9042079 2.3399343                                        
## 5  1.1374664 0.6845471 0.8147995 1.9828179                              
## 6  2.5083146 1.8353860 2.8114723 1.4194803 2.2302217                    
## 7  3.4680931 3.0353498 4.0127990 2.8504816 3.4455074 1.5663505          
## 8  3.5053586 2.7660652 3.5771470 4.0241858 2.9089817 3.1696771 3.3818493
## 9  3.4367460 3.0133905 3.9304845 3.7502083 3.3130012 2.5447326 1.8610716
## 10 4.9831022 4.3994924 5.4443830 3.8181694 4.8734730 2.6884242 1.7569095
## 11 3.7161179 3.9002661 4.2065124 2.7759355 4.1788939 2.9966113 3.3540775
## 12 5.2398854 4.2803911 5.1532234 5.3737744 4.4892345 4.4983347 4.6973080
## 13 4.5702474 3.9162167 5.0365849 3.9021793 4.4049515 2.5253174 1.4081934
## 14 1.9173604 1.9249009 2.2805169 3.2934295 1.8860090 2.7974271 3.1508797
## 15 3.4737901 2.4133545 3.2475831 3.3574612 2.5253783 2.8298000 3.6301815
## 16 3.8984778 3.1687407 4.0456922 4.4379164 3.4042991 3.4381991 3.4369582
## 17 1.7696936 1.2201076 1.5200955 2.5314805 0.8605896 2.4962754 3.5101376
##            8         9        10        11        12        13        14
## 2                                                                       
## 3                                                                       
## 4                                                                       
## 5                                                                       
## 6                                                                       
## 7                                                                       
## 8                                                                       
## 9  2.0073790                                                            
## 10 4.6369728 3.2968437                                                  
## 11 5.9934754 4.8765076 3.7831739                                        
## 12 2.0238780 3.5756505 5.3916748 7.4318307                              
## 13 3.5414188 2.2161442 1.3238112 4.4111764 4.2882701                    
## 14 2.0695264 2.1360877 4.8408720 4.8937609 4.0609028 4.0172264          
## 15 1.5655099 3.0038030 4.6191055 5.7432703 2.1684620 3.8132553 2.8050651
## 16 0.6417895 1.9610959 4.5769183 6.2186182 1.8176402 3.3828722 2.4490581
## 17 2.2329529 2.9826407 4.9868506 4.8346598 3.8677452 4.3438303 1.5149150
##           15        16
## 2                     
## 3                     
## 4                     
## 5                     
## 6                     
## 7                     
## 8                     
## 9                     
## 10                    
## 11                    
## 12                    
## 13                    
## 14                    
## 15                    
## 16 1.9763606          
## 17 1.9955484 2.7967466
# 6. Koefisien Korelasi Cophenetic
## Single Linkage
hc_single   <- hclust(d, method = "single")
cor_single   <- cor(d, cophenetic(hc_single))
## Complete Linkage
hc_complete <- hclust(d, method = "complete")
cor_complete <- cor(d, cophenetic(hc_complete))
## Average Linkage
hc_average  <- hclust(d, method = "average")
cor_average  <- cor(d, cophenetic(hc_average))

data.frame(single = cor_single,
           complete = cor_complete,
           average = cor_average)
##      single complete   average
## 1 0.6655329 0.656295 0.7666164

Koefisien cophenetic digunakan untuk menentukan metode linkage terbaik. Hasil menunjukkan bahwa average linkage memiliki nilai cophenetic terbesar, sehingga dipilih sebagai metode pengelompokan.

# 7. METODE TERBAIK (AVERAGE)
hc <- hc_average
print(hc)
## 
## Call:
## hclust(d = d, method = "average")
## 
## Cluster method   : average 
## Distance         : euclidean 
## Number of objects: 17
# 8. DENDROGRAM
# Dendrogram
plot(hc,
     labels = datapkl$kapanewon,
     main="Cluster Dendrogram", 
     xlab="Kapanewon", 
     ylab="Jarak")

# Indeks Validitas (SilhouEtte)
sil_score <- data.frame(k = 2:16, silhouette = NA)
for(i in 2:16){
  cl <- cutree(hc, k = i)
  sil <- silhouette(cl, d)
  sil_score[sil_score$k == i, "silhouette"] <- mean(sil[, "sil_width"])
}

sil_score
##     k silhouette
## 1   2 0.28012263
## 2   3 0.27594955
## 3   4 0.38602403
## 4   5 0.29861083
## 5   6 0.31904135
## 6   7 0.29455345
## 7   8 0.27352386
## 8   9 0.29256122
## 9  10 0.23188402
## 10 11 0.20459151
## 11 12 0.18656599
## 12 13 0.15051152
## 13 14 0.13086875
## 14 15 0.10697955
## 15 16 0.07276206
fviz_nbclust(dfs, FUNcluster = hcut, 
             method = "silhouette", 
             hc_method = "average", 
             k.max = 16)

Berdasarkan grafik silhouette, nilai rata-rata silhouette tertinggi diperoleh pada k = 4, sehingga jumlah cluster optimal yang digunakan dalam penelitian ini adalah 4.

# 9. Menentukan Cluster
idclus <- cutree(hc, k=4)
hasil_cluster <- data.frame(kapanewon = datapkl$kapanewon,
                            cluster = idclus)
hasil_cluster
##        kapanewon cluster
## 1      SRANDAKAN       1
## 2         SANDEN       1
## 3         KRETEK       1
## 4        PUNDONG       1
## 5  BAMBANGLIPURO       1
## 6         PANDAK       1
## 7       PAJANGAN       2
## 8         BANTUL       3
## 9          JETIS       2
## 10       IMOGIRI       2
## 11        DLINGO       4
## 12   BANGUNTAPAN       3
## 13        PLERET       2
## 14      PIYUNGAN       1
## 15         SEWON       3
## 16       KASIHAN       3
## 17        SEDAYU       1
split(datapkl$kapanewon, idclus)
## $`1`
## [1] "SRANDAKAN"     "SANDEN"        "KRETEK"        "PUNDONG"      
## [5] "BAMBANGLIPURO" "PANDAK"        "PIYUNGAN"      "SEDAYU"       
## 
## $`2`
## [1] "PAJANGAN" "JETIS"    "IMOGIRI"  "PLERET"  
## 
## $`3`
## [1] "BANTUL"      "BANGUNTAPAN" "SEWON"       "KASIHAN"    
## 
## $`4`
## [1] "DLINGO"
# Rata-rata tiap cluster
aggregate(datapkl[,-1], list(idclus), mean)
##   Group.1 pend_rendah pend_menengah pend_tinggi   agraris    formal informal
## 1       1    29.83304      57.66088   12.506074 31.167031 27.872192 40.96078
## 2       2    36.40122      53.18253   10.416251 12.008814 22.946308 65.04488
## 3       3    27.16541      53.57401   19.260583  8.939618 37.433670 53.62671
## 4       4    39.40542      56.59581    3.998768 44.494771  9.532241 45.97299
# 10. VISUALISASI CLUSTER
fviz_cluster(list(data = dfs, cluster = idclus),
             ellipse.type = "convex",
             repel = TRUE,
             ggtheme = theme_minimal())

library(tidyr)
library(ggplot2)

df_cl <- data.frame(datapkl, cluster = idclus)
df_long <- df_cl %>%
  pivot_longer(cols = -c(kapanewon, cluster),
               names_to = "variabel",
               values_to = "nilai")

# Boxplot
ggplot(df_long, aes(x = factor(cluster), y = nilai, fill = factor(cluster))) +
  geom_boxplot() +
  facet_wrap(~variabel, scales = "free") +
  labs(x = "Cluster", y = "Nilai", fill = "Cluster",
       title = "Perbandingan Karakteristik Antar Cluster") +
  theme_minimal()

Berdasarkan hasil analisis hierarchical clustering dengan metode average linkage, diperoleh empat cluster dengan karakteristik sebagai berikut.

Cluster 1 (Srandakan, Sanden, Kretek, Pundong, Bambanglipuro, Pandak, Piyungan, Sedayu) merupakan kapanewon semi-agraris dengan pendidikan menengah dominan dan sektor informal relatif rendah.
Cluster 2 (Pajangan, Jetis, Imogiri, Pleret) memiliki tingkat pendidikan rendah yang tinggi serta ketergantungan besar pada sektor informal, sehingga menjadi kelompok dengan tantangan sosial ekonomi terbesar.
Cluster 3 (Bantul, Banguntapan, Sewon, Kasihan) merupakan kapanewon berkarakter perkotaan dengan proporsi pendidikan tinggi dan sektor formal tertinggi serta sektor agraris yang sangat rendah.
Cluster 4 (Dlingo) berdiri sendiri sebagai kapanewon paling unik dengan dominasi agraris tertinggi, pendidikan tinggi terendah, dan sektor formal terendah di antara seluruh kapanewon.