# HIERARCHICAL CLUSTERING (DATA PKL BANTUL)
# 1. Load library
library(readxl)
library(dplyr)
library(factoextra)

# 2. Import data
df <- read_excel("C:/Users/farah/OneDrive/Documents/FARAH/PKL/data/datapkl9var.xlsx")

# 3. Ambil variabel numerik (hapus nama wilayah)
df_num <- df[, -1]

# 4. Handle missing value
df_num <- na.omit(df_num)

# 5. Standardisasi data
dfs <- scale(df_num)

# 6. Hitung jarak (Euclidean distance)
d <- dist(dfs, method = "euclidean")
d
##            1         2         3         4         5         6         7
## 2  1.1840080                                                            
## 3  0.8833494 1.1932492                                                  
## 4  2.3221266 1.9042079 2.3399343                                        
## 5  1.1374664 0.6845471 0.8147995 1.9828179                              
## 6  2.5083146 1.8353860 2.8114723 1.4194803 2.2302217                    
## 7  3.4680931 3.0353498 4.0127990 2.8504816 3.4455074 1.5663505          
## 8  3.5053586 2.7660652 3.5771470 4.0241858 2.9089817 3.1696771 3.3818493
## 9  3.4367460 3.0133905 3.9304845 3.7502083 3.3130012 2.5447326 1.8610716
## 10 4.9831022 4.3994924 5.4443830 3.8181694 4.8734730 2.6884242 1.7569095
## 11 3.7161179 3.9002661 4.2065124 2.7759355 4.1788939 2.9966113 3.3540775
## 12 5.2398854 4.2803911 5.1532234 5.3737744 4.4892345 4.4983347 4.6973080
## 13 4.5702474 3.9162167 5.0365849 3.9021793 4.4049515 2.5253174 1.4081934
## 14 1.9173604 1.9249009 2.2805169 3.2934295 1.8860090 2.7974271 3.1508797
## 15 3.4737901 2.4133545 3.2475831 3.3574612 2.5253783 2.8298000 3.6301815
## 16 3.8984778 3.1687407 4.0456922 4.4379164 3.4042991 3.4381991 3.4369582
## 17 1.7696936 1.2201076 1.5200955 2.5314805 0.8605896 2.4962754 3.5101376
##            8         9        10        11        12        13        14
## 2                                                                       
## 3                                                                       
## 4                                                                       
## 5                                                                       
## 6                                                                       
## 7                                                                       
## 8                                                                       
## 9  2.0073790                                                            
## 10 4.6369728 3.2968437                                                  
## 11 5.9934754 4.8765076 3.7831739                                        
## 12 2.0238780 3.5756505 5.3916748 7.4318307                              
## 13 3.5414188 2.2161442 1.3238112 4.4111764 4.2882701                    
## 14 2.0695264 2.1360877 4.8408720 4.8937609 4.0609028 4.0172264          
## 15 1.5655099 3.0038030 4.6191055 5.7432703 2.1684620 3.8132553 2.8050651
## 16 0.6417895 1.9610959 4.5769183 6.2186182 1.8176402 3.3828722 2.4490581
## 17 2.2329529 2.9826407 4.9868506 4.8346598 3.8677452 4.3438303 1.5149150
##           15        16
## 2                     
## 3                     
## 4                     
## 5                     
## 6                     
## 7                     
## 8                     
## 9                     
## 10                    
## 11                    
## 12                    
## 13                    
## 14                    
## 15                    
## 16 1.9763606          
## 17 1.9955484 2.7967466
# 7. CLUSTERING (BANDINGKAN METODE)
hc_single   <- hclust(d, method = "single")
hc_complete <- hclust(d, method = "complete")
hc_average  <- hclust(d, method = "average")
# 8. COPHENETIC CORRELATION (PILIH METODE TERBAIK)
cor_single   <- cor(d, cophenetic(hc_single))
cor_complete <- cor(d, cophenetic(hc_complete))
cor_average  <- cor(d, cophenetic(hc_average))

data.frame(single = cor_single,
           complete = cor_complete,
           average = cor_average)
##      single complete   average
## 1 0.6655329 0.656295 0.7666164
# 9. METODE TERBAIK (AVERAGE)
hc <- hc_average
# 10. DENDROGRAM FINAL
plot(hc, cex = 0.7, hang = -1)

# Coba beberapa jumlah cluster
rect.hclust(hc, k = 2, border = "red")
rect.hclust(hc, k = 3, border = "blue")
rect.hclust(hc, k = 4, border = "green")

# Sillhoutte Score
fviz_nbclust(dfs, FUN = hcut, method = "silhouette")

# 11. Menentukan Cluster
grp <- cutree(hc, k = 3)
df_cl <- mutate(df, cluster = grp)
print(df_cl, width = Inf)
## # A tibble: 17 × 8
##    kapanewon     pend_rendah pend_menengah pend_tinggi agraris formal informal
##    <chr>               <dbl>         <dbl>       <dbl>   <dbl>  <dbl>    <dbl>
##  1 SRANDAKAN            28.1          59.9       12.0    35.2   23.7      41.0
##  2 SANDEN               29.0          56.9       14.1    32.6   26.2      41.2
##  3 KRETEK               27.4          59.3       13.3    38.7   27.2      34.1
##  4 PUNDONG              35.5          55.3        9.21   38.3   26.0      35.8
##  5 BAMBANGLIPURO        28.7          57.9       13.4    33.2   29.7      37.1
##  6 PANDAK               35.0          54.4       10.7    27.7   24.5      47.8
##  7 PAJANGAN             36.5          54.5        9.05   16.3   22.5      61.3
##  8 BANTUL               27.1          55.4       17.5     8.37  36.7      54.9
##  9 JETIS                31.4          56.3       12.3     5.84  29.1      65.1
## 10 IMOGIRI              40.9          50.5        8.67   16.2   18.3      65.5
## 11 DLINGO               39.4          56.6        4.00   44.5    9.53     46.0
## 12 BANGUNTAPAN          26.1          51.0       22.9     3.31  40.5      56.2
## 13 PLERET               36.9          51.5       11.6     9.70  21.9      68.4
## 14 PIYUNGAN             27.1          59.8       13.1    17.4   31.3      51.3
## 15 SEWON                28.8          53.0       18.2    19.0   37.5      43.4
## 16 KASIHAN              26.7          54.8       18.5     5.06  34.9      60.0
## 17 SEDAYU               28.0          57.8       14.3    26.2   34.4      39.4
##    cluster
##      <int>
##  1       1
##  2       1
##  3       1
##  4       1
##  5       1
##  6       1
##  7       2
##  8       1
##  9       2
## 10       2
## 11       3
## 12       1
## 13       2
## 14       1
## 15       1
## 16       1
## 17       1
# 12. JUMLAH ANGGOTA CLUSTER
count(df_cl, cluster)
## # A tibble: 3 × 2
##   cluster     n
##     <int> <int>
## 1       1    12
## 2       2     4
## 3       3     1
# 13. INTERPRETASI CLUSTER (RATA-RATA)
df_cl %>%
  group_by(cluster) %>%
  summarise(across(-kapanewon, mean))
## # A tibble: 3 × 7
##   cluster pend_rendah pend_menengah pend_tinggi agraris formal informal
##     <int>       <dbl>         <dbl>       <dbl>   <dbl>  <dbl>    <dbl>
## 1       1        28.9          56.3       14.8     23.8  31.1      45.2
## 2       2        36.4          53.2       10.4     12.0  22.9      65.0
## 3       3        39.4          56.6        4.00    44.5   9.53     46.0
# 14. VISUALISASI CLUSTER
fviz_cluster(list(data = df_num, cluster = grp),
             ellipse.type = "convex",
             repel = TRUE,
             ggtheme = theme_minimal())

library(tidyr)

df_long <- df_cl %>%
  pivot_longer(cols = -c(kapanewon, cluster),
               names_to = "variabel",
               values_to = "nilai")

ggplot(df_long, aes(x = factor(cluster), y = nilai, fill = factor(cluster))) +
  geom_boxplot() +
  facet_wrap(~variabel, scales = "free") +
  theme_minimal()