1. Library

library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(clValid)
## Loading required package: cluster
library(sf)
## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
library(ggplot2)

2. Import Data

data1 <- read_excel("~/KULIAH UGM/SEMESTER 3/STATISTIKA MULTIVARIAT TERAPAN/File project.xlsx", sheet = 2)

3. Eksplorasi Awal

data1_long <- gather(data1[-1], key = "Variabel", value = "Nilai")

ggplot(data1_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
  geom_boxplot() +
  theme_classic()

4. Persiapan Data Clustering

data1_s <- data1[-1]
data1_s_dist <- dist(x = data1_s, method = "euclidean")
fviz_dist(data1_s_dist, gradient = list(low="tomato", mid="white", high="green"))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

5. Clustering Hirarki (Average Linkage)

clust1 <- hclust(d = data1_s_dist, method = "average")

6. Label Klaster

data1_s_clust <- cbind(data1[1], data1_s)
data1_s_clust$Clust5 <- cutree(clust1, k = 5)
data1_s_clust$Clust4 <- cutree(clust1, k = 4)
data1_s_clust$Clust3 <- cutree(clust1, k = 3)
data1_s_clust$Clust2 <- cutree(clust1, k = 2)

7. Frekuensi Tiap Klaster

table(data1_s_clust$Clust5)
## 
##  1  2  3  4  5 
## 17  5  2  5  5
table(data1_s_clust$Clust4)
## 
##  1  2  3  4 
## 22  5  2  5
table(data1_s_clust$Clust3)
## 
##  1  2  3 
## 27  2  5
table(data1_s_clust$Clust2)
## 
##  1  2 
## 32  2

8. Visualisasi Dendrogram

fviz_dend(clust1, k = 5, k_colors = "jco", rect = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_cluster(list(data = data1_s, cluster = data1_s_clust$Clust5))

9. Penentuan Jumlah Klaster Optimal

data1_s_clValid <- as.data.frame(lapply(data1_s, as.numeric))
rownames(data1_s_clValid) <- seq_len(nrow(data1_s_clValid))

internal <- clValid(data1_s_clValid,
                    nClust = 2:5,
                    clMethods = "hierarchical",
                    validation = "internal",
                    metric = "euclidean",
                    method = "average")
summary(internal)
## 
## Clustering Methods:
##  hierarchical 
## 
## Cluster sizes:
##  2 3 4 5 
## 
## Validation Measures:
##                                  2       3       4       5
##                                                           
## hierarchical Connectivity   5.0857 10.9500 18.1865 24.9913
##              Dunn           0.3314  0.2494  0.2311  0.2452
##              Silhouette     0.2970  0.2729  0.2842  0.3315
## 
## Optimal Scores:
## 
##              Score  Method       Clusters
## Connectivity 5.0857 hierarchical 2       
## Dunn         0.3314 hierarchical 2       
## Silhouette   0.3315 hierarchical 5

10. Peta Klustering

data <- data1_s_clust

data2 <- st_read("D:/Download/indonesia-map-geojson (1).json")
## Reading layer `indonesia-map-geojson (1)' from data source 
##   `D:\Download\indonesia-map-geojson (1).json' using driver `GeoJSON'
## Simple feature collection with 34 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 95.19439 ymin: -10.92308 xmax: 141.0298 ymax: 5.65662
## Geodetic CRS:  WGS 84
data$Provinsi <- toupper(data$Provinsi)
# Merge
names(data2)[1] <- "Provinsi"

data3 <- merge(data, data2, by.x = "Provinsi", by.y = "Provinsi")

data3_sf <- st_as_sf(data3, crs = 4326)

data3_sf$Clust5 <- as.factor(data3_sf$Clust5)
ggplot(data3_sf) +
  geom_sf(aes(fill = Clust5), color = "white", size = 0.25) +
  scale_fill_manual(values = c(
    "1" = "darkred",
    "2" = "darkkhaki",
    "3" = "darkseagreen",
    "4" = "cadetblue",
    "5" = "burlywood"
  )) +
  labs(title = "Persebaran Kebahagiaan Indonesia", fill = "Cluster") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    legend.position = "bottom",
    legend.key.size = unit(0.8, "cm"),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )