1. Library

library(readxl)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(clValid)

## Loading required package: cluster

library(sf)

## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE

library(ggplot2)

2. Import Data

data1 <- read_excel("~/KULIAH UGM/SEMESTER 3/STATISTIKA MULTIVARIAT TERAPAN/File project.xlsx", sheet = 2)

3. Eksplorasi Awal

data1_long <- gather(data1[-1], key = "Variabel", value = "Nilai")

ggplot(data1_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
  geom_boxplot() +
  theme_classic()

4. Persiapan Data Clustering

data1_s <- data1[-1]
data1_s_dist <- dist(x = data1_s, method = "euclidean")

fviz_dist(data1_s_dist, gradient = list(low="tomato", mid="white", high="green"))

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

5. Clustering Hirarki (Average Linkage)

clust1 <- hclust(d = data1_s_dist, method = "average")

6. Label Klaster

data1_s_clust <- cbind(data1[1], data1_s)
data1_s_clust$Clust5 <- cutree(clust1, k = 5)
data1_s_clust$Clust4 <- cutree(clust1, k = 4)
data1_s_clust$Clust3 <- cutree(clust1, k = 3)
data1_s_clust$Clust2 <- cutree(clust1, k = 2)

7. Frekuensi Tiap Klaster

table(data1_s_clust$Clust5)

## 
##  1  2  3  4  5 
## 17  5  2  5  5

table(data1_s_clust$Clust4)

## 
##  1  2  3  4 
## 22  5  2  5

table(data1_s_clust$Clust3)

## 
##  1  2  3 
## 27  2  5

table(data1_s_clust$Clust2)

## 
##  1  2 
## 32  2

8. Visualisasi Dendrogram

fviz_dend(clust1, k = 5, k_colors = "jco", rect = TRUE)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_cluster(list(data = data1_s, cluster = data1_s_clust$Clust5))

9. Penentuan Jumlah Klaster Optimal

data1_s_clValid <- as.data.frame(lapply(data1_s, as.numeric))
rownames(data1_s_clValid) <- seq_len(nrow(data1_s_clValid))

internal <- clValid(data1_s_clValid,
                    nClust = 2:5,
                    clMethods = "hierarchical",
                    validation = "internal",
                    metric = "euclidean",
                    method = "average")
summary(internal)

## 
## Clustering Methods:
##  hierarchical 
## 
## Cluster sizes:
##  2 3 4 5 
## 
## Validation Measures:
##                                  2       3       4       5
##                                                           
## hierarchical Connectivity   5.0857 10.9500 18.1865 24.9913
##              Dunn           0.3314  0.2494  0.2311  0.2452
##              Silhouette     0.2970  0.2729  0.2842  0.3315
## 
## Optimal Scores:
## 
##              Score  Method       Clusters
## Connectivity 5.0857 hierarchical 2       
## Dunn         0.3314 hierarchical 2       
## Silhouette   0.3315 hierarchical 5

10. Peta Klustering

data <- data1_s_clust

data2 <- st_read("D:/Download/indonesia-map-geojson (1).json")

## Reading layer `indonesia-map-geojson (1)' from data source 
##   `D:\Download\indonesia-map-geojson (1).json' using driver `GeoJSON'
## Simple feature collection with 34 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 95.19439 ymin: -10.92308 xmax: 141.0298 ymax: 5.65662
## Geodetic CRS:  WGS 84

data$Provinsi <- toupper(data$Provinsi)
# Merge
names(data2)[1] <- "Provinsi"

data3 <- merge(data, data2, by.x = "Provinsi", by.y = "Provinsi")

data3_sf <- st_as_sf(data3, crs = 4326)

data3_sf$Clust5 <- as.factor(data3_sf$Clust5)

ggplot(data3_sf) +
  geom_sf(aes(fill = Clust5), color = "white", size = 0.25) +
  scale_fill_manual(values = c(
    "1" = "darkred",
    "2" = "darkkhaki",
    "3" = "darkseagreen",
    "4" = "cadetblue",
    "5" = "burlywood"
  )) +
  labs(title = "Persebaran Kebahagiaan Indonesia", fill = "Cluster") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    legend.position = "bottom",
    legend.key.size = unit(0.8, "cm"),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )

Output Project SMT

Kelompok 6

2025-12-05

1. Library

2. Import Data

3. Eksplorasi Awal

4. Persiapan Data Clustering

5. Clustering Hirarki (Average Linkage)

6. Label Klaster

7. Frekuensi Tiap Klaster

8. Visualisasi Dendrogram

9. Penentuan Jumlah Klaster Optimal

10. Peta Klustering