1. Library
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(clValid)
## Loading required package: cluster
library(sf)
## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
library(ggplot2)
2. Import Data
data1 <- read_excel("~/KULIAH UGM/SEMESTER 3/STATISTIKA MULTIVARIAT TERAPAN/File project.xlsx", sheet = 2)
3. Eksplorasi Awal
data1_long <- gather(data1[-1], key = "Variabel", value = "Nilai")
ggplot(data1_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot() +
theme_classic()

4. Persiapan Data Clustering
data1_s <- data1[-1]
data1_s_dist <- dist(x = data1_s, method = "euclidean")
fviz_dist(data1_s_dist, gradient = list(low="tomato", mid="white", high="green"))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

5. Clustering Hirarki (Average Linkage)
clust1 <- hclust(d = data1_s_dist, method = "average")
6. Label Klaster
data1_s_clust <- cbind(data1[1], data1_s)
data1_s_clust$Clust5 <- cutree(clust1, k = 5)
data1_s_clust$Clust4 <- cutree(clust1, k = 4)
data1_s_clust$Clust3 <- cutree(clust1, k = 3)
data1_s_clust$Clust2 <- cutree(clust1, k = 2)
7. Frekuensi Tiap Klaster
table(data1_s_clust$Clust5)
##
## 1 2 3 4 5
## 17 5 2 5 5
table(data1_s_clust$Clust4)
##
## 1 2 3 4
## 22 5 2 5
table(data1_s_clust$Clust3)
##
## 1 2 3
## 27 2 5
table(data1_s_clust$Clust2)
##
## 1 2
## 32 2
8. Visualisasi Dendrogram
fviz_dend(clust1, k = 5, k_colors = "jco", rect = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_cluster(list(data = data1_s, cluster = data1_s_clust$Clust5))

9. Penentuan Jumlah Klaster Optimal
data1_s_clValid <- as.data.frame(lapply(data1_s, as.numeric))
rownames(data1_s_clValid) <- seq_len(nrow(data1_s_clValid))
internal <- clValid(data1_s_clValid,
nClust = 2:5,
clMethods = "hierarchical",
validation = "internal",
metric = "euclidean",
method = "average")
summary(internal)
##
## Clustering Methods:
## hierarchical
##
## Cluster sizes:
## 2 3 4 5
##
## Validation Measures:
## 2 3 4 5
##
## hierarchical Connectivity 5.0857 10.9500 18.1865 24.9913
## Dunn 0.3314 0.2494 0.2311 0.2452
## Silhouette 0.2970 0.2729 0.2842 0.3315
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 5.0857 hierarchical 2
## Dunn 0.3314 hierarchical 2
## Silhouette 0.3315 hierarchical 5
10. Peta Klustering
data <- data1_s_clust
data2 <- st_read("D:/Download/indonesia-map-geojson (1).json")
## Reading layer `indonesia-map-geojson (1)' from data source
## `D:\Download\indonesia-map-geojson (1).json' using driver `GeoJSON'
## Simple feature collection with 34 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 95.19439 ymin: -10.92308 xmax: 141.0298 ymax: 5.65662
## Geodetic CRS: WGS 84
data$Provinsi <- toupper(data$Provinsi)
# Merge
names(data2)[1] <- "Provinsi"
data3 <- merge(data, data2, by.x = "Provinsi", by.y = "Provinsi")
data3_sf <- st_as_sf(data3, crs = 4326)
data3_sf$Clust5 <- as.factor(data3_sf$Clust5)
ggplot(data3_sf) +
geom_sf(aes(fill = Clust5), color = "white", size = 0.25) +
scale_fill_manual(values = c(
"1" = "darkred",
"2" = "darkkhaki",
"3" = "darkseagreen",
"4" = "cadetblue",
"5" = "burlywood"
)) +
labs(title = "Persebaran Kebahagiaan Indonesia", fill = "Cluster") +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold"),
legend.position = "bottom",
legend.key.size = unit(0.8, "cm"),
panel.grid = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()
)
