Projek kali ini merupakan penerapan metode Analisis Cluster K-Means dan K-Medoids dengan menggunakan jarak Euclidean pada data Mutu Sekolah di Indonesia berdasarkan Provinsi.
Analisis cluster merupakan salah satu metode analisis multivariat yang digunakan untuk mengelompokkan objek-objek yang memiliki karakteristik serupa ke dalam kelompok (cluster) tertentu. Tujuannya adalah agar objek-objek yang berada dalam satu cluster memiliki tingkat kemiripan yang tinggi, sedangkan antar cluster memiliki perbedaan yang jelas.
Dalam konteks ini, variabel mutu sekolah yang diukur berdasarkan beberapa indikator (seperti mutu lulusan, mutu guru, proses pembelajaran, dan manajemen sekolah) digunakan sebagai dasar pengelompokan. Dengan pendekatan ini, setiap provinsi di Indonesia akan dikelompokkan ke dalam cluster tertentu sesuai dengan tingkat mutu sekolahnya.
Metode yang digunakan adalah K-Means dan K-Medoids(PAM - Partitioning Around Medoids), dengan jarak Euclidean sebagai ukuran kedekatan antar provinsi. Perbandingan kedua metode ini penting karena:
K-Means sensitif terhadap outlier dan perbedaan skala data, tetapi lebih efisien untuk jumlah data besar.
K-Medoids lebih robust terhadap outlier karena menggunakan medoid sebagai pusat cluster, meskipun secara komputasi bisa lebih kompleks.
Hasil dari analisis ini diharapkan dapat memberikan gambaran mengenai pengelompokan mutu sekolah antar provinsi, sehingga dapat menjadi dasar dalam pengambilan kebijakan peningkatan kualitas pendidikan di Indonesia.
Berikut adalah package yang diperlukan dalam analisis
library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library("umap")
## Warning: package 'umap' was built under R version 4.4.3
library("ggpubr")
## Warning: package 'ggpubr' was built under R version 4.4.3
library("DataExplorer")
## Warning: package 'DataExplorer' was built under R version 4.4.3
library("easystats")
## Warning: package 'easystats' was built under R version 4.4.3
library("factoextra")
## Warning: package 'factoextra' was built under R version 4.4.3
library("tidyverse")
df <- read_excel("C:/Users/nabil/Downloads/Pemodelan Klasifikasi/DATA TUGAS-1.xlsx")
str(df)
## tibble [1,144 × 11] (S3: tbl_df/tbl/data.frame)
## $ No : num [1:1144] 1 2 3 4 5 6 7 8 9 10 ...
## $ TAHUN : num [1:1144] 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
## $ NPSN : num [1:1144] 20501870 10303885 69974471 50105492 10600905 ...
## $ Status Sekolah : chr [1:1144] "Swasta" "Swasta" "Swasta" "Negeri" ...
## $ Provinsi : chr [1:1144] "JAWA TIMUR" "SUMATERA BARAT" "JAWA BARAT" "BALI" ...
## $ Kab/Kota : chr [1:1144] "KABUPATEN SIDOARJO" "KOTA PAYAKUMBUH" "KABUPATEN CIANJUR" "KABUPATEN BULELENG" ...
## $ Mutu Lulusan : num [1:1144] 88.6 82.9 74.3 94.3 97.1 ...
## $ Proses Pembelajaran : num [1:1144] 90 86.7 83.3 96.7 96.7 ...
## $ Mutu Guru : num [1:1144] 88.9 83.3 77.8 100 94.4 ...
## $ Manajemen S/M : num [1:1144] 94.4 88.9 88.9 100 100 ...
## $ Peringkat Akreditasi: chr [1:1144] "A" "B" "B" "A" ...
sum(is.na(df))
## [1] 0
library("tidyverse")
plot_intro(data = df,ggtheme = theme_minimal())
# ambil hanya kolom yang diperlukan
df_long <- df %>%
dplyr::select(Provinsi, `Mutu Lulusan`, `Proses Pembelajaran`, `Mutu Guru`, `Manajemen S/M`) %>%
pivot_longer(cols = -Provinsi,
names_to = "Variabel",
values_to = "Nilai")
# buat boxplot
ggplot(df_long, aes(x = Provinsi, y = Nilai)) +
geom_boxplot(outlier.size = 0.7, fill = "skyblue") +
facet_wrap(~Variabel, scales = "free_y") + # 4 plot terpisah untuk setiap variabel
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.9)) +
labs(title = "Boxplot Nilai Mutu per Provinsi",
x = "Provinsi", y = "Nilai")
library(ggplot2)
# Boxplot Mutu Lulusan
ggplot(df, aes(x = Provinsi, y = `Mutu Lulusan`)) +
geom_boxplot(fill = "skyblue") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
labs(title = "Boxplot Mutu Lulusan per Provinsi",
x = "Provinsi", y = "Mutu Lulusan")
# Boxplot Proses Pembelajaran
ggplot(df, aes(x = Provinsi, y = `Proses Pembelajaran`)) +
geom_boxplot(fill = "orange") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
labs(title = "Boxplot Proses Pembelajaran per Provinsi",
x = "Provinsi", y = "Proses Pembelajaran")
# Boxplot Mutu Guru
ggplot(df, aes(x = Provinsi, y = `Mutu Guru`)) +
geom_boxplot(fill = "lightgreen") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
labs(title = "Boxplot Mutu Guru per Provinsi",
x = "Provinsi", y = "Mutu Guru")
# Boxplot Manajemen S/M
ggplot(df, aes(x = Provinsi, y = `Manajemen S/M`)) +
geom_boxplot(fill = "purple") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
labs(title = "Boxplot Manajemen S/M per Provinsi",
x = "Provinsi", y = "Manajemen S/M")
# ubah ke long format
df_long <- df %>%
dplyr::select(`Mutu Lulusan`, `Proses Pembelajaran`, `Mutu Guru`, `Manajemen S/M`) %>%
tidyr::pivot_longer(cols = everything(),
names_to = "Variabel",
values_to = "Nilai")
# buat boxplot
ggplot(df_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot() +
theme_bw() +
labs(title = "Boxplot Masing-masing Peubah",
x = "Variabel", y = "Nilai") +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
Karena data mentah masih menggunakan observasi sekolah, maka diperlukan agregasi berdasarkan provinsi dengan menggunakan rata-rata (mean) ## Agregasi Data
df_clean <- df %>%
dplyr::select(-No, -TAHUN, -NPSN, -`Status Sekolah`, -`Kab/Kota`, -`Peringkat Akreditasi`)
df_agg <- df_clean %>%
group_by(Provinsi) %>%
summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
jumlah_sekolah <- df %>%
count(Provinsi, name = "Jumlah_Sekolah") %>%
arrange(desc(Jumlah_Sekolah))
jumlah_sekolah
## # A tibble: 34 × 2
## Provinsi Jumlah_Sekolah
## <chr> <int>
## 1 SUMATERA UTARA 111
## 2 JAMBI 91
## 3 JAWA TIMUR 80
## 4 JAWA BARAT 67
## 5 LAMPUNG 60
## 6 SULAWESI SELATAN 50
## 7 SUMATERA SELATAN 50
## 8 PAPUA 48
## 9 ACEH 47
## 10 NUSA TENGGARA TIMUR 46
## # ℹ 24 more rows
# hitung rata-rata per provinsi (gabungan 4 variabel mutu)
df_mean <- df_clean %>%
group_by(Provinsi) %>%
summarise(across(where(is.numeric), \(x) median(x, na.rm = TRUE))) %>%
rowwise() %>%
mutate(Rata_Mutu = mean(c_across(`Mutu Lulusan`:`Manajemen S/M`), na.rm = TRUE)) %>%
ungroup()
ggplot(df_mean, aes(x = reorder(Provinsi, Rata_Mutu), y = Rata_Mutu)) +
geom_col(fill = "steelblue") +
coord_flip(ylim = c(75,95)) + # biar provinsi terbaca jelas (horizontal barplot)
theme_bw() +
labs(title = "Rata-rata Mutu SMA per Provinsi",
x = "Provinsi",
y = "Rata-rata Nilai Mutu")
plot_histogram(data = df_agg,
ncol = 2,nrow = 2,
geom_histogram_args = list(fill="steelblue",col="black"),
ggtheme = theme_minimal())
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# ambil kolom numerik dari df_agg
df_num <- df_agg %>% dplyr::select(where(is.numeric))
# hitung korelasi
cor_mat <- cor(df_num, use = "pairwise.complete.obs")
# ubah ke long format
cor_long <- as.data.frame(as.table(cor_mat))
library(corrplot)
## corrplot 0.95 loaded
corrplot(cor_mat, method = "color", type = "upper",
addCoef.col = "white", number.cex = 0.7,
tl.col = "black", tl.srt = 45)
summary(df_agg)
## Provinsi Mutu Lulusan Proses Pembelajaran Mutu Guru
## Length:34 Min. :81.88 Min. :79.14 Min. :76.82
## Class :character 1st Qu.:86.98 1st Qu.:86.08 1st Qu.:82.68
## Mode :character Median :89.72 Median :87.80 Median :86.34
## Mean :88.94 Mean :87.47 Mean :85.48
## 3rd Qu.:91.50 3rd Qu.:89.41 3rd Qu.:88.27
## Max. :94.50 Max. :91.97 Max. :91.90
## Manajemen S/M
## Min. :81.17
## 1st Qu.:88.74
## Median :90.21
## Mean :89.75
## 3rd Qu.:92.14
## Max. :95.51
# ubah ke long format
df_long <- df_agg %>%
dplyr::select(`Mutu Lulusan`, `Proses Pembelajaran`, `Mutu Guru`, `Manajemen S/M`) %>%
tidyr::pivot_longer(cols = everything(),
names_to = "Variabel",
values_to = "Nilai")
# buat boxplot
ggplot(df_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot() +
theme_bw() +
labs(title = "Boxplot Masing-masing Peubah",
x = "Variabel", y = "Nilai") +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
df_agg
## # A tibble: 34 × 5
## Provinsi `Mutu Lulusan` `Proses Pembelajaran` `Mutu Guru` `Manajemen S/M`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ACEH 85.0 84.5 82.1 85.4
## 2 BALI 93.4 91.9 91.9 95.5
## 3 BANTEN 91.3 90.4 88.3 92.1
## 4 BENGKULU 93.4 88.5 87.0 91.2
## 5 DI YOGYAKAR… 91.8 92.0 87.1 93.7
## 6 DKI JAKARTA 94.5 91.1 88.8 92.7
## 7 GORONTALO 87.4 85.8 83.9 89.2
## 8 JAMBI 91.9 89.2 88.8 93.1
## 9 JAWA BARAT 91.6 90.1 88.3 93.1
## 10 JAWA TENGAH 92.5 91.0 88.5 93.3
## # ℹ 24 more rows
Dalam menentukan banyak cluster yang optimal (K) ada beberapa cara seperti PCA, UMAP, Silhouete dan Elbow
pca0 <- prcomp(x = df_agg %>% dplyr::select(-Provinsi))
#menambahkan nama objek (baris)
rownames(pca0$x) <- df_agg$Provinsi
fviz_pca_ind(pca0,
geom.ind = c("point","text"),
repel = TRUE,
labelsize=3
)
Berdasarkan hasil PCA penentuan kelompok kurang jelas
pca0$rotation %>%
as.data.frame()
## PC1 PC2 PC3 PC4
## Mutu Lulusan 0.4830764 -0.12779493 -0.7635520 0.4090158
## Proses Pembelajaran 0.4443967 0.03485035 -0.1929188 -0.8741163
## Mutu Guru 0.5424766 0.73158057 0.3435127 0.2291464
## Manajemen S/M 0.5242785 -0.66876287 0.5116343 0.1269592
umap0 <- umap(d = df_agg %>% dplyr::select(-Provinsi))
data_umap <- data.frame(x=umap0$layout[,1],y=umap0$layout[,2])
ggscatter(data_umap, x = "x", y = "y",
label = df_agg$Provinsi,repel = TRUE,
font.label = c(9, "plain", "grey50"))
Dengan UMAP juga kurang jelas dalam memisahkan kelompok ## Elbow
set.seed(123)
fviz_nbclust(
x = df_agg %>% dplyr::select(-Provinsi),
FUNcluster = stats::kmeans,
method = "wss",
iter.max = 100,
k.max = 25
)
Berdsarkan grafik elbow di atas, dipilih k=4 karena penurunan yang cukup melandai
set.seed(123)
fviz_nbclust(
x = df_agg %>% dplyr::select(-Provinsi),
FUNcluster = stats::kmeans,
method = "silhouette",
iter.max = 100,
k.max = 25
)
Berdasarkan kefisien silhoutte k-optimal adalah 2, namun dalam pengelompokan mutu sekolah dengan 2 cluster sepertinya kurang interpretable / bervariasi, sehingga dipilih k = 4 berdasarkan grafik elbow sebelumnya
set.seed(123)
kmean_res <- kmeans(df_agg %>% dplyr::select(-Provinsi),
centers = 4,
iter.max = 100)
kmean_res$centers
## Mutu Lulusan Proses Pembelajaran Mutu Guru Manajemen S/M
## 1 84.36917 82.86291 79.89696 84.14302
## 2 90.50368 88.38849 87.50482 90.89853
## 3 87.58090 87.12855 84.54442 89.48174
## 4 92.52469 90.74011 88.86753 93.51396
Dari 4 cluster yang terbentuk nilainya searah sehingga dapat disimpulkan 4 cluster ini bisa kita buat menjadi kelas dari sangat tinggi hingga rendah berdasrkan nilai rataan mutu, Dengan keterangan sebagai berikut :
“4” = “Sangat Tinggi”,
“2” = “Tinggi”,
“3” = “Sedang”,
“1” = “Rendah”
table(kmean_res$cluster)
##
## 1 2 3 4
## 7 10 9 8
sil <- cluster::silhouette(x = kmean_res$cluster,
dist = dist(x =df_agg %>% dplyr::select(-Provinsi) ,
method = "euclidean"))
fviz_silhouette(sil,print.summary = FALSE)+
scale_color_manual(values = get_palette("jco",k=7))+
# get_palette berasal dari package ggpubr
scale_fill_manual(values = get_palette("jco",k=7))+
theme_minimal()+
theme(legend.position = "top")
Gambar di atas menunjukkan tidak ada di bawah 0 artinya semua kluster dapat terpisah dengan jelas
sil %>%
as.data.frame() %>%
mutate(obs=1:nrow(sil)) %>%
relocate(obs) %>%
filter(sil_width<0) %>%
arrange(sil_width)
## [1] obs cluster neighbor sil_width
## <0 rows> (or 0-length row.names)
corrected_cluster_kmean_euc <- sil %>%
as.data.frame() %>%
mutate(cluster=if_else(sil_width<0,neighbor,cluster))
corrected_cluster_kmean_euc %>%
filter(sil_width<0) %>%
arrange(sil_width)
## [1] cluster neighbor sil_width
## <0 rows> (or 0-length row.names)
df_agg |>
dplyr::select(-Provinsi) %>%
mutate(cluster=corrected_cluster_kmean_euc$cluster) %>%
group_by(cluster) %>%
summarise(across(everything(),mean))
## # A tibble: 4 × 5
## cluster `Mutu Lulusan` `Proses Pembelajaran` `Mutu Guru` `Manajemen S/M`
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 84.4 82.9 79.9 84.1
## 2 2 90.5 88.4 87.5 90.9
## 3 3 87.6 87.1 84.5 89.5
## 4 4 92.5 90.7 88.9 93.5
df_agg2 <- df_agg %>% dplyr::select(-Provinsi)
fviz_pca_ind(pca0,
geom.ind = c("point", "text"),
repel = TRUE,
labelsize = 3) +
theme_minimal()
library(factoextra)
library(ggplot2)
# Simpan hasil plot fviz_cluster
p_cluster <- fviz_cluster(
object = list(data = df_agg2,
cluster = corrected_cluster_kmean_euc$cluster),
geom = "point", # hanya titik dulu
repel = TRUE,
show.clust.cent = FALSE,
ellipse.alpha = 0.05
) + theme_minimal()
# Tambah label provinsi secara manual
p_cluster +
geom_text(aes(label = df_agg$Provinsi),
size = 3, vjust = -0.5, check_overlap = TRUE)
Untuk membentuk visualisasi spasial, dibutuhkan dataset longitude dan latitude asing-masing provinsi
library(sf) # baca geojson
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
# 1. Baca file JSON dari GADM
indo_sf <- st_read("C:/Users/nabil/Downloads/Pemodelan Klasifikasi/gadm41_IDN_1.json")
## Reading layer `gadm41_IDN_1' from data source
## `C:\Users\nabil\Downloads\Pemodelan Klasifikasi\gadm41_IDN_1.json'
## using driver `GeoJSON'
## Simple feature collection with 34 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 95.0097 ymin: -11.0075 xmax: 141.0194 ymax: 6.0761
## Geodetic CRS: WGS 84
# Cek kolom nama provinsi
names(indo_sf)
## [1] "GID_1" "GID_0" "COUNTRY" "NAME_1" "VARNAME_1" "NL_NAME_1"
## [7] "TYPE_1" "ENGTYPE_1" "CC_1" "HASC_1" "ISO_1" "geometry"
# Biasanya ada kolom NAME_1 untuk nama provinsi
unique(indo_sf$NAME_1)
## [1] "Aceh" "Bali" "BangkaBelitung"
## [4] "Banten" "Bengkulu" "Gorontalo"
## [7] "JakartaRaya" "Jambi" "JawaBarat"
## [10] "JawaTengah" "JawaTimur" "KalimantanBarat"
## [13] "KalimantanSelatan" "KalimantanTengah" "KalimantanTimur"
## [16] "KalimantanUtara" "KepulauanRiau" "Lampung"
## [19] "Maluku" "MalukuUtara" "NusaTenggaraBarat"
## [22] "NusaTenggaraTimur" "Papua" "PapuaBarat"
## [25] "Riau" "SulawesiBarat" "SulawesiSelatan"
## [28] "SulawesiTengah" "SulawesiTenggara" "SulawesiUtara"
## [31] "SumateraBarat" "SumateraSelatan" "SumateraUtara"
## [34] "Yogyakarta"
unique(df_agg$Provinsi)
## [1] "ACEH" "BALI"
## [3] "BANTEN" "BENGKULU"
## [5] "DI YOGYAKARTA" "DKI JAKARTA"
## [7] "GORONTALO" "JAMBI"
## [9] "JAWA BARAT" "JAWA TENGAH"
## [11] "JAWA TIMUR" "KALIMANTAN BARAT"
## [13] "KALIMANTAN SELATAN" "KALIMANTAN TENGAH"
## [15] "KALIMANTAN TIMUR" "KALIMANTAN UTARA"
## [17] "KEPULAUAN BANGKA BELITUNG" "KEPULAUAN RIAU"
## [19] "LAMPUNG" "MALUKU"
## [21] "MALUKU UTARA" "NUSA TENGGARA BARAT"
## [23] "NUSA TENGGARA TIMUR" "PAPUA"
## [25] "PAPUA BARAT" "RIAU"
## [27] "SULAWESI BARAT" "SULAWESI SELATAN"
## [29] "SULAWESI TENGAH" "SULAWESI TENGGARA"
## [31] "SULAWESI UTARA" "SUMATERA BARAT"
## [33] "SUMATERA SELATAN" "SUMATERA UTARA"
# Buat dictionary mapping nama shapefile -> nama di df_agg
prov_map <- c(
"Aceh" = "ACEH",
"Bali" = "BALI",
"BangkaBelitung" = "KEPULAUAN BANGKA BELITUNG",
"Banten" = "BANTEN",
"Bengkulu" = "BENGKULU",
"Gorontalo" = "GORONTALO",
"JakartaRaya" = "DKI JAKARTA",
"Jambi" = "JAMBI",
"JawaBarat" = "JAWA BARAT",
"JawaTengah" = "JAWA TENGAH",
"JawaTimur" = "JAWA TIMUR",
"KalimantanBarat" = "KALIMANTAN BARAT",
"KalimantanSelatan" = "KALIMANTAN SELATAN",
"KalimantanTengah" = "KALIMANTAN TENGAH",
"KalimantanTimur" = "KALIMANTAN TIMUR",
"KalimantanUtara" = "KALIMANTAN UTARA",
"KepulauanRiau" = "KEPULAUAN RIAU",
"Lampung" = "LAMPUNG",
"Maluku" = "MALUKU",
"MalukuUtara" = "MALUKU UTARA",
"NusaTenggaraBarat" = "NUSA TENGGARA BARAT",
"NusaTenggaraTimur" = "NUSA TENGGARA TIMUR",
"Papua" = "PAPUA",
"PapuaBarat" = "PAPUA BARAT",
"Riau" = "RIAU",
"SulawesiBarat" = "SULAWESI BARAT",
"SulawesiSelatan" = "SULAWESI SELATAN",
"SulawesiTengah" = "SULAWESI TENGAH",
"SulawesiTenggara" = "SULAWESI TENGGARA",
"SulawesiUtara" = "SULAWESI UTARA",
"SumateraBarat" = "SUMATERA BARAT",
"SumateraSelatan" = "SUMATERA SELATAN",
# Tambah kalau ada "SumateraUtara" → "SUMATERA UTARA"
"SumateraUtara" = "SUMATERA UTARA",
"Yogyakarta" = "DI YOGYAKARTA"
)
# Tambahkan kolom baru "Provinsi" di shapefile pakai mapping
indo_sf <- indo_sf %>%
mutate(Provinsi = prov_map[NAME_1])
# Lihat nama provinsi dari shapefile
unique(indo_sf$Provinsi)
## [1] "ACEH" "BALI"
## [3] "KEPULAUAN BANGKA BELITUNG" "BANTEN"
## [5] "BENGKULU" "GORONTALO"
## [7] "DKI JAKARTA" "JAMBI"
## [9] "JAWA BARAT" "JAWA TENGAH"
## [11] "JAWA TIMUR" "KALIMANTAN BARAT"
## [13] "KALIMANTAN SELATAN" "KALIMANTAN TENGAH"
## [15] "KALIMANTAN TIMUR" "KALIMANTAN UTARA"
## [17] "KEPULAUAN RIAU" "LAMPUNG"
## [19] "MALUKU" "MALUKU UTARA"
## [21] "NUSA TENGGARA BARAT" "NUSA TENGGARA TIMUR"
## [23] "PAPUA" "PAPUA BARAT"
## [25] "RIAU" "SULAWESI BARAT"
## [27] "SULAWESI SELATAN" "SULAWESI TENGAH"
## [29] "SULAWESI TENGGARA" "SULAWESI UTARA"
## [31] "SUMATERA BARAT" "SUMATERA SELATAN"
## [33] "SUMATERA UTARA" "DI YOGYAKARTA"
# Lihat nama provinsi dari data Anda
unique(df_agg$Provinsi)
## [1] "ACEH" "BALI"
## [3] "BANTEN" "BENGKULU"
## [5] "DI YOGYAKARTA" "DKI JAKARTA"
## [7] "GORONTALO" "JAMBI"
## [9] "JAWA BARAT" "JAWA TENGAH"
## [11] "JAWA TIMUR" "KALIMANTAN BARAT"
## [13] "KALIMANTAN SELATAN" "KALIMANTAN TENGAH"
## [15] "KALIMANTAN TIMUR" "KALIMANTAN UTARA"
## [17] "KEPULAUAN BANGKA BELITUNG" "KEPULAUAN RIAU"
## [19] "LAMPUNG" "MALUKU"
## [21] "MALUKU UTARA" "NUSA TENGGARA BARAT"
## [23] "NUSA TENGGARA TIMUR" "PAPUA"
## [25] "PAPUA BARAT" "RIAU"
## [27] "SULAWESI BARAT" "SULAWESI SELATAN"
## [29] "SULAWESI TENGAH" "SULAWESI TENGGARA"
## [31] "SULAWESI UTARA" "SUMATERA BARAT"
## [33] "SUMATERA SELATAN" "SUMATERA UTARA"
setdiff(df_agg$Provinsi, indo_sf$Provinsi)
## character(0)
# Join dengan data cluster
df_cluster <- df_agg %>%
mutate(cluster = corrected_cluster_kmean_euc$cluster)
indo_clustered <- indo_sf %>%
left_join(df_cluster, by = "Provinsi")
# Atur urutan cluster sesuai ranking
indo_clustered$cluster <- factor(indo_clustered$cluster, levels = c("4", "2", "3", "1"))
# Plot peta
ggplot(data = indo_clustered) +
geom_sf(aes(fill = cluster), color = "black", size = 0.2) +
scale_fill_brewer(palette = "YlGnBu", direction = -1, name = "Cluster") +
theme_minimal() +
labs(
title = "Peta Klasterisasi Provinsi di Indonesia",
subtitle = "Hasil clustering K-means (warna sesuai ranking)"
)
indo_clustered <- indo_clustered %>%
mutate(cluster_label = recode_factor(as.character(cluster),
"4" = "Sangat Tinggi",
"2" = "Tinggi",
"3" = "Sedang",
"1" = "Rendah",
.ordered = TRUE))
# Membagi provinsi berdasarkan cluster_label
prov_per_cluster <- split(indo_clustered$Provinsi, indo_clustered$cluster_label)
# Cek hasil
prov_per_cluster
## $`Sangat Tinggi`
## Bali Banten JakartaRaya Jambi
## "BALI" "BANTEN" "DKI JAKARTA" "JAMBI"
## JawaBarat JawaTengah SumateraUtara Yogyakarta
## "JAWA BARAT" "JAWA TENGAH" "SUMATERA UTARA" "DI YOGYAKARTA"
##
## $Tinggi
## BangkaBelitung Bengkulu
## "KEPULAUAN BANGKA BELITUNG" "BENGKULU"
## JawaTimur KalimantanTengah
## "JAWA TIMUR" "KALIMANTAN TENGAH"
## KalimantanTimur KalimantanUtara
## "KALIMANTAN TIMUR" "KALIMANTAN UTARA"
## MalukuUtara Papua
## "MALUKU UTARA" "PAPUA"
## Riau SumateraBarat
## "RIAU" "SUMATERA BARAT"
##
## $Sedang
## Gorontalo KalimantanSelatan KepulauanRiau
## "GORONTALO" "KALIMANTAN SELATAN" "KEPULAUAN RIAU"
## Lampung Maluku SulawesiSelatan
## "LAMPUNG" "MALUKU" "SULAWESI SELATAN"
## SulawesiTengah SulawesiTenggara SumateraSelatan
## "SULAWESI TENGAH" "SULAWESI TENGGARA" "SUMATERA SELATAN"
##
## $Rendah
## Aceh KalimantanBarat NusaTenggaraBarat
## "ACEH" "KALIMANTAN BARAT" "NUSA TENGGARA BARAT"
## NusaTenggaraTimur PapuaBarat SulawesiBarat
## "NUSA TENGGARA TIMUR" "PAPUA BARAT" "SULAWESI BARAT"
## SulawesiUtara
## "SULAWESI UTARA"
library(sf)
library(leaflet)
library(dplyr)
# Bikin palet warna
pal <- colorFactor(topo.colors(length(unique(indo_clustered$cluster))),
domain = indo_clustered$cluster)
# Peta interaktif
leaflet(indo_clustered) %>%
addTiles() %>%
addPolygons(
fillColor = ~pal(cluster),
color = "black", weight = 1, opacity = 0.7,
fillOpacity = 0.6,
highlightOptions = highlightOptions(color = "white", weight = 2, bringToFront = TRUE),
label = ~paste0(NAME_1, " (Cluster ", cluster, ")")
) %>%
addLegend("bottomright", pal = pal, values = ~cluster,
title = "Cluster")
Untuk memastikan hasil cluster dengan benar, akan dilakukan eksplorasi data sebelum cluster.
library(dplyr)
library(ggplot2)
library(tidyr)
# 1. Tambahkan info cluster ke data sekolah
df_clean_clustered <- df_clean %>%
left_join(
indo_clustered %>% st_drop_geometry() %>% select(Provinsi, cluster),
by = "Provinsi"
)
# 2. Reshape data jadi long format (biar gampang bikin plot semua peubah sekaligus)
df_long <- df_clean_clustered %>%
pivot_longer(cols = c(`Mutu Lulusan`, `Proses Pembelajaran`,
`Mutu Guru`, `Manajemen S/M`),
names_to = "Variabel",
values_to = "Nilai")
# 3. Buat boxplot eksplorasi
ggplot(df_long, aes(x = factor(cluster), y = Nilai, fill = factor(cluster))) +
geom_boxplot(outlier.alpha = 0.3) +
facet_wrap(~Variabel, scales = "free_y") +
theme_minimal() +
labs(title = "Distribusi 4 Peubah Mutu berdasarkan Cluster",
x = "Cluster", y = "Nilai Peubah") +
theme(legend.position = "none")
library(dplyr)
library(ggplot2)
# Tambah kolom rata_mutu kalau belum ada
df_clean <- df_clean %>%
mutate(rata_mutu = rowMeans(select(., `Mutu Lulusan`,
`Proses Pembelajaran`,
`Mutu Guru`,
`Manajemen S/M`), na.rm = TRUE))
ggplot(df_clean, aes(x = reorder(Provinsi, rata_mutu, FUN = median),
y = rata_mutu, fill = Provinsi)) +
geom_boxplot(outlier.alpha = 0.3) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8), # label kecil & miring ke atas
legend.position = "none" # legend dihilangkan biar rapi
) +
labs(
title = "Boxplot Rata-rata Mutu Sekolah per Provinsi",
x = "Provinsi (urut median)",
y = "Rata-rata Mutu"
)
library(dplyr)
library(ggplot2)
# Tambah kolom rata_mutu per sekolah
df_clean <- df_clean %>%
mutate(rata_mutu = rowMeans(select(., `Mutu Lulusan`, `Proses Pembelajaran`,
`Mutu Guru`, `Manajemen S/M`), na.rm = TRUE))
# Gabungkan info cluster + cluster_label per provinsi
df_clean_clustered <- df_clean %>%
left_join(
indo_clustered %>%
st_drop_geometry() %>%
select(Provinsi, cluster, cluster_label),
by = "Provinsi"
)
# --- Plot per cluster ---
# --- Cluster Sangat Tinggi ---
df_clean_clustered %>%
filter(cluster_label == "Sangat Tinggi") %>%
ggplot(aes(x = Provinsi, y = rata_mutu, fill = Provinsi)) +
geom_boxplot(outlier.alpha = 0.3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Boxplot Rata-rata Mutu Sekolah - Cluster Sangat Tinggi",
x = "Provinsi", y = "Rata-rata Mutu")
# --- Cluster Tinggi ---
df_clean_clustered %>%
filter(cluster_label == "Tinggi") %>%
ggplot(aes(x = Provinsi, y = rata_mutu, fill = Provinsi)) +
geom_boxplot(outlier.alpha = 0.3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Boxplot Rata-rata Mutu Sekolah - Cluster Tinggi",
x = "Provinsi", y = "Rata-rata Mutu")
# --- Cluster Sedang ---
df_clean_clustered %>%
filter(cluster_label == "Sedang") %>%
ggplot(aes(x = Provinsi, y = rata_mutu, fill = Provinsi)) +
geom_boxplot(outlier.alpha = 0.3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Boxplot Rata-rata Mutu Sekolah - Cluster Sedang",
x = "Provinsi", y = "Rata-rata Mutu")
# --- Cluster Rendah ---
df_clean_clustered %>%
filter(cluster_label == "Rendah") %>%
ggplot(aes(x = Provinsi, y = rata_mutu, fill = Provinsi)) +
geom_boxplot(outlier.alpha = 0.3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Boxplot Rata-rata Mutu Sekolah - Cluster Rendah",
x = "Provinsi", y = "Rata-rata Mutu")
# Kalau mau median per sekolah, sebenarnya tidak perlu (karena rata_mutu sudah gabungan 4 indikator dalam 1 sekolah)
# Tapi kalau mau median antar sekolah di level provinsi:
df_median_prov <- df_clean %>%
group_by(Provinsi) %>%
summarise(median_mutu = median(rata_mutu, na.rm = TRUE))
df_clean_clustered %>%
filter(cluster_label == "Sangat Tinggi") %>%
ggplot(aes(x = Provinsi, y = rata_mutu, fill = Provinsi)) +
geom_boxplot(outlier.alpha = 0.3) +
stat_summary(fun = median, geom = "point", color = "red", size = 2) + # titik median
stat_summary(fun = median, geom = "text",
aes(label = round(after_stat(y), 1)), # gunakan after_stat()
vjust = -0.7, color = "black", size = 3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Boxplot Rata-rata Mutu Sekolah - Cluster Sangat Tinggi",
x = "Provinsi", y = "Rata-rata Mutu")
untuk melihat IQR seberapa lebar variasi antar provinsinya , mana provinsi paling heterogen
library(dplyr)
library(ggplot2)
# Hitung IQR per provinsi (khusus cluster "Sangat Tinggi")
iqr_per_prov <- df_clean_clustered %>%
filter(cluster_label == "Sangat Tinggi") %>%
group_by(Provinsi) %>%
summarise(
IQR_mutu = IQR(rata_mutu, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(IQR_mutu))
# Tampilkan tabel IQR
print(iqr_per_prov)
## # A tibble: 8 × 2
## Provinsi IQR_mutu
## <chr> <dbl>
## 1 BANTEN 6.76
## 2 DI YOGYAKARTA 6.03
## 3 JAWA BARAT 4.81
## 4 JAMBI 3.74
## 5 SUMATERA UTARA 3.73
## 6 BALI 3.50
## 7 DKI JAKARTA 2.99
## 8 JAWA TENGAH 2.90
# Visualisasi barplot IQR
ggplot(iqr_per_prov, aes(x = reorder(Provinsi, -IQR_mutu), y = IQR_mutu, fill = Provinsi)) +
geom_col() +
geom_text(aes(label = round(IQR_mutu, 1)), vjust = -0.5, size = 4) +
theme_minimal() +
labs(title = "Sebaran (IQR) Rata-rata Mutu Sekolah per Provinsi - Cluster Sangat Tinggi",
x = "Provinsi", y = "IQR (Interquartile Range)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
outlier_prop <- df_clean_clustered %>%
filter(cluster_label == "Sangat Tinggi") %>%
group_by(Provinsi) %>%
mutate(
Q1 = quantile(rata_mutu, 0.25, na.rm = TRUE),
Q3 = quantile(rata_mutu, 0.75, na.rm = TRUE),
IQR_val = Q3 - Q1,
lower_bound = Q1 - 1.5 * IQR_val,
upper_bound = Q3 + 1.5 * IQR_val,
outlier = rata_mutu < lower_bound | rata_mutu > upper_bound
) %>%
summarise(
total = n(),
outlier_count = sum(outlier, na.rm = TRUE),
prop_outlier = outlier_count / total * 100,
.groups = "drop"
)
print(outlier_prop)
## # A tibble: 8 × 4
## Provinsi total outlier_count prop_outlier
## <chr> <int> <int> <dbl>
## 1 BALI 22 1 4.55
## 2 BANTEN 24 1 4.17
## 3 DI YOGYAKARTA 13 1 7.69
## 4 DKI JAKARTA 14 1 7.14
## 5 JAMBI 91 7 7.69
## 6 JAWA BARAT 67 1 1.49
## 7 JAWA TENGAH 19 2 10.5
## 8 SUMATERA UTARA 111 12 10.8
df_clean_clustered %>%
filter(cluster_label == "Rendah") %>%
ggplot(aes(x = Provinsi, y = rata_mutu, fill = Provinsi)) +
geom_boxplot(outlier.alpha = 0.3) +
stat_summary(fun = median, geom = "point", color = "red", size = 2) + # titik median
stat_summary(fun = median, geom = "text",
aes(label = round(after_stat(y), 1)), # gunakan after_stat()
vjust = -0.7, color = "black", size = 3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Boxplot Rata-rata Mutu Sekolah - Cluster Rendah",
x = "Provinsi", y = "Rata-rata Mutu")
# Data tanpa kolom provinsi
df_agg2 <- df_agg %>% dplyr::select(-Provinsi)
library(cluster)
df_kmed <- df_agg2
# Jalankan K-Medoids dengan k =4
set.seed(123) # biar hasil konsisten
kmed_euclid <- pam(df_kmed, k = 4, metric = "euclidean")
# Lihat medoid (pusat cluster)
kmed_euclid$medoids
## Mutu Lulusan Proses Pembelajaran Mutu Guru Manajemen S/M
## [1,] 85.04377 84.45532 82.08983 85.40780
## [2,] 92.48571 90.99825 88.52924 93.29532
## [3,] 89.80607 87.08917 86.42500 90.54514
## [4,] 83.16273 81.79130 77.22947 83.94565
table(kmed_euclid$clustering)
##
## 1 2 3 4
## 6 10 15 3
sil <- cluster::silhouette(x = kmed_euclid$clustering,
dist = dist(x =df_agg %>% dplyr::select(-Provinsi) ,
method = "euclidean"))
fviz_silhouette(sil,print.summary = FALSE)+
scale_color_manual(values = get_palette("jco",k=7))+
# get_palette berasal dari package ggpubr
scale_fill_manual(values = get_palette("jco",k=7))+
theme_minimal()+
theme(legend.position = "top")
# Deteksi observasi dengan silhouette negatif
sil %>%
as.data.frame() %>%
mutate(obs = 1:nrow(sil)) %>%
relocate(obs) %>%
filter(sil_width < 0) %>%
arrange(sil_width)
## obs cluster neighbor sil_width
## 1 7 3 1 -0.04238903
# Koreksi cluster bila silhouette negatif
corrected_cluster_kmed_euc <- sil %>%
as.data.frame() %>%
mutate(cluster = if_else(sil_width < 0, neighbor, cluster))
# Rata-rata per cluster
df_agg2 %>%
mutate(cluster = corrected_cluster_kmed_euc$cluster) %>%
group_by(cluster) %>%
summarise(across(everything(), mean))
## # A tibble: 4 × 5
## cluster `Mutu Lulusan` `Proses Pembelajaran` `Mutu Guru` `Manajemen S/M`
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 86.2 85.0 82.6 86.8
## 2 2 92.6 90.4 88.7 92.9
## 3 3 89.1 87.9 86.4 90.5
## 4 4 82.5 81.3 77.5 82.4
# Visualisasi cluster
# --- Visualisasi cluster ---
p_cluster <- fviz_cluster(
object = list(data = df_agg2, cluster = corrected_cluster_kmed_euc$cluster),
geom = "point",
repel = TRUE,
show.clust.cent = FALSE,
ellipse.alpha = 0.05
) + theme_minimal()
p_cluster +
geom_text(aes(label = df_agg$Provinsi),
size = 3, vjust = -0.5, check_overlap = TRUE)
# Join dengan data cluster
df_cluster <- df_agg %>%
mutate(cluster = corrected_cluster_kmed_euc$cluster)
indo_clustered <- indo_sf %>%
left_join(df_cluster, by = "Provinsi")
library(ggplot2)
ggplot(data = indo_clustered) +
geom_sf(aes(fill = factor(cluster)), color = "grey40", size = 0.2) +
scale_fill_brewer(
palette = "Set3",
name = "Cluster"
) +
theme_minimal(base_size = 12) +
labs(
title = "Peta Klasterisasi Provinsi di Indonesia",
subtitle = "Metode K-Medoids dengan Jarak Euclidean",
caption = "Sumber data: df_agg & shapefile Indo"
) +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12, color = "grey20"),
legend.position = "right",
panel.grid = element_blank()
)
library(sf) # baca geojson
# 1. Baca file JSON dari GADM
indo_sf <- st_read("C:/Users/nabil/Downloads/Pemodelan Klasifikasi/gadm41_IDN_1.json")
## Reading layer `gadm41_IDN_1' from data source
## `C:\Users\nabil\Downloads\Pemodelan Klasifikasi\gadm41_IDN_1.json'
## using driver `GeoJSON'
## Simple feature collection with 34 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 95.0097 ymin: -11.0075 xmax: 141.0194 ymax: 6.0761
## Geodetic CRS: WGS 84
# Cek kolom nama provinsi
names(indo_sf)
## [1] "GID_1" "GID_0" "COUNTRY" "NAME_1" "VARNAME_1" "NL_NAME_1"
## [7] "TYPE_1" "ENGTYPE_1" "CC_1" "HASC_1" "ISO_1" "geometry"
# Biasanya ada kolom NAME_1 untuk nama provinsi
unique(indo_sf$NAME_1)
## [1] "Aceh" "Bali" "BangkaBelitung"
## [4] "Banten" "Bengkulu" "Gorontalo"
## [7] "JakartaRaya" "Jambi" "JawaBarat"
## [10] "JawaTengah" "JawaTimur" "KalimantanBarat"
## [13] "KalimantanSelatan" "KalimantanTengah" "KalimantanTimur"
## [16] "KalimantanUtara" "KepulauanRiau" "Lampung"
## [19] "Maluku" "MalukuUtara" "NusaTenggaraBarat"
## [22] "NusaTenggaraTimur" "Papua" "PapuaBarat"
## [25] "Riau" "SulawesiBarat" "SulawesiSelatan"
## [28] "SulawesiTengah" "SulawesiTenggara" "SulawesiUtara"
## [31] "SumateraBarat" "SumateraSelatan" "SumateraUtara"
## [34] "Yogyakarta"
unique(df_agg$Provinsi)
## [1] "ACEH" "BALI"
## [3] "BANTEN" "BENGKULU"
## [5] "DI YOGYAKARTA" "DKI JAKARTA"
## [7] "GORONTALO" "JAMBI"
## [9] "JAWA BARAT" "JAWA TENGAH"
## [11] "JAWA TIMUR" "KALIMANTAN BARAT"
## [13] "KALIMANTAN SELATAN" "KALIMANTAN TENGAH"
## [15] "KALIMANTAN TIMUR" "KALIMANTAN UTARA"
## [17] "KEPULAUAN BANGKA BELITUNG" "KEPULAUAN RIAU"
## [19] "LAMPUNG" "MALUKU"
## [21] "MALUKU UTARA" "NUSA TENGGARA BARAT"
## [23] "NUSA TENGGARA TIMUR" "PAPUA"
## [25] "PAPUA BARAT" "RIAU"
## [27] "SULAWESI BARAT" "SULAWESI SELATAN"
## [29] "SULAWESI TENGAH" "SULAWESI TENGGARA"
## [31] "SULAWESI UTARA" "SUMATERA BARAT"
## [33] "SUMATERA SELATAN" "SUMATERA UTARA"
# Buat dictionary mapping nama shapefile -> nama di df_agg
prov_map <- c(
"Aceh" = "ACEH",
"Bali" = "BALI",
"BangkaBelitung" = "KEPULAUAN BANGKA BELITUNG",
"Banten" = "BANTEN",
"Bengkulu" = "BENGKULU",
"Gorontalo" = "GORONTALO",
"JakartaRaya" = "DKI JAKARTA",
"Jambi" = "JAMBI",
"JawaBarat" = "JAWA BARAT",
"JawaTengah" = "JAWA TENGAH",
"JawaTimur" = "JAWA TIMUR",
"KalimantanBarat" = "KALIMANTAN BARAT",
"KalimantanSelatan" = "KALIMANTAN SELATAN",
"KalimantanTengah" = "KALIMANTAN TENGAH",
"KalimantanTimur" = "KALIMANTAN TIMUR",
"KalimantanUtara" = "KALIMANTAN UTARA",
"KepulauanRiau" = "KEPULAUAN RIAU",
"Lampung" = "LAMPUNG",
"Maluku" = "MALUKU",
"MalukuUtara" = "MALUKU UTARA",
"NusaTenggaraBarat" = "NUSA TENGGARA BARAT",
"NusaTenggaraTimur" = "NUSA TENGGARA TIMUR",
"Papua" = "PAPUA",
"PapuaBarat" = "PAPUA BARAT",
"Riau" = "RIAU",
"SulawesiBarat" = "SULAWESI BARAT",
"SulawesiSelatan" = "SULAWESI SELATAN",
"SulawesiTengah" = "SULAWESI TENGAH",
"SulawesiTenggara" = "SULAWESI TENGGARA",
"SulawesiUtara" = "SULAWESI UTARA",
"SumateraBarat" = "SUMATERA BARAT",
"SumateraSelatan" = "SUMATERA SELATAN",
# Tambah kalau ada "SumateraUtara" → "SUMATERA UTARA"
"SumateraUtara" = "SUMATERA UTARA",
"Yogyakarta" = "DI YOGYAKARTA"
)
# Tambahkan kolom baru "Provinsi" di shapefile pakai mapping
indo_sf <- indo_sf %>%
mutate(Provinsi = prov_map[NAME_1])
# Lihat nama provinsi dari shapefile
unique(indo_sf$Provinsi)
## [1] "ACEH" "BALI"
## [3] "KEPULAUAN BANGKA BELITUNG" "BANTEN"
## [5] "BENGKULU" "GORONTALO"
## [7] "DKI JAKARTA" "JAMBI"
## [9] "JAWA BARAT" "JAWA TENGAH"
## [11] "JAWA TIMUR" "KALIMANTAN BARAT"
## [13] "KALIMANTAN SELATAN" "KALIMANTAN TENGAH"
## [15] "KALIMANTAN TIMUR" "KALIMANTAN UTARA"
## [17] "KEPULAUAN RIAU" "LAMPUNG"
## [19] "MALUKU" "MALUKU UTARA"
## [21] "NUSA TENGGARA BARAT" "NUSA TENGGARA TIMUR"
## [23] "PAPUA" "PAPUA BARAT"
## [25] "RIAU" "SULAWESI BARAT"
## [27] "SULAWESI SELATAN" "SULAWESI TENGAH"
## [29] "SULAWESI TENGGARA" "SULAWESI UTARA"
## [31] "SUMATERA BARAT" "SUMATERA SELATAN"
## [33] "SUMATERA UTARA" "DI YOGYAKARTA"
# Lihat nama provinsi dari data Anda
unique(df_agg$Provinsi)
## [1] "ACEH" "BALI"
## [3] "BANTEN" "BENGKULU"
## [5] "DI YOGYAKARTA" "DKI JAKARTA"
## [7] "GORONTALO" "JAMBI"
## [9] "JAWA BARAT" "JAWA TENGAH"
## [11] "JAWA TIMUR" "KALIMANTAN BARAT"
## [13] "KALIMANTAN SELATAN" "KALIMANTAN TENGAH"
## [15] "KALIMANTAN TIMUR" "KALIMANTAN UTARA"
## [17] "KEPULAUAN BANGKA BELITUNG" "KEPULAUAN RIAU"
## [19] "LAMPUNG" "MALUKU"
## [21] "MALUKU UTARA" "NUSA TENGGARA BARAT"
## [23] "NUSA TENGGARA TIMUR" "PAPUA"
## [25] "PAPUA BARAT" "RIAU"
## [27] "SULAWESI BARAT" "SULAWESI SELATAN"
## [29] "SULAWESI TENGAH" "SULAWESI TENGGARA"
## [31] "SULAWESI UTARA" "SUMATERA BARAT"
## [33] "SUMATERA SELATAN" "SUMATERA UTARA"
setdiff(df_agg$Provinsi, indo_sf$Provinsi)
## character(0)
# Join dengan data cluster
df_cluster <- df_agg %>%
mutate(cluster = corrected_cluster_kmed_euc$cluster)
indo_clustered <- indo_sf %>%
left_join(df_cluster, by = "Provinsi")
# Plot peta
ggplot(data = indo_clustered) +
geom_sf(aes(fill = factor(cluster)), color = "black", size = 0.2) +
scale_fill_brewer(palette = "Set3", name = "Cluster") +
theme_minimal() +
labs(title = "Peta Klasterisasi Provinsi di Indonesia",
subtitle = "Hasil clustering K-means Euclidean")
# Atur urutan cluster sesuai ranking
indo_clustered$cluster <- factor(indo_clustered$cluster, levels = c("2", "3", "1", "4"))
# Plot peta
ggplot(data = indo_clustered) +
geom_sf(aes(fill = cluster), color = "black", size = 0.2) +
scale_fill_brewer(palette = "YlGnBu", direction = -1, name = "Cluster") +
theme_minimal() +
labs(
title = "Peta Klasterisasi Provinsi di Indonesia",
subtitle = "Hasil clustering K-Medoid (warna sesuai ranking)"
)
# Membagi provinsi berdasarkan cluster
prov_per_cluster <- split(indo_clustered$Provinsi, indo_clustered$cluster)
# Lihat hasil
prov_per_cluster
## $`2`
## Bali Banten Bengkulu JakartaRaya
## "BALI" "BANTEN" "BENGKULU" "DKI JAKARTA"
## Jambi JawaBarat JawaTengah KalimantanTengah
## "JAMBI" "JAWA BARAT" "JAWA TENGAH" "KALIMANTAN TENGAH"
## SumateraUtara Yogyakarta
## "SUMATERA UTARA" "DI YOGYAKARTA"
##
## $`3`
## BangkaBelitung JawaTimur
## "KEPULAUAN BANGKA BELITUNG" "JAWA TIMUR"
## KalimantanTimur KalimantanUtara
## "KALIMANTAN TIMUR" "KALIMANTAN UTARA"
## KepulauanRiau Lampung
## "KEPULAUAN RIAU" "LAMPUNG"
## MalukuUtara Papua
## "MALUKU UTARA" "PAPUA"
## Riau SulawesiSelatan
## "RIAU" "SULAWESI SELATAN"
## SulawesiTengah SulawesiTenggara
## "SULAWESI TENGAH" "SULAWESI TENGGARA"
## SumateraBarat SumateraSelatan
## "SUMATERA BARAT" "SUMATERA SELATAN"
##
## $`1`
## Aceh Gorontalo KalimantanSelatan
## "ACEH" "GORONTALO" "KALIMANTAN SELATAN"
## Maluku NusaTenggaraBarat PapuaBarat
## "MALUKU" "NUSA TENGGARA BARAT" "PAPUA BARAT"
## SulawesiUtara
## "SULAWESI UTARA"
##
## $`4`
## KalimantanBarat NusaTenggaraTimur SulawesiBarat
## "KALIMANTAN BARAT" "NUSA TENGGARA TIMUR" "SULAWESI BARAT"
#install.packages("clusterSim")
library(clusterSim)
## Warning: package 'clusterSim' was built under R version 4.4.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# --- Data tanpa Provinsi ---
X <- dplyr::select(df_agg, -Provinsi)
# --- Evaluasi untuk KMeans euclidean---
# Silhouette
sil_kmeans_euc <- silhouette(kmean_res$cluster, dist(X))
avg_sil_kmeans_euc <- mean(sil_kmeans_euc[, 3])
# Davies-Bouldin Index
dbi_kmeans_euc <- index.DB(X, kmean_res$cluster, centrotypes = "centroids")$DB
avg_sil_kmeans_euc
## [1] 0.3572708
dbi_kmeans_euc
## [1] 0.9151393
# --- Evaluasi untuk KMedoids Euclidean---
# Silhouette
sil <- cluster::silhouette(x = kmed_euclid$clustering,
dist = dist(x =df_agg %>% dplyr::select(-Provinsi),
,
method = "euclidean"))
avg_sil_kmed_euc<- mean(sil[, 3])
# Davies-Bouldin Index (butuh dissimilarity)
dbi_kmed_euc <- index.DB(X, kmed_euclid$clustering,
centrotypes = "medoids",
d = dist(df_agg %>% dplyr::select(-Provinsi), method = "euclidean")
)$DB
# --- Buat tabel perbandingan ---
comparison <- data.frame(
Metode = c("KMeans (Euclidean, k=4)" ,"KMedoids (Euclidean, k=4)"),
Silhouette = c(avg_sil_kmeans_euc, avg_sil_kmed_euc),
DBI = c(dbi_kmeans_euc, dbi_kmed_euc)
)
print(comparison)
## Metode Silhouette DBI
## 1 KMeans (Euclidean, k=4) 0.3572708 0.9151393
## 2 KMedoids (Euclidean, k=4) 0.3761096 0.9503386