##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'cluster' was built under R version 4.3.3
## Warning: package 'factoextra' was built under R version 4.3.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Warning: package 'mclust' was built under R version 4.3.3
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
## Warning: package 'kernlab' was built under R version 4.3.3
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Warning: package 'ggcorrplot' was built under R version 4.3.3
## Warning: package 'ggrepel' was built under R version 4.3.3
## Warning: package 'psych' was built under R version 4.3.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:kernlab':
##
## alpha
## The following object is masked from 'package:mclust':
##
## sim
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Warning: package 'fpc' was built under R version 4.3.3
## Warning: package 'clusterSim' was built under R version 4.3.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Warning: package 'clusterCrit' was built under R version 4.3.3
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
data <- read_excel("C:/Users/Nabila/OneDrive/Dokumen/RStudio/PDM SMT 5/data_tbp2.xlsx")
cat("Total missing values:", sum(is.na(data)), "\n")## Total missing values: 0
## # A tibble: 6 × 13
## Provinsi Perawat Bidan Kefarmasian `Kesehatan Masyarakat`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Aceh 21435 21767 3165 3890
## 2 Sumatera Utara 25787 27122 3970 3167
## 3 Sumatera Barat 12236 9313 2548 1430
## 4 Riau 12185 10246 2634 1392
## 5 Jambi 8425 7512 1635 1059
## 6 Sumatera Selatan 17309 15942 2971 2217
## # ℹ 8 more variables: `Kesehatan Lingkungan` <dbl>, `Tenaga Gizi` <dbl>,
## # `Tenaga Medis (Dokter)` <dbl>, `Psikologi Klinis` <dbl>,
## # `Keterapian Fisik` <dbl>, `Keteknisan Medis` <dbl>,
## # `Teknik Biomedika` <dbl>, `Jumlah Penduduk` <dbl>
## Perawat Bidan Kefarmasian Kesehatan Masyarakat
## Perawat 1.00 0.89 0.98 0.60
## Bidan 0.89 1.00 0.82 0.82
## Kefarmasian 0.98 0.82 1.00 0.51
## Kesehatan Masyarakat 0.60 0.82 0.51 1.00
## Kesehatan Lingkungan 0.87 0.92 0.82 0.85
## Tenaga Gizi 0.96 0.92 0.92 0.72
## Tenaga Medis (Dokter) 0.94 0.76 0.92 0.42
## Psikologi Klinis 0.82 0.64 0.85 0.37
## Keterapian Fisik 0.95 0.77 0.96 0.46
## Keteknisan Medis 0.98 0.84 0.98 0.54
## Teknik Biomedika 0.99 0.83 0.98 0.51
## Jumlah Penduduk 0.96 0.87 0.96 0.53
## Kesehatan Lingkungan Tenaga Gizi Tenaga Medis (Dokter)
## Perawat 0.87 0.96 0.94
## Bidan 0.92 0.92 0.76
## Kefarmasian 0.82 0.92 0.92
## Kesehatan Masyarakat 0.85 0.72 0.42
## Kesehatan Lingkungan 1.00 0.94 0.71
## Tenaga Gizi 0.94 1.00 0.83
## Tenaga Medis (Dokter) 0.71 0.83 1.00
## Psikologi Klinis 0.67 0.74 0.84
## Keterapian Fisik 0.77 0.86 0.94
## Keteknisan Medis 0.85 0.94 0.92
## Teknik Biomedika 0.82 0.92 0.97
## Jumlah Penduduk 0.80 0.91 0.89
## Psikologi Klinis Keterapian Fisik Keteknisan Medis
## Perawat 0.82 0.95 0.98
## Bidan 0.64 0.77 0.84
## Kefarmasian 0.85 0.96 0.98
## Kesehatan Masyarakat 0.37 0.46 0.54
## Kesehatan Lingkungan 0.67 0.77 0.85
## Tenaga Gizi 0.74 0.86 0.94
## Tenaga Medis (Dokter) 0.84 0.94 0.92
## Psikologi Klinis 1.00 0.91 0.87
## Keterapian Fisik 0.91 1.00 0.97
## Keteknisan Medis 0.87 0.97 1.00
## Teknik Biomedika 0.86 0.97 0.98
## Jumlah Penduduk 0.72 0.88 0.94
## Teknik Biomedika Jumlah Penduduk
## Perawat 0.99 0.96
## Bidan 0.83 0.87
## Kefarmasian 0.98 0.96
## Kesehatan Masyarakat 0.51 0.53
## Kesehatan Lingkungan 0.82 0.80
## Tenaga Gizi 0.92 0.91
## Tenaga Medis (Dokter) 0.97 0.89
## Psikologi Klinis 0.86 0.72
## Keterapian Fisik 0.97 0.88
## Keteknisan Medis 0.98 0.94
## Teknik Biomedika 1.00 0.94
## Jumlah Penduduk 0.94 1.00
ggcorrplot(
cor(data_num, use = "pairwise.complete.obs"),
hc.order = TRUE,
type = "lower",
lab = TRUE
)# Deteksi outlier (IQR)
detect_outliers_iqr <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQRv <- IQR(x, na.rm = TRUE)
lower <- Q1 - 1.5 * IQRv
upper <- Q3 + 1.5 * IQRv
which(x < lower | x > upper)
}
outlier_list <- lapply(data_num, detect_outliers_iqr)
print(outlier_list)## $Perawat
## [1] 11 12 13 15
##
## $Bidan
## [1] 2 12 13 15
##
## $Kefarmasian
## [1] 11 12 13 15
##
## $`Kesehatan Masyarakat`
## integer(0)
##
## $`Kesehatan Lingkungan`
## [1] 12 13 15 27
##
## $`Tenaga Gizi`
## [1] 12 13 15
##
## $`Tenaga Medis (Dokter)`
## [1] 11 12 13 15
##
## $`Psikologi Klinis`
## [1] 11 12 13 14 15
##
## $`Keterapian Fisik`
## [1] 11 12 13 15
##
## $`Keteknisan Medis`
## [1] 11 12 13 15
##
## $`Teknik Biomedika`
## [1] 11 12 13 15
##
## $`Jumlah Penduduk`
## [1] 12 13 15
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable
## Perawat Bidan Kefarmasian
## 357.81598 29.38698 217.04330
## `Kesehatan Masyarakat` `Kesehatan Lingkungan` `Tenaga Gizi`
## 10.82399 33.42960 59.10213
## `Tenaga Medis (Dokter)` `Psikologi Klinis` `Keterapian Fisik`
## 95.31899 10.17798 84.27099
## `Keteknisan Medis` `Teknik Biomedika` `Jumlah Penduduk`
## 106.70752 259.47365 76.62807
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data_num)
## Overall MSA = 0.84
## MSA for each item =
## Perawat Bidan Kefarmasian
## 0.86 0.80 0.76
## Kesehatan Masyarakat Kesehatan Lingkungan Tenaga Gizi
## 0.75 0.86 0.93
## Tenaga Medis (Dokter) Psikologi Klinis Keterapian Fisik
## 0.78 0.85 0.84
## Keteknisan Medis Teknik Biomedika Jumlah Penduduk
## 0.90 0.91 0.78
pca_result <- prcomp(
data_num,
center = TRUE,
scale. = TRUE
)
eigenvalues <- pca_result$sdev^2
prop_var <- eigenvalues / sum(eigenvalues)
cum_var <- cumsum(prop_var)
data.frame(
PC = paste0("PC", seq_along(eigenvalues)),
Eigenvalue = eigenvalues,
Proporsi = prop_var,
Kumulatif = cum_var
)## PC Eigenvalue Proporsi Kumulatif
## 1 PC1 10.226075996 0.8521729997 0.8521730
## 2 PC2 1.120995008 0.0934162506 0.9455893
## 3 PC3 0.304114859 0.0253429049 0.9709322
## 4 PC4 0.125729504 0.0104774586 0.9814096
## 5 PC5 0.083302850 0.0069419041 0.9883515
## 6 PC6 0.058743480 0.0048952900 0.9932468
## 7 PC7 0.036036719 0.0030030599 0.9962499
## 8 PC8 0.023532041 0.0019610034 0.9982109
## 9 PC9 0.011613545 0.0009677954 0.9991787
## 10 PC10 0.005575676 0.0004646397 0.9996433
## 11 PC11 0.002509648 0.0002091374 0.9998524
## 12 PC12 0.001770676 0.0001475563 1.0000000
k_range <- 2:6
sil_kmed <- numeric(length(k_range))
for (i in seq_along(k_range)) {
k <- k_range[i]
pam_tmp <- pam(data_pca, k = k)
sil <- silhouette(pam_tmp$clustering,
dist(data_pca))
sil_kmed[i] <- mean(sil[, "sil_width"])
}
# Data Frame
df_kmed <- data.frame(
Metode = "K-Medoids",
K = k_range,
Silhouette = sil_kmed
)
df_kmed## Metode K Silhouette
## 1 K-Medoids 2 0.7804345
## 2 K-Medoids 3 0.5877555
## 3 K-Medoids 4 0.4160245
## 4 K-Medoids 5 0.4607601
## 5 K-Medoids 6 0.5012297
k_opt_kmed <- df_kmed$K[which.max(df_kmed$Silhouette)]
# Plot Silhouette
plot(
k_range, sil_kmed,
type = "b", pch = 19,
xlab = "Jumlah Klaster (k)",
ylab = "Silhouette Index",
main = "Tuning Jumlah Klaster K-Medoids"
)
abline(v = k_opt_kmed, lty = 2, col = "red")k_range <- 2:6
sil_gmm <- numeric(length(k_range))
bic_gmm <- numeric(length(k_range))
for (i in seq_along(k_range)) {
k <- k_range[i]
gmm_tmp <- Mclust(data_pca, G = k, verbose = FALSE)
sil <- silhouette(gmm_tmp$classification, dist(data_pca))
sil_gmm[i] <- mean(sil[, "sil_width"])
bic_gmm[i] <- max(gmm_tmp$bic)
}
# Data Frame
df_gmm <- data.frame(
Metode = "GMM",
K = k_range,
Silhouette = sil_gmm,
BIC = bic_gmm
)
df_gmm## Metode K Silhouette BIC
## 1 GMM 2 0.7560847 -241.3543
## 2 GMM 3 0.5572465 -239.7693
## 3 GMM 4 0.5557908 -235.6264
## 4 GMM 5 0.4025244 -234.0796
## 5 GMM 6 0.5117878 -232.5772
k_opt_gmm <- df_gmm$K[which.max(df_gmm$Silhouette)]
# Plot Silhouette
plot(
k_range, sil_gmm,
type = "b", pch = 19,
xlab = "Jumlah Klaster (k)",
ylab = "Silhouette Index",
main = "Tuning Jumlah Klaster GMM (Silhouette)"
)
abline(v = k_opt_gmm, lty = 2, col = "red")# Plot BIC
plot(
k_range, bic_gmm,
type = "b", pch = 19,
xlab = "Jumlah Klaster (k)",
ylab = "BIC",
main = "Tuning Jumlah Klaster GMM (BIC)"
)
abline(v = k_range[which.min(bic_gmm)], lty = 2, col = "blue")k dipilih berdasarkan Silhouette (tertinggi), dengan BIC sebagai validasi kompleksitas model (terendah)
k_range <- 2:6
sil_spec <- numeric(length(k_range))
for (i in seq_along(k_range)) {
k <- k_range[i]
spec_tmp <- specc(
data_pca,
centers = k,
kernel = "rbfdot"
)
cl <- as.integer(spec_tmp)
sil <- silhouette(cl, dist(data_pca))
sil_spec[i] <- mean(sil[, "sil_width"])
}
# Data Frame
df_spec <- data.frame(
Metode = "Spectral",
K = k_range,
Silhouette = sil_spec
)
df_spec## Metode K Silhouette
## 1 Spectral 2 0.7804345
## 2 Spectral 3 0.2441622
## 3 Spectral 4 0.4404547
## 4 Spectral 5 0.4649247
## 5 Spectral 6 0.3724160
k_opt_spec <- df_spec$K[which.max(df_spec$Silhouette)]
# Plot Silhouette
plot(
k_range, sil_spec,
type = "b", pch = 19,
xlab = "Jumlah Klaster (k)",
ylab = "Silhouette Index",
main = "Tuning Jumlah Klaster Spectral Clustering"
)
abline(v = k_opt_spec, lty = 2, col = "red")set.seed(123)
kmedoids_res <- pam(data_pca, k = k_opt_kmed)
kmedoids <- factor(kmedoids_res$clustering)
table(kmedoids)## kmedoids
## 1 2
## 31 3
gmm_final <- Mclust(data_pca, G = k_opt_gmm)
gmm <- factor(gmm_final$classification)
summary(gmm_final)## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EVV (ellipsoidal, equal volume) model with 2 components:
##
## log-likelihood n df BIC ICL
## -103.0453 34 10 -241.3543 -241.3543
##
## Clustering table:
## 1 2
## 4 30
## gmm
## 1 2
## 4 30
# Konversi label klaster ke numerik
cl_kmed <- as.numeric(kmedoids)
cl_spec <- as.numeric(spectral)
cl_gmm <- gmm_final$classificationdata.frame(
Metode = c("K-Medoids", "GMMM", "Spectral"),
Silhouette = c(
hitung_silhouette(data_pca, cl_kmed),
hitung_silhouette(data_pca, cl_gmm),
hitung_silhouette(data_pca, cl_spec)
),
Dunn = c(dunn_kmed, dunn_gmm, dunn_spec),
Davies_Bouldin = c(db_kmed, db_gmm, db_spec),
CH_Score = c(
hitung_ch(data_pca, cl_kmed),
hitung_ch(data_pca, cl_gmm),
hitung_ch(data_pca, cl_spec)
)
)## Metode Silhouette Dunn Davies_Bouldin CH_Score
## 1 K-Medoids 0.7804345 0.8462686 0.2128929 78.83529
## 2 GMMM 0.7560847 0.7000952 0.4774531 84.37198
## 3 Spectral 0.7804345 0.8462686 0.2128929 78.83529
Meskipun K-Medoids dan Spectral Clustering menghasilkan nilai indeks evaluasi internal yang identik, pemilihan metode terbaik tidak hanya didasarkan pada kualitas struktur klaster, tetapi juga mempertimbangkan konsistensi hasil dan kemudahan interpretasi. K-Medoids dipilih karena memberikan representasi klaster yang lebih mudah diinterpretasikan melalui medoid yang merupakan observasi nyata, serta menunjukkan tingkat konvergensi yang lebih konsisten pada seluruh iterasi pengujian dibandingkan Spectral Clustering yang sensitif terhadap parameter kernel.
data_with_cluster <- data %>%
mutate(cluster = factor(kmedoids_res$clustering))
(summary_by_cluster <- data_with_cluster %>%
group_by(cluster) %>%
summarise(
across(
where(is.numeric),
list(mean = ~ mean(.x, na.rm = TRUE)),
.names = "{col}_{fn}"
)
))## # A tibble: 2 × 13
## cluster Perawat_mean Bidan_mean Kefarmasian_mean `Kesehatan Masyarakat_mean`
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 11724. 7781. 2237. 1351.
## 2 2 69312 32705. 19899. 3214.
## # ℹ 8 more variables: `Kesehatan Lingkungan_mean` <dbl>,
## # `Tenaga Gizi_mean` <dbl>, `Tenaga Medis (Dokter)_mean` <dbl>,
## # `Psikologi Klinis_mean` <dbl>, `Keterapian Fisik_mean` <dbl>,
## # `Keteknisan Medis_mean` <dbl>, `Teknik Biomedika_mean` <dbl>,
## # `Jumlah Penduduk_mean` <dbl>
kategori_klaster <- summary_by_cluster %>%
mutate(
skor_rata = rowMeans(
dplyr::select(., ends_with("_mean")),
na.rm = TRUE
),
Kategori = case_when(
skor_rata == max(skor_rata) ~ "Ketersediaan Tenaga Kesehatan Tinggi",
skor_rata == min(skor_rata) ~ "Ketersediaan Tenaga Kesehatan Rendah"
)
) %>%
dplyr::select(cluster, Kategori)##
## 1 2
## 31 3
for (i in 1:k_optimal) {
cat("\n--- Klaster", i, "---\n")
print(data_with_cluster$Provinsi[data_with_cluster$cluster == i])
}##
## --- Klaster 1 ---
## [1] "Aceh" "Sumatera Utara"
## [3] "Sumatera Barat" "Riau"
## [5] "Jambi" "Sumatera Selatan"
## [7] "Bengkulu" "Lampung"
## [9] "Kepulauan Bangka Belitung" "Kepulauan Riau"
## [11] "DKI Jakarta" "Daerah Istimewa Yogyakarta"
## [13] "Banten" "Bali"
## [15] "Nusa Tenggara Barat" "Nusa Tenggara Timur"
## [17] "Kalimantan Barat" "Kalimantan Tengah"
## [19] "Kalimantan Selatan" "Kalimantan Timur"
## [21] "Kalimantan Utara" "Sulawesi Utara"
## [23] "Sulawesi Tengah" "Sulawesi Selatan"
## [25] "Sulawesi Tenggara" "Gorontalo"
## [27] "Sulawesi Barat" "Maluku"
## [29] "Maluku Utara" "Papua Barat"
## [31] "Papua"
##
## --- Klaster 2 ---
## [1] "Jawa Barat" "Jawa Tengah" "Jawa Timur"
## # A tibble: 6 × 15
## Provinsi Perawat Bidan Kefarmasian `Kesehatan Masyarakat`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Aceh 21435 21767 3165 3890
## 2 Sumatera Utara 25787 27122 3970 3167
## 3 Sumatera Barat 12236 9313 2548 1430
## 4 Riau 12185 10246 2634 1392
## 5 Jambi 8425 7512 1635 1059
## 6 Sumatera Selatan 17309 15942 2971 2217
## # ℹ 10 more variables: `Kesehatan Lingkungan` <dbl>, `Tenaga Gizi` <dbl>,
## # `Tenaga Medis (Dokter)` <dbl>, `Psikologi Klinis` <dbl>,
## # `Keterapian Fisik` <dbl>, `Keteknisan Medis` <dbl>,
## # `Teknik Biomedika` <dbl>, `Jumlah Penduduk` <dbl>, cluster <fct>,
## # Kategori <chr>
p <- fviz_cluster(
kmedoids_res,
data = data,
geom = "point",
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal()
)
df_plot <- p$data
df_plot$daerah <- data$Provinsi
p +
geom_text_repel(
data = df_plot,
aes(x = x, y = y, label = daerah, color = cluster),
size = 2.5,
max.overlaps = 15,
segment.color = NA
) +
guides(color = guide_legend(title = "Cluster")) +
labs(title = "Visualisasi Klaster K-Medoids") +
theme(
plot.title = element_text(face = "bold", size = 14)
)## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Shapefile provinsi Indonesia
shp_path <- "D:/STATISTIKA RECAP/SEMESTER 5/Sains Data Spasial/BATAS PROVINSI DESEMBER 2019 DUKCAPIL/BATAS PROVINSI DESEMBER 2019 DUKCAPIL/BATAS_PROVINSI_DESEMBER_2019_DUKCAPIL.shp"
ind_sf <- st_read(shp_path, quiet = TRUE)
unique(ind_sf$PROVINSI)## [1] "ACEH" "BALI"
## [3] "BANTEN" "BENGKULU"
## [5] "DAERAH ISTIMEWA YOGYAKARTA" "DKI JAKARTA"
## [7] "GORONTALO" "JAMBI"
## [9] "JAWA BARAT" "JAWA TENGAH"
## [11] "JAWA TIMUR" "KALIMANTAN BARAT"
## [13] "KALIMANTAN SELATAN" "KALIMANTAN TENGAH"
## [15] "KALIMANTAN TIMUR" "KALIMANTAN UTARA"
## [17] "KEPULAUAN BANGKA BELITUNG" "KEPULAUAN RIAU"
## [19] "LAMPUNG" "MALUKU"
## [21] "MALUKU UTARA" "NUSA TENGGARA BARAT"
## [23] "NUSA TENGGARA TIMUR" "PAPUA"
## [25] "PAPUA BARAT" "RIAU"
## [27] "SULAWESI BARAT" "SULAWESI SELATAN"
## [29] "SULAWESI TENGAH" "SULAWESI TENGGARA"
## [31] "SULAWESI UTARA" "SUMATERA BARAT"
## [33] "SUMATERA SELATAN" "SUMATERA UTARA"
# Normalisasi nama provinsi
ind_sf <- ind_sf %>%
mutate(PROVINSI_norm = toupper(trimws(PROVINSI)))
data_with_cluster <- data_with_cluster %>%
mutate(provinsi_norm = toupper(trimws(Provinsi)))
# Join data klaster dengan shapefile
ind_sf <- ind_sf %>%
left_join(
data_with_cluster %>%
dplyr::select(provinsi_norm, cluster),
by = c("PROVINSI_norm" = "provinsi_norm")
) %>%
rename(Cluster = cluster)
ind_sf <- st_make_valid(ind_sf) %>%
st_transform(3857)
ind_sf_centroid <- ind_sf %>%
filter(Cluster != "Data Tidak Lengkap") %>%
mutate(
centroid = st_centroid(geometry),
lon = st_coordinates(centroid)[, 1],
lat = st_coordinates(centroid)[, 2]
)
ggplot() +
geom_sf(
data = ind_sf,
aes(fill = Cluster),
color = "grey40",
size = 0.2
) +
geom_text_repel(
data = ind_sf_centroid,
aes(x = lon, y = lat, label = PROVINSI_norm),
size = 2.2,
segment.size = 0.2,
max.overlaps = 20
) +
scale_fill_manual(
values = c(
"1" = "#fc8d62",
"2" = "#66c2a5",
"Data Tidak Lengkap" = "grey85"
),
name = "Klaster"
) +
labs(
title = "Peta Klaster Ketersediaan Tenaga Kesehatan per Provinsi (2023)",
subtitle = "Hasil Klasterisasi KMedoids berdasarkan Komponen Utama (PCA)",
caption = "Sumber data: Badan Pusat Statistik Indonesia, 2023"
) +
theme_minimal() +
theme(
legend.position = "right",
plot.title = element_text(face = "bold")
)