# install.packages(c("cluster","e1071","dbscan","fpc","mclust","ggplot2",
# "factoextra","corrplot","flexclust","meanShiftR"))
library(cluster)
library(e1071)
library(dbscan)
library(fpc)
library(mclust)
library(ggplot2)
library(factoextra)
library(corrplot)
library(flexclust)
library(meanShiftR)
Dataset yang digunakan adalah Global Health & Nutrition Indicators yang mencakup 138 negara dengan 12 fitur numerik meliputi indikator ekonomi, kesehatan, pendidikan, sanitasi, dan lingkungan.
## Jumlah baris (negara): 138
## Jumlah kolom : 13
## Nama variabel :
## [1] "country" "gdp_per_capita" "life_expectancy"
## [4] "infant_mortality" "undernourishment_pct" "literacy_rate"
## [7] "mean_schooling_years" "sanitation_access" "clean_water_access"
## [10] "electricity_access" "co2_emissions_pc" "health_expenditure_pct"
## [13] "urban_population_pct"
## gdp_per_capita life_expectancy infant_mortality undernourishment_pct
## Min. : 242.7 Min. :45.80 Min. : 1.000 Min. : 2.00
## 1st Qu.: 1934.9 1st Qu.:63.54 1st Qu.: 9.125 1st Qu.: 4.87
## Median : 8554.0 Median :68.30 Median : 31.280 Median :14.44
## Mean :22057.7 Mean :69.06 Mean : 38.360 Mean :20.31
## 3rd Qu.:37801.0 3rd Qu.:77.74 3rd Qu.: 61.380 3rd Qu.:35.19
## Max. :80047.1 Max. :85.34 Max. :115.560 Max. :60.80
## literacy_rate mean_schooling_years sanitation_access clean_water_access
## Min. : 20.00 Min. : 1.150 Min. : 5.00 Min. : 29.89
## 1st Qu.: 63.55 1st Qu.: 5.457 1st Qu.: 35.90 1st Qu.: 57.31
## Median : 80.74 Median : 8.315 Median : 69.37 Median : 81.78
## Mean : 76.73 Mean : 8.736 Mean : 62.26 Mean : 75.58
## 3rd Qu.: 95.37 3rd Qu.:12.178 3rd Qu.: 90.73 3rd Qu.: 95.38
## Max. :100.00 Max. :15.820 Max. :100.00 Max. :100.00
## electricity_access co2_emissions_pc health_expenditure_pct
## Min. : 11.26 Min. : 0.0100 Min. : 0.500
## 1st Qu.: 57.26 1st Qu.: 0.9375 1st Qu.: 3.482
## Median : 72.72 Median : 3.4110 Median : 4.800
## Mean : 71.45 Mean : 5.1771 Mean : 6.568
## 3rd Qu.: 96.07 3rd Qu.: 7.1561 3rd Qu.: 8.730
## Max. :100.00 Max. :18.0245 Max. :17.470
## urban_population_pct
## Min. :10.00
## 1st Qu.:35.99
## Median :55.19
## Mean :54.76
## 3rd Qu.:75.53
## Max. :97.96
Cek Missing Value:
## gdp_per_capita life_expectancy infant_mortality
## 0 0 0
## undernourishment_pct literacy_rate mean_schooling_years
## 0 0 0
## sanitation_access clean_water_access electricity_access
## 0 0 0
## co2_emissions_pc health_expenditure_pct urban_population_pct
## 0 0 0
Tidak terdapat missing value pada dataset, sehingga data siap diproses lebih lanjut.
# One boxplot per column of df_num, laid out on a 3 x 4 panel grid with
# tight margins so all panels fit on a single figure.
par(mfrow = c(3, 4), mar = c(3, 3, 2, 1))
for (col in names(df_num)) {
  boxplot(
    x = df_num[[col]],
    main = col,
    ylab = "",
    col = "#3498DB",
    cex.main = 0.85
  )
}
Gambar di atas menunjukkan distribusi tiap variabel. Terlihat
beberapa variabel seperti gdp_per_capita dan
infant_mortality memiliki sebaran yang sangat lebar,
mengindikasikan adanya perbedaan besar antar negara.
# Correlation matrix of the numeric indicators (complete rows only), drawn
# as a colour-coded upper triangle with the coefficient printed in each cell.
cor_mat <- cor(df_num, use = "complete.obs")
corrplot(
  cor_mat,
  type = "upper",
  method = "color",
  addCoef.col = "black",
  number.cex = 0.55,
  tl.col = "black",
  tl.cex = 0.7,
  mar = c(0, 0, 1.5, 0),
  title = "Matriks Korelasi Antar Variabel"
)
Terdapat korelasi positif yang kuat antara
literacy_rate, mean_schooling_years,
sanitation_access, clean_water_access, dan
electricity_access, yang mencerminkan keterkaitan erat
antara indikator pendidikan dan infrastruktur dasar.
# Scan candidate cluster counts with two diagnostics and plot them side by
# side: total within-cluster sum of squares (elbow) and mean silhouette width.
k_elbow <- 1:10
k_sil <- 2:10

# The distance matrix does not depend on k, so it is computed once instead of
# once per candidate k inside the silhouette scan.
d_scaled <- dist(df_scaled)

# Elbow Method
# vapply() pins the result to one numeric value per k.
wss_vals <- vapply(k_elbow, function(k) {
  kmeans(df_scaled, centers = k, nstart = 25)$tot.withinss
}, numeric(1))

# Silhouette
# Column 3 of the silhouette object is averaged to get one score per k.
sil_vals <- vapply(k_sil, function(k) {
  km <- kmeans(df_scaled, centers = k, nstart = 25)
  mean(silhouette(km$cluster, d_scaled)[, 3])
}, numeric(1))

par(mfrow = c(1, 2))
plot(k_elbow, wss_vals, type = "b", pch = 19, col = "#E74C3C",
     xlab = "Jumlah Klaster (k)",
     ylab = "Total Within-Cluster SS",
     main = "Elbow Method")
# The elbow marker is fixed at k = 3 rather than derived from wss_vals.
abline(v = 3, lty = 2, col = "gray40")
plot(k_sil, sil_vals, type = "b", pch = 19, col = "#2ECC71",
     xlab = "Jumlah Klaster (k)",
     ylab = "Rata-rata Silhouette Width",
     main = "Silhouette Analysis")
# sil_vals[1] belongs to k = 2, so look the winning k up in k_sil.
abline(v = k_sil[which.max(sil_vals)], lty = 2, col = "gray40")
## K optimal (Silhouette): 3
Berdasarkan Elbow Method dan Silhouette Analysis, k = 3 dipilih sebagai jumlah klaster optimal. Nilai ini juga secara substantif dapat diinterpretasikan sebagai kelompok negara: maju, berkembang, dan miskin/terbelakang.
K-Means adalah algoritma clustering berbasis partisi yang meminimalkan total jarak kuadrat dalam klaster (Within-Cluster Sum of Squares). Setiap titik data ditetapkan ke klaster dengan centroid terdekat menggunakan jarak Euclidean. Centroid diperbarui sebagai rata-rata aritmatika dari semua titik dalam klaster.
## Distribusi klaster K-Means:
##
## 1 2 3
## 43 48 47
##
## Rata-rata tiap klaster:
# Mean of every df_num column within each K-Means cluster, printed to
# two decimals.
agg_km <- aggregate(
  x = df_num,
  by = list(Klaster = km_res$cluster),
  FUN = mean
)
print(round(agg_km, digits = 2))
## Klaster gdp_per_capita life_expectancy infant_mortality undernourishment_pct
## 1 1 55934.96 80.11 6.02 3.42
## 2 2 11950.24 69.31 27.99 15.69
## 3 3 1385.99 58.71 78.54 40.46
## literacy_rate mean_schooling_years sanitation_access clean_water_access
## 1 96.79 13.36 94.94 97.24
## 2 82.55 8.84 67.71 81.41
## 3 52.43 4.39 26.79 49.82
## electricity_access co2_emissions_pc health_expenditure_pct
## 1 98.01 11.22 12.06
## 2 76.65 4.21 5.12
## 3 41.84 0.63 3.03
## urban_population_pct
## 1 81.24
## 2 55.66
## 3 29.61
# Scatter of the K-Means partition with a convex hull around each cluster
# (the plot title labels it as a 2-D PCA view).
fviz_cluster(
  object = km_res,
  data = df_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = c("#E74C3C", "#2ECC71", "#3498DB"),
  ggtheme = theme_minimal(),
  main = "Visualisasi Klaster K-Means (PCA 2D)"
)
K-Median adalah varian dari K-Means yang menggunakan median (bukan mean) sebagai centroid klaster dan jarak Manhattan (L1) sebagai metrik jarak. Metode ini lebih robust terhadap outlier karena median tidak terpengaruh nilai ekstrem seperti halnya mean.
# K-Median through flexclust: kcca() with the "kmedians" family (described by
# the original author as the Manhattan-distance variant). The seed is fixed
# so the fit is reproducible.
set.seed(123)
kmed_res <- kcca(
  x = df_scaled,
  k = K_OPT,
  family = kccaFamily("kmedians")
)
cat("Distribusi klaster K-Median:\n")
## Distribusi klaster K-Median:
##
## 1 2 3
## 43 47 48
##
## Rata-rata tiap klaster:
# Mean of every df_num column within each K-Median cluster, printed to
# two decimals.
agg_kmed <- aggregate(
  x = df_num,
  by = list(Klaster = clusters(kmed_res)),
  FUN = mean
)
print(round(agg_kmed, digits = 2))
## Klaster gdp_per_capita life_expectancy infant_mortality undernourishment_pct
## 1 1 55934.96 80.11 6.02 3.42
## 2 2 1385.99 58.71 78.54 40.46
## 3 3 11950.24 69.31 27.99 15.69
## literacy_rate mean_schooling_years sanitation_access clean_water_access
## 1 96.79 13.36 94.94 97.24
## 2 52.43 4.39 26.79 49.82
## 3 82.55 8.84 67.71 81.41
## electricity_access co2_emissions_pc health_expenditure_pct
## 1 98.01 11.22 12.06
## 2 41.84 0.63 3.03
## 3 76.65 4.21 5.12
## urban_population_pct
## 1 81.24
## 2 29.61
## 3 55.66
# Scatter of the K-Median partition; the labels are passed as a plain
# data/cluster list, with a convex hull around each cluster.
fviz_cluster(
  list(data = df_scaled, cluster = clusters(kmed_res)),
  geom = "point",
  ellipse.type = "convex",
  palette = c("#E74C3C", "#2ECC71", "#3498DB"),
  ggtheme = theme_minimal(),
  main = "Visualisasi Klaster K-Median (PCA 2D)"
)
DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
adalah algoritma berbasis kepadatan yang tidak memerlukan jumlah klaster
ditentukan di awal. Titik dengan kepadatan rendah diklasifikasikan
sebagai noise. Parameter utama: eps (radius
lingkungan) dan minPts (jumlah minimum titik untuk
membentuk klaster inti).
# DBSCAN radius, defined once so the reference line, the legend text and the
# fit cannot drift apart.
db_eps <- 2.2

# kNN-distance plot used to justify the radius; the dashed red line marks it.
# NOTE(review): the dbscan documentation suggests k = minPts - 1 for this
# plot; k = 5 is kept here to match the existing title. Confirm the intent.
kNNdistplot(df_scaled, k = 5)
title(main = "kNN Distance Plot (k=5) - Penentuan eps DBSCAN")
abline(h = db_eps, col = "red", lty = 2)
legend("topleft", legend = paste("eps =", db_eps), col = "red", lty = 2,
       bty = "n")

# The legend() call and this assignment were fused on one line, which does
# not parse; they are separate statements.
db_res <- dbscan::dbscan(df_scaled, eps = db_eps, minPts = 5)
cat("Distribusi klaster DBSCAN (0 = noise):\n")
## Distribusi klaster DBSCAN (0 = noise):
##
## 1 2 3
## 47 43 48
## Jumlah noise points: 0
# Scatter of the DBSCAN labels. No hull is requested here, and the palette
# carries five colours so a noise label (0) and up to four clusters can be
# drawn.
fviz_cluster(
  list(data = df_scaled, cluster = db_res$cluster),
  geom = "point",
  palette = c("#95A5A6", "#E74C3C", "#2ECC71", "#3498DB", "#F39C12"),
  ggtheme = theme_minimal(),
  main = "Visualisasi Klaster DBSCAN (PCA 2D) | Klaster 0 = Noise"
)
Mean Shift adalah algoritma non-parametrik berbasis kepadatan yang menemukan mode (puncak) distribusi data secara iteratif menggunakan Gaussian kernel. Tidak memerlukan spesifikasi jumlah klaster — jumlah klaster ditentukan secara otomatis dari struktur data.
# Mean Shift using the meanShiftR package.
# Run on the 2-D PCA projection (df_pca2) for computational efficiency,
# with the same bandwidth on both axes.
set.seed(123)
ms_res <- meanShift(
  queryData = df_pca2,
  trainData = df_pca2,
  bandwidth = rep(0.8, 2),
  alpha = 0,
  iterations = 100
)

# One label per observation; the number of distinct labels is the number of
# detected modes.
ms_labels <- ms_res$assignment
n_ms <- length(unique(ms_labels))
cat("Jumlah mode terdeteksi Mean Shift:", n_ms, "\n")
## Jumlah mode terdeteksi Mean Shift: 3
# Merge down to K_OPT clusters when Mean Shift detects more modes than that;
# otherwise the Mean Shift labels are used unchanged.
if (n_ms > K_OPT) {
  ms_vec <- as.vector(ms_labels)
  mode_ids <- unique(ms_vec)

  # One row per detected mode, in mode_ids order: the centroid of that mode's
  # members in the PCA plane.
  mode_centers <- do.call(rbind, lapply(mode_ids, function(cl) {
    colMeans(df_pca2[ms_vec == cl, , drop = FALSE])
  }))
  km_modes <- kmeans(mode_centers, centers = K_OPT, nstart = 20)

  # km_modes$cluster is ordered like mode_ids, so each observation's label is
  # translated to its row position first. Indexing by the raw label value is
  # only correct when labels happen to be 1..n in order of first appearance.
  ms_final <- km_modes$cluster[match(ms_vec, mode_ids)]
} else {
  ms_final <- ms_labels
}
cat("Distribusi klaster Mean Shift (setelah merge ke k=3):\n")
## Distribusi klaster Mean Shift (setelah merge ke k=3):
## ms_final
## 1 2 3
## 47 43 48
# Scatter of the final Mean Shift labels over the scaled data, with a convex
# hull around each cluster.
fviz_cluster(
  list(data = df_scaled, cluster = ms_final),
  geom = "point",
  ellipse.type = "convex",
  palette = c("#E74C3C", "#2ECC71", "#3498DB"),
  ggtheme = theme_minimal(),
  main = "Visualisasi Klaster Mean Shift (PCA 2D)"
)
Fuzzy C-Means (FCM) adalah algoritma soft clustering di mana
setiap titik data memiliki derajat keanggotaan (0–1) terhadap setiap
klaster, bukan penugasan biner seperti K-Means. Parameter
m = 2 adalah fuzzifier yang mengontrol derajat kekaburan
klaster.
# Fuzzy C-Means with K_OPT clusters and fuzzifier m = 2.
# The seed is fixed, as for the K-Median and Mean Shift fits, so the
# clustering and its label numbering are reproducible between runs.
set.seed(123)
fcm_res <- cmeans(df_scaled, centers = K_OPT, m = 2, iter.max = 200)
cat("Distribusi klaster Fuzzy C-Means:\n")
## Distribusi klaster Fuzzy C-Means:
##
## 1 2 3
## 47 43 48
##
## Contoh derajat keanggotaan (5 data pertama):
## 1 2 3
## [1,] 0.8486 0.0422 0.1092
## [2,] 0.8409 0.0370 0.1220
## [3,] 0.0180 0.9174 0.0645
## [4,] 0.0397 0.8081 0.1521
## [5,] 0.0224 0.9046 0.0730
##
## Rata-rata tiap klaster:
# Mean of every df_num column within each Fuzzy C-Means cluster (hard
# labels from fcm_res$cluster), printed to two decimals.
agg_fcm <- aggregate(
  x = df_num,
  by = list(Klaster = fcm_res$cluster),
  FUN = mean
)
print(round(agg_fcm, digits = 2))
## Klaster gdp_per_capita life_expectancy infant_mortality undernourishment_pct
## 1 1 1385.99 58.71 78.54 40.46
## 2 2 55934.96 80.11 6.02 3.42
## 3 3 11950.24 69.31 27.99 15.69
## literacy_rate mean_schooling_years sanitation_access clean_water_access
## 1 52.43 4.39 26.79 49.82
## 2 96.79 13.36 94.94 97.24
## 3 82.55 8.84 67.71 81.41
## electricity_access co2_emissions_pc health_expenditure_pct
## 1 41.84 0.63 3.03
## 2 98.01 11.22 12.06
## 3 76.65 4.21 5.12
## urban_population_pct
## 1 29.61
## 2 81.24
## 3 55.66
# Scatter of the Fuzzy C-Means hard labels with a convex hull around each
# cluster.
fviz_cluster(list(data = df_scaled, cluster = fcm_res$cluster),
             palette = c("#E74C3C", "#2ECC71", "#3498DB"),
             geom = "point", ellipse.type = "convex",
             ggtheme = theme_minimal(),
             main = "Visualisasi Klaster Fuzzy C-Means (PCA 2D)")

# The plot call above and this assignment were fused on one line, which does
# not parse; they are separate statements.
# One row per observation: the two PCA coordinates plus the label assigned by
# each of the five methods, stored as factors.
pca_df <- data.frame(
  PC1 = df_pca2[, 1],
  PC2 = df_pca2[, 2],
  KMeans = factor(km_res$cluster),
  KMedian = factor(clusters(kmed_res)),
  DBSCAN = factor(db_res$cluster),
  MeanShift = factor(ms_final),
  FCM = factor(fcm_res$cluster)
)
# Colour per cluster label; "0" is the grey reserved for the DBSCAN noise
# label.
col_pal <- c(
  "0" = "#95A5A6",
  "1" = "#E74C3C",
  "2" = "#2ECC71",
  "3" = "#3498DB",
  "4" = "#F39C12"
)
# Column names in pca_df and the matching panel titles, in the same order.
metode_list <- c("KMeans", "KMedian", "DBSCAN", "MeanShift", "FCM")
titles_list <- c("K-Means", "K-Median", "DBSCAN", "Mean Shift", "Fuzzy C-Means")

# One PC1/PC2 scatter per method on a 2 x 3 grid; axis labels carry the
# percentage taken from var_explained.
par(mfrow = c(2, 3), mar = c(4, 4, 2.5, 1))
for (i in seq_along(metode_list)) {
  plot(
    x = pca_df$PC1,
    y = pca_df$PC2,
    col = col_pal[as.character(pca_df[[metode_list[i]]])],
    pch = 19,
    cex = 1.1,
    main = titles_list[i],
    xlab = paste0("PC1 (", round(var_explained[1] * 100, 1), "%)"),
    ylab = paste0("PC2 (", round(var_explained[2] * 100, 1), "%)")
  )
  legend(
    "topright",
    title = "Klaster",
    legend = levels(pca_df[[metode_list[i]]]),
    col = col_pal[levels(pca_df[[metode_list[i]]])],
    pch = 19,
    cex = 0.65,
    bty = "n"
  )
}
# Back to a single-panel layout for the plots that follow.
par(mfrow = c(1, 1))
# Silhouette Score
# Internal validation of the five partitions: mean silhouette width, Dunn
# index and Calinski-Harabasz index, collected into eval_df.

# The distance matrix and the K-Median labels are reused by several metrics,
# so each is computed once.
dist_scaled <- dist(df_scaled)
kmed_cl <- clusters(kmed_res)

# DBSCAN is scored on clustered points only (label 0 is noise).
# drop = FALSE keeps a two-dimensional object even if a single row survives,
# so nrow() below never returns NULL.
db_keep <- db_res$cluster > 0
db_nonoise <- db_res$cluster[db_keep]
df_nonoise <- df_scaled[db_keep, , drop = FALSE]
db_ok <- length(unique(db_nonoise)) > 1 && nrow(df_nonoise) > 2
dist_nonoise <- if (db_ok) dist(df_nonoise) else NULL

# Mean Shift can collapse to a single cluster, in which case no index is
# computed for it.
ms_ok <- length(unique(ms_final)) > 1
ms_cl <- as.integer(ms_final)

# Silhouette Score
sil_km <- mean(silhouette(km_res$cluster, dist_scaled)[, 3])
sil_kmed <- mean(silhouette(kmed_cl, dist_scaled)[, 3])
sil_db <- if (db_ok) mean(silhouette(db_nonoise, dist_nonoise)[, 3]) else NA
sil_ms <- if (ms_ok) mean(silhouette(ms_cl, dist_scaled)[, 3]) else NA
sil_fcm <- mean(silhouette(fcm_res$cluster, dist_scaled)[, 3])

# Dunn Index
cs_km <- cluster.stats(dist_scaled, km_res$cluster)
cs_kmed <- cluster.stats(dist_scaled, kmed_cl)
# DBSCAN gets the same treatment as its silhouette instead of a fixed NA.
cs_db <- if (db_ok) {
  cluster.stats(dist_nonoise, db_nonoise)
} else {
  list(dunn = NA)
}
cs_ms <- if (ms_ok) {
  cluster.stats(dist_scaled, ms_cl)
} else {
  list(dunn = NA)
}
cs_fcm <- cluster.stats(dist_scaled, fcm_res$cluster)

# Calinski-Harabasz Index
ch_km <- calinhara(df_scaled, km_res$cluster)
ch_kmed <- calinhara(df_scaled, kmed_cl)
ch_db <- if (db_ok) calinhara(df_nonoise, db_nonoise) else NA
ch_ms <- if (ms_ok) calinhara(df_scaled, ms_cl) else NA
ch_fcm <- calinhara(df_scaled, fcm_res$cluster)

eval_df <- data.frame(
  Metode = c("K-Means", "K-Median", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
  Silhouette = round(c(sil_km, sil_kmed, sil_db, sil_ms, sil_fcm), 4),
  Dunn_Index = round(
    c(cs_km$dunn, cs_kmed$dunn, cs_db$dunn, cs_ms$dunn, cs_fcm$dunn), 4
  ),
  CH_Index = round(c(ch_km, ch_kmed, ch_db, ch_ms, ch_fcm), 2)
)
print(eval_df)
## Metode Silhouette Dunn_Index CH_Index
## 1 K-Means 0.5176 0.5398 291.58
## 2 K-Median 0.5176 0.5398 291.58
## 3 DBSCAN 0.5176 NA NA
## 4 Mean Shift 0.5176 0.5398 291.58
## 5 Fuzzy C-Means 0.5176 0.5398 291.58
# Report the method(s) with the highest silhouette score.
# which.max() returns only the first maximum, so when several methods share
# the top score every tied method is listed rather than the first table row.
# best_idx still holds the first maximum.
best_idx <- which.max(eval_df$Silhouette)
best_sil <- eval_df$Silhouette[best_idx]
# Tolerance comparison instead of == on doubles; NA scores drop out of which().
best_all <- which(abs(eval_df$Silhouette - best_sil) < 1e-8)
cat("Metode terbaik (Silhouette):",
    paste(as.character(eval_df$Metode[best_all]), collapse = ", "), "\n")
## Metode terbaik (Silhouette): K-Means
# Per-observation silhouette plot for the K-Means partition, one colour per
# cluster.
sil_plot_km <- silhouette(km_res$cluster, dist(df_scaled))
plot(sil_plot_km, col = c("#E74C3C", "#2ECC71", "#3498DB"),
     border = NA, main = "Silhouette Plot - K-Means")

# The plot call above and this assignment were fused on one line, which does
# not parse; they are separate statements.
# Long format: one row per (method, metric) pair, metric name in Metrik and
# value in Nilai.
eval_long <- reshape(eval_df[, c("Metode", "Silhouette", "Dunn_Index")],
                     varying = c("Silhouette", "Dunn_Index"),
                     v.names = "Nilai",
                     timevar = "Metrik",
                     times = c("Silhouette", "Dunn Index"),
                     direction = "long")
# Dodged bar chart of the metrics per method; rows whose value is NA are
# dropped before plotting.
ggplot(eval_long[!is.na(eval_long$Nilai), ],
       aes(x = Metode, y = Nilai, fill = Metrik)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.6) +
  scale_fill_manual(values = c("#3498DB", "#E74C3C")) +
  labs(title = "Perbandingan Metrik Evaluasi Clustering",
       x = "Metode", y = "Nilai Metrik", fill = "Metrik") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

# The ggplot expression above and this assignment were fused on one line,
# which does not parse; they are separate statements.
# Per-cluster means for the K-Means partition, printed for a subset of the
# indicators to one decimal.
klaster_summary <- aggregate(df_num, by = list(Klaster = km_res$cluster),
                             FUN = mean)
print(round(klaster_summary[, c("Klaster", "gdp_per_capita", "life_expectancy",
                                "infant_mortality", "literacy_rate",
                                "electricity_access", "sanitation_access")], 1))
## Klaster gdp_per_capita life_expectancy infant_mortality literacy_rate
## 1 1 55935.0 80.1 6.0 96.8
## 2 2 11950.2 69.3 28.0 82.5
## 3 3 1386.0 58.7 78.5 52.4
## electricity_access sanitation_access
## 1 98.0 94.9
## 2 76.7 67.7
## 3 41.8 26.8
##
## Contoh negara per klaster:
# Print up to six example countries for each K-Means cluster.
# seq_len() and head() behave correctly for empty input: an empty cluster
# prints an empty list instead of "NA".
# NOTE(review): the recorded output below lists Algeria, Angola and Benin
# under cluster 1, whose mean GDP per capita is about 55,935 in the summary
# table. Confirm that `negara` is in the same row order as the data that was
# clustered.
for (k in seq_len(K_OPT)) {
  idx <- which(km_res$cluster == k)
  cat("Klaster", k, ":", paste(negara[head(idx, 6)], collapse = ", "), "\n")
}
## Klaster 1 : Algeria, Angola, Argentina, Austria, Benin, Botswana
## Klaster 2 : Australia, Azerbaijan, Belarus, Belgium, Bosnia, Burkina Faso
## Klaster 3 : Afghanistan, Albania, Armenia, Bangladesh, Bolivia, Central African Republic
Berdasarkan rata-rata karakteristik tiap klaster:
Analisis klaster terhadap 138 negara menggunakan 12 indikator kesehatan dan nutrisi global menghasilkan kesimpulan sebagai berikut:
K optimal = 3 berdasarkan Elbow Method dan Silhouette Analysis, yang secara substantif merepresentasikan tiga kelompok negara berdasarkan tingkat pembangunan.
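K-Means, K-Median, DBSCAN, Mean Shift, dan Fuzzy C-Means menghasilkan Silhouette Score yang identik (0.5176), sehingga tidak ada satu metode yang unggul; K-Means hanya dilaporkan sebagai "terbaik" karena berada di baris pertama tabel evaluasi. Dunn Index (0.5398) dan Calinski-Harabasz Index (291.58) juga identik untuk semua metode yang dihitung, mengindikasikan klaster yang kompak dan terpisah dengan baik.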
K-Median (flexclust kmedians + Manhattan distance) menghasilkan hasil serupa K-Means namun lebih robust terhadap outlier berkat penggunaan median dan jarak Manhattan.
DBSCAN (eps = 2.2, minPts = 5) membentuk tiga klaster dan tidak menemukan noise point pada data ini (jumlah noise points = 0); metode ini tetap mampu menandai negara yang tidak masuk klaster mana pun apabila kondisi tersebut muncul.
Mean Shift (via meanShiftR + Gaussian kernel) secara otomatis menemukan jumlah mode dari distribusi data tanpa perlu menentukan k di awal.
Fuzzy C-Means memberikan derajat keanggotaan parsial yang berguna untuk memahami negara-negara yang berada di perbatasan antar kelompok.