# LOAD LIBRARY
# Manipulasi dan visualisasi data
library(tidyverse) # ggplot2, dplyr, tidyr, dll.
# Analisis Clustering
library(cluster) # Fungsi clustering (pam, silhouette, dll.)
library(factoextra) # Visualisasi clustering & PCA (fviz_*)
# Tabel ringkasan yang rapi
library(knitr) # kable() untuk tabel markdown
library(kableExtra) # Styling tambahan pada kable# MEMBACA DATASET DARI FILE CSV
# Dataset sudah di cleaning sebelumnya melalui python
df_raw <- read.csv("waterQuality_bersih.csv", stringsAsFactors = FALSE)
# Tampilkan dimensi dan beberapa baris pertama
cat("Dimensi dataset:", nrow(df_raw), "baris x", ncol(df_raw), "kolom\n")## Dimensi dataset: 7996 baris x 21 kolom
# CEK NILAI YANG HILANG (NA) DAN DATA TIDAK VALID
# Kolom is_safe mengandung nilai '#NUM!'
cat("Nilai unik pada kolom is_safe:\n")## Nilai unik pada kolom is_safe:
##
## 0 1 <NA>
## 7084 912 0
# Identifikasi baris dengan nilai '#NUM!' pada is_safe
idx_invalid <- which(df_raw$is_safe == "#NUM!")
cat("\nJumlah baris dengan '#NUM!':", length(idx_invalid), "\n")##
## Jumlah baris dengan '#NUM!': 0
# Hapus baris bermasalah
df_clean <- df_raw[df_raw$is_safe != "#NUM!", ]
# Paksa semua kolom menjadi numerik (selain is_safe yang sudah valid)
df_clean$is_safe <- as.integer(df_clean$is_safe)
# Cek total NA setelah pembersihan
cat("\nTotal NA setelah pembersihan:", sum(is.na(df_clean)), "\n")##
## Total NA setelah pembersihan: 0
## Dimensi data bersih: 7996 baris x 21 kolom
# STATISTIK DESKRIPTIF
# Exclude is_safe
var_names <- setdiff(names(df_clean), "is_safe")
summary_stats <- df_clean[, var_names] %>%
summarise(across(everything(), list(
Min = ~round(min(.), 3),
Mean = ~round(mean(.), 3),
Max = ~round(max(.), 3),
SD = ~round(sd(.), 3)
))) %>%
pivot_longer(everything(),
names_to = c("Variabel", "Statistik"),
names_sep = "_") %>%
pivot_wider(names_from = Statistik, values_from = value)
kable(summary_stats,
caption = "Tabel 1. Statistik Deskriptif Variabel Penelitian",
align = "lrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Variabel | Min | Mean | Max | SD |
|---|---|---|---|---|
| aluminium | 0.00 | 0.666 | 5.05 | 1.265 |
| ammonia | -0.08 | 14.278 | 29.84 | 8.879 |
| arsenic | 0.00 | 0.161 | 1.05 | 0.253 |
| barium | 0.00 | 1.568 | 4.94 | 1.216 |
| cadmium | 0.00 | 0.043 | 0.13 | 0.036 |
| chloramine | 0.00 | 2.178 | 8.68 | 2.567 |
| chromium | 0.00 | 0.247 | 0.90 | 0.271 |
| copper | 0.00 | 0.806 | 2.00 | 0.654 |
| flouride | 0.00 | 0.772 | 1.50 | 0.435 |
| bacteria | 0.00 | 0.320 | 1.00 | 0.329 |
| viruses | 0.00 | 0.329 | 1.00 | 0.378 |
| lead | 0.00 | 0.099 | 0.20 | 0.058 |
| nitrates | 0.00 | 9.819 | 19.83 | 5.542 |
| nitrites | 0.00 | 1.330 | 2.93 | 0.573 |
| mercury | 0.00 | 0.005 | 0.01 | 0.003 |
| perchlorate | 0.00 | 16.465 | 60.01 | 17.689 |
| radium | 0.00 | 2.920 | 7.99 | 2.323 |
| selenium | 0.00 | 0.050 | 0.10 | 0.029 |
| silver | 0.00 | 0.148 | 0.50 | 0.144 |
| uranium | 0.00 | 0.045 | 0.09 | 0.027 |
Interpretasi Elbow Plot: Berdasarkan Tabel 1, terlihat bahwa terdapat perbedaan skala yang sangat signifikan antar variabel. Contohnya, variabel perchlorate memiliki rentang nilai hingga 60.01, sementara cadmium dan mercury berada pada rentang nilai desimal yang sangat kecil. Perbedaan skala ini mengindikasikan perlunya proses standarisasi sebelum melakukan perhitungan jarak Euclidean.
# STANDARISASI DATA MENGGUNAKAN Z-SCORE (scale())
# Pisahkan variabel prediktor dari label is_safe
df_features <- df_clean[, var_names] # 20 variabel kimia/biologis
df_label <- df_clean[, "is_safe"] # simpan label untuk profiling
# Standarisasi
df_scaled <- scale(df_features)
cat("Ringkasan setelah scaling (mean dan SD setiap kolom):\n")## Ringkasan setelah scaling (mean dan SD setiap kolom):
## Mean kolom pertama (aluminium): 0
## SD kolom pertama (aluminium): 1
# ELBOW METHOD: Menghitung Total Within-Cluster Sum of Squares
set.seed(42) # Untuk reproduktibilitas
wcss <- sapply(1:10, function(k) {
kmeans(df_scaled,
centers = k,
nstart = 25, # Ulangi 25x dengan inisialisasi acak berbeda
iter.max = 100)$tot.withinss
})
# Buat data frame untuk ggplot
elbow_df <- data.frame(k = 1:10, wcss = wcss)
# Plot Elbow
ggplot(elbow_df, aes(x = k, y = wcss)) +
geom_line(color = "#2c7bb6", linewidth = 1.2) +
geom_point(color = "#d7191c", size = 3.5) +
geom_vline(xintercept = 3, linetype = "dashed",
color = "darkgreen", linewidth = 0.8) +
annotate("text", x = 3.2, y = max(wcss) * 0.95,
label = "k = 3 (dipilih)", hjust = 0,
color = "darkgreen", size = 4) +
scale_x_continuous(breaks = 1:10) +
labs(
title = "Elbow Method – Total Within-Cluster SS",
subtitle = "Titik 'siku' menunjukkan jumlah cluster optimal",
x = "Jumlah Cluster (k)",
y = "Total Within-Cluster Sum of Squares (WCSS)"
) +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))Gambar 1. Elbow Plot – Penentuan K Optimal
Interpretasi Elbow Plot: Penurunan WCSS paling tajam terjadi dari k=1 ke k=2 dan berlanjut ke k=3. Setelah k=3, kurva mulai mendatar (membentuk “siku”), menandakan bahwa k = 3 merupakan jumlah cluster yang optimal untuk dataset ini.
# MENJALANKAN K-MEANS CLUSTERING DENGAN k = 3
# Parameter penting:
# - centers : jumlah cluster = 3
# - nstart : 25 inisialisasi acak → hasil terbaik
# - iter.max : maksimum iterasi = 300
# - algorithm: "Hartigan-Wong" (default, paling stabil)
set.seed(42)
km_result <- kmeans(
x = df_scaled,
centers = 3,
nstart = 25,
iter.max = 300
)
# Distribusi anggota tiap cluster
cat("Distribusi Anggota Cluster:\n")## Distribusi Anggota Cluster:
##
## 1 2 3
## 4061 2265 1670
# Tambahkan label cluster ke data asli (tidak di-scale)
df_clustered <- df_clean
df_clustered$Cluster <- as.factor(km_result$cluster)
# MENGHITUNG SILHOUETTE SCORE
sil_km <- silhouette(km_result$cluster, dist(df_scaled))
# VISUALISASI SILHOUETTE PLOT
fviz_silhouette(sil_km) +
labs(title = "Silhouette Plot - Validasi K-Means (k=3)") +
theme_minimal()## cluster size ave.sil.width
## 1 1 4061 0.24
## 2 2 2265 0.09
## 3 3 1670 0.06
# VISUALISASI HASIL CLUSTERING MENGGUNAKAN PCA (2D)
# 1. HITUNG PCA TERLEBIH DAHULU (Ini yang tadi terlewat di blok ini)
pca_result <- prcomp(df_scaled, center = FALSE, scale. = FALSE)
# 2. GAMBAR PROYEKSI CLUSTER
fviz_cluster(
object = km_result,
data = df_scaled,
geom = "point", # Tampilkan titik saja (lebih bersih)
ellipse.type = "convex", # Gambar convex hull tiap cluster
pointsize = 0.8,
alpha = 0.5,
palette = c("#E41A1C", "#377EB8", "#4DAF4A"), # Merah, Biru, Hijau
ggtheme = theme_minimal(base_size = 13),
main = "K-Means Clustering (k=3) – Proyeksi PCA 2D",
xlab = "PC1 (Komponen Utama 1)",
ylab = "PC2 (Komponen Utama 2)"
) +
labs(subtitle = "Setiap titik merepresentasikan satu sampel air") +
theme(legend.position = "bottom",
plot.title = element_text(face = "bold"))Gambar 2. Visualisasi K-Means Clustering dengan PCA (2 Komponen Utama)
# 3. GAMBAR KONTRIBUSI VARIABEL
fviz_pca_var(pca_result, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE) +
labs(title = "Kontribusi Variabel terhadap Komponen Utama")Gambar 2. Visualisasi K-Means Clustering dengan PCA (2 Komponen Utama)
# PROPORSI VARIANS YANG DIJELASKAN OLEH PC1 DAN PC2
# Hitung persentase varians
var_explained <- summary(pca_result)$importance[2, 1:5] # Proporsi varians
cat("Proporsi Varians yang Dijelaskan:\n")## Proporsi Varians yang Dijelaskan:
kable(
data.frame(
Komponen = paste0("PC", 1:5),
Proporsi = round(var_explained * 100, 2),
Kumulatif = round(cumsum(var_explained) * 100, 2)
),
caption = "Tabel 2. Proporsi Varians – 5 Komponen Utama Pertama",
col.names = c("Komponen", "Varians (%)", "Kumulatif (%)"),
align = "lrr"
) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Komponen | Varians (%) | Kumulatif (%) | |
|---|---|---|---|
| PC1 | PC1 | 20.61 | 20.61 |
| PC2 | PC2 | 8.39 | 29.00 |
| PC3 | PC3 | 7.03 | 36.02 |
| PC4 | PC4 | 5.65 | 41.68 |
| PC5 | PC5 | 5.41 | 47.09 |
# PROFILING CLUSTER – RATA-RATA NILAI ASLI (TIDAK DISCALE)
cluster_profile <- df_clustered %>%
group_by(Cluster) %>%
summarise(
n = n(),
aluminium = round(mean(aluminium), 4),
ammonia = round(mean(ammonia), 4),
arsenic = round(mean(arsenic), 4),
barium = round(mean(barium), 4),
cadmium = round(mean(cadmium), 4),
chloramine = round(mean(chloramine), 4),
chromium = round(mean(chromium), 4),
copper = round(mean(copper), 4),
flouride = round(mean(flouride), 4),
bacteria = round(mean(bacteria), 4),
viruses = round(mean(viruses), 4),
lead = round(mean(lead), 4),
nitrates = round(mean(nitrates), 4),
nitrites = round(mean(nitrites), 4),
mercury = round(mean(mercury), 4),
perchlorate = round(mean(perchlorate), 4),
radium = round(mean(radium), 4),
selenium = round(mean(selenium), 4),
silver = round(mean(silver), 4),
uranium = round(mean(uranium), 4),
pct_safe = round(mean(is_safe) * 100, 2) # % sampel aman
)
# Tampilkan tabel (transpose untuk keterbacaan lebih baik)
profile_t <- cluster_profile %>%
t() %>%
as.data.frame()
colnames(profile_t) <- paste0("Cluster ", 1:3)
profile_t <- profile_t[-1, ] # Hapus baris "Cluster" duplikat
kable(profile_t,
caption = "Tabel 3. Profil Rata-Rata Variabel per Cluster (Nilai Asli)",
align = "rrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE,
font_size = 11) %>%
row_spec(1, bold = TRUE, background = "#f2f2f2") # Baris 'n' ditebalkan| Cluster 1 | Cluster 2 | Cluster 3 | |
|---|---|---|---|
| n | 4061 | 2265 | 1670 |
| aluminium | 0.0553 | 1.3194 | 1.2669 |
| ammonia | 13.1463 | 15.4289 | 15.4701 |
| arsenic | 0.0510 | 0.0474 | 0.5848 |
| barium | 0.8534 | 2.0902 | 2.5970 |
| cadmium | 0.0500 | 0.0083 | 0.0722 |
| chloramine | 0.2044 | 4.1585 | 4.2892 |
| chromium | 0.0536 | 0.4539 | 0.4380 |
| copper | 0.7157 | 1.0161 | 0.7403 |
| flouride | 0.7711 | 0.7700 | 0.7752 |
| bacteria | 0.2559 | 0.4154 | 0.3450 |
| viruses | 0.3272 | 0.3284 | 0.3327 |
| lead | 0.1029 | 0.1034 | 0.0857 |
| nitrates | 9.8449 | 9.5574 | 10.1120 |
| nitrites | 1.0650 | 1.5148 | 1.7231 |
| mercury | 0.0052 | 0.0052 | 0.0051 |
| perchlorate | 3.5035 | 29.7496 | 29.9675 |
| radium | 1.8007 | 4.0768 | 4.0734 |
| selenium | 0.0496 | 0.0497 | 0.0499 |
| silver | 0.0498 | 0.2506 | 0.2468 |
| uranium | 0.0445 | 0.0449 | 0.0447 |
| pct_safe | 3.50 | 30.55 | 4.67 |
# VISUALISASI PROPORSI AIR AMAN PER CLUSTER
safety_summary <- df_clustered %>%
group_by(Cluster, is_safe) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(Cluster) %>%
mutate(pct = round(n / sum(n) * 100, 1),
Status = ifelse(is_safe == 1, "Aman (Safe)", "Tidak Aman (Unsafe)"))
ggplot(safety_summary, aes(x = Cluster, y = pct, fill = Status)) +
geom_col(position = "dodge", width = 0.6) +
geom_text(aes(label = paste0(pct, "%")),
position = position_dodge(width = 0.6),
vjust = -0.5, size = 3.5) +
scale_fill_manual(values = c("Aman (Safe)" = "#4DAF4A",
"Tidak Aman (Unsafe)" = "#E41A1C")) +
labs(
title = "Proporsi Status Keamanan Air per Cluster",
subtitle = "Berdasarkan label is_safe (0 = Tidak Aman, 1 = Aman)",
x = "Cluster",
y = "Persentase (%)",
fill = "Status Air"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom",
plot.title = element_text(face = "bold"))Gambar 3. Proporsi Air Aman (is_safe=1) pada Setiap Cluster
# HEATMAP: PERBANDINGAN PROFIL CLUSTER (Z-SCORE RATA-RATA)
# Hitung rata-rata z-score per cluster untuk setiap variabel
scaled_df_cluster <- as.data.frame(df_scaled)
scaled_df_cluster$Cluster <- as.factor(km_result$cluster)
cluster_zscore <- scaled_df_cluster %>%
group_by(Cluster) %>%
summarise(across(everything(), mean)) %>%
pivot_longer(-Cluster, names_to = "Variabel", values_to = "ZScore")
ggplot(cluster_zscore, aes(x = Cluster, y = Variabel, fill = ZScore)) +
geom_tile(color = "white", linewidth = 0.5) +
scale_fill_gradient2(low = "#3288BD",
mid = "white",
high = "#D53E4F",
midpoint = 0) +
labs(
title = "Heatmap Profil Cluster – Rata-Rata Z-Score",
subtitle = "Warna merah = di atas rata-rata global; biru = di bawah",
x = "Cluster",
y = "Variabel",
fill = "Z-Score"
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"),
axis.text.y = element_text(size = 9))Gambar 4. Heatmap Nilai Z-Score Rata-Rata per Cluster
# TABEL RINGKASAN KARAKTERISTIK TIAP CLUSTER (VERSI KOREKSI)
summary_cluster <- data.frame(
Cluster = c("Cluster 1", "Cluster 2", "Cluster 3"),
`Jumlah Sampel` = as.numeric(table(km_result$cluster)),
`Karakteristik Utama` = c(
"Tampak aman secara kasat mata (beberapa logam rendah), namun interaksi multivariat menunjukkan air tidak layak konsumsi.",
"Kadar ammonia dan nitrates tinggi, namun memiliki persentase kelayakan konsumsi tertinggi dibandingkan cluster lain.",
"Kandungan kimia sangat beracun dominan (Perchlorate, Radium, Arsenic tinggi); bahaya radiologis dan kimiawi."
),
`Estimasi Keamanan` = c("Kritis Tersembunyi", "Relatif Aman / Optimal", "Berisiko Sangat Tinggi"),
check.names = FALSE
)
# Menampilkan tabel dengan KableExtra
library(knitr)
library(kableExtra)
kable(summary_cluster,
caption = "Tabel 4. Ringkasan Karakteristik Tiap Cluster (Evaluasi Aktual)",
align = "clll") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = TRUE) %>%
column_spec(2, bold = TRUE) %>%
# Mengubah warna sesuai fakta: Cluster 1 (Merah), Cluster 2 (Hijau), Cluster 3 (Merah Gelap)
column_spec(4,
color = c("red", "darkgreen", "darkred"),
bold = TRUE)| Cluster | Jumlah Sampel | Karakteristik Utama | Estimasi Keamanan |
|---|---|---|---|
| Cluster 1 | 4061 | Tampak aman secara kasat mata (beberapa logam rendah), namun interaksi multivariat menunjukkan air tidak layak konsumsi. | Kritis Tersembunyi |
| Cluster 2 | 2265 | Kadar ammonia dan nitrates tinggi, namun memiliki persentase kelayakan konsumsi tertinggi dibandingkan cluster lain. | Relatif Aman / Optimal |
| Cluster 3 | 1670 | Kandungan kimia sangat beracun dominan (Perchlorate, Radium, Arsenic tinggi); bahaya radiologis dan kimiawi. | Berisiko Sangat Tinggi |
Kesimpulan: Algoritma K-Means terbukti mampu menemukan pola tersembunyi tanpa penyeliaan (unsupervised). Hal ini dikonfirmasi oleh fakta bahwa Cluster 2 berhasil menampung populasi sampel aman (30.55%) paling banyak secara signifikan. Di sisi lain, pembentukan Cluster 1 dan Cluster 3 secara presisi berhasil mengisolasi sampel-sampel air yang tercemar (tingkat keamanan masing-masing hanya 3.50% dan 4.67%), baik yang terkontaminasi secara laten maupun yang terpapar logam berat ekstrem.
Script ini dibuat untuk keperluan tugas Analisis Multivariat – Modul 3 (Clustering).