1 Load Package

library(tidyverse); library(flexclust); library(dbscan); library(e1071)
library(cluster); library(fpc); library(meanShiftR); library(factoextra); library(gridExtra)
palet <- c("#378ADD","#1D9E75","#D85A30","#7F77DD","#D4537E","#BA7517","#EF9F27")
noise_col <- "gray70"

2 Load & Eksplorasi Data

df_raw <- read.csv("CC GENERAL.csv", stringsAsFactors=FALSE)
cat("Dimensi:", nrow(df_raw), "baris x", ncol(df_raw), "kolom\n")
## Dimensi: 8950 baris x 18 kolom
head(df_raw, 5)
mv <- data.frame(Kolom=names(df_raw), Missing=colSums(is.na(df_raw)), Persen=round(colMeans(is.na(df_raw))*100,2))
mv[mv$Missing > 0, ]

3 Preprocessing

cat("Proporsi TENURE=12:", round(mean(df_raw$TENURE==12)*100,1), "%\n")
## Proporsi TENURE=12: 84.7 %
df <- df_raw %>% select(-CUST_ID, -TENURE)
for(col in names(df)){
  n_miss <- sum(is.na(df[[col]]))
  if(n_miss > 0){ df[[col]][is.na(df[[col]])] <- median(df[[col]], na.rm=TRUE); cat("Imputasi",col,":",n_miss,"nilai\n") }
}
## Imputasi CREDIT_LIMIT : 1 nilai
## Imputasi MINIMUM_PAYMENTS : 313 nilai
cat("Missing setelah imputasi:", sum(is.na(df)), "| Variabel aktif:", ncol(df), "\n")
## Missing setelah imputasi: 0 | Variabel aktif: 16
df %>% pivot_longer(everything(), names_to="variabel", values_to="nilai") %>%
  ggplot(aes(x=nilai)) + geom_histogram(bins=30, fill="#378ADD", color="white", linewidth=0.2) +
  facet_wrap(~variabel, scales="free", ncol=4) +
  labs(title="Distribusi Variabel (Sebelum Scaling)", x=NULL, y="Frekuensi") + theme_minimal(base_size=9)

df_scaled <- as.data.frame(scale(df))

4 PCA

set.seed(42)
pca_result <- prcomp(df_scaled, center=FALSE, scale.=FALSE)
cum_var <- cumsum(summary(pca_result)$importance[2,])
n_pc <- which(cum_var >= 0.80)[1]
cat("PC >= 80% variance:", n_pc, "\n"); print(round(cum_var[1:(n_pc+2)], 4))
## PC >= 80% variance: 7
##    PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8    PC9 
## 0.2885 0.5044 0.5979 0.6733 0.7394 0.7933 0.8392 0.8804 0.9132
fviz_eig(pca_result, addlabels=TRUE, barfill="#378ADD", barcolor="#185FA5", linecolor="#D85A30", main="Scree Plot") + theme_minimal()

fviz_pca_var(pca_result, col.var="contrib", gradient.cols=c("#9FE1CB","#1D9E75","#085041"), repel=TRUE, title="Biplot — Kontribusi Variabel") + theme_minimal()

df_pca <- as.data.frame(pca_result$x[, 1:n_pc])
cat("Dimensi setelah PCA:", nrow(df_pca), "x", ncol(df_pca), "\n")
## Dimensi setelah PCA: 8950 x 7

5 Penentuan K Optimal

set.seed(42)
df_sample <- df_pca[sample(nrow(df_pca), 2000), ]
wss_vals <- sapply(1:10, function(k) kmeans(df_sample, centers=k, nstart=10, iter.max=100)$tot.withinss)
k_elbow <- which.max(diff(diff(wss_vals))) + 1
sil_vals <- sapply(2:10, function(k){ km <- kmeans(df_sample, centers=k, nstart=10, iter.max=100); mean(silhouette(km$cluster, dist(df_sample))[,3]) })
k_sil <- which.max(sil_vals) + 1
par(mfrow=c(1,2), mar=c(4,4,3,1))
plot(1:10, wss_vals, type="b", pch=19, col="#378ADD", lwd=2, xlab="K", ylab="WSS", main="Elbow")
abline(v=k_elbow, lty=2, col="#D85A30"); text(k_elbow+0.2, max(wss_vals)*0.95, paste0("K=",k_elbow), col="#D85A30", cex=0.85, adj=0)
plot(2:10, sil_vals, type="b", pch=19, col="#1D9E75", lwd=2, xlab="K", ylab="Silhouette", main="Silhouette")
abline(v=k_sil, lty=2, col="#D85A30"); text(k_sil+0.2, min(sil_vals)*1.02, paste0("K=",k_sil), col="#D85A30", cex=0.85, adj=0)

cat("Elbow K=", k_elbow, "| Silhouette K=", k_sil, "\n")
## Elbow K= 2 | Silhouette K= 4
K_OPTIMAL <- 4

Keputusan: K=4 dipilih karena konsisten dengan Elbow dan selisih Silhouette vs K=7 sangat kecil (~0.012).

6 Clustering — 5 Metode

# ── K-MEANS ──
set.seed(42)
km_result <- kmeans(df_pca, centers=K_OPTIMAL, nstart=10, iter.max=100)
cat("K-Means — Anggota:\n"); print(table(km_result$cluster))
## K-Means — Anggota:
## 
##    1    2    3    4 
## 1234  328 3363 4025
cat("Between/Total SS:", round(km_result$betweenss/km_result$totss*100,2), "%\n")
## Between/Total SS: 43.73 %
fviz_cluster(km_result, data=df_pca, geom="point", ellipse.type="convex",
             palette=palet[1:K_OPTIMAL], ggtheme=theme_minimal(), main="K-Means",
             pointsize=0.5, alpha=0.5)

# ── K-MEDIANS ──
kmed_result   <- kcca(df_pca, k=K_OPTIMAL, family=kccaFamily("kmedians"), control=list(iter.max=50, verbose=0))
kmed_clusters <- clusters(kmed_result)
cat("K-Medians — Anggota:\n"); print(table(kmed_clusters))
## K-Medians — Anggota:
## kmed_clusters
##    1    2    3    4 
## 3420 2525 1568 1437
fviz_cluster(list(data=df_pca, cluster=kmed_clusters), geom="point", ellipse.type="convex",
             palette=palet[1:K_OPTIMAL], ggtheme=theme_minimal(), main="K-Medians",
             pointsize=0.5, alpha=0.5)

# ── DBSCAN ──
MINPTS_VAL <- 2*n_pc
k_sorted   <- sort(kNNdist(df_pca, k=MINPTS_VAL))
n_pts      <- length(k_sorted)
x_norm     <- (seq_along(k_sorted)-1)/(n_pts-1)
y_norm     <- (k_sorted-min(k_sorted))/(max(k_sorted)-min(k_sorted))
jarak      <- abs(y_norm-x_norm)/sqrt(2)
bawah      <- floor(0.10*n_pts); atas <- floor(0.90*n_pts)
knee       <- which.max(jarak[bawah:atas])+bawah-1
EPS_VALUE  <- round(k_sorted[knee],2)
cat("DBSCAN eps:", EPS_VALUE, "| MinPts:", MINPTS_VAL, "\n")
## DBSCAN eps: 1.61 | MinPts: 14
plot(k_sorted, type="l", col="#378ADD", lwd=1.5,
     xlab="Data (terurut)", ylab=paste0("Jarak ke-",MINPTS_VAL," tetangga"),
     main=paste0("K-Distance Plot — eps=",EPS_VALUE))
abline(h=EPS_VALUE, lty=2, col="#D85A30"); points(knee, k_sorted[knee], pch=19, col="#D85A30", cex=1.2)

db_result <- dbscan::dbscan(df_pca, eps=EPS_VALUE, minPts=MINPTS_VAL)
cat("DBSCAN Cluster:", length(unique(db_result$cluster[db_result$cluster>0])), "| Noise:", sum(db_result$cluster==0), "\n")
## DBSCAN Cluster: 1 | Noise: 490
print(table(db_result$cluster))
## 
##    0    1 
##  490 8460
lvl_db <- as.character(sort(unique(db_result$cluster)))
col_db <- ifelse(lvl_db=="0", noise_col, palet[as.integer(lvl_db) %% length(palet) + 1])
ggplot(data.frame(PC1=df_pca$PC1, PC2=df_pca$PC2, cluster=as.factor(db_result$cluster)),
       aes(PC1,PC2,color=cluster)) +
  geom_point(size=0.5, alpha=0.45) +
  scale_color_manual(values=col_db, labels=ifelse(lvl_db=="0","Noise",paste0("Cluster ",lvl_db))) +
  labs(title=paste0("DBSCAN (eps=",EPS_VALUE,", minPts=",MINPTS_VAL,")"), color="Cluster") +
  theme_minimal()

# ── MEAN SHIFT ──
cat("Mean Shift — subsample 800...\n")
## Mean Shift — subsample 800...
set.seed(42)
idx_ms     <- sample(nrow(df_pca), 800)
bw_vec     <- rep(2, ncol(df_pca))
ms_result  <- meanShift(as.matrix(df_pca[idx_ms,]), bandwidth=bw_vec, epsilon=0.01)
ms_centers <- unique(ms_result$value)
ms_raw     <- apply(as.matrix(df_pca), 1, function(x) which.min(colSums((t(ms_centers)-x)^2)))
# Paksa max K_OPTIMAL cluster agar warna cukup
ms_clusters <- ifelse(ms_raw > K_OPTIMAL, K_OPTIMAL, ms_raw)
cat("Mean Shift Cluster (setelah dipadatkan):", length(unique(ms_clusters)), "\n")
## Mean Shift Cluster (setelah dipadatkan): 2
print(table(ms_clusters))
## ms_clusters
##    1    4 
## 2669 6281
ggplot(data.frame(PC1=df_pca$PC1, PC2=df_pca$PC2, cluster=as.factor(ms_clusters)),
       aes(PC1,PC2,color=cluster)) +
  geom_point(size=0.5, alpha=0.45) +
  scale_color_manual(values=palet[1:length(unique(ms_clusters))]) +
  labs(title=paste0("Mean Shift (",n_pc," PC, bw=2)"), color="Cluster") +
  theme_minimal()

# ── FUZZY C-MEANS ──
fcm_result   <- cmeans(as.matrix(df_pca), centers=K_OPTIMAL, m=2, iter.max=50, verbose=FALSE, dist="euclidean")
fcm_clusters <- fcm_result$cluster
cat("FCM — Anggota:\n"); print(table(fcm_clusters))
## FCM — Anggota:
## fcm_clusters
##    1    2    3    4 
## 2330 1650 1639 3331
cat("Objective function:", round(fcm_result$withinerror,4), "\n")
## Objective function: 3.2516
print(round(head(fcm_result$membership,5),4))
##           1      2      3      4
## [1,] 0.1267 0.0780 0.0561 0.7392
## [2,] 0.0829 0.6930 0.0736 0.1505
## [3,] 0.2442 0.1699 0.3723 0.2136
## [4,] 0.1463 0.1818 0.1095 0.5625
## [5,] 0.0880 0.0697 0.0443 0.7980
fviz_cluster(list(data=df_pca, cluster=fcm_clusters), geom="point", ellipse.type="convex",
             palette=palet[1:K_OPTIMAL], ggtheme=theme_minimal(),
             main=paste0("Fuzzy C-Means (m=2, K=",K_OPTIMAL,")"), pointsize=0.5, alpha=0.5)

7 Evaluasi Metrik

hitung_sil <- function(label, data){
  label <- as.integer(label); idx <- label>0
  if(length(unique(label[idx]))<2) return(NA)
  round(mean(silhouette(label[idx], dist(data[idx,]))[,3]),4)
}
hitung_dunn <- function(label, data){
  label <- as.integer(label); idx <- label>0
  if(length(unique(label[idx]))<2) return(NA)
  tryCatch(round(fpc::cluster.stats(dist(data[idx,]),label[idx])$dunn,4), error=function(e) NA)
}
set.seed(42)
idx_eval <- sample(nrow(df_pca), min(800,nrow(df_pca)))
df_eval  <- df_pca[idx_eval,]
labels_list <- list(
  "K-Means"       = km_result$cluster[idx_eval],
  "K-Medians"     = kmed_clusters[idx_eval],
  "DBSCAN"        = db_result$cluster[idx_eval],
  "Mean Shift"    = ms_clusters[idx_eval],
  "Fuzzy C-Means" = fcm_clusters[idx_eval]
)
hasil_eval <- purrr::imap_dfr(labels_list, function(lbl,nm)
  data.frame(Metode=nm, N_Cluster=length(unique(lbl[lbl>0])), N_Noise=sum(lbl==0),
             Silhouette=hitung_sil(lbl,df_eval), Dunn_Index=hitung_dunn(lbl,df_eval)))
knitr::kable(hasil_eval, digits=4, caption="Perbandingan Metrik Evaluasi")
Perbandingan Metrik Evaluasi
Metode N_Cluster N_Noise Silhouette Dunn_Index
K-Means 4 0 0.2593 0.0082
K-Medians 4 0 0.2237 0.0084
DBSCAN 1 45 NA NA
Mean Shift 2 0 0.1472 0.0084
Fuzzy C-Means 4 0 0.2113 0.0102
hasil_eval %>% filter(!is.na(Silhouette)) %>% mutate(Metode=factor(Metode,levels=Metode)) %>%
  ggplot(aes(x=Metode, y=Silhouette, fill=Metode)) +
  geom_col(width=0.6, show.legend=FALSE) +
  geom_text(aes(label=sprintf("%.4f",Silhouette)), vjust=-0.5, size=3.5) +
  scale_fill_manual(values=palet[1:5]) +
  scale_y_continuous(limits=c(0, max(hasil_eval$Silhouette,na.rm=TRUE)*1.15)) +
  labs(title="Perbandingan Silhouette Width", x=NULL, y="Avg Silhouette") +
  theme_minimal() + theme(axis.text.x=element_text(angle=15,hjust=1))

8 Perbandingan Visual 5 Metode

df_cmp <- data.frame(
  PC1=df_pca$PC1, PC2=df_pca$PC2,
  KMeans      = as.factor(km_result$cluster),
  KMedians    = as.factor(kmed_clusters),
  DBSCAN      = as.factor(db_result$cluster),
  MeanShift   = as.factor(ms_clusters),
  FuzzyCMeans = as.factor(fcm_clusters)
)
mk_plot <- function(col_cl, judul, warna=palet[1:length(unique(df_cmp[[col_cl]]))]) {
  ggplot(df_cmp, aes(x=PC1, y=PC2, color=.data[[col_cl]])) +
    geom_point(size=0.35, alpha=0.4) +
    scale_color_manual(values=warna) +
    labs(title=judul, color="Cl") +
    theme_minimal(base_size=9) +
    theme(legend.position="bottom", legend.key.size=unit(0.3,"cm"))
}
n_db <- length(unique(df_cmp$DBSCAN))
grid.arrange(
  mk_plot("KMeans",      "K-Means"),
  mk_plot("KMedians",    "K-Medians"),
  mk_plot("DBSCAN",      "DBSCAN (0=Noise)", c(noise_col, palet[1:(n_db-1)])),
  mk_plot("MeanShift",   "Mean Shift"),
  mk_plot("FuzzyCMeans", "Fuzzy C-Means"),
  ncol=3, top="Perbandingan 5 Metode Clustering (PC1 vs PC2)"
)

9 Profil Cluster

df_profil_base <- df
df_profil_base$Cluster_KMeans   <- km_result$cluster
df_profil_base$Cluster_KMedians <- kmed_clusters
df_profil_base$Cluster_DBSCAN   <- db_result$cluster
df_profil_base$Cluster_MS       <- ms_clusters
df_profil_base$Cluster_FCM      <- fcm_clusters
buat_profil <- function(data, col_cl)
  data %>% rename(Cluster=!!sym(col_cl)) %>% filter(Cluster>0) %>%
  group_by(Cluster) %>% summarise(N=n(), across(where(is.numeric),mean), .groups="drop") %>%
  mutate(across(where(is.numeric),~round(.x,2)))
knitr::kable(buat_profil(df_profil_base,"Cluster_KMeans"),   caption="Profil K-Means")
Profil K-Means
Cluster N BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT Cluster_KMedians Cluster_DBSCAN Cluster_MS Cluster_FCM
1 1234 4542.66 0.97 487.67 309.96 177.79 4465.53 0.28 0.14 0.18 0.48 14.06 7.47 7462.04 3452.38 2001.37 0.04 3.94 0.85 4.00 2.03
2 328 3850.36 0.99 8582.29 5793.94 2790.18 804.95 0.94 0.75 0.78 0.08 2.53 93.29 10127.44 8254.06 2229.73 0.28 3.13 0.30 3.01 2.99
3 3363 933.75 0.94 1322.00 643.50 678.69 209.60 0.89 0.31 0.72 0.04 0.79 23.65 4322.98 1392.30 650.70 0.28 2.35 0.98 1.86 1.77
4 4025 992.12 0.79 277.27 212.50 65.10 566.84 0.18 0.09 0.09 0.11 2.05 3.06 3268.50 959.44 539.77 0.08 1.28 0.99 3.88 3.58
knitr::kable(buat_profil(df_profil_base,"Cluster_KMedians"), caption="Profil K-Medians")
Profil K-Medians
Cluster N BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT Cluster_KMeans Cluster_DBSCAN Cluster_MS Cluster_FCM
1 3420 986.95 0.81 244.30 196.07 48.37 546.60 0.15 0.08 0.07 0.11 2.04 2.63 3045.69 851.42 521.42 0.04 3.99 1.00 3.86 3.79
2 2525 598.13 0.86 827.27 164.49 663.59 207.19 0.83 0.09 0.77 0.04 0.75 17.30 3386.44 1012.65 580.17 0.31 3.12 0.99 2.21 1.33
3 1568 1847.04 0.98 3046.15 2159.36 886.82 295.90 0.89 0.72 0.54 0.05 0.97 41.16 6805.77 2850.29 842.58 0.24 2.85 0.86 2.09 2.93
4 1437 4328.64 0.96 889.32 577.96 311.44 4108.83 0.26 0.13 0.17 0.45 13.01 10.05 7366.29 3878.62 2082.50 0.04 1.46 0.84 4.00 2.06
knitr::kable(buat_profil(df_profil_base,"Cluster_DBSCAN"),   caption="Profil DBSCAN")
Profil DBSCAN
Cluster N BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT Cluster_KMeans Cluster_KMedians Cluster_MS Cluster_FCM
1 8460 1382.36 0.87 744.52 419.42 325.35 835.68 0.48 0.19 0.35 0.13 2.81 12.16 4176.26 1346.41 634.49 0.15 3.21 2.04 3.07 2.67
knitr::kable(buat_profil(df_profil_base,"Cluster_MS"),       caption="Profil Mean Shift")
Profil Mean Shift
Cluster N BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT Cluster_KMeans Cluster_KMedians Cluster_DBSCAN Cluster_FCM
1 2669 824.75 0.98 1559.61 793.38 766.39 132.32 0.90 0.33 0.75 0.03 0.50 27.33 4340.13 1541.92 578.94 0.24 3.02 2.32 0.98 1.91
4 6281 1878.81 0.83 766.77 507.05 260.08 1338.60 0.32 0.15 0.20 0.18 4.42 9.35 4559.79 1814.40 957.93 0.12 3.19 2.03 0.93 2.99
knitr::kable(buat_profil(df_profil_base,"Cluster_FCM"),      caption="Profil Fuzzy C-Means")
Profil Fuzzy C-Means
Cluster N BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT Cluster_KMeans Cluster_KMedians Cluster_DBSCAN Cluster_MS
1 2330 542.18 0.89 717.44 142.39 575.67 146.50 0.84 0.09 0.77 0.03 0.60 15.66 3058.13 847.65 599.47 0.29 3.11 2.00 0.99 2.05
2 1650 3902.05 0.96 384.11 257.28 126.90 3803.06 0.23 0.11 0.14 0.44 12.13 5.37 6630.68 3056.75 1729.00 0.04 1.82 3.51 0.89 4.00
3 1639 2059.87 0.98 3531.75 2327.35 1204.79 419.48 0.92 0.67 0.63 0.06 1.38 46.86 7336.49 3475.55 1017.43 0.26 2.76 2.93 0.82 2.08
4 3331 877.89 0.78 265.60 219.61 46.15 437.40 0.17 0.10 0.07 0.09 1.62 2.86 3042.11 839.55 493.76 0.06 3.99 1.10 1.00 3.90

10 Simpan & Ringkasan

df_hasil <- df_raw
df_hasil$Cluster_KMeans     <- km_result$cluster
df_hasil$Cluster_KMedians   <- kmed_clusters
df_hasil$Cluster_DBSCAN     <- db_result$cluster
df_hasil$Cluster_MeanShift  <- ms_clusters
df_hasil$Cluster_FCM        <- fcm_clusters
write.csv(df_hasil,   "hasil_clustering_creditcard.csv", row.names=FALSE)
write.csv(hasil_eval, "evaluasi_clustering.csv",         row.names=FALSE)
cat("Dataset:", nrow(df_raw), "nasabah |", ncol(df_raw), "variabel awal\n")
## Dataset: 8950 nasabah | 18 variabel awal
cat("Variabel aktif:", ncol(df), "| PC:", n_pc, "| K Optimal:", K_OPTIMAL, "\n")
## Variabel aktif: 16 | PC: 7 | K Optimal: 4
knitr::kable(hasil_eval, digits=4, caption="Evaluasi Akhir")
Evaluasi Akhir
Metode N_Cluster N_Noise Silhouette Dunn_Index
K-Means 4 0 0.2593 0.0082
K-Medians 4 0 0.2237 0.0084
DBSCAN 1 45 NA NA
Mean Shift 2 0 0.1472 0.0084
Fuzzy C-Means 4 0 0.2113 0.0102

Analisis selesai.