Preprocessing
cat("Proporsi TENURE=12:", round(mean(df_raw$TENURE==12)*100,1), "%\n")
## Proporsi TENURE=12: 84.7 %
df <- df_raw %>% select(-CUST_ID, -TENURE)
for(col in names(df)){
n_miss <- sum(is.na(df[[col]]))
if(n_miss > 0){ df[[col]][is.na(df[[col]])] <- median(df[[col]], na.rm=TRUE); cat("Imputasi",col,":",n_miss,"nilai\n") }
}
## Imputasi CREDIT_LIMIT : 1 nilai
## Imputasi MINIMUM_PAYMENTS : 313 nilai
cat("Missing setelah imputasi:", sum(is.na(df)), "| Variabel aktif:", ncol(df), "\n")
## Missing setelah imputasi: 0 | Variabel aktif: 16
df %>% pivot_longer(everything(), names_to="variabel", values_to="nilai") %>%
ggplot(aes(x=nilai)) + geom_histogram(bins=30, fill="#378ADD", color="white", linewidth=0.2) +
facet_wrap(~variabel, scales="free", ncol=4) +
labs(title="Distribusi Variabel (Sebelum Scaling)", x=NULL, y="Frekuensi") + theme_minimal(base_size=9)

df_scaled <- as.data.frame(scale(df))
PCA
set.seed(42)
pca_result <- prcomp(df_scaled, center=FALSE, scale.=FALSE)
cum_var <- cumsum(summary(pca_result)$importance[2,])
n_pc <- which(cum_var >= 0.80)[1]
cat("PC >= 80% variance:", n_pc, "\n"); print(round(cum_var[1:(n_pc+2)], 4))
## PC >= 80% variance: 7
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
## 0.2885 0.5044 0.5979 0.6733 0.7394 0.7933 0.8392 0.8804 0.9132
fviz_eig(pca_result, addlabels=TRUE, barfill="#378ADD", barcolor="#185FA5", linecolor="#D85A30", main="Scree Plot") + theme_minimal()

fviz_pca_var(pca_result, col.var="contrib", gradient.cols=c("#9FE1CB","#1D9E75","#085041"), repel=TRUE, title="Biplot — Kontribusi Variabel") + theme_minimal()

df_pca <- as.data.frame(pca_result$x[, 1:n_pc])
cat("Dimensi setelah PCA:", nrow(df_pca), "x", ncol(df_pca), "\n")
## Dimensi setelah PCA: 8950 x 7
Penentuan K
Optimal
set.seed(42)
df_sample <- df_pca[sample(nrow(df_pca), 2000), ]
wss_vals <- sapply(1:10, function(k) kmeans(df_sample, centers=k, nstart=10, iter.max=100)$tot.withinss)
k_elbow <- which.max(diff(diff(wss_vals))) + 1
sil_vals <- sapply(2:10, function(k){ km <- kmeans(df_sample, centers=k, nstart=10, iter.max=100); mean(silhouette(km$cluster, dist(df_sample))[,3]) })
k_sil <- which.max(sil_vals) + 1
par(mfrow=c(1,2), mar=c(4,4,3,1))
plot(1:10, wss_vals, type="b", pch=19, col="#378ADD", lwd=2, xlab="K", ylab="WSS", main="Elbow")
abline(v=k_elbow, lty=2, col="#D85A30"); text(k_elbow+0.2, max(wss_vals)*0.95, paste0("K=",k_elbow), col="#D85A30", cex=0.85, adj=0)
plot(2:10, sil_vals, type="b", pch=19, col="#1D9E75", lwd=2, xlab="K", ylab="Silhouette", main="Silhouette")
abline(v=k_sil, lty=2, col="#D85A30"); text(k_sil+0.2, min(sil_vals)*1.02, paste0("K=",k_sil), col="#D85A30", cex=0.85, adj=0)

cat("Elbow K=", k_elbow, "| Silhouette K=", k_sil, "\n")
## Elbow K= 2 | Silhouette K= 4
Keputusan: K=4 dipilih karena konsisten dengan Elbow
dan selisih Silhouette vs K=7 sangat kecil (~0.012).
Clustering — 5
Metode
# ── K-MEANS ──
set.seed(42)
km_result <- kmeans(df_pca, centers=K_OPTIMAL, nstart=10, iter.max=100)
cat("K-Means — Anggota:\n"); print(table(km_result$cluster))
## K-Means — Anggota:
##
## 1 2 3 4
## 1234 328 3363 4025
cat("Between/Total SS:", round(km_result$betweenss/km_result$totss*100,2), "%\n")
## Between/Total SS: 43.73 %
fviz_cluster(km_result, data=df_pca, geom="point", ellipse.type="convex",
palette=palet[1:K_OPTIMAL], ggtheme=theme_minimal(), main="K-Means",
pointsize=0.5, alpha=0.5)

# ── K-MEDIANS ──
kmed_result <- kcca(df_pca, k=K_OPTIMAL, family=kccaFamily("kmedians"), control=list(iter.max=50, verbose=0))
kmed_clusters <- clusters(kmed_result)
cat("K-Medians — Anggota:\n"); print(table(kmed_clusters))
## K-Medians — Anggota:
## kmed_clusters
## 1 2 3 4
## 3420 2525 1568 1437
fviz_cluster(list(data=df_pca, cluster=kmed_clusters), geom="point", ellipse.type="convex",
palette=palet[1:K_OPTIMAL], ggtheme=theme_minimal(), main="K-Medians",
pointsize=0.5, alpha=0.5)

# ── DBSCAN ──
MINPTS_VAL <- 2*n_pc
k_sorted <- sort(kNNdist(df_pca, k=MINPTS_VAL))
n_pts <- length(k_sorted)
x_norm <- (seq_along(k_sorted)-1)/(n_pts-1)
y_norm <- (k_sorted-min(k_sorted))/(max(k_sorted)-min(k_sorted))
jarak <- abs(y_norm-x_norm)/sqrt(2)
bawah <- floor(0.10*n_pts); atas <- floor(0.90*n_pts)
knee <- which.max(jarak[bawah:atas])+bawah-1
EPS_VALUE <- round(k_sorted[knee],2)
cat("DBSCAN eps:", EPS_VALUE, "| MinPts:", MINPTS_VAL, "\n")
## DBSCAN eps: 1.61 | MinPts: 14
plot(k_sorted, type="l", col="#378ADD", lwd=1.5,
xlab="Data (terurut)", ylab=paste0("Jarak ke-",MINPTS_VAL," tetangga"),
main=paste0("K-Distance Plot — eps=",EPS_VALUE))
abline(h=EPS_VALUE, lty=2, col="#D85A30"); points(knee, k_sorted[knee], pch=19, col="#D85A30", cex=1.2)

db_result <- dbscan::dbscan(df_pca, eps=EPS_VALUE, minPts=MINPTS_VAL)
cat("DBSCAN Cluster:", length(unique(db_result$cluster[db_result$cluster>0])), "| Noise:", sum(db_result$cluster==0), "\n")
## DBSCAN Cluster: 1 | Noise: 490
print(table(db_result$cluster))
##
## 0 1
## 490 8460
lvl_db <- as.character(sort(unique(db_result$cluster)))
col_db <- ifelse(lvl_db=="0", noise_col, palet[as.integer(lvl_db) %% length(palet) + 1])
ggplot(data.frame(PC1=df_pca$PC1, PC2=df_pca$PC2, cluster=as.factor(db_result$cluster)),
aes(PC1,PC2,color=cluster)) +
geom_point(size=0.5, alpha=0.45) +
scale_color_manual(values=col_db, labels=ifelse(lvl_db=="0","Noise",paste0("Cluster ",lvl_db))) +
labs(title=paste0("DBSCAN (eps=",EPS_VALUE,", minPts=",MINPTS_VAL,")"), color="Cluster") +
theme_minimal()

# ── MEAN SHIFT ──
cat("Mean Shift — subsample 800...\n")
## Mean Shift — subsample 800...
set.seed(42)
idx_ms <- sample(nrow(df_pca), 800)
bw_vec <- rep(2, ncol(df_pca))
ms_result <- meanShift(as.matrix(df_pca[idx_ms,]), bandwidth=bw_vec, epsilon=0.01)
ms_centers <- unique(ms_result$value)
ms_raw <- apply(as.matrix(df_pca), 1, function(x) which.min(colSums((t(ms_centers)-x)^2)))
# Paksa max K_OPTIMAL cluster agar warna cukup
ms_clusters <- ifelse(ms_raw > K_OPTIMAL, K_OPTIMAL, ms_raw)
cat("Mean Shift Cluster (setelah dipadatkan):", length(unique(ms_clusters)), "\n")
## Mean Shift Cluster (setelah dipadatkan): 2
print(table(ms_clusters))
## ms_clusters
## 1 4
## 2669 6281
ggplot(data.frame(PC1=df_pca$PC1, PC2=df_pca$PC2, cluster=as.factor(ms_clusters)),
aes(PC1,PC2,color=cluster)) +
geom_point(size=0.5, alpha=0.45) +
scale_color_manual(values=palet[1:length(unique(ms_clusters))]) +
labs(title=paste0("Mean Shift (",n_pc," PC, bw=2)"), color="Cluster") +
theme_minimal()

# ── FUZZY C-MEANS ──
fcm_result <- cmeans(as.matrix(df_pca), centers=K_OPTIMAL, m=2, iter.max=50, verbose=FALSE, dist="euclidean")
fcm_clusters <- fcm_result$cluster
cat("FCM — Anggota:\n"); print(table(fcm_clusters))
## FCM — Anggota:
## fcm_clusters
## 1 2 3 4
## 2330 1650 1639 3331
cat("Objective function:", round(fcm_result$withinerror,4), "\n")
## Objective function: 3.2516
print(round(head(fcm_result$membership,5),4))
## 1 2 3 4
## [1,] 0.1267 0.0780 0.0561 0.7392
## [2,] 0.0829 0.6930 0.0736 0.1505
## [3,] 0.2442 0.1699 0.3723 0.2136
## [4,] 0.1463 0.1818 0.1095 0.5625
## [5,] 0.0880 0.0697 0.0443 0.7980
fviz_cluster(list(data=df_pca, cluster=fcm_clusters), geom="point", ellipse.type="convex",
palette=palet[1:K_OPTIMAL], ggtheme=theme_minimal(),
main=paste0("Fuzzy C-Means (m=2, K=",K_OPTIMAL,")"), pointsize=0.5, alpha=0.5)

Evaluasi Metrik
hitung_sil <- function(label, data){
label <- as.integer(label); idx <- label>0
if(length(unique(label[idx]))<2) return(NA)
round(mean(silhouette(label[idx], dist(data[idx,]))[,3]),4)
}
hitung_dunn <- function(label, data){
label <- as.integer(label); idx <- label>0
if(length(unique(label[idx]))<2) return(NA)
tryCatch(round(fpc::cluster.stats(dist(data[idx,]),label[idx])$dunn,4), error=function(e) NA)
}
set.seed(42)
idx_eval <- sample(nrow(df_pca), min(800,nrow(df_pca)))
df_eval <- df_pca[idx_eval,]
labels_list <- list(
"K-Means" = km_result$cluster[idx_eval],
"K-Medians" = kmed_clusters[idx_eval],
"DBSCAN" = db_result$cluster[idx_eval],
"Mean Shift" = ms_clusters[idx_eval],
"Fuzzy C-Means" = fcm_clusters[idx_eval]
)
hasil_eval <- purrr::imap_dfr(labels_list, function(lbl,nm)
data.frame(Metode=nm, N_Cluster=length(unique(lbl[lbl>0])), N_Noise=sum(lbl==0),
Silhouette=hitung_sil(lbl,df_eval), Dunn_Index=hitung_dunn(lbl,df_eval)))
knitr::kable(hasil_eval, digits=4, caption="Perbandingan Metrik Evaluasi")
Perbandingan Metrik Evaluasi
| K-Means |
4 |
0 |
0.2593 |
0.0082 |
| K-Medians |
4 |
0 |
0.2237 |
0.0084 |
| DBSCAN |
1 |
45 |
NA |
NA |
| Mean Shift |
2 |
0 |
0.1472 |
0.0084 |
| Fuzzy C-Means |
4 |
0 |
0.2113 |
0.0102 |
hasil_eval %>% filter(!is.na(Silhouette)) %>% mutate(Metode=factor(Metode,levels=Metode)) %>%
ggplot(aes(x=Metode, y=Silhouette, fill=Metode)) +
geom_col(width=0.6, show.legend=FALSE) +
geom_text(aes(label=sprintf("%.4f",Silhouette)), vjust=-0.5, size=3.5) +
scale_fill_manual(values=palet[1:5]) +
scale_y_continuous(limits=c(0, max(hasil_eval$Silhouette,na.rm=TRUE)*1.15)) +
labs(title="Perbandingan Silhouette Width", x=NULL, y="Avg Silhouette") +
theme_minimal() + theme(axis.text.x=element_text(angle=15,hjust=1))
