Dataset: Credit Card Data Intermediate. Link dataset: https://www.kaggle.com/datasets/samira1992/credit-card-data-intermediate-dataset

Variabel

X1 = BALANCE X2 = BALANCE_FREQUENCY X3 = PURCHASES X4 = ONEOFF_PURCHASES X5 = INSTALLMENTS_PURCHASES X6 = CASH_ADVANCE X7 = PURCHASES_FREQUENCY X8 = ONEOFF_PURCHASES_FREQUENCY X9 = PURCHASES_INSTALLMENTS_FREQUENCY X10 = CASH_ADVANCE_FREQUENCY X11 = CASH_ADVANCE_TRX X12 = PURCHASES_TRX X13 = CREDIT_LIMIT X14 = PAYMENTS X15 = MINIMUM_PAYMENTS X16 = PRC_FULL_PAYMENT X17 = TENURE

Import Library

library("tidyverse")
library("flexclust")
library("dbscan")
library("meanShiftR")
library("e1071")
library("cluster")
library("fpc")
library("mclust")

Import Data

# Load the customer credit-card table from a comma-separated file whose first
# row holds the column names, then print per-column summary statistics.
data <- read.csv(
  file = "D:/BIODATA KAFKA/Sem 4/Customer_Data.csv",
  header = TRUE,
  sep = ","
)
summary(data)
##    CUST_ID             BALANCE        BALANCE_FREQUENCY   PURCHASES       
##  Length:8950        Min.   :    0.0   Min.   :0.0000    Min.   :    0.00  
##  Class :character   1st Qu.:  128.3   1st Qu.:0.8889    1st Qu.:   39.63  
##  Mode  :character   Median :  873.4   Median :1.0000    Median :  361.28  
##                     Mean   : 1564.5   Mean   :0.8773    Mean   : 1003.21  
##                     3rd Qu.: 2054.1   3rd Qu.:1.0000    3rd Qu.: 1110.13  
##                     Max.   :19043.1   Max.   :1.0000    Max.   :49039.57  
##                                                                           
##  ONEOFF_PURCHASES  INSTALLMENTS_PURCHASES  CASH_ADVANCE     PURCHASES_FREQUENCY
##  Min.   :    0.0   Min.   :    0.0        Min.   :    0.0   Min.   :0.00000    
##  1st Qu.:    0.0   1st Qu.:    0.0        1st Qu.:    0.0   1st Qu.:0.08333    
##  Median :   38.0   Median :   89.0        Median :    0.0   Median :0.50000    
##  Mean   :  592.4   Mean   :  411.1        Mean   :  978.9   Mean   :0.49035    
##  3rd Qu.:  577.4   3rd Qu.:  468.6        3rd Qu.: 1113.8   3rd Qu.:0.91667    
##  Max.   :40761.2   Max.   :22500.0        Max.   :47137.2   Max.   :1.00000    
##                                                                                
##  ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY
##  Min.   :0.00000            Min.   :0.0000                  
##  1st Qu.:0.00000            1st Qu.:0.0000                  
##  Median :0.08333            Median :0.1667                  
##  Mean   :0.20246            Mean   :0.3644                  
##  3rd Qu.:0.30000            3rd Qu.:0.7500                  
##  Max.   :1.00000            Max.   :1.0000                  
##                                                             
##  CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX  PURCHASES_TRX     CREDIT_LIMIT  
##  Min.   :0.0000         Min.   :  0.000   Min.   :  0.00   Min.   :   50  
##  1st Qu.:0.0000         1st Qu.:  0.000   1st Qu.:  1.00   1st Qu.: 1600  
##  Median :0.0000         Median :  0.000   Median :  7.00   Median : 3000  
##  Mean   :0.1351         Mean   :  3.249   Mean   : 14.71   Mean   : 4494  
##  3rd Qu.:0.2222         3rd Qu.:  4.000   3rd Qu.: 17.00   3rd Qu.: 6500  
##  Max.   :1.5000         Max.   :123.000   Max.   :358.00   Max.   :30000  
##                                                            NA's   :1      
##     PAYMENTS       MINIMUM_PAYMENTS    PRC_FULL_PAYMENT     TENURE     
##  Min.   :    0.0   Min.   :    0.019   Min.   :0.0000   Min.   : 6.00  
##  1st Qu.:  383.3   1st Qu.:  169.124   1st Qu.:0.0000   1st Qu.:12.00  
##  Median :  856.9   Median :  312.344   Median :0.0000   Median :12.00  
##  Mean   : 1733.1   Mean   :  864.207   Mean   :0.1537   Mean   :11.52  
##  3rd Qu.: 1901.1   3rd Qu.:  825.485   3rd Qu.:0.1429   3rd Qu.:12.00  
##  Max.   :50721.5   Max.   :76406.208   Max.   :1.0000   Max.   :12.00  
##                    NA's   :313

Pre Processing

# Drop the first column (the CUST_ID identifier), which is not a numeric
# feature.
data_num <- data[, -1L]
# Remove every row that contains at least one missing value.
data_clean <- na.omit(data_num)
# Centre each column to mean 0 and scale it to unit standard deviation;
# the result is a numeric matrix.
data_scaled <- scale(data_clean)

Menentukan K Terbaik dengan Metode Silhouette

# Choose the number of clusters k by maximising the average silhouette width
# of a k-means partition of `data_scaled`.

# Fix the RNG seed so the random k-means starts (and therefore the selected k)
# are identical on every run.
set.seed(123)

# The pairwise distance matrix does not depend on k, so it is computed once
# here instead of once per candidate k inside avg_sil().
dist_scaled <- dist(data_scaled)

# Average silhouette width of a k-means clustering of `data_scaled`.
#
# k: number of cluster centres to fit.
# Returns a single numeric value: the mean of the third column (the
# silhouette width) of the silhouette matrix.
avg_sil <- function(k) {
  km <- kmeans(data_scaled, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist_scaled)
  mean(ss[, 3])
}

k_values <- 2:10
# vapply() guarantees a numeric vector with exactly one value per candidate k.
sil_values <- vapply(k_values, avg_sil, numeric(1))

plot(k_values, sil_values,
     type = "b",
     pch = 19,
     xlab = "Jumlah Cluster (k)",
     ylab = "Nilai Silhouette",
     main = "Silhouette Method")

# The candidate k with the largest average silhouette width.
best_k <- k_values[which.max(sil_values)]
best_k
## [1] 3

K-Means

# Final k-means partition with 3 clusters.
# A fixed seed plus 25 random starts (the same nstart used when selecting k)
# makes the result reproducible and less sensitive to a poor initialisation.
set.seed(123)
km_res <- kmeans(data_scaled, centers = 3, nstart = 25)

K-Medians

# K-medians clustering with 3 clusters, fitted through flexclust's kcca()
# using the "kmedians" family.
kmed_family <- flexclust::kccaFamily("kmedians")
kmed_res <- flexclust::kcca(x = data_scaled, k = 3, family = kmed_family)

DBScan

# Density-based clustering with neighbourhood radius eps = 0.7 and at least
# 5 points per core neighbourhood; label 0 marks noise points.
# Both the `dbscan` and `fpc` packages export a function called dbscan().
# `fpc` is attached last, so the bare name resolves to fpc::dbscan (which is
# also the one that spells the argument `MinPts`); qualify it explicitly so the
# call no longer depends on the order of the library() calls.
db_res <- fpc::dbscan(data = data_scaled, eps = 0.7, MinPts = 5)

Mean Shift

# Mean-shift clustering of the scaled data; every other argument of
# meanShift() is left at its package default.
ms_res <- meanShiftR::meanShift(queryData = data_scaled)

Fuzzy C Means

# Fuzzy c-means with 3 clusters and fuzziness exponent m = 2.
fcm_res <- e1071::cmeans(x = data_scaled, centers = 3, m = 2)

Visualisasi

# Compare the clusterings side by side in a 2 x 3 grid of scatter plots.
# plot() on a matrix uses only its first two columns, so every panel shows
# scaled BALANCE (x) against scaled BALANCE_FREQUENCY (y), coloured by the
# cluster label each method assigned.
# Keep the previous graphics settings so they can be restored afterwards.
old_par <- par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
plot(data_scaled, col = km_res$cluster, main = "K-means")
plot(data_scaled, col = clusters(kmed_res), main = "K-medians")
# DBSCAN labels noise as 0; shift by one so noise is drawn in colour 1.
plot(data_scaled, col = db_res$cluster + 1L, main = "DBSCAN (0 = Noise)")
plot(data_scaled, col = ms_res$assignment, main = "Mean Shift")
plot(data_scaled, col = fcm_res$cluster, main = "Fuzzy C-means")
# The customer data has no ground-truth classes (the previous panel coloured
# it with the unrelated, recycled iris species labels), so the last panel
# shows the unclustered points as a reference instead.
plot(data_scaled, main = "Original Data (no labels)")
par(old_par)

Eksplorasi

Metrik Evaluasi (Silhouette)

# Average silhouette width of the final k-means partition, computed from the
# Euclidean distances between the scaled observations.
mean(
  cluster::silhouette(
    x = km_res$cluster,
    dist = dist(data_scaled)
  )[, "sil_width"]
)
## [1] 0.1798837

Dunn-Index

# Cluster validation statistics for the k-means partition, computed by
# fpc's cluster.stats() from the pairwise distance matrix.
stats <- fpc::cluster.stats(
  d = dist(data_scaled),
  clustering = km_res$cluster
)
# Report the Dunn index.
paste("Dunn Index:", stats[["dunn"]])
## [1] "Dunn Index: 0.00311811597437616"
# Report the within-cluster sum of squares.
paste("Within-cluster SS:", stats[["within.cluster.ss"]])
## [1] "Within-cluster SS: 108672.584766356"