INSTALL PACKAGES

## LOAD LIBRARY
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:flexclust':
## 
##     bclust
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(mclust)
## Warning: package 'mclust' was built under R version 4.5.3
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## The following object is masked from 'package:purrr':
## 
##     map
## 
## The following object is masked from 'package:htmltools':
## 
##     em

LOAD DATA

data = read.csv("CC GENERAL.csv")

head(data)
##   CUST_ID    BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES
## 1  C10001   40.90075          0.818182     95.40             0.00
## 2  C10002 3202.46742          0.909091      0.00             0.00
## 3  C10003 2495.14886          1.000000    773.17           773.17
## 4  C10004 1666.67054          0.636364   1499.00          1499.00
## 5  C10005  817.71434          1.000000     16.00            16.00
## 6  C10006 1809.82875          1.000000   1333.28             0.00
##   INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY
## 1                  95.40        0.000            0.166667
## 2                   0.00     6442.945            0.000000
## 3                   0.00        0.000            1.000000
## 4                   0.00      205.788            0.083333
## 5                   0.00        0.000            0.083333
## 6                1333.28        0.000            0.666667
##   ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY
## 1                   0.000000                         0.083333
## 2                   0.000000                         0.000000
## 3                   1.000000                         0.000000
## 4                   0.083333                         0.000000
## 5                   0.083333                         0.000000
## 6                   0.000000                         0.583333
##   CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT  PAYMENTS
## 1               0.000000                0             2         1000  201.8021
## 2               0.250000                4             0         7000 4103.0326
## 3               0.000000                0            12         7500  622.0667
## 4               0.083333                1             1         7500    0.0000
## 5               0.000000                0             1         1200  678.3348
## 6               0.000000                0             8         1800 1400.0578
##   MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
## 1         139.5098         0.000000     12
## 2        1072.3402         0.222222     12
## 3         627.2848         0.000000     12
## 4               NA         0.000000     12
## 5         244.7912         0.000000     12
## 6        2407.2460         0.000000     12
str(data)
## 'data.frame':    8950 obs. of  18 variables:
##  $ CUST_ID                         : chr  "C10001" "C10002" "C10003" "C10004" ...
##  $ BALANCE                         : num  40.9 3202.5 2495.1 1666.7 817.7 ...
##  $ BALANCE_FREQUENCY               : num  0.818 0.909 1 0.636 1 ...
##  $ PURCHASES                       : num  95.4 0 773.2 1499 16 ...
##  $ ONEOFF_PURCHASES                : num  0 0 773 1499 16 ...
##  $ INSTALLMENTS_PURCHASES          : num  95.4 0 0 0 0 ...
##  $ CASH_ADVANCE                    : num  0 6443 0 206 0 ...
##  $ PURCHASES_FREQUENCY             : num  0.1667 0 1 0.0833 0.0833 ...
##  $ ONEOFF_PURCHASES_FREQUENCY      : num  0 0 1 0.0833 0.0833 ...
##  $ PURCHASES_INSTALLMENTS_FREQUENCY: num  0.0833 0 0 0 0 ...
##  $ CASH_ADVANCE_FREQUENCY          : num  0 0.25 0 0.0833 0 ...
##  $ CASH_ADVANCE_TRX                : int  0 4 0 1 0 0 0 0 0 0 ...
##  $ PURCHASES_TRX                   : int  2 0 12 1 1 8 64 12 5 3 ...
##  $ CREDIT_LIMIT                    : num  1000 7000 7500 7500 1200 1800 13500 2300 7000 11000 ...
##  $ PAYMENTS                        : num  202 4103 622 0 678 ...
##  $ MINIMUM_PAYMENTS                : num  140 1072 627 NA 245 ...
##  $ PRC_FULL_PAYMENT                : num  0 0.222 0 0 0 ...
##  $ TENURE                          : int  12 12 12 12 12 12 12 12 12 12 ...

KARAKTERISTIK DATA

summary(data)
##    CUST_ID             BALANCE        BALANCE_FREQUENCY   PURCHASES       
##  Length:8950        Min.   :    0.0   Min.   :0.0000    Min.   :    0.00  
##  Class :character   1st Qu.:  128.3   1st Qu.:0.8889    1st Qu.:   39.63  
##  Mode  :character   Median :  873.4   Median :1.0000    Median :  361.28  
##                     Mean   : 1564.5   Mean   :0.8773    Mean   : 1003.21  
##                     3rd Qu.: 2054.1   3rd Qu.:1.0000    3rd Qu.: 1110.13  
##                     Max.   :19043.1   Max.   :1.0000    Max.   :49039.57  
##                                                                           
##  ONEOFF_PURCHASES  INSTALLMENTS_PURCHASES  CASH_ADVANCE     PURCHASES_FREQUENCY
##  Min.   :    0.0   Min.   :    0.0        Min.   :    0.0   Min.   :0.00000    
##  1st Qu.:    0.0   1st Qu.:    0.0        1st Qu.:    0.0   1st Qu.:0.08333    
##  Median :   38.0   Median :   89.0        Median :    0.0   Median :0.50000    
##  Mean   :  592.4   Mean   :  411.1        Mean   :  978.9   Mean   :0.49035    
##  3rd Qu.:  577.4   3rd Qu.:  468.6        3rd Qu.: 1113.8   3rd Qu.:0.91667    
##  Max.   :40761.2   Max.   :22500.0        Max.   :47137.2   Max.   :1.00000    
##                                                                                
##  ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY
##  Min.   :0.00000            Min.   :0.0000                  
##  1st Qu.:0.00000            1st Qu.:0.0000                  
##  Median :0.08333            Median :0.1667                  
##  Mean   :0.20246            Mean   :0.3644                  
##  3rd Qu.:0.30000            3rd Qu.:0.7500                  
##  Max.   :1.00000            Max.   :1.0000                  
##                                                             
##  CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX  PURCHASES_TRX     CREDIT_LIMIT  
##  Min.   :0.0000         Min.   :  0.000   Min.   :  0.00   Min.   :   50  
##  1st Qu.:0.0000         1st Qu.:  0.000   1st Qu.:  1.00   1st Qu.: 1600  
##  Median :0.0000         Median :  0.000   Median :  7.00   Median : 3000  
##  Mean   :0.1351         Mean   :  3.249   Mean   : 14.71   Mean   : 4494  
##  3rd Qu.:0.2222         3rd Qu.:  4.000   3rd Qu.: 17.00   3rd Qu.: 6500  
##  Max.   :1.5000         Max.   :123.000   Max.   :358.00   Max.   :30000  
##                                                            NA's   :1      
##     PAYMENTS       MINIMUM_PAYMENTS    PRC_FULL_PAYMENT     TENURE     
##  Min.   :    0.0   Min.   :    0.019   Min.   :0.0000   Min.   : 6.00  
##  1st Qu.:  383.3   1st Qu.:  169.124   1st Qu.:0.0000   1st Qu.:12.00  
##  Median :  856.9   Median :  312.344   Median :0.0000   Median :12.00  
##  Mean   : 1733.1   Mean   :  864.207   Mean   :0.1537   Mean   :11.52  
##  3rd Qu.: 1901.1   3rd Qu.:  825.485   3rd Qu.:0.1429   3rd Qu.:12.00  
##  Max.   :50721.5   Max.   :76406.208   Max.   :1.0000   Max.   :12.00  
##                    NA's   :313
df_long = data[, -1] %>%
  scale() %>%
  as.data.frame() %>%
  pivot_longer(cols = everything(), names_to = "Variabel", values_to = "Nilai")

ggplot(df_long, aes(x = Variabel, y = Nilai)) +
  geom_boxplot(fill = "lightgray") +
  coord_cartesian(ylim = c(-3, 5)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8))
## Warning: Removed 314 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## PREPROCESSING

# Menghapus kolom ID
data = data[, -1]

#Imputasi missing values dengan median
for(i in 1:ncol(data)){
  data[is.na(data[,i]), i] = median(data[,i], na.rm = TRUE)
}

# Stanndarrisasi
df = scale(data)
set.seed(123)

MENENTUKAN JUMLAH CLUSTER

# Elbow Method
wss = sapply(1:10, function(k){
  kmeans(df, centers = k, nstart = 20)$tot.withinss
})

plot(1:10, wss, type = "b", pch = 19,
     xlab = "Jumlah Cluster",
     ylab = "Total Within SS",
     main = "Elbow Method")

# Silhouette Method
df_sample = df[1:1000, ]

avg_sil = function(k) {
  km_res = kmeans(df_sample, centers = k, nstart = 25)
  ss = silhouette(km_res$cluster, dist(df_sample))
  mean(ss[, 3])
}

k_values = 2:10
avg_sil_values = sapply(k_values, avg_sil)

plot(k_values, avg_sil_values, type = "b", pch = 19,
     xlab = "Jumlah Cluster",
     ylab = "Silhouette",
     main = "Silhouette Method")

## CLUSTERING

# K-Means (k = 3)
km_res = kmeans(df, centers = 3)

#K-Median (k = 3)
kmed_res = flexclust::kcca(df, k = 3, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# DBSCAN
db_res = dbscan(df, eps = 0.5, MinPts = 5)

# DBSCAN
df_small = df[1:500,]
ms_res = meanShift(df_small)

# Mean Shift
fcm_res = cmeans(df, centers = 3, m = 2)

VISUALISASI HASIL CLUSTERING

par(mfrow = c(2,3))

plot(df[,1:2], col = km_res$cluster, main = "K-Means")
plot(df[,1:2], col = clusters(kmed_res), main = "K-Median")
plot(df[,1:2], col = db_res$cluster + 1, main = "DBSCAN")
plot(df_small[,1:2], col = ms_res$assignment, main = "Mean Shift")
plot(df[,1:2], col = fcm_res$cluster, main = "Fuzzy C-Means")

## EVALUASI CLUSTERING

# Silhouette Score
sil_km = mean(silhouette(km_res$cluster[1:1000], dist(df_sample))[,3])

sil_kmed = mean(silhouette(clusters(kmed_res)[1:1000], dist(df_sample))[,3])

# Silhouette DBSCAN (exclude noise)
db_cluster = db_res$cluster
df_db = df[db_cluster != 0,]
db_cluster = db_cluster[db_cluster != 0]

if(length(db_cluster) > 10){
  df_db_sample = df_db[1:min(1000, nrow(df_db)),]
  db_cluster_sample = db_cluster[1:min(1000, length(db_cluster))]
  sil_db = mean(silhouette(db_cluster_sample, dist(df_db_sample))[,3])
} else {
  sil_db = NA
}

sil_fcm = mean(silhouette(fcm_res$cluster[1:1000], dist(df_sample))[,3])

sil_ms = mean(silhouette(ms_res$assignment, dist(df_small))[,3])

# Dunn Index dan Calinski-Harabasz (K-Means)
stats = cluster.stats(dist(df_sample), km_res$cluster[1:1000])
dunn = stats$dunn

ch = calinhara(df_sample, km_res$cluster[1:1000])

# Tabel Hasil Evaluasi
hasil_eval = data.frame(
  Metode = c("KMeans","KMedian","DBSCAN","Fuzzy","MeanShift"),
  Silhouette = c(sil_km, sil_kmed, sil_db, sil_fcm, sil_ms)
)

hasil_eval
##      Metode  Silhouette
## 1    KMeans  0.17108926
## 2   KMedian  0.08144866
## 3    DBSCAN -0.36431719
## 4     Fuzzy  0.07962635
## 5 MeanShift -0.11062394
dunn
## [1] 0.01300959
ch
## [1] 163.6733

Analisis Data Eksploratif

data_clustered = cbind(data, cluster = km_res$cluster)

table(km_res$cluster)
## 
##    1    2    3 
## 1242 3149 4559
aggregate(. ~ cluster, data = data_clustered, mean)
##   cluster   BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES
## 1       1 4544.7008         0.9685803  578.2584         363.5475
## 2       2 1289.0834         0.9684482 2185.6680        1239.9339
## 3       3  942.7962         0.7894172  302.2193         207.5536
##   INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY
## 1              214.79273    4510.4354           0.2858484
## 2              946.12293     235.2755           0.9266214
## 3               94.96419     530.3911           0.2447209
##   ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY
## 1                 0.13669569                        0.1869415
## 2                 0.38927895                        0.7475960
## 3                 0.09133161                        0.1481362
##   CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT  PAYMENTS
## 1             0.48283117       14.2584541      8.374396     7465.865 3584.6183
## 2             0.04309574        0.8409019     32.734519     5165.467 2168.6085
## 3             0.10400434        1.9127002      3.985742     3221.138  927.9652
##   MINIMUM_PAYMENTS PRC_FULL_PAYMENT   TENURE
## 1        2093.1295       0.03646342 11.38889
## 2         794.7562       0.28766458 11.74341
## 3         539.4958       0.09313507 11.39614