## LOAD LIBRARY
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(mclust)
## Warning: package 'mclust' was built under R version 4.5.3
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
##
## The following object is masked from 'package:dplyr':
##
## count
##
## The following object is masked from 'package:purrr':
##
## map
##
## The following object is masked from 'package:htmltools':
##
## em
data = read.csv("CC GENERAL.csv")
head(data)
## CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES
## 1 C10001 40.90075 0.818182 95.40 0.00
## 2 C10002 3202.46742 0.909091 0.00 0.00
## 3 C10003 2495.14886 1.000000 773.17 773.17
## 4 C10004 1666.67054 0.636364 1499.00 1499.00
## 5 C10005 817.71434 1.000000 16.00 16.00
## 6 C10006 1809.82875 1.000000 1333.28 0.00
## INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY
## 1 95.40 0.000 0.166667
## 2 0.00 6442.945 0.000000
## 3 0.00 0.000 1.000000
## 4 0.00 205.788 0.083333
## 5 0.00 0.000 0.083333
## 6 1333.28 0.000 0.666667
## ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY
## 1 0.000000 0.083333
## 2 0.000000 0.000000
## 3 1.000000 0.000000
## 4 0.083333 0.000000
## 5 0.083333 0.000000
## 6 0.000000 0.583333
## CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS
## 1 0.000000 0 2 1000 201.8021
## 2 0.250000 4 0 7000 4103.0326
## 3 0.000000 0 12 7500 622.0667
## 4 0.083333 1 1 7500 0.0000
## 5 0.000000 0 1 1200 678.3348
## 6 0.000000 0 8 1800 1400.0578
## MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
## 1 139.5098 0.000000 12
## 2 1072.3402 0.222222 12
## 3 627.2848 0.000000 12
## 4 NA 0.000000 12
## 5 244.7912 0.000000 12
## 6 2407.2460 0.000000 12
str(data)
## 'data.frame': 8950 obs. of 18 variables:
## $ CUST_ID : chr "C10001" "C10002" "C10003" "C10004" ...
## $ BALANCE : num 40.9 3202.5 2495.1 1666.7 817.7 ...
## $ BALANCE_FREQUENCY : num 0.818 0.909 1 0.636 1 ...
## $ PURCHASES : num 95.4 0 773.2 1499 16 ...
## $ ONEOFF_PURCHASES : num 0 0 773 1499 16 ...
## $ INSTALLMENTS_PURCHASES : num 95.4 0 0 0 0 ...
## $ CASH_ADVANCE : num 0 6443 0 206 0 ...
## $ PURCHASES_FREQUENCY : num 0.1667 0 1 0.0833 0.0833 ...
## $ ONEOFF_PURCHASES_FREQUENCY : num 0 0 1 0.0833 0.0833 ...
## $ PURCHASES_INSTALLMENTS_FREQUENCY: num 0.0833 0 0 0 0 ...
## $ CASH_ADVANCE_FREQUENCY : num 0 0.25 0 0.0833 0 ...
## $ CASH_ADVANCE_TRX : int 0 4 0 1 0 0 0 0 0 0 ...
## $ PURCHASES_TRX : int 2 0 12 1 1 8 64 12 5 3 ...
## $ CREDIT_LIMIT : num 1000 7000 7500 7500 1200 1800 13500 2300 7000 11000 ...
## $ PAYMENTS : num 202 4103 622 0 678 ...
## $ MINIMUM_PAYMENTS : num 140 1072 627 NA 245 ...
## $ PRC_FULL_PAYMENT : num 0 0.222 0 0 0 ...
## $ TENURE : int 12 12 12 12 12 12 12 12 12 12 ...
summary(data)
## CUST_ID BALANCE BALANCE_FREQUENCY PURCHASES
## Length:8950 Min. : 0.0 Min. :0.0000 Min. : 0.00
## Class :character 1st Qu.: 128.3 1st Qu.:0.8889 1st Qu.: 39.63
## Mode :character Median : 873.4 Median :1.0000 Median : 361.28
## Mean : 1564.5 Mean :0.8773 Mean : 1003.21
## 3rd Qu.: 2054.1 3rd Qu.:1.0000 3rd Qu.: 1110.13
## Max. :19043.1 Max. :1.0000 Max. :49039.57
##
## ONEOFF_PURCHASES INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. :0.00000
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.:0.08333
## Median : 38.0 Median : 89.0 Median : 0.0 Median :0.50000
## Mean : 592.4 Mean : 411.1 Mean : 978.9 Mean :0.49035
## 3rd Qu.: 577.4 3rd Qu.: 468.6 3rd Qu.: 1113.8 3rd Qu.:0.91667
## Max. :40761.2 Max. :22500.0 Max. :47137.2 Max. :1.00000
##
## ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.08333 Median :0.1667
## Mean :0.20246 Mean :0.3644
## 3rd Qu.:0.30000 3rd Qu.:0.7500
## Max. :1.00000 Max. :1.0000
##
## CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT
## Min. :0.0000 Min. : 0.000 Min. : 0.00 Min. : 50
## 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.: 1.00 1st Qu.: 1600
## Median :0.0000 Median : 0.000 Median : 7.00 Median : 3000
## Mean :0.1351 Mean : 3.249 Mean : 14.71 Mean : 4494
## 3rd Qu.:0.2222 3rd Qu.: 4.000 3rd Qu.: 17.00 3rd Qu.: 6500
## Max. :1.5000 Max. :123.000 Max. :358.00 Max. :30000
## NA's :1
## PAYMENTS MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
## Min. : 0.0 Min. : 0.019 Min. :0.0000 Min. : 6.00
## 1st Qu.: 383.3 1st Qu.: 169.124 1st Qu.:0.0000 1st Qu.:12.00
## Median : 856.9 Median : 312.344 Median :0.0000 Median :12.00
## Mean : 1733.1 Mean : 864.207 Mean :0.1537 Mean :11.52
## 3rd Qu.: 1901.1 3rd Qu.: 825.485 3rd Qu.:0.1429 3rd Qu.:12.00
## Max. :50721.5 Max. :76406.208 Max. :1.0000 Max. :12.00
## NA's :313
df_long = data[, -1] %>%
scale() %>%
as.data.frame() %>%
pivot_longer(cols = everything(), names_to = "Variabel", values_to = "Nilai")
ggplot(df_long, aes(x = Variabel, y = Nilai)) +
geom_boxplot(fill = "lightgray") +
coord_cartesian(ylim = c(-3, 5)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8))
## Warning: Removed 314 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## PREPROCESSING
# Menghapus kolom ID
data = data[, -1]
#Imputasi missing values dengan median
for(i in 1:ncol(data)){
data[is.na(data[,i]), i] = median(data[,i], na.rm = TRUE)
}
# Stanndarrisasi
df = scale(data)
set.seed(123)
# Elbow Method
wss = sapply(1:10, function(k){
kmeans(df, centers = k, nstart = 20)$tot.withinss
})
plot(1:10, wss, type = "b", pch = 19,
xlab = "Jumlah Cluster",
ylab = "Total Within SS",
main = "Elbow Method")
# Silhouette Method
df_sample = df[1:1000, ]
avg_sil = function(k) {
km_res = kmeans(df_sample, centers = k, nstart = 25)
ss = silhouette(km_res$cluster, dist(df_sample))
mean(ss[, 3])
}
k_values = 2:10
avg_sil_values = sapply(k_values, avg_sil)
plot(k_values, avg_sil_values, type = "b", pch = 19,
xlab = "Jumlah Cluster",
ylab = "Silhouette",
main = "Silhouette Method")
## CLUSTERING
# K-Means (k = 3)
km_res = kmeans(df, centers = 3)
#K-Median (k = 3)
kmed_res = flexclust::kcca(df, k = 3, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# DBSCAN
db_res = dbscan(df, eps = 0.5, MinPts = 5)
# DBSCAN
df_small = df[1:500,]
ms_res = meanShift(df_small)
# Mean Shift
fcm_res = cmeans(df, centers = 3, m = 2)
par(mfrow = c(2,3))
plot(df[,1:2], col = km_res$cluster, main = "K-Means")
plot(df[,1:2], col = clusters(kmed_res), main = "K-Median")
plot(df[,1:2], col = db_res$cluster + 1, main = "DBSCAN")
plot(df_small[,1:2], col = ms_res$assignment, main = "Mean Shift")
plot(df[,1:2], col = fcm_res$cluster, main = "Fuzzy C-Means")
## EVALUASI CLUSTERING
# Silhouette Score
sil_km = mean(silhouette(km_res$cluster[1:1000], dist(df_sample))[,3])
sil_kmed = mean(silhouette(clusters(kmed_res)[1:1000], dist(df_sample))[,3])
# Silhouette DBSCAN (exclude noise)
db_cluster = db_res$cluster
df_db = df[db_cluster != 0,]
db_cluster = db_cluster[db_cluster != 0]
if(length(db_cluster) > 10){
df_db_sample = df_db[1:min(1000, nrow(df_db)),]
db_cluster_sample = db_cluster[1:min(1000, length(db_cluster))]
sil_db = mean(silhouette(db_cluster_sample, dist(df_db_sample))[,3])
} else {
sil_db = NA
}
sil_fcm = mean(silhouette(fcm_res$cluster[1:1000], dist(df_sample))[,3])
sil_ms = mean(silhouette(ms_res$assignment, dist(df_small))[,3])
# Dunn Index dan Calinski-Harabasz (K-Means)
stats = cluster.stats(dist(df_sample), km_res$cluster[1:1000])
dunn = stats$dunn
ch = calinhara(df_sample, km_res$cluster[1:1000])
# Tabel Hasil Evaluasi
hasil_eval = data.frame(
Metode = c("KMeans","KMedian","DBSCAN","Fuzzy","MeanShift"),
Silhouette = c(sil_km, sil_kmed, sil_db, sil_fcm, sil_ms)
)
hasil_eval
## Metode Silhouette
## 1 KMeans 0.17108926
## 2 KMedian 0.08144866
## 3 DBSCAN -0.36431719
## 4 Fuzzy 0.07962635
## 5 MeanShift -0.11062394
dunn
## [1] 0.01300959
ch
## [1] 163.6733
data_clustered = cbind(data, cluster = km_res$cluster)
table(km_res$cluster)
##
## 1 2 3
## 1242 3149 4559
aggregate(. ~ cluster, data = data_clustered, mean)
## cluster BALANCE BALANCE_FREQUENCY PURCHASES ONEOFF_PURCHASES
## 1 1 4544.7008 0.9685803 578.2584 363.5475
## 2 2 1289.0834 0.9684482 2185.6680 1239.9339
## 3 3 942.7962 0.7894172 302.2193 207.5536
## INSTALLMENTS_PURCHASES CASH_ADVANCE PURCHASES_FREQUENCY
## 1 214.79273 4510.4354 0.2858484
## 2 946.12293 235.2755 0.9266214
## 3 94.96419 530.3911 0.2447209
## ONEOFF_PURCHASES_FREQUENCY PURCHASES_INSTALLMENTS_FREQUENCY
## 1 0.13669569 0.1869415
## 2 0.38927895 0.7475960
## 3 0.09133161 0.1481362
## CASH_ADVANCE_FREQUENCY CASH_ADVANCE_TRX PURCHASES_TRX CREDIT_LIMIT PAYMENTS
## 1 0.48283117 14.2584541 8.374396 7465.865 3584.6183
## 2 0.04309574 0.8409019 32.734519 5165.467 2168.6085
## 3 0.10400434 1.9127002 3.985742 3221.138 927.9652
## MINIMUM_PAYMENTS PRC_FULL_PAYMENT TENURE
## 1 2093.1295 0.03646342 11.38889
## 2 794.7562 0.28766458 11.74341
## 3 539.4958 0.09313507 11.39614