Import Library

Pada tahap awal, dilakukan import beberapa library yang digunakan untuk proses analisis data, mulai dari preprocessing hingga visualisasi.

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(flexclust)     
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)        
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)   
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:flexclust':
## 
##     bclust
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan

Import Dataset

Dataset yang digunakan adalah Customer Personality Analysis yang diperoleh dari Kaggle.

# Read the tab-separated Kaggle export (header row, "." decimals, double-quote
# quoting, short rows padded, no comment character) and preview the first rows.
data <- read.table(
  file = "marketing_campaign.csv",
  header = TRUE,
  sep = "\t",
  quote = "\"",
  dec = ".",
  fill = TRUE,
  comment.char = ""
)
head(data, n = 6)
##     ID Year_Birth  Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524       1957 Graduation         Single  58138       0        0  04-09-2012
## 2 2174       1954 Graduation         Single  46344       1        1  08-03-2014
## 3 4141       1965 Graduation       Together  71613       0        0  21-08-2013
## 4 6182       1984 Graduation       Together  26646       1        0  10-02-2014
## 5 5324       1981        PhD        Married  58293       1        0  19-01-2014
## 6 7446       1967     Master       Together  62513       0        1  09-09-2013
##   Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1      58      635        88             546             172               88
## 2      38       11         1               6               2                1
## 3      26      426        49             127             111               21
## 4      26       11         4              20              10                3
## 5      94      173        43             118              46               27
## 6      16      520        42              98               0               42
##   MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1           88                 3               8                  10
## 2            6                 2               1                   1
## 3           42                 1               8                   2
## 4            5                 2               2                   0
## 5           15                 5               5                   3
## 6           14                 2               6                   4
##   NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1                 4                 7            0            0            0
## 2                 2                 5            0            0            0
## 3                10                 4            0            0            0
## 4                 4                 6            0            0            0
## 5                 6                 5            0            0            0
## 6                10                 6            0            0            0
##   AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1            0            0        0             3        11        1
## 2            0            0        0             3        11        0
## 3            0            0        0             3        11        0
## 4            0            0        0             3        11        0
## 5            0            0        0             3        11        0
## 6            0            0        0             3        11        0

Preprocessing Data

Tahap preprocessing dilakukan untuk memastikan data siap digunakan dalam proses clustering.

# Build the cleaned data set in one pipeline:
#   * drop the identifier and the two Z_* columns,
#   * parse the enrolment date (day-month-year),
#   * turn the two categorical columns into integer factor codes,
#   * fill missing Income values with the median of the observed incomes.
# NOTE(review): as.factor() orders levels alphabetically, so these codes do
# not reflect any real ordering of the categories; confirm this is acceptable
# for the distance-based methods used later.
data_clean <- data %>%
  select(-ID, -Z_CostContact, -Z_Revenue) %>%
  mutate(
    Dt_Customer = as.Date(Dt_Customer, format = "%d-%m-%Y"),
    Education = as.numeric(as.factor(Education)),
    Marital_Status = as.numeric(as.factor(Marital_Status)),
    Income = replace(Income, is.na(Income), median(Income, na.rm = TRUE))
  )

# Inspect the resulting column types.
str(data_clean)
## 'data.frame':    2240 obs. of  26 variables:
##  $ Year_Birth         : int  1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
##  $ Education          : num  3 3 3 3 5 4 3 5 5 5 ...
##  $ Marital_Status     : num  5 5 6 6 4 6 3 4 6 6 ...
##  $ Income             : num  58138 46344 71613 26646 58293 ...
##  $ Kidhome            : int  0 1 0 1 1 0 0 1 1 1 ...
##  $ Teenhome           : int  0 1 0 0 0 1 1 0 0 1 ...
##  $ Dt_Customer        : Date, format: "2012-09-04" "2014-03-08" ...
##  $ Recency            : int  58 38 26 26 94 16 34 32 19 68 ...
##  $ MntWines           : int  635 11 426 11 173 520 235 76 14 28 ...
##  $ MntFruits          : int  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntMeatProducts    : int  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntFishProducts    : int  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntSweetProducts   : int  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntGoldProds       : int  88 6 42 5 15 14 27 23 2 13 ...
##  $ NumDealsPurchases  : int  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebPurchases    : int  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumCatalogPurchases: int  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumStorePurchases  : int  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebVisitsMonth  : int  7 5 4 6 5 6 6 8 9 20 ...
##  $ AcceptedCmp3       : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp1       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Complain           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Response           : int  1 0 0 0 0 0 0 0 1 0 ...

Karakteristik Data

Statistika Deskriptif

Statistika deskriptif digunakan untuk melihat gambaran umum dari data yang digunakan.

# Per-column summary statistics (min, quartiles, median, mean, max) of the
# cleaned data.
summary(data_clean)
##    Year_Birth     Education     Marital_Status     Income      
##  Min.   :1893   Min.   :1.000   Min.   :1.00   Min.   :  1730  
##  1st Qu.:1959   1st Qu.:3.000   1st Qu.:4.00   1st Qu.: 35539  
##  Median :1970   Median :3.000   Median :5.00   Median : 51382  
##  Mean   :1969   Mean   :3.394   Mean   :4.73   Mean   : 52238  
##  3rd Qu.:1977   3rd Qu.:4.000   3rd Qu.:6.00   3rd Qu.: 68290  
##  Max.   :1996   Max.   :5.000   Max.   :8.00   Max.   :666666  
##     Kidhome          Teenhome       Dt_Customer            Recency     
##  Min.   :0.0000   Min.   :0.0000   Min.   :2012-07-30   Min.   : 0.00  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:2013-01-16   1st Qu.:24.00  
##  Median :0.0000   Median :0.0000   Median :2013-07-08   Median :49.00  
##  Mean   :0.4442   Mean   :0.5062   Mean   :2013-07-10   Mean   :49.11  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:2013-12-30   3rd Qu.:74.00  
##  Max.   :2.0000   Max.   :2.0000   Max.   :2014-06-29   Max.   :99.00  
##     MntWines         MntFruits     MntMeatProducts  MntFishProducts 
##  Min.   :   0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.00  
##  1st Qu.:  23.75   1st Qu.:  1.0   1st Qu.:  16.0   1st Qu.:  3.00  
##  Median : 173.50   Median :  8.0   Median :  67.0   Median : 12.00  
##  Mean   : 303.94   Mean   : 26.3   Mean   : 166.9   Mean   : 37.53  
##  3rd Qu.: 504.25   3rd Qu.: 33.0   3rd Qu.: 232.0   3rd Qu.: 50.00  
##  Max.   :1493.00   Max.   :199.0   Max.   :1725.0   Max.   :259.00  
##  MntSweetProducts  MntGoldProds    NumDealsPurchases NumWebPurchases 
##  Min.   :  0.00   Min.   :  0.00   Min.   : 0.000    Min.   : 0.000  
##  1st Qu.:  1.00   1st Qu.:  9.00   1st Qu.: 1.000    1st Qu.: 2.000  
##  Median :  8.00   Median : 24.00   Median : 2.000    Median : 4.000  
##  Mean   : 27.06   Mean   : 44.02   Mean   : 2.325    Mean   : 4.085  
##  3rd Qu.: 33.00   3rd Qu.: 56.00   3rd Qu.: 3.000    3rd Qu.: 6.000  
##  Max.   :263.00   Max.   :362.00   Max.   :15.000    Max.   :27.000  
##  NumCatalogPurchases NumStorePurchases NumWebVisitsMonth  AcceptedCmp3    
##  Min.   : 0.000      Min.   : 0.00     Min.   : 0.000    Min.   :0.00000  
##  1st Qu.: 0.000      1st Qu.: 3.00     1st Qu.: 3.000    1st Qu.:0.00000  
##  Median : 2.000      Median : 5.00     Median : 6.000    Median :0.00000  
##  Mean   : 2.662      Mean   : 5.79     Mean   : 5.317    Mean   :0.07277  
##  3rd Qu.: 4.000      3rd Qu.: 8.00     3rd Qu.: 7.000    3rd Qu.:0.00000  
##  Max.   :28.000      Max.   :13.00     Max.   :20.000    Max.   :1.00000  
##   AcceptedCmp4      AcceptedCmp5      AcceptedCmp1      AcceptedCmp2    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.07455   Mean   :0.07277   Mean   :0.06429   Mean   :0.01339  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##     Complain           Response     
##  Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :0.000000   Median :0.0000  
##  Mean   :0.009375   Mean   :0.1491  
##  3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :1.000000   Max.   :1.0000
# Mean, median and standard deviation of Income (already imputed, so no
# na.rm is needed); output columns are named <statistic>_income.
data_clean %>%
  summarise(
    across(
      Income,
      list(mean = mean, median = median, sd = sd),
      .names = "{.fn}_income"
    )
  )
##   mean_income median_income sd_income
## 1    52237.98       51381.5  25037.96

Visualisasi Plot

Visualisasi dilakukan untuk memahami pola distribusi data serta melihat sebaran nilai dan potensi pencilan (outlier) pada setiap variabel sebelum dilakukan proses clustering.

Distribusi Tiap Variabel

Pada tahap ini dilakukan pemeriksaan bentuk distribusi dari masing-masing variabel numerik menggunakan histogram.

# Keep only the numeric columns (this leaves out the Date column Dt_Customer).
data_num <- select(data_clean, where(is.numeric))

# Reshape to one row per (variable, value) pair so that each variable gets
# its own histogram panel with independent axes.
ggplot(
  pivot_longer(
    data_num,
    cols = everything(),
    names_to = "Variabel",
    values_to = "Nilai"
  ),
  aes(x = Nilai)
) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~Variabel, scales = "free") +
  labs(title = "Distribusi Tiap Variabel", x = "Nilai", y = "Frekuensi")

Berdasarkan visualisasi histogram, terlihat bahwa beberapa variabel memiliki distribusi yang tidak simetris (skewed), terutama pada variabel yang berkaitan dengan pendapatan dan pengeluaran. Hal ini menunjukkan adanya perbedaan karakteristik antar pelanggan yang cukup beragam.

Boxplot Karakteristik Data

Boxplot digunakan untuk melihat sebaran data berdasarkan kuartil serta mendeteksi adanya pencilan (outlier) pada setiap variabel.

# One horizontal boxplot per numeric variable, all on a shared value axis,
# to expose outliers and the differences in scale between variables.
ggplot(
  pivot_longer(
    data_num,
    cols = everything(),
    names_to = "Variabel",
    values_to = "Nilai"
  ),
  aes(x = Variabel, y = Nilai)
) +
  geom_boxplot(fill = "orange") +
  coord_flip() +
  labs(
    title = "Boxplot Karakteristik Data",
    x = "Variabel",
    y = "Nilai"
  )

Dari boxplot yang ditampilkan, terlihat bahwa terdapat beberapa pencilan (outlier) pada beberapa variabel. Selain itu, perbedaan rentang nilai antar variabel juga cukup signifikan, sehingga diperlukan proses normalisasi atau scaling sebelum dilakukan clustering agar hasil yang diperoleh lebih optimal.

Data Preparation untuk Clustering

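Sebelum dilakukan proses clustering, data perlu distandardisasi (z-score: setiap variabel dikurangi rata-ratanya lalu dibagi simpangan bakunya) menggunakan fungsi scale(), agar perbedaan skala antar variabel tidak mempengaruhi hasil pengelompokan.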

# Numeric columns only, then z-score each column (centre on the mean, divide
# by the standard deviation); scale() returns a numeric matrix.
data_num <- select(data_clean, where(is.numeric))
data_scaled <- scale(data_num, center = TRUE, scale = TRUE)

head(data_scaled, n = 6)
##      Year_Birth  Education Marital_Status     Income    Kidhome   Teenhome
## [1,] -0.9851248 -0.3500631      0.2509477  0.2356432 -0.8250334 -0.9296868
## [2,] -1.2354571 -0.3500631      0.2509477 -0.2354016  1.0323283  0.9067316
## [3,] -0.3175719 -0.3500631      1.1800764  0.7738261 -0.8250334 -0.9296868
## [4,]  1.2678662 -0.3500631      1.1800764 -1.0221272  1.0323283 -0.9296868
## [5,]  1.0175339  1.4280353     -0.6781810  0.2418338  1.0323283 -0.9296868
## [6,] -0.1506837  0.5389861      1.1800764  0.4103779 -0.8250334  0.9067316
##         Recency   MntWines  MntFruits MntMeatProducts MntFishProducts
## [1,]  0.3069707  0.9835616  1.5512306       1.6793274       2.4615974
## [2,] -0.3835785 -0.8702852 -0.6361591      -0.7130662      -0.6503040
## [3,] -0.7979081  0.3626418  0.5706766      -0.1769928       1.3449739
## [4,] -0.7979081 -0.8702852 -0.5607319      -0.6510412      -0.5038616
## [5,]  1.5499594 -0.3889980  0.4198221      -0.2168660       0.1551293
## [6,] -1.1431827  0.6419072  0.3946797      -0.3054732      -0.6869147
##      MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases
## [1,]      1.476170487   0.84301867         0.3493359       1.4089893
## [2,]     -0.631362202  -0.72884304        -0.1681988      -1.1101615
## [3,]     -0.146871929  -0.03875741        -0.6857335       1.4089893
## [4,]     -0.582913175  -0.74801209        -0.1681988      -0.7502828
## [5,]     -0.001524847  -0.55632164         1.3844054       0.3293533
## [6,]      0.361842858  -0.57549068        -0.1681988       0.6892320
##      NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3
## [1,]           2.5103297       -0.55066183         0.6937488   -0.2800777
## [2,]          -0.5685927       -1.16586508        -0.1304343   -0.2800777
## [3,]          -0.2264902        1.29494790        -0.5425259   -0.2800777
## [4,]          -0.9106951       -0.55066183         0.2816572   -0.2800777
## [5,]           0.1156123        0.06454141        -0.1304343   -0.2800777
## [6,]           0.4577148        1.29494790         0.2816572   -0.2800777
##      AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2    Complain   Response
## [1,]   -0.2837668   -0.2800777   -0.2620527   -0.1164843 -0.09725995  2.3883131
## [2,]   -0.2837668   -0.2800777   -0.2620527   -0.1164843 -0.09725995 -0.4185187
## [3,]   -0.2837668   -0.2800777   -0.2620527   -0.1164843 -0.09725995 -0.4185187
## [4,]   -0.2837668   -0.2800777   -0.2620527   -0.1164843 -0.09725995 -0.4185187
## [5,]   -0.2837668   -0.2800777   -0.2620527   -0.1164843 -0.09725995 -0.4185187
## [6,]   -0.2837668   -0.2800777   -0.2620527   -0.1164843 -0.09725995 -0.4185187

Menentukan Jumlah Cluster (K)

Digunakan metode Elbow dan Silhouette untuk menentukan jumlah cluster optimal.

Elbow Method

# Elbow plot: total within-cluster sum of squares for each candidate k.
# kmeans() starts from random centres, so the seed is fixed and several
# restarts are used (nstart is forwarded to kmeans) to make the curve
# reproducible and consistent with the final k-means fit.
set.seed(123)
fviz_nbclust(data_scaled, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method")

Silhouette Method

# Average silhouette width for each candidate k.
# kmeans() starts from random centres, so the seed is fixed and several
# restarts are used (nstart is forwarded to kmeans) to make the curve
# reproducible and consistent with the final k-means fit.
set.seed(123)
fviz_nbclust(data_scaled, kmeans, method = "silhouette", nstart = 25) +
  labs(title = "Silhouette Method")

Berdasarkan hasil pengujian menggunakan metode Elbow dan Silhouette, diperoleh bahwa jumlah cluster optimal berada pada nilai k = 2. Pada metode Elbow, terlihat adanya penurunan yang cukup signifikan hingga k = 2, kemudian cenderung melandai pada nilai k berikutnya. Sementara itu, metode Silhouette menunjukkan nilai tertinggi pada k = 2, yang menandakan bahwa pemisahan antar cluster pada jumlah tersebut paling baik dibandingkan dengan jumlah cluster lainnya. Oleh karena itu, pada penelitian ini digunakan jumlah cluster sebanyak dua untuk proses clustering selanjutnya.

Clustering 5 Metode

1. K-Means

# Fix the RNG so the random initial centres are reproducible, then fit a
# 2-cluster k-means with 25 random starts and keep the label vector.
set.seed(123)
km_res <- stats::kmeans(x = data_scaled, centers = 2, nstart = 25)
km_cluster <- km_res[["cluster"]]

2. K-Median

# K-medians with k = 2 through flexclust's kcca() and its "kmedians" family.
kmed_res <- flexclust::kcca(
  x = data_scaled,
  k = 2,
  family = flexclust::kccaFamily("kmedians")
)
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# Cluster label of every observation.
kmed_cluster <- flexclust::clusters(kmed_res)

3. DBSCAN

# Density-based clustering; the namespace prefix is required because fpc
# masks dbscan::dbscan on the search path.
db_res <- dbscan::dbscan(x = data_scaled, eps = 1.5, minPts = 5)

# Label 0 is recoded to NA so those observations are skipped by the
# evaluation helpers, which drop NA labels.
db_cluster <- replace(db_res$cluster, db_res$cluster == 0, NA)

4. Mean Shift

# Mean shift with the package defaults; the cluster labels are stored in the
# `assignment` element of the result.
ms_res <- meanShiftR::meanShift(data_scaled)
ms_cluster <- ms_res[["assignment"]]

5. Fuzzy C-Means

# Fuzzy c-means with two centres and fuzzifier m = 2; `cluster` holds the
# hard label assigned to each observation.
fcm_res <- e1071::cmeans(x = data_scaled, centers = 2, m = 2)
fcm_cluster <- fcm_res[["cluster"]]

Evaluasi Clustering

Silhouette Score

#' Mean silhouette width of a clustering.
#'
#' @param cluster Vector of integer cluster labels, one per row of `data`.
#'   `NA` marks observations to leave out of the evaluation.
#' @param data Numeric matrix or data frame holding the clustered rows.
#' @return The average silhouette width over the labelled observations, or
#'   `NA_real_` when it is undefined (fewer than two clusters, or as many
#'   clusters as labelled observations).
sil_score <- function(cluster, data) {
  idx <- which(!is.na(cluster))
  labels <- cluster[idx]

  # silhouette() returns a bare NA in these degenerate cases, which would
  # make the column extraction below fail; report NA explicitly instead.
  k <- length(unique(labels))
  if (k < 2 || k >= length(labels)) {
    return(NA_real_)
  }

  # drop = FALSE keeps a two-dimensional object even for a single column.
  sil <- cluster::silhouette(labels, dist(data[idx, , drop = FALSE]))
  mean(sil[, "sil_width"])
}

# Mean silhouette width of each clustering on the standardized data.
sil_km <- sil_score(cluster = km_cluster, data = data_scaled)
sil_kmed <- sil_score(cluster = kmed_cluster, data = data_scaled)
sil_db <- sil_score(cluster = db_cluster, data = data_scaled)
sil_ms <- sil_score(cluster = ms_cluster, data = data_scaled)
sil_fcm <- sil_score(cluster = fcm_cluster, data = data_scaled)

sil_km
## [1] 0.2420156
sil_kmed
## [1] 0.2239927
sil_db
## [1] 0.09492424
sil_ms
## [1] 0.03848401
sil_fcm
## [1] 0.2252527

Calinski-Harabasz

library(clusterSim)
## Warning: package 'clusterSim' was built under R version 4.5.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
#' Calinski-Harabasz index of a clustering (clusterSim's index.G1).
#'
#' @param cluster Vector of integer cluster labels, one per row of `data`.
#'   `NA` marks observations to leave out of the evaluation.
#' @param data Numeric matrix or data frame holding the clustered rows.
#' @return The index computed on the labelled observations with centroids as
#'   cluster representatives, or `NA_real_` when there are fewer than two
#'   clusters or as many clusters as labelled observations.
ch_index <- function(cluster, data) {
  idx <- which(!is.na(cluster))
  labels <- cluster[idx]

  # The index compares between- and within-cluster dispersion, so it needs
  # at least two clusters and at least one cluster with several members.
  k <- length(unique(labels))
  if (k < 2 || k >= length(labels)) {
    return(NA_real_)
  }

  # drop = FALSE keeps a two-dimensional object even for a single column.
  clusterSim::index.G1(
    data[idx, , drop = FALSE],
    labels,
    centrotypes = "centroids"
  )
}

# Calinski-Harabasz index of each clustering on the standardized data.
ch_km <- ch_index(cluster = km_cluster, data = data_scaled)
ch_kmed <- ch_index(cluster = kmed_cluster, data = data_scaled)
ch_db <- ch_index(cluster = db_cluster, data = data_scaled)
ch_ms <- ch_index(cluster = ms_cluster, data = data_scaled)
ch_fcm <- ch_index(cluster = fcm_cluster, data = data_scaled)

ch_km
## [1] 570.1716
ch_kmed
## [1] 542.9873
ch_db
## [1] 34.83134
ch_ms
## [1] 9.169468
ch_fcm
## [1] 563.7734

Perbandingan hasil

# Comparison table: one row per method with both evaluation indices.
# local() keeps the helper vectors out of the global environment.
hasil <- local({
  sil_by_method <- c(
    "K-Means" = sil_km,
    "K-Median" = sil_kmed,
    "DBSCAN" = sil_db,
    "Mean Shift" = sil_ms,
    "Fuzzy C-Means" = sil_fcm
  )
  ch_by_method <- c(ch_km, ch_kmed, ch_db, ch_ms, ch_fcm)

  data.frame(
    Metode = names(sil_by_method),
    Silhouette = unname(sil_by_method),
    Calinski_Harabasz = ch_by_method
  )
})

hasil
##          Metode Silhouette Calinski_Harabasz
## 1       K-Means 0.24201557        570.171574
## 2      K-Median 0.22399271        542.987270
## 3        DBSCAN 0.09492424         34.831337
## 4    Mean Shift 0.03848401          9.169468
## 5 Fuzzy C-Means 0.22525271        563.773427
# Combined ranking score: the sum of the z-scores of both indices, so each
# index contributes with equal weight.
# scale() returns a one-column matrix with centring/scaling attributes;
# as.numeric() flattens it so Score is stored as an ordinary numeric column.
hasil$Score <- as.numeric(scale(hasil$Silhouette)) +
  as.numeric(scale(hasil$Calinski_Harabasz))

# Row of the method with the highest combined score.
metode_terbaik <- hasil[which.max(hasil$Score), ]

metode_terbaik
##    Metode Silhouette Calinski_Harabasz   Score
## 1 K-Means  0.2420156          570.1716 1.60412

Berdasarkan hasil evaluasi menggunakan Silhouette Score dan Calinski-Harabasz Index, diperoleh bahwa metode clustering terbaik adalah metode K-Means yang memiliki nilai evaluasi tertinggi dibandingkan metode lainnya. Pemilihan metode terbaik dilakukan dengan menjumlahkan nilai Silhouette dan Calinski-Harabasz yang masing-masing telah distandardisasi (z-score), sehingga kedua indeks memiliki bobot yang sama; nilai Silhouette menggambarkan kualitas cluster, sedangkan nilai Calinski-Harabasz menggambarkan pemisahan antar cluster.

EDA K-Means

Menambahkan Label Cluster ke Data

# Append the k-means labels to the cleaned (unscaled) data as a factor
# column so the clusters can be profiled in the original units.
data_cluster <- cbind(data_clean, Cluster = factor(km_cluster))

head(data_cluster, n = 6)
##   Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1       1957         3              5  58138       0        0  2012-09-04
## 2       1954         3              5  46344       1        1  2014-03-08
## 3       1965         3              6  71613       0        0  2013-08-21
## 4       1984         3              6  26646       1        0  2014-02-10
## 5       1981         5              4  58293       1        0  2014-01-19
## 6       1967         4              6  62513       0        1  2013-09-09
##   Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1      58      635        88             546             172               88
## 2      38       11         1               6               2                1
## 3      26      426        49             127             111               21
## 4      26       11         4              20              10                3
## 5      94      173        43             118              46               27
## 6      16      520        42              98               0               42
##   MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1           88                 3               8                  10
## 2            6                 2               1                   1
## 3           42                 1               8                   2
## 4            5                 2               2                   0
## 5           15                 5               5                   3
## 6           14                 2               6                   4
##   NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1                 4                 7            0            0            0
## 2                 2                 5            0            0            0
## 3                10                 4            0            0            0
## 4                 4                 6            0            0            0
## 5                 6                 5            0            0            0
## 6                10                 6            0            0            0
##   AcceptedCmp1 AcceptedCmp2 Complain Response Cluster
## 1            0            0        0        1       2
## 2            0            0        0        0       1
## 3            0            0        0        0       2
## 4            0            0        0        0       1
## 5            0            0        0        0       1
## 6            0            0        0        0       2

Karakteristik Tiap Cluster (Rata-rata)

library(dplyr)

# Mean of every numeric variable within each k-means cluster.
# na.rm is supplied through an anonymous function because passing extra
# arguments via across(...) is deprecated as of dplyr 1.1.0.
cluster_summary <- data_cluster %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
## ℹ In group 1: `Cluster = 1`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
cluster_summary
## # A tibble: 2 × 26
##   Cluster Year_Birth Education Marital_Status Income Kidhome Teenhome Recency
##   <fct>        <dbl>     <dbl>          <dbl>  <dbl>   <dbl>    <dbl>   <dbl>
## 1 1            1970.      3.36           4.72 39308.  0.688     0.560    48.9
## 2 2            1967.      3.45           4.74 72258.  0.0660    0.423    49.5
## # ℹ 18 more variables: MntWines <dbl>, MntFruits <dbl>, MntMeatProducts <dbl>,
## #   MntFishProducts <dbl>, MntSweetProducts <dbl>, MntGoldProds <dbl>,
## #   NumDealsPurchases <dbl>, NumWebPurchases <dbl>, NumCatalogPurchases <dbl>,
## #   NumStorePurchases <dbl>, NumWebVisitsMonth <dbl>, AcceptedCmp3 <dbl>,
## #   AcceptedCmp4 <dbl>, AcceptedCmp5 <dbl>, AcceptedCmp1 <dbl>,
## #   AcceptedCmp2 <dbl>, Complain <dbl>, Response <dbl>

Jumlah Data Tiap Cluster

# Number of observations assigned to each cluster.
table(data_cluster$Cluster)
## 
##    1    2 
## 1361  879

Visualisasi Cluster (PCA 2D)

library(factoextra)

# Scatter plot of the k-means result (points only), with a convex hull
# drawn around each cluster.
fviz_cluster(
  object = km_res,
  data = data_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  ggtheme = theme_minimal()
)

Boxplot per Cluster (Karakteristik)

library(tidyr)
library(ggplot2)

# Long format (one row per observation and numeric variable) so that each
# variable gets its own panel comparing the clusters side by side.
ggplot(
  pivot_longer(
    data_cluster,
    cols = where(is.numeric),
    names_to = "Variabel",
    values_to = "Nilai"
  ),
  aes(x = Cluster, y = Nilai, fill = Cluster)
) +
  geom_boxplot() +
  facet_wrap(~Variabel, scales = "free") +
  labs(title = "Karakteristik Variabel pada Tiap Cluster") +
  theme_minimal()

Visualisasi Spesifik

# Income against wine spending, coloured by k-means cluster.
ggplot(data_cluster) +
  geom_point(
    aes(x = Income, y = MntWines, color = Cluster),
    alpha = 0.7
  ) +
  labs(title = "Cluster berdasarkan Income dan Spending") +
  theme_minimal()