Pada tahap awal, dilakukan import beberapa library yang digunakan untuk proses analisis data, mulai dari preprocessing hingga visualisasi.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
Dataset yang digunakan adalah Customer Personality Analysis yang diperoleh dari Kaggle.
# Load the tab-separated Customer Personality Analysis file and preview the
# first rows. read.delim() uses a tab separator by default.
data <- read.delim("marketing_campaign.csv")
head(data)
## ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524 1957 Graduation Single 58138 0 0 04-09-2012
## 2 2174 1954 Graduation Single 46344 1 1 08-03-2014
## 3 4141 1965 Graduation Together 71613 0 0 21-08-2013
## 4 6182 1984 Graduation Together 26646 1 0 10-02-2014
## 5 5324 1981 PhD Married 58293 1 0 19-01-2014
## 6 7446 1967 Master Together 62513 0 1 09-09-2013
## Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1 58 635 88 546 172 88
## 2 38 11 1 6 2 1
## 3 26 426 49 127 111 21
## 4 26 11 4 20 10 3
## 5 94 173 43 118 46 27
## 6 16 520 42 98 0 42
## MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1 88 3 8 10
## 2 6 2 1 1
## 3 42 1 8 2
## 4 5 2 2 0
## 5 15 5 5 3
## 6 14 2 6 4
## NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1 4 7 0 0 0
## 2 2 5 0 0 0
## 3 10 4 0 0 0
## 4 4 6 0 0 0
## 5 6 5 0 0 0
## 6 10 6 0 0 0
## AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1 0 0 0 3 11 1
## 2 0 0 0 3 11 0
## 3 0 0 0 3 11 0
## 4 0 0 0 3 11 0
## 5 0 0 0 3 11 0
## 6 0 0 0 3 11 0
Tahap preprocessing dilakukan untuk memastikan data siap digunakan dalam proses clustering.
# Preprocessing in a single pipeline:
#   1. drop columns that are not needed (ID, Z_CostContact, Z_Revenue)
#   2. parse the enrolment date from its day-month-year text form
#   3. turn the categorical columns into numeric factor codes
#   4. fill missing Income values with the median of the observed values
# NOTE(review): the factor codes follow the alphabetical order of the labels,
# so they do not encode a meaningful ranking -- confirm this is acceptable.
data_clean <- data %>%
  select(-ID, -Z_CostContact, -Z_Revenue) %>%
  mutate(
    Dt_Customer = as.Date(Dt_Customer, format = "%d-%m-%Y"),
    Education = as.numeric(as.factor(Education)),
    Marital_Status = as.numeric(as.factor(Marital_Status)),
    Income = replace(Income, is.na(Income), median(Income, na.rm = TRUE))
  )

# Inspect the structure of the cleaned data
str(data_clean)
## 'data.frame': 2240 obs. of 26 variables:
## $ Year_Birth : int 1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
## $ Education : num 3 3 3 3 5 4 3 5 5 5 ...
## $ Marital_Status : num 5 5 6 6 4 6 3 4 6 6 ...
## $ Income : num 58138 46344 71613 26646 58293 ...
## $ Kidhome : int 0 1 0 1 1 0 0 1 1 1 ...
## $ Teenhome : int 0 1 0 0 0 1 1 0 0 1 ...
## $ Dt_Customer : Date, format: "2012-09-04" "2014-03-08" ...
## $ Recency : int 58 38 26 26 94 16 34 32 19 68 ...
## $ MntWines : int 635 11 426 11 173 520 235 76 14 28 ...
## $ MntFruits : int 88 1 49 4 43 42 65 10 0 0 ...
## $ MntMeatProducts : int 546 6 127 20 118 98 164 56 24 6 ...
## $ MntFishProducts : int 172 2 111 10 46 0 50 3 3 1 ...
## $ MntSweetProducts : int 88 1 21 3 27 42 49 1 3 1 ...
## $ MntGoldProds : int 88 6 42 5 15 14 27 23 2 13 ...
## $ NumDealsPurchases : int 3 2 1 2 5 2 4 2 1 1 ...
## $ NumWebPurchases : int 8 1 8 2 5 6 7 4 3 1 ...
## $ NumCatalogPurchases: int 10 1 2 0 3 4 3 0 0 0 ...
## $ NumStorePurchases : int 4 2 10 4 6 10 7 4 2 0 ...
## $ NumWebVisitsMonth : int 7 5 4 6 5 6 6 8 9 20 ...
## $ AcceptedCmp3 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ AcceptedCmp4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Complain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Response : int 1 0 0 0 0 0 0 0 1 0 ...
Statistika deskriptif digunakan untuk melihat gambaran umum dari data yang digunakan.
# Per-column descriptive statistics (min, quartiles, mean, max)
data_clean %>% summary()
## Year_Birth Education Marital_Status Income
## Min. :1893 Min. :1.000 Min. :1.00 Min. : 1730
## 1st Qu.:1959 1st Qu.:3.000 1st Qu.:4.00 1st Qu.: 35539
## Median :1970 Median :3.000 Median :5.00 Median : 51382
## Mean :1969 Mean :3.394 Mean :4.73 Mean : 52238
## 3rd Qu.:1977 3rd Qu.:4.000 3rd Qu.:6.00 3rd Qu.: 68290
## Max. :1996 Max. :5.000 Max. :8.00 Max. :666666
## Kidhome Teenhome Dt_Customer Recency
## Min. :0.0000 Min. :0.0000 Min. :2012-07-30 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:2013-01-16 1st Qu.:24.00
## Median :0.0000 Median :0.0000 Median :2013-07-08 Median :49.00
## Mean :0.4442 Mean :0.5062 Mean :2013-07-10 Mean :49.11
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:2013-12-30 3rd Qu.:74.00
## Max. :2.0000 Max. :2.0000 Max. :2014-06-29 Max. :99.00
## MntWines MntFruits MntMeatProducts MntFishProducts
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 23.75 1st Qu.: 1.0 1st Qu.: 16.0 1st Qu.: 3.00
## Median : 173.50 Median : 8.0 Median : 67.0 Median : 12.00
## Mean : 303.94 Mean : 26.3 Mean : 166.9 Mean : 37.53
## 3rd Qu.: 504.25 3rd Qu.: 33.0 3rd Qu.: 232.0 3rd Qu.: 50.00
## Max. :1493.00 Max. :199.0 Max. :1725.0 Max. :259.00
## MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.00 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 2.000
## Median : 8.00 Median : 24.00 Median : 2.000 Median : 4.000
## Mean : 27.06 Mean : 44.02 Mean : 2.325 Mean : 4.085
## 3rd Qu.: 33.00 3rd Qu.: 56.00 3rd Qu.: 3.000 3rd Qu.: 6.000
## Max. :263.00 Max. :362.00 Max. :15.000 Max. :27.000
## NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.: 3.00 1st Qu.: 3.000 1st Qu.:0.00000
## Median : 2.000 Median : 5.00 Median : 6.000 Median :0.00000
## Mean : 2.662 Mean : 5.79 Mean : 5.317 Mean :0.07277
## 3rd Qu.: 4.000 3rd Qu.: 8.00 3rd Qu.: 7.000 3rd Qu.:0.00000
## Max. :28.000 Max. :13.00 Max. :20.000 Max. :1.00000
## AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.07455 Mean :0.07277 Mean :0.06429 Mean :0.01339
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Complain Response
## Min. :0.000000 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:0.0000
## Median :0.000000 Median :0.0000
## Mean :0.009375 Mean :0.1491
## 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1.000000 Max. :1.0000
# Centre and spread of Income after imputation; the output columns are named
# mean_income, median_income and sd_income.
data_clean %>%
  summarise(
    across(
      Income,
      list(mean = mean, median = median, sd = sd),
      .names = "{.fn}_income"
    )
  )
## mean_income median_income sd_income
## 1 52237.98 51381.5 25037.96
Visualisasi dilakukan untuk memahami pola distribusi data serta melihat sebaran nilai dan potensi pencilan (outlier) pada setiap variabel sebelum dilakukan proses clustering.
Pada tahap ini dilakukan pemeriksaan bentuk distribusi dari masing-masing variabel numerik menggunakan histogram.
# Keep only the numeric columns; the Date column is not numeric and is dropped
data_num <- select(data_clean, where(is.numeric))

# Reshape to long form (one row per variable/value pair) so that every
# variable gets its own histogram panel with independent axes.
ggplot(
  pivot_longer(
    data_num,
    cols = everything(),
    names_to = "Variabel",
    values_to = "Nilai"
  ),
  aes(x = Nilai)
) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(vars(Variabel), scales = "free") +
  labs(title = "Distribusi Tiap Variabel", x = "Nilai", y = "Frekuensi")
Berdasarkan visualisasi histogram, terlihat bahwa beberapa variabel memiliki distribusi yang tidak simetris (skewed), terutama pada variabel yang berkaitan dengan pendapatan dan pengeluaran. Hal ini menunjukkan adanya perbedaan karakteristik antar pelanggan yang cukup beragam.
Boxplot digunakan untuk melihat sebaran data berdasarkan kuartil serta mendeteksi adanya pencilan (outlier) pada setiap variabel.
# One horizontal boxplot per numeric variable, all drawn on a shared value
# axis, which makes the differences in scale between variables visible.
data_num %>%
  pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai") %>%
  ggplot(mapping = aes(x = Variabel, y = Nilai)) +
  geom_boxplot(fill = "orange") +
  coord_flip() +
  labs(
    x = "Variabel",
    y = "Nilai",
    title = "Boxplot Karakteristik Data"
  )
Dari boxplot yang ditampilkan, terlihat bahwa terdapat beberapa pencilan (outlier) pada beberapa variabel. Selain itu, perbedaan rentang nilai antar variabel juga cukup signifikan, sehingga diperlukan proses normalisasi atau scaling sebelum dilakukan clustering agar hasil yang diperoleh lebih optimal.
Sebelum dilakukan proses clustering, data perlu dinormalisasi agar perbedaan skala antar variabel tidak mempengaruhi hasil pengelompokan.
# Standardise every numeric column to zero mean and unit standard deviation
# so that no variable dominates the distance computations.
data_num <- select(data_clean, where(is.numeric))
data_scaled <- scale(data_num, center = TRUE, scale = TRUE)
head(data_scaled)
## Year_Birth Education Marital_Status Income Kidhome Teenhome
## [1,] -0.9851248 -0.3500631 0.2509477 0.2356432 -0.8250334 -0.9296868
## [2,] -1.2354571 -0.3500631 0.2509477 -0.2354016 1.0323283 0.9067316
## [3,] -0.3175719 -0.3500631 1.1800764 0.7738261 -0.8250334 -0.9296868
## [4,] 1.2678662 -0.3500631 1.1800764 -1.0221272 1.0323283 -0.9296868
## [5,] 1.0175339 1.4280353 -0.6781810 0.2418338 1.0323283 -0.9296868
## [6,] -0.1506837 0.5389861 1.1800764 0.4103779 -0.8250334 0.9067316
## Recency MntWines MntFruits MntMeatProducts MntFishProducts
## [1,] 0.3069707 0.9835616 1.5512306 1.6793274 2.4615974
## [2,] -0.3835785 -0.8702852 -0.6361591 -0.7130662 -0.6503040
## [3,] -0.7979081 0.3626418 0.5706766 -0.1769928 1.3449739
## [4,] -0.7979081 -0.8702852 -0.5607319 -0.6510412 -0.5038616
## [5,] 1.5499594 -0.3889980 0.4198221 -0.2168660 0.1551293
## [6,] -1.1431827 0.6419072 0.3946797 -0.3054732 -0.6869147
## MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases
## [1,] 1.476170487 0.84301867 0.3493359 1.4089893
## [2,] -0.631362202 -0.72884304 -0.1681988 -1.1101615
## [3,] -0.146871929 -0.03875741 -0.6857335 1.4089893
## [4,] -0.582913175 -0.74801209 -0.1681988 -0.7502828
## [5,] -0.001524847 -0.55632164 1.3844054 0.3293533
## [6,] 0.361842858 -0.57549068 -0.1681988 0.6892320
## NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3
## [1,] 2.5103297 -0.55066183 0.6937488 -0.2800777
## [2,] -0.5685927 -1.16586508 -0.1304343 -0.2800777
## [3,] -0.2264902 1.29494790 -0.5425259 -0.2800777
## [4,] -0.9106951 -0.55066183 0.2816572 -0.2800777
## [5,] 0.1156123 0.06454141 -0.1304343 -0.2800777
## [6,] 0.4577148 1.29494790 0.2816572 -0.2800777
## AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response
## [1,] -0.2837668 -0.2800777 -0.2620527 -0.1164843 -0.09725995 2.3883131
## [2,] -0.2837668 -0.2800777 -0.2620527 -0.1164843 -0.09725995 -0.4185187
## [3,] -0.2837668 -0.2800777 -0.2620527 -0.1164843 -0.09725995 -0.4185187
## [4,] -0.2837668 -0.2800777 -0.2620527 -0.1164843 -0.09725995 -0.4185187
## [5,] -0.2837668 -0.2800777 -0.2620527 -0.1164843 -0.09725995 -0.4185187
## [6,] -0.2837668 -0.2800777 -0.2620527 -0.1164843 -0.09725995 -0.4185187
Digunakan metode Elbow dan Silhouette untuk menentukan jumlah cluster optimal.
# Choose the number of clusters with the elbow (within-cluster sum of squares)
# and average-silhouette criteria.
# kmeans() starts from random centres, so each plot is seeded to make the
# curves repeatable, and nstart = 25 (forwarded to kmeans) matches the
# setting used for the final K-Means model instead of a single random start.
set.seed(123)
fviz_nbclust(data_scaled, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method")

set.seed(123)
fviz_nbclust(data_scaled, kmeans, method = "silhouette", nstart = 25) +
  labs(title = "Silhouette Method")
Berdasarkan hasil pengujian menggunakan metode Elbow dan Silhouette, diperoleh bahwa jumlah cluster optimal berada pada nilai k = 2. Pada metode Elbow, terlihat adanya penurunan yang cukup signifikan hingga k = 2, kemudian cenderung melandai pada nilai k berikutnya. Sementara itu, metode Silhouette menunjukkan nilai tertinggi pada k = 2, yang menandakan bahwa pemisahan antar cluster pada jumlah tersebut paling baik dibandingkan dengan jumlah cluster lainnya. Oleh karena itu, pada penelitian ini digunakan jumlah cluster sebanyak dua untuk proses clustering selanjutnya.
# Fit the five clustering methods on the standardised data.
# The seed is set once; the statements below must keep this order because
# the randomised algorithms draw from the same random number stream.
set.seed(123)

# K-Means: 2 clusters, best of 25 random starts
km_res <- kmeans(data_scaled, centers = 2, nstart = 25)
km_cluster <- km_res[["cluster"]]

# K-Medians through flexclust's kcca()
kmed_res <- flexclust::kcca(
  data_scaled,
  k = 2,
  family = flexclust::kccaFamily("kmedians")
)
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
kmed_cluster <- flexclust::clusters(kmed_res)

# DBSCAN: label 0 is recoded to NA so those points are skipped by the
# evaluation helpers, which drop NA labels.
db_res <- dbscan::dbscan(data_scaled, eps = 1.5, minPts = 5)
db_cluster <- db_res[["cluster"]]
db_cluster[db_cluster == 0] <- NA

# Mean Shift with the package defaults
ms_res <- meanShift(data_scaled)
ms_cluster <- ms_res[["assignment"]]

# Fuzzy C-Means: 2 centres, fuzzifier m = 2; keep the hard assignment
fcm_res <- cmeans(data_scaled, centers = 2, m = 2)
fcm_cluster <- fcm_res[["cluster"]]
#' Mean silhouette width of a clustering
#'
#' @param cluster Vector of cluster labels, one per row of `data`. Entries
#'   that are `NA` are excluded together with their rows.
#' @param data Numeric matrix or data frame with one observation per row.
#' @return The mean silhouette width over the labelled rows, or `NA_real_`
#'   when the silhouette is undefined (fewer than two clusters, or as many
#'   clusters as observations).
sil_score <- function(cluster, data) {
  idx <- which(!is.na(cluster))
  labels <- cluster[idx]

  # A silhouette needs at least two clusters; report NA instead of failing.
  if (length(unique(labels)) < 2L) {
    return(NA_real_)
  }

  # drop = FALSE keeps a matrix even when a single row or column remains.
  obs <- data[idx, , drop = FALSE]
  sil <- cluster::silhouette(labels, stats::dist(obs))

  # silhouette() returns a bare NA (not a matrix) when it cannot be computed.
  if (!is.matrix(sil)) {
    return(NA_real_)
  }
  mean(sil[, "sil_width"])
}
# Mean silhouette width of each method on the standardised data
sil_km <- sil_score(cluster = km_cluster, data = data_scaled)
sil_kmed <- sil_score(cluster = kmed_cluster, data = data_scaled)
sil_db <- sil_score(cluster = db_cluster, data = data_scaled)
sil_ms <- sil_score(cluster = ms_cluster, data = data_scaled)
sil_fcm <- sil_score(cluster = fcm_cluster, data = data_scaled)
sil_km
## [1] 0.2420156
sil_kmed
## [1] 0.2239927
sil_db
## [1] 0.09492424
sil_ms
## [1] 0.03848401
sil_fcm
## [1] 0.2252527
library(clusterSim)
## Warning: package 'clusterSim' was built under R version 4.5.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
#' Calinski-Harabasz index of a clustering
#'
#' @param cluster Vector of cluster labels, one per row of `data`. Entries
#'   that are `NA` are excluded together with their rows.
#' @param data Numeric matrix or data frame with one observation per row.
#' @return The value of `clusterSim::index.G1()` computed with centroids, or
#'   `NA_real_` when fewer than two clusters remain.
ch_index <- function(cluster, data) {
  idx <- which(!is.na(cluster))

  # Recode the labels to consecutive integers 1..k. Labels that already are
  # 1..k are unchanged by this.
  # NOTE(review): index.G1() appears to expect consecutive integer labels --
  # confirm against the clusterSim documentation.
  labels <- as.integer(factor(cluster[idx]))

  # The index divides by (k - 1), so it is undefined for a single cluster.
  if (length(unique(labels)) < 2L) {
    return(NA_real_)
  }

  # drop = FALSE keeps a matrix even when a single row or column remains.
  clusterSim::index.G1(
    data[idx, , drop = FALSE],
    labels,
    centrotypes = "centroids"
  )
}
# Calinski-Harabasz index of each method on the standardised data
ch_km <- ch_index(cluster = km_cluster, data = data_scaled)
ch_kmed <- ch_index(cluster = kmed_cluster, data = data_scaled)
ch_db <- ch_index(cluster = db_cluster, data = data_scaled)
ch_ms <- ch_index(cluster = ms_cluster, data = data_scaled)
ch_fcm <- ch_index(cluster = fcm_cluster, data = data_scaled)
ch_km
## [1] 570.1716
ch_kmed
## [1] 542.9873
ch_db
## [1] 34.83134
ch_ms
## [1] 9.169468
ch_fcm
## [1] 563.7734
# Comparison table: one row per method, one column per evaluation index.
# The three vectors are listed in the same method order.
hasil <- data.frame(
  Metode = c(
    "K-Means",
    "K-Median",
    "DBSCAN",
    "Mean Shift",
    "Fuzzy C-Means"
  ),
  Silhouette = c(sil_km, sil_kmed, sil_db, sil_ms, sil_fcm),
  Calinski_Harabasz = c(ch_km, ch_kmed, ch_db, ch_ms, ch_fcm),
  stringsAsFactors = FALSE
)
hasil
## Metode Silhouette Calinski_Harabasz
## 1 K-Means 0.24201557 570.171574
## 2 K-Median 0.22399271 542.987270
## 3 DBSCAN 0.09492424 34.831337
## 4 Mean Shift 0.03848401 9.169468
## 5 Fuzzy C-Means 0.22525271 563.773427
# Combined score: sum of the two indices after standardising each one
# (z-score), so both contribute with equal weight.
# scale() returns a one-column matrix; as.numeric() keeps Score a plain
# numeric column instead of a matrix embedded in the data frame.
hasil$Score <- as.numeric(scale(hasil$Silhouette)) +
  as.numeric(scale(hasil$Calinski_Harabasz))

# The best method is the row with the highest combined score
metode_terbaik <- hasil[which.max(hasil$Score), ]
metode_terbaik
## Metode Silhouette Calinski_Harabasz Score
## 1 K-Means 0.2420156 570.1716 1.60412
Berdasarkan hasil evaluasi menggunakan Silhouette Score dan Calinski-Harabasz Index, diperoleh bahwa metode clustering terbaik adalah K-Means, yang memiliki nilai tertinggi pada kedua ukuran tersebut dibandingkan metode lainnya. Pemilihan metode terbaik dilakukan dengan menjumlahkan nilai Silhouette dan Calinski-Harabasz yang telah distandardisasi (z-score), sehingga kedua indikator memiliki bobot yang sama: Silhouette mengukur kualitas cluster (kekompakan dan keterpisahan), sedangkan Calinski-Harabasz mengukur rasio keragaman antar cluster terhadap keragaman di dalam cluster.
# Attach the K-Means assignment to the cleaned data as a factor column
data_cluster <- data_clean %>%
  mutate(Cluster = factor(km_cluster))
head(data_cluster)
## Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 1957 3 5 58138 0 0 2012-09-04
## 2 1954 3 5 46344 1 1 2014-03-08
## 3 1965 3 6 71613 0 0 2013-08-21
## 4 1984 3 6 26646 1 0 2014-02-10
## 5 1981 5 4 58293 1 0 2014-01-19
## 6 1967 4 6 62513 0 1 2013-09-09
## Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1 58 635 88 546 172 88
## 2 38 11 1 6 2 1
## 3 26 426 49 127 111 21
## 4 26 11 4 20 10 3
## 5 94 173 43 118 46 27
## 6 16 520 42 98 0 42
## MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1 88 3 8 10
## 2 6 2 1 1
## 3 42 1 8 2
## 4 5 2 2 0
## 5 15 5 5 3
## 6 14 2 6 4
## NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1 4 7 0 0 0
## 2 2 5 0 0 0
## 3 10 4 0 0 0
## 4 4 6 0 0 0
## 5 6 5 0 0 0
## 6 10 6 0 0 0
## AcceptedCmp1 AcceptedCmp2 Complain Response Cluster
## 1 0 0 0 1 2
## 2 0 0 0 0 1
## 3 0 0 0 0 2
## 4 0 0 0 0 1
## 5 0 0 0 0 1
## 6 0 0 0 0 2
library(dplyr)

# Mean of every numeric column within each cluster.
# The na.rm argument is passed through an anonymous function because
# forwarding extra arguments via across(...) is deprecated since dplyr 1.1.0.
cluster_summary <- data_cluster %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))
cluster_summary
## # A tibble: 2 × 26
## Cluster Year_Birth Education Marital_Status Income Kidhome Teenhome Recency
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1970. 3.36 4.72 39308. 0.688 0.560 48.9
## 2 2 1967. 3.45 4.74 72258. 0.0660 0.423 49.5
## # ℹ 18 more variables: MntWines <dbl>, MntFruits <dbl>, MntMeatProducts <dbl>,
## # MntFishProducts <dbl>, MntSweetProducts <dbl>, MntGoldProds <dbl>,
## # NumDealsPurchases <dbl>, NumWebPurchases <dbl>, NumCatalogPurchases <dbl>,
## # NumStorePurchases <dbl>, NumWebVisitsMonth <dbl>, AcceptedCmp3 <dbl>,
## # AcceptedCmp4 <dbl>, AcceptedCmp5 <dbl>, AcceptedCmp1 <dbl>,
## # AcceptedCmp2 <dbl>, Complain <dbl>, Response <dbl>
# Number of customers in each cluster
table(data_cluster[["Cluster"]])
##
## 1 2
## 1361 879
library(factoextra)

# Plot the K-Means result as points, with a convex hull drawn around each
# cluster.
fviz_cluster(
  km_res,
  data = data_scaled,
  ggtheme = theme_minimal(),
  palette = "jco",
  ellipse.type = "convex",
  geom = "point"
)
library(tidyr)
library(ggplot2)

# Compare the distribution of every numeric variable between clusters:
# reshape the numeric columns to long form, then draw one panel per variable
# with its own axes.
ggplot(
  pivot_longer(
    data_cluster,
    cols = where(is.numeric),
    names_to = "Variabel",
    values_to = "Nilai"
  ),
  aes(x = Cluster, y = Nilai, fill = Cluster)
) +
  geom_boxplot() +
  facet_wrap(vars(Variabel), scales = "free") +
  theme_minimal() +
  labs(title = "Karakteristik Variabel pada Tiap Cluster")
# Scatter plot of Income against wine spending, coloured by cluster
data_cluster %>%
  ggplot(mapping = aes(x = Income, y = MntWines, colour = Cluster)) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Cluster berdasarkan Income dan Spending")