#1. Import Library
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
#2. Load Dataset
data <- read.csv(file.choose())
head(data)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 4 1 3 13265 1196 4221 6404 507 1788
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
str(data)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
summary(data)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
#3. Preprocessing
#Ambil fitur numerik utama
df <- data %>%
select(Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen)
#Cek missing value
colSums(is.na(df))
## Fresh Milk Grocery Frozen
## 0 0 0 0
## Detergents_Paper Delicassen
## 0 0
#Scaling
df_scaled <- scale(df)
#4. EDA
#Histogram
df %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(value)) +
geom_histogram(bins = 30, fill = "skyblue") +
facet_wrap(~name, scales = "free") +
theme_minimal()
#Boxplot
df %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(x = name, y = value)) +
geom_boxplot(fill = "orange") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Correlation Heatmap
cor_matrix <- cor(df)
corrplot(cor_matrix, method = "color", type = "upper")
#5. Menentukan Jumlah Cluster
fviz_nbclust(df_scaled, kmeans, method = "wss") +
ggtitle("Elbow Method")
fviz_nbclust(df_scaled, kmeans, method = "silhouette") +
ggtitle("Silhouette Method")
#6. K-Means Clustering
set.seed(123)
kmeans_model <- kmeans(df_scaled, centers = 3, nstart = 25)
data$Cluster_KMeans <- kmeans_model$cluster
#Visualisasi K-Means
fviz_cluster(kmeans_model, data = df_scaled,
palette = "jco",
ggtheme = theme_minimal(),
main = "K-Means Clustering")
#Evaluasi K-Means
sil_kmeans <- silhouette(kmeans_model$cluster, dist(df_scaled))
fviz_silhouette(sil_kmeans)
## cluster size ave.sil.width
## 1 1 44 0.20
## 2 2 3 -0.08
## 3 3 393 0.59
mean_kmeans <- mean(sil_kmeans[,3])
mean_kmeans
## [1] 0.5424012
#7. Hierarchical Clustering
hc <- hclust(dist(df_scaled), method = "ward.D2")
plot(hc, main = "Dendrogram")
#Potong cluster
hc_cluster <- cutree(hc, k = 3)
data$Cluster_HC <- hc_cluster
#Visualisasi Hierarchical
fviz_cluster(list(data = df_scaled, cluster = hc_cluster),
palette = "jco",
ggtheme = theme_minimal(),
main = "Hierarchical Clustering")
#Evaluasi Hierarchical
sil_hc <- silhouette(hc_cluster, dist(df_scaled))
fviz_silhouette(sil_hc)
## cluster size ave.sil.width
## 1 1 153 0.21
## 2 2 281 0.30
## 3 3 6 0.04
mean_hc <- mean(sil_hc[,3])
mean_hc
## [1] 0.2646091
#8. Perbandingan Metode
comparison <- data.frame(
Method = c("K-Means", "Hierarchical"),
Silhouette_Score = c(mean_kmeans, mean_hc)
)
comparison
## Method Silhouette_Score
## 1 K-Means 0.5424012
## 2 Hierarchical 0.2646091
#9. Visualisasi Perbandingan
ggplot(comparison, aes(x = Method, y = Silhouette_Score, fill = Method)) +
geom_bar(stat = "identity") +
theme_minimal() +
ggtitle("Perbandingan Metode Clustering")
#10. Interpretasi Cluster (K-Means)
cluster_mean <- data %>%
group_by(Cluster_KMeans) %>%
summarise(across(Fresh:Delicassen, mean))
cluster_mean
## # A tibble: 3 × 7
## Cluster_KMeans Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 8129. 19154. 28895. 1859. 13518. 2234.
## 2 2 60572. 30120. 17315. 38049. 2153 20701.
## 3 3 12063. 4115. 5535. 2941. 1696. 1299.
#11. Visualisasi Rata-rata per Cluster
cluster_mean %>%
pivot_longer(cols = -Cluster_KMeans) %>%
ggplot(aes(x = name, y = value, fill = as.factor(Cluster_KMeans))) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
ggtitle("Rata-rata Pembelian per Cluster (K-Means)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#12. Boxplot per Cluster
data %>%
pivot_longer(cols = Fresh:Delicassen) %>%
ggplot(aes(x = as.factor(Cluster_KMeans), y = value, fill = as.factor(Cluster_KMeans))) +
geom_boxplot() +
facet_wrap(~name, scales = "free") +
theme_minimal() +
ggtitle("Distribusi Fitur per Cluster")
#13. Analisis Tambahan
table(data$Cluster_KMeans, data$Channel)
##
## 1 2
## 1 0 44
## 2 3 0
## 3 295 98
table(data$Cluster_KMeans, data$Region)
##
## 1 2 3
## 1 7 8 29
## 2 0 1 2
## 3 70 38 285