Modul 3_Clustering

#1. Import Library

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.3

## Warning: package 'readr' was built under R version 4.5.3

## Warning: package 'forcats' was built under R version 4.5.3

## Warning: package 'lubridate' was built under R version 4.5.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(cluster)

## Warning: package 'cluster' was built under R version 4.5.3

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.5.3

## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.5.3

## corrplot 0.95 loaded

# 2. Load dataset

# Pick the CSV file through an interactive dialog and read it into a data frame.
data <- read.csv(file = file.choose())

# Preview the first six rows.
head(data, n = 6)

##   Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1       2      3 12669 9656    7561    214             2674       1338
## 2       2      3  7057 9810    9568   1762             3293       1776
## 3       2      3  6353 8808    7684   2405             3516       7844
## 4       1      3 13265 1196    4221   6404              507       1788
## 5       2      3 22615 5410    7198   3915             1777       5185
## 6       2      3  9413 8259    5126    666             1795       1451

# Show the type and leading values of every column.
str(object = data)

## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...

# Print per-column summary statistics (min, quartiles, mean, max).
summary(object = data)

##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0

# 3. Preprocessing

# Keep the main numeric features; Channel and Region are left out.
df <- data[, c(
  "Fresh", "Milk", "Grocery",
  "Frozen", "Detergents_Paper", "Delicassen"
)]

# Count missing values per feature.
df %>%
  is.na() %>%
  colSums()

##            Fresh             Milk          Grocery           Frozen 
##                0                0                0                0 
## Detergents_Paper       Delicassen 
##                0                0

# Scaling: centre each column and divide it by its standard deviation.
# scale() returns a numeric matrix.
df_scaled <- scale(df, center = TRUE, scale = TRUE)

# 4. EDA

# Histogram of every feature, one facet per column with free axis scales.
ggplot(
  data = pivot_longer(df, cols = everything()),
  mapping = aes(x = value)
) +
  geom_histogram(fill = "skyblue", bins = 30) +
  facet_wrap(vars(name), scales = "free") +
  theme_minimal()

# Box plot of every feature side by side on a shared y axis.
ggplot(
  data = pivot_longer(df, cols = everything()),
  mapping = aes(x = name, y = value)
) +
  geom_boxplot(fill = "orange") +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Correlation heatmap: pairwise Pearson correlations, upper triangle only.
cor_matrix <- cor(df, method = "pearson")
corrplot(corr = cor_matrix, type = "upper", method = "color")

# 5. Choosing the number of clusters

# kmeans() starts from random centres. Seed the RNG before each diagnostic and
# use several restarts per k (forwarded to kmeans() through `...`), so the
# curves are reproducible and are not driven by a single unlucky start.

# Elbow method: total within-cluster sum of squares for each candidate k.
set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "wss", nstart = 25) +
  ggtitle("Elbow Method")

# Silhouette method: average silhouette width for each candidate k.
set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "silhouette", nstart = 25) +
  ggtitle("Silhouette Method")

# 6. K-Means clustering

# Fix the RNG seed so the random starting centres are reproducible.
set.seed(123)

# Fit three clusters on the scaled features, using 25 random starts.
kmeans_model <- kmeans(x = df_scaled, centers = 3, nstart = 25)

# Store each row's cluster label alongside the original data.
data[["Cluster_KMeans"]] <- kmeans_model[["cluster"]]

# K-Means visualisation
fviz_cluster(
  object = kmeans_model,
  data = df_scaled,
  main = "K-Means Clustering",
  palette = "jco",
  ggtheme = theme_minimal()
)

# K-Means evaluation: silhouette widths from distances on the scaled features.
sil_kmeans <- silhouette(
  x = kmeans_model[["cluster"]],
  dist = dist(df_scaled)
)
fviz_silhouette(sil_kmeans)

##   cluster size ave.sil.width
## 1       1   44          0.20
## 2       2    3         -0.08
## 3       3  393          0.59

# Average silhouette width over all observations (K-Means).
mean_kmeans <- mean(sil_kmeans[, "sil_width"])
print(mean_kmeans)

## [1] 0.5424012

# 7. Hierarchical clustering

# Distance matrix of the scaled features (dist() defaults to Euclidean).
# It is computed once and reused for both the linkage and the silhouette
# evaluation below, instead of being rebuilt for each call.
dist_scaled <- dist(df_scaled)

# Agglomerative clustering using the "ward.D2" linkage.
hc <- hclust(dist_scaled, method = "ward.D2")

plot(hc, main = "Dendrogram")

# Cut the tree into three clusters.
hc_cluster <- cutree(hc, k = 3)

# Store each row's cluster label alongside the original data.
data$Cluster_HC <- hc_cluster

# Hierarchical visualisation
fviz_cluster(list(data = df_scaled, cluster = hc_cluster),
             palette = "jco",
             ggtheme = theme_minimal(),
             main = "Hierarchical Clustering")

# Hierarchical evaluation: silhouette widths from the same distance matrix.
sil_hc <- silhouette(hc_cluster, dist_scaled)
fviz_silhouette(sil_hc)

##   cluster size ave.sil.width
## 1       1  153          0.21
## 2       2  281          0.30
## 3       3    6          0.04

# Average silhouette width over all observations (hierarchical).
mean_hc <- mean(sil_hc[, "sil_width"])
print(mean_hc)

## [1] 0.2646091

# 8. Method comparison

# One row per method with its average silhouette width.
comparison <- data.frame(
  Method = c("K-Means", "Hierarchical"),
  Silhouette_Score = c(mean_kmeans, mean_hc),
  stringsAsFactors = FALSE
)

print(comparison)

##         Method Silhouette_Score
## 1      K-Means        0.5424012
## 2 Hierarchical        0.2646091

# 9. Comparison plot

# Bar chart of the average silhouette width of each method.
ggplot(
  data = comparison,
  mapping = aes(x = Method, y = Silhouette_Score, fill = Method)
) +
  geom_col() +
  ggtitle("Perbandingan Metode Clustering") +
  theme_minimal()

# 10. Cluster interpretation (K-Means)

# Mean of every feature column (Fresh through Delicassen) within each
# K-Means cluster.
cluster_mean <- data %>%
  group_by(Cluster_KMeans) %>%
  summarise(across(.cols = Fresh:Delicassen, .fns = mean), .groups = "drop")

print(cluster_mean)

## # A tibble: 3 × 7
##   Cluster_KMeans  Fresh   Milk Grocery Frozen Detergents_Paper Delicassen
##            <int>  <dbl>  <dbl>   <dbl>  <dbl>            <dbl>      <dbl>
## 1              1  8129. 19154.  28895.  1859.           13518.      2234.
## 2              2 60572. 30120.  17315. 38049.            2153      20701.
## 3              3 12063.  4115.   5535.  2941.            1696.      1299.

# 11. Per-cluster mean plot

# Dodged bars: one group per feature, one bar per K-Means cluster.
ggplot(
  data = pivot_longer(cluster_mean, cols = -Cluster_KMeans),
  mapping = aes(x = name, y = value, fill = as.factor(Cluster_KMeans))
) +
  geom_col(position = "dodge") +
  ggtitle("Rata-rata Pembelian per Cluster (K-Means)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 12. Box plot per cluster

# Distribution of every feature within each K-Means cluster, one facet per
# feature with free axis scales.
ggplot(
  data = pivot_longer(data, cols = Fresh:Delicassen),
  mapping = aes(
    x = as.factor(Cluster_KMeans),
    y = value,
    fill = as.factor(Cluster_KMeans)
  )
) +
  geom_boxplot() +
  facet_wrap(vars(name), scales = "free") +
  ggtitle("Distribusi Fitur per Cluster") +
  theme_minimal()

# 13. Additional analysis

# Cross-tabulate K-Means cluster (rows) against Channel (columns).
table(data[["Cluster_KMeans"]], data[["Channel"]])

##    
##       1   2
##   1   0  44
##   2   3   0
##   3 295  98

# Cross-tabulate K-Means cluster (rows) against Region (columns).
table(data[["Cluster_KMeans"]], data[["Region"]])

##    
##       1   2   3
##   1   7   8  29
##   2   0   1   2
##   3  70  38 285

Modul 3_Clustering

Sabrina Putri Amalia (055)_Variesa Sabita Mumtaza Habib (083)_24E

2026-04-13