library(psych)
## Warning: package 'psych' was built under R version 4.4.3
library(GPArotation)
## Warning: package 'GPArotation' was built under R version 4.4.3
## 
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
## 
##     equamax, varimin
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(cluster)
library(clusterCrit)
## Warning: package 'clusterCrit' was built under R version 4.4.3
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
library(scales)
## Warning: package 'scales' was built under R version 4.4.2
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
library(RColorBrewer)

Input Data

data <- read.csv("C:/Users/Ainul Hayati/Documents/kuliah/Departemen/Semester 5/Teknik Peubah Ganda/wine-clustering.csv")
head(data)
##   Alcohol Malic_Acid  Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## 1   14.23       1.71 2.43         15.6       127          2.80       3.06
## 2   13.20       1.78 2.14         11.2       100          2.65       2.76
## 3   13.16       2.36 2.67         18.6       101          2.80       3.24
## 4   14.37       1.95 2.50         16.8       113          3.85       3.49
## 5   13.24       2.59 2.87         21.0       118          2.80       2.69
## 6   14.20       1.76 2.45         15.2       112          3.27       3.39
##   Nonflavanoid_Phenols Proanthocyanins Color_Intensity  Hue OD280 Proline
## 1                 0.28            2.29            5.64 1.04  3.92    1065
## 2                 0.26            1.28            4.38 1.05  3.40    1050
## 3                 0.30            2.81            5.68 1.03  3.17    1185
## 4                 0.24            2.18            7.80 0.86  3.45    1480
## 5                 0.39            1.82            4.32 1.04  2.93     735
## 6                 0.34            1.97            6.75 1.05  2.85    1450
data <- data %>% select(where(is.numeric)) %>% na.omit()
data_scaled <- scale(data)

K-Means Clustering

fviz_nbclust(data_scaled, kmeans, method = "wss") +
labs(title = "Elbow Method")

fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
labs(title = "Silhouette Method")

set.seed(070)
kmeans_result <- kmeans(data_scaled, centers = 3, nstart = 25)
kmeans_result
## K-means clustering with 3 clusters of sizes 65, 51, 62
## 
## Cluster means:
##      Alcohol Malic_Acid        Ash Ash_Alcanity   Magnesium Total_Phenols
## 1 -0.9234669 -0.3929331 -0.4931257    0.1701220 -0.49032869   -0.07576891
## 2  0.1644436  0.8690954  0.1863726    0.5228924 -0.07526047   -0.97657548
## 3  0.8328826 -0.3029551  0.3636801   -0.6084749  0.57596208    0.88274724
##    Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1  0.02075402          -0.03343924      0.05810161      -0.8993770  0.4605046
## 2 -1.21182921           0.72402116     -0.77751312       0.9388902 -1.1615122
## 3  0.97506900          -0.56050853      0.57865427       0.1705823  0.4726504
##        OD280    Proline
## 1  0.2700025 -0.7517257
## 2 -1.2887761 -0.4059428
## 3  0.7770551  1.1220202
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   2   1   1   1   1   1   1   1   1   1   1   1   3   1   1   1   1   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   1   1   2   1   1   1   1   1   1   1   1   1   1   1   3   1   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   2   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   3   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 
## Within cluster sum of squares by cluster:
## [1] 558.6971 326.3537 385.6983
##  (between_SS / total_SS =  44.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
table(kmeans_result$cluster)
## 
##  1  2  3 
## 65 51 62
head(data.frame(Observasi = 1:length(kmeans_result$cluster),
           Cluster = kmeans_result$cluster),10)
##    Observasi Cluster
## 1          1       3
## 2          2       3
## 3          3       3
## 4          4       3
## 5          5       3
## 6          6       3
## 7          7       3
## 8          8       3
## 9          9       3
## 10        10       3
names(kmeans_result)
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
kmeans_result$cluster
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   2   1   1   1   1   1   1   1   1   1   1   1   3   1   1   1   1   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   1   1   2   1   1   1   1   1   1   1   1   1   1   1   3   1   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   2   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   3   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2
kmeans_result$totss
## [1] 2301
kmeans_result$withinss
## [1] 558.6971 326.3537 385.6983
kmeans_result$tot.withinss
## [1] 1270.749
kmeans_result$betweenss
## [1] 1030.251
kmeans_result$iter
## [1] 2

Evaluasi

cluster_km <- kmeans_result$cluster
dist_matrix <- dist(data_scaled, method = "euclidean")

sil <- mean(silhouette(cluster_km, dist_matrix)[, 3])
ch <- intCriteria(traj = as.matrix(data_scaled),
part = cluster_km,
crit = "Calinski_Harabasz")$calinski_harabasz
dunn <- intCriteria(traj = as.matrix(data_scaled),
part = cluster_km,
crit = "Dunn")$dunn

data.frame(Silhouette = sil,
Calinski_Harabasz = ch,
Dunn = dunn)
##   Silhouette Calinski_Harabasz      Dunn
## 1  0.2848589          70.94001 0.2322567

Nilai Silhouette = 0.2849 rendah, artinya sebagian observasi masih berdekatan dengan cluster lain. Nilai Calinski-Harabasz (CH) = 70.94 menunjukkan variasi antar cluster lebih tinggi dibanding variasi di dalam cluster. Nilai Dunn Index = 0.232 rendah, menunjukkan bahwa jarak antar cluster tidak terlalu besar dibanding diameter cluster.

data_clustered <- data.frame(data, cluster = factor(cluster_km))

cluster_summary <- data_clustered %>%
group_by(cluster) %>%
summarise(across(where(is.numeric), mean, .names = "{.col}"))

cluster_summary
## # A tibble: 3 × 14
##   cluster Alcohol Malic_Acid   Ash Ash_Alcanity Magnesium Total_Phenols
##   <fct>     <dbl>      <dbl> <dbl>        <dbl>     <dbl>         <dbl>
## 1 1          12.3       1.90  2.23         20.1      92.7          2.25
## 2 2          13.1       3.31  2.42         21.2      98.7          1.68
## 3 3          13.7       2.00  2.47         17.5     108.           2.85
## # ℹ 7 more variables: Flavanoids <dbl>, Nonflavanoid_Phenols <dbl>,
## #   Proanthocyanins <dbl>, Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>,
## #   Proline <dbl>

Cluster 1: Alcohol rendah (12.25), Total_Phenols & Flavanoids sedang (2.24 & 2.05), Color_Intensity rendah (2.97), Proline rendah (510). Wine dengan kadar alkohol dan warna paling ringan, fenolik sedang (), cenderung lebih ringan dan lembut.

Cluster 2: Malic_Acid tertinggi (3.30), Color_Intensity tertinggi (7.23), Flavanoids sangat rendah (0.82), Hue rendah (0.69), Proline sedang (619). Wine berwarna kuat tapi tidak terlalu lembut, memiliki keasaman tinggi dan sedikit flavonoid. Bertipe lebih asam, lebih pekat dan kurang manis.

Cluster 3: Alcohol dan Proline tertinggi (13.68 & 1100), Total_Phenols dan Flavanoids tinggi (2.85 & 3.00), Color_Intensity tinggi (5.45). Wine dengan kadar alkohol tinggi, warna kuat, dan kaya fenol/flavonoid. Cenderung lebih kompleks, berkualitas tinggi, dan kaya rasa.

Visualisasi

fviz_cluster(kmeans_result, data = data_scaled,
palette = "jco", geom = "point",
main = "Visualisasi Hasil K-Means Clustering (3 Cluster)")

Nilai Dim1 (36.2%) dan Dim2 (19.2%) menunjukkan proporsi keragaman data yang dijelaskan oleh dua komponen utama pertama hasil reduksi dimensi (PCA). Kedua dimensi ini secara bersama menjelaskan sekitar 55.4% variasi total dari keseluruhan data.

heat_data_scaled <- cluster_summary %>%
  pivot_longer(-cluster, names_to = "Variabel", values_to = "Rata2") %>%
  group_by(Variabel) %>%
  mutate(Rata2_Scaled = scales::rescale(Rata2, to = c(0, 1))) %>%
  ungroup()
heat_data_scaled
## # A tibble: 39 × 4
##    cluster Variabel              Rata2 Rata2_Scaled
##    <fct>   <chr>                 <dbl>        <dbl>
##  1 1       Alcohol              12.3          0    
##  2 1       Malic_Acid            1.90         0    
##  3 1       Ash                   2.23         0    
##  4 1       Ash_Alcanity         20.1          0.688
##  5 1       Magnesium            92.7          0    
##  6 1       Total_Phenols         2.25         0.484
##  7 1       Flavanoids            2.05         0.564
##  8 1       Nonflavanoid_Phenols  0.358        0.410
##  9 1       Proanthocyanins       1.62         0.616
## 10 1       Color_Intensity       2.97         0    
## # ℹ 29 more rows
ggplot(heat_data_scaled, aes(x = Variabel, y = cluster, fill = Rata2_Scaled)) +
  geom_tile(color = "white", linewidth = 0.8) +
  scale_fill_gradientn(
    colors = c("#56B1F7", "white", "#FF6B6B"),
    name = "Skor Relatif"
  ) +
  labs(
    title = "Heatmap Karakteristik Tiap Cluster (Terstandarisasi)",
    x = "Variabel",
    y = "Cluster"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank()
  )

ggplot(heat_data_scaled, aes(x = Variabel, y = Rata2_Scaled, fill = cluster)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_brewer(palette = "Set2") +
labs(title = "Bar Plot Karakteristik Cluster",
x = "Variabel", y = "Rata-rata Nilai Terstandarisasi") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

Berdasarkan hasil visualisasi heatmap dan bar plot, ketiga klaster memiliki karakteristik yang berbeda-beda

Cluster 1 menunjukkan nilai rata-rata terendah hampir di seluruh variabel, termasuk Alcohol, Flavanoids, dan Proline. Hal ini mengindikasikan jenis wine yang lebih ringan, kadar alkohol rendah, serta rasa yang tidak terlalu kompleks.

Cluster 2 memiliki nilai tinggi pada Malic_Acid dan Color_Intensity namun rendah pada Flavanoids dan Hue. Artinya, wine dalam klaster ini cenderung lebih asam dan berwarna pekat, dengan rasa yang kuat namun kurang lembut.

Cluster 3 memiliki nilai tertinggi pada Alcohol, Flavanoids, Total Phenols, dan Proline. Klaster ini merepresentasikan wine dengan kandungan alkohol dan fenol tinggi, rasa lebih kaya, kompleks, dan seimbang, serta kemungkinan berasal dari anggur dengan kualitas lebih baik.