library(psych)
## Warning: package 'psych' was built under R version 4.4.3
library(GPArotation)
## Warning: package 'GPArotation' was built under R version 4.4.3
##
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
##
## equamax, varimin
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cluster)
library(clusterCrit)
## Warning: package 'clusterCrit' was built under R version 4.4.3
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
library(scales)
## Warning: package 'scales' was built under R version 4.4.2
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
library(RColorBrewer)
data <- read.csv("C:/Users/Ainul Hayati/Documents/kuliah/Departemen/Semester 5/Teknik Peubah Ganda/wine-clustering.csv")
head(data)
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## 1 14.23 1.71 2.43 15.6 127 2.80 3.06
## 2 13.20 1.78 2.14 11.2 100 2.65 2.76
## 3 13.16 2.36 2.67 18.6 101 2.80 3.24
## 4 14.37 1.95 2.50 16.8 113 3.85 3.49
## 5 13.24 2.59 2.87 21.0 118 2.80 2.69
## 6 14.20 1.76 2.45 15.2 112 3.27 3.39
## Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue OD280 Proline
## 1 0.28 2.29 5.64 1.04 3.92 1065
## 2 0.26 1.28 4.38 1.05 3.40 1050
## 3 0.30 2.81 5.68 1.03 3.17 1185
## 4 0.24 2.18 7.80 0.86 3.45 1480
## 5 0.39 1.82 4.32 1.04 2.93 735
## 6 0.34 1.97 6.75 1.05 2.85 1450
data <- data %>% select(where(is.numeric)) %>% na.omit()
data_scaled <- scale(data)
fviz_nbclust(data_scaled, kmeans, method = "wss") +
labs(title = "Elbow Method")
fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
labs(title = "Silhouette Method")
set.seed(070)
kmeans_result <- kmeans(data_scaled, centers = 3, nstart = 25)
kmeans_result
## K-means clustering with 3 clusters of sizes 65, 51, 62
##
## Cluster means:
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## 1 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## 2 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## 3 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## 2 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## 3 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## OD280 Proline
## 1 0.2700025 -0.7517257
## 2 -1.2887761 -0.4059428
## 3 0.7770551 1.1220202
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 3 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 558.6971 326.3537 385.6983
## (between_SS / total_SS = 44.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
table(kmeans_result$cluster)
##
## 1 2 3
## 65 51 62
head(data.frame(Observasi = 1:length(kmeans_result$cluster),
Cluster = kmeans_result$cluster),10)
## Observasi Cluster
## 1 1 3
## 2 2 3
## 3 3 3
## 4 4 3
## 5 5 3
## 6 6 3
## 7 7 3
## 8 8 3
## 9 9 3
## 10 10 3
names(kmeans_result)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmeans_result$cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 3 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
kmeans_result$totss
## [1] 2301
kmeans_result$withinss
## [1] 558.6971 326.3537 385.6983
kmeans_result$tot.withinss
## [1] 1270.749
kmeans_result$betweenss
## [1] 1030.251
kmeans_result$iter
## [1] 2
cluster_km <- kmeans_result$cluster
dist_matrix <- dist(data_scaled, method = "euclidean")
sil <- mean(silhouette(cluster_km, dist_matrix)[, 3])
ch <- intCriteria(traj = as.matrix(data_scaled),
part = cluster_km,
crit = "Calinski_Harabasz")$calinski_harabasz
dunn <- intCriteria(traj = as.matrix(data_scaled),
part = cluster_km,
crit = "Dunn")$dunn
data.frame(Silhouette = sil,
Calinski_Harabasz = ch,
Dunn = dunn)
## Silhouette Calinski_Harabasz Dunn
## 1 0.2848589 70.94001 0.2322567
Nilai Silhouette = 0.2849 rendah, artinya sebagian observasi masih berdekatan dengan cluster lain. Nilai Calinski-Harabasz (CH) = 70.94 menunjukkan variasi antar cluster lebih tinggi dibanding variasi di dalam cluster. Nilai Dunn Index = 0.232 rendah, menunjukkan bahwa jarak antar cluster tidak terlalu besar dibanding diameter cluster.
data_clustered <- data.frame(data, cluster = factor(cluster_km))
cluster_summary <- data_clustered %>%
group_by(cluster) %>%
summarise(across(where(is.numeric), mean, .names = "{.col}"))
cluster_summary
## # A tibble: 3 × 14
## cluster Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 12.3 1.90 2.23 20.1 92.7 2.25
## 2 2 13.1 3.31 2.42 21.2 98.7 1.68
## 3 3 13.7 2.00 2.47 17.5 108. 2.85
## # ℹ 7 more variables: Flavanoids <dbl>, Nonflavanoid_Phenols <dbl>,
## # Proanthocyanins <dbl>, Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>,
## # Proline <dbl>
Cluster 1: Alcohol rendah (12.25),
Total_Phenols & Flavanoids sedang (2.24
& 2.05), Color_Intensity rendah (2.97),
Proline rendah (510). Wine dengan kadar alkohol dan warna
paling ringan, fenolik sedang (), cenderung lebih ringan dan lembut.
Cluster 2: Malic_Acid tertinggi (3.30),
Color_Intensity tertinggi (7.23), Flavanoids
sangat rendah (0.82), Hue rendah (0.69),
Proline sedang (619). Wine berwarna kuat tapi tidak terlalu
lembut, memiliki keasaman tinggi dan sedikit flavonoid. Bertipe lebih
asam, lebih pekat dan kurang manis.
Cluster 3: Alcohol dan
Proline tertinggi (13.68 & 1100),
Total_Phenols dan Flavanoids tinggi (2.85
& 3.00), Color_Intensity tinggi (5.45). Wine dengan
kadar alkohol tinggi, warna kuat, dan kaya fenol/flavonoid. Cenderung
lebih kompleks, berkualitas tinggi, dan kaya rasa.
fviz_cluster(kmeans_result, data = data_scaled,
palette = "jco", geom = "point",
main = "Visualisasi Hasil K-Means Clustering (3 Cluster)")
Nilai Dim1 (36.2%) dan Dim2 (19.2%) menunjukkan proporsi keragaman data yang dijelaskan oleh dua komponen utama pertama hasil reduksi dimensi (PCA). Kedua dimensi ini secara bersama menjelaskan sekitar 55.4% variasi total dari keseluruhan data.
heat_data_scaled <- cluster_summary %>%
pivot_longer(-cluster, names_to = "Variabel", values_to = "Rata2") %>%
group_by(Variabel) %>%
mutate(Rata2_Scaled = scales::rescale(Rata2, to = c(0, 1))) %>%
ungroup()
heat_data_scaled
## # A tibble: 39 × 4
## cluster Variabel Rata2 Rata2_Scaled
## <fct> <chr> <dbl> <dbl>
## 1 1 Alcohol 12.3 0
## 2 1 Malic_Acid 1.90 0
## 3 1 Ash 2.23 0
## 4 1 Ash_Alcanity 20.1 0.688
## 5 1 Magnesium 92.7 0
## 6 1 Total_Phenols 2.25 0.484
## 7 1 Flavanoids 2.05 0.564
## 8 1 Nonflavanoid_Phenols 0.358 0.410
## 9 1 Proanthocyanins 1.62 0.616
## 10 1 Color_Intensity 2.97 0
## # ℹ 29 more rows
ggplot(heat_data_scaled, aes(x = Variabel, y = cluster, fill = Rata2_Scaled)) +
geom_tile(color = "white", linewidth = 0.8) +
scale_fill_gradientn(
colors = c("#56B1F7", "white", "#FF6B6B"),
name = "Skor Relatif"
) +
labs(
title = "Heatmap Karakteristik Tiap Cluster (Terstandarisasi)",
x = "Variabel",
y = "Cluster"
) +
theme_minimal(base_size = 13) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid = element_blank()
)
ggplot(heat_data_scaled, aes(x = Variabel, y = Rata2_Scaled, fill = cluster)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_brewer(palette = "Set2") +
labs(title = "Bar Plot Karakteristik Cluster",
x = "Variabel", y = "Rata-rata Nilai Terstandarisasi") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Berdasarkan hasil visualisasi heatmap dan bar plot, ketiga klaster memiliki karakteristik yang berbeda-beda
Cluster 1 menunjukkan nilai rata-rata terendah
hampir di seluruh variabel, termasuk Alcohol,
Flavanoids, dan Proline. Hal ini
mengindikasikan jenis wine yang lebih ringan, kadar alkohol rendah,
serta rasa yang tidak terlalu kompleks.
Cluster 2 memiliki nilai tinggi pada
Malic_Acid dan Color_Intensity namun rendah
pada Flavanoids dan Hue. Artinya, wine dalam
klaster ini cenderung lebih asam dan berwarna pekat, dengan rasa yang
kuat namun kurang lembut.
Cluster 3 memiliki nilai tertinggi pada
Alcohol, Flavanoids,
Total Phenols, dan Proline. Klaster ini
merepresentasikan wine dengan kandungan alkohol dan fenol tinggi, rasa
lebih kaya, kompleks, dan seimbang, serta kemungkinan berasal dari
anggur dengan kualitas lebih baik.