## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Rows: 178 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): Alcohol, Malic_Acid, Ash, Ash_Alcanity, Magnesium, Total_Phenols, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 6 × 13
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14.2 1.71 2.43 15.6 127 2.8 3.06
## 2 13.2 1.78 2.14 11.2 100 2.65 2.76
## 3 13.2 2.36 2.67 18.6 101 2.8 3.24
## 4 14.4 1.95 2.5 16.8 113 3.85 3.49
## 5 13.2 2.59 2.87 21 118 2.8 2.69
## 6 14.2 1.76 2.45 15.2 112 3.27 3.39
## # ℹ 6 more variables: Nonflavanoid_Phenols <dbl>, Proanthocyanins <dbl>,
## # Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>, Proline <dbl>
## corrplot 0.95 loaded
cor_mat <- cor(df, use = "pairwise.complete.obs", method = "pearson")
corrplot(cor_mat, method = "color", type = "upper", tl.col = "black", addCoef.col = "black", number.cex = 0.6)library(dplyr)
keep <- c(
"Alcohol","Malic_Acid","Ash_Alcanity","Magnesium",
"Flavanoids","Nonflavanoid_Phenols","Proanthocyanins",
"Color_Intensity","Proline"
)
df_clean <- df %>% select(all_of(intersect(keep, names(df))))
head(df_clean)## # A tibble: 6 × 9
## Alcohol Malic_Acid Ash_Alcanity Magnesium Flavanoids Nonflavanoid_Phenols
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14.2 1.71 15.6 127 3.06 0.28
## 2 13.2 1.78 11.2 100 2.76 0.26
## 3 13.2 2.36 18.6 101 3.24 0.3
## 4 14.4 1.95 16.8 113 3.49 0.24
## 5 13.2 2.59 21 118 2.69 0.39
## 6 14.2 1.76 15.2 112 3.39 0.34
## # ℹ 3 more variables: Proanthocyanins <dbl>, Color_Intensity <dbl>,
## # Proline <dbl>
## Alcohol Malic_Acid Ash_Alcanity Magnesium Flavanoids Nonflavanoid_Phenols
## 1 1.5143408 -0.56066822 -1.1663032 1.90852151 1.0319081 -0.6577078
## 2 0.2455968 -0.49800856 -2.4838405 0.01809398 0.7315653 -0.8184106
## 3 0.1963252 0.02117152 -0.2679823 0.08810981 1.2121137 -0.4970050
## 4 1.6867914 -0.34583508 -0.8069748 0.92829983 1.4623994 -0.9791134
## 5 0.2948684 0.22705328 0.4506745 1.27837900 0.6614853 0.2261576
## 6 1.4773871 -0.51591132 -1.2860793 0.85828399 1.3622851 -0.1755994
## Proanthocyanins Color_Intensity Proline
## 1 1.2214385 0.2510088 1.01015939
## 2 -0.5431887 -0.2924962 0.96252635
## 3 2.1299594 0.2682629 1.39122370
## 4 1.0292513 1.1827317 2.32800680
## 5 0.4002753 -0.3183774 -0.03776747
## 6 0.6623487 0.7298108 2.23274072
## [1] 3.232780 2.688085 2.023915 2.852088 1.870516 2.451591
##
## Call:
## hclust(d = dist_matrix, method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 178
k from Potential ks from Dendrogram## Warning: package 'cluster' was built under R version 4.4.3
## Warning: package 'clusterCrit' was built under R version 4.4.3
ks <- 2:5
sizes_list <- list()
results <- data.frame(
k = integer(),
Silhouette = numeric(),
Calinski_Harabasz = numeric(),
Dunn = numeric(),
stringsAsFactors = FALSE
)
for (k in ks) {
cl <- cutree(hc, k = k)
sizes_list[[as.character(k)]] <- table(cl)
sil <- silhouette(cl, dist_matrix)
sil_mean <- mean(sil[, "sil_width"])
crit <- intCriteria(as.matrix(df_scaled), as.integer(cl),
c("Calinski_Harabasz", "Dunn"))
results <- rbind(results, data.frame(
k = k,
Silhouette = sil_mean,
Calinski_Harabasz = crit$calinski_harabasz,
Dunn = crit$dunn
))
}
for (k in ks) {
cat("\n--- k =", k, "cluster sizes ---\n")
print(sizes_list[[as.character(k)]])
}##
## --- k = 2 cluster sizes ---
## cl
## 1 2
## 64 114
##
## --- k = 3 cluster sizes ---
## cl
## 1 2 3
## 64 67 47
##
## --- k = 4 cluster sizes ---
## cl
## 1 2 3 4
## 57 7 67 47
##
## --- k = 5 cluster sizes ---
## cl
## 1 2 3 4 5
## 57 7 48 19 47
## k Silhouette Calinski_Harabasz Dunn
## 1 2 0.2620687 65.06705 0.1608956
## 2 3 0.2805325 65.58287 0.1752991
## 3 4 0.2878328 52.42604 0.1864274
## 4 5 0.2334873 45.41841 0.1755590
koptimal_k <- 3
final_cl <- cutree(hc, k = optimal_k)
final_sil <- silhouette(final_cl, dist_matrix)
final_sil_mean <- mean(final_sil[, "sil_width"])
cat("Final Silhouette Score for k =", optimal_k, "is", final_sil_mean)## Final Silhouette Score for k = 3 is 0.2805325
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(list(data = df_scaled, cluster = final_cl),
geom = "point",
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal(),
main = "Cluster Visualization for Hierarchical Clustering (k=3)")## # A tibble: 178 × 10
## Alcohol Malic_Acid Ash_Alcanity Magnesium Flavanoids Nonflavanoid_Phenols
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14.2 1.71 15.6 127 3.06 0.28
## 2 13.2 1.78 11.2 100 2.76 0.26
## 3 13.2 2.36 18.6 101 3.24 0.3
## 4 14.4 1.95 16.8 113 3.49 0.24
## 5 13.2 2.59 21 118 2.69 0.39
## 6 14.2 1.76 15.2 112 3.39 0.34
## 7 14.4 1.87 14.6 96 2.52 0.3
## 8 14.1 2.15 17.6 121 2.51 0.31
## 9 14.8 1.64 14 97 2.98 0.29
## 10 13.9 1.35 16 98 3.15 0.22
## # ℹ 168 more rows
## # ℹ 4 more variables: Proanthocyanins <dbl>, Color_Intensity <dbl>,
## # Proline <dbl>, Cluster <fct>
cluster_profile <- df_final %>%
group_by(Cluster) %>%
summarise(across(everything(), list(mean = mean, sd = sd), .names = "{col}_{fn}"))
cluster_profile## # A tibble: 3 × 19
## Cluster Alcohol_mean Alcohol_sd Malic_Acid_mean Malic_Acid_sd
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 13.6 0.603 1.97 0.684
## 2 2 12.3 0.535 1.86 0.954
## 3 3 13.2 0.541 3.52 0.956
## # ℹ 14 more variables: Ash_Alcanity_mean <dbl>, Ash_Alcanity_sd <dbl>,
## # Magnesium_mean <dbl>, Magnesium_sd <dbl>, Flavanoids_mean <dbl>,
## # Flavanoids_sd <dbl>, Nonflavanoid_Phenols_mean <dbl>,
## # Nonflavanoid_Phenols_sd <dbl>, Proanthocyanins_mean <dbl>,
## # Proanthocyanins_sd <dbl>, Color_Intensity_mean <dbl>,
## # Color_Intensity_sd <dbl>, Proline_mean <dbl>, Proline_sd <dbl>
(Courtesy of GPT since I don’t understand wines)
k Using Elbow Method and Silhouette Scorewss <- numeric()
ks_kmeans <- 1:10
for (k in ks_kmeans) {
kmeans_model <- kmeans(df_scaled, centers = k, nstart = 25)
wss[k] <- kmeans_model$tot.withinss
}
plot(ks_kmeans, wss, type = "b", pch = 19,
xlab = "Number of clusters k",
ylab = "Total within-clusters sum of squares",
main = "Elbow Method for Determining Optimal k")sil_scores <- numeric()
for (k in ks_kmeans[-1]) {
kmeans_model <- kmeans(df_scaled, centers = k, nstart = 25)
sil <- silhouette(kmeans_model$cluster, dist_matrix)
sil_scores[k] <- mean(sil[, "sil_width"])
}
plot(ks_kmeans[-1], sil_scores[-1], type = "b", pch = 19,
xlab = "Number of clusters k",
ylab = "Average Silhouette Score",
main = "Silhouette Scores for Determining Optimal k")k and Evaluateoptimal_k_kmeans <- 3
kmeans_final <- kmeans(df_scaled, centers = optimal_k_kmeans, nstart = 25)
kmeans_final## K-means clustering with 3 clusters of sizes 69, 62, 47
##
## Cluster means:
## Alcohol Malic_Acid Ash_Alcanity Magnesium Flavanoids
## 1 -0.8905065 -0.3707433 0.2132921 -0.51869408 0.03410259
## 2 0.8314919 -0.3306755 -0.7142449 0.62565074 0.90030626
## 3 0.2104777 0.9804929 0.6290645 -0.06383944 -1.23770355
## Nonflavanoid_Phenols Proanthocyanins Color_Intensity Proline
## 1 0.05846771 -0.05145304 -0.8360012 -0.7568653
## 2 -0.61882809 0.62007876 0.1474144 1.1308810
## 3 0.73049084 -0.74243880 1.0328594 -0.3806578
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2
## [75] 1 1 1 1 2 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 384.2362 280.0128 224.5865
## (between_SS / total_SS = 44.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
final_sil_kmeans <- silhouette(kmeans_final$cluster, dist_matrix)
final_sil_mean_kmeans <- mean(final_sil_kmeans[, "sil_width"])
cat("Final Silhouette Score for K-Means k =", optimal_k_kmeans, "is", final_sil_mean_kmeans)## Final Silhouette Score for K-Means k = 3 is 0.292841
fviz_cluster(kmeans_final, data = df_scaled,
geom = "point",
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal(),
main = "Cluster Visualization for K-Means Clustering (k=3)")## # A tibble: 178 × 10
## Alcohol Malic_Acid Ash_Alcanity Magnesium Flavanoids Nonflavanoid_Phenols
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14.2 1.71 15.6 127 3.06 0.28
## 2 13.2 1.78 11.2 100 2.76 0.26
## 3 13.2 2.36 18.6 101 3.24 0.3
## 4 14.4 1.95 16.8 113 3.49 0.24
## 5 13.2 2.59 21 118 2.69 0.39
## 6 14.2 1.76 15.2 112 3.39 0.34
## 7 14.4 1.87 14.6 96 2.52 0.3
## 8 14.1 2.15 17.6 121 2.51 0.31
## 9 14.8 1.64 14 97 2.98 0.29
## 10 13.9 1.35 16 98 3.15 0.22
## # ℹ 168 more rows
## # ℹ 4 more variables: Proanthocyanins <dbl>, Color_Intensity <dbl>,
## # Proline <dbl>, Cluster <fct>
kmeans_cluster_profile <- df_kmeans_final %>%
group_by(Cluster) %>%
summarise(across(everything(), list(mean = mean, sd = sd), .names = "{col}_{fn}"))
kmeans_cluster_profile## # A tibble: 3 × 19
## Cluster Alcohol_mean Alcohol_sd Malic_Acid_mean Malic_Acid_sd
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 12.3 0.544 1.92 0.997
## 2 2 13.7 0.546 1.97 0.695
## 3 3 13.2 0.526 3.43 1.01
## # ℹ 14 more variables: Ash_Alcanity_mean <dbl>, Ash_Alcanity_sd <dbl>,
## # Magnesium_mean <dbl>, Magnesium_sd <dbl>, Flavanoids_mean <dbl>,
## # Flavanoids_sd <dbl>, Nonflavanoid_Phenols_mean <dbl>,
## # Nonflavanoid_Phenols_sd <dbl>, Proanthocyanins_mean <dbl>,
## # Proanthocyanins_sd <dbl>, Color_Intensity_mean <dbl>,
## # Color_Intensity_sd <dbl>, Proline_mean <dbl>, Proline_sd <dbl>
(Once again, courtesy of GPT since I don’t understand wines)