library(tidyverse)  # data wrangling and ggplot2
library(cluster)    # clusGap() for the gap statistic
library(factoextra) # fviz_nbclust(), fviz_cluster(), fviz_gap_stat()
library(corrplot)   # correlation matrix plots
library(lares)      # corr_cross() for ranked cross-correlations
library(fpc)        # clusterboot() for bootstrap cluster stability
library(caret)      # confusionMatrix()
# Read in data and reformat variable names
data <- read_csv("data/wine.data", col_names = c(
"true_clusters",
"Alcohol",
"Malic acid",
"Ash",
"Alcalinity of ash",
"Magnesium",
"Total phenols",
"Flavanoids",
"Nonflavanoid phenols",
"Proanthocyanins",
"Color intensity",
"Hue",
"OD280/OD315 of diluted wines",
"Proline")
) %>%
  rename_with(tolower) %>%
  rename_with(~ gsub("\\s+|\\.", "_", .x)) %>%
  dplyr::select(-"true_clusters") # Drop the known labels (1-3) so the clustering is unsupervised
# Outputs descriptive statistics
summary_df <-
  data.frame(mean = sapply(data, mean),
             variance = sapply(data, var),
             sd = sapply(data, sd)) %>%
  round(3)
summary_df
## mean variance sd
## alcohol 13.001 0.659 0.812
## malic_acid 2.336 1.248 1.117
## ash 2.367 0.075 0.274
## alcalinity_of_ash 19.495 11.153 3.340
## magnesium 99.742 203.989 14.282
## total_phenols 2.295 0.392 0.626
## flavanoids 2.029 0.998 0.999
## nonflavanoid_phenols 0.362 0.015 0.124
## proanthocyanins 1.591 0.328 0.572
## color_intensity 5.058 5.374 2.318
## hue 0.957 0.052 0.229
## od280/od315_of_diluted_wines 2.612 0.504 0.710
## proline 746.893 99166.717 314.907
# Correlation matrix
corrplot(cor(data, method = "spearman"), method = "circle", type = "lower", diag = FALSE)
# Top 10 strongest statistically significant correlations
corr_cross(data, max_pvalue = 0.05, top = 10)
# Histograms with kernel density overlays for all variables
data %>%
  pivot_longer(everything()) %>%
  ggplot(aes(x = value)) +
  geom_histogram(aes(y = after_stat(density)), color = "white", fill = "black") +
  geom_density(color = "blue", alpha = 0.1, linewidth = 1) +
  facet_wrap(~ name, scales = "free") +
  theme_light()
# Useful for comparing the distributions of the variables when prepping for k-means clustering.
data %>%
  scale(center = TRUE, scale = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(everything()) %>%
  ggplot(aes(x = name, y = value)) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 15, color = "red", fill = "red") +
  geom_boxplot(fill = "white") +
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  geom_hline(yintercept = 0,
             color = "red", linewidth = 1)
# Red dot = mean; box midline = median; the red horizontal line marks the scaled mean of 0
set.seed(24398)
# Scaling variables to ensure proper comparisons for clustering
# mean = 0, sd = 1
data_scaled <- scale(data, center = TRUE, scale = TRUE)
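As a quick sanity check, every standardized column should now have mean 0 and standard deviation 1 (up to floating-point rounding):
# Verify the standardization
round(colMeans(data_scaled), 3)
round(apply(data_scaled, 2, sd), 3)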
The goal of the elbow method is to determine the ideal number of clusters by minimizing the within-cluster sum of squares (SSE) to a reasonable extent. Since the SSE falls to zero as \(k\) tends to \(n\), the “elbow” of the curve, the point beyond which adding another cluster yields only a marginal drop in SSE, is taken as the ideal number of clusters.
fviz_nbclust(data_scaled, kmeans, method = "wss", k.max = 15)
# Pronounced bend (elbow) noted at k=3
Similarly, the average silhouette method seeks to determine the ideal number of clusters, but does so by calculating the “silhouette.” Briefly, the silhouette measures how similar observation \(i\) is to its own cluster and how dissimilar it is from the other clusters, yielding a score between \(-1\) and \(+1\). Values near \(+1\) indicate good clustering (high internal similarity and high external dissimilarity), while negative values indicate poor clustering (the observation likely sits closer to a neighboring cluster than to its own). The silhouette may be calculated with various distance metrics, such as the Manhattan distance or the Euclidean distance; here, the latter is used.
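For reference, the silhouette of observation \(i\) is \(s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}}\), where \(a(i)\) is the mean distance from \(i\) to the other members of its own cluster and \(b(i)\) is the mean distance from \(i\) to the members of the nearest other cluster; the average silhouette is simply the mean of \(s(i)\) over all observations.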
fviz_nbclust(data_scaled, kmeans, method = "silhouette", k.max = 15)
# Average silhouette width is maximized at k=3
The gap statistic method is arguably the most recent of the three approaches, proposed by Tibshirani, Walther, and Hastie around 2000. Briefly, it computes all pairwise distances between the points within each cluster and takes the log of the pooled within-cluster dispersion. It then generates \(B\) reference datasets from a null distribution with no underlying cluster structure (here, uniform over a box aligned with the data’s principal components, per spaceH0 = "scaledPCA" in the output below) and repeats the calculation on each. The value of \(k\) that maximizes the gap between the reference and observed values is chosen as ideal.
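Formally, \(\mathrm{Gap}(k) = E^{*}[\log W_k] - \log W_k\), where \(W_k\) is the pooled within-cluster dispersion and \(E^{*}\) denotes the expectation under the reference (null) distribution; these two terms appear as E.logW and logW in the clusGap output below.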
gap_stat <- clusGap(data_scaled, FUN = kmeans, nstart = 25,
K.max = 10, B = 100)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = data_scaled, FUNcluster = kmeans, K.max = 10, B = 100, nstart = 25)
## B=100 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 3
## logW E.logW gap SE.sim
## [1,] 5.377557 5.864029 0.4864727 0.01309903
## [2,] 5.203497 5.756745 0.5532483 0.01214727
## [3,] 5.066929 5.694581 0.6276522 0.01180668
## [4,] 5.023946 5.648222 0.6242763 0.01167275
## [5,] 4.989519 5.611009 0.6214900 0.01142308
## [6,] 4.961090 5.578753 0.6176629 0.01133593
## [7,] 4.935551 5.550707 0.6151559 0.01153793
## [8,] 4.902342 5.525309 0.6229665 0.01147364
## [9,] 4.876035 5.502051 0.6260160 0.01155692
## [10,] 4.850386 5.480666 0.6302799 0.01147605
fviz_gap_stat(gap_stat)
# Under the 'firstmax' rule, the gap statistic selects k=3 (its first local maximum)
All three methods point to k = 3, so we proceed with that choice.
# nstart sets the number of random initial configurations; kmeans() keeps the best. 25 is a common recommendation.
k3 <- kmeans(data_scaled, centers = 3, nstart = 25)
# Visual of clusters (with more than two variables, fviz_cluster plots the first two principal components)
fviz_cluster(k3, data = data_scaled, ggtheme = theme_light())
# Examining how the clustering changes visually as k increases from 1 to 15
k_s <- lapply(1:15, function(x) kmeans(data_scaled, centers = x, nstart = 25))
plots_of_k_s <- lapply(k_s, function(x) fviz_cluster(x, geom = "point", data = data_scaled, ggtheme = theme_light()))
plots_of_k_s
## (output: 15 cluster plots, one for each k = 1..15, omitted)
# As k -> 15, clusters increasingly overlap; for k < 3, points sit far from their cluster centers
Bootstrap resampling allows us to assess whether the clusters we propose would hold up in “other” datasets; this is often referred to as the stability of a cluster. The approach implemented below relies on the Jaccard similarity coefficient, \(J(A, B) = \frac{|A\cap B|}{|A\cup B|}\). As a rule of thumb, Jaccard coefficients that are:
\(J \leq 0.6\) are considered unstable (i.e. the proposed cluster does not hold up to plausible sampling variation);
\(0.6 < J \leq 0.75\) are considered somewhat stable, though the cluster’s membership is not clear-cut;
\(0.75 < J\) are considered stable and likely to reflect real clusters.
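As a concrete illustration of the coefficient, computed on two small hypothetical membership index sets (not part of the analysis):
# Hypothetical memberships of "the same" cluster across two resampled runs
a <- c(1, 2, 3, 4, 5)
b <- c(3, 4, 5, 6)
length(intersect(a, b)) / length(union(a, b)) # J = 3/6 = 0.5, i.e. unstable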
bs_cluster <- clusterboot(data_scaled, B = 1e3, bootmethod = "boot", bscompare = TRUE, clustermethod = kmeansCBI, seed = 123, k = 3)
bs_cluster
## * Cluster stability assessment *
## Cluster method: kmeans
## Full clustering results are given as parameter result
## of the clusterboot object, which also provides further statistics
## of the resampling results.
## Number of resampling runs: 1000
##
## Number of clusters found in data: 3
##
## Clusterwise Jaccard bootstrap (omitting multiple points) mean:
## [1] 0.9751427 0.9671823 0.9505411
## dissolved:
## [1] 5 5 13
## recovered:
## [1] 990 988 986
The Jaccard means (all above 0.95) indicate highly stable clusters, which is corroborated by the high recovery counts (986-990 of 1000 runs) and the few dissolutions.
# Re-read the data with the same renaming, this time keeping true_clusters
data_for_conf_mat <- read_csv("data/wine.data", col_names = c(
"true_clusters",
"Alcohol",
"Malic acid",
"Ash",
"Alcalinity of ash",
"Magnesium",
"Total phenols",
"Flavanoids",
"Nonflavanoid phenols",
"Proanthocyanins",
"Color intensity",
"Hue",
"OD280/OD315 of diluted wines",
"Proline")
) %>%
  rename_with(tolower) %>%
  rename_with(~ gsub("\\s+|\\.", "_", .x)) %>%
mutate("clusters_modified" = case_when(
true_clusters == 1 ~ 2,
true_clusters == 2 ~ 3,
true_clusters == 3 ~ 1
))
# k-means assigns arbitrary cluster numbers, so its labels need not match the
# dataset's original numbering. The recoding above aligns them by inspection.
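Rather than relying on a guess, the relabeling can also be derived from the contingency table by assigning each k-means cluster to the true class it overlaps most. A minimal sketch (auto_map and auto_predicted are illustrative names, not part of the original analysis):
# Cross-tabulate k-means labels against the original true labels
tab <- table(kmeans = k3$cluster, truth = data_for_conf_mat$true_clusters)
auto_map <- apply(tab, 1, which.max) # k-means label -> dominant true class
auto_predicted <- as.factor(auto_map[k3$cluster])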
predicted_clusters <- as.factor(k3$cluster)
true_clusters <- as.factor(data_for_conf_mat$clusters_modified)
confusionMatrix(data=predicted_clusters, reference = true_clusters)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 48 0 3
## 2 0 59 3
## 3 0 0 65
##
## Overall Statistics
##
## Accuracy : 0.9663
## 95% CI : (0.9281, 0.9875)
## No Information Rate : 0.3989
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9491
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000 1.0000 0.9155
## Specificity 0.9769 0.9748 1.0000
## Pos Pred Value 0.9412 0.9516 1.0000
## Neg Pred Value 1.0000 1.0000 0.9469
## Prevalence 0.2697 0.3315 0.3989
## Detection Rate 0.2697 0.3315 0.3652
## Detection Prevalence 0.2865 0.3483 0.3652
## Balanced Accuracy 0.9885 0.9874 0.9577
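Since the confusion matrix depends on getting the label correspondence right, a label-permutation-invariant agreement measure makes a useful cross-check. fpc is already loaded, so a minimal sketch using its corrected (adjusted) Rand index:
# Adjusted Rand index: 1 = perfect agreement, ~0 = agreement expected by chance
cluster.stats(d = dist(data_scaled),
              clustering = k3$cluster,
              alt.clustering = as.integer(data_for_conf_mat$true_clusters))$corrected.rand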
data %>%
as.data.frame() %>%
mutate(cluster = k3$cluster) %>%
group_by(cluster) %>%
  summarise_all(list(mean = mean)) # Swap in other summaries, e.g. list(min = min, max = max, var = var)
## # A tibble: 3 × 14
## cluster alcohol_mean malic_a…¹ ash_m…² alcal…³ magne…⁴ total…⁵ flava…⁶ nonfl…⁷
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 13.1 3.31 2.42 21.2 98.7 1.68 0.819 0.452
## 2 2 13.7 2.00 2.47 17.5 108. 2.85 3.00 0.292
## 3 3 12.3 1.90 2.23 20.1 92.7 2.25 2.05 0.358
## # … with 5 more variables: proanthocyanins_mean <dbl>,
## # color_intensity_mean <dbl>, hue_mean <dbl>,
## # `od280/od315_of_diluted_wines_mean` <dbl>, proline_mean <dbl>, and
## # abbreviated variable names ¹malic_acid_mean, ²ash_mean,
## # ³alcalinity_of_ash_mean, ⁴magnesium_mean, ⁵total_phenols_mean,
## # ⁶flavanoids_mean, ⁷nonflavanoid_phenols_mean
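Since the raw variables sit on very different scales (proline’s variance dwarfs the rest), profiling the clusters on the standardized data can make the comparison across variables more direct; a small variation on the summary above:
# Cluster means on the standardized scale (0 = overall mean of each variable)
as.data.frame(data_scaled) %>%
  mutate(cluster = k3$cluster) %>%
  group_by(cluster) %>%
  summarise_all(mean)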