Dependencies

library(tidyverse)
library(cluster)
library(factoextra)
library(corrplot) 
library(lares)
library(fpc)
library(caret)

Calling in and cleaning data

# Read in data and reformat variable names 
data <- read_csv("data/wine.data", col_names = c(
  "true_clusters", 
  "Alcohol",
  "Malic acid",
  "Ash",
  "Alcalinity of ash",
  "Magnesium",
  "Total phenols",
  "Flavanoids",
  "Nonflavanoid phenols",
  "Proanthocyanins",
  "Color intensity",
  "Hue",
  "OD280/OD315 of diluted wines",
  "Proline")
) %>%
  rename_all(., .funs = tolower) %>%
  select_all(~gsub("\\s+|\\.", "_", .)) %>%
  dplyr::select(-"true_clusters") # Removed true_clusters variable that ranges from 1-3 

EDA

# Descriptive statistics (mean, variance, standard deviation) for each variable
summary_df <- 
  data.frame(mean = sapply(data, mean), 
             variance = sapply(data, var), 
             sd = sapply(data, sd)) %>%
  round(3)

summary_df 
##                                 mean  variance      sd
## alcohol                       13.001     0.659   0.812
## malic_acid                     2.336     1.248   1.117
## ash                            2.367     0.075   0.274
## alcalinity_of_ash             19.495    11.153   3.340
## magnesium                     99.742   203.989  14.282
## total_phenols                  2.295     0.392   0.626
## flavanoids                     2.029     0.998   0.999
## nonflavanoid_phenols           0.362     0.015   0.124
## proanthocyanins                1.591     0.328   0.572
## color_intensity                5.058     5.374   2.318
## hue                            0.957     0.052   0.229
## od280/od315_of_diluted_wines   2.612     0.504   0.710
## proline                      746.893 99166.717 314.907
# Correlation matrix 
corrplot(cor(data, method = "spearman"), method = "circle", type = "lower", diag = FALSE)

# Strongest top 10 statistically significant correlations 
corr_cross(data, max_pvalue = 0.05, top = 10)

# Kernel density plots of all variables 
data %>%
  pivot_longer(colnames(data)) %>%
  ggplot(., aes(x = value)) + 
  geom_histogram(aes(y = after_stat(density)), color = "white", fill = "black") + 
  geom_density(color = "blue", alpha = 0.1, linewidth = 1) + 
  facet_wrap(~ name, scales = "free") + 
  theme_light()

# Useful for comparing the distributions of the variables when preparing for k-means clustering. 
data %>%
  scale(., center = TRUE, scale = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(colnames(data)) %>%
  ggplot(., aes(x = name, y = value)) + 
  stat_summary(fun = mean, geom = "point", shape = 20, size = 15, color = "red", fill = "red") + 
  geom_boxplot(fill = "white") + 
  theme_light() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
  coord_flip() + 
  geom_hline(yintercept = 0, 
             color = "red", linewidth = 1)

# red dot is mean, line is median 

Determining the number of clusters

set.seed(24398)
# Scaling variables to ensure proper comparisons for clustering 
# mean = 0, sd = 1

data_scaled <- scale(data, center = TRUE, scale = TRUE)
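A quick sanity check that the scaling behaved as intended (each column should now have mean 0 and standard deviation 1):

# Column means should be approximately 0 and standard deviations exactly 1 after scaling
round(colMeans(data_scaled), 3)
round(apply(data_scaled, 2, sd), 3)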

Elbow method

The goal of the elbow method is to determine a reasonable number of clusters by minimizing the within-cluster sum of squares (SSE). Because the SSE decreases monotonically and eventually reaches zero as k tends to n, the point at which the curve bends sharply (the "elbow") is taken as the ideal number of clusters.

fviz_nbclust(data_scaled, kmeans, method = "wss", k.max = 15)

# Significant bend noted at k=3 
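For reference, the curve above is essentially the total within-cluster sum of squares returned by kmeans(); a minimal sketch reproducing it directly (assumes data_scaled as defined above):

# Total within-cluster SS for k = 1..15, the quantity plotted for method = "wss"
wss <- sapply(1:15, function(k) kmeans(data_scaled, centers = k, nstart = 25)$tot.withinss)
plot(1:15, wss, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster SS")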

Average silhouette method

Similarly, the average silhouette method seeks to determine the ideal number of clusters, but does so by calculating the "silhouette." Briefly, the silhouette measures how similar observation \(i\) is to its own cluster and how dissimilar it is to the other clusters, yielding a score between \(-1\) and \(+1\). Values near \(+1\) indicate good clustering (high internal similarity and high external dissimilarity), while negative values indicate poor clustering (the observation is, on average, closer to a neighbouring cluster than to its own). The silhouette may be calculated with various distance metrics, such as the Manhattan or Euclidean distance; here, the latter is used.
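Formally, for an observation \(i\), let \(a(i)\) be its mean distance to the other members of its own cluster and \(b(i)\) its mean distance to the members of the nearest other cluster; the silhouette is then \(s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}}\), and the quantity reported below for each \(k\) is the average of \(s(i)\) over all observations.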

fviz_nbclust(data_scaled, kmeans, method = "silhouette",  k.max = 15)

# Average silhouette is maximized at k=3
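The same curve can be reproduced with cluster::silhouette(); a minimal sketch using Euclidean distances (assumes data_scaled as defined above):

# Average silhouette width for k = 2..15
avg_sil <- sapply(2:15, function(k) {
  km <- kmeans(data_scaled, centers = k, nstart = 25)
  mean(silhouette(km$cluster, dist(data_scaled))[, "sil_width"])
})
which.max(avg_sil) + 1  # + 1 because the search starts at k = 2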

Gap statistic method

The gap statistic is arguably the most recent of the three approaches, having been proposed around 2000. Briefly, for each candidate k it computes the within-cluster sums of pairwise distances and takes the log of the resulting pooled measure. It then generates B reference datasets from a null distribution with no underlying cluster structure (here spaceH0 = "scaledPCA", as shown in the output below, rather than bootstrap subsamples of the original data) and repeats the same calculation on each. The value of k at which the gap statistic is maximized (or reaches its first local maximum, depending on the criterion) is chosen as ideal.
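In symbols (following Tibshirani, Walther, and Hastie), with \(W_k\) denoting the normalized within-cluster sum of pairwise distances for \(k\) clusters, the statistic is \(\mathrm{Gap}(k) = E^*\!\left[\log W_k\right] - \log W_k\), where \(E^*\) is the expectation under the reference (no-cluster) distribution; these are the logW, E.logW, and gap columns in the output below.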

gap_stat <- clusGap(data_scaled, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 100)

print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = data_scaled, FUNcluster = kmeans, K.max = 10, B = 100,     nstart = 25)
## B=100 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 3
##           logW   E.logW       gap     SE.sim
##  [1,] 5.377557 5.864029 0.4864727 0.01309903
##  [2,] 5.203497 5.756745 0.5532483 0.01214727
##  [3,] 5.066929 5.694581 0.6276522 0.01180668
##  [4,] 5.023946 5.648222 0.6242763 0.01167275
##  [5,] 4.989519 5.611009 0.6214900 0.01142308
##  [6,] 4.961090 5.578753 0.6176629 0.01133593
##  [7,] 4.935551 5.550707 0.6151559 0.01153793
##  [8,] 4.902342 5.525309 0.6229665 0.01147364
##  [9,] 4.876035 5.502051 0.6260160 0.01155692
## [10,] 4.850386 5.480666 0.6302799 0.01147605
fviz_gap_stat(gap_stat)

# Gap statistic reaches its first local maximum at k=3 (the 'firstmax' criterion above)

All three methods point to k=3, so we proceed with three clusters.

# nstart is the number of random initial configurations; kmeans keeps the best run. 25 is a commonly recommended value. 
k3 <- kmeans(data_scaled, centers = 3, nstart = 25)

# Visual of clusters 
fviz_cluster(k3, data = data_scaled, ggtheme = theme_light())
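Beyond the plot, the fitted object returned by kmeans() is worth a look; a minimal sketch using its standard components:

k3$size                            # number of wines assigned to each cluster
round(k3$centers, 2)               # cluster centers on the scaled variables
round(k3$betweenss / k3$totss, 3)  # share of total variance captured by the clustering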

# Examining the visual changes in clustering as k changes from 1->15
k_s <- lapply(1:15, function(x) kmeans(data_scaled, centers = x, nstart = 25))

plots_of_k_s <- lapply(k_s, function(x) fviz_cluster(x, geom = "point",  data = data_scaled, ggtheme = theme_light()))

plots_of_k_s
## (Output: 15 cluster plots, one for each value of k from 1 to 15)

# As k approaches 15 the clusters increasingly overlap; for k < 3 the distances to the cluster centers remain large  

Bootstrap evaluation of clusters

Bootstrap resampling allows us to assess whether the clusters we propose would hold up in "other" datasets; this is often referred to as the stability of a cluster. The approach implemented below relies on the Jaccard coefficient of similarity, \(\frac{|A\cap B|}{|A\cup B|}\), computed between each original cluster and its closest match in every bootstrap resample. As a rule of thumb, clusterwise mean Jaccard coefficients below roughly 0.5 mark a "dissolved" (unstable) cluster, values of about 0.6-0.75 indicate a pattern in the data, values above 0.75 indicate a stable cluster, and values above 0.85 indicate a highly stable cluster.
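As a purely illustrative example of the coefficient (hypothetical index sets, not output of this analysis):

a <- c(1, 2, 3, 4)                             # members of a cluster in the original data
b <- c(2, 3, 4, 5)                             # members of the matched cluster in a resample
length(intersect(a, b)) / length(union(a, b))  # Jaccard similarity = 3/5 = 0.6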

bs_cluster <- clusterboot(data_scaled, B = 1e3, bootmethod = "boot", bscompare = TRUE, clustermethod = kmeansCBI, seed = 123, k = 3)
bs_cluster
## * Cluster stability assessment *
## Cluster method:  kmeans 
## Full clustering results are given as parameter result
## of the clusterboot object, which also provides further statistics
## of the resampling results.
## Number of resampling runs:  1000 
## 
## Number of clusters found in data:  3 
## 
##  Clusterwise Jaccard bootstrap (omitting multiple points) mean:
## [1] 0.9751427 0.9671823 0.9505411
## dissolved:
## [1]  5  5 13
## recovered:
## [1] 990 988 986

The Jaccard coefficients (all above 0.95) are indicative of highly stable clusters, which is further supported by the high recovery counts and the very few dissolutions across the 1000 resamples.

Examining the confusion matrix

# Read in data and reformat variable names; this time the true cluster labels are kept 
data_for_conf_mat <- read_csv("data/wine.data", col_names = c(
  "true_clusters", 
  "Alcohol",
  "Malic acid",
  "Ash",
  "Alcalinity of ash",
  "Magnesium",
  "Total phenols",
  "Flavanoids",
  "Nonflavanoid phenols",
  "Proanthocyanins",
  "Color intensity",
  "Hue",
  "OD280/OD315 of diluted wines",
  "Proline")
) %>%
  rename_all(., .funs = tolower) %>%
  select_all(~gsub("\\s+|\\.", "_", .)) %>%
  mutate("clusters_modified" = case_when(
    true_clusters == 1 ~ 2, 
    true_clusters == 2 ~ 3, 
    true_clusters == 3 ~ 1
  ))

# k-means labels clusters arbitrarily, so its numbering differs from the labels in the original dataset. 
# The case_when() above remaps the true labels to match the k-means numbering; the mapping can be checked with the cross-tabulation below. 
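One way to find this correspondence without guessing is to cross-tabulate the k-means assignments against the original labels; a minimal sketch (assumes k3 and data_for_conf_mat as defined above):

# Rows are k-means clusters, columns are original labels; the dominant column per row gives the mapping
table(kmeans = k3$cluster, true = data_for_conf_mat$true_clusters)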

predicted_clusters <- as.factor(k3$cluster)
true_clusters <- as.factor(as.vector(unlist(data_for_conf_mat["clusters_modified"])))

confusionMatrix(data=predicted_clusters, reference = true_clusters)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 48  0  3
##          2  0 59  3
##          3  0  0 65
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9663          
##                  95% CI : (0.9281, 0.9875)
##     No Information Rate : 0.3989          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9491          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   1.0000   0.9155
## Specificity            0.9769   0.9748   1.0000
## Pos Pred Value         0.9412   0.9516   1.0000
## Neg Pred Value         1.0000   1.0000   0.9469
## Prevalence             0.2697   0.3315   0.3989
## Detection Rate         0.2697   0.3315   0.3652
## Detection Prevalence   0.2865   0.3483   0.3652
## Balanced Accuracy      0.9885   0.9874   0.9577

Summary statistics for the k=3 clusters

data %>%
  as.data.frame() %>%
  mutate(cluster = k3$cluster) %>%
  group_by(cluster) %>%
  summarise_all(list(mean = mean))# Change for other values (min = min, max = max, var = var). 
## # A tibble: 3 × 14
##   cluster alcohol_mean malic_a…¹ ash_m…² alcal…³ magne…⁴ total…⁵ flava…⁶ nonfl…⁷
##     <int>        <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1       1         13.1      3.31    2.42    21.2    98.7    1.68   0.819   0.452
## 2       2         13.7      2.00    2.47    17.5   108.     2.85   3.00    0.292
## 3       3         12.3      1.90    2.23    20.1    92.7    2.25   2.05    0.358
## # … with 5 more variables: proanthocyanins_mean <dbl>,
## #   color_intensity_mean <dbl>, hue_mean <dbl>,
## #   `od280/od315_of_diluted_wines_mean` <dbl>, proline_mean <dbl>, and
## #   abbreviated variable names ¹​malic_acid_mean, ²​ash_mean,
## #   ³​alcalinity_of_ash_mean, ⁴​magnesium_mean, ⁵​total_phenols_mean,
## #   ⁶​flavanoids_mean, ⁷​nonflavanoid_phenols_mean
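summarise_all() is superseded in current dplyr; an equivalent sketch using summarise() + across() (assumes the same data and k3 objects) that also adds a second statistic:

data %>%
  mutate(cluster = k3$cluster) %>%
  group_by(cluster) %>%
  summarise(across(everything(), list(mean = mean, sd = sd)))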