This data is from a paper by Forina, Armanino, Lanteri, Tiscornia (1983) Classification of Olive Oils from their Fatty Acid Composition, in Martens and Russwurm (ed) Food Research and Data Anlysis. We thank Prof. Michele Forina, University of Genova, Italy for making this dataset available.

https://rdrr.io/cran/cepp/man/data-olive.html

  • region Three super-classes of Italy: North, South and the island of Sardinia
  • area Nine collection areas: three from North, four from South and 2 from Sardinia

1 Importing Packages

library(cepp)
library(plyr)
library(caret)
library(tidyselect)
library(forcats)
library(janitor)
library(gdata)
library(tools)
library(mclust)
library(ggplot2)
library(dplyr)
library(tidyr)
library(magrittr)
library(purrr)
library(tibble)
library(readr)
library(stringr)

2 Importing Data

data("olive")

write.csv(olive, 
          file = "data/Olive Oil Data - Forina, Armanino, Lanteri, Tiscornia (1983).csv", 
          na = "", row.names = FALSE)

3 Data Pre-processing

olive_df <- rename.vars(
  data = olive, 
  from = colnames(olive), 
  to = tools::toTitleCase(colnames(olive)), 
  info = FALSE)

# Create vector for targets and features
target <- colnames(olive_df)[1:2]
features <- setdiff(colnames(olive_df), target)

# Rename values of Regions and Areas
olive_df$Region <- plyr::revalue(x = as.character(olive_df$Region), 
                                 replace = c("1" = "South", 
                                             "2" = "Sardinia", 
                                             "3" = "North"))

olive_df$Area <- plyr::revalue(x = as.character(str_trim(olive_df$Area)), 
                               replace = c(
                                 "North-Apulia" = "Apulia (North)",
                                 "South-Apulia" = "Apulia (South)",
                                 "Inland-Sardinia" = "Sardinia (Inland)",
                                 "Coast-Sardinia" = "Sardinia (Coast)",
                                 "East-Liguria" = "Liguria (East)",
                                 "West-Liguria" = "Liguria (West)"
                                 )
                               )

# Convert features into %
olive_df[, features] <- olive_df[, features]/100

Some observations do not add up to 100%

hist(
  rowSums(olive_df[, features]), 
  main = "Histogram of Total of Fatty Acid Content", 
  xlab = "Total (%)")

Adjust fatty acid content to add to 100% for all samples

adjust_to_100 <- function(x) { (x/sum(x))*100 }

olive_df_adjusted <- olive_df

for (i in 1:nrow(olive_df_adjusted)) {
  
  olive_df_adjusted[i, features] <- adjust_to_100(olive_df[i, features])
  
}

4 Exploratory Data Analysis

means_area <- olive_df_adjusted %>% 
  dplyr::group_by(Area) %>% 
  gather(key = "fatty_acid", value = "percentage", -Region, -Area) %>% 
  dplyr::group_by(Region, Area, fatty_acid) %>% 
  dplyr::summarise(Mean = mean(percentage, na.rm = TRUE), .groups = "keep") %>% 
  spread(key = fatty_acid, value = Mean)

# Print table of mean values
means_area %>% knitr::kable(digits = 2, align = "c")
Region Area Arachidic Eicosenoic Linoleic Linolenic Oleic Palmitic Palmitoleic Stearic
North Liguria (East) 0.64 0.02 6.90 0.26 77.47 11.46 0.84 2.41
North Liguria (West) 0.07 0.02 8.97 0.05 76.72 10.53 1.08 2.57
North Umbria 0.43 0.02 5.99 0.34 79.78 10.89 0.60 1.95
Sardinia Sardinia (Coast) 0.72 0.02 13.37 0.24 70.83 11.38 1.01 2.44
Sardinia Sardinia (Inland) 0.74 0.02 11.25 0.29 73.61 10.98 0.95 2.17
South Apulia (North) 0.72 0.35 7.06 0.43 78.21 10.27 0.62 2.35
South Apulia (South) 0.60 0.24 11.68 0.35 69.20 13.98 1.84 2.11
South Calabria 0.64 0.28 8.23 0.46 73.44 13.09 1.22 2.64
South Sicily 0.76 0.39 8.39 0.43 73.90 12.34 1.05 2.75
means_fatty_acid <- apply(means_area[,features], MARGIN = 2, mean)
std_dev_fatty_acid <- apply(means_area[,features], MARGIN = 2, sd) 
  
means_area_scaled <- means_area

means_area_scaled[, features] <-
  scale(means_area_scaled[, features], 
        center = means_fatty_acid, 
        scale = std_dev_fatty_acid)

(g <- means_area_scaled %>% 
  gather(key = "fatty_acid", value = "percentage", -Area, -Region) %>% 
  dplyr::arrange(Region, Area) %>% 
  ggplot(aes(x=fatty_acid,  y=Area, fill=percentage)) +
    geom_tile() +
    labs(title = "Z-scores of mean fatty acid composition of olive samples by area",
         x = "Fatty Acid",
         y = "Area",
         fill = "z-score") +
    scale_fill_viridis_c() +
    theme_bw())

(g <- olive_df_adjusted %>% 
  dplyr::select(-Area) %>% 
  gather(key = "Fatty Acid", value = "Percentage", -Region) %>% 
  ggplot(mapping = aes(x = Percentage, fill = `Fatty Acid`)) +
    geom_histogram(bins = 30, alpha = 0.7) +
    facet_grid(Region~`Fatty Acid`, scales = "free_x") +
    scale_fill_brewer(palette = "Set1") +
    labs(title = "Distribution of fatty acids for each region",
         y = "Count",
         x = "%") +
    theme_bw() + theme(legend.position = "none"))


5 Clustering

5.1 K-Means

Perform iterative k-means clustering for various k values and compute the within-cluster sums of squares for each value of k.

Each k value is tested with 20 random starts. The Hartigan-Wong algorithm (Hartigan, J. A. and Wong, M. A. (1979)) will be used.

olive_df_scaled <- olive_df_adjusted
olive_df_scaled[, features] <- scale(olive_df_scaled[, features], center = TRUE, scale = TRUE)

wss <- NULL # vector for within sum of squares
bss <- NULL # vector for between sum of squares
set.seed(1) # set seed for reproducibility

for (i in 1:20) {
  
  kmeans_output <-
    kmeans(
      x = olive_df_scaled[, features],
      centers = i,
      nstart = 20,
      iter.max = 30,
      algorithm = "Hartigan-Wong"
    )
  wss[i] <- kmeans_output$tot.withinss
  bss[i] <- kmeans_output$betweenss
  
}

kmeans_results <- data.frame(k = 1:20,
                             WSS = wss,
                             BSS = bss)

kmeans_results %>% 
  gather(key = "Parameter", value = "Value", -k) %>% 
  ggplot(aes(x = k, y = Value, col = Parameter)) +
    geom_line() +
    geom_point() +
    scale_x_continuous(breaks = 1:20, minor_breaks = NULL) +
    scale_color_brewer(type = "qual", palette = "Set1") +
    labs(title = "Total Within Sum of Square and Total Between Sum of Square\nfor each k value tested") +
    theme_light()

5.1.1 Selecting k

We will select a k of 5 for further k-means clustering on this dataset.

set.seed(1)
kmeans_output <-
  kmeans(
    x = olive_df_scaled[, features],
    centers = 5,
    nstart = 20,
    iter.max = 30,
    algorithm = "Hartigan-Wong"
  )

olive_df_scaled$kmeans_clusters <- kmeans_output$cluster
olive_df_adjusted$kmeans_clusters <- kmeans_output$cluster

olive_df_scaled <- olive_df_scaled %>% 
  dplyr::mutate(kmeans_clusters = kmeans_output$cluster) %>% 
  dplyr::select(Region, Area, kmeans_clusters, everything())

5.1.2 Confusion Matrix

confusion_matrix_region <- olive_df_scaled %>% 
   dplyr::select(Region, Area, kmeans_clusters) %>% 
   dplyr::mutate(idx = 1) %>% 
   dplyr::group_by(Region, Area, kmeans_clusters) %>%
   dplyr::summarise(Frequency = sum(idx), .groups = "keep") %>% 
   spread(key = "kmeans_clusters", value = "Frequency", fill = 0) %>% 
   dplyr::arrange(Region, Area)
table(olive_df_scaled %>% dplyr::select(Region, kmeans_clusters)) %>% 
  data.frame() %>% 
  ggplot(aes(
    x = kmeans_clusters,
    y = fct_reorder(Region, desc(Region)),
    fill = Freq,
    label = Freq
  )) + 
  geom_tile() +
  geom_text() +
  scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
  labs(
    x = "K-means Clusters", 
    y = "Area", 
    title = "Confusion matrix of k-means clusters and region of olive oil samples") +
  theme_light()

(g <- confusion_matrix_region %>% 
  gather(key = "kmeans_clusters", value = "Frequency", -Area, -Region) %>% 
  dplyr::mutate(kmeans_clusters = as.numeric(kmeans_clusters)) %>% 
   ggplot(aes(
     x = kmeans_clusters,
     y = fct_reorder(Area, dplyr::desc(Region)),
     fill = Frequency,
     label = Frequency
   )) + 
  geom_tile() +
  geom_text() +
  scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
  labs(
    x = "K-means Clusters", 
    y = "Region", 
    title = "Confusion matrix of k-means clusters and area of olive oil samples") +
  theme_light())


5.2 Gaussian Mixture Models

Gaussian mixture models (GMM) are a form of model-based clustering that provide soft assignment to each observation by providing a probability of belonging to each cluster.

A benefit of Gaussian mixture models over k-means is that the former automatically identifies the optimal number of clusters. In addition, GMMs allow for more clustering a wider variety of cluster shapes and orientations.

It’s important to highlight that GMMs assume multivariate normality of data.

The mclust package will be used to apply GMM models to the data. The BIC is used to identify the optimize the covariance parameters and to identify the optimal number of clusters.

library(mclust)

gmm <- mclust::Mclust(data = olive_df_adjusted[, features], G = 2:10, verbose = FALSE)

summary(gmm)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVI (diagonal, varying volume and shape) model with 10 components: 
## 
##  log-likelihood   n  df       BIC       ICL
##       -323.9459 572 169 -1720.896 -1752.464
## 
## Clustering table:
##   1   2   3   4   5   6   7   8   9  10 
##  37  75 138  46  27  55  65  33  43  53

The final model has 10 clusters and a BIC of -1720.8963665.

olive_df_adjusted$gmm_clusters <- gmm$classification
olive_df_adjusted$gmm_uncertainty <- gmm$uncertainty

gmm_probability <- data.frame(gmm$z)
colnames(gmm_probability) <- paste0("GMM Cluster ", str_pad(1:gmm$G, width = 2, side = "left", pad = "0"))
olive_df_adjusted <- cbind(olive_df_adjusted, gmm_probability)

The optimal models uses the VVI covariance parameters (diagonal, varying volume and shape).

gmm_BIC <- matrix(gmm$BIC, nrow = dim(gmm$BIC)[1], ncol = dim(gmm$BIC)[2]) %>% 
  data.frame() %>% 
  add_column(K = 2:10, .before=1)

colnames(gmm_BIC) <- c("K", colnames(gmm$BIC))

gmm_BIC %>% 
  gather(key = "Model", value = "BIC", -K) %>% 
  filter(!is.na(BIC)) %>% 
  ggplot(aes(x = K, y = BIC, col = Model)) +
    geom_line(size=0.75) +
    geom_point() +
    scale_color_brewer(type = "qual", palette = "Set1", direction = -1) +
    scale_x_continuous(breaks = 1:10, minor_breaks = NULL) +
    labs(title = "Gaussian Mixture Models",
         subtitle = "BIC scores for components ranging from 1 to 10 and various covariance models",
         x = "Number of Components") +
    theme_light()

5.2.1 Probability and Uncertainty

olive_df_adjusted %>% 
  dplyr::select(Region, Area, contains("GMM Cluster")) %>% 
  mutate(ID = row_number()) %>% 
  gather(Cluster, probability, -ID, -Region, -Area) %>% 
  dplyr::arrange(Region, Area) %>% 
  ggplot(aes(x = probability, fill = Area)) +
    geom_histogram(bins=20) +
    facet_wrap(~ Cluster, nrow = 2) +
    labs(x = "Probability", y = "Count",
         title = "Distribution of probabilities for all observations aligning to each cluster") +
    scale_fill_brewer(type = "qual", palette = "Set1") + 
    theme_light()

The plot below shows that most samples with high uncertainty (>0.20) are from the South and are found mostly on cluster 3, 4 and 5.

olive_df_adjusted %>% 
  dplyr::select(Region, Area, Uncertainty = gmm_uncertainty, Cluster = gmm_clusters) %>% 
  mutate(ID = row_number()) %>% 
  filter(Uncertainty > 0.20) %>% 
  ggplot(aes(x = Uncertainty, reorder(ID, Uncertainty), col = factor(Cluster), shape = Region)) +
    geom_point(size = 3) +
    labs(title = "Observations with an uncertainty of membership higher than 0.20", y = "ID", col = "Cluster") +
    scale_color_brewer(type = "qual", palette = "Set1", direction = -1) +
    theme_bw() 

5.2.2 Confusion Matrix

confusion_matrix_region <- olive_df_adjusted %>% 
   dplyr::select(Region, Area, `GMM Clusters` =  gmm_clusters) %>% 
   dplyr::mutate(idx = 1) %>% 
   dplyr::group_by(Region, Area, `GMM Clusters`) %>%
   dplyr::summarise(Frequency = sum(idx), .groups = "keep") %>% 
   spread(key = "GMM Clusters", value = "Frequency", fill = 0) %>% 
   dplyr::arrange(Region, Area)
table(olive_df_adjusted %>% dplyr::select(Region, gmm_clusters)) %>% 
  data.frame() %>% 
  ggplot(aes(x = gmm_clusters, y = fct_reorder(Region, desc(Region)), fill = Freq, label = Freq)) + 
  geom_tile() +
  geom_text() +
  scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
  labs(
    x = "GMM Clusters", 
    y = "Region", 
    title = "Confusion matrix of Gaussian Mixture Model clusters\nand region of olive oil samples") +
  theme_light()

(g <- confusion_matrix_region %>% 
  gather(key = "GMM Clusters", value = "Frequency", -Area, -Region) %>% 
  dplyr::mutate(`GMM Clusters` = factor(`GMM Clusters`)) %>% 
  ggplot(aes(x = `GMM Clusters`, y = fct_reorder(Area, dplyr::desc(Region)), fill = Frequency, label = Frequency)) + 
  geom_tile() +
  geom_text() +
  scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
  labs(
    x = "GMM Clusters", 
    y = "Area", 
    title = "Confusion matrix of Gaussian Mixture Model clusters\nand area of olive oil samples") +
  theme_light())


5.3 HDBSCAN

library(dbscan)

(hdbscan <- dbscan::hdbscan(olive_df_scaled[, features], minPts = 10))
## HDBSCAN clustering for 572 objects.
## Parameters: minPts = 10
## The clustering contains 6 cluster(s) and 133 noise points.
## 
##   0   1   2   3   4   5   6 
## 133  29  42  14  97 245  12 
## 
## Available fields: cluster, minPts, coredist, cluster_scores,
##                   membership_prob, outlier_scores, hc
olive_df_adjusted$hdbscan_clusters <- hdbscan$cluster
olive_df_adjusted$hdbscan_membership_prob <- hdbscan$membership_prob
olive_df_adjusted$hdbscan_outlier_scores <- hdbscan$outlier_scores

hdbscan %>% plot(show_flat =TRUE)

5.3.1 Probability and Uncertainty

olive_df_adjusted %>% 
  dplyr::select(Region, Area, Cluster = hdbscan_clusters, Probability = hdbscan_membership_prob) %>% 
  mutate(ID = row_number(),
         Cluster = factor(Cluster, labels = c("Outlier", 1:max(hdbscan$cluster)))) %>% 
  # gather(Cluster, probability, -ID, -Region, -Area) %>% 
  dplyr::arrange(Region, Area) %>% 
  ggplot(aes(x = Probability, fill = Cluster)) +
    geom_histogram(bins=20) +
    facet_wrap(~ Cluster, nrow = 2, scales = "free_y") +
    labs(x = "Probability", y = "Count",
         title = "Distribution of probabilities for all observations aligning to each cluster") +
    scale_fill_brewer(type = "qual", palette = "Set1") +
    theme_light()

olive_df_adjusted %>% 
  dplyr::select(Region, Area, Outlier_Score = hdbscan_outlier_scores, Cluster = hdbscan_clusters) %>% 
  mutate(ID = row_number()) %>% 
  filter(Outlier_Score > 0.50,
         Cluster == 0) %>% 
  ggplot(aes(x = Outlier_Score, reorder(ID, Outlier_Score), col = factor(Region), shape = Region)) +
    geom_point(size = 1.5) +
    labs(title = "Observations with an outlier score higher than 0.50", y = "", col = "Region") +
    scale_y_discrete(labels = NULL, breaks = NULL) +
    scale_color_brewer(type = "qual", palette = "Set1", direction = -1) +
    theme_bw()

5.3.2 Confusion Matrix

confusion_matrix_region <- olive_df_adjusted %>% 
   dplyr::select(Region, Area, `HDBSCAN Clusters` =  hdbscan_clusters) %>% 
   dplyr::mutate(idx = 1) %>% 
   dplyr::group_by(Region, Area, `HDBSCAN Clusters`) %>%
   dplyr::summarise(Frequency = sum(idx), .groups = "keep") %>% 
   spread(key = "HDBSCAN Clusters", value = "Frequency", fill = 0) %>% 
   dplyr::arrange(Region, Area)
table(olive_df_adjusted %>% dplyr::select(Region, hdbscan_clusters)) %>% 
  data.frame() %>% 
  ggplot(aes(x = hdbscan_clusters, y = fct_reorder(Region, desc(Region)), fill = Freq, label = Freq)) + 
  geom_tile() +
  geom_text() +
  scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
  labs(
    x = "HDBSCAN Clusters", 
    y = "Region", 
    title = "Confusion matrix of HDBSCAN clusters\nand region of olive oil samples") +
  theme_light()

(g <- confusion_matrix_region %>% 
  gather(key = "HDBSCAN Clusters", value = "Frequency", -Area, -Region) %>% 
  dplyr::mutate(`HDBSCAN Clusters` = factor(`HDBSCAN Clusters`)) %>% 
  ggplot(aes(x = `HDBSCAN Clusters`, y = fct_reorder(Area, dplyr::desc(Region)), fill = Frequency, label = Frequency)) + 
  geom_tile() +
  geom_text() +
  scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
  labs(
    x = "HDBSCAN Clusters", 
    y = "Region", 
    title = "Confusion matrix of HDBSCAN clusters\nand area of olive oil samples") +
  theme_light())