This data is from a paper by Forina, Armanino, Lanteri, Tiscornia (1983) Classification of Olive Oils from their Fatty Acid Composition, in Martens and Russwurm (ed) Food Research and Data Anlysis. We thank Prof. Michele Forina, University of Genova, Italy for making this dataset available.
https://rdrr.io/cran/cepp/man/data-olive.html
region Three super-classes of Italy: North, South and
the island of Sardiniaarea Nine collection areas: three from North, four from
South and 2 from Sardinialibrary(cepp)
library(plyr)
library(caret)
library(tidyselect)
library(forcats)
library(janitor)
library(gdata)
library(tools)
library(mclust)
library(ggplot2)
library(dplyr)
library(tidyr)
library(magrittr)
library(purrr)
library(tibble)
library(readr)
library(stringr)data("olive")
write.csv(olive,
file = "data/Olive Oil Data - Forina, Armanino, Lanteri, Tiscornia (1983).csv",
na = "", row.names = FALSE)olive_df <- rename.vars(
data = olive,
from = colnames(olive),
to = tools::toTitleCase(colnames(olive)),
info = FALSE)
# Create vector for targets and features
target <- colnames(olive_df)[1:2]
features <- setdiff(colnames(olive_df), target)
# Rename values of Regions and Areas
olive_df$Region <- plyr::revalue(x = as.character(olive_df$Region),
replace = c("1" = "South",
"2" = "Sardinia",
"3" = "North"))
olive_df$Area <- plyr::revalue(x = as.character(str_trim(olive_df$Area)),
replace = c(
"North-Apulia" = "Apulia (North)",
"South-Apulia" = "Apulia (South)",
"Inland-Sardinia" = "Sardinia (Inland)",
"Coast-Sardinia" = "Sardinia (Coast)",
"East-Liguria" = "Liguria (East)",
"West-Liguria" = "Liguria (West)"
)
)
# Convert features into %
olive_df[, features] <- olive_df[, features]/100Some observations do not add up to 100%
hist(
rowSums(olive_df[, features]),
main = "Histogram of Total of Fatty Acid Content",
xlab = "Total (%)")Adjust fatty acid content to add to 100% for all samples
adjust_to_100 <- function(x) { (x/sum(x))*100 }
olive_df_adjusted <- olive_df
for (i in 1:nrow(olive_df_adjusted)) {
olive_df_adjusted[i, features] <- adjust_to_100(olive_df[i, features])
}means_area <- olive_df_adjusted %>%
dplyr::group_by(Area) %>%
gather(key = "fatty_acid", value = "percentage", -Region, -Area) %>%
dplyr::group_by(Region, Area, fatty_acid) %>%
dplyr::summarise(Mean = mean(percentage, na.rm = TRUE), .groups = "keep") %>%
spread(key = fatty_acid, value = Mean)
# Print table of mean values
means_area %>% knitr::kable(digits = 2, align = "c")| Region | Area | Arachidic | Eicosenoic | Linoleic | Linolenic | Oleic | Palmitic | Palmitoleic | Stearic |
|---|---|---|---|---|---|---|---|---|---|
| North | Liguria (East) | 0.64 | 0.02 | 6.90 | 0.26 | 77.47 | 11.46 | 0.84 | 2.41 |
| North | Liguria (West) | 0.07 | 0.02 | 8.97 | 0.05 | 76.72 | 10.53 | 1.08 | 2.57 |
| North | Umbria | 0.43 | 0.02 | 5.99 | 0.34 | 79.78 | 10.89 | 0.60 | 1.95 |
| Sardinia | Sardinia (Coast) | 0.72 | 0.02 | 13.37 | 0.24 | 70.83 | 11.38 | 1.01 | 2.44 |
| Sardinia | Sardinia (Inland) | 0.74 | 0.02 | 11.25 | 0.29 | 73.61 | 10.98 | 0.95 | 2.17 |
| South | Apulia (North) | 0.72 | 0.35 | 7.06 | 0.43 | 78.21 | 10.27 | 0.62 | 2.35 |
| South | Apulia (South) | 0.60 | 0.24 | 11.68 | 0.35 | 69.20 | 13.98 | 1.84 | 2.11 |
| South | Calabria | 0.64 | 0.28 | 8.23 | 0.46 | 73.44 | 13.09 | 1.22 | 2.64 |
| South | Sicily | 0.76 | 0.39 | 8.39 | 0.43 | 73.90 | 12.34 | 1.05 | 2.75 |
means_fatty_acid <- apply(means_area[,features], MARGIN = 2, mean)
std_dev_fatty_acid <- apply(means_area[,features], MARGIN = 2, sd)
means_area_scaled <- means_area
means_area_scaled[, features] <-
scale(means_area_scaled[, features],
center = means_fatty_acid,
scale = std_dev_fatty_acid)
(g <- means_area_scaled %>%
gather(key = "fatty_acid", value = "percentage", -Area, -Region) %>%
dplyr::arrange(Region, Area) %>%
ggplot(aes(x=fatty_acid, y=Area, fill=percentage)) +
geom_tile() +
labs(title = "Z-scores of mean fatty acid composition of olive samples by area",
x = "Fatty Acid",
y = "Area",
fill = "z-score") +
scale_fill_viridis_c() +
theme_bw())(g <- olive_df_adjusted %>%
dplyr::select(-Area) %>%
gather(key = "Fatty Acid", value = "Percentage", -Region) %>%
ggplot(mapping = aes(x = Percentage, fill = `Fatty Acid`)) +
geom_histogram(bins = 30, alpha = 0.7) +
facet_grid(Region~`Fatty Acid`, scales = "free_x") +
scale_fill_brewer(palette = "Set1") +
labs(title = "Distribution of fatty acids for each region",
y = "Count",
x = "%") +
theme_bw() + theme(legend.position = "none"))Perform iterative k-means clustering for various k values and compute the within-cluster sums of squares for each value of k.
Each k value is tested with 20 random starts. The Hartigan-Wong algorithm (Hartigan, J. A. and Wong, M. A. (1979)) will be used.
olive_df_scaled <- olive_df_adjusted
olive_df_scaled[, features] <- scale(olive_df_scaled[, features], center = TRUE, scale = TRUE)
wss <- NULL # vector for within sum of squares
bss <- NULL # vector for between sum of squares
set.seed(1) # set seed for reproducibility
for (i in 1:20) {
kmeans_output <-
kmeans(
x = olive_df_scaled[, features],
centers = i,
nstart = 20,
iter.max = 30,
algorithm = "Hartigan-Wong"
)
wss[i] <- kmeans_output$tot.withinss
bss[i] <- kmeans_output$betweenss
}
kmeans_results <- data.frame(k = 1:20,
WSS = wss,
BSS = bss)
kmeans_results %>%
gather(key = "Parameter", value = "Value", -k) %>%
ggplot(aes(x = k, y = Value, col = Parameter)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:20, minor_breaks = NULL) +
scale_color_brewer(type = "qual", palette = "Set1") +
labs(title = "Total Within Sum of Square and Total Between Sum of Square\nfor each k value tested") +
theme_light()We will select a k of 5 for further k-means clustering on this dataset.
set.seed(1)
kmeans_output <-
kmeans(
x = olive_df_scaled[, features],
centers = 5,
nstart = 20,
iter.max = 30,
algorithm = "Hartigan-Wong"
)
olive_df_scaled$kmeans_clusters <- kmeans_output$cluster
olive_df_adjusted$kmeans_clusters <- kmeans_output$cluster
olive_df_scaled <- olive_df_scaled %>%
dplyr::mutate(kmeans_clusters = kmeans_output$cluster) %>%
dplyr::select(Region, Area, kmeans_clusters, everything())confusion_matrix_region <- olive_df_scaled %>%
dplyr::select(Region, Area, kmeans_clusters) %>%
dplyr::mutate(idx = 1) %>%
dplyr::group_by(Region, Area, kmeans_clusters) %>%
dplyr::summarise(Frequency = sum(idx), .groups = "keep") %>%
spread(key = "kmeans_clusters", value = "Frequency", fill = 0) %>%
dplyr::arrange(Region, Area)table(olive_df_scaled %>% dplyr::select(Region, kmeans_clusters)) %>%
data.frame() %>%
ggplot(aes(
x = kmeans_clusters,
y = fct_reorder(Region, desc(Region)),
fill = Freq,
label = Freq
)) +
geom_tile() +
geom_text() +
scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
labs(
x = "K-means Clusters",
y = "Area",
title = "Confusion matrix of k-means clusters and region of olive oil samples") +
theme_light()(g <- confusion_matrix_region %>%
gather(key = "kmeans_clusters", value = "Frequency", -Area, -Region) %>%
dplyr::mutate(kmeans_clusters = as.numeric(kmeans_clusters)) %>%
ggplot(aes(
x = kmeans_clusters,
y = fct_reorder(Area, dplyr::desc(Region)),
fill = Frequency,
label = Frequency
)) +
geom_tile() +
geom_text() +
scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
labs(
x = "K-means Clusters",
y = "Region",
title = "Confusion matrix of k-means clusters and area of olive oil samples") +
theme_light())Gaussian mixture models (GMM) are a form of model-based clustering that provide soft assignment to each observation by providing a probability of belonging to each cluster.
A benefit of Gaussian mixture models over k-means is that the former automatically identifies the optimal number of clusters. In addition, GMMs allow for more clustering a wider variety of cluster shapes and orientations.
It’s important to highlight that GMMs assume multivariate normality of data.
The mclust package will be used to apply GMM models to
the data. The BIC is used to identify the optimize the covariance
parameters and to identify the optimal number of clusters.
library(mclust)
gmm <- mclust::Mclust(data = olive_df_adjusted[, features], G = 2:10, verbose = FALSE)
summary(gmm)## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVI (diagonal, varying volume and shape) model with 10 components:
##
## log-likelihood n df BIC ICL
## -323.9459 572 169 -1720.896 -1752.464
##
## Clustering table:
## 1 2 3 4 5 6 7 8 9 10
## 37 75 138 46 27 55 65 33 43 53
The final model has 10 clusters and a BIC of -1720.8963665.
olive_df_adjusted$gmm_clusters <- gmm$classification
olive_df_adjusted$gmm_uncertainty <- gmm$uncertainty
gmm_probability <- data.frame(gmm$z)
colnames(gmm_probability) <- paste0("GMM Cluster ", str_pad(1:gmm$G, width = 2, side = "left", pad = "0"))
olive_df_adjusted <- cbind(olive_df_adjusted, gmm_probability)The optimal models uses the VVI covariance parameters (diagonal, varying volume and shape).
gmm_BIC <- matrix(gmm$BIC, nrow = dim(gmm$BIC)[1], ncol = dim(gmm$BIC)[2]) %>%
data.frame() %>%
add_column(K = 2:10, .before=1)
colnames(gmm_BIC) <- c("K", colnames(gmm$BIC))
gmm_BIC %>%
gather(key = "Model", value = "BIC", -K) %>%
filter(!is.na(BIC)) %>%
ggplot(aes(x = K, y = BIC, col = Model)) +
geom_line(size=0.75) +
geom_point() +
scale_color_brewer(type = "qual", palette = "Set1", direction = -1) +
scale_x_continuous(breaks = 1:10, minor_breaks = NULL) +
labs(title = "Gaussian Mixture Models",
subtitle = "BIC scores for components ranging from 1 to 10 and various covariance models",
x = "Number of Components") +
theme_light()olive_df_adjusted %>%
dplyr::select(Region, Area, contains("GMM Cluster")) %>%
mutate(ID = row_number()) %>%
gather(Cluster, probability, -ID, -Region, -Area) %>%
dplyr::arrange(Region, Area) %>%
ggplot(aes(x = probability, fill = Area)) +
geom_histogram(bins=20) +
facet_wrap(~ Cluster, nrow = 2) +
labs(x = "Probability", y = "Count",
title = "Distribution of probabilities for all observations aligning to each cluster") +
scale_fill_brewer(type = "qual", palette = "Set1") +
theme_light()The plot below shows that most samples with high uncertainty (>0.20) are from the South and are found mostly on cluster 3, 4 and 5.
olive_df_adjusted %>%
dplyr::select(Region, Area, Uncertainty = gmm_uncertainty, Cluster = gmm_clusters) %>%
mutate(ID = row_number()) %>%
filter(Uncertainty > 0.20) %>%
ggplot(aes(x = Uncertainty, reorder(ID, Uncertainty), col = factor(Cluster), shape = Region)) +
geom_point(size = 3) +
labs(title = "Observations with an uncertainty of membership higher than 0.20", y = "ID", col = "Cluster") +
scale_color_brewer(type = "qual", palette = "Set1", direction = -1) +
theme_bw() confusion_matrix_region <- olive_df_adjusted %>%
dplyr::select(Region, Area, `GMM Clusters` = gmm_clusters) %>%
dplyr::mutate(idx = 1) %>%
dplyr::group_by(Region, Area, `GMM Clusters`) %>%
dplyr::summarise(Frequency = sum(idx), .groups = "keep") %>%
spread(key = "GMM Clusters", value = "Frequency", fill = 0) %>%
dplyr::arrange(Region, Area)table(olive_df_adjusted %>% dplyr::select(Region, gmm_clusters)) %>%
data.frame() %>%
ggplot(aes(x = gmm_clusters, y = fct_reorder(Region, desc(Region)), fill = Freq, label = Freq)) +
geom_tile() +
geom_text() +
scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
labs(
x = "GMM Clusters",
y = "Region",
title = "Confusion matrix of Gaussian Mixture Model clusters\nand region of olive oil samples") +
theme_light()(g <- confusion_matrix_region %>%
gather(key = "GMM Clusters", value = "Frequency", -Area, -Region) %>%
dplyr::mutate(`GMM Clusters` = factor(`GMM Clusters`)) %>%
ggplot(aes(x = `GMM Clusters`, y = fct_reorder(Area, dplyr::desc(Region)), fill = Frequency, label = Frequency)) +
geom_tile() +
geom_text() +
scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
labs(
x = "GMM Clusters",
y = "Area",
title = "Confusion matrix of Gaussian Mixture Model clusters\nand area of olive oil samples") +
theme_light())library(dbscan)
(hdbscan <- dbscan::hdbscan(olive_df_scaled[, features], minPts = 10))## HDBSCAN clustering for 572 objects.
## Parameters: minPts = 10
## The clustering contains 6 cluster(s) and 133 noise points.
##
## 0 1 2 3 4 5 6
## 133 29 42 14 97 245 12
##
## Available fields: cluster, minPts, coredist, cluster_scores,
## membership_prob, outlier_scores, hc
olive_df_adjusted$hdbscan_clusters <- hdbscan$cluster
olive_df_adjusted$hdbscan_membership_prob <- hdbscan$membership_prob
olive_df_adjusted$hdbscan_outlier_scores <- hdbscan$outlier_scores
hdbscan %>% plot(show_flat =TRUE)olive_df_adjusted %>%
dplyr::select(Region, Area, Cluster = hdbscan_clusters, Probability = hdbscan_membership_prob) %>%
mutate(ID = row_number(),
Cluster = factor(Cluster, labels = c("Outlier", 1:max(hdbscan$cluster)))) %>%
# gather(Cluster, probability, -ID, -Region, -Area) %>%
dplyr::arrange(Region, Area) %>%
ggplot(aes(x = Probability, fill = Cluster)) +
geom_histogram(bins=20) +
facet_wrap(~ Cluster, nrow = 2, scales = "free_y") +
labs(x = "Probability", y = "Count",
title = "Distribution of probabilities for all observations aligning to each cluster") +
scale_fill_brewer(type = "qual", palette = "Set1") +
theme_light()olive_df_adjusted %>%
dplyr::select(Region, Area, Outlier_Score = hdbscan_outlier_scores, Cluster = hdbscan_clusters) %>%
mutate(ID = row_number()) %>%
filter(Outlier_Score > 0.50,
Cluster == 0) %>%
ggplot(aes(x = Outlier_Score, reorder(ID, Outlier_Score), col = factor(Region), shape = Region)) +
geom_point(size = 1.5) +
labs(title = "Observations with an outlier score higher than 0.50", y = "", col = "Region") +
scale_y_discrete(labels = NULL, breaks = NULL) +
scale_color_brewer(type = "qual", palette = "Set1", direction = -1) +
theme_bw()confusion_matrix_region <- olive_df_adjusted %>%
dplyr::select(Region, Area, `HDBSCAN Clusters` = hdbscan_clusters) %>%
dplyr::mutate(idx = 1) %>%
dplyr::group_by(Region, Area, `HDBSCAN Clusters`) %>%
dplyr::summarise(Frequency = sum(idx), .groups = "keep") %>%
spread(key = "HDBSCAN Clusters", value = "Frequency", fill = 0) %>%
dplyr::arrange(Region, Area)table(olive_df_adjusted %>% dplyr::select(Region, hdbscan_clusters)) %>%
data.frame() %>%
ggplot(aes(x = hdbscan_clusters, y = fct_reorder(Region, desc(Region)), fill = Freq, label = Freq)) +
geom_tile() +
geom_text() +
scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
labs(
x = "HDBSCAN Clusters",
y = "Region",
title = "Confusion matrix of HDBSCAN clusters\nand region of olive oil samples") +
theme_light()(g <- confusion_matrix_region %>%
gather(key = "HDBSCAN Clusters", value = "Frequency", -Area, -Region) %>%
dplyr::mutate(`HDBSCAN Clusters` = factor(`HDBSCAN Clusters`)) %>%
ggplot(aes(x = `HDBSCAN Clusters`, y = fct_reorder(Area, dplyr::desc(Region)), fill = Frequency, label = Frequency)) +
geom_tile() +
geom_text() +
scale_fill_gradient(low = "#FFFFFF", high = "#132B43", guide="none") +
labs(
x = "HDBSCAN Clusters",
y = "Region",
title = "Confusion matrix of HDBSCAN clusters\nand area of olive oil samples") +
theme_light())