library(factoextra)
#library(dendextend)
#library(broom)
#library(kableExtra)
library(dbscan)
library(cluster)
library(viridis)
library(tidyverse)

Application to a random sample

set.seed(1234)
sample_df <- tibble(x = runif(100), y = runif(100))
ggplot(sample_df, aes(x,y)) +
  geom_point(size = 3) +
  theme_bw()

### Clustering by kmeans, hclust, dbscan

kmeans.out <- kmeans(sample_df, centers = 4, nstart = 25)
hclust.out <- hclust(dist(sample_df), method = "ward.D2")
dbscan.out <- dbscan(sample_df, eps = 0.13, minPts = 7)

sample_df <- sample_df %>%
  mutate(
    kmeans.cluster = kmeans.out$cluster,
    hclust.cluster = cutree(hclust.out, k = 4),
    dbscan.cluster = dbscan.out$cluster
  )

Calculation of Silhouette coefficients for each clustering method

sil.kmeans <- silhouette(sample_df$kmeans.cluster, dist(select(sample_df,x,y)))
summary(sil.kmeans)
## Silhouette of 100 units in 4 clusters from silhouette.default(x = sample_df$kmeans.cluster, dist = dist(select(sample_df,  from     x, y))) :
##  Cluster sizes and average silhouette widths:
##        29        24        17        30 
## 0.3071210 0.4961924 0.4148559 0.4725747 
## Individual silhouette widths:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.05837  0.31711  0.44914  0.42045  0.57635  0.68536
gg_kmeans <- ggplot(sample_df, aes(x,y, color = sil.kmeans[,3], shape = factor(kmeans.cluster))) +
  geom_point(size=3) +
  theme_bw() +
  scale_colour_viridis(name = "sil.width")+
  scale_shape_discrete(name = "cluster")
gg_kmeans

sil.hclust <- silhouette(sample_df$hclust.cluster, dist(select(sample_df,x,y)))
summary(sil.hclust)
## Silhouette of 100 units in 4 clusters from silhouette.default(x = sample_df$hclust.cluster, dist = dist(select(sample_df,  from     x, y))) :
##  Cluster sizes and average silhouette widths:
##        23        33        16        28 
## 0.4931617 0.2357250 0.4287645 0.4980124 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.3907  0.2914  0.4551  0.3993  0.5524  0.6947
gg_hclust <- ggplot(sample_df, aes(x,y, color = sil.hclust[,3], shape = factor(hclust.cluster))) +
  geom_point(size=3) +
  theme_bw() +
  scale_colour_viridis(name = "sil.width")+
  scale_shape_discrete(name = "cluster")
gg_hclust

sil.dbscan <- silhouette(sample_df$dbscan.cluster, dist(select(sample_df,x,y)))
summary(sil.dbscan)
## Silhouette of 100 units in 5 clusters from silhouette.default(x = sample_df$dbscan.cluster, dist = dist(select(sample_df,  from     x, y))) :
##  Cluster sizes, ids = (0, 1, 2, 3, 4), and average silhouette widths:
##         41         20         21         13          5 
## -0.4153901  0.3364206  0.6551446  0.4727659  0.7157080 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.7786 -0.3944  0.3842  0.1318  0.5877  0.7994
gg_dbscan <- ggplot(sample_df, aes(x,y, color = sil.dbscan[,3], shape = factor(dbscan.cluster))) +
  geom_point(size=3) +
  theme_bw() +
  scale_colour_viridis(name = "sil.width")+
  scale_shape_discrete(name = "cluster")
gg_dbscan

A more realistic example

set.seed(123)
iris_subset <- iris %>%
  select(Petal.Length, Petal.Width)
k <- 1:10
WSS_df <- map_dbl(k, function(x) kmeans(iris_subset, centers = x, nstart = 25)$tot.withinss) %>% 
  tibble(k = k, WSS = .)
ggplot(WSS_df, aes(x = k, y = WSS)) + 
  geom_point(size = 3) +
  geom_line() +
  labs(x = "k (number of clusters)", y = "Total Within SS") +
  theme_bw() +
  scale_x_continuous(breaks = WSS_df$k)

kmeans_iris <- kmeans(iris_subset, centers = 3, nstart = 25)

ggplot(iris_subset, aes(Petal.Length, Petal.Width, color = factor(kmeans_iris$cluster))) +
  geom_point(size = 3) +
  scale_color_discrete(name = "cluster") +
  theme_bw()

sil <- silhouette(kmeans_iris$cluster, dist(iris_subset))
summary(sil)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = kmeans_iris$cluster, dist = dist(iris_subset)) :
##  Cluster sizes and average silhouette widths:
##        52        50        48 
## 0.5740971 0.9187719 0.4850074 
## Individual silhouette widths:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.004493  0.544747  0.660400  0.660480  0.908898  0.943962
cluster <- sil[,1]
sil_width <- sil[,3]

ggplot(iris_subset, aes(Petal.Length, Petal.Width, color = sil_width, shape = factor(kmeans_iris$cluster))) +
  geom_point(size = 3) +
  scale_colour_viridis(name = "sil.width")+
  scale_shape_discrete(name = "cluster") +
  theme_bw()

avg_sil_width <- data.frame(cluster, sil_width) %>%
  group_by(cluster) %>%
  summarize(mean_sil_width = mean(sil_width)) %>%
  summarize(total_sil_score = mean(mean_sil_width)) %>%
  as.numeric()
avg_sil_width
## [1] 0.6592921
k <- 2:10
avg_sil_df <- map_dbl(k, function(x) {
  km <- kmeans(iris_subset, centers = x, nstart = 25)
  sil <- silhouette(km$cluster, dist(iris_subset))
  mean_sil <- data.frame(cluster = km$cluster, sil_width = sil[,3]) %>%
    group_by(cluster) %>%
    summarize(mean_sil_width = mean(sil_width)) %>%
    summarize(overall_sil_width = mean(mean_sil_width)) %>% as.numeric()
  return(mean_sil)
  }
  ) %>% 
  tibble(k = k, sil = .) %>%
  bind_rows(c(k = 1, sil = 0)) %>%
  arrange(k)
ggplot(avg_sil_df, aes(x = k, y = sil)) + 
  geom_point(size = 3) +
  geom_vline(xintercept = avg_sil_df %>% filter(sil == max(sil)) %>% .$k, linetype = "dashed") +
  geom_text(aes(x = k, y = sil, label = round(sil,2)), nudge_x = 0.25, nudge_y = 0.05) +
  geom_line() +
  labs(x = "k (number of clusters)", y = "Silhouette") +
  theme_bw() +
  scale_x_continuous(breaks = avg_sil_df$k)

kmeans_iris <- kmeans(iris_subset, centers = 2, nstart = 25)

ggplot(iris_subset, aes(Petal.Length, Petal.Width, color = factor(kmeans_iris$cluster))) +
  geom_point(size = 3) +
  scale_color_discrete(name = "cluster") +
  theme_bw()

sil <- silhouette(kmeans_iris$cluster, dist(iris_subset))
summary(sil)
## Silhouette of 150 units in 2 clusters from silhouette.default(x = kmeans_iris$cluster, dist = dist(iris_subset)) :
##  Cluster sizes and average silhouette widths:
##        99        51 
## 0.6902457 0.9112596 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.08317 0.71773 0.77152 0.76539 0.91795 0.94609
cluster <- sil[,1]
sil_width <- sil[,3]

ggplot(iris_subset, aes(Petal.Length, Petal.Width, color = sil_width, shape = factor(kmeans_iris$cluster))) +
  geom_point(size = 3) +
  scale_colour_viridis(name = "sil.width")+
  scale_shape_discrete(name = "cluster") +
  theme_bw()

avg_sil_width <- data.frame(cluster, sil_width) %>%
  group_by(cluster) %>%
  summarize(mean_sil_width = mean(sil_width)) %>%
  summarize(total_sil_score = mean(mean_sil_width)) %>%
  as.numeric()
avg_sil_width
## [1] 0.8007526