library(factoextra)
#library(dendextend)
#library(broom)
#library(kableExtra)
library(dbscan)
library(cluster)
library(viridis)
library(tidyverse)
set.seed(1234)
sample_df <- tibble(x = runif(100), y = runif(100))
ggplot(sample_df, aes(x,y)) +
geom_point(size = 3) +
theme_bw()
### Clustering by kmeans, hclust, dbscan
kmeans.out <- kmeans(sample_df, centers = 4, nstart = 25)
hclust.out <- hclust(dist(sample_df), method = "ward.D2")
dbscan.out <- dbscan(sample_df, eps = 0.13, minPts = 7)
sample_df <- sample_df %>%
mutate(
kmeans.cluster = kmeans.out$cluster,
hclust.cluster = cutree(hclust.out, k = 4),
dbscan.cluster = dbscan.out$cluster
)
sil.kmeans <- silhouette(sample_df$kmeans.cluster, dist(select(sample_df,x,y)))
summary(sil.kmeans)
## Silhouette of 100 units in 4 clusters from silhouette.default(x = sample_df$kmeans.cluster, dist = dist(select(sample_df, from x, y))) :
## Cluster sizes and average silhouette widths:
## 29 24 17 30
## 0.3071210 0.4961924 0.4148559 0.4725747
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.05837 0.31711 0.44914 0.42045 0.57635 0.68536
gg_kmeans <- ggplot(sample_df, aes(x,y, color = sil.kmeans[,3], shape = factor(kmeans.cluster))) +
geom_point(size=3) +
theme_bw() +
scale_colour_viridis(name = "sil.width")+
scale_shape_discrete(name = "cluster")
gg_kmeans
sil.hclust <- silhouette(sample_df$hclust.cluster, dist(select(sample_df,x,y)))
summary(sil.hclust)
## Silhouette of 100 units in 4 clusters from silhouette.default(x = sample_df$hclust.cluster, dist = dist(select(sample_df, from x, y))) :
## Cluster sizes and average silhouette widths:
## 23 33 16 28
## 0.4931617 0.2357250 0.4287645 0.4980124
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.3907 0.2914 0.4551 0.3993 0.5524 0.6947
gg_hclust <- ggplot(sample_df, aes(x,y, color = sil.hclust[,3], shape = factor(hclust.cluster))) +
geom_point(size=3) +
theme_bw() +
scale_colour_viridis(name = "sil.width")+
scale_shape_discrete(name = "cluster")
gg_hclust
sil.dbscan <- silhouette(sample_df$dbscan.cluster, dist(select(sample_df,x,y)))
summary(sil.dbscan)
## Silhouette of 100 units in 5 clusters from silhouette.default(x = sample_df$dbscan.cluster, dist = dist(select(sample_df, from x, y))) :
## Cluster sizes, ids = (0, 1, 2, 3, 4), and average silhouette widths:
## 41 20 21 13 5
## -0.4153901 0.3364206 0.6551446 0.4727659 0.7157080
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.7786 -0.3944 0.3842 0.1318 0.5877 0.7994
gg_dbscan <- ggplot(sample_df, aes(x,y, color = sil.dbscan[,3], shape = factor(dbscan.cluster))) +
geom_point(size=3) +
theme_bw() +
scale_colour_viridis(name = "sil.width")+
scale_shape_discrete(name = "cluster")
gg_dbscan
set.seed(123)
iris_subset <- iris %>%
select(Petal.Length, Petal.Width)
k <- 1:10
WSS_df <- map_dbl(k, function(x) kmeans(iris_subset, centers = x, nstart = 25)$tot.withinss) %>%
tibble(k = k, WSS = .)
ggplot(WSS_df, aes(x = k, y = WSS)) +
geom_point(size = 3) +
geom_line() +
labs(x = "k (number of clusters)", y = "Total Within SS") +
theme_bw() +
scale_x_continuous(breaks = WSS_df$k)
kmeans_iris <- kmeans(iris_subset, centers = 3, nstart = 25)
ggplot(iris_subset, aes(Petal.Length, Petal.Width, color = factor(kmeans_iris$cluster))) +
geom_point(size = 3) +
scale_color_discrete(name = "cluster") +
theme_bw()
sil <- silhouette(kmeans_iris$cluster, dist(iris_subset))
summary(sil)
## Silhouette of 150 units in 3 clusters from silhouette.default(x = kmeans_iris$cluster, dist = dist(iris_subset)) :
## Cluster sizes and average silhouette widths:
## 52 50 48
## 0.5740971 0.9187719 0.4850074
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.004493 0.544747 0.660400 0.660480 0.908898 0.943962
cluster <- sil[,1]
sil_width <- sil[,3]
ggplot(iris_subset, aes(Petal.Length, Petal.Width, color = sil_width, shape = factor(kmeans_iris$cluster))) +
geom_point(size = 3) +
scale_colour_viridis(name = "sil.width")+
scale_shape_discrete(name = "cluster") +
theme_bw()
avg_sil_width <- data.frame(cluster, sil_width) %>%
group_by(cluster) %>%
summarize(mean_sil_width = mean(sil_width)) %>%
summarize(total_sil_score = mean(mean_sil_width)) %>%
as.numeric()
avg_sil_width
## [1] 0.6592921
k <- 2:10
avg_sil_df <- map_dbl(k, function(x) {
km <- kmeans(iris_subset, centers = x, nstart = 25)
sil <- silhouette(km$cluster, dist(iris_subset))
mean_sil <- data.frame(cluster = km$cluster, sil_width = sil[,3]) %>%
group_by(cluster) %>%
summarize(mean_sil_width = mean(sil_width)) %>%
summarize(overall_sil_width = mean(mean_sil_width)) %>% as.numeric()
return(mean_sil)
}
) %>%
tibble(k = k, sil = .) %>%
bind_rows(c(k = 1, sil = 0)) %>%
arrange(k)
ggplot(avg_sil_df, aes(x = k, y = sil)) +
geom_point(size = 3) +
geom_vline(xintercept = avg_sil_df %>% filter(sil == max(sil)) %>% .$k, linetype = "dashed") +
geom_text(aes(x = k, y = sil, label = round(sil,2)), nudge_x = 0.25, nudge_y = 0.05) +
geom_line() +
labs(x = "k (number of clusters)", y = "Silhouette") +
theme_bw() +
scale_x_continuous(breaks = avg_sil_df$k)
kmeans_iris <- kmeans(iris_subset, centers = 2, nstart = 25)
ggplot(iris_subset, aes(Petal.Length, Petal.Width, color = factor(kmeans_iris$cluster))) +
geom_point(size = 3) +
scale_color_discrete(name = "cluster") +
theme_bw()
sil <- silhouette(kmeans_iris$cluster, dist(iris_subset))
summary(sil)
## Silhouette of 150 units in 2 clusters from silhouette.default(x = kmeans_iris$cluster, dist = dist(iris_subset)) :
## Cluster sizes and average silhouette widths:
## 99 51
## 0.6902457 0.9112596
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08317 0.71773 0.77152 0.76539 0.91795 0.94609
cluster <- sil[,1]
sil_width <- sil[,3]
ggplot(iris_subset, aes(Petal.Length, Petal.Width, color = sil_width, shape = factor(kmeans_iris$cluster))) +
geom_point(size = 3) +
scale_colour_viridis(name = "sil.width")+
scale_shape_discrete(name = "cluster") +
theme_bw()
avg_sil_width <- data.frame(cluster, sil_width) %>%
group_by(cluster) %>%
summarize(mean_sil_width = mean(sil_width)) %>%
summarize(total_sil_score = mean(mean_sil_width)) %>%
as.numeric()
avg_sil_width
## [1] 0.8007526