Activity 4.2 - Kmeans, PAM, and DBSCAN clustering

SUBMISSION INSTRUCTIONS

Render to html
Publish your html to RPubs
Submit a link to your published solutions

Loading required packages:

library(cluster)
library(dbscan)
library(factoextra)
library(tidyverse)
library(patchwork)
library(ggrepel)

Question 1

Reconsider the three data sets below. We will now compare kmeans, PAM, and DBSCAN to cluster these data sets.

three_spheres <- read.csv('Data/cluster_data1.csv')
ring_moon_sphere <- read.csv('Data/cluster_data2.csv')
two_spirals_sphere <- read.csv('Data/cluster_data3.csv')

A)

With kmeans and PAM, we can specify that we want 3 clusters. But recall with DBSCAN we select minPts and eps, and the number of clusters is determined accordingly. Use k-nearest-neighbor distance plots to determine candidate epsilon values for each data set if minPts = 4. Add horizontal line(s) to each plot indicating your selected value(s) of \(\epsilon.\)

#three spheres
kNNdistplot(three_spheres, minPts = 4)
abline(h = 0.18)

#ring moon sphere
kNNdistplot(ring_moon_sphere, minPts = 4)
abline(h = 0.13)

#two spirals sphere
kNNdistplot(two_spirals_sphere, minPts = 4)
abline(h = 0.13)

B)

Write a function called plot_dbscan_results(df, eps, minPts). This function takes a data frame, epsilon value, and minPts as arguments and does the following:

Runs DBSCAN on the inputted data frame df, given the eps and minPts values;
Creates a scatterplot of the data frame with points color-coded by assigned cluster membership. Make sure the title of the plot includes the value of eps and minPts used to create the clusters!!

plot_dbscan_results <- function(df, eps, minPts) {
  db <- dbscan::dbscan(df, eps = eps, minPts = minPts)

  df$cluster <- as.factor(db$cluster)
  
  p <- ggplot2::ggplot(df, ggplot2::aes(x = df[[1]], y = df[[2]], color = cluster)) +
    ggplot2::geom_point(size = 3, alpha = 0.7) +
    ggplot2::labs(
      title = paste("DBSCAN Clustering (eps =", eps, ", minPts =", minPts, ")"),
      x = names(df)[1],
      y = names(df)[2],
      color = "Cluster"
    ) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
    plot.title = ggplot2::element_text(size = 10) 
  )
  
  print(p)
}

#three sphere
p1 <- (plot_dbscan_results(three_spheres[, 1:2],0.185,4))

Warning: Use of `df[[1]]` is discouraged.
ℹ Use `.data[[1]]` instead.

Warning: Use of `df[[2]]` is discouraged.
ℹ Use `.data[[2]]` instead.

#ring moon sphere
p4 <- (plot_dbscan_results(ring_moon_sphere[, 1:2],0.28,4))

Warning: Use of `df[[1]]` is discouraged.
ℹ Use `.data[[1]]` instead.

Warning: Use of `df[[2]]` is discouraged.
ℹ Use `.data[[2]]` instead.

#two spirals sphere
p7 <- (plot_dbscan_results(two_spirals_sphere[, 1:2],0.11,4))

Warning: Use of `df[[1]]` is discouraged.
ℹ Use `.data[[1]]` instead.

Warning: Use of `df[[2]]` is discouraged.
ℹ Use `.data[[2]]` instead.

Using this function, and your candidate eps values from A) as a starting point, implement DBSCAN to correctly identify the 3 cluster shapes in each of the three data sets. You will likely need to revise the eps values until you settle on a “correct” solution.

C)

Compare your DBSCAN solutions to the 3-cluster solutions from k-means and PAM. Use the patchwork package and your function from B) to produce a 3x3 grid of plots: one plot per method/data set combo. Comment on your findings.

#adding clusters 
kmeans_clusters <- kmeans(three_spheres,
                         centers = 3,
                         iter.max = 20,
                         nstart = 10
                         )
pam_clusters <- pam(three_spheres,
                        k = 3,
                        nstart = 10
                    )
three_spheres$pam_clusters <- factor(pam_clusters$cluster)
three_spheres$kmeans_clusters <- factor(kmeans_clusters$cluster)

kmeans_clusters <- kmeans(ring_moon_sphere,
                         centers = 3,
                         iter.max = 20,
                         nstart = 10
                         )
pam_clusters <- pam(ring_moon_sphere,
                        k = 3,
                        nstart = 10
                    )
ring_moon_sphere$pam_clusters <- factor(pam_clusters$cluster)
ring_moon_sphere$kmeans_clusters <- factor(kmeans_clusters$cluster)

kmeans_clusters <- kmeans(two_spirals_sphere,
                         centers = 3,
                         iter.max = 20,
                         nstart = 10
                         )
pam_clusters <- pam(two_spirals_sphere,
                        k = 3,
                        nstart = 10
                    )
two_spirals_sphere$pam_clusters <- factor(pam_clusters$cluster)
two_spirals_sphere$kmeans_clusters <- factor(kmeans_clusters$cluster)


p2 <- ggplot(data = three_spheres, aes(x = x, y = y, color=kmeans_clusters)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 16) +
  labs(title = "K-Means") +
  theme(plot.title = element_text(size = 12))
p3 <- ggplot(data = three_spheres, aes(x = x, y = y, color=pam_clusters)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 16) +
    labs(title = "PAM") +
  theme(plot.title = element_text(size = 12))
p5 <- ggplot(data = ring_moon_sphere, aes(x = x, y = y, color=kmeans_clusters)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 16) +
  labs(title = "K-Means") +
  theme(plot.title = element_text(size = 12))
p6 <- ggplot(data = ring_moon_sphere, aes(x = x, y = y, color=pam_clusters)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 16) +
    labs(title = "PAM") +
  theme(plot.title = element_text(size = 12))
p8 <- ggplot(data = two_spirals_sphere, aes(x = x, y = y, color=kmeans_clusters)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 16) +
    labs(title = "K-Means") +
  theme(plot.title = element_text(size = 12))
p9 <- ggplot(data = two_spirals_sphere, aes(x = x, y = y, color=pam_clusters)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 16) +
    labs(title = "PAM") +
  theme(plot.title = element_text(size = 12))

print(
  (p1 + p2 + p3 + p4 + 
   p5 + p6 + p7 + p8 +
   p9) +
    plot_layout(nrow = 3, ncol = 3)
)

Warning: Use of `df[[1]]` is discouraged.
ℹ Use `.data[[1]]` instead.

Warning: Use of `df[[2]]` is discouraged.
ℹ Use `.data[[2]]` instead.

Warning: Use of `df[[1]]` is discouraged.
ℹ Use `.data[[1]]` instead.

Warning: Use of `df[[2]]` is discouraged.
ℹ Use `.data[[2]]` instead.

Warning: Use of `df[[1]]` is discouraged.
ℹ Use `.data[[1]]` instead.

Warning: Use of `df[[2]]` is discouraged.
ℹ Use `.data[[2]]` instead.

For the 3 spheres, I think the DBSCAN does the best, as having a few outlier points works well here, especially with the few points that are about equidistant from cluster 2 and 3

For the moon ring sphere, I also like DBSCAN the best here. K-means and PAM can’t quite capture the 3 distinct shapes like DBSCAN does

For the spirals, I don’t really like any of them but these points are hard to fit into clusters whatever way you slice it. I think K means does the best at separating the points however.

Question 2

In this question we will apply cluster analysis to analyze economic development indicators (WDIs) from the World Bank. The data are all 2020 indicators and include:

life_expectancy: average life expectancy at birth
gdp: GDP per capita, in 2015 USD
co2: CO2 emissions, in metric tons per capita
fert_rate: annual births per 1000 women
health: percentage of GDP spent on health care
imports and exports: imports and exports as a percentage of GDP
internet and electricity: percentage of population with access to internet and electricity, respectively
infant_mort: infant mortality rate, infant deaths per 1000 live births
inflation: consumer price inflation, as annual percentage
income: annual per-capita income, in 2020 USD

wdi <- read.csv('Data/wdi_extract_clean.csv')  %>% column_to_rownames(var = "country")
wdi_scaled <- scale(wdi)

Focus on using kmeans for this problem.

A)

My claim: 3-5 clusters appear optimal for this data set. Support or refute my claim using appropriate visualizations. With the elbow plot, 3 or 4 clusters look like an appropriate amount of clusters

fviz_nbclust(wdi_scaled, 
             FUNcluster = kmeans,
             method='wss',
             ) + 
  labs(title = 'kmeans')

B)

Use k-means to identify 4 clusters. Characterize the 4 clusters using a dimension reduction technique. Provide examples of countries that are representative of each cluster. Be thorough.

Cluster 1 (left-most cluster) contains countries with a high fertility and infant morality rate. These countries also have lower internet and electricity usage, lower GDP,as well as a lower life expectancy. There appear to be a lot of African and middle eastern countries within this cluster, like chad, sudan, afghanistan, and Zimbabwe
Cluster 2 (central most cluster) can be characterized as not having as many imports/exports, and tend to have lower GDPs. This cluster has quite a few small countries within it, such as Panama, Equador, the Bahamas and Costa Rica
Cluster 3 seems to have the most developed countries like Norway, the US, Japan, Canada, and France. This cluster tends to have the highest GDP, life expectancy, internet and electricity usage, and low fertility and infant mortality rates.

kmeans4 <- kmeans(wdi_scaled, centers = 4, nstart = 10)
wdi_pca <- prcomp(wdi, center=TRUE, scale. = TRUE)

kmeans_biplot <- fviz_pca(wdi_pca, 
         habillage = factor(kmeans4$cluster),max.overlaps = 1, labelsize = 2) + 
      ggtitle('K-means 4-cluster solution') + 
      guides(color='none',shape='none')

Warning in (function (mapping = NULL, data = NULL, stat = "identity", position
= "identity", : Ignoring unknown parameters: `max.overlaps`

kmeans_biplot

C)

Remove Ireland, Singapore, and Luxembourg from the data set. Use k-means to find 4 clusters again, with these three countries removed. How do the cluster definitions change?

Cluster 1 is about the same
cluster 2 becomes a bit less of a general melting pot of countries but is still defined by higher inflation rates and lower import/export numbers (in fact becomes more defined by them, as it had some outlier points previously that followed the import/export vectors)

Cluster 3 remains very similar with just fewer countries within it
A new and better defined cluster 4 emerges defined by high import and export numbers, containing many eastern european/Asia countries

sub_wdi <- subset(wdi, !(rownames(wdi) %in% c("Ireland", "Singapore","Luxembourg")))
scaled_sub <- scale(sub_wdi)

sub_kmeans4 <- kmeans(scaled_sub, centers = 4, nstart = 10)
sub_pca <- prcomp(sub_wdi, center=TRUE, scale. = TRUE)

sub_kmeans_biplot <- fviz_pca(sub_pca, 
         habillage = factor(sub_kmeans4$cluster),max.overlaps = 1, labelsize = 2) + 
      ggtitle('K-means 4-cluster solution') + 
      guides(color='none',shape='none')

Warning in (function (mapping = NULL, data = NULL, stat = "identity", position
= "identity", : Ignoring unknown parameters: `max.overlaps`

sub_kmeans_biplot