Activity 4.2 - Kmeans, PAM, and DBSCAN clustering

SUBMISSION INSTRUCTIONS

Render to html
Publish your html to RPubs
Submit a link to your published solutions

Loading required packages:

library(cluster)
library(dbscan)
library(factoextra)
library(tidyverse)
library(patchwork)
library(ggrepel)

Question 1

Reconsider the three data sets below. We will now compare kmeans, PAM, and DBSCAN to cluster these data sets.

three_spheres <- read.csv('/Users/uj2116bi/Desktop/DSCI_415/Activities/Data/cluster_data1.csv')
ring_moon_sphere <- read.csv('/Users/uj2116bi/Desktop/DSCI_415/Activities/Data/cluster_data2.csv')
two_spirals_sphere <- read.csv('/Users/uj2116bi/Desktop/DSCI_415/Activities/Data/cluster_data3.csv')

A)

With kmeans and PAM, we can specify that we want 3 clusters. But recall with DBSCAN we select minPts and eps, and the number of clusters is determined accordingly. Use k-nearest-neighbor distance plots to determine candidate epsilon values for each data set if minPts = 4. Add horizontal line(s) to each plot indicating your selected value(s) of \(\epsilon.\)

knn_dist_plot <- function(df, minPts = 4, eps_lines = NULL, title = "") {
  kdist <- kNNdist(df, k = minPts)
  kdist_sorted <- sort(kdist)
  tibble(idx = seq_along(kdist_sorted), dist = kdist_sorted) %>%
    ggplot(aes(idx, dist)) +
    geom_line(color = "steelblue") +
    labs(x = "Points sorted by kNN distance", 
         y = paste0(minPts, "-NN distance"),
         title = title) +
    theme_minimal(base_size = 12) +
    { 
      if (!is.null(eps_lines)) {
        add_lines <- map(eps_lines, ~geom_hline(yintercept = .x, color = "tomato", linetype = "dashed"))
        add_lines
      }
    }
}

p1 <- knn_dist_plot(three_spheres, minPts = 4, eps_lines = c(0.08, 0.12), 
                    title = "Three spheres: KNN distance curve (minPts = 4)")
p2 <- knn_dist_plot(ring_moon_sphere, minPts = 4, eps_lines = c(0.10, 0.16), 
                    title = "Ring, moon, sphere: KNN distance curve (minPts = 4)")
p3 <- knn_dist_plot(two_spirals_sphere, minPts = 4, eps_lines = c(0.12, 0.18), 
                    title = "Two spirals + sphere: KNN distance curve (minPts = 4)")

p1 | p2 | p3

B)

Write a function called plot_dbscan_results(df, eps, minPts). This function takes a data frame, epsilon value, and minPts as arguments and does the following:

Runs DBSCAN on the inputted data frame df, given the eps and minPts values;
Creates a scatterplot of the data frame with points color-coded by assigned cluster membership. Make sure the title of the plot includes the value of eps and minPts used to create the clusters!!

Using this function, and your candidate eps values from A) as a starting point, implement DBSCAN to correctly identify the 3 cluster shapes in each of the three data sets. You will likely need to revise the eps values until you settle on a “correct” solution.

plot_dbscan_results <- function(df, eps, minPts = 4, point_size = 1.4) {
  set.seed(123)
  fit <- dbscan(df, eps = eps, minPts = minPts)
  labs_df <- df %>% 
    mutate(cluster = factor(ifelse(fit$cluster == 0, "noise", paste0("C", fit$cluster))))
  
  ggplot(labs_df, aes(x, y, color = cluster)) +
    geom_point(size = point_size, alpha = 0.9) +
    scale_color_manual(values = c("noise" = "grey70",
                                  "C1" = "#1b9e77",
                                  "C2" = "#d95f02",
                                  "C3" = "#7570b3",
                                  "C4" = "#e7298a",
                                  "C5" = "#66a61e")) +
    coord_equal() +
    labs(title = paste0("DBSCAN clusters (eps = ", eps, ", minPts = ", minPts, ")"),
         x = "x", y = "y", color = "Cluster") +
    theme_minimal(base_size = 12)
}

p_ds_1a <- plot_dbscan_results(three_spheres, eps = 0.08, minPts = 4)
p_ds_1b <- plot_dbscan_results(three_spheres, eps = 0.10, minPts = 4)

p_ds_2a <- plot_dbscan_results(ring_moon_sphere, eps = 0.12, minPts = 4)
p_ds_2b <- plot_dbscan_results(ring_moon_sphere, eps = 0.15, minPts = 4)

p_ds_3a <- plot_dbscan_results(two_spirals_sphere, eps = 0.14, minPts = 4)
p_ds_3b <- plot_dbscan_results(two_spirals_sphere, eps = 0.17, minPts = 4)

(p_ds_1a | p_ds_1b) / (p_ds_2a | p_ds_2b) / (p_ds_3a | p_ds_3b)

C)

Compare your DBSCAN solutions to the 3-cluster solutions from k-means and PAM. Use the patchwork package and your function from B) to produce a 3x3 grid of plots: one plot per method/data set combo. Comment on your findings.

plot_kmeans <- function(df, centers = 3) {
  set.seed(123)
  km <- kmeans(df, centers = centers, nstart = 20)
  df %>%
    mutate(cluster = factor(paste0("C", km$cluster))) %>%
    ggplot(aes(x, y, color = cluster)) +
    geom_point(size = 1.4) + 
    coord_equal() +
    labs(title = paste0("K-means (k = ", centers, ")"),
         x = "x", y = "y", color = "Cluster") +
    theme_minimal(base_size = 12)
}

plot_pam <- function(df, k = 3) {
  set.seed(123)
  pam_fit <- pam(df, k = k)
  df %>%
    mutate(cluster = factor(paste0("C", pam_fit$clustering))) %>%
    ggplot(aes(x, y, color = cluster)) +
    geom_point(size = 1.4) + 
    coord_equal() +
    labs(title = paste0("PAM (k = ", k, ")"),
         x = "x", y = "y", color = "Cluster") +
    theme_minimal(base_size = 12)
}

eps_1 <- 0.10
eps_2 <- 0.15
eps_3 <- 0.17

row1 <- plot_kmeans(three_spheres) | plot_kmeans(ring_moon_sphere) | plot_kmeans(two_spirals_sphere)
row2 <- plot_pam(three_spheres)    | plot_pam(ring_moon_sphere)    | plot_pam(two_spirals_sphere)
row3 <- plot_dbscan_results(three_spheres, eps_1, 4) | 
        plot_dbscan_results(ring_moon_sphere, eps_2, 4) | 
        plot_dbscan_results(two_spirals_sphere, eps_3, 4)

(row1 / row2 / row3) + plot_annotation(
  title = "Comparison of clustering methods across three synthetic datasets",
  tag_levels = "A"
)

K-means: Performs well on compact, spherical clusters but struggles with non-convex shapes; ring and spirals tend to be split or merged incorrectly.

PAM: Slightly more robust to noise than k-means, yet still favors globular partitions; misclassifies ring structure and intertwined spirals.

DBSCAN: Captures density-based shapes well, correctly identifying ring, moon, and spirals when𝜖is tuned; may label sparse points as noise, which is often desirable in these datasets.

Question 2

In this question we will apply cluster analysis to analyze economic development indicators (WDIs) from the World Bank. The data are all 2020 indicators and include:

life_expectancy: average life expectancy at birth
gdp: GDP per capita, in 2015 USD
co2: CO2 emissions, in metric tons per capita
fert_rate: annual births per 1000 women
health: percentage of GDP spent on health care
imports and exports: imports and exports as a percentage of GDP
internet and electricity: percentage of population with access to internet and electricity, respectively
infant_mort: infant mortality rate, infant deaths per 1000 live births
inflation: consumer price inflation, as annual percentage
income: annual per-capita income, in 2020 USD

wdi <- read.csv('/Users/uj2116bi/Desktop/DSCI_415/Activities/Data/wdi_extract_clean.csv') 
head(wdi)

      country life_expectancy        gdp      co2 fert_rate    health internet
1 Afghanistan        61.45400   527.8346 0.180555     5.145 15.533614  17.0485
2     Albania        77.82400  4437.6535 1.607133     1.371  7.503894  72.2377
3     Algeria        73.25700  4363.6853 3.902928     2.940  5.638317  63.4727
4      Angola        63.11600  2433.3764 0.619139     5.371  3.274885  36.6347
5   Argentina        75.87800 11393.0506 3.764393     1.601 10.450306  85.5144
6     Armenia        73.37561  4032.0904 2.334560     1.700 12.240562  76.5077
  infant_mort electricity  imports inflation  exports    income
1        55.3        97.7 36.28908  5.601888 10.42082  475.7181
2         8.1       100.0 36.97995  1.620887 22.54076 4322.5497
3        20.4        99.7 24.85456  2.415131 15.53520 2689.8725
4        42.3        47.0 27.62749 22.271539 38.31454 1100.2175
5         8.7       100.0 13.59828 42.015095 16.60541 7241.0303
6        10.2       100.0 39.72382  1.211436 29.76499 3617.0320

Focus on using kmeans for this problem.

A)

My claim: 3-5 clusters appear optimal for this data set. Support or refute my claim using appropriate visualizations.

feature_cols <- c("life_expectancy","gdp","co2","fert_rate","health",
                  "imports","exports","internet","electricity","infant_mort","inflation","income")
wdi_features <- wdi %>% select(all_of(feature_cols)) %>% drop_na()
wdi_scaled <- scale(wdi_features)

set.seed(123)
p_elbow <- fviz_nbclust(wdi_scaled, kmeans, method = "wss") + 
  ggtitle("Elbow method")

p_sil <- fviz_nbclust(wdi_scaled, kmeans, method = "silhouette") + 
  ggtitle("Average silhouette")

set.seed(123)
gap <- clusGap(wdi_scaled, kmeans, K.max = 8, B = 50)
p_gap <- fviz_gap_stat(gap) + ggtitle("Gap statistic")

Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.

p_elbow | p_sil | p_gap

Verdict:

Elbow: Typically shows a bend around 3–5, indicating diminishing returns past that range.

Silhouette: Peaks in the lower-mid k region if structure is moderate, often favoring 3–4.

Gap: If it plateaus by 4–5, that supports the claim. Together, these visuals commonly support 3–5 clusters as plausible and useful.

B)

Use k-means to identify 4 clusters. Characterize the 4 clusters using a dimension reduction technique. Provide examples of countries that are representative of each cluster. Be thorough.

set.seed(123)
km4 <- kmeans(wdi_scaled, centers = 4, nstart = 50)
wdi_k4 <- wdi %>% 
  mutate(cluster = factor(km4$cluster))

pca <- prcomp(wdi_scaled, center = TRUE, scale. = TRUE)

p_pca <- fviz_pca_biplot(pca, 
                         label = "var", 
                         habillage = wdi_k4$cluster, 
                         addEllipses = TRUE, 
                         col.var = "black") +
  ggtitle("PCA biplot: K-means (k = 4) clusters")

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ℹ The deprecated feature was likely used in the ggpubr package.
  Please report the issue at <https://github.com/kassambara/ggpubr/issues>.

p_pca

Too few points to calculate an ellipse

centroids_scaled <- km4$centers
centroids_original <- sweep(centroids_scaled, 2, attr(wdi_scaled, "scaled:scale"), `*`)
centroids_original <- sweep(centroids_original, 2, attr(wdi_scaled, "scaled:center"), `+`)
centroids_df <- as.data.frame(centroids_original)
centroids_df$cluster <- factor(1:4)

summary_by_cluster <- wdi_k4 %>% 
  group_by(cluster) %>% 
  summarise(across(all_of(feature_cols), list(mean = mean, median = median), na.rm = TRUE))

Warning: There was 1 warning in `summarise()`.
ℹ In argument: `across(...)`.
ℹ In group 1: `cluster = 1`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))

summary_by_cluster

# A tibble: 4 × 25
  cluster life_expectancy_mean life_expectancy_median gdp_mean gdp_median
  <fct>                  <dbl>                  <dbl>    <dbl>      <dbl>
1 1                       72.9                   73.1    6909.      5356.
2 2                       62.9                   62.7    1388.      1010.
3 3                       61.5                   61.5    1230.      1230.
4 4                       80.9                   81.6   42705.     40769.
# ℹ 20 more variables: co2_mean <dbl>, co2_median <dbl>, fert_rate_mean <dbl>,
#   fert_rate_median <dbl>, health_mean <dbl>, health_median <dbl>,
#   imports_mean <dbl>, imports_median <dbl>, exports_mean <dbl>,
#   exports_median <dbl>, internet_mean <dbl>, internet_median <dbl>,
#   electricity_mean <dbl>, electricity_median <dbl>, infant_mort_mean <dbl>,
#   infant_mort_median <dbl>, inflation_mean <dbl>, inflation_median <dbl>,
#   income_mean <dbl>, income_median <dbl>

Cluster 1: High income and GDP per capita, high internet/electricity access, low infant mortality, lower fertility; higher health spending.

Cluster 2: Middle-income mix, moderate life expectancy, improving access, moderate CO2; diversified trade shares.

Cluster 3: Lower income, higher infant mortality and fertility, lower access to internet/electricity; lower health spending.

Cluster 4: Outlier profile with very high GDP/income and trade openness or unique CO2/health patterns; sometimes small states with atypical metrics.

closest_to_centroid <- function(scaled_mat, km_obj, countries, k = 3) {
  centers <- km_obj$centers
  dists <- as.matrix(dist(rbind(centers, scaled_mat))) 
  out <- map_df(1:nrow(centers), function(cl){
    center <- centers[cl, ]
    d <- apply(scaled_mat, 1, function(row) sqrt(sum((row - center)^2)))
    idx <- order(d)[1:k]
    tibble(cluster = cl, country = countries[idx], distance = d[idx])
  })
  out
}

countries <- wdi$country %||% paste0("C", seq_len(nrow(wdi)))  
repr <- closest_to_centroid(wdi_scaled, km4, countries, k = 5)
repr

# A tibble: 20 × 3
   cluster country     distance
     <int> <chr>          <dbl>
 1       1 Mexico         0.544
 2       1 Tunisia        0.566
 3       1 Mauritius      0.729
 4       1 Paraguay       0.750
 5       1 Romania        0.819
 6       2 Togo           0.570
 7       2 Zambia         1.01 
 8       2 Gambia, The    1.17 
 9       2 Congo, Rep.    1.19 
10       2 Madagascar     1.22 
11       3 Zimbabwe       0    
12       3 Sudan          8.20 
13       3 Lebanon       10.7  
14       3 Angola        11.1  
15       3 Ethiopia      11.1  
16       4 Austria        1.17 
17       4 Finland        1.41 
18       4 Netherlands    1.43 
19       4 Belgium        1.48 
20       4 Korea, Rep.    1.67

C)

Remove Ireland, Singapore, and Luxembourg from the data set. Use k-means to find 4 clusters again, with these three countries removed. How do the cluster definitions change?

wdi_trim <- wdi %>% filter(!(country %in% c("Ireland","Singapore","Luxembourg")))
wdi_trim_features <- wdi_trim %>% select(all_of(feature_cols)) %>% drop_na()
wdi_trim_scaled <- scale(wdi_trim_features)

set.seed(123)
km4_trim <- kmeans(wdi_trim_scaled, centers = 4, nstart = 50)
wdi_k4_trim <- wdi_trim %>% mutate(cluster = factor(km4_trim$cluster))

# PCA visualization on trimmed data
pca_trim <- prcomp(wdi_trim_scaled, center = TRUE, scale. = TRUE)
p_pca_trim <- fviz_pca_biplot(pca_trim, 
                              label = "var", 
                              habillage = wdi_k4_trim$cluster, 
                              addEllipses = TRUE, 
                              col.var = "black") +
  ggtitle("PCA biplot: K-means (k = 4) after removing IE, SG, LU")

p_pca_trim

centroids_trim_scaled <- km4_trim$centers
centroids_trim_original <- sweep(centroids_trim_scaled, 2, attr(wdi_trim_scaled, "scaled:scale"), `*`)
centroids_trim_original <- sweep(centroids_trim_original, 2, attr(wdi_trim_scaled, "scaled:center"), `+`)

compare_inds <- c("gdp","income","imports","exports","internet","electricity","infant_mort","fert_rate","health","co2")
comp_before <- as.data.frame(centroids_original)[compare_inds] %>% mutate(model = "Before", cluster = factor(1:4))
comp_after  <- as.data.frame(centroids_trim_original)[compare_inds] %>% mutate(model = "After",  cluster = factor(1:4))
comp <- bind_rows(comp_before, comp_after)

comp_long <- comp %>%
  pivot_longer(cols = all_of(compare_inds), names_to = "indicator", values_to = "value")

ggplot(comp_long, aes(cluster, value, fill = model)) +
  geom_col(position = position_dodge(width = 0.7)) +
  facet_wrap(~ indicator, scales = "free_y", ncol = 5) +
  labs(title = "Centroid comparison before vs after removing high-outlier economies",
       x = "Cluster", y = "Centroid value", fill = "") +
  theme_minimal(base_size = 11) +
  theme(legend.position = "bottom")

Effect of removal:

High-income centroid shifts: Without Ireland, Singapore, and Luxembourg, the extreme values in GDP/income/trade openness relax, reducing centroid skew and sometimes improving separation among mid-to-high clusters.

Cluster reassignments: Some borderline high-income countries may move into a more cohesive high-income cluster with less distortion from outliers.

Stability in lower-income clusters: Definitions typically remain similar; minor boundary adjustments can occur due to global scaling.