Overview

3-clusters |To seeatches HFRS theoretical framework (stressed / coping / comfortable) |

8 clustering variables:

Variables Description
BBDI_sl_w_z Housing debt burden index
SHUSH_sl_w_z Shelter cost burden
MDI_sl_w_z Mortgage debt index (NA → 0 in EDA)
FINAS_sl_w_z Financial assets
ODDI_sl_w_z Other debt burden
PWNETWPT_sl_w_z Net worth
PATTSTIN_z Income expectations
PNBEARG_z Number of earners

0 Libraries & Setup

library(tidyverse)
library(factoextra)
library(cluster)
library(patchwork)
library(scales)
library(kableExtra)

set.seed(2024)

CLUST_VARS <- c("BBDI_sl_w_z", "SHUSH_sl_w_z", "MDI_sl_w_z",
                "FINAS_sl_w_z", "ODDI_sl_w_z", "PWNETWPT_sl_w_z",
                "PATTSTIN_z", "PNBEARG_z")

# Shared helpers
wmean <- function(x, w) weighted.mean(x, w, na.rm = TRUE)

make_profile <- function(df, cluster_col) {
  df %>%
    group_by(cluster = .data[[cluster_col]]) %>%
    summarise(
      n               = n(),
      wt_n            = sum(PWEIGHT),
      across(all_of(CLUST_VARS), ~ wmean(.x, PWEIGHT)),
      HFRS_mean       = wmean(HFRS_100, PWEIGHT),
      pct_stressed    = wmean(HFRS_label == "stressed",    PWEIGHT) * 100,
      pct_coping      = wmean(HFRS_label == "coping",      PWEIGHT) * 100,
      pct_comfortable = wmean(HFRS_label == "comfortable", PWEIGHT) * 100,
      pct_mortgage    = wmean(Housing_Mortgage_Status == "Own_with_mortgage", PWEIGHT) * 100,
      .groups = "drop"
    )
}

# make_heatmap <- function(profile, k_label) {
#   profile %>%
#     select(cluster, all_of(CLUST_VARS)) %>%
#     pivot_longer(-cluster, names_to = "variable", values_to = "mean_z") %>%
#     mutate(cluster = factor(cluster)) %>%
#     ggplot(aes(x = variable, y = cluster, fill = mean_z)) +
#     geom_tile(colour = "white", linewidth = 0.5) +
#     geom_text(aes(label = round(mean_z, 2)), size = 3) +
#     scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#D73027",
#                          midpoint = 0, name = "Mean z") +
#     labs(title = paste("Cluster Profiles —", k_label), x = NULL, y = "Cluster") +
#     theme_minimal(base_size = 11) +
#     theme(axis.text.x = element_text(angle = 30, hjust = 1))
# }
# 
# make_year_bar <- function(df, cluster_col, k_label) {
#   df %>%
#     count(year, cluster = .data[[cluster_col]], wt = PWEIGHT, name = "wt_n") %>%
#     group_by(year) %>%
#     mutate(pct = wt_n / sum(wt_n) * 100,
#            cluster = factor(cluster)) %>%
#     ggplot(aes(x = factor(year), y = pct, fill = cluster)) +
#     geom_col(position = "dodge") +
#     geom_text(aes(label = paste0(round(pct, 1), "%")),
#               position = position_dodge(width = 0.9), vjust = -0.3, size = 3) +
#     scale_fill_brewer(palette = "Set2", name = "Cluster") +
#     labs(title = paste("Cluster composition by year —", k_label),
#          x = "Year", y = "% of households (weighted)") +
#     theme_minimal()
# }

1 Load Data

df_raw <- readRDS("adf_clean.rds")

df_clust <- df_raw %>%
  filter(if_all(all_of(CLUST_VARS), ~ !is.na(.)))

cat("Rows used:", nrow(df_clust),
    "| Dropped:", nrow(df_raw) - nrow(df_clust), "\n")
## Rows used: 24360 | Dropped: 864
cat("2019:", sum(df_clust$year == "2019"),
    "| 2023:", sum(df_clust$year == "2023"), "\n")
## 2019: 9448 | 2023: 14912
X <- df_clust %>% select(all_of(CLUST_VARS)) %>% as.matrix()

2 Optimal k Diagnostics

Diagnostic plots run on a 5,000-row subsample to avoid memory exhaustion. See 01_1_optimal_k_comparison.Rmd for full-sample comparison.

set.seed(42)
X_sub <- X[sample(nrow(X), min(5000, nrow(X))), ]

p_elbow <- fviz_nbclust(X_sub, kmeans, method = "wss", k.max = 8,
                        nstart = 10, iter.max = 50) +
  labs(title = "Elbow (WSS)", subtitle = "Subsample n = 5,000") +
  theme_minimal()

p_sil <- fviz_nbclust(X_sub, kmeans, method = "silhouette", k.max = 8,
                      nstart = 10, iter.max = 50) +
  labs(title = "Silhouette", subtitle = "Subsample n = 5,000") +
  theme_minimal()

p_elbow + p_sil


3 Fit K-Means

km3 <- kmeans(X, centers = 3, nstart = 50, iter.max = 100)
km4 <- kmeans(X, centers = 4, nstart = 50, iter.max = 100)

cat("=== k=3 ===\n")
## === k=3 ===
print(table(km3$cluster))
## 
##     1     2     3 
##  3842  9273 11245
cat("Within-SS:", round(km3$tot.withinss, 1),
    "| Bet/Tot:", round(km3$betweenss / km3$totss, 3), "\n\n")
## Within-SS: 129449.6 | Bet/Tot: 0.248
cat("=== k=4 ===\n")
## === k=4 ===
print(table(km4$cluster))
## 
##     1     2     3     4 
## 11097  2659   708  9896
cat("Within-SS:", round(km4$tot.withinss, 1),
    "| Bet/Tot:", round(km4$betweenss / km4$totss, 3), "\n")
## Within-SS: 115600.1 | Bet/Tot: 0.328
df_clust <- df_clust %>%
  mutate(cluster_k3 = km3$cluster,
         cluster_k4 = km4$cluster)

Path A k = 3

A1 PCA Visualisation

fviz_cluster(km3, data = X,
             geom = "point", ellipse.type = "convex",
             palette = "Set2", alpha = 0.4, pointsize = 0.8) +
  labs(title = "K-Means k=3 (PCA projection)") +
  theme_minimal()

A2 Cluster Profiles

profile_k3 <- make_profile(df_clust, "cluster_k3")

profile_k3 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Path A — Cluster Profiles (k=3, weighted)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)
Path A — Cluster Profiles (k=3, weighted)
cluster n wt_n BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN_z PNBEARG_z HFRS_mean pct_stressed pct_coping pct_comfortable pct_mortgage
1 3842 4,046,691 2.08 -0.57 0.14 0.56 -0.35 0.35 -0.30 -0.73 59.61 5.35 24.35 70.30 9.91
2 9273 10,167,610 -0.35 1.09 0.33 0.43 -0.07 0.31 -0.30 0.06 57.66 5.79 33.70 60.51 40.14
3 11245 16,266,476 -0.25 -0.54 0.48 -0.38 0.13 -0.24 0.28 0.17 40.10 60.66 34.70 4.64 40.26

A3 Cluster vs HFRS Label

df_clust %>%
  count(cluster_k3, HFRS_label, wt = PWEIGHT) %>%
  group_by(cluster_k3) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  select(-n) %>%
  pivot_wider(names_from = HFRS_label, values_from = pct, values_fill = 0) %>%
  kable(caption = "Path A — % of cluster by HFRS label (weighted)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Path A — % of cluster by HFRS label (weighted)
cluster_k3 stressed coping comfortable
1 5.4 24.4 70.3
2 5.8 33.7 60.5
3 60.7 34.7 4.6

Path B — k = 4

B1 PCA Visualisation

fviz_cluster(km4, data = X,
             geom = "point", ellipse.type = "convex",
             palette = "Set1", alpha = 0.4, pointsize = 0.8) +
  labs(title = "K-Means k=4 (PCA projection)") +
  theme_minimal()

B2 Cluster Profiles

profile_k4 <- make_profile(df_clust, "cluster_k4")

profile_k4 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Path B — Cluster Profiles (k=4, weighted)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)
Path B — Cluster Profiles (k=4, weighted)
cluster n wt_n BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN_z PNBEARG_z HFRS_mean pct_stressed pct_coping pct_comfortable pct_mortgage
1 11097 14,540,415 -0.29 0.08 0.61 -0.22 0.14 0.24 0.46 0.70 42.72 52.61 35.04 12.35 56.70
2 2659 2,828,111 2.66 -0.64 0.20 0.51 -0.25 0.38 -0.16 -0.66 59.28 7.92 23.13 68.95 12.33
3 708 1,341,190 -0.44 -0.66 0.11 -0.35 1.40 -4.17 0.21 -0.08 29.91 90.04 7.78 2.18 6.44
4 9896 11,771,061 -0.16 0.13 0.18 0.22 -0.28 0.15 -0.53 -0.67 55.29 13.51 35.71 50.79 19.98

B3 Cluster vs HFRS Label

df_clust %>%
  count(cluster_k4, HFRS_label, wt = PWEIGHT) %>%
  group_by(cluster_k4) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  select(-n) %>%
  pivot_wider(names_from = HFRS_label, values_from = pct, values_fill = 0) %>%
  kable(caption = "Path B — % of cluster by HFRS label (weighted)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Path B — % of cluster by HFRS label (weighted)
cluster_k4 stressed coping comfortable
1 52.6 35.0 12.3
2 7.9 23.1 69.0
3 90.0 7.8 2.2
4 13.5 35.7 50.8