Overview

3-clusters |To seeatches HFRS theoretical framework (stressed / coping / comfortable) |

8 clustering variables:

Variables	Description
BBDI_sl_w_z	Housing debt burden index
SHUSH_sl_w_z	Shelter cost burden
MDI_sl_w_z	Mortgage debt index (NA → 0 in EDA)
FINAS_sl_w_z	Financial assets
ODDI_sl_w_z	Other debt burden
PWNETWPT_sl_w_z	Net worth
PATTSTIN_z	Income expectations
PNBEARG_z	Number of earners

0 Libraries & Setup

library(tidyverse)
library(factoextra)
library(cluster)
library(patchwork)
library(scales)
library(kableExtra)

set.seed(2024)

CLUST_VARS <- c("BBDI_sl_w_z", "SHUSH_sl_w_z", "MDI_sl_w_z",
                "FINAS_sl_w_z", "ODDI_sl_w_z", "PWNETWPT_sl_w_z",
                "PATTSTIN_z", "PNBEARG_z")

# Shared helpers
wmean <- function(x, w) weighted.mean(x, w, na.rm = TRUE)

make_profile <- function(df, cluster_col) {
  df %>%
    group_by(cluster = .data[[cluster_col]]) %>%
    summarise(
      n               = n(),
      wt_n            = sum(PWEIGHT),
      across(all_of(CLUST_VARS), ~ wmean(.x, PWEIGHT)),
      HFRS_mean       = wmean(HFRS_100, PWEIGHT),
      pct_stressed    = wmean(HFRS_label == "stressed",    PWEIGHT) * 100,
      pct_coping      = wmean(HFRS_label == "coping",      PWEIGHT) * 100,
      pct_comfortable = wmean(HFRS_label == "comfortable", PWEIGHT) * 100,
      pct_mortgage    = wmean(Housing_Mortgage_Status == "Own_with_mortgage", PWEIGHT) * 100,
      .groups = "drop"
    )
}

# make_heatmap <- function(profile, k_label) {
#   profile %>%
#     select(cluster, all_of(CLUST_VARS)) %>%
#     pivot_longer(-cluster, names_to = "variable", values_to = "mean_z") %>%
#     mutate(cluster = factor(cluster)) %>%
#     ggplot(aes(x = variable, y = cluster, fill = mean_z)) +
#     geom_tile(colour = "white", linewidth = 0.5) +
#     geom_text(aes(label = round(mean_z, 2)), size = 3) +
#     scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#D73027",
#                          midpoint = 0, name = "Mean z") +
#     labs(title = paste("Cluster Profiles —", k_label), x = NULL, y = "Cluster") +
#     theme_minimal(base_size = 11) +
#     theme(axis.text.x = element_text(angle = 30, hjust = 1))
# }
# 
# make_year_bar <- function(df, cluster_col, k_label) {
#   df %>%
#     count(year, cluster = .data[[cluster_col]], wt = PWEIGHT, name = "wt_n") %>%
#     group_by(year) %>%
#     mutate(pct = wt_n / sum(wt_n) * 100,
#            cluster = factor(cluster)) %>%
#     ggplot(aes(x = factor(year), y = pct, fill = cluster)) +
#     geom_col(position = "dodge") +
#     geom_text(aes(label = paste0(round(pct, 1), "%")),
#               position = position_dodge(width = 0.9), vjust = -0.3, size = 3) +
#     scale_fill_brewer(palette = "Set2", name = "Cluster") +
#     labs(title = paste("Cluster composition by year —", k_label),
#          x = "Year", y = "% of households (weighted)") +
#     theme_minimal()
# }

1 Load Data

df_raw <- readRDS("adf_clean.rds")

df_clust <- df_raw %>%
  filter(if_all(all_of(CLUST_VARS), ~ !is.na(.)))

cat("Rows used:", nrow(df_clust),
    "| Dropped:", nrow(df_raw) - nrow(df_clust), "\n")

## Rows used: 24360 | Dropped: 864

cat("2019:", sum(df_clust$year == "2019"),
    "| 2023:", sum(df_clust$year == "2023"), "\n")

## 2019: 9448 | 2023: 14912

X <- df_clust %>% select(all_of(CLUST_VARS)) %>% as.matrix()

2 Optimal k Diagnostics

Diagnostic plots run on a 5,000-row subsample to avoid memory exhaustion. See 01_1_optimal_k_comparison.Rmd for full-sample comparison.

set.seed(42)
X_sub <- X[sample(nrow(X), min(5000, nrow(X))), ]

p_elbow <- fviz_nbclust(X_sub, kmeans, method = "wss", k.max = 8,
                        nstart = 10, iter.max = 50) +
  labs(title = "Elbow (WSS)", subtitle = "Subsample n = 5,000") +
  theme_minimal()

p_sil <- fviz_nbclust(X_sub, kmeans, method = "silhouette", k.max = 8,
                      nstart = 10, iter.max = 50) +
  labs(title = "Silhouette", subtitle = "Subsample n = 5,000") +
  theme_minimal()

p_elbow + p_sil

3 Fit K-Means

km3 <- kmeans(X, centers = 3, nstart = 50, iter.max = 100)
km4 <- kmeans(X, centers = 4, nstart = 50, iter.max = 100)

cat("=== k=3 ===\n")

## === k=3 ===

print(table(km3$cluster))

## 
##     1     2     3 
##  3842  9273 11245

cat("Within-SS:", round(km3$tot.withinss, 1),
    "| Bet/Tot:", round(km3$betweenss / km3$totss, 3), "\n\n")

## Within-SS: 129449.6 | Bet/Tot: 0.248

cat("=== k=4 ===\n")

## === k=4 ===

print(table(km4$cluster))

## 
##     1     2     3     4 
## 11097  2659   708  9896

cat("Within-SS:", round(km4$tot.withinss, 1),
    "| Bet/Tot:", round(km4$betweenss / km4$totss, 3), "\n")

## Within-SS: 115600.1 | Bet/Tot: 0.328

df_clust <- df_clust %>%
  mutate(cluster_k3 = km3$cluster,
         cluster_k4 = km4$cluster)

Path A k = 3

A1 PCA Visualisation

fviz_cluster(km3, data = X,
             geom = "point", ellipse.type = "convex",
             palette = "Set2", alpha = 0.4, pointsize = 0.8) +
  labs(title = "K-Means k=3 (PCA projection)") +
  theme_minimal()

A2 Cluster Profiles

profile_k3 <- make_profile(df_clust, "cluster_k3")

profile_k3 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Path A — Cluster Profiles (k=3, weighted)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)

Path A — Cluster Profiles (k=3, weighted)
cluster	n	wt_n	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN_z	PNBEARG_z	HFRS_mean	pct_stressed	pct_coping	pct_comfortable	pct_mortgage
1	3842	4,046,691	2.08	-0.57	0.14	0.56	-0.35	0.35	-0.30	-0.73	59.61	5.35	24.35	70.30	9.91
2	9273	10,167,610	-0.35	1.09	0.33	0.43	-0.07	0.31	-0.30	0.06	57.66	5.79	33.70	60.51	40.14
3	11245	16,266,476	-0.25	-0.54	0.48	-0.38	0.13	-0.24	0.28	0.17	40.10	60.66	34.70	4.64	40.26

A3 Cluster vs HFRS Label

df_clust %>%
  count(cluster_k3, HFRS_label, wt = PWEIGHT) %>%
  group_by(cluster_k3) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  select(-n) %>%
  pivot_wider(names_from = HFRS_label, values_from = pct, values_fill = 0) %>%
  kable(caption = "Path A — % of cluster by HFRS label (weighted)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Path A — % of cluster by HFRS label (weighted)
cluster_k3	stressed	coping	comfortable
1	5.4	24.4	70.3
2	5.8	33.7	60.5
3	60.7	34.7	4.6

Path B — k = 4

B1 PCA Visualisation

fviz_cluster(km4, data = X,
             geom = "point", ellipse.type = "convex",
             palette = "Set1", alpha = 0.4, pointsize = 0.8) +
  labs(title = "K-Means k=4 (PCA projection)") +
  theme_minimal()

B2 Cluster Profiles

profile_k4 <- make_profile(df_clust, "cluster_k4")

profile_k4 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Path B — Cluster Profiles (k=4, weighted)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE)

Path B — Cluster Profiles (k=4, weighted)
cluster	n	wt_n	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN_z	PNBEARG_z	HFRS_mean	pct_stressed	pct_coping	pct_comfortable	pct_mortgage
1	11097	14,540,415	-0.29	0.08	0.61	-0.22	0.14	0.24	0.46	0.70	42.72	52.61	35.04	12.35	56.70
2	2659	2,828,111	2.66	-0.64	0.20	0.51	-0.25	0.38	-0.16	-0.66	59.28	7.92	23.13	68.95	12.33
3	708	1,341,190	-0.44	-0.66	0.11	-0.35	1.40	-4.17	0.21	-0.08	29.91	90.04	7.78	2.18	6.44
4	9896	11,771,061	-0.16	0.13	0.18	0.22	-0.28	0.15	-0.53	-0.67	55.29	13.51	35.71	50.79	19.98

B3 Cluster vs HFRS Label

df_clust %>%
  count(cluster_k4, HFRS_label, wt = PWEIGHT) %>%
  group_by(cluster_k4) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  select(-n) %>%
  pivot_wider(names_from = HFRS_label, values_from = pct, values_fill = 0) %>%
  kable(caption = "Path B — % of cluster by HFRS label (weighted)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Path B — % of cluster by HFRS label (weighted)
cluster_k4	stressed	coping	comfortable
1	52.6	35.0	12.3
2	7.9	23.1	69.0
3	90.0	7.8	2.2
4	13.5	35.7	50.8

HFRS K-Means Clustering — Unsupervised (8 Variables)

2026-05-19

Overview

0 Libraries & Setup

1 Load Data

2 Optimal k Diagnostics

3 Fit K-Means

Path A k = 3

A1 PCA Visualisation

A2 Cluster Profiles

A3 Cluster vs HFRS Label

Path B — k = 4

B1 PCA Visualisation

B2 Cluster Profiles

B3 Cluster vs HFRS Label