Overview
3-clusters |To seeatches HFRS theoretical
framework (stressed / coping / comfortable) |
8 clustering variables:
| Variables |
Description |
| BBDI_sl_w_z |
Housing debt burden index |
| SHUSH_sl_w_z |
Shelter cost burden |
| MDI_sl_w_z |
Mortgage debt index (NA → 0 in EDA) |
| FINAS_sl_w_z |
Financial assets |
| ODDI_sl_w_z |
Other debt burden |
| PWNETWPT_sl_w_z |
Net worth |
| PATTSTIN_z |
Income expectations |
| PNBEARG_z |
Number of earners |
0 Libraries & Setup
library(tidyverse)
library(factoextra)
library(cluster)
library(patchwork)
library(scales)
library(kableExtra)
set.seed(2024)
CLUST_VARS <- c("BBDI_sl_w_z", "SHUSH_sl_w_z", "MDI_sl_w_z",
"FINAS_sl_w_z", "ODDI_sl_w_z", "PWNETWPT_sl_w_z",
"PATTSTIN_z", "PNBEARG_z")
# Shared helpers
wmean <- function(x, w) weighted.mean(x, w, na.rm = TRUE)
make_profile <- function(df, cluster_col) {
df %>%
group_by(cluster = .data[[cluster_col]]) %>%
summarise(
n = n(),
wt_n = sum(PWEIGHT),
across(all_of(CLUST_VARS), ~ wmean(.x, PWEIGHT)),
HFRS_mean = wmean(HFRS_100, PWEIGHT),
pct_stressed = wmean(HFRS_label == "stressed", PWEIGHT) * 100,
pct_coping = wmean(HFRS_label == "coping", PWEIGHT) * 100,
pct_comfortable = wmean(HFRS_label == "comfortable", PWEIGHT) * 100,
pct_mortgage = wmean(Housing_Mortgage_Status == "Own_with_mortgage", PWEIGHT) * 100,
.groups = "drop"
)
}
# make_heatmap <- function(profile, k_label) {
# profile %>%
# select(cluster, all_of(CLUST_VARS)) %>%
# pivot_longer(-cluster, names_to = "variable", values_to = "mean_z") %>%
# mutate(cluster = factor(cluster)) %>%
# ggplot(aes(x = variable, y = cluster, fill = mean_z)) +
# geom_tile(colour = "white", linewidth = 0.5) +
# geom_text(aes(label = round(mean_z, 2)), size = 3) +
# scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#D73027",
# midpoint = 0, name = "Mean z") +
# labs(title = paste("Cluster Profiles —", k_label), x = NULL, y = "Cluster") +
# theme_minimal(base_size = 11) +
# theme(axis.text.x = element_text(angle = 30, hjust = 1))
# }
#
# make_year_bar <- function(df, cluster_col, k_label) {
# df %>%
# count(year, cluster = .data[[cluster_col]], wt = PWEIGHT, name = "wt_n") %>%
# group_by(year) %>%
# mutate(pct = wt_n / sum(wt_n) * 100,
# cluster = factor(cluster)) %>%
# ggplot(aes(x = factor(year), y = pct, fill = cluster)) +
# geom_col(position = "dodge") +
# geom_text(aes(label = paste0(round(pct, 1), "%")),
# position = position_dodge(width = 0.9), vjust = -0.3, size = 3) +
# scale_fill_brewer(palette = "Set2", name = "Cluster") +
# labs(title = paste("Cluster composition by year —", k_label),
# x = "Year", y = "% of households (weighted)") +
# theme_minimal()
# }
1 Load Data
df_raw <- readRDS("adf_clean.rds")
df_clust <- df_raw %>%
filter(if_all(all_of(CLUST_VARS), ~ !is.na(.)))
cat("Rows used:", nrow(df_clust),
"| Dropped:", nrow(df_raw) - nrow(df_clust), "\n")
## Rows used: 24360 | Dropped: 864
cat("2019:", sum(df_clust$year == "2019"),
"| 2023:", sum(df_clust$year == "2023"), "\n")
## 2019: 9448 | 2023: 14912
X <- df_clust %>% select(all_of(CLUST_VARS)) %>% as.matrix()
2 Optimal k Diagnostics
Diagnostic plots run on a 5,000-row subsample to avoid memory
exhaustion. See 01_1_optimal_k_comparison.Rmd for
full-sample comparison.
set.seed(42)
X_sub <- X[sample(nrow(X), min(5000, nrow(X))), ]
p_elbow <- fviz_nbclust(X_sub, kmeans, method = "wss", k.max = 8,
nstart = 10, iter.max = 50) +
labs(title = "Elbow (WSS)", subtitle = "Subsample n = 5,000") +
theme_minimal()
p_sil <- fviz_nbclust(X_sub, kmeans, method = "silhouette", k.max = 8,
nstart = 10, iter.max = 50) +
labs(title = "Silhouette", subtitle = "Subsample n = 5,000") +
theme_minimal()
p_elbow + p_sil

3 Fit K-Means
km3 <- kmeans(X, centers = 3, nstart = 50, iter.max = 100)
km4 <- kmeans(X, centers = 4, nstart = 50, iter.max = 100)
cat("=== k=3 ===\n")
## === k=3 ===
print(table(km3$cluster))
##
## 1 2 3
## 3842 9273 11245
cat("Within-SS:", round(km3$tot.withinss, 1),
"| Bet/Tot:", round(km3$betweenss / km3$totss, 3), "\n\n")
## Within-SS: 129449.6 | Bet/Tot: 0.248
cat("=== k=4 ===\n")
## === k=4 ===
print(table(km4$cluster))
##
## 1 2 3 4
## 11097 2659 708 9896
cat("Within-SS:", round(km4$tot.withinss, 1),
"| Bet/Tot:", round(km4$betweenss / km4$totss, 3), "\n")
## Within-SS: 115600.1 | Bet/Tot: 0.328
df_clust <- df_clust %>%
mutate(cluster_k3 = km3$cluster,
cluster_k4 = km4$cluster)
Path A k = 3
A1 PCA Visualisation
fviz_cluster(km3, data = X,
geom = "point", ellipse.type = "convex",
palette = "Set2", alpha = 0.4, pointsize = 0.8) +
labs(title = "K-Means k=3 (PCA projection)") +
theme_minimal()

A2 Cluster Profiles
profile_k3 <- make_profile(df_clust, "cluster_k3")
profile_k3 %>%
mutate(wt_n = comma(round(wt_n))) %>%
kable(digits = 2, caption = "Path A — Cluster Profiles (k=3, weighted)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)
Path A — Cluster Profiles (k=3, weighted)
|
cluster
|
n
|
wt_n
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN_z
|
PNBEARG_z
|
HFRS_mean
|
pct_stressed
|
pct_coping
|
pct_comfortable
|
pct_mortgage
|
|
1
|
3842
|
4,046,691
|
2.08
|
-0.57
|
0.14
|
0.56
|
-0.35
|
0.35
|
-0.30
|
-0.73
|
59.61
|
5.35
|
24.35
|
70.30
|
9.91
|
|
2
|
9273
|
10,167,610
|
-0.35
|
1.09
|
0.33
|
0.43
|
-0.07
|
0.31
|
-0.30
|
0.06
|
57.66
|
5.79
|
33.70
|
60.51
|
40.14
|
|
3
|
11245
|
16,266,476
|
-0.25
|
-0.54
|
0.48
|
-0.38
|
0.13
|
-0.24
|
0.28
|
0.17
|
40.10
|
60.66
|
34.70
|
4.64
|
40.26
|
A3 Cluster vs HFRS Label
df_clust %>%
count(cluster_k3, HFRS_label, wt = PWEIGHT) %>%
group_by(cluster_k3) %>%
mutate(pct = round(n / sum(n) * 100, 1)) %>%
select(-n) %>%
pivot_wider(names_from = HFRS_label, values_from = pct, values_fill = 0) %>%
kable(caption = "Path A — % of cluster by HFRS label (weighted)") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Path A — % of cluster by HFRS label (weighted)
|
cluster_k3
|
stressed
|
coping
|
comfortable
|
|
1
|
5.4
|
24.4
|
70.3
|
|
2
|
5.8
|
33.7
|
60.5
|
|
3
|
60.7
|
34.7
|
4.6
|
Path B — k = 4
B1 PCA Visualisation
fviz_cluster(km4, data = X,
geom = "point", ellipse.type = "convex",
palette = "Set1", alpha = 0.4, pointsize = 0.8) +
labs(title = "K-Means k=4 (PCA projection)") +
theme_minimal()

B2 Cluster Profiles
profile_k4 <- make_profile(df_clust, "cluster_k4")
profile_k4 %>%
mutate(wt_n = comma(round(wt_n))) %>%
kable(digits = 2, caption = "Path B — Cluster Profiles (k=4, weighted)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE)
Path B — Cluster Profiles (k=4, weighted)
|
cluster
|
n
|
wt_n
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN_z
|
PNBEARG_z
|
HFRS_mean
|
pct_stressed
|
pct_coping
|
pct_comfortable
|
pct_mortgage
|
|
1
|
11097
|
14,540,415
|
-0.29
|
0.08
|
0.61
|
-0.22
|
0.14
|
0.24
|
0.46
|
0.70
|
42.72
|
52.61
|
35.04
|
12.35
|
56.70
|
|
2
|
2659
|
2,828,111
|
2.66
|
-0.64
|
0.20
|
0.51
|
-0.25
|
0.38
|
-0.16
|
-0.66
|
59.28
|
7.92
|
23.13
|
68.95
|
12.33
|
|
3
|
708
|
1,341,190
|
-0.44
|
-0.66
|
0.11
|
-0.35
|
1.40
|
-4.17
|
0.21
|
-0.08
|
29.91
|
90.04
|
7.78
|
2.18
|
6.44
|
|
4
|
9896
|
11,771,061
|
-0.16
|
0.13
|
0.18
|
0.22
|
-0.28
|
0.15
|
-0.53
|
-0.67
|
55.29
|
13.51
|
35.71
|
50.79
|
19.98
|
B3 Cluster vs HFRS Label
df_clust %>%
count(cluster_k4, HFRS_label, wt = PWEIGHT) %>%
group_by(cluster_k4) %>%
mutate(pct = round(n / sum(n) * 100, 1)) %>%
select(-n) %>%
pivot_wider(names_from = HFRS_label, values_from = pct, values_fill = 0) %>%
kable(caption = "Path B — % of cluster by HFRS label (weighted)") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Path B — % of cluster by HFRS label (weighted)
|
cluster_k4
|
stressed
|
coping
|
comfortable
|
|
1
|
52.6
|
35.0
|
12.3
|
|
2
|
7.9
|
23.1
|
69.0
|
|
3
|
90.0
|
7.8
|
2.2
|
|
4
|
13.5
|
35.7
|
50.8
|