Overview
This notebook runs full-sample hierarchical
clustering on the HFRS data in two ways:
- Gower hierarchical
- six transformed continuous indicators
PATTSTIN as raw ordered scale
PNBEARG as raw count / ordered variable
- Euclidean hierarchical
- the original 8 transformed indicators
For each method, the notebook:
- fits hierarchical clustering on the full
sample
- cuts the tree at k = 2 and k =
3
- compares the cluster labels with
HFRS_label
- produces weighted cluster-profile tables
- produces a representative case table for
interpretation
Note: hierarchical clustering itself does not
produce medoids. The “representative case” tables below use the
observation nearest to each cluster centroid as a medoid-style
interpretive summary.
0 Libraries & Setup
library(here)
library(tidyverse)
library(cluster)
library(kableExtra)
library(mclust)
library(scales)
set.seed(2024)
data_path <- here( "adf.csv")
out_data_dir <- here("02_clustering", "outputs", "data")
dir.create(out_data_dir, recursive = TRUE, showWarnings = FALSE)
EUC_VARS <- c(
"BBDI_sl_w_z", "SHUSH_sl_w_z", "MDI_sl_w_z", "FINAS_sl_w_z",
"ODDI_sl_w_z", "PWNETWPT_sl_w_z", "PATTSTIN_z", "PNBEARG_z"
)
GOWER_NUM_VARS <- c(
"BBDI_sl_w_z", "SHUSH_sl_w_z", "MDI_sl_w_z",
"FINAS_sl_w_z", "ODDI_sl_w_z", "PWNETWPT_sl_w_z"
)
GOWER_ORD_VARS <- c("PATTSTIN", "PNBEARG")
GOWER_ALL_VARS <- c(GOWER_NUM_VARS, GOWER_ORD_VARS)
wmean <- function(x, w) weighted.mean(x, w, na.rm = TRUE)
weighted_profile_table <- function(df, cluster_col, profile_vars) {
df %>%
group_by(cluster = .data[[cluster_col]]) %>%
summarise(
n = n(),
wt_n = sum(PWEIGHT, na.rm = TRUE),
across(all_of(profile_vars), ~ wmean(.x, PWEIGHT)),
HFRS_mean = wmean(HFRS_100, PWEIGHT),
pct_stressed = wmean(HFRS_label == "stressed", PWEIGHT) * 100,
pct_coping = wmean(HFRS_label == "coping", PWEIGHT) * 100,
pct_comfortable = wmean(HFRS_label == "comfortable", PWEIGHT) * 100,
.groups = "drop"
)
}
weighted_crosstab <- function(df, cluster_col) {
df %>%
dplyr::count(.data[[cluster_col]], HFRS_label, wt = PWEIGHT, name = "wt_n") %>%
group_by(.data[[cluster_col]]) %>%
mutate(pct = 100 * wt_n / sum(wt_n)) %>%
ungroup() %>%
select(all_of(c(cluster_col, "HFRS_label", "pct"))) %>%
rename(cluster = all_of(cluster_col)) %>%
pivot_wider(names_from = HFRS_label, values_from = pct, values_fill = 0)
}
ari_summary <- function(df, cluster_cols) {
label_int <- as.integer(factor(
df$HFRS_label,
levels = c("stressed", "coping", "comfortable")
))
tibble(
clustering = cluster_cols,
k = c(2, 3),
ARI = map_dbl(cluster_cols, ~ adjustedRandIndex(df[[.x]], label_int))
)
}
representative_cases <- function(df, cluster_col, vars_for_distance, cols_to_show) {
cluster_ids <- sort(unique(df[[cluster_col]]))
map_dfr(cluster_ids, function(cl) {
sub <- df %>% filter(.data[[cluster_col]] == cl)
centroid <- colMeans(sub[, vars_for_distance, drop = FALSE])
dists <- apply(sub[, vars_for_distance, drop = FALSE], 1, function(row) {
sqrt(sum((row - centroid)^2))
})
best_idx <- which.min(dists)
out <- sub[best_idx, cols_to_show, drop = FALSE]
out[[cluster_col]] <- cl
out$distance_to_centroid <- dists[[best_idx]]
out
}) %>%
relocate(all_of(cluster_col), .before = 1)
}
1 Load Data
df_raw <- readr::read_csv(data_path, show_col_types = FALSE) %>%
mutate(
HFRS_label = factor(HFRS_label, levels = c("stressed", "coping", "comfortable")),
PATTSTIN = ordered(PATTSTIN),
PNBEARG = ordered(PNBEARG)
)
required_vars <- c(
"PEFAMID", "PWEIGHT", "HFRS_100", "HFRS_label",
EUC_VARS, GOWER_ALL_VARS
)
missing_vars <- setdiff(required_vars, names(df_raw))
if (length(missing_vars) > 0) {
stop("Missing required variables: ", paste(missing_vars, collapse = ", "))
}
df_euc <- df_raw %>%
filter(
!is.na(HFRS_100), !is.na(HFRS_label), !is.na(PWEIGHT),
if_all(all_of(EUC_VARS), ~ !is.na(.x))
)
df_gower <- df_raw %>%
filter(
!is.na(HFRS_100), !is.na(HFRS_label), !is.na(PWEIGHT),
if_all(all_of(GOWER_ALL_VARS), ~ !is.na(.x))
) %>%
mutate(
PATTSTIN = ordered(PATTSTIN),
PNBEARG = ordered(PNBEARG)
)
cat("Euclidean sample rows:", nrow(df_euc), "\n")
## Euclidean sample rows: 24360
cat("Gower sample rows:", nrow(df_gower), "\n")
## Gower sample rows: 24360
2.1 Gower Hierarchical (Full Sample)
X_gower <- df_gower %>%
transmute(
across(all_of(GOWER_NUM_VARS), as.numeric),
PATTSTIN = ordered(PATTSTIN),
PNBEARG = ordered(PNBEARG)
)
gower_dist <- daisy(X_gower, metric = "gower")
hc_gower <- hclust(gower_dist, method = "ward.D2")
df_gower <- df_gower %>%
mutate(
gower_hc_k2 = cutree(hc_gower, k = 2),
gower_hc_k3 = cutree(hc_gower, k = 3)
)
gower_ari <- ari_summary(df_gower, c("gower_hc_k2", "gower_hc_k3"))
gower_ari %>%
mutate(ARI = round(ARI, 4)) %>%
kable(caption = "Gower hierarchical — ARI vs HFRS label") %>%
kable_styling(full_width = FALSE)
Gower hierarchical — ARI vs HFRS label
|
clustering
|
k
|
ARI
|
|
gower_hc_k2
|
2
|
0.0981
|
|
gower_hc_k3
|
3
|
0.0683
|
weighted_crosstab(df_gower, "gower_hc_k2") %>%
kable(digits = 1, caption = "Gower hierarchical k=2 — weighted % by HFRS label") %>%
kable_styling(full_width = FALSE)
Gower hierarchical k=2 — weighted % by HFRS label
|
cluster
|
stressed
|
coping
|
comfortable
|
|
1
|
23.7
|
34.5
|
41.8
|
|
2
|
56.6
|
30.0
|
13.4
|
weighted_crosstab(df_gower, "gower_hc_k3") %>%
kable(digits = 1, caption = "Gower hierarchical k=3 — weighted % by HFRS label") %>%
kable_styling(full_width = FALSE)
Gower hierarchical k=3 — weighted % by HFRS label
|
cluster
|
stressed
|
coping
|
comfortable
|
|
1
|
27.4
|
36.0
|
36.7
|
|
2
|
6.2
|
27.9
|
65.9
|
|
3
|
56.6
|
30.0
|
13.4
|
gower_profile_k2 <- weighted_profile_table(
df_gower,
"gower_hc_k2",
c(GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)
gower_profile_k3 <- weighted_profile_table(
df_gower,
"gower_hc_k3",
c(GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)
gower_profile_k2 %>%
mutate(wt_n = comma(round(wt_n))) %>%
kable(digits = 2, caption = "Gower hierarchical k=2 — weighted cluster profiles") %>%
kable_styling(full_width = FALSE)
Gower hierarchical k=2 — weighted cluster profiles
|
cluster
|
n
|
wt_n
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN
|
PNBEARG
|
HFRS_mean
|
pct_stressed
|
pct_coping
|
pct_comfortable
|
|
1
|
17136
|
19,988,308
|
0.08
|
0.01
|
-0.07
|
-0.01
|
0.04
|
-0.05
|
NA
|
NA
|
51.80
|
23.68
|
34.55
|
41.77
|
|
2
|
7224
|
10,492,470
|
-0.08
|
-0.01
|
0.15
|
0.06
|
-0.08
|
0.17
|
NA
|
NA
|
42.34
|
56.60
|
30.04
|
13.36
|
gower_profile_k3 %>%
mutate(wt_n = comma(round(wt_n))) %>%
kable(digits = 2, caption = "Gower hierarchical k=3 — weighted cluster profiles") %>%
kable_styling(full_width = FALSE)
Gower hierarchical k=3 — weighted cluster profiles
|
cluster
|
n
|
wt_n
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN
|
PNBEARG
|
HFRS_mean
|
pct_stressed
|
pct_coping
|
pct_comfortable
|
|
1
|
13589
|
16,482,621
|
0.08
|
-0.03
|
-0.08
|
0.00
|
0.05
|
-0.13
|
NA
|
NA
|
50.25
|
27.4
|
35.95
|
36.65
|
|
2
|
3547
|
3,505,687
|
0.08
|
0.19
|
0.02
|
-0.06
|
-0.01
|
0.32
|
NA
|
NA
|
59.11
|
6.2
|
27.94
|
65.87
|
|
3
|
7224
|
10,492,470
|
-0.08
|
-0.01
|
0.15
|
0.06
|
-0.08
|
0.17
|
NA
|
NA
|
42.34
|
56.6
|
30.04
|
13.36
|
gower_rep_k2 <- representative_cases(
df = df_gower %>%
mutate(
PATTSTIN_num = as.numeric(PATTSTIN),
PNBEARG_num = as.numeric(PNBEARG)
),
cluster_col = "gower_hc_k2",
vars_for_distance = c(GOWER_NUM_VARS, "PATTSTIN_num", "PNBEARG_num"),
cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)
gower_rep_k3 <- representative_cases(
df = df_gower %>%
mutate(
PATTSTIN_num = as.numeric(PATTSTIN),
PNBEARG_num = as.numeric(PNBEARG)
),
cluster_col = "gower_hc_k3",
vars_for_distance = c(GOWER_NUM_VARS, "PATTSTIN_num", "PNBEARG_num"),
cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)
gower_rep_k2 %>%
kable(digits = 3, caption = "Gower hierarchical k=2 — representative cases") %>%
kable_styling(full_width = FALSE)
Gower hierarchical k=2 — representative cases
|
gower_hc_k2
|
PEFAMID
|
HFRS_100
|
HFRS_label
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN
|
PNBEARG
|
distance_to_centroid
|
|
1
|
11
|
48.843
|
coping
|
0.032
|
-0.063
|
-0.067
|
-0.373
|
-0.061
|
0.251
|
0
|
1
|
0.529
|
|
2
|
1383
|
42.048
|
stressed
|
-0.093
|
0.331
|
0.267
|
0.085
|
-0.332
|
0.344
|
1
|
2
|
0.596
|
gower_rep_k3 %>%
kable(digits = 3, caption = "Gower hierarchical k=3 — representative cases") %>%
kable_styling(full_width = FALSE)
Gower hierarchical k=3 — representative cases
|
gower_hc_k3
|
PEFAMID
|
HFRS_100
|
HFRS_label
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN
|
PNBEARG
|
distance_to_centroid
|
|
1
|
11
|
48.843
|
coping
|
0.032
|
-0.063
|
-0.067
|
-0.373
|
-0.061
|
0.251
|
0
|
1
|
0.483
|
|
2
|
1707
|
60.318
|
comfortable
|
-0.166
|
0.483
|
0.160
|
-0.260
|
-0.177
|
0.323
|
-1
|
1
|
0.694
|
|
3
|
1383
|
42.048
|
stressed
|
-0.093
|
0.331
|
0.267
|
0.085
|
-0.332
|
0.344
|
1
|
2
|
0.596
|
2.2 Euclidean Hierarchical (Full Sample)
X_euc <- df_euc %>%
select(all_of(EUC_VARS)) %>%
as.matrix()
euc_dist <- dist(X_euc)
hc_euc <- hclust(euc_dist, method = "ward.D2")
df_euc <- df_euc %>%
mutate(
euc_hc_k2 = cutree(hc_euc, k = 2),
euc_hc_k3 = cutree(hc_euc, k = 3)
)
euc_ari <- ari_summary(df_euc, c("euc_hc_k2", "euc_hc_k3"))
euc_ari %>%
mutate(ARI = round(ARI, 4)) %>%
kable(caption = "Euclidean hierarchical — ARI vs HFRS label") %>%
kable_styling(full_width = FALSE)
Euclidean hierarchical — ARI vs HFRS label
|
clustering
|
k
|
ARI
|
|
euc_hc_k2
|
2
|
-0.0047
|
|
euc_hc_k3
|
3
|
0.0046
|
weighted_crosstab(df_euc, "euc_hc_k2") %>%
kable(digits = 1, caption = "Euclidean hierarchical k=2 — weighted % by HFRS label") %>%
kable_styling(full_width = FALSE)
Euclidean hierarchical k=2 — weighted % by HFRS label
|
cluster
|
stressed
|
coping
|
comfortable
|
|
1
|
36.5
|
33.9
|
29.6
|
|
2
|
10.9
|
18.2
|
71.0
|
weighted_crosstab(df_euc, "euc_hc_k3") %>%
kable(digits = 1, caption = "Euclidean hierarchical k=3 — weighted % by HFRS label") %>%
kable_styling(full_width = FALSE)
Euclidean hierarchical k=3 — weighted % by HFRS label
|
cluster
|
stressed
|
coping
|
comfortable
|
|
1
|
33.9
|
35.2
|
30.9
|
|
2
|
10.9
|
18.2
|
71.0
|
|
3
|
90.2
|
7.7
|
2.2
|
euc_profile_k2 <- weighted_profile_table(df_euc, "euc_hc_k2", EUC_VARS)
euc_profile_k3 <- weighted_profile_table(df_euc, "euc_hc_k3", EUC_VARS)
euc_profile_k2 %>%
mutate(wt_n = comma(round(wt_n))) %>%
kable(digits = 2, caption = "Euclidean hierarchical k=2 — weighted cluster profiles") %>%
kable_styling(full_width = FALSE)
Euclidean hierarchical k=2 — weighted cluster profiles
|
cluster
|
n
|
wt_n
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN_z
|
PNBEARG_z
|
HFRS_mean
|
pct_stressed
|
pct_coping
|
pct_comfortable
|
|
1
|
22658
|
28,716,870
|
-0.16
|
0.04
|
0.02
|
-0.01
|
0.03
|
0.00
|
0.03
|
0.06
|
47.87
|
36.49
|
33.90
|
29.60
|
|
2
|
1702
|
1,763,907
|
3.09
|
-0.68
|
-0.18
|
0.43
|
-0.47
|
0.42
|
-0.33
|
-0.69
|
59.61
|
10.86
|
18.19
|
70.96
|
euc_profile_k3 %>%
mutate(wt_n = comma(round(wt_n))) %>%
kable(digits = 2, caption = "Euclidean hierarchical k=3 — weighted cluster profiles") %>%
kable_styling(full_width = FALSE)
Euclidean hierarchical k=3 — weighted cluster profiles
|
cluster
|
n
|
wt_n
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN_z
|
PNBEARG_z
|
HFRS_mean
|
pct_stressed
|
pct_coping
|
pct_comfortable
|
|
1
|
21953
|
27,380,949
|
-0.15
|
0.08
|
0.04
|
0.01
|
-0.04
|
0.20
|
0.02
|
0.06
|
48.74
|
33.88
|
35.18
|
30.94
|
|
2
|
1702
|
1,763,907
|
3.09
|
-0.68
|
-0.18
|
0.43
|
-0.47
|
0.42
|
-0.33
|
-0.69
|
59.61
|
10.86
|
18.19
|
70.96
|
|
3
|
705
|
1,335,921
|
-0.44
|
-0.66
|
-0.45
|
-0.35
|
1.39
|
-4.18
|
0.20
|
-0.08
|
29.88
|
90.15
|
7.66
|
2.19
|
euc_rep_k2 <- representative_cases(
df = df_euc,
cluster_col = "euc_hc_k2",
vars_for_distance = EUC_VARS,
cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", EUC_VARS)
)
euc_rep_k3 <- representative_cases(
df = df_euc,
cluster_col = "euc_hc_k3",
vars_for_distance = EUC_VARS,
cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", EUC_VARS)
)
euc_rep_k2 %>%
kable(digits = 3, caption = "Euclidean hierarchical k=2 — representative cases") %>%
kable_styling(full_width = FALSE)
Euclidean hierarchical k=2 — representative cases
|
euc_hc_k2
|
PEFAMID
|
HFRS_100
|
HFRS_label
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN_z
|
PNBEARG_z
|
distance_to_centroid
|
|
1
|
5436
|
50.047
|
coping
|
-0.030
|
0.265
|
0.273
|
-0.036
|
-0.105
|
0.094
|
-0.271
|
-0.277
|
0.520
|
|
2
|
10302
|
58.328
|
comfortable
|
2.946
|
-0.674
|
-0.433
|
0.054
|
-0.636
|
0.493
|
-0.489
|
-0.343
|
0.496
|
euc_rep_k3 %>%
kable(digits = 3, caption = "Euclidean hierarchical k=3 — representative cases") %>%
kable_styling(full_width = FALSE)
Euclidean hierarchical k=3 — representative cases
|
euc_hc_k3
|
PEFAMID
|
HFRS_100
|
HFRS_label
|
BBDI_sl_w_z
|
SHUSH_sl_w_z
|
MDI_sl_w_z
|
FINAS_sl_w_z
|
ODDI_sl_w_z
|
PWNETWPT_sl_w_z
|
PATTSTIN_z
|
PNBEARG_z
|
distance_to_centroid
|
|
1
|
5436
|
50.047
|
coping
|
-0.030
|
0.265
|
0.273
|
-0.036
|
-0.105
|
0.094
|
-0.271
|
-0.277
|
0.538
|
|
2
|
10302
|
58.328
|
comfortable
|
2.946
|
-0.674
|
-0.433
|
0.054
|
-0.636
|
0.493
|
-0.489
|
-0.343
|
0.496
|
|
3
|
4967
|
30.793
|
stressed
|
-0.587
|
-0.836
|
-0.674
|
-0.725
|
0.930
|
-4.402
|
-0.271
|
-0.277
|
0.829
|
Save
saveRDS(df_gower, here("02_clustering", "outputs", "data", "hierarchical_full_gower_adf_clustered.rds"))
saveRDS(df_euc, here("02_clustering", "outputs", "data", "hierarchical_full_euclidean_adf_clustered.rds"))
cat("Saved full-sample hierarchical outputs.\n")
## Saved full-sample hierarchical outputs.
Session Info
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.utf8
## [2] LC_CTYPE=Chinese (Simplified)_China.utf8
## [3] LC_MONETARY=Chinese (Simplified)_China.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=Chinese (Simplified)_China.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.4.0 mclust_6.1.2 kableExtra_1.4.0 cluster_2.1.8.2
## [5] lubridate_1.9.5 forcats_1.0.1 stringr_1.6.0 dplyr_1.2.1
## [9] purrr_1.2.2 readr_2.2.0 tidyr_1.3.2 tibble_3.3.1
## [13] ggplot2_4.0.3 tidyverse_2.0.0 here_1.0.2
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 xml2_1.5.2 stringi_1.8.7
## [5] hms_1.1.4 digest_0.6.39 magrittr_2.0.5 evaluate_1.0.5
## [9] grid_4.4.1 timechange_0.4.0 RColorBrewer_1.1-3 fastmap_1.2.0
## [13] rprojroot_2.1.1 jsonlite_2.0.0 viridisLite_0.4.3 textshaping_1.0.5
## [17] jquerylib_0.1.4 cli_3.6.6 crayon_1.5.3 rlang_1.2.0
## [21] bit64_4.8.0 withr_3.0.2 cachem_1.1.0 yaml_2.3.12
## [25] otel_0.2.0 parallel_4.4.1 tools_4.4.1 tzdb_0.5.0
## [29] vctrs_0.7.3 R6_2.6.1 lifecycle_1.0.5 bit_4.6.0
## [33] vroom_1.7.1 pkgconfig_2.0.3 pillar_1.11.1 bslib_0.10.0
## [37] gtable_0.3.6 glue_1.8.1 systemfonts_1.3.2 xfun_0.57
## [41] tidyselect_1.2.1 rstudioapi_0.18.0 knitr_1.51 farver_2.1.2
## [45] htmltools_0.5.9 rmarkdown_2.31 svglite_2.2.2 compiler_4.4.1
## [49] S7_0.2.2