Overview

This notebook runs full-sample hierarchical clustering on the HFRS data in two ways:

Gower hierarchical
- six transformed continuous indicators
- PATTSTIN as raw ordered scale
- PNBEARG as raw count / ordered variable
Euclidean hierarchical
- the original 8 transformed indicators

For each method, the notebook:

fits hierarchical clustering on the full sample
cuts the tree at k = 2 and k = 3
compares the cluster labels with HFRS_label
produces weighted cluster-profile tables
produces a representative case table for interpretation

Note: hierarchical clustering itself does not produce medoids. The “representative case” tables below use the observation nearest to each cluster centroid as a medoid-style interpretive summary.

0 Libraries & Setup

library(here)
library(tidyverse)
library(cluster)
library(kableExtra)
library(mclust)
library(scales)

set.seed(2024)

data_path <- here( "adf.csv")
out_data_dir <- here("02_clustering", "outputs", "data")
dir.create(out_data_dir, recursive = TRUE, showWarnings = FALSE)

EUC_VARS <- c(
  "BBDI_sl_w_z", "SHUSH_sl_w_z", "MDI_sl_w_z", "FINAS_sl_w_z",
  "ODDI_sl_w_z", "PWNETWPT_sl_w_z", "PATTSTIN_z", "PNBEARG_z"
)

GOWER_NUM_VARS <- c(
  "BBDI_sl_w_z", "SHUSH_sl_w_z", "MDI_sl_w_z",
  "FINAS_sl_w_z", "ODDI_sl_w_z", "PWNETWPT_sl_w_z"
)

GOWER_ORD_VARS <- c("PATTSTIN", "PNBEARG")
GOWER_ALL_VARS <- c(GOWER_NUM_VARS, GOWER_ORD_VARS)

wmean <- function(x, w) weighted.mean(x, w, na.rm = TRUE)

weighted_profile_table <- function(df, cluster_col, profile_vars) {
  df %>%
    group_by(cluster = .data[[cluster_col]]) %>%
    summarise(
      n = n(),
      wt_n = sum(PWEIGHT, na.rm = TRUE),
      across(all_of(profile_vars), ~ wmean(.x, PWEIGHT)),
      HFRS_mean = wmean(HFRS_100, PWEIGHT),
      pct_stressed = wmean(HFRS_label == "stressed", PWEIGHT) * 100,
      pct_coping = wmean(HFRS_label == "coping", PWEIGHT) * 100,
      pct_comfortable = wmean(HFRS_label == "comfortable", PWEIGHT) * 100,
      .groups = "drop"
    )
}

weighted_crosstab <- function(df, cluster_col) {
  df %>%
    dplyr::count(.data[[cluster_col]], HFRS_label, wt = PWEIGHT, name = "wt_n") %>%
    group_by(.data[[cluster_col]]) %>%
    mutate(pct = 100 * wt_n / sum(wt_n)) %>%
    ungroup() %>%
    select(all_of(c(cluster_col, "HFRS_label", "pct"))) %>%
    rename(cluster = all_of(cluster_col)) %>%
    pivot_wider(names_from = HFRS_label, values_from = pct, values_fill = 0)
}

ari_summary <- function(df, cluster_cols) {
  label_int <- as.integer(factor(
    df$HFRS_label,
    levels = c("stressed", "coping", "comfortable")
  ))

  tibble(
    clustering = cluster_cols,
    k = c(2, 3),
    ARI = map_dbl(cluster_cols, ~ adjustedRandIndex(df[[.x]], label_int))
  )
}

representative_cases <- function(df, cluster_col, vars_for_distance, cols_to_show) {
  cluster_ids <- sort(unique(df[[cluster_col]]))

  map_dfr(cluster_ids, function(cl) {
    sub <- df %>% filter(.data[[cluster_col]] == cl)
    centroid <- colMeans(sub[, vars_for_distance, drop = FALSE])
    dists <- apply(sub[, vars_for_distance, drop = FALSE], 1, function(row) {
      sqrt(sum((row - centroid)^2))
    })
    best_idx <- which.min(dists)
    out <- sub[best_idx, cols_to_show, drop = FALSE]
    out[[cluster_col]] <- cl
    out$distance_to_centroid <- dists[[best_idx]]
    out
  }) %>%
    relocate(all_of(cluster_col), .before = 1)
}

1 Load Data

df_raw <- readr::read_csv(data_path, show_col_types = FALSE) %>%
  mutate(
    HFRS_label = factor(HFRS_label, levels = c("stressed", "coping", "comfortable")),
    PATTSTIN = ordered(PATTSTIN),
    PNBEARG = ordered(PNBEARG)
  )

required_vars <- c(
  "PEFAMID", "PWEIGHT", "HFRS_100", "HFRS_label",
  EUC_VARS, GOWER_ALL_VARS
)

missing_vars <- setdiff(required_vars, names(df_raw))
if (length(missing_vars) > 0) {
  stop("Missing required variables: ", paste(missing_vars, collapse = ", "))
}

df_euc <- df_raw %>%
  filter(
    !is.na(HFRS_100), !is.na(HFRS_label), !is.na(PWEIGHT),
    if_all(all_of(EUC_VARS), ~ !is.na(.x))
  )

df_gower <- df_raw %>%
  filter(
    !is.na(HFRS_100), !is.na(HFRS_label), !is.na(PWEIGHT),
    if_all(all_of(GOWER_ALL_VARS), ~ !is.na(.x))
  ) %>%
  mutate(
    PATTSTIN = ordered(PATTSTIN),
    PNBEARG = ordered(PNBEARG)
  )

cat("Euclidean sample rows:", nrow(df_euc), "\n")

## Euclidean sample rows: 24360

cat("Gower sample rows:", nrow(df_gower), "\n")

## Gower sample rows: 24360

2.1 Gower Hierarchical (Full Sample)

X_gower <- df_gower %>%
  transmute(
    across(all_of(GOWER_NUM_VARS), as.numeric),
    PATTSTIN = ordered(PATTSTIN),
    PNBEARG = ordered(PNBEARG)
  )

gower_dist <- daisy(X_gower, metric = "gower")
hc_gower <- hclust(gower_dist, method = "ward.D2")

df_gower <- df_gower %>%
  mutate(
    gower_hc_k2 = cutree(hc_gower, k = 2),
    gower_hc_k3 = cutree(hc_gower, k = 3)
  )

gower_ari <- ari_summary(df_gower, c("gower_hc_k2", "gower_hc_k3"))
gower_ari %>%
  mutate(ARI = round(ARI, 4)) %>%
  kable(caption = "Gower hierarchical — ARI vs HFRS label") %>%
  kable_styling(full_width = FALSE)

Gower hierarchical — ARI vs HFRS label
clustering	k	ARI
gower_hc_k2	2	0.0981
gower_hc_k3	3	0.0683

weighted_crosstab(df_gower, "gower_hc_k2") %>%
  kable(digits = 1, caption = "Gower hierarchical k=2 — weighted % by HFRS label") %>%
  kable_styling(full_width = FALSE)

Gower hierarchical k=2 — weighted % by HFRS label
cluster	stressed	coping	comfortable
1	23.7	34.5	41.8
2	56.6	30.0	13.4

weighted_crosstab(df_gower, "gower_hc_k3") %>%
  kable(digits = 1, caption = "Gower hierarchical k=3 — weighted % by HFRS label") %>%
  kable_styling(full_width = FALSE)

Gower hierarchical k=3 — weighted % by HFRS label
cluster	stressed	coping	comfortable
1	27.4	36.0	36.7
2	6.2	27.9	65.9
3	56.6	30.0	13.4

gower_profile_k2 <- weighted_profile_table(
  df_gower,
  "gower_hc_k2",
  c(GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)

gower_profile_k3 <- weighted_profile_table(
  df_gower,
  "gower_hc_k3",
  c(GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)

gower_profile_k2 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Gower hierarchical k=2 — weighted cluster profiles") %>%
  kable_styling(full_width = FALSE)

Gower hierarchical k=2 — weighted cluster profiles
cluster	n	wt_n	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN	PNBEARG	HFRS_mean	pct_stressed	pct_coping	pct_comfortable
1	17136	19,988,308	0.08	0.01	-0.07	-0.01	0.04	-0.05	NA	NA	51.80	23.68	34.55	41.77
2	7224	10,492,470	-0.08	-0.01	0.15	0.06	-0.08	0.17	NA	NA	42.34	56.60	30.04	13.36

gower_profile_k3 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Gower hierarchical k=3 — weighted cluster profiles") %>%
  kable_styling(full_width = FALSE)

Gower hierarchical k=3 — weighted cluster profiles
cluster	n	wt_n	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN	PNBEARG	HFRS_mean	pct_stressed	pct_coping	pct_comfortable
1	13589	16,482,621	0.08	-0.03	-0.08	0.00	0.05	-0.13	NA	NA	50.25	27.4	35.95	36.65
2	3547	3,505,687	0.08	0.19	0.02	-0.06	-0.01	0.32	NA	NA	59.11	6.2	27.94	65.87
3	7224	10,492,470	-0.08	-0.01	0.15	0.06	-0.08	0.17	NA	NA	42.34	56.6	30.04	13.36

gower_rep_k2 <- representative_cases(
  df = df_gower %>%
    mutate(
      PATTSTIN_num = as.numeric(PATTSTIN),
      PNBEARG_num = as.numeric(PNBEARG)
    ),
  cluster_col = "gower_hc_k2",
  vars_for_distance = c(GOWER_NUM_VARS, "PATTSTIN_num", "PNBEARG_num"),
  cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)

gower_rep_k3 <- representative_cases(
  df = df_gower %>%
    mutate(
      PATTSTIN_num = as.numeric(PATTSTIN),
      PNBEARG_num = as.numeric(PNBEARG)
    ),
  cluster_col = "gower_hc_k3",
  vars_for_distance = c(GOWER_NUM_VARS, "PATTSTIN_num", "PNBEARG_num"),
  cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)

gower_rep_k2 %>%
  kable(digits = 3, caption = "Gower hierarchical k=2 — representative cases") %>%
  kable_styling(full_width = FALSE)

Gower hierarchical k=2 — representative cases
gower_hc_k2	PEFAMID	HFRS_100	HFRS_label	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN	PNBEARG	distance_to_centroid
1	11	48.843	coping	0.032	-0.063	-0.067	-0.373	-0.061	0.251	0	1	0.529
2	1383	42.048	stressed	-0.093	0.331	0.267	0.085	-0.332	0.344	1	2	0.596

gower_rep_k3 %>%
  kable(digits = 3, caption = "Gower hierarchical k=3 — representative cases") %>%
  kable_styling(full_width = FALSE)

Gower hierarchical k=3 — representative cases
gower_hc_k3	PEFAMID	HFRS_100	HFRS_label	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN	PNBEARG	distance_to_centroid
1	11	48.843	coping	0.032	-0.063	-0.067	-0.373	-0.061	0.251	0	1	0.483
2	1707	60.318	comfortable	-0.166	0.483	0.160	-0.260	-0.177	0.323	-1	1	0.694
3	1383	42.048	stressed	-0.093	0.331	0.267	0.085	-0.332	0.344	1	2	0.596

2.2 Euclidean Hierarchical (Full Sample)

X_euc <- df_euc %>%
  select(all_of(EUC_VARS)) %>%
  as.matrix()

euc_dist <- dist(X_euc)
hc_euc <- hclust(euc_dist, method = "ward.D2")

df_euc <- df_euc %>%
  mutate(
    euc_hc_k2 = cutree(hc_euc, k = 2),
    euc_hc_k3 = cutree(hc_euc, k = 3)
  )

euc_ari <- ari_summary(df_euc, c("euc_hc_k2", "euc_hc_k3"))
euc_ari %>%
  mutate(ARI = round(ARI, 4)) %>%
  kable(caption = "Euclidean hierarchical — ARI vs HFRS label") %>%
  kable_styling(full_width = FALSE)

Euclidean hierarchical — ARI vs HFRS label
clustering	k	ARI
euc_hc_k2	2	-0.0047
euc_hc_k3	3	0.0046

weighted_crosstab(df_euc, "euc_hc_k2") %>%
  kable(digits = 1, caption = "Euclidean hierarchical k=2 — weighted % by HFRS label") %>%
  kable_styling(full_width = FALSE)

Euclidean hierarchical k=2 — weighted % by HFRS label
cluster	stressed	coping	comfortable
1	36.5	33.9	29.6
2	10.9	18.2	71.0

weighted_crosstab(df_euc, "euc_hc_k3") %>%
  kable(digits = 1, caption = "Euclidean hierarchical k=3 — weighted % by HFRS label") %>%
  kable_styling(full_width = FALSE)

Euclidean hierarchical k=3 — weighted % by HFRS label
cluster	stressed	coping	comfortable
1	33.9	35.2	30.9
2	10.9	18.2	71.0
3	90.2	7.7	2.2

euc_profile_k2 <- weighted_profile_table(df_euc, "euc_hc_k2", EUC_VARS)
euc_profile_k3 <- weighted_profile_table(df_euc, "euc_hc_k3", EUC_VARS)

euc_profile_k2 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Euclidean hierarchical k=2 — weighted cluster profiles") %>%
  kable_styling(full_width = FALSE)

Euclidean hierarchical k=2 — weighted cluster profiles
cluster	n	wt_n	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN_z	PNBEARG_z	HFRS_mean	pct_stressed	pct_coping	pct_comfortable
1	22658	28,716,870	-0.16	0.04	0.02	-0.01	0.03	0.00	0.03	0.06	47.87	36.49	33.90	29.60
2	1702	1,763,907	3.09	-0.68	-0.18	0.43	-0.47	0.42	-0.33	-0.69	59.61	10.86	18.19	70.96

euc_profile_k3 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Euclidean hierarchical k=3 — weighted cluster profiles") %>%
  kable_styling(full_width = FALSE)

Euclidean hierarchical k=3 — weighted cluster profiles
cluster	n	wt_n	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN_z	PNBEARG_z	HFRS_mean	pct_stressed	pct_coping	pct_comfortable
1	21953	27,380,949	-0.15	0.08	0.04	0.01	-0.04	0.20	0.02	0.06	48.74	33.88	35.18	30.94
2	1702	1,763,907	3.09	-0.68	-0.18	0.43	-0.47	0.42	-0.33	-0.69	59.61	10.86	18.19	70.96
3	705	1,335,921	-0.44	-0.66	-0.45	-0.35	1.39	-4.18	0.20	-0.08	29.88	90.15	7.66	2.19

euc_rep_k2 <- representative_cases(
  df = df_euc,
  cluster_col = "euc_hc_k2",
  vars_for_distance = EUC_VARS,
  cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", EUC_VARS)
)

euc_rep_k3 <- representative_cases(
  df = df_euc,
  cluster_col = "euc_hc_k3",
  vars_for_distance = EUC_VARS,
  cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", EUC_VARS)
)

euc_rep_k2 %>%
  kable(digits = 3, caption = "Euclidean hierarchical k=2 — representative cases") %>%
  kable_styling(full_width = FALSE)

Euclidean hierarchical k=2 — representative cases
euc_hc_k2	PEFAMID	HFRS_100	HFRS_label	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN_z	PNBEARG_z	distance_to_centroid
1	5436	50.047	coping	-0.030	0.265	0.273	-0.036	-0.105	0.094	-0.271	-0.277	0.520
2	10302	58.328	comfortable	2.946	-0.674	-0.433	0.054	-0.636	0.493	-0.489	-0.343	0.496

euc_rep_k3 %>%
  kable(digits = 3, caption = "Euclidean hierarchical k=3 — representative cases") %>%
  kable_styling(full_width = FALSE)

Euclidean hierarchical k=3 — representative cases
euc_hc_k3	PEFAMID	HFRS_100	HFRS_label	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN_z	PNBEARG_z	distance_to_centroid
1	5436	50.047	coping	-0.030	0.265	0.273	-0.036	-0.105	0.094	-0.271	-0.277	0.538
2	10302	58.328	comfortable	2.946	-0.674	-0.433	0.054	-0.636	0.493	-0.489	-0.343	0.496
3	4967	30.793	stressed	-0.587	-0.836	-0.674	-0.725	0.930	-4.402	-0.271	-0.277	0.829

Save

saveRDS(df_gower, here("02_clustering", "outputs", "data", "hierarchical_full_gower_adf_clustered.rds"))
saveRDS(df_euc, here("02_clustering", "outputs", "data", "hierarchical_full_euclidean_adf_clustered.rds"))
cat("Saved full-sample hierarchical outputs.\n")

## Saved full-sample hierarchical outputs.

Session Info

sessionInfo()

## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.utf8 
## [2] LC_CTYPE=Chinese (Simplified)_China.utf8   
## [3] LC_MONETARY=Chinese (Simplified)_China.utf8
## [4] LC_NUMERIC=C                               
## [5] LC_TIME=Chinese (Simplified)_China.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.4.0     mclust_6.1.2     kableExtra_1.4.0 cluster_2.1.8.2 
##  [5] lubridate_1.9.5  forcats_1.0.1    stringr_1.6.0    dplyr_1.2.1     
##  [9] purrr_1.2.2      readr_2.2.0      tidyr_1.3.2      tibble_3.3.1    
## [13] ggplot2_4.0.3    tidyverse_2.0.0  here_1.0.2      
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.5.2         stringi_1.8.7     
##  [5] hms_1.1.4          digest_0.6.39      magrittr_2.0.5     evaluate_1.0.5    
##  [9] grid_4.4.1         timechange_0.4.0   RColorBrewer_1.1-3 fastmap_1.2.0     
## [13] rprojroot_2.1.1    jsonlite_2.0.0     viridisLite_0.4.3  textshaping_1.0.5 
## [17] jquerylib_0.1.4    cli_3.6.6          crayon_1.5.3       rlang_1.2.0       
## [21] bit64_4.8.0        withr_3.0.2        cachem_1.1.0       yaml_2.3.12       
## [25] otel_0.2.0         parallel_4.4.1     tools_4.4.1        tzdb_0.5.0        
## [29] vctrs_0.7.3        R6_2.6.1           lifecycle_1.0.5    bit_4.6.0         
## [33] vroom_1.7.1        pkgconfig_2.0.3    pillar_1.11.1      bslib_0.10.0      
## [37] gtable_0.3.6       glue_1.8.1         systemfonts_1.3.2  xfun_0.57         
## [41] tidyselect_1.2.1   rstudioapi_0.18.0  knitr_1.51         farver_2.1.2      
## [45] htmltools_0.5.9    rmarkdown_2.31     svglite_2.2.2      compiler_4.4.1    
## [49] S7_0.2.2

HFRS Clustering — Full-Sample Hierarchical

2026-05-29

Overview

0 Libraries & Setup

1 Load Data

2.1 Gower Hierarchical (Full Sample)

2.2 Euclidean Hierarchical (Full Sample)

Save

Session Info