Overview

This notebook runs full-sample hierarchical clustering on the HFRS data in two ways:

  1. Gower hierarchical
    • six transformed continuous indicators
    • PATTSTIN as raw ordered scale
    • PNBEARG as raw count / ordered variable
  2. Euclidean hierarchical
    • the original 8 transformed indicators

For each method, the notebook:

  • fits hierarchical clustering on the full sample
  • cuts the tree at k = 2 and k = 3
  • compares the cluster labels with HFRS_label
  • produces weighted cluster-profile tables
  • produces a representative case table for interpretation

Note: hierarchical clustering itself does not produce medoids. The “representative case” tables below use the observation nearest to each cluster centroid as a medoid-style interpretive summary.

0 Libraries & Setup

library(here)
library(tidyverse)
library(cluster)
library(kableExtra)
library(mclust)
library(scales)

set.seed(2024)

data_path <- here( "adf.csv")
out_data_dir <- here("02_clustering", "outputs", "data")
dir.create(out_data_dir, recursive = TRUE, showWarnings = FALSE)

EUC_VARS <- c(
  "BBDI_sl_w_z", "SHUSH_sl_w_z", "MDI_sl_w_z", "FINAS_sl_w_z",
  "ODDI_sl_w_z", "PWNETWPT_sl_w_z", "PATTSTIN_z", "PNBEARG_z"
)

GOWER_NUM_VARS <- c(
  "BBDI_sl_w_z", "SHUSH_sl_w_z", "MDI_sl_w_z",
  "FINAS_sl_w_z", "ODDI_sl_w_z", "PWNETWPT_sl_w_z"
)

GOWER_ORD_VARS <- c("PATTSTIN", "PNBEARG")
GOWER_ALL_VARS <- c(GOWER_NUM_VARS, GOWER_ORD_VARS)

wmean <- function(x, w) weighted.mean(x, w, na.rm = TRUE)

weighted_profile_table <- function(df, cluster_col, profile_vars) {
  df %>%
    group_by(cluster = .data[[cluster_col]]) %>%
    summarise(
      n = n(),
      wt_n = sum(PWEIGHT, na.rm = TRUE),
      across(all_of(profile_vars), ~ wmean(.x, PWEIGHT)),
      HFRS_mean = wmean(HFRS_100, PWEIGHT),
      pct_stressed = wmean(HFRS_label == "stressed", PWEIGHT) * 100,
      pct_coping = wmean(HFRS_label == "coping", PWEIGHT) * 100,
      pct_comfortable = wmean(HFRS_label == "comfortable", PWEIGHT) * 100,
      .groups = "drop"
    )
}

weighted_crosstab <- function(df, cluster_col) {
  df %>%
    dplyr::count(.data[[cluster_col]], HFRS_label, wt = PWEIGHT, name = "wt_n") %>%
    group_by(.data[[cluster_col]]) %>%
    mutate(pct = 100 * wt_n / sum(wt_n)) %>%
    ungroup() %>%
    select(all_of(c(cluster_col, "HFRS_label", "pct"))) %>%
    rename(cluster = all_of(cluster_col)) %>%
    pivot_wider(names_from = HFRS_label, values_from = pct, values_fill = 0)
}

ari_summary <- function(df, cluster_cols) {
  label_int <- as.integer(factor(
    df$HFRS_label,
    levels = c("stressed", "coping", "comfortable")
  ))

  tibble(
    clustering = cluster_cols,
    k = c(2, 3),
    ARI = map_dbl(cluster_cols, ~ adjustedRandIndex(df[[.x]], label_int))
  )
}

representative_cases <- function(df, cluster_col, vars_for_distance, cols_to_show) {
  cluster_ids <- sort(unique(df[[cluster_col]]))

  map_dfr(cluster_ids, function(cl) {
    sub <- df %>% filter(.data[[cluster_col]] == cl)
    centroid <- colMeans(sub[, vars_for_distance, drop = FALSE])
    dists <- apply(sub[, vars_for_distance, drop = FALSE], 1, function(row) {
      sqrt(sum((row - centroid)^2))
    })
    best_idx <- which.min(dists)
    out <- sub[best_idx, cols_to_show, drop = FALSE]
    out[[cluster_col]] <- cl
    out$distance_to_centroid <- dists[[best_idx]]
    out
  }) %>%
    relocate(all_of(cluster_col), .before = 1)
}

1 Load Data

df_raw <- readr::read_csv(data_path, show_col_types = FALSE) %>%
  mutate(
    HFRS_label = factor(HFRS_label, levels = c("stressed", "coping", "comfortable")),
    PATTSTIN = ordered(PATTSTIN),
    PNBEARG = ordered(PNBEARG)
  )

required_vars <- c(
  "PEFAMID", "PWEIGHT", "HFRS_100", "HFRS_label",
  EUC_VARS, GOWER_ALL_VARS
)

missing_vars <- setdiff(required_vars, names(df_raw))
if (length(missing_vars) > 0) {
  stop("Missing required variables: ", paste(missing_vars, collapse = ", "))
}

df_euc <- df_raw %>%
  filter(
    !is.na(HFRS_100), !is.na(HFRS_label), !is.na(PWEIGHT),
    if_all(all_of(EUC_VARS), ~ !is.na(.x))
  )

df_gower <- df_raw %>%
  filter(
    !is.na(HFRS_100), !is.na(HFRS_label), !is.na(PWEIGHT),
    if_all(all_of(GOWER_ALL_VARS), ~ !is.na(.x))
  ) %>%
  mutate(
    PATTSTIN = ordered(PATTSTIN),
    PNBEARG = ordered(PNBEARG)
  )

cat("Euclidean sample rows:", nrow(df_euc), "\n")
## Euclidean sample rows: 24360
cat("Gower sample rows:", nrow(df_gower), "\n")
## Gower sample rows: 24360

2.1 Gower Hierarchical (Full Sample)

X_gower <- df_gower %>%
  transmute(
    across(all_of(GOWER_NUM_VARS), as.numeric),
    PATTSTIN = ordered(PATTSTIN),
    PNBEARG = ordered(PNBEARG)
  )

gower_dist <- daisy(X_gower, metric = "gower")
hc_gower <- hclust(gower_dist, method = "ward.D2")

df_gower <- df_gower %>%
  mutate(
    gower_hc_k2 = cutree(hc_gower, k = 2),
    gower_hc_k3 = cutree(hc_gower, k = 3)
  )
gower_ari <- ari_summary(df_gower, c("gower_hc_k2", "gower_hc_k3"))
gower_ari %>%
  mutate(ARI = round(ARI, 4)) %>%
  kable(caption = "Gower hierarchical — ARI vs HFRS label") %>%
  kable_styling(full_width = FALSE)
Gower hierarchical — ARI vs HFRS label
clustering k ARI
gower_hc_k2 2 0.0981
gower_hc_k3 3 0.0683
weighted_crosstab(df_gower, "gower_hc_k2") %>%
  kable(digits = 1, caption = "Gower hierarchical k=2 — weighted % by HFRS label") %>%
  kable_styling(full_width = FALSE)
Gower hierarchical k=2 — weighted % by HFRS label
cluster stressed coping comfortable
1 23.7 34.5 41.8
2 56.6 30.0 13.4
weighted_crosstab(df_gower, "gower_hc_k3") %>%
  kable(digits = 1, caption = "Gower hierarchical k=3 — weighted % by HFRS label") %>%
  kable_styling(full_width = FALSE)
Gower hierarchical k=3 — weighted % by HFRS label
cluster stressed coping comfortable
1 27.4 36.0 36.7
2 6.2 27.9 65.9
3 56.6 30.0 13.4
gower_profile_k2 <- weighted_profile_table(
  df_gower,
  "gower_hc_k2",
  c(GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)

gower_profile_k3 <- weighted_profile_table(
  df_gower,
  "gower_hc_k3",
  c(GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)

gower_profile_k2 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Gower hierarchical k=2 — weighted cluster profiles") %>%
  kable_styling(full_width = FALSE)
Gower hierarchical k=2 — weighted cluster profiles
cluster n wt_n BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN PNBEARG HFRS_mean pct_stressed pct_coping pct_comfortable
1 17136 19,988,308 0.08 0.01 -0.07 -0.01 0.04 -0.05 NA NA 51.80 23.68 34.55 41.77
2 7224 10,492,470 -0.08 -0.01 0.15 0.06 -0.08 0.17 NA NA 42.34 56.60 30.04 13.36
gower_profile_k3 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Gower hierarchical k=3 — weighted cluster profiles") %>%
  kable_styling(full_width = FALSE)
Gower hierarchical k=3 — weighted cluster profiles
cluster n wt_n BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN PNBEARG HFRS_mean pct_stressed pct_coping pct_comfortable
1 13589 16,482,621 0.08 -0.03 -0.08 0.00 0.05 -0.13 NA NA 50.25 27.4 35.95 36.65
2 3547 3,505,687 0.08 0.19 0.02 -0.06 -0.01 0.32 NA NA 59.11 6.2 27.94 65.87
3 7224 10,492,470 -0.08 -0.01 0.15 0.06 -0.08 0.17 NA NA 42.34 56.6 30.04 13.36
gower_rep_k2 <- representative_cases(
  df = df_gower %>%
    mutate(
      PATTSTIN_num = as.numeric(PATTSTIN),
      PNBEARG_num = as.numeric(PNBEARG)
    ),
  cluster_col = "gower_hc_k2",
  vars_for_distance = c(GOWER_NUM_VARS, "PATTSTIN_num", "PNBEARG_num"),
  cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)

gower_rep_k3 <- representative_cases(
  df = df_gower %>%
    mutate(
      PATTSTIN_num = as.numeric(PATTSTIN),
      PNBEARG_num = as.numeric(PNBEARG)
    ),
  cluster_col = "gower_hc_k3",
  vars_for_distance = c(GOWER_NUM_VARS, "PATTSTIN_num", "PNBEARG_num"),
  cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", GOWER_NUM_VARS, "PATTSTIN", "PNBEARG")
)

gower_rep_k2 %>%
  kable(digits = 3, caption = "Gower hierarchical k=2 — representative cases") %>%
  kable_styling(full_width = FALSE)
Gower hierarchical k=2 — representative cases
gower_hc_k2 PEFAMID HFRS_100 HFRS_label BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN PNBEARG distance_to_centroid
1 11 48.843 coping 0.032 -0.063 -0.067 -0.373 -0.061 0.251 0 1 0.529
2 1383 42.048 stressed -0.093 0.331 0.267 0.085 -0.332 0.344 1 2 0.596
gower_rep_k3 %>%
  kable(digits = 3, caption = "Gower hierarchical k=3 — representative cases") %>%
  kable_styling(full_width = FALSE)
Gower hierarchical k=3 — representative cases
gower_hc_k3 PEFAMID HFRS_100 HFRS_label BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN PNBEARG distance_to_centroid
1 11 48.843 coping 0.032 -0.063 -0.067 -0.373 -0.061 0.251 0 1 0.483
2 1707 60.318 comfortable -0.166 0.483 0.160 -0.260 -0.177 0.323 -1 1 0.694
3 1383 42.048 stressed -0.093 0.331 0.267 0.085 -0.332 0.344 1 2 0.596

2.2 Euclidean Hierarchical (Full Sample)

X_euc <- df_euc %>%
  select(all_of(EUC_VARS)) %>%
  as.matrix()

euc_dist <- dist(X_euc)
hc_euc <- hclust(euc_dist, method = "ward.D2")

df_euc <- df_euc %>%
  mutate(
    euc_hc_k2 = cutree(hc_euc, k = 2),
    euc_hc_k3 = cutree(hc_euc, k = 3)
  )
euc_ari <- ari_summary(df_euc, c("euc_hc_k2", "euc_hc_k3"))
euc_ari %>%
  mutate(ARI = round(ARI, 4)) %>%
  kable(caption = "Euclidean hierarchical — ARI vs HFRS label") %>%
  kable_styling(full_width = FALSE)
Euclidean hierarchical — ARI vs HFRS label
clustering k ARI
euc_hc_k2 2 -0.0047
euc_hc_k3 3 0.0046
weighted_crosstab(df_euc, "euc_hc_k2") %>%
  kable(digits = 1, caption = "Euclidean hierarchical k=2 — weighted % by HFRS label") %>%
  kable_styling(full_width = FALSE)
Euclidean hierarchical k=2 — weighted % by HFRS label
cluster stressed coping comfortable
1 36.5 33.9 29.6
2 10.9 18.2 71.0
weighted_crosstab(df_euc, "euc_hc_k3") %>%
  kable(digits = 1, caption = "Euclidean hierarchical k=3 — weighted % by HFRS label") %>%
  kable_styling(full_width = FALSE)
Euclidean hierarchical k=3 — weighted % by HFRS label
cluster stressed coping comfortable
1 33.9 35.2 30.9
2 10.9 18.2 71.0
3 90.2 7.7 2.2
euc_profile_k2 <- weighted_profile_table(df_euc, "euc_hc_k2", EUC_VARS)
euc_profile_k3 <- weighted_profile_table(df_euc, "euc_hc_k3", EUC_VARS)

euc_profile_k2 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Euclidean hierarchical k=2 — weighted cluster profiles") %>%
  kable_styling(full_width = FALSE)
Euclidean hierarchical k=2 — weighted cluster profiles
cluster n wt_n BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN_z PNBEARG_z HFRS_mean pct_stressed pct_coping pct_comfortable
1 22658 28,716,870 -0.16 0.04 0.02 -0.01 0.03 0.00 0.03 0.06 47.87 36.49 33.90 29.60
2 1702 1,763,907 3.09 -0.68 -0.18 0.43 -0.47 0.42 -0.33 -0.69 59.61 10.86 18.19 70.96
euc_profile_k3 %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Euclidean hierarchical k=3 — weighted cluster profiles") %>%
  kable_styling(full_width = FALSE)
Euclidean hierarchical k=3 — weighted cluster profiles
cluster n wt_n BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN_z PNBEARG_z HFRS_mean pct_stressed pct_coping pct_comfortable
1 21953 27,380,949 -0.15 0.08 0.04 0.01 -0.04 0.20 0.02 0.06 48.74 33.88 35.18 30.94
2 1702 1,763,907 3.09 -0.68 -0.18 0.43 -0.47 0.42 -0.33 -0.69 59.61 10.86 18.19 70.96
3 705 1,335,921 -0.44 -0.66 -0.45 -0.35 1.39 -4.18 0.20 -0.08 29.88 90.15 7.66 2.19
euc_rep_k2 <- representative_cases(
  df = df_euc,
  cluster_col = "euc_hc_k2",
  vars_for_distance = EUC_VARS,
  cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", EUC_VARS)
)

euc_rep_k3 <- representative_cases(
  df = df_euc,
  cluster_col = "euc_hc_k3",
  vars_for_distance = EUC_VARS,
  cols_to_show = c("PEFAMID", "HFRS_100", "HFRS_label", EUC_VARS)
)

euc_rep_k2 %>%
  kable(digits = 3, caption = "Euclidean hierarchical k=2 — representative cases") %>%
  kable_styling(full_width = FALSE)
Euclidean hierarchical k=2 — representative cases
euc_hc_k2 PEFAMID HFRS_100 HFRS_label BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN_z PNBEARG_z distance_to_centroid
1 5436 50.047 coping -0.030 0.265 0.273 -0.036 -0.105 0.094 -0.271 -0.277 0.520
2 10302 58.328 comfortable 2.946 -0.674 -0.433 0.054 -0.636 0.493 -0.489 -0.343 0.496
euc_rep_k3 %>%
  kable(digits = 3, caption = "Euclidean hierarchical k=3 — representative cases") %>%
  kable_styling(full_width = FALSE)
Euclidean hierarchical k=3 — representative cases
euc_hc_k3 PEFAMID HFRS_100 HFRS_label BBDI_sl_w_z SHUSH_sl_w_z MDI_sl_w_z FINAS_sl_w_z ODDI_sl_w_z PWNETWPT_sl_w_z PATTSTIN_z PNBEARG_z distance_to_centroid
1 5436 50.047 coping -0.030 0.265 0.273 -0.036 -0.105 0.094 -0.271 -0.277 0.538
2 10302 58.328 comfortable 2.946 -0.674 -0.433 0.054 -0.636 0.493 -0.489 -0.343 0.496
3 4967 30.793 stressed -0.587 -0.836 -0.674 -0.725 0.930 -4.402 -0.271 -0.277 0.829

Save

saveRDS(df_gower, here("02_clustering", "outputs", "data", "hierarchical_full_gower_adf_clustered.rds"))
saveRDS(df_euc, here("02_clustering", "outputs", "data", "hierarchical_full_euclidean_adf_clustered.rds"))
cat("Saved full-sample hierarchical outputs.\n")
## Saved full-sample hierarchical outputs.

Session Info

sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.utf8 
## [2] LC_CTYPE=Chinese (Simplified)_China.utf8   
## [3] LC_MONETARY=Chinese (Simplified)_China.utf8
## [4] LC_NUMERIC=C                               
## [5] LC_TIME=Chinese (Simplified)_China.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.4.0     mclust_6.1.2     kableExtra_1.4.0 cluster_2.1.8.2 
##  [5] lubridate_1.9.5  forcats_1.0.1    stringr_1.6.0    dplyr_1.2.1     
##  [9] purrr_1.2.2      readr_2.2.0      tidyr_1.3.2      tibble_3.3.1    
## [13] ggplot2_4.0.3    tidyverse_2.0.0  here_1.0.2      
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.5.2         stringi_1.8.7     
##  [5] hms_1.1.4          digest_0.6.39      magrittr_2.0.5     evaluate_1.0.5    
##  [9] grid_4.4.1         timechange_0.4.0   RColorBrewer_1.1-3 fastmap_1.2.0     
## [13] rprojroot_2.1.1    jsonlite_2.0.0     viridisLite_0.4.3  textshaping_1.0.5 
## [17] jquerylib_0.1.4    cli_3.6.6          crayon_1.5.3       rlang_1.2.0       
## [21] bit64_4.8.0        withr_3.0.2        cachem_1.1.0       yaml_2.3.12       
## [25] otel_0.2.0         parallel_4.4.1     tools_4.4.1        tzdb_0.5.0        
## [29] vctrs_0.7.3        R6_2.6.1           lifecycle_1.0.5    bit_4.6.0         
## [33] vroom_1.7.1        pkgconfig_2.0.3    pillar_1.11.1      bslib_0.10.0      
## [37] gtable_0.3.6       glue_1.8.1         systemfonts_1.3.2  xfun_0.57         
## [41] tidyselect_1.2.1   rstudioapi_0.18.0  knitr_1.51         farver_2.1.2      
## [45] htmltools_0.5.9    rmarkdown_2.31     svglite_2.2.2      compiler_4.4.1    
## [49] S7_0.2.2