Overview

This notebook treats decision trees as segmentation tools rather than traditional clustering methods.

That means:

the tree is supervised, not unsupervised
the target variable influences how groups are formed
but the resulting segments are often highly interpretable and useful for policy, financial wellness, or intervention design

This file fits two segmentation trees:

a regression tree predicting HFRS_100
a classification tree predicting HFRS_label

The goal is to recover actionable rules such as:

low savings + high debt -> high-risk segment
stable income + strong assets -> comfortable segment
high mortgage + strong income -> mixed or transitional segment

0 Libraries & Setup

library(here)
library(tidyverse)
library(rpart)
library(kableExtra)
library(scales)

set.seed(2024)

FIG_DIR <- here("02_clustering", "outputs", "figures")
DATA_DIR <- here("02_clustering", "outputs", "data")
dir.create(FIG_DIR, recursive = TRUE, showWarnings = FALSE)
dir.create(DATA_DIR, recursive = TRUE, showWarnings = FALSE)

TREE_VARS <- c(
  "BBDI_sl_w_z",
  "SHUSH_sl_w_z",
  "MDI_sl_w_z",
  "FINAS_sl_w_z",
  "ODDI_sl_w_z",
  "PWNETWPT_sl_w_z",
  "PATTSTIN_z",
  "PNBEARG_z"
)

wmean <- function(x, w) weighted.mean(x, w, na.rm = TRUE)

extract_leaf_profile <- function(df, leaf_col) {
  df %>%
    group_by(segment = .data[[leaf_col]]) %>%
    summarise(
      n = n(),
      wt_n = sum(PWEIGHT, na.rm = TRUE),
      HFRS_mean = wmean(HFRS_100, PWEIGHT),
      pct_stressed = wmean(HFRS_label == "stressed", PWEIGHT) * 100,
      pct_coping = wmean(HFRS_label == "coping", PWEIGHT) * 100,
      pct_comfortable = wmean(HFRS_label == "comfortable", PWEIGHT) * 100,
      across(all_of(TREE_VARS), ~ wmean(.x, PWEIGHT)),
      .groups = "drop"
    ) %>%
    arrange(desc(wt_n))
}

print_tree <- function(fit, main_title) {
  if (requireNamespace("rpart.plot", quietly = TRUE)) {
    if (fit$method == "class") {
      rpart.plot::rpart.plot(
        fit,
        main = main_title,
        type = 2,
        extra = 104,
        under = TRUE
      )
    } else {
      rpart.plot::rpart.plot(
        fit,
        main = main_title,
        type = 2,
        under = TRUE
      )
    }
  } else {
    plot(fit, margin = 0.1)
    text(fit, use.n = TRUE, cex = 0.7)
    title(main = paste0(main_title, " (base plot)"))
  }
}

1 Load Data

df_raw <- readr::read_csv(here("data", "adf.csv"), show_col_types = FALSE)

df_tree <- df_raw %>%
  filter(
    !is.na(HFRS_100),
    !is.na(HFRS_label),
    !is.na(PWEIGHT),
    if_all(all_of(TREE_VARS), ~ !is.na(.x))
  ) %>%
  mutate(
    HFRS_label = factor(HFRS_label, levels = c("stressed", "coping", "comfortable"))
  )

cat("Rows used:", nrow(df_tree), "\n")

## Rows used: 24360

2 Regression Tree: Segmenting by Continuous HFRS Score

score_formula <- as.formula(
  paste("HFRS_100 ~", paste(TREE_VARS, collapse = " + "))
)

tree_score <- rpart(
  formula = score_formula,
  data = df_tree,
  weights = PWEIGHT,
  method = "anova",
  control = rpart.control(
    minsplit = 300,
    minbucket = 150,
    cp = 0.002,
    maxdepth = 5
  )
)

printcp(tree_score)

## 
## Regression tree:
## rpart(formula = score_formula, data = df_tree, weights = PWEIGHT, 
##     method = "anova", control = rpart.control(minsplit = 300, 
##         minbucket = 150, cp = 0.002, maxdepth = 5))
## 
## Variables actually used in tree construction:
## [1] FINAS_sl_w_z    MDI_sl_w_z      PATTSTIN_z      PWNETWPT_sl_w_z
## [5] SHUSH_sl_w_z   
## 
## Root node error: 5354226604/24360 = 219796
## 
## n= 24360 
## 
##           CP nsplit rel error  xerror       xstd
## 1  0.3193949      0   1.00000 1.00019 2.6412e-04
## 2  0.1197615      1   0.68061 0.68509 1.7458e-04
## 3  0.0946411      2   0.56084 0.56798 1.5893e-04
## 4  0.0471670      3   0.46620 0.46988 1.2049e-04
## 5  0.0448078      4   0.41904 0.42532 1.1206e-04
## 6  0.0366181      5   0.37423 0.37586 1.0231e-04
## 7  0.0203730      6   0.33761 0.34095 9.4832e-05
## 8  0.0132699      8   0.29686 0.30137 8.6265e-05
## 9  0.0114927      9   0.28359 0.29069 8.4105e-05
## 10 0.0108773     10   0.27210 0.28349 8.2463e-05
## 11 0.0099324     11   0.26122 0.26766 7.9896e-05
## 12 0.0092734     12   0.25129 0.25180 7.4383e-05
## 13 0.0091053     13   0.24202 0.24863 7.3100e-05
## 14 0.0062852     14   0.23291 0.23520 6.9783e-05
## 15 0.0056804     15   0.22663 0.22677 6.7684e-05
## 16 0.0051358     16   0.22095 0.22210 6.6387e-05
## 17 0.0050266     17   0.21581 0.22040 6.6099e-05
## 18 0.0038006     18   0.21078 0.21496 6.4231e-05
## 19 0.0034338     19   0.20698 0.21080 6.2773e-05
## 20 0.0033187     20   0.20355 0.20993 6.2689e-05
## 21 0.0031782     21   0.20023 0.20741 6.2163e-05
## 22 0.0020000     22   0.19705 0.20331 6.1421e-05

print_tree(tree_score, "Decision Tree Segmentation — HFRS_100")

df_tree <- df_tree %>%
  mutate(score_segment = factor(tree_score$where))

score_segment_profiles <- extract_leaf_profile(df_tree, "score_segment")

score_segment_profiles %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Regression-tree segments — weighted profiles") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Regression-tree segments — weighted profiles
segment	n	wt_n	HFRS_mean	pct_stressed	pct_coping	pct_comfortable	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN_z	PNBEARG_z
23	4299	4,598,544	51.94	6.23	58.11	35.66	0.26	0.05	-0.49	-0.49	-0.15	0.42	-0.37	-0.18
10	2119	3,391,741	37.75	84.32	15.08	0.60	-0.06	-0.65	-0.34	-0.56	0.02	0.03	1.14	0.18
35	3131	2,794,452	58.84	2.28	22.95	74.77	0.32	0.25	-0.55	0.29	-0.18	0.15	-0.66	-0.20
21	1535	2,442,040	45.49	36.74	56.67	6.59	-0.35	-0.72	-0.58	-0.60	-0.14	-0.37	-0.57	-0.45
30	1406	1,746,686	48.03	18.17	66.82	15.01	0.03	0.16	-0.52	0.36	-0.03	-0.08	1.14	0.10
37	1416	1,543,274	62.01	0.46	8.57	90.96	0.23	0.34	-0.38	1.11	-0.25	0.13	-0.38	-0.26
15	922	1,367,889	34.87	91.58	8.19	0.23	-0.16	-0.71	1.76	-0.79	0.27	0.18	-0.67	0.39
17	1027	1,294,503	46.42	28.99	61.41	9.59	-0.43	1.05	1.40	-0.44	0.36	0.33	-0.70	0.39
24	1286	1,232,176	59.68	0.17	19.99	79.84	0.23	0.16	-0.50	-0.49	-0.04	0.45	-1.89	0.03
6	808	1,228,312	33.18	97.22	2.78	0.00	-0.35	0.51	1.66	-0.47	0.17	0.25	1.14	0.45
11	915	1,189,540	45.19	32.48	63.11	4.41	-0.40	1.07	-0.11	-0.39	0.11	0.33	1.14	0.42
5	627	1,045,395	26.20	99.94	0.06	0.00	0.07	-0.79	1.86	-0.67	0.26	0.18	1.13	0.41
31	652	941,585	53.15	4.60	54.14	41.26	0.13	0.17	-0.38	1.25	-0.17	-0.06	1.13	-0.02
16	760	912,584	40.56	68.19	31.11	0.70	0.15	-0.42	1.46	-0.43	0.08	0.30	-0.64	0.40
42	516	791,233	71.96	0.00	1.48	98.52	0.31	0.31	-0.58	2.30	-0.29	0.02	-0.62	-0.45
34	760	757,491	51.32	6.80	64.64	28.56	-0.12	0.72	1.20	0.27	0.04	0.34	-0.71	0.32
29	518	576,511	41.51	58.98	40.71	0.31	-0.11	0.58	1.25	0.34	0.04	0.37	1.13	0.46
45	302	520,064	82.71	0.00	0.00	100.00	0.80	0.21	-0.66	3.68	-0.40	0.11	-0.68	-0.67
8	252	499,801	23.79	100.00	0.00	0.00	-0.47	-0.79	-0.64	-0.63	1.29	-4.05	1.12	-0.04
20	255	454,424	32.82	93.49	5.47	1.05	-0.52	-0.81	-0.65	-0.67	1.36	-4.26	-0.71	-0.16
41	224	426,408	62.10	0.54	11.29	88.18	0.23	0.17	-0.57	2.34	-0.33	0.07	1.15	-0.02
38	462	400,598	69.33	0.00	1.74	98.26	0.23	0.51	-0.28	1.07	-0.14	0.16	-1.89	0.08
44	168	325,523	72.63	0.00	0.00	100.00	0.52	0.12	-0.66	3.72	-0.45	0.09	1.15	-0.41

readr::write_csv(
  score_segment_profiles,
  file.path(DATA_DIR, "decision_tree_score_segments.csv")
)

3 Classification Tree: Segmenting by HFRS Label

label_formula <- as.formula(
  paste("HFRS_label ~", paste(TREE_VARS, collapse = " + "))
)

tree_label <- rpart(
  formula = label_formula,
  data = df_tree,
  weights = PWEIGHT,
  method = "class",
  control = rpart.control(
    minsplit = 300,
    minbucket = 150,
    cp = 0.002,
    maxdepth = 5
  )
)

printcp(tree_label)

## 
## Classification tree:
## rpart(formula = label_formula, data = df_tree, weights = PWEIGHT, 
##     method = "class", control = rpart.control(minsplit = 300, 
##         minbucket = 150, cp = 0.002, maxdepth = 5))
## 
## Variables actually used in tree construction:
## [1] BBDI_sl_w_z     FINAS_sl_w_z    MDI_sl_w_z      PATTSTIN_z     
## [5] PWNETWPT_sl_w_z SHUSH_sl_w_z   
## 
## Root node error: 19809239/24360 = 813.19
## 
## n= 24360 
## 
##           CP nsplit rel error  xerror       xstd
## 1  0.3106578      0   1.00000 1.00000 0.00013294
## 2  0.0645168      1   0.68934 0.69318 0.00013867
## 3  0.0609996      2   0.62483 0.63984 0.00013736
## 4  0.0446075      3   0.56383 0.55843 0.00013401
## 5  0.0361270      4   0.51922 0.54062 0.00013305
## 6  0.0337603      5   0.48309 0.48984 0.00012983
## 7  0.0284701      6   0.44933 0.44031 0.00012596
## 8  0.0171723      7   0.42086 0.43281 0.00012531
## 9  0.0119892      8   0.40369 0.41051 0.00012327
## 10 0.0110119      9   0.39170 0.40304 0.00012254
## 11 0.0092049     11   0.36968 0.39415 0.00012166
## 12 0.0081240     13   0.35127 0.36621 0.00011869
## 13 0.0071334     14   0.34314 0.36260 0.00011828
## 14 0.0059503     15   0.33601 0.35873 0.00011784
## 15 0.0040179     16   0.33006 0.34690 0.00011646
## 16 0.0025294     17   0.32604 0.34261 0.00011595
## 17 0.0022594     18   0.32351 0.34170 0.00011584
## 18 0.0020298     20   0.31899 0.34183 0.00011586
## 19 0.0020000     21   0.31696 0.34171 0.00011584

print_tree(tree_label, "Decision Tree Segmentation — HFRS_label")

df_tree <- df_tree %>%
  mutate(label_segment = factor(tree_label$where))

label_segment_profiles <- extract_leaf_profile(df_tree, "label_segment")

label_segment_profiles %>%
  mutate(wt_n = comma(round(wt_n))) %>%
  kable(digits = 2, caption = "Classification-tree segments — weighted profiles") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Classification-tree segments — weighted profiles
segment	n	wt_n	HFRS_mean	pct_stressed	pct_coping	pct_comfortable	BBDI_sl_w_z	SHUSH_sl_w_z	MDI_sl_w_z	FINAS_sl_w_z	ODDI_sl_w_z	PWNETWPT_sl_w_z	PATTSTIN_z	PNBEARG_z
43	5577	5,550,800	65.05	0.00	9.05	90.95	0.45	0.30	-0.63	1.01	-0.30	0.29	-0.67	-0.31
21	4538	5,313,340	49.80	13.23	63.80	22.97	0.26	-0.50	-0.49	-0.59	-0.14	0.27	-0.66	-0.16
5	2609	4,456,447	32.18	97.05	2.95	0.00	-0.37	-0.63	0.28	-0.63	0.19	-0.36	1.13	0.27
31	2010	2,450,157	49.79	10.11	70.45	19.44	0.10	0.28	-0.47	0.45	-0.13	0.21	1.14	0.16
15	1389	1,943,744	35.99	90.81	9.07	0.12	-0.09	-0.49	1.65	-0.65	0.20	0.23	-0.36	0.40
22	1443	1,499,993	57.18	0.00	31.20	68.80	-0.42	1.31	-0.42	-0.46	0.06	0.38	-0.71	-0.11
19	681	1,221,364	38.91	76.88	22.13	0.99	-0.53	-0.93	-0.65	-0.67	0.29	-2.04	-0.61	-0.53
32	587	1,063,095	63.25	1.42	15.63	82.95	0.30	0.12	-0.56	2.54	-0.34	0.01	1.15	-0.16
10	667	937,336	37.22	84.47	15.53	0.00	-0.44	1.02	1.23	-0.47	0.17	0.26	1.15	0.58
36	928	929,176	49.71	9.55	72.99	17.47	-0.07	0.40	1.02	0.13	-0.05	0.37	-0.66	0.44
17	635	787,511	47.19	23.53	66.73	9.74	-0.45	1.28	1.45	-0.49	0.46	0.33	-0.75	0.35
11	470	630,230	46.72	17.72	75.59	6.69	-0.38	1.06	-0.60	-0.47	0.22	0.12	1.14	0.22
38	580	558,586	59.16	0.41	23.02	76.57	-0.22	1.00	0.92	1.00	-0.01	0.43	-0.77	0.33
27	479	531,617	38.51	83.42	16.58	0.00	-0.03	0.12	1.34	0.26	-0.06	0.34	1.13	0.48
16	377	474,499	43.30	44.27	52.74	2.99	-0.08	-0.45	1.69	-0.65	0.29	0.26	-1.87	0.38
8	279	401,834	44.96	37.51	59.79	2.70	1.64	-0.71	-0.67	-0.53	0.29	0.22	1.15	-0.32
30	168	341,780	38.45	69.65	28.31	2.05	-0.32	-0.62	-0.66	0.41	0.57	-2.03	1.13	-0.23
41	156	328,433	47.17	20.58	71.84	7.58	-0.29	-0.73	-0.67	0.18	0.31	-1.60	-0.48	-0.61
7	213	307,738	31.32	98.15	1.85	0.00	1.40	-0.74	1.78	-0.57	0.27	0.38	1.14	0.32
42	150	289,166	56.88	6.78	37.94	55.27	-0.07	-0.57	-0.67	1.41	0.16	-1.64	-0.59	-0.65
28	207	251,269	45.82	24.82	71.73	3.45	-0.50	1.75	1.38	0.40	0.24	0.38	1.13	0.36
37	217	212,662	56.76	1.31	30.63	68.06	-0.51	2.17	1.07	0.16	0.36	0.33	-0.66	0.16

readr::write_csv(
  label_segment_profiles,
  file.path(DATA_DIR, "decision_tree_label_segments.csv")
)

4 Variable Importance

importance_tbl <- tibble(
  variable = names(tree_label$variable.importance),
  importance = as.numeric(tree_label$variable.importance)
) %>%
  mutate(importance = importance / sum(importance)) %>%
  arrange(desc(importance))

importance_tbl %>%
  kable(digits = 3, caption = "Classification tree — normalized variable importance") %>%
  kable_styling(full_width = FALSE)

Classification tree — normalized variable importance
variable	importance
FINAS_sl_w_z	0.324
PATTSTIN_z	0.237
MDI_sl_w_z	0.157
SHUSH_sl_w_z	0.136
PWNETWPT_sl_w_z	0.108
BBDI_sl_w_z	0.027
PNBEARG_z	0.006
ODDI_sl_w_z	0.005

readr::write_csv(
  importance_tbl,
  file.path(DATA_DIR, "decision_tree_variable_importance.csv")
)

5 Interpretation Notes

cat("
- This is a **supervised segmentation** method, not unsupervised clustering.
- That means the target (`HFRS_100` or `HFRS_label`) directly shapes the groups.
- The advantage is interpretability: you get explicit split rules that can be turned into actionable segment descriptions.
- The tradeoff is that these segments should not be described as \"natural clusters\".
- In this project, decision trees are most useful when the research goal is **actionable groups** rather than latent structure.
")

This is a supervised segmentation method, not unsupervised clustering.
That means the target (HFRS_100 or HFRS_label) directly shapes the groups.
The advantage is interpretability: you get explicit split rules that can be turned into actionable segment descriptions.
The tradeoff is that these segments should not be described as “natural clusters”.
In this project, decision trees are most useful when the research goal is actionable groups rather than latent structure.

Save

saveRDS(df_tree, file.path(DATA_DIR, "decision_tree_segmented_adf.rds"))
cat("Saved: outputs/data/decision_tree_segmented_adf.rds\n")

## Saved: outputs/data/decision_tree_segmented_adf.rds

Session Info

sessionInfo()

## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Toronto
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.4.0     kableExtra_1.4.0 rpart_4.1.21     lubridate_1.9.3 
##  [5] forcats_1.0.0    stringr_1.5.1    dplyr_1.1.4      purrr_1.0.4     
##  [9] readr_2.1.5      tidyr_1.3.1      tibble_3.2.1     ggplot2_3.5.2   
## [13] tidyverse_2.0.0  here_1.0.1      
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        utf8_1.2.4         generics_0.1.3     xml2_1.3.6        
##  [5] stringi_1.8.3      hms_1.1.3          digest_0.6.34      magrittr_2.0.3    
##  [9] evaluate_0.23      grid_4.3.2         timechange_0.3.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      rprojroot_2.0.4    jsonlite_1.8.8     rpart.plot_3.1.2  
## [17] fansi_1.0.6        viridisLite_0.4.2  textshaping_0.3.7  jquerylib_0.1.4   
## [21] cli_3.6.5          crayon_1.5.2       rlang_1.1.6        bit64_4.0.5       
## [25] withr_3.0.2        cachem_1.1.0       yaml_2.3.8         parallel_4.3.2    
## [29] tools_4.3.2        tzdb_0.5.0         vctrs_0.6.5        R6_2.5.1          
## [33] lifecycle_1.0.4    bit_4.0.5          vroom_1.6.5        pkgconfig_2.0.3   
## [37] pillar_1.9.0       bslib_0.6.1        gtable_0.3.6       glue_1.8.0        
## [41] systemfonts_1.2.3  highr_0.11         xfun_0.52          tidyselect_1.2.1  
## [45] rstudioapi_0.15.0  knitr_1.48         farver_2.1.1       htmltools_0.5.8.1 
## [49] rmarkdown_2.25     svglite_2.2.1      compiler_4.3.2

Method 6 — Decision Tree Segmentation

2026-05-29