NS Road Safety — ETL, Data Cleaning & Driving Archetype Construction

Audit Reference: VCM_Model_Validation_Final.qmd

Author

Gavin Shklanka & Rachel Kodi

Published

March 20, 2026

Note

Audit Reference: This document is the upstream ETL companion to VCM_Model_Validation_Final.qmd. Every transformation applied here flows directly into the analysis_df object consumed by that report. Rachel (or any auditor) can verify that the cleaned dataset and the six driving archetypes were produced reproducibly and without data leakage.

1 Package Setup & Reproducibility Seed

library(tidyverse)
library(readr)
library(janitor)
library(lubridate)
library(knitr)
library(kableExtra)
library(scales)

# Fix the RNG state once so the synthetic tables generated below are identical
# on every render.
set.seed(42)                     # all stochastic steps use this seed
# Strongly prefer fixed over scientific notation when numbers are printed.
options(scipen = 999)
# Raise vroom's connection buffer size — presumably for very long CSV lines;
# TODO confirm against the real input files.
Sys.setenv(VROOM_CONNECTION_SIZE = 5000000)

# Default ggplot2 theme for every figure produced in this document.
theme_set(theme_minimal(base_size = 12))

Why seed = 42? The modeling pipeline (VCM_Model_Validation_Final.qmd) was built and validated against synthetic data generated with set.seed(42). Keeping the same seed here ensures that any stochastic imputation, sampling, or regime-flag construction is perfectly reproducible across renders.

2 Raw Data Ingest

2.1 Data path configuration

# ── Set via environment variable or replace the fallback path directly ─────────
# Example:  Sys.setenv(NS_ENRICHED_CSV = "/data/ns_collisions_enriched.csv")
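# The enriched file (NS_ENRICHED_CSV) is loaded as the collision table and must contain:
#   severity_raw (1 = severe, 0 = non-severe), datetime_raw, location_id, road_type,
#   plus the weather-joined columns temp_c, wind_kph, precipitation_mm, visibility_km
# The file at NS_RAW_CSV is loaded as the traffic-exposure table and must contain:
#   location_id, month, vmt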

# Resolve input locations from the environment; "" means "not configured".
enriched_path <- Sys.getenv("NS_ENRICHED_CSV", unset = "")
raw_path      <- Sys.getenv("NS_RAW_CSV",      unset = "")

# Real data is used only when the enriched CSV is both configured and on disk;
# otherwise the document falls back to the synthetic generator.
has_enriched  <- nzchar(enriched_path) && file.exists(enriched_path)
use_synthetic <- !has_enriched

if (has_enriched) {
  message("✅ Enriched CSV found: ", enriched_path)
} else {
  message("⚠  No enriched CSV found — generating reproducible synthetic dataset (SEED = 42).")
  message("   Set NS_ENRICHED_CSV env var to use real data.")
}

2.2 Load or synthesise raw tables

# Build the three raw tables (collision_raw, traffic_raw, weather_raw) that the
# cleaning passes below expect. Both branches define all three objects.
if (use_synthetic) {

  # NOTE: the order of the random draws below determines the generated values
  # for a given seed — do not reorder them.

  # ── Table 1: Collision records ──────────────────────────────────────────────
  n <- 5000

  collision_raw <- tibble(
    collision_id  = seq_len(n),
    # One record every 6 hours, rendered as "mm/dd/yyyy HH:MM:SS" text so the
    # datetime parser downstream is exercised.
    datetime_raw  = format(
      seq.POSIXt(as.POSIXct("2020-01-01"), by = "6 hours", length.out = n),
      "%m/%d/%Y %H:%M:%S"
    ),
    location_id   = sample(1:200, n, replace = TRUE),
    road_type     = sample(c("highway", "arterial", "local"), n, replace = TRUE,
                           prob = c(0.536, 0.34, 0.124)),   # 53.6% highway — matches dashboard
    light_cond    = sample(c("daylight", "dark", "dusk"),    n, replace = TRUE,
                           prob = c(0.60, 0.30, 0.10)),
    severity_raw  = sample(c(0L, 1L), n, replace = TRUE, prob = c(0.782, 0.218)) # 21.8% severe
  )

  # ── Table 2: Traffic exposure (VMT proxy via AADT) ──────────────────────────
  # One row per location (200) × month (12) = 2400 rows.
  traffic_raw <- tibble(
    location_id = rep(1:200, 12),
    month       = rep(1:12, each = 200),
    aadt        = round(rexp(2400, rate = 1 / 8000)),
    seg_len_km  = runif(2400, 0.5, 15)
  ) %>%
    mutate(vmt = aadt * seg_len_km * 365)

  # ── Table 3: Weather observations ──────────────────────────────────────────
  # One observation per collision, keyed by the collision's own location_id and
  # datetime_raw so the later join is an exact match.
  weather_raw <- tibble(
    location_id      = collision_raw$location_id,
    datetime_raw     = collision_raw$datetime_raw,
    precipitation_mm = rexp(n, rate = 1 / 1.5),
    visibility_km    = pmin(pmax(rnorm(n, 15, 5), 0.5), 30),
    wind_kph         = abs(rnorm(n, 20, 15)),
    temp_c           = rnorm(n, 5, 12),
    wx_condition_code = sample(
      c("clear", "rain", "snow", "fog", "freezing_rain"),
      n, replace = TRUE, prob = c(0.55, 0.20, 0.15, 0.06, 0.04)
    )
  )

  cat("Synthetic tables generated:\n")
  cat("  collision_raw  :", nrow(collision_raw),  "rows ×", ncol(collision_raw),  "cols\n")
  cat("  traffic_raw    :", nrow(traffic_raw),    "rows ×", ncol(traffic_raw),    "cols\n")
  cat("  weather_raw    :", nrow(weather_raw),    "rows ×", ncol(weather_raw),    "cols\n")

} else {

  # The traffic-exposure table is required by the merge step. Fail here with a
  # clear message instead of "object 'traffic_raw' not found" further down.
  if (nchar(raw_path) == 0 || !file.exists(raw_path)) {
    stop("NS_RAW_CSV must point to an existing traffic exposure CSV when ",
         "NS_ENRICHED_CSV is set.", call. = FALSE)
  }
  traffic_raw <- read_csv(raw_path, show_col_types = FALSE)

  # The enriched CSV carries the collision records together with their
  # weather-joined columns. Names are normalised first so the split below does
  # not depend on the file's header casing.
  enriched_raw <- janitor::clean_names(
    read_csv(enriched_path, show_col_types = FALSE)
  )

  weather_keys <- c("location_id", "datetime_raw")
  # NOTE(review): wx_condition_code is needed by the weather cleaning pass but
  # is not listed among the documented enriched columns — confirm it is present.
  weather_cols <- c("precipitation_mm", "visibility_km", "wind_kph",
                    "temp_c", "wx_condition_code")

  # Weather table: one row per join key, so the later left join cannot
  # duplicate collision records. The first occurrence wins.
  weather_raw <- enriched_raw %>%
    select(all_of(weather_keys), any_of(weather_cols)) %>%
    distinct(location_id, datetime_raw, .keep_all = TRUE)

  # Collision table: everything else. Dropping the weather columns here avoids
  # `.x` / `.y` suffixed duplicates when weather is joined back on.
  collision_raw <- enriched_raw %>%
    select(-any_of(weather_cols))

  cat("Real data loaded from:", enriched_path, "\n")

}

Synthetic tables generated:
  collision_raw  : 5000 rows × 6 cols
  traffic_raw    : 2400 rows × 5 cols
  weather_raw    : 5000 rows × 7 cols

3 Cleaning Pass — Collision Records

3.1 Standardise column names

collision_clean <- janitor::clean_names(collision_raw)
cat("Column names after clean_names():\n")

Column names after clean_names():

cat(paste(names(collision_clean), collapse = ", "), "\n")

collision_id, datetime_raw, location_id, road_type, light_cond, severity_raw

3.2 Parse datetime

# Parse the raw timestamp text as Halifax local time. Month-first
# (mm/dd/yyyy HH:MM:SS) is tried first; each element that fails falls back
# individually to year-first (yyyy-mm-dd HH:MM:SS). Parser warnings are
# silenced because failures are counted explicitly below.
collision_clean <- collision_clean %>%
  mutate(
    datetime = coalesce(
      suppressWarnings(mdy_hms(datetime_raw, tz = "America/Halifax")),
      suppressWarnings(ymd_hms(datetime_raw, tz = "America/Halifax"))
    )
  )

# Rows still NA matched neither format.
n_failed <- sum(is.na(collision_clean$datetime))
cat("Datetime parse failures:", n_failed, "of", nrow(collision_clean), "\n")

Datetime parse failures: 0 of 5000

# Flag a likely format problem when more than 5% of rows failed to parse.
# The row-count guard avoids 0 / 0 = NaN on an empty table, which would make
# `if` abort with "missing value where TRUE/FALSE needed".
n_rows <- nrow(collision_clean)
if (n_rows > 0 && n_failed / n_rows > 0.05) {
  warning("More than 5% of datetime values failed to parse. Check datetime_raw format.")
}

3.3 Derive temporal features

# Derive calendar/time-of-day features from the parsed timestamp.
collision_clean <- collision_clean %>%
  mutate(
    month        = month(datetime),
    hour         = hour(datetime),
    day_of_week  = wday(datetime, label = TRUE, abbr = FALSE),
    # week_start is pinned so that 1 = Sunday and 7 = Saturday regardless of
    # the session's `lubridate.week.start` option; without it the weekend flag
    # would silently change meaning on machines that set a Monday week start.
    is_weekend   = as.integer(wday(datetime, week_start = 7) %in% c(1, 7)),
    # Holiday proxy: July, August, December peak travel months
    is_holiday_period = as.integer(month %in% c(7L, 8L, 12L)),
    # Night-time: 20:00 through 05:59 (hour 5 is included)
    is_night     = as.integer(hour >= 20 | hour <= 5)
  )

3.4 Validate and recode severity

# Expected: 0 = non-severe, 1 = severe
# Audit check: confirm no unexpected values
sev_vals <- unique(collision_clean$severity_raw)
cat("Unique severity_raw values:", paste(sort(sev_vals), collapse = ", "), "\n")

Unique severity_raw values: 0, 1

# Anything other than 0/1 (including NA) is reported; such rows become NA in
# the `severe` factor built below.
unexpected_sev <- setdiff(sev_vals, c(0L, 1L))
if (length(unexpected_sev) > 0) {
  warning("Unexpected values in severity_raw: ",
          paste(unexpected_sev, collapse = ", "))
}

# Labelled outcome: 0 -> "No", 1 -> "Yes".
collision_clean$severe <- factor(
  collision_clean$severity_raw,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

# Share of records coded as severe, ignoring missing codes.
severe_rate <- mean(collision_clean$severity_raw == 1, na.rm = TRUE)
cat(sprintf("Severe collision rate: %.1f%%\n", 100 * severe_rate))

Severe collision rate: 21.3%

3.5 Road type standardisation

# Collapse free-text road descriptions into four classes. Patterns are tested
# in order (highway, arterial, local); anything unmatched — including missing
# values — becomes "other".
standardise_road_type <- function(x) {
  tidy <- str_to_lower(str_trim(x))
  case_when(
    str_detect(tidy, "highway|hwy|provincial")  ~ "highway",
    str_detect(tidy, "arterial|major")          ~ "arterial",
    str_detect(tidy, "local|residential")       ~ "local",
    TRUE                                        ~ "other"
  )
}

collision_clean <- collision_clean %>%
  mutate(road_type = standardise_road_type(road_type))

# Frequency table with each class's share of all records.
road_dist <- count(collision_clean, road_type)
road_dist$share <- percent(road_dist$n / sum(road_dist$n), accuracy = 0.1)

kbl(road_dist,
    caption = "Road type distribution after standardisation",
    col.names = c("Road Type", "Count", "Share")) %>%
  kable_styling(full_width = FALSE)

Road type distribution after standardisation
Road Type	Count	Share
arterial	1730	34.6%
highway	2653	53.1%
local	617	12.3%

4 Cleaning Pass — Traffic Exposure

# Normalise column names for both data sources.
traffic_clean <- janitor::clean_names(traffic_raw)

if (use_synthetic) {
  # Keep rows with a usable, non-negative VMT.
  traffic_clean <- traffic_clean %>%
    filter(!is.na(vmt), vmt >= 0)

  # Cap extreme VMT outliers at 99.5th percentile (computed after filtering)
  vmt_cap <- quantile(traffic_clean$vmt, 0.995)
  traffic_clean$vmt <- pmin(traffic_clean$vmt, vmt_cap)
}
# Real data: replace with your actual AADT/VMT cleaning logic

cat("Traffic rows after cleaning:", nrow(traffic_clean), "\n")

Traffic rows after cleaning: 2400

cat("VMT summary:\n")

VMT summary:

print(summary(traffic_clean$vmt))

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
     5845   4293192  12427862  22156321  30258517 156973973

5 Cleaning Pass — Weather Data

# Normalise column names for both data sources.
weather_clean <- janitor::clean_names(weather_raw)

# Condition codes treated as hazardous: plain-text labels in the synthetic
# table, abbreviated codes in the real feed.
hazard_codes <- if (use_synthetic) {
  c("snow", "fog", "freezing_rain")
} else {
  c("SN", "FG", "FZRA", "IC", "PE")
}

# is_severe_weather: flag hazardous conditions
# Canonical name is is_severe_weather (not severe_weather — see VCM correction log)
weather_clean <- weather_clean %>%
  mutate(
    is_severe_weather = as.integer(wx_condition_code %in% .env$hazard_codes)
  )

if (use_synthetic) {
  # Cap physically implausible values
  weather_clean <- weather_clean %>%
    mutate(
      precipitation_mm = pmax(precipitation_mm, 0),
      visibility_km    = pmin(pmax(visibility_km, 0), 50),
      wind_kph         = pmax(wind_kph, 0)
    )
}

cat("Weather rows after cleaning:", nrow(weather_clean), "\n")

Weather rows after cleaning: 5000

has_temp <- "temp_c" %in% names(weather_clean)
cat("temp_c present:", has_temp, "\n")

temp_c present: TRUE

Important

Naming convention audit note: The column is_severe_weather is the canonical name used throughout this pipeline. An earlier version of the Python pipeline contained a duplicate column called severe_weather. That duplicate was removed. Only is_severe_weather (integer 0/1) appears in the feature set passed to the models in VCM_Model_Validation_Final.qmd.

6 Three-Table Merge

# Row count before joining; left joins must leave it unchanged.
n_collisions <- nrow(collision_clean)

# Step 1: collision × traffic (join key: location_id + month)
df_merged <- collision_clean %>%
  left_join(
    traffic_clean %>% select(location_id, month, vmt),
    by = c("location_id", "month")
  )

# Step 2: × weather (join key: location_id + datetime_raw for synthetic;
#          in real data this is a spatial-temporal fuzzy join by nearest station)
df_merged <- df_merged %>%
  left_join(
    weather_clean %>% select(location_id, datetime_raw,
                             precipitation_mm, visibility_km,
                             wind_kph, temp_c, is_severe_weather),
    by = c("location_id", "datetime_raw")
  )

# Audit guard: a left join can only add rows, and it does so when a join key is
# duplicated on the right-hand side. That would silently duplicate collision
# records and inflate every downstream count, so stop instead.
if (nrow(df_merged) != n_collisions) {
  stop("Merge changed the collision row count from ", n_collisions, " to ",
       nrow(df_merged), "; a join key is duplicated in traffic_clean or ",
       "weather_clean.", call. = FALSE)
}

# Step 3: drop records missing essential modelling inputs
# (no traffic match, no weather match, or an invalid severity code)
df_merged <- df_merged %>%
  filter(!is.na(vmt), !is.na(precipitation_mm), !is.na(severe))

cat("Post-merge row count:", nrow(df_merged), "\n")

Post-merge row count: 5000

cat("Post-merge col count:", ncol(df_merged), "\n")

Post-merge col count: 20

cat("Missing values by column:\n")

Missing values by column:

df_merged %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  filter(n_missing > 0) %>%
  kbl(caption = "Remaining missingness after merge") %>%
  kable_styling(full_width = FALSE)

Remaining missingness after merge
column	n_missing

7 Feature Engineering

# Nominal speed (km/h) assigned to each road class; classes not listed here
# (e.g. "other") fall back to 50.
speed_by_class <- c(highway = 100L, arterial = 50L, local = 30L)

df_features <- df_merged %>%
  mutate(
    # Interaction terms (traffic × adverse conditions)
    traffic_x_precip      = vmt * precipitation_mm,
    traffic_x_visibility  = vmt * visibility_km,

    # Speed proxy from road class
    speed_proxy = coalesce(unname(.env$speed_by_class[road_type]), 50L)
  )

# Predictor columns handed to the models, in a fixed order.
FEATURES <- c(
  "vmt", "precipitation_mm", "visibility_km", "wind_kph",
  "temp_c", "speed_proxy",
  "traffic_x_precip", "traffic_x_visibility",
  "is_severe_weather", "is_holiday_period", "is_night"
)

# Final model-ready frame: id, predictors, both outcome encodings; rows with
# any missing predictor are removed.
model_cols <- c("collision_id", FEATURES, "severe", "severity_raw")

df_model <- df_features %>%
  select(all_of(model_cols)) %>%
  drop_na(all_of(FEATURES))

cat("Final model-ready dataset:", nrow(df_model), "rows ×",
    length(FEATURES), "features\n")

Final model-ready dataset: 5000 rows × 11 features

Feature summary table

# Summary statistics applied to every feature, each rounded to 3 decimals.
stat_fns <- list(
  mean = function(x) round(mean(x, na.rm = TRUE), 3),
  sd   = function(x) round(sd(x,   na.rm = TRUE), 3),
  min  = function(x) round(min(x,  na.rm = TRUE), 3),
  max  = function(x) round(max(x,  na.rm = TRUE), 3)
)

# One wide row with columns named "<feature>_<stat>".
feature_stats_wide <- df_model %>%
  select(all_of(FEATURES)) %>%
  summarise(across(everything(), stat_fns, .names = "{.col}_{.fn}"))

# Split each name at its LAST underscore (feature names contain underscores
# themselves), then lay out one row per feature and one column per statistic.
feature_stats_wide %>%
  pivot_longer(everything(),
               names_to  = c("feature", "stat"),
               names_sep = "_(?=[^_]+$)") %>%
  pivot_wider(names_from = stat, values_from = value) %>%
  kbl(caption = "Feature descriptive statistics (post-cleaning)") %>%
  kable_styling(full_width = FALSE)

Feature descriptive statistics (post-cleaning)
feature	mean	sd	min	max
vmt	22683502.466	27251005.842	5844.621	156973972.877
precipitation_mm	1.528	1.537	0.000	14.199
visibility_km	15.106	4.991	0.500	30.000
wind_kph	21.540	13.376	0.002	80.996
temp_c	5.161	12.090	-38.804	49.630
speed_proxy	74.062	28.232	30.000	100.000
traffic_x_precip	34327752.837	65742497.092	544.510	1072849589.950
traffic_x_visibility	343022061.016	449989059.751	73679.189	4074067394.046
is_severe_weather	0.251	0.433	0.000	1.000
is_holiday_period	0.223	0.416	0.000	1.000
is_night	0.250	0.433	0.000	1.000

8 The Six Driving Archetypes

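The six driving archetypes are condition-partitioned behavioral regimes. They form three complementary pairs (severe weather vs. clear conditions, holiday vs. non-holiday, night-time vs. daytime): the two archetypes within a pair are mutually exclusive and together cover every record, but archetypes from different pairs overlap, so each record belongs to exactly three archetypes. These are the six operating contexts under which separate XGBoost sub-models are trained in VCM_Model_Validation_Final.qmd.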

Each archetype isolates a different population of collision records defined by environmental and temporal context — a proxy for the underlying driver behaviour environment. The archetypes are not K-means clusters; they are engineered binary partitions derived from the flag variables constructed above.

# Archetype registry: display label -> (flag column, flag value). Each
# archetype is the subset of df_model where `col` equals `val`.
# Entries come in complementary pairs (val = 1L / val = 0L on the same flag).
# Keep the order stable: the audit-trail table later in this document supplies
# its "Derived From" text positionally, in this same order.
archetypes <- list(
  "Archetype 1 — Severe Weather"   = list(col = "is_severe_weather",  val = 1L),
  "Archetype 2 — Clear Conditions" = list(col = "is_severe_weather",  val = 0L),
  "Archetype 3 — Holiday Period"   = list(col = "is_holiday_period",  val = 1L),
  "Archetype 4 — Non-Holiday"      = list(col = "is_holiday_period",  val = 0L),
  "Archetype 5 — Night-time"       = list(col = "is_night",           val = 1L),
  "Archetype 6 — Daytime"          = list(col = "is_night",           val = 0L)
)

8.1 Archetype row counts and severe rates

# Build one summary row for a single archetype: size, share of the modelling
# frame, severe rate, and the filter expression that defines it.
profile_one <- function(label) {
  key     <- archetypes[[label]]
  members <- filter(df_model, .data[[key$col]] == key$val)
  n_rows  <- nrow(members)
  tibble(
    Archetype       = label,
    n               = n_rows,
    `Share of Data` = percent(n_rows / nrow(df_model), accuracy = 0.1),
    `Severe Rate`   = percent(mean(members$severity_raw == 1, na.rm = TRUE), accuracy = 0.1),
    `Partition Key` = paste0(key$col, " == ", key$val)
  )
}

archetype_profile <- names(archetypes) %>%
  map(profile_one) %>%
  bind_rows()

# Shade the "flag == 1" archetypes (severe weather, night, holiday period).
highlight_rows <- which(
  str_detect(archetype_profile$Archetype, "Severe Weather|Night|Holiday Period")
)

archetype_profile %>%
  kbl(caption = "Six driving archetypes — record counts and severe rates") %>%
  kable_styling(full_width = FALSE) %>%
  row_spec(highlight_rows, background = "#FFF3E0")

Six driving archetypes — record counts and severe rates
Archetype	n	Share of Data	Severe Rate	Partition Key
Archetype 1 — Severe Weather	1253	25.1%	21.3%	is_severe_weather == 1
Archetype 2 — Clear Conditions	3747	74.9%	21.3%	is_severe_weather == 0
Archetype 3 — Holiday Period	1116	22.3%	21.5%	is_holiday_period == 1
Archetype 4 — Non-Holiday	3884	77.7%	21.2%	is_holiday_period == 0
Archetype 5 — Night-time	1250	25.0%	20.5%	is_night == 1
Archetype 6 — Daytime	3750	75.0%	21.5%	is_night == 0

8.2 Archetype severe-rate comparison plot

# Numeric severe rate and size of one archetype (unformatted, for plotting).
rate_one <- function(label) {
  key     <- archetypes[[label]]
  members <- filter(df_model, .data[[key$col]] == key$val)
  tibble(
    archetype   = label,
    severe_rate = mean(members$severity_raw == 1, na.rm = TRUE),
    n           = nrow(members)
  )
}

archetype_rates <- names(archetypes) %>%
  map(rate_one) %>%
  bind_rows()

# Reference level: severe rate across the whole modelling frame.
overall_rate  <- mean(df_model$severity_raw == 1, na.rm = TRUE)
overall_label <- paste0("Overall: ", percent(overall_rate, accuracy = 0.1))

# Horizontal bars ordered by severe rate; colour encodes above/below overall.
ggplot(
  archetype_rates,
  aes(
    x    = reorder(str_wrap(archetype, 20), severe_rate),
    y    = severe_rate,
    fill = severe_rate > overall_rate
  )
) +
  geom_col(show.legend = FALSE, width = 0.65) +
  geom_hline(yintercept = overall_rate,
             linetype = "dashed", colour = "#D7191C", linewidth = 0.8) +
  annotate("text", x = 0.6, y = overall_rate,
           label = overall_label,
           hjust = 0, vjust = -0.5, colour = "#D7191C", size = 3.2) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_fill_manual(values = c("FALSE" = "#1565C0", "TRUE" = "#D7191C")) +
  coord_flip() +
  labs(
    title    = "Severe Collision Rate by Driving Archetype",
    subtitle = "Red bars exceed the overall severe rate; blue bars fall below it.",
    x        = NULL,
    y        = "Severe Collision Rate"
  )

Severe collision rate by driving archetype

8.3 Archetype partition logic (audit trail)

The table below documents exactly how each archetype flag is constructed so Rachel (or any downstream auditor) can trace the lineage from raw source fields to model-input partitions.

# Human-readable lineage for each archetype, listed in the same order as the
# entries of `archetypes`.
derivation_notes <- c(
  "wx_condition_code %in% snow/fog/freezing_rain → is_severe_weather",
  "wx_condition_code NOT in snow/fog/freezing_rain",
  "month %in% c(7, 8, 12) → is_holiday_period",
  "month NOT in c(7, 8, 12)",
  "hour >= 20 | hour <= 5 → is_night",
  "hour > 5 & hour < 20"
)

audit_trail <- tibble(
  Archetype       = names(archetypes),
  `Source Column` = map_chr(archetypes, "col"),
  `Filter Value`  = map_chr(archetypes, function(spec) as.character(spec$val)),
  `Derived From`  = derivation_notes
)

audit_trail %>%
  kbl(caption = "Archetype construction — full audit trail") %>%
  kable_styling(full_width = FALSE) %>%
  column_spec(4, width = "12cm")

Archetype construction — full audit trail
Archetype	Source Column	Filter Value	Derived From
Archetype 1 — Severe Weather	is_severe_weather	1	wx_condition_code %in% snow/fog/freezing_rain → is_severe_weather
Archetype 2 — Clear Conditions	is_severe_weather	0	wx_condition_code NOT in snow/fog/freezing_rain
Archetype 3 — Holiday Period	is_holiday_period	1	month %in% c(7, 8, 12) → is_holiday_period
Archetype 4 — Non-Holiday	is_holiday_period	0	month NOT in c(7, 8, 12)
Archetype 5 — Night-time	is_night	1	hour >= 20 \| hour <= 5 → is_night
Archetype 6 — Daytime	is_night	0	hour > 5 & hour < 20

9 Train / Test Split

The 80/20 stratified split used in VCM_Model_Validation_Final.qmd is reproduced here so the cleaned dataset and splits are verifiable end-to-end.

library(caret)

# Re-seed immediately before partitioning so the split does not depend on how
# many random draws happened earlier in the document.
set.seed(42)
# Row indices for the training partition (p = 0.80), sampled within the levels
# of `severe`; list = FALSE returns them as a matrix rather than a list.
train_idx   <- createDataPartition(df_model$severe, p = 0.80, list = FALSE)
train_clean <- df_model[ train_idx, ]
# Negative indexing: every row not selected for training.
test_clean  <- df_model[-train_idx, ]

cat("Train set:", nrow(train_clean), "rows |",
    "Severe:", percent(mean(train_clean$severity_raw == 1), accuracy = 0.1), "\n")

Train set: 4001 rows | Severe: 21.3%

cat("Test set :", nrow(test_clean),  "rows |",
    "Severe:", percent(mean(test_clean$severity_raw  == 1), accuracy = 0.1), "\n")

Test set : 999 rows | Severe: 21.2%

Note

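Stratification ensures that the severe rate of the cleaned dataset (21.3% in this synthetic run; the generator targets 21.8%) is closely preserved in both splits (21.3% train, 21.2% test), preventing the test set from being accidentally all-non-severe or heavily imbalanced relative to the training set.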

10 Save Cleaned Outputs

# Write the cleaned, feature-engineered dataset so VCM_Model_Validation_Final.qmd
# can consume it via the NS_ENRICHED_CSV env var.
#
# NOTE: the default target is tempdir(), which R removes when the session
# ends — set NS_OUTPUT_DIR to keep the files.

out_dir <- Sys.getenv("NS_OUTPUT_DIR", unset = tempdir())

# A variable that is set but empty would otherwise resolve to the filesystem
# root via file.path("", ...).
if (!nzchar(out_dir)) {
  out_dir <- tempdir()
}

# write_csv() does not create missing directories, so make sure the target
# exists before writing.
if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
}

write_csv(df_model,     file.path(out_dir, "ns_vcm_model_ready.csv"))
write_csv(train_clean,  file.path(out_dir, "ns_vcm_train.csv"))
write_csv(test_clean,   file.path(out_dir, "ns_vcm_test.csv"))

cat("Outputs written to:", out_dir, "\n")

Outputs written to: C:\Users\gshk0\AppData\Local\Temp\RtmpGqWbYR

cat("  ns_vcm_model_ready.csv  —", nrow(df_model),    "rows (full cleaned set)\n")

  ns_vcm_model_ready.csv  — 5000 rows (full cleaned set)

cat("  ns_vcm_train.csv        —", nrow(train_clean),  "rows (80% train)\n")

  ns_vcm_train.csv        — 4001 rows (80% train)

cat("  ns_vcm_test.csv         —", nrow(test_clean),   "rows (20% test)\n")

  ns_vcm_test.csv         — 999 rows (20% test)

cat("\nTo use in VCM_Model_Validation_Final.qmd:\n")


To use in VCM_Model_Validation_Final.qmd:

# Print a copy-pasteable Sys.setenv() call. Forward slashes are forced because
# a Windows path with backslashes is not a valid R string literal (e.g. the
# "\U" in "C:\Users" is rejected as an incomplete escape sequence).
model_ready_path <- normalizePath(
  file.path(out_dir, "ns_vcm_model_ready.csv"),
  winslash = "/", mustWork = FALSE
)
cat("  Sys.setenv(NS_ENRICHED_CSV = '", model_ready_path, "')\n", sep = "")

  Sys.setenv(NS_ENRICHED_CSV = 'C:\Users\gshk0\AppData\Local\Temp\RtmpGqWbYR/ns_vcm_model_ready.csv')

11 Compliance Cross-Reference

The table below maps each cleaning step in this QMD to the corresponding section in VCM_Model_Validation_Final.qmd that depends on it.

# Row-wise layout keeps each ETL step next to the downstream item that depends
# on it, so the pairing can be checked at a glance.
tribble(
  ~`ETL Step (this file)`, ~`Depends On in VCM_Model_Validation_Final.qmd`,
  "§3  Collision cleaning",
  "setup-data chunk — analysis_df construction",
  "§3.2  Datetime parse",
  "datetime_raw → datetime parse in setup chunk",
  "§3.4  Severity recode",
  "severity_raw == 1 logic throughout",
  "§4  Traffic exposure cleaning",
  "vmt feature in FEATURES vector",
  "§5  Weather cleaning — is_severe_weather naming",
  "is_severe_weather in FEATURES; Archetype 1/2 regime sub-models",
  "§6  Three-table merge",
  "analysis_df is the merged frame",
  "§7  Feature engineering",
  "FEATURES vector in model training chunks",
  "§8  Six archetypes defined",
  "Section 10/11 — Behavioral Regime Sub-Models",
  "§9  Train/test split (seed = 42)",
  "createDataPartition(seed=42) in train/test split chunk"
) %>%
  kbl(caption = "ETL-to-model compliance cross-reference") %>%
  kable_styling(full_width = FALSE) %>%
  column_spec(1, bold = TRUE, width = "8cm") %>%
  column_spec(2, width = "10cm")

ETL-to-model compliance cross-reference
ETL Step (this file)	Depends On in VCM_Model_Validation_Final.qmd
§3 Collision cleaning	setup-data chunk — analysis_df construction
§3.2 Datetime parse	datetime_raw → datetime parse in setup chunk
§3.4 Severity recode	severity_raw == 1 logic throughout
§4 Traffic exposure cleaning	vmt feature in FEATURES vector
§5 Weather cleaning — is_severe_weather naming	is_severe_weather in FEATURES; Archetype 1/2 regime sub-models
§6 Three-table merge	analysis_df is the merged frame
§7 Feature engineering	FEATURES vector in model training chunks
§8 Six archetypes defined	Section 10/11 — Behavioral Regime Sub-Models
§9 Train/test split (seed = 42)	createDataPartition(seed=42) in train/test split chunk

12 LLM Usage Disclosure

Claude was used to help structure the R/Quarto ETL workflow, generate the audit trail table, and refine interpretive phrasing. All final analytical decisions, data field mappings, and archetype definitions were reviewed by the authors.