NS Road Safety — ETL, Data Cleaning & Driving Archetype Construction

Audit Reference: VCM_Model_Validation_Final.qmd

Author

Gavin Shklanka & Rachel Kodi

Published

March 20, 2026

Note

Audit Reference: This document is the upstream ETL companion to VCM_Model_Validation_Final.qmd. Every transformation applied here flows directly into the analysis_df object consumed by that report. Rachel (or any auditor) can verify that the cleaned dataset and the six driving archetypes were produced reproducibly and without data leakage.


1 Package Setup & Reproducibility Seed

library(tidyverse)
library(readr)
library(janitor)
library(lubridate)
library(knitr)
library(kableExtra)
library(scales)

# Session-wide configuration for the whole document.
set.seed(42)                     # all stochastic steps use this seed
# Large scipen penalty: numbers are printed in fixed rather than scientific notation.
options(scipen = 999)
# Enlarge the connection buffer that vroom/readr reads from this variable
# (presumably a size in bytes — TODO confirm against the vroom documentation).
Sys.setenv(VROOM_CONNECTION_SIZE = 5000000)

# Default ggplot2 theme for every plot rendered below.
theme_set(theme_minimal(base_size = 12))

Why seed = 42? The modeling pipeline (VCM_Model_Validation_Final.qmd) was built and validated against synthetic data generated with set.seed(42). Keeping the same seed here ensures that any stochastic imputation, sampling, or regime-flag construction is perfectly reproducible across renders.


2 Raw Data Ingest

2.1 Data path configuration

# ── Set via environment variable or replace the fallback path directly ─────────
# Example:  Sys.setenv(NS_ENRICHED_CSV = "/data/ns_collisions_enriched.csv")
# The enriched file must contain weather-joined columns:
#   temp_c, wind_kph, precipitation_mm, visibility_km
# The raw provincial collision file must contain:
#   severity_raw (1 = severe, 0 = non-severe), datetime_raw, location_id, road_type
#
# use_synthetic ends up TRUE when NS_ENRICHED_CSV is unset/empty or does not
# point to an existing file.

enriched_path <- Sys.getenv("NS_ENRICHED_CSV", unset = "")
raw_path      <- Sys.getenv("NS_RAW_CSV",      unset = "")

enriched_configured <- nzchar(enriched_path)
use_synthetic       <- !enriched_configured || !file.exists(enriched_path)

# A path that was set but cannot be found is almost certainly a configuration
# mistake. Raise a warning so a render meant for real data never falls back to
# synthetic data unnoticed.
if (enriched_configured && use_synthetic) {
  warning("NS_ENRICHED_CSV is set but the file does not exist: ", enriched_path,
          " (falling back to synthetic data).", call. = FALSE)
}

if (use_synthetic) {
  message("⚠  No enriched CSV found — generating reproducible synthetic dataset (SEED = 42).")
  message("   Set NS_ENRICHED_CSV env var to use real data.")
} else {
  message("✅ Enriched CSV found: ", enriched_path)
}

2.2 Load or synthesise raw tables

# Builds the three raw input tables used by the later sections:
#   collision_raw — one row per collision
#   traffic_raw   — exposure per location_id × month
#   weather_raw   — weather observations keyed by location_id + datetime_raw
# In synthetic mode the tables are simulated. The order of the random draws
# below is kept fixed so the seeded output stays the same from render to render.
if (use_synthetic) {

  # ── Table 1: Collision records ──────────────────────────────────────────────
  n           <- 5000
  n_locations <- 200
  n_months    <- 12
  n_traffic   <- n_locations * n_months   # one exposure row per location × month

  collision_raw <- tibble(
    collision_id  = seq_len(n),
    datetime_raw  = format(
      seq.POSIXt(as.POSIXct("2020-01-01"), by = "6 hours", length.out = n),
      "%m/%d/%Y %H:%M:%S"
    ),
    location_id   = sample(seq_len(n_locations), n, replace = TRUE),
    road_type     = sample(c("highway", "arterial", "local"), n, replace = TRUE,
                           prob = c(0.536, 0.34, 0.124)),   # 53.6% highway — matches dashboard
    light_cond    = sample(c("daylight", "dark", "dusk"),    n, replace = TRUE,
                           prob = c(0.60, 0.30, 0.10)),
    severity_raw  = sample(c(0L, 1L), n, replace = TRUE, prob = c(0.782, 0.218)) # 21.8% severe
  )

  # ── Table 2: Traffic exposure (VMT proxy via AADT) ──────────────────────────
  traffic_raw <- tibble(
    location_id = rep(seq_len(n_locations), n_months),
    month       = rep(seq_len(n_months), each = n_locations),
    aadt        = round(rexp(n_traffic, rate = 1 / 8000)),
    seg_len_km  = runif(n_traffic, 0.5, 15)
  ) %>%
    mutate(vmt = aadt * seg_len_km * 365)

  # ── Table 3: Weather observations ──────────────────────────────────────────
  # Keys are copied from collision_raw, so every collision has exactly one
  # matching weather row.
  weather_raw <- tibble(
    location_id      = collision_raw$location_id,
    datetime_raw     = collision_raw$datetime_raw,
    precipitation_mm = rexp(n, rate = 1 / 1.5),
    visibility_km    = pmin(pmax(rnorm(n, 15, 5), 0.5), 30),
    wind_kph         = abs(rnorm(n, 20, 15)),
    temp_c           = rnorm(n, 5, 12),
    wx_condition_code = sample(
      c("clear", "rain", "snow", "fog", "freezing_rain"),
      n, replace = TRUE, prob = c(0.55, 0.20, 0.15, 0.06, 0.04)
    )
  )

  cat("Synthetic tables generated:\n")
  cat("  collision_raw  :", nrow(collision_raw),  "rows ×", ncol(collision_raw),  "cols\n")
  cat("  traffic_raw    :", nrow(traffic_raw),    "rows ×", ncol(traffic_raw),    "cols\n")
  cat("  weather_raw    :", nrow(weather_raw),    "rows ×", ncol(weather_raw),    "cols\n")

} else {

  enriched_raw <- read_csv(enriched_path, show_col_types = FALSE)

  # The traffic cleaning and merge steps need traffic_raw, so stop here with a
  # clear message instead of a later "object 'traffic_raw' not found".
  # NOTE(review): the path-configuration comments describe NS_RAW_CSV as the raw
  # collision file, but this pipeline consumes it as the traffic exposure
  # table — confirm which is intended.
  if (!nzchar(raw_path) || !file.exists(raw_path)) {
    stop("Real-data mode needs a traffic exposure table: set NS_RAW_CSV to an ",
         "existing CSV with location_id, month and vmt columns.", call. = FALSE)
  }
  traffic_raw <- read_csv(raw_path, show_col_types = FALSE)

  # The weather step expects a separate weather_raw table. In real-data mode the
  # weather fields arrive already joined onto the enriched file, so split them
  # back out, keyed the same way as the synthetic weather table.
  key_cols     <- c("location_id", "datetime_raw")
  weather_cols <- c("precipitation_mm", "visibility_km", "wind_kph", "temp_c",
                    "wx_condition_code")
  missing_cols <- setdiff(c(key_cols, weather_cols), names(enriched_raw))
  if (length(missing_cols) > 0) {
    stop("Enriched CSV is missing required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # One weather row per key, so joining it back cannot duplicate collisions.
  weather_raw <- enriched_raw %>%
    select(all_of(c(key_cols, weather_cols))) %>%
    distinct(location_id, datetime_raw, .keep_all = TRUE)

  # Drop the weather fields from the collision table so the later join does not
  # create suffixed duplicates (precipitation_mm.x / precipitation_mm.y).
  collision_raw <- enriched_raw %>%
    select(-all_of(weather_cols))

  cat("Real data loaded from:", enriched_path, "\n")

}
Synthetic tables generated:
  collision_raw  : 5000 rows × 6 cols
  traffic_raw    : 2400 rows × 5 cols
  weather_raw    : 5000 rows × 7 cols

3 Cleaning Pass — Collision Records

3.1 Standardise column names

# Normalise the raw column names before any field is referenced by name.
collision_clean <- collision_raw %>%
  janitor::clean_names()
cat("Column names after clean_names():\n")
Column names after clean_names():
cat(paste(names(collision_clean), collapse = ", "), "\n")
collision_id, datetime_raw, location_id, road_type, light_cond, severity_raw 

3.2 Parse datetime

# Parse the raw timestamp strings in Halifax local time. Month/day/year is
# tried first; any element it cannot parse falls back to year-month-day
# (the fallback is applied element by element). Parser warnings are silenced
# because failures are counted explicitly afterwards.
collision_clean <- collision_clean %>%
  mutate(
    datetime = coalesce(
      suppressWarnings(mdy_hms(datetime_raw, tz = "America/Halifax")),
      suppressWarnings(ymd_hms(datetime_raw, tz = "America/Halifax"))
    )
  )

# Number of records whose timestamp matched neither format.
n_failed <- length(which(is.na(collision_clean$datetime)))
cat("Datetime parse failures:", n_failed, "of", nrow(collision_clean), "\n")
Datetime parse failures: 0 of 5000 
# Escalate when more than 5% of timestamps could not be parsed. The row-count
# guard keeps an empty table (0 / 0 = NaN) from making the `if` condition fail.
n_rows_parsed <- nrow(collision_clean)
if (n_rows_parsed > 0 && n_failed / n_rows_parsed > 0.05) {
  warning("More than 5% of datetime values failed to parse. Check datetime_raw format.")
}

3.3 Derive temporal features

# Derive calendar and time-of-day features from the parsed collision timestamp.
collision_clean <- collision_clean %>%
  mutate(
    month        = month(datetime),
    hour         = hour(datetime),
    # week_start = 7 pins Sunday = 1 … Saturday = 7, so the weekend test below
    # does not shift if the `lubridate.week.start` option is changed elsewhere.
    day_of_week  = wday(datetime, label = TRUE, abbr = FALSE, week_start = 7),
    is_weekend   = as.integer(wday(datetime, week_start = 7) %in% c(1, 7)),
    # Holiday proxy: July, August, December peak travel months
    is_holiday_period = as.integer(month %in% c(7L, 8L, 12L)),
    # Night-time: 20:00–05:59 (hour <= 5 includes the whole 5 o'clock hour)
    is_night     = as.integer(hour >= 20 | hour <= 5)
  )

3.4 Validate and recode severity

# Severity coding contract: 0 = non-severe, 1 = severe.
# Collect the distinct codes present so the audit output shows them explicitly.
sev_vals <- collision_clean %>%
  distinct(severity_raw) %>%
  pull(severity_raw)
cat("Unique severity_raw values:", paste(sort(sev_vals), collapse = ", "), "\n")
Unique severity_raw values: 0, 1 
# Any code other than 0/1 is reported (the pipeline continues with a warning).
unexpected_sev <- setdiff(sev_vals, c(0L, 1L))
if (length(unexpected_sev) > 0) {
  warning("Unexpected values in severity_raw: ",
          paste(unexpected_sev, collapse = ", "))
}

# Labelled factor version of the outcome: 0 -> "No", 1 -> "Yes".
collision_clean <- mutate(
  collision_clean,
  severe = factor(severity_raw, levels = c(0, 1), labels = c("No", "Yes"))
)

# Share of records coded as severe, ignoring missing codes.
severe_rate <- collision_clean %>%
  summarise(rate = mean(severity_raw == 1, na.rm = TRUE)) %>%
  pull(rate)
cat(sprintf("Severe collision rate: %.1f%%\n", severe_rate * 100))
Severe collision rate: 21.3%

3.5 Road type standardisation

# Collapse free-text road classes into highway / arterial / local / other.
# Matching runs on trimmed, lower-cased text; the first matching pattern wins
# and anything unmatched becomes "other".
collision_clean <- collision_clean %>%
  mutate(road_type = str_to_lower(str_trim(road_type))) %>%
  mutate(
    road_type = case_when(
      str_detect(road_type, "highway|hwy|provincial")  ~ "highway",
      str_detect(road_type, "arterial|major")          ~ "arterial",
      str_detect(road_type, "local|residential")       ~ "local",
      TRUE                                              ~ "other"
    )
  )

# Record count and percentage share per standardised road class.
road_dist <- count(collision_clean, road_type)
road_dist <- mutate(road_dist, share = percent(n / sum(n), accuracy = 0.1))

kable_styling(
  kbl(road_dist,
      caption = "Road type distribution after standardisation",
      col.names = c("Road Type", "Count", "Share")),
  full_width = FALSE
)
Road type distribution after standardisation
Road Type Count Share
arterial 1730 34.6%
highway 2653 53.1%
local 617 12.3%

4 Cleaning Pass — Traffic Exposure

# Column names are normalised in both modes.
# Real data: replace with your actual AADT/VMT cleaning logic
traffic_clean <- janitor::clean_names(traffic_raw)

if (use_synthetic) {
  # Synthetic mode only: keep non-missing, non-negative exposure values ...
  traffic_clean <- traffic_clean %>%
    filter(!is.na(vmt), vmt >= 0)
  # ... and cap extreme VMT outliers at the 99.5th percentile of what remains.
  traffic_clean <- traffic_clean %>%
    mutate(vmt = pmin(vmt, quantile(vmt, 0.995)))
}

cat("Traffic rows after cleaning:", nrow(traffic_clean), "\n")
Traffic rows after cleaning: 2400 
cat("VMT summary:\n")
VMT summary:
print(summary(traffic_clean$vmt))
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
     5845   4293192  12427862  22156321  30258517 156973973 

5 Cleaning Pass — Weather Data

# Condition codes treated as hazardous. The synthetic table uses plain-word
# labels; the real-data branch uses the coded values listed below.
severe_wx_codes <- if (use_synthetic) {
  c("snow", "fog", "freezing_rain")
} else {
  c("SN", "FG", "FZRA", "IC", "PE")
}

# is_severe_weather: integer 0/1 flag for hazardous conditions.
# Canonical name is is_severe_weather (not severe_weather — see VCM correction log)
weather_clean <- janitor::clean_names(weather_raw) %>%
  mutate(is_severe_weather = as.integer(wx_condition_code %in% severe_wx_codes))

if (use_synthetic) {
  # Synthetic mode only: cap physically implausible values.
  weather_clean <- weather_clean %>%
    mutate(
      precipitation_mm = pmax(precipitation_mm, 0),
      visibility_km    = pmin(pmax(visibility_km, 0), 50),
      wind_kph         = pmax(wind_kph, 0)
    )
}

cat("Weather rows after cleaning:", nrow(weather_clean), "\n")
Weather rows after cleaning: 5000 
has_temp <- "temp_c" %in% names(weather_clean)
cat("temp_c present:", has_temp, "\n")
temp_c present: TRUE 
Important

Naming convention audit note: The column is_severe_weather is the canonical name used throughout this pipeline. An earlier version of the Python pipeline contained a duplicate column called severe_weather. That duplicate was removed. Only is_severe_weather (integer 0/1) appears in the feature set passed to the models in VCM_Model_Validation_Final.qmd.


6 Three-Table Merge

# Attach traffic exposure and weather to every collision record.
# Both joins are lookups, so the row count must stay equal to the number of
# collisions. If it changes, the lookup table has duplicate keys and collisions
# would be double-counted downstream, so the merge stops instead.
n_collisions <- nrow(collision_clean)

# Step 1: collision × traffic (join key: location_id + month)
df_merged <- collision_clean %>%
  left_join(
    traffic_clean %>% select(location_id, month, vmt),
    by = c("location_id", "month")
  )

if (nrow(df_merged) != n_collisions) {
  stop("Traffic join changed the row count (", n_collisions, " -> ",
       nrow(df_merged), "): traffic_clean has duplicate location_id + month keys.",
       call. = FALSE)
}

# Step 2: × weather (join key: location_id + datetime_raw for synthetic;
#          in real data this is a spatial-temporal fuzzy join by nearest station)
df_merged <- df_merged %>%
  left_join(
    weather_clean %>% select(location_id, datetime_raw,
                             precipitation_mm, visibility_km,
                             wind_kph, temp_c, is_severe_weather),
    by = c("location_id", "datetime_raw")
  )

if (nrow(df_merged) != n_collisions) {
  stop("Weather join changed the row count (", n_collisions, " -> ",
       nrow(df_merged), "): weather_clean has duplicate location_id + datetime_raw keys.",
       call. = FALSE)
}

# Step 3: drop records missing essential modelling inputs
df_merged <- df_merged %>%
  filter(!is.na(vmt), !is.na(precipitation_mm), !is.na(severe))

cat("Post-merge row count:", nrow(df_merged), "\n")
Post-merge row count: 5000 
cat("Post-merge col count:", ncol(df_merged), "\n")
Post-merge col count: 20 
cat("Missing values by column:\n")
Missing values by column:
# Count NAs per column, reshape to one row per column, and keep only the
# columns that still contain missing values.
missing_by_column <- df_merged %>%
  summarise(across(everything(), function(col) sum(is.na(col)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  filter(n_missing > 0)

kable_styling(
  kbl(missing_by_column, caption = "Remaining missingness after merge"),
  full_width = FALSE
)
Remaining missingness after merge
column n_missing

7 Feature Engineering

# Speed proxy from road class; classes not listed here fall back to 50
# (this covers "other").
speed_by_road <- c(highway = 100L, arterial = 50L, local = 30L)

df_features <- df_merged %>%
  mutate(
    # Interaction terms (traffic × adverse conditions)
    traffic_x_precip     = vmt * precipitation_mm,
    traffic_x_visibility = vmt * visibility_km,
    speed_proxy          = coalesce(
      unname(speed_by_road[as.character(road_type)]),
      50L
    )
  )

# Model input columns, in the order they are handed to the models.
FEATURES <- c(
  "vmt", "precipitation_mm", "visibility_km", "wind_kph",
  "temp_c", "speed_proxy",
  "traffic_x_precip", "traffic_x_visibility",
  "is_severe_weather", "is_holiday_period", "is_night"
)

# Final model-ready frame: id + features + both outcome encodings, restricted
# to rows where every feature is present.
df_model <- df_features %>%
  select(collision_id, all_of(FEATURES), severe, severity_raw)
df_model <- drop_na(df_model, all_of(FEATURES))

cat("Final model-ready dataset:", nrow(df_model), "rows ×",
    length(FEATURES), "features\n")
Final model-ready dataset: 5000 rows × 11 features

Feature summary table

# Summary statistics applied to every feature, each rounded to 3 decimals.
summary_fns <- list(
  mean = function(x) round(mean(x, na.rm = TRUE), 3),
  sd   = function(x) round(sd(x,   na.rm = TRUE), 3),
  min  = function(x) round(min(x,  na.rm = TRUE), 3),
  max  = function(x) round(max(x,  na.rm = TRUE), 3)
)

# Wide one-row summary -> long (feature, stat, value) -> one row per feature.
# The names are split at the LAST underscore, so feature names that themselves
# contain underscores stay intact.
feature_stats <- df_model %>%
  select(all_of(FEATURES)) %>%
  summarise(across(everything(), summary_fns, .names = "{.col}_{.fn}")) %>%
  pivot_longer(everything(),
               names_to  = c("feature", "stat"),
               names_sep = "_(?=[^_]+$)") %>%
  pivot_wider(names_from = stat, values_from = value)

kable_styling(
  kbl(feature_stats, caption = "Feature descriptive statistics (post-cleaning)"),
  full_width = FALSE
)
Feature descriptive statistics (post-cleaning)
feature mean sd min max
vmt 22683502.466 27251005.842 5844.621 156973972.877
precipitation_mm 1.528 1.537 0.000 14.199
visibility_km 15.106 4.991 0.500 30.000
wind_kph 21.540 13.376 0.002 80.996
temp_c 5.161 12.090 -38.804 49.630
speed_proxy 74.062 28.232 30.000 100.000
traffic_x_precip 34327752.837 65742497.092 544.510 1072849589.950
traffic_x_visibility 343022061.016 449989059.751 73679.189 4074067394.046
is_severe_weather 0.251 0.433 0.000 1.000
is_holiday_period 0.223 0.416 0.000 1.000
is_night 0.250 0.433 0.000 1.000

8 The Six Driving Archetypes

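The six driving archetypes are condition-partitioned behavioral regimes: three complementary pairs of operating contexts (severe vs. clear weather, holiday vs. non-holiday period, night vs. day) under which separate XGBoost sub-models are trained in VCM_Model_Validation_Final.qmd. The two archetypes within a pair are mutually exclusive, but the pairs overlap — every collision record belongs to exactly one archetype from each pair, i.e. to three archetypes in total.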

Each archetype isolates a different population of collision records defined by environmental and temporal context — a proxy for the underlying driver behaviour environment. The archetypes are not K-means clusters; they are engineered binary partitions derived from the flag variables constructed above.

# Six archetypes = three 0/1 flag columns, each split into its "1" side and its
# "0" side. Every entry stores the flag column (`col`) and the value to filter
# on (`val`), keyed by the archetype's display label.
archetypes <- local({
  labels <- c(
    "Archetype 1 — Severe Weather",
    "Archetype 2 — Clear Conditions",
    "Archetype 3 — Holiday Period",
    "Archetype 4 — Non-Holiday",
    "Archetype 5 — Night-time",
    "Archetype 6 — Daytime"
  )
  flag_cols <- rep(c("is_severe_weather", "is_holiday_period", "is_night"), each = 2)
  flag_vals <- rep(c(1L, 0L), times = 3)

  specs <- Map(function(col, val) list(col = col, val = val), flag_cols, flag_vals)
  setNames(specs, labels)
})

8.1 Archetype row counts and severe rates

# One summary row per archetype: record count, share of the model-ready data,
# severe rate, and the filter expression that defines the archetype.
archetype_profile <- archetypes %>%
  imap(function(spec, aname) {
    members   <- df_model %>% filter(.data[[spec$col]] == spec$val)
    n_members <- nrow(members)
    tibble(
      Archetype       = aname,
      n               = n_members,
      `Share of Data` = percent(n_members / nrow(df_model), accuracy = 0.1),
      `Severe Rate`   = percent(mean(members$severity_raw == 1, na.rm = TRUE), accuracy = 0.1),
      `Partition Key` = paste0(spec$col, " == ", spec$val)
    )
  }) %>%
  bind_rows()

# Rows whose label matches one of these patterns get a shaded background.
highlight_rows <- which(
  str_detect(archetype_profile$Archetype, "Severe Weather|Night|Holiday Period")
)

archetype_profile %>%
  kbl(caption = "Six driving archetypes — record counts and severe rates") %>%
  kable_styling(full_width = FALSE) %>%
  row_spec(highlight_rows, background = "#FFF3E0")
Six driving archetypes — record counts and severe rates
Archetype n Share of Data Severe Rate Partition Key
Archetype 1 — Severe Weather 1253 25.1% 21.3% is_severe_weather == 1
Archetype 2 — Clear Conditions 3747 74.9% 21.3% is_severe_weather == 0
Archetype 3 — Holiday Period 1116 22.3% 21.5% is_holiday_period == 1
Archetype 4 — Non-Holiday 3884 77.7% 21.2% is_holiday_period == 0
Archetype 5 — Night-time 1250 25.0% 20.5% is_night == 1
Archetype 6 — Daytime 3750 75.0% 21.5% is_night == 0

8.2 Archetype severe-rate comparison plot

# Severe share and record count for each archetype (one row per archetype).
archetype_rates <- archetypes %>%
  imap(function(spec, aname) {
    members <- df_model %>% filter(.data[[spec$col]] == spec$val)
    tibble(
      archetype   = aname,
      severe_rate = mean(members$severity_raw == 1, na.rm = TRUE),
      n           = nrow(members)
    )
  }) %>%
  bind_rows()

# Benchmark: severe share across the whole model-ready dataset.
overall_rate  <- mean(df_model$severity_raw == 1, na.rm = TRUE)
overall_label <- paste0("Overall: ", percent(overall_rate, accuracy = 0.1))

# Horizontal bars ordered by severe rate; bar colour encodes whether the
# archetype sits above or below the overall benchmark (dashed line).
ggplot(
  archetype_rates,
  aes(
    x    = reorder(str_wrap(archetype, 20), severe_rate),
    y    = severe_rate,
    fill = severe_rate > overall_rate
  )
) +
  geom_col(show.legend = FALSE, width = 0.65) +
  geom_hline(
    yintercept = overall_rate,
    linetype   = "dashed",
    colour     = "#D7191C",
    linewidth  = 0.8
  ) +
  annotate(
    "text",
    x = 0.6, y = overall_rate,
    label = overall_label,
    hjust = 0, vjust = -0.5, colour = "#D7191C", size = 3.2
  ) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_fill_manual(values = c("FALSE" = "#1565C0", "TRUE" = "#D7191C")) +
  coord_flip() +
  labs(
    title    = "Severe Collision Rate by Driving Archetype",
    subtitle = "Red bars exceed the overall severe rate; blue bars fall below it.",
    x        = NULL,
    y        = "Severe Collision Rate"
  )

Severe collision rate by driving archetype

8.3 Archetype partition logic (audit trail)

The table below documents exactly how each archetype flag is constructed so Rachel (or any downstream auditor) can trace the lineage from raw source fields to model-input partitions.

# Plain-language lineage for each archetype, listed in the same order as the
# entries of `archetypes`.
derivations <- c(
  "wx_condition_code %in% snow/fog/freezing_rain → is_severe_weather",
  "wx_condition_code NOT in snow/fog/freezing_rain",
  "month %in% c(7, 8, 12) → is_holiday_period",
  "month NOT in c(7, 8, 12)",
  "hour >= 20 | hour <= 5 → is_night",
  "hour > 5 & hour < 20"
)

audit_trail <- tibble(
  Archetype       = names(archetypes),
  `Source Column` = map_chr(archetypes, function(spec) spec$col),
  `Filter Value`  = map_chr(archetypes, function(spec) as.character(spec$val)),
  `Derived From`  = derivations
)

column_spec(
  kable_styling(
    kbl(audit_trail, caption = "Archetype construction — full audit trail"),
    full_width = FALSE
  ),
  4, width = "12cm"
)
Archetype construction — full audit trail
Archetype Source Column Filter Value Derived From
Archetype 1 — Severe Weather is_severe_weather 1 wx_condition_code %in% snow/fog/freezing_rain → is_severe_weather
Archetype 2 — Clear Conditions is_severe_weather 0 wx_condition_code NOT in snow/fog/freezing_rain
Archetype 3 — Holiday Period is_holiday_period 1 month %in% c(7, 8, 12) → is_holiday_period
Archetype 4 — Non-Holiday is_holiday_period 0 month NOT in c(7, 8, 12)
Archetype 5 — Night-time is_night 1 hour >= 20 | hour <= 5 → is_night
Archetype 6 — Daytime is_night 0 hour > 5 & hour < 20

9 Train / Test Split

The 80/20 stratified split used in VCM_Model_Validation_Final.qmd is reproduced here so the cleaned dataset and splits are verifiable end-to-end.

library(caret)

# Re-seed immediately before the split so the partition does not depend on how
# many random draws the earlier sections consumed.
set.seed(42)
# Draw the training indices against the `severe` outcome with p = 0.80;
# list = FALSE asks caret for an index matrix rather than a list.
train_idx   <- createDataPartition(df_model$severe, p = 0.80, list = FALSE)
train_clean <- df_model[ train_idx, ]   # rows selected for training
test_clean  <- df_model[-train_idx, ]   # every remaining row

cat("Train set:", nrow(train_clean), "rows |",
    "Severe:", percent(mean(train_clean$severity_raw == 1), accuracy = 0.1), "\n")
Train set: 4001 rows | Severe: 21.3% 
cat("Test set :", nrow(test_clean),  "rows |",
    "Severe:", percent(mean(test_clean$severity_raw  == 1), accuracy = 0.1), "\n")
Test set : 999 rows | Severe: 21.2% 
Note

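Stratification keeps the severe rate essentially equal in both splits (21.3% in train and 21.2% in test, against 21.3% overall in this synthetic sample, which was drawn with a 21.8% target probability), preventing the test set from being accidentally all-non-severe or heavily imbalanced relative to the training set.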


10 Save Cleaned Outputs

# Write the cleaned, feature-engineered dataset so VCM_Model_Validation_Final.qmd
# can consume it via the NS_ENRICHED_CSV env var.
#
# NS_OUTPUT_DIR selects the destination folder. When it is unset or empty the
# files go to the R session's temporary directory, which does not persist after
# the session ends — set NS_OUTPUT_DIR to keep the outputs.

out_dir <- Sys.getenv("NS_OUTPUT_DIR", unset = tempdir())
if (!nzchar(out_dir)) {
  # An empty value would make file.path() build paths at the filesystem root.
  out_dir <- tempdir()
}

# A configured folder may not exist yet; create it so write_csv() does not
# fail on a missing parent directory.
if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
}

write_csv(df_model,     file.path(out_dir, "ns_vcm_model_ready.csv"))
write_csv(train_clean,  file.path(out_dir, "ns_vcm_train.csv"))
write_csv(test_clean,   file.path(out_dir, "ns_vcm_test.csv"))

cat("Outputs written to:", out_dir, "\n")
Outputs written to: C:\Users\gshk0\AppData\Local\Temp\RtmpGqWbYR 
cat("  ns_vcm_model_ready.csv  —", nrow(df_model),    "rows (full cleaned set)\n")
  ns_vcm_model_ready.csv  — 5000 rows (full cleaned set)
cat("  ns_vcm_train.csv        —", nrow(train_clean),  "rows (80% train)\n")
  ns_vcm_train.csv        — 4001 rows (80% train)
cat("  ns_vcm_test.csv         —", nrow(test_clean),   "rows (20% test)\n")
  ns_vcm_test.csv         — 999 rows (20% test)
cat("\nTo use in VCM_Model_Validation_Final.qmd:\n")

To use in VCM_Model_Validation_Final.qmd:
# Print the path with forward slashes so the suggested call can be pasted into R
# on Windows as well: backslashes inside a quoted string are read as escape
# sequences (e.g. "\U"), which makes the pasted line fail to parse.
cat("  Sys.setenv(NS_ENRICHED_CSV = '",
    normalizePath(file.path(out_dir, "ns_vcm_model_ready.csv"),
                  winslash = "/", mustWork = FALSE),
    "')\n", sep = "")
  Sys.setenv(NS_ENRICHED_CSV = 'C:\Users\gshk0\AppData\Local\Temp\RtmpGqWbYR/ns_vcm_model_ready.csv')

11 Compliance Cross-Reference

The table below maps each cleaning step in this QMD to the corresponding section in VCM_Model_Validation_Final.qmd that depends on it.

# Each row pairs an ETL step in this file with the part of
# VCM_Model_Validation_Final.qmd that depends on it.
compliance_map <- tribble(
  ~`ETL Step (this file)`,                           ~`Depends On in VCM_Model_Validation_Final.qmd`,
  "§3  Collision cleaning",                          "setup-data chunk — analysis_df construction",
  "§3.2  Datetime parse",                            "datetime_raw → datetime parse in setup chunk",
  "§3.4  Severity recode",                           "severity_raw == 1 logic throughout",
  "§4  Traffic exposure cleaning",                   "vmt feature in FEATURES vector",
  "§5  Weather cleaning — is_severe_weather naming", "is_severe_weather in FEATURES; Archetype 1/2 regime sub-models",
  "§6  Three-table merge",                           "analysis_df is the merged frame",
  "§7  Feature engineering",                         "FEATURES vector in model training chunks",
  "§8  Six archetypes defined",                      "Section 10/11 — Behavioral Regime Sub-Models",
  "§9  Train/test split (seed = 42)",                "createDataPartition(seed=42) in train/test split chunk"
)

compliance_table <- kbl(compliance_map, caption = "ETL-to-model compliance cross-reference")
compliance_table <- kable_styling(compliance_table, full_width = FALSE)
compliance_table <- column_spec(compliance_table, 1, bold = TRUE, width = "8cm")
column_spec(compliance_table, 2, width = "10cm")
ETL-to-model compliance cross-reference
ETL Step (this file) Depends On in VCM_Model_Validation_Final.qmd
§3 Collision cleaning setup-data chunk — analysis_df construction
§3.2 Datetime parse datetime_raw → datetime parse in setup chunk
§3.4 Severity recode severity_raw == 1 logic throughout
§4 Traffic exposure cleaning vmt feature in FEATURES vector
§5 Weather cleaning — is_severe_weather naming is_severe_weather in FEATURES; Archetype 1/2 regime sub-models
§6 Three-table merge analysis_df is the merged frame
§7 Feature engineering FEATURES vector in model training chunks
§8 Six archetypes defined Section 10/11 — Behavioral Regime Sub-Models
§9 Train/test split (seed = 42) createDataPartition(seed=42) in train/test split chunk

12 LLM Usage Disclosure

Claude was used to help structure the R/Quarto ETL workflow, generate the audit trail table, and refine interpretive phrasing. All final analytical decisions, data field mappings, and archetype definitions were reviewed by the authors.