library(tidyverse)
library(readr)
library(janitor)
library(lubridate)
library(knitr)
library(kableExtra)
library(scales)
set.seed(42) # all stochastic steps use this seed
options(scipen = 999)
Sys.setenv(VROOM_CONNECTION_SIZE = 5000000)
theme_set(theme_minimal(base_size = 12))NS Road Safety — ETL, Data Cleaning & Driving Archetype Construction
Audit Reference: VCM_Model_Validation_Final.qmd
Audit Reference: This document is the upstream ETL companion to VCM_Model_Validation_Final.qmd. Every transformation applied here flows directly into the analysis_df object consumed by that report. Rachel (or any auditor) can verify that the cleaned dataset and the six driving archetypes were produced reproducibly and without data leakage.
1 Package Setup & Reproducibility Seed
Why seed = 42? The modeling pipeline (VCM_Model_Validation_Final.qmd) was built and validated against synthetic data generated with set.seed(42). Keeping the same seed here ensures that any stochastic imputation, sampling, or regime-flag construction is perfectly reproducible across renders.
2 Raw Data Ingest
2.1 Data path configuration
# ── Set via environment variable or replace the fallback path directly ─────────
# Example: Sys.setenv(NS_ENRICHED_CSV = "/data/ns_collisions_enriched.csv")
# The enriched file must contain weather-joined columns:
# temp_c, wind_kph, precipitation_mm, visibility_km
# The raw provincial collision file must contain:
# severity_raw (1 = severe, 0 = non-severe), datetime_raw, location_id, road_type
enriched_path <- Sys.getenv("NS_ENRICHED_CSV", unset = "")
raw_path <- Sys.getenv("NS_RAW_CSV", unset = "")
use_synthetic <- nchar(enriched_path) == 0 || !file.exists(enriched_path)
if (use_synthetic) {
message("⚠ No enriched CSV found — generating reproducible synthetic dataset (SEED = 42).")
message(" Set NS_ENRICHED_CSV env var to use real data.")
} else {
message("✅ Enriched CSV found: ", enriched_path)
}2.2 Load or synthesise raw tables
if (use_synthetic) {
# ── Table 1: Collision records ──────────────────────────────────────────────
n <- 5000
collision_raw <- tibble(
collision_id = 1:n,
datetime_raw = format(
seq.POSIXt(as.POSIXct("2020-01-01"), by = "6 hours", length.out = n),
"%m/%d/%Y %H:%M:%S"
),
location_id = sample(1:200, n, replace = TRUE),
road_type = sample(c("highway", "arterial", "local"), n, replace = TRUE,
prob = c(0.536, 0.34, 0.124)), # 53.6% highway — matches dashboard
light_cond = sample(c("daylight", "dark", "dusk"), n, replace = TRUE,
prob = c(0.60, 0.30, 0.10)),
severity_raw = sample(c(0L, 1L), n, replace = TRUE, prob = c(0.782, 0.218)) # 21.8% severe
)
# ── Table 2: Traffic exposure (VMT proxy via AADT) ──────────────────────────
traffic_raw <- tibble(
location_id = rep(1:200, 12),
month = rep(1:12, each = 200),
aadt = round(rexp(2400, rate = 1 / 8000)),
seg_len_km = runif(2400, 0.5, 15)
) %>%
mutate(vmt = aadt * seg_len_km * 365)
# ── Table 3: Weather observations ──────────────────────────────────────────
weather_raw <- tibble(
location_id = collision_raw$location_id,
datetime_raw = collision_raw$datetime_raw,
precipitation_mm = rexp(n, rate = 1 / 1.5),
visibility_km = pmin(pmax(rnorm(n, 15, 5), 0.5), 30),
wind_kph = abs(rnorm(n, 20, 15)),
temp_c = rnorm(n, 5, 12),
wx_condition_code = sample(
c("clear", "rain", "snow", "fog", "freezing_rain"),
n, replace = TRUE, prob = c(0.55, 0.20, 0.15, 0.06, 0.04)
)
)
cat("Synthetic tables generated:\n")
cat(" collision_raw :", nrow(collision_raw), "rows ×", ncol(collision_raw), "cols\n")
cat(" traffic_raw :", nrow(traffic_raw), "rows ×", ncol(traffic_raw), "cols\n")
cat(" weather_raw :", nrow(weather_raw), "rows ×", ncol(weather_raw), "cols\n")
} else {
collision_raw <- read_csv(enriched_path, show_col_types = FALSE)
if (nchar(raw_path) > 0 && file.exists(raw_path)) {
traffic_raw <- read_csv(raw_path, show_col_types = FALSE)
}
cat("Real data loaded from:", enriched_path, "\n")
}Synthetic tables generated:
collision_raw : 5000 rows × 6 cols
traffic_raw : 2400 rows × 5 cols
weather_raw : 5000 rows × 7 cols
3 Cleaning Pass — Collision Records
3.1 Standardise column names
collision_clean <- janitor::clean_names(collision_raw)
cat("Column names after clean_names():\n")Column names after clean_names():
cat(paste(names(collision_clean), collapse = ", "), "\n")collision_id, datetime_raw, location_id, road_type, light_cond, severity_raw
3.2 Parse datetime
collision_clean <- collision_clean %>%
mutate(
datetime = suppressWarnings(mdy_hms(datetime_raw, tz = "America/Halifax")),
# Fallback: try ymd_hms if mdy_hms returns all NA
datetime = if_else(
is.na(datetime),
suppressWarnings(ymd_hms(datetime_raw, tz = "America/Halifax")),
datetime
)
)
n_failed <- sum(is.na(collision_clean$datetime))
cat("Datetime parse failures:", n_failed, "of", nrow(collision_clean), "\n")Datetime parse failures: 0 of 5000
if (n_failed / nrow(collision_clean) > 0.05) {
warning("More than 5% of datetime values failed to parse. Check datetime_raw format.")
}3.3 Derive temporal features
collision_clean <- collision_clean %>%
mutate(
month = month(datetime),
hour = hour(datetime),
day_of_week = wday(datetime, label = TRUE, abbr = FALSE),
is_weekend = as.integer(wday(datetime) %in% c(1, 7)),
# Holiday proxy: July, August, December peak travel months
is_holiday_period = as.integer(month %in% c(7L, 8L, 12L)),
# Night-time: 8pm–5am
is_night = as.integer(hour >= 20 | hour <= 5)
)3.4 Validate and recode severity
# Expected: 0 = non-severe, 1 = severe
# Audit check: confirm no unexpected values
sev_vals <- unique(collision_clean$severity_raw)
cat("Unique severity_raw values:", paste(sort(sev_vals), collapse = ", "), "\n")Unique severity_raw values: 0, 1
if (!all(sev_vals %in% c(0L, 1L))) {
warning("Unexpected values in severity_raw: ",
paste(setdiff(sev_vals, c(0L, 1L)), collapse = ", "))
}
collision_clean <- collision_clean %>%
mutate(
severe = factor(severity_raw, levels = c(0, 1), labels = c("No", "Yes"))
)
severe_rate <- mean(collision_clean$severity_raw == 1, na.rm = TRUE)
cat(sprintf("Severe collision rate: %.1f%%\n", severe_rate * 100))Severe collision rate: 21.3%
3.5 Road type standardisation
collision_clean <- collision_clean %>%
mutate(
road_type = str_to_lower(str_trim(road_type)),
road_type = case_when(
str_detect(road_type, "highway|hwy|provincial") ~ "highway",
str_detect(road_type, "arterial|major") ~ "arterial",
str_detect(road_type, "local|residential") ~ "local",
TRUE ~ "other"
)
)
road_dist <- collision_clean %>%
count(road_type) %>%
mutate(share = percent(n / sum(n), accuracy = 0.1))
road_dist %>%
kbl(caption = "Road type distribution after standardisation",
col.names = c("Road Type", "Count", "Share")) %>%
kable_styling(full_width = FALSE)| Road Type | Count | Share |
|---|---|---|
| arterial | 1730 | 34.6% |
| highway | 2653 | 53.1% |
| local | 617 | 12.3% |
4 Cleaning Pass — Traffic Exposure
if (use_synthetic) {
traffic_clean <- janitor::clean_names(traffic_raw) %>%
filter(!is.na(vmt), vmt >= 0) %>%
# Cap extreme VMT outliers at 99.5th percentile
mutate(vmt = pmin(vmt, quantile(vmt, 0.995)))
} else {
# Real data: replace with your actual AADT/VMT cleaning logic
traffic_clean <- janitor::clean_names(traffic_raw)
}
cat("Traffic rows after cleaning:", nrow(traffic_clean), "\n")Traffic rows after cleaning: 2400
cat("VMT summary:\n")VMT summary:
print(summary(traffic_clean$vmt)) Min. 1st Qu. Median Mean 3rd Qu. Max.
5845 4293192 12427862 22156321 30258517 156973973
5 Cleaning Pass — Weather Data
if (use_synthetic) {
weather_clean <- janitor::clean_names(weather_raw) %>%
mutate(
# is_severe_weather: flag hazardous conditions
# Canonical name is is_severe_weather (not severe_weather — see VCM correction log)
is_severe_weather = as.integer(
wx_condition_code %in% c("snow", "fog", "freezing_rain")
),
# Cap physically implausible values
precipitation_mm = pmax(precipitation_mm, 0),
visibility_km = pmin(pmax(visibility_km, 0), 50),
wind_kph = pmax(wind_kph, 0)
)
} else {
weather_clean <- janitor::clean_names(weather_raw) %>%
mutate(
is_severe_weather = as.integer(
wx_condition_code %in% c("SN", "FG", "FZRA", "IC", "PE")
)
)
}
cat("Weather rows after cleaning:", nrow(weather_clean), "\n")Weather rows after cleaning: 5000
has_temp <- "temp_c" %in% names(weather_clean)
cat("temp_c present:", has_temp, "\n")temp_c present: TRUE
Naming convention audit note: The column is_severe_weather is the canonical name used throughout this pipeline. An earlier version of the Python pipeline contained a duplicate column called severe_weather. That duplicate was removed. Only is_severe_weather (integer 0/1) appears in the feature set passed to the models in VCM_Model_Validation_Final.qmd.
6 Three-Table Merge
# Step 1: collision × traffic (join key: location_id + month)
df_merged <- collision_clean %>%
left_join(
traffic_clean %>% select(location_id, month, vmt),
by = c("location_id", "month")
)
# Step 2: × weather (join key: location_id + datetime_raw for synthetic;
# in real data this is a spatial-temporal fuzzy join by nearest station)
df_merged <- df_merged %>%
left_join(
weather_clean %>% select(location_id, datetime_raw,
precipitation_mm, visibility_km,
wind_kph, temp_c, is_severe_weather),
by = c("location_id", "datetime_raw")
)
# Step 3: drop records missing essential modelling inputs
df_merged <- df_merged %>%
filter(!is.na(vmt), !is.na(precipitation_mm), !is.na(severe))
cat("Post-merge row count:", nrow(df_merged), "\n")Post-merge row count: 5000
cat("Post-merge col count:", ncol(df_merged), "\n")Post-merge col count: 20
cat("Missing values by column:\n")Missing values by column:
df_merged %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
filter(n_missing > 0) %>%
kbl(caption = "Remaining missingness after merge") %>%
kable_styling(full_width = FALSE)| column | n_missing |
|---|---|
7 Feature Engineering
df_features <- df_merged %>%
mutate(
# Interaction terms (traffic × adverse conditions)
traffic_x_precip = vmt * precipitation_mm,
traffic_x_visibility = vmt * visibility_km,
# Speed proxy from road class
speed_proxy = case_when(
road_type == "highway" ~ 100L,
road_type == "arterial" ~ 50L,
road_type == "local" ~ 30L,
TRUE ~ 50L # fallback for "other"
)
)
FEATURES <- c(
"vmt", "precipitation_mm", "visibility_km", "wind_kph",
"temp_c", "speed_proxy",
"traffic_x_precip", "traffic_x_visibility",
"is_severe_weather", "is_holiday_period", "is_night"
)
# Final model-ready frame
df_model <- df_features %>%
select(collision_id, all_of(FEATURES), severe, severity_raw) %>%
drop_na(all_of(FEATURES))
cat("Final model-ready dataset:", nrow(df_model), "rows ×",
length(FEATURES), "features\n")Final model-ready dataset: 5000 rows × 11 features
Feature summary table
df_model %>%
select(all_of(FEATURES)) %>%
summarise(across(everything(),
list(mean = ~ round(mean(., na.rm = TRUE), 3),
sd = ~ round(sd(., na.rm = TRUE), 3),
min = ~ round(min(., na.rm = TRUE), 3),
max = ~ round(max(., na.rm = TRUE), 3)),
.names = "{.col}_{.fn}"
)) %>%
pivot_longer(everything(),
names_to = c("feature", "stat"),
names_sep = "_(?=[^_]+$)") %>%
pivot_wider(names_from = stat, values_from = value) %>%
kbl(caption = "Feature descriptive statistics (post-cleaning)") %>%
kable_styling(full_width = FALSE)| feature | mean | sd | min | max |
|---|---|---|---|---|
| vmt | 22683502.466 | 27251005.842 | 5844.621 | 156973972.877 |
| precipitation_mm | 1.528 | 1.537 | 0.000 | 14.199 |
| visibility_km | 15.106 | 4.991 | 0.500 | 30.000 |
| wind_kph | 21.540 | 13.376 | 0.002 | 80.996 |
| temp_c | 5.161 | 12.090 | -38.804 | 49.630 |
| speed_proxy | 74.062 | 28.232 | 30.000 | 100.000 |
| traffic_x_precip | 34327752.837 | 65742497.092 | 544.510 | 1072849589.950 |
| traffic_x_visibility | 343022061.016 | 449989059.751 | 73679.189 | 4074067394.046 |
| is_severe_weather | 0.251 | 0.433 | 0.000 | 1.000 |
| is_holiday_period | 0.223 | 0.416 | 0.000 | 1.000 |
| is_night | 0.250 | 0.433 | 0.000 | 1.000 |
8 The Six Driving Archetypes
The six driving archetypes are condition-partitioned behavioral regimes. They represent the six mutually exclusive (or nearly so) operating contexts under which separate XGBoost sub-models are trained in VCM_Model_Validation_Final.qmd.
Each archetype isolates a different population of collision records defined by environmental and temporal context — a proxy for the underlying driver behaviour environment. The archetypes are not K-means clusters; they are engineered binary partitions derived from the flag variables constructed above.
archetypes <- list(
"Archetype 1 — Severe Weather" = list(col = "is_severe_weather", val = 1L),
"Archetype 2 — Clear Conditions" = list(col = "is_severe_weather", val = 0L),
"Archetype 3 — Holiday Period" = list(col = "is_holiday_period", val = 1L),
"Archetype 4 — Non-Holiday" = list(col = "is_holiday_period", val = 0L),
"Archetype 5 — Night-time" = list(col = "is_night", val = 1L),
"Archetype 6 — Daytime" = list(col = "is_night", val = 0L)
)8.1 Archetype row counts and severe rates
archetype_profile <- map_dfr(names(archetypes), function(aname) {
spec <- archetypes[[aname]]
rows <- df_model %>% filter(.data[[spec$col]] == spec$val)
tibble(
Archetype = aname,
n = nrow(rows),
`Share of Data` = percent(nrow(rows) / nrow(df_model), accuracy = 0.1),
`Severe Rate` = percent(mean(rows$severity_raw == 1, na.rm = TRUE), accuracy = 0.1),
`Partition Key` = paste0(spec$col, " == ", spec$val)
)
})
archetype_profile %>%
kbl(caption = "Six driving archetypes — record counts and severe rates") %>%
kable_styling(full_width = FALSE) %>%
row_spec(which(str_detect(archetype_profile$Archetype, "Severe Weather|Night|Holiday Period")),
background = "#FFF3E0")| Archetype | n | Share of Data | Severe Rate | Partition Key |
|---|---|---|---|---|
| Archetype 1 — Severe Weather | 1253 | 25.1% | 21.3% | is_severe_weather == 1 |
| Archetype 2 — Clear Conditions | 3747 | 74.9% | 21.3% | is_severe_weather == 0 |
| Archetype 3 — Holiday Period | 1116 | 22.3% | 21.5% | is_holiday_period == 1 |
| Archetype 4 — Non-Holiday | 3884 | 77.7% | 21.2% | is_holiday_period == 0 |
| Archetype 5 — Night-time | 1250 | 25.0% | 20.5% | is_night == 1 |
| Archetype 6 — Daytime | 3750 | 75.0% | 21.5% | is_night == 0 |
8.2 Archetype severe-rate comparison plot
archetype_rates <- map_dfr(names(archetypes), function(aname) {
spec <- archetypes[[aname]]
rows <- df_model %>% filter(.data[[spec$col]] == spec$val)
tibble(
archetype = aname,
severe_rate = mean(rows$severity_raw == 1, na.rm = TRUE),
n = nrow(rows)
)
})
overall_rate <- mean(df_model$severity_raw == 1, na.rm = TRUE)
ggplot(archetype_rates, aes(
x = reorder(str_wrap(archetype, 20), severe_rate),
y = severe_rate,
fill = severe_rate > overall_rate
)) +
geom_col(show.legend = FALSE, width = 0.65) +
geom_hline(yintercept = overall_rate,
linetype = "dashed", colour = "#D7191C", linewidth = 0.8) +
annotate("text", x = 0.6, y = overall_rate,
label = paste0("Overall: ", percent(overall_rate, accuracy = 0.1)),
hjust = 0, vjust = -0.5, colour = "#D7191C", size = 3.2) +
scale_y_continuous(labels = percent_format(accuracy = 1)) +
scale_fill_manual(values = c("FALSE" = "#1565C0", "TRUE" = "#D7191C")) +
coord_flip() +
labs(
title = "Severe Collision Rate by Driving Archetype",
subtitle = "Red bars exceed the overall severe rate; blue bars fall below it.",
x = NULL, y = "Severe Collision Rate"
)8.3 Archetype partition logic (audit trail)
The table below documents exactly how each archetype flag is constructed so Rachel (or any downstream auditor) can trace the lineage from raw source fields to model-input partitions.
tibble(
Archetype = names(archetypes),
`Source Column` = map_chr(archetypes, ~ .x$col),
`Filter Value` = map_chr(archetypes, ~ as.character(.x$val)),
`Derived From` = c(
"wx_condition_code %in% snow/fog/freezing_rain → is_severe_weather",
"wx_condition_code NOT in snow/fog/freezing_rain",
"month %in% c(7, 8, 12) → is_holiday_period",
"month NOT in c(7, 8, 12)",
"hour >= 20 | hour <= 5 → is_night",
"hour > 5 & hour < 20"
)
) %>%
kbl(caption = "Archetype construction — full audit trail") %>%
kable_styling(full_width = FALSE) %>%
column_spec(4, width = "12cm")| Archetype | Source Column | Filter Value | Derived From |
|---|---|---|---|
| Archetype 1 — Severe Weather | is_severe_weather | 1 | wx_condition_code %in% snow/fog/freezing_rain → is_severe_weather |
| Archetype 2 — Clear Conditions | is_severe_weather | 0 | wx_condition_code NOT in snow/fog/freezing_rain |
| Archetype 3 — Holiday Period | is_holiday_period | 1 | month %in% c(7, 8, 12) → is_holiday_period |
| Archetype 4 — Non-Holiday | is_holiday_period | 0 | month NOT in c(7, 8, 12) |
| Archetype 5 — Night-time | is_night | 1 | hour >= 20 | hour <= 5 → is_night |
| Archetype 6 — Daytime | is_night | 0 | hour > 5 & hour < 20 |
9 Train / Test Split
The 80/20 stratified split used in VCM_Model_Validation_Final.qmd is reproduced here so the cleaned dataset and splits are verifiable end-to-end.
library(caret)
set.seed(42)
train_idx <- createDataPartition(df_model$severe, p = 0.80, list = FALSE)
train_clean <- df_model[ train_idx, ]
test_clean <- df_model[-train_idx, ]
cat("Train set:", nrow(train_clean), "rows |",
"Severe:", percent(mean(train_clean$severity_raw == 1), accuracy = 0.1), "\n")Train set: 4001 rows | Severe: 21.3%
cat("Test set :", nrow(test_clean), "rows |",
"Severe:", percent(mean(test_clean$severity_raw == 1), accuracy = 0.1), "\n")Test set : 999 rows | Severe: 21.2%
Stratification ensures that the 21.8% severe rate is preserved in both splits, preventing the test set from being accidentally all-non-severe or heavily imbalanced relative to the training set.
10 Save Cleaned Outputs
# Write the cleaned, feature-engineered dataset so VCM_Model_Validation_Final.qmd
# can consume it via the NS_ENRICHED_CSV env var.
out_dir <- Sys.getenv("NS_OUTPUT_DIR", unset = tempdir())
write_csv(df_model, file.path(out_dir, "ns_vcm_model_ready.csv"))
write_csv(train_clean, file.path(out_dir, "ns_vcm_train.csv"))
write_csv(test_clean, file.path(out_dir, "ns_vcm_test.csv"))
cat("Outputs written to:", out_dir, "\n")Outputs written to: C:\Users\gshk0\AppData\Local\Temp\RtmpGqWbYR
cat(" ns_vcm_model_ready.csv —", nrow(df_model), "rows (full cleaned set)\n") ns_vcm_model_ready.csv — 5000 rows (full cleaned set)
cat(" ns_vcm_train.csv —", nrow(train_clean), "rows (80% train)\n") ns_vcm_train.csv — 4001 rows (80% train)
cat(" ns_vcm_test.csv —", nrow(test_clean), "rows (20% test)\n") ns_vcm_test.csv — 999 rows (20% test)
cat("\nTo use in VCM_Model_Validation_Final.qmd:\n")
To use in VCM_Model_Validation_Final.qmd:
cat(" Sys.setenv(NS_ENRICHED_CSV = '", file.path(out_dir, "ns_vcm_model_ready.csv"), "')\n", sep = "") Sys.setenv(NS_ENRICHED_CSV = 'C:\Users\gshk0\AppData\Local\Temp\RtmpGqWbYR/ns_vcm_model_ready.csv')
11 Compliance Cross-Reference
The table below maps each cleaning step in this QMD to the corresponding section in VCM_Model_Validation_Final.qmd that depends on it.
tibble(
`ETL Step (this file)` = c(
"§3 Collision cleaning",
"§3.2 Datetime parse",
"§3.4 Severity recode",
"§4 Traffic exposure cleaning",
"§5 Weather cleaning — is_severe_weather naming",
"§6 Three-table merge",
"§7 Feature engineering",
"§8 Six archetypes defined",
"§9 Train/test split (seed = 42)"
),
`Depends On in VCM_Model_Validation_Final.qmd` = c(
"setup-data chunk — analysis_df construction",
"datetime_raw → datetime parse in setup chunk",
"severity_raw == 1 logic throughout",
"vmt feature in FEATURES vector",
"is_severe_weather in FEATURES; Archetype 1/2 regime sub-models",
"analysis_df is the merged frame",
"FEATURES vector in model training chunks",
"Section 10/11 — Behavioral Regime Sub-Models",
"createDataPartition(seed=42) in train/test split chunk"
)
) %>%
kbl(caption = "ETL-to-model compliance cross-reference") %>%
kable_styling(full_width = FALSE) %>%
column_spec(1, bold = TRUE, width = "8cm") %>%
column_spec(2, width = "10cm")| ETL Step (this file) | Depends On in VCM_Model_Validation_Final.qmd |
|---|---|
| §3 Collision cleaning | setup-data chunk — analysis_df construction |
| §3.2 Datetime parse | datetime_raw → datetime parse in setup chunk |
| §3.4 Severity recode | severity_raw == 1 logic throughout |
| §4 Traffic exposure cleaning | vmt feature in FEATURES vector |
| §5 Weather cleaning — is_severe_weather naming | is_severe_weather in FEATURES; Archetype 1/2 regime sub-models |
| §6 Three-table merge | analysis_df is the merged frame |
| §7 Feature engineering | FEATURES vector in model training chunks |
| §8 Six archetypes defined | Section 10/11 — Behavioral Regime Sub-Models |
| §9 Train/test split (seed = 42) | createDataPartition(seed=42) in train/test split chunk |
12 LLM Usage Disclosure
Claude was used to help structure the R/Quarto ETL workflow, generate the audit trail table, and refine interpretive phrasing. All final analytical decisions, data field mappings, and archetype definitions were reviewed by the authors.