This R Markdown file reads a crash-data CSV file, identifies categorical variables, and creates grouped versions of those variables. The grouped variables are limited to a maximum of seven categories.
The workflow follows these rules:
Not Reported.Other._Grp.library(tidyverse)
library(stringr)
library(readr)
library(writexl)
Change only file_path if you want to use another CSV
file.
file_path <- "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c.csv"
input_dir <- dirname(file_path)
input_base <- tools::file_path_sans_ext(basename(file_path))
output_csv <- file.path(input_dir, paste0(input_base, "_grouped.csv"))
output_summary <- file.path(input_dir, paste0(input_base, "_grouped_summary.csv"))
output_xlsx <- file.path(input_dir, paste0(input_base, "_grouped.xlsx"))
output_csv
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped.csv"
output_summary
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped_summary.csv"
output_xlsx
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped.xlsx"
The name_repair = "minimal" option is used so that blank
column names can be diagnosed before they are removed.
dat_raw <- read_csv(
file_path,
show_col_types = FALSE,
name_repair = "minimal"
)
dim(dat_raw)
## [1] 67018 15
head(dat_raw)
## # A tibble: 6 × 15
## Crash_ID D:\\DAS_CODE\\AVISTA\\Bike_…¹ Light_Cond_ID Road_Type_ID Road_Algn_ID
## <dbl> <chr> <chr> <chr> <chr>
## 1 15517107 Rain Daylight 4 Or More L… Curve, Grade
## 2 15517293 Rain Daylight <NA> Straight, L…
## 3 15517363 Rain Dawn 2 Lane, 2 W… Straight, L…
## 4 15517363 Rain Dawn 2 Lane, 2 W… Straight, L…
## 5 15522606 Rain Dark, Not Li… 2 Lane, 2 W… Straight, L…
## 6 15522606 Rain Dark, Not Li… 2 Lane, 2 W… Straight, L…
## # ℹ abbreviated name: ¹`D:\\DAS_CODE\\AVISTA\\Bike_HitnRun`
## # ℹ 10 more variables: Surf_Cond_ID <chr>, Traffic_Cntl_ID <chr>,
## # Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Obj_Struck_ID <chr>,
## # Road_Cls_ID <chr>, Crash_Sev_ID <chr>, Rural_Urban_Type_ID <chr>,
## # Veh_Body_Styl_ID <chr>, Veh_Damage_Description1_Id <chr>
This diagnostic identifies blank, missing, and duplicated column names. Blank-name columns are removed after this step.
original_names <- names(dat_raw)
blank_col_index <- which(is.na(original_names) | str_squish(original_names) == "")
diagnostic_columns <- tibble(
Column_Position = seq_along(original_names),
Original_Name = original_names,
Is_Blank_Name = is.na(original_names) | str_squish(original_names) == "",
Is_Duplicated_Name = duplicated(original_names) | duplicated(original_names, fromLast = TRUE)
)
diagnostic_columns
## # A tibble: 15 × 4
## Column_Position Original_Name Is_Blank_Name Is_Duplicated_Name
## <int> <chr> <lgl> <lgl>
## 1 1 "Crash_ID" FALSE FALSE
## 2 2 "D:\\DAS_CODE\\AVISTA\\Bike… FALSE FALSE
## 3 3 "Light_Cond_ID" FALSE FALSE
## 4 4 "Road_Type_ID" FALSE FALSE
## 5 5 "Road_Algn_ID" FALSE FALSE
## 6 6 "Surf_Cond_ID" FALSE FALSE
## 7 7 "Traffic_Cntl_ID" FALSE FALSE
## 8 8 "Intrsct_Relat_ID" FALSE FALSE
## 9 9 "FHE_Collsn_ID" FALSE FALSE
## 10 10 "Obj_Struck_ID" FALSE FALSE
## 11 11 "Road_Cls_ID" FALSE FALSE
## 12 12 "Crash_Sev_ID" FALSE FALSE
## 13 13 "Rural_Urban_Type_ID" FALSE FALSE
## 14 14 "Veh_Body_Styl_ID" FALSE FALSE
## 15 15 "Veh_Damage_Description1_Id" FALSE FALSE
Blank-name columns are removed after the diagnostic table. Remaining column names are cleaned and made unique.
dat <- dat_raw
if (length(blank_col_index) > 0) {
dat <- dat[, -blank_col_index, drop = FALSE]
}
names(dat) <- names(dat) %>%
str_squish() %>%
make.unique(sep = "_")
dim(dat)
## [1] 67018 15
names(dat)
## [1] "Crash_ID" "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun"
## [3] "Light_Cond_ID" "Road_Type_ID"
## [5] "Road_Algn_ID" "Surf_Cond_ID"
## [7] "Traffic_Cntl_ID" "Intrsct_Relat_ID"
## [9] "FHE_Collsn_ID" "Obj_Struck_ID"
## [11] "Road_Cls_ID" "Crash_Sev_ID"
## [13] "Rural_Urban_Type_ID" "Veh_Body_Styl_ID"
## [15] "Veh_Damage_Description1_Id"
clean_text <- function(x) {
x %>%
str_replace_all("_", " ") %>%
str_replace_all("\\s+", " ") %>%
str_squish()
}
norm_text <- function(x) {
x %>%
str_to_lower() %>%
str_replace_all("\\([^\\)]*\\)", " ") %>%
str_replace_all("[^a-z0-9 ]", " ") %>%
str_replace_all("\\s+", " ") %>%
str_squish()
}
make_ngrams <- function(words, n = 2) {
if (length(words) < n) return(character(0))
map_chr(
seq_len(length(words) - n + 1),
~ paste(words[.x:(.x + n - 1)], collapse = " ")
)
}
This function groups a categorical column into a maximum of seven
categories. If Not Reported exists, it is preserved as its
own category and is not merged into Other.
fast_group_column <- function(x, max_cat = 7, min_group_n = 2) {
x0 <- as.character(x)
# Blank cells become Not Reported, not Other
x0 <- ifelse(is.na(x0) | str_squish(x0) == "", "Not Reported", x0)
x0 <- clean_text(x0)
tab <- tibble(original = x0) %>%
count(original, sort = TRUE) %>%
mutate(norm = norm_text(original))
# Already small enough
if (nrow(tab) <= max_cat) {
return(x0)
}
has_not_reported <- any(tab$original == "Not Reported")
# If Not Reported exists, available slots are:
# Not Reported + Other + top/common groups
main_slots <- ifelse(has_not_reported, max_cat - 2, max_cat - 1)
main_slots <- max(main_slots, 1)
stop_words <- c(
"and", "or", "the", "of", "to", "in", "on", "at", "with",
"not", "id", "type", "description", "one", "both",
"other", "unknown", "reported", "applicable", "damage", "impact"
)
# Do not use Not Reported to discover common phrases
tab_phrase <- tab %>%
filter(original != "Not Reported")
phrase_df <- tab_phrase %>%
mutate(
words = str_split(norm, " "),
words = map(words, ~ setdiff(.x, stop_words)),
phrases = map(words, ~ c(
make_ngrams(.x, 3),
make_ngrams(.x, 2),
.x
))
) %>%
select(original, n, phrases) %>%
unnest(phrases) %>%
filter(!is.na(phrases), str_length(phrases) >= 3) %>%
group_by(phrases) %>%
summarise(
total_n = sum(n),
category_n = n_distinct(original),
.groups = "drop"
) %>%
filter(category_n >= min_group_n) %>%
arrange(desc(total_n), desc(category_n))
selected <- phrase_df %>%
slice_head(n = main_slots) %>%
pull(phrases)
# If no phrase is useful, keep top exact categories
if (length(selected) == 0) {
top_exact <- tab %>%
filter(original != "Not Reported") %>%
slice_head(n = main_slots) %>%
pull(original)
return(case_when(
x0 == "Not Reported" ~ "Not Reported",
x0 %in% top_exact ~ x0,
TRUE ~ "Other"
))
}
lookup <- tab %>%
mutate(
Grouped = map_chr(norm, function(v) {
matched <- selected[str_detect(v, fixed(selected))]
if (length(matched) > 0) str_to_title(matched[1]) else "Other"
}),
Grouped = ifelse(original == "Not Reported", "Not Reported", Grouped)
) %>%
select(original, Grouped)
other_share <- tab %>%
left_join(lookup, by = "original") %>%
filter(original != "Not Reported") %>%
summarise(p = sum(n[Grouped == "Other"]) / sum(n)) %>%
pull(p)
# If phrase grouping is too broad, use top exact categories instead
if (is.na(other_share) | other_share > 0.50) {
top_exact <- tab %>%
filter(original != "Not Reported") %>%
slice_head(n = main_slots) %>%
pull(original)
lookup <- tab %>%
mutate(
Grouped = case_when(
original == "Not Reported" ~ "Not Reported",
original %in% top_exact ~ original,
TRUE ~ "Other"
)
) %>%
select(original, Grouped)
}
lookup_vec <- setNames(lookup$Grouped, lookup$original)
grouped <- unname(lookup_vec[x0])
# Safety check: any unexpected missing group becomes Not Reported
grouped <- ifelse(is.na(grouped), "Not Reported", grouped)
grouped
}
Only character and factor columns are grouped. Blank-name columns are not included because they were removed earlier.
cat_cols <- names(dat)[sapply(dat, function(x) is.character(x) | is.factor(x))]
cat_cols <- cat_cols[!is.na(cat_cols) & str_squish(cat_cols) != ""]
cat_cols
## [1] "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" "Light_Cond_ID"
## [3] "Road_Type_ID" "Road_Algn_ID"
## [5] "Surf_Cond_ID" "Traffic_Cntl_ID"
## [7] "Intrsct_Relat_ID" "FHE_Collsn_ID"
## [9] "Obj_Struck_ID" "Road_Cls_ID"
## [11] "Crash_Sev_ID" "Rural_Urban_Type_ID"
## [13] "Veh_Body_Styl_ID" "Veh_Damage_Description1_Id"
length(cat_cols)
## [1] 14
if (length(cat_cols) == 0) {
dat_grouped <- dat
message("No character or factor columns were found. No grouped columns were created.")
} else {
dat_grouped <- dat %>%
mutate(
across(
all_of(cat_cols),
~ fast_group_column(.x, max_cat = 7),
.names = "{.col}_Grp"
)
)
}
dim(dat_grouped)
## [1] 67018 29
head(dat_grouped)
## # A tibble: 6 × 29
## Crash_ID D:\\DAS_CODE\\AVISTA\\Bike_…¹ Light_Cond_ID Road_Type_ID Road_Algn_ID
## <dbl> <chr> <chr> <chr> <chr>
## 1 15517107 Rain Daylight 4 Or More L… Curve, Grade
## 2 15517293 Rain Daylight <NA> Straight, L…
## 3 15517363 Rain Dawn 2 Lane, 2 W… Straight, L…
## 4 15517363 Rain Dawn 2 Lane, 2 W… Straight, L…
## 5 15522606 Rain Dark, Not Li… 2 Lane, 2 W… Straight, L…
## 6 15522606 Rain Dark, Not Li… 2 Lane, 2 W… Straight, L…
## # ℹ abbreviated name: ¹`D:\\DAS_CODE\\AVISTA\\Bike_HitnRun`
## # ℹ 24 more variables: Surf_Cond_ID <chr>, Traffic_Cntl_ID <chr>,
## # Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Obj_Struck_ID <chr>,
## # Road_Cls_ID <chr>, Crash_Sev_ID <chr>, Rural_Urban_Type_ID <chr>,
## # Veh_Body_Styl_ID <chr>, Veh_Damage_Description1_Id <chr>,
## # `D:\\DAS_CODE\\AVISTA\\Bike_HitnRun_Grp` <chr>, Light_Cond_ID_Grp <chr>,
## # Road_Type_ID_Grp <chr>, Road_Algn_ID_Grp <chr>, Surf_Cond_ID_Grp <chr>, …
if (length(cat_cols) == 0) {
summary_table <- tibble(
Variable = character(),
Grouped_Category = character(),
n = integer(),
Percent = numeric()
)
} else {
summary_table <- map_dfr(cat_cols, function(col) {
new_col <- paste0(col, "_Grp")
dat_grouped %>%
count(.data[[new_col]], sort = TRUE) %>%
mutate(
Variable = col,
Grouped_Category = .data[[new_col]],
Percent = round(100 * n / sum(n), 2)
) %>%
select(Variable, Grouped_Category, n, Percent)
})
}
summary_table
## # A tibble: 72 × 4
## Variable Grouped_Category n Percent
## <chr> <chr> <int> <dbl>
## 1 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Rain 60087 89.7
## 2 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Cloudy 4245 6.33
## 3 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Clear 1951 2.91
## 4 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Sleet/Hail 340 0.51
## 5 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Fog 162 0.24
## 6 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Snow 150 0.22
## 7 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Other 83 0.12
## 8 "Light_Cond_ID" Daylight 44188 65.9
## 9 "Light_Cond_ID" Dark, Lighted 10018 15.0
## 10 "Light_Cond_ID" Dark, Not Lighted 9990 14.9
## # ℹ 62 more rows
This table checks whether each grouped column has seven or fewer categories.
if (length(cat_cols) == 0) {
category_check <- tibble(
Variable = character(),
Number_of_Grouped_Categories = integer(),
Pass_Max_7_Check = logical()
)
} else {
category_check <- map_dfr(cat_cols, function(col) {
new_col <- paste0(col, "_Grp")
tibble(
Variable = col,
Number_of_Grouped_Categories = n_distinct(dat_grouped[[new_col]]),
Pass_Max_7_Check = n_distinct(dat_grouped[[new_col]]) <= 7
)
})
}
category_check
## # A tibble: 14 × 3
## Variable Number_of_Grouped_Cat…¹ Pass_Max_7_Check
## <chr> <int> <lgl>
## 1 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" 7 TRUE
## 2 "Light_Cond_ID" 7 TRUE
## 3 "Road_Type_ID" 4 TRUE
## 4 "Road_Algn_ID" 5 TRUE
## 5 "Surf_Cond_ID" 7 TRUE
## 6 "Traffic_Cntl_ID" 7 TRUE
## 7 "Intrsct_Relat_ID" 4 TRUE
## 8 "FHE_Collsn_ID" 3 TRUE
## 9 "Obj_Struck_ID" 2 TRUE
## 10 "Road_Cls_ID" 7 TRUE
## 11 "Crash_Sev_ID" 5 TRUE
## 12 "Rural_Urban_Type_ID" 5 TRUE
## 13 "Veh_Body_Styl_ID" 4 TRUE
## 14 "Veh_Damage_Description1_Id" 5 TRUE
## # ℹ abbreviated name: ¹Number_of_Grouped_Categories
write_csv(dat_grouped, output_csv)
write_csv(summary_table, output_summary)
write_xlsx(
list(
Grouped_Data = dat_grouped,
Grouping_Summary = summary_table,
Column_Diagnostic = diagnostic_columns,
Category_Check = category_check
),
output_xlsx
)
output_csv
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped.csv"
output_summary
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped_summary.csv"
output_xlsx
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped.xlsx"
The original categorical columns are preserved. The grouped versions
are added using the _Grp suffix. Blank cells inside valid
columns are coded as Not Reported. Blank-name columns are
included in the diagnostic sheet but removed from the grouped data.