1 Purpose

This R Markdown file reads a crash-data CSV file, identifies categorical variables, and creates grouped versions of those variables. The grouped variables are limited to a maximum of seven categories.

The workflow follows these rules:

2 Load Packages

library(tidyverse)
library(stringr)
library(readr)
library(writexl)

3 Input File

Change only file_path if you want to use another CSV file.

file_path <- "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c.csv"

input_dir  <- dirname(file_path)
input_base <- tools::file_path_sans_ext(basename(file_path))

output_csv     <- file.path(input_dir, paste0(input_base, "_grouped.csv"))
output_summary <- file.path(input_dir, paste0(input_base, "_grouped_summary.csv"))
output_xlsx    <- file.path(input_dir, paste0(input_base, "_grouped.xlsx"))

output_csv
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped.csv"
output_summary
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped_summary.csv"
output_xlsx
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped.xlsx"

4 Read Data

The name_repair = "minimal" option is used so that blank column names can be diagnosed before they are removed.

dat_raw <- read_csv(
  file_path,
  show_col_types = FALSE,
  name_repair = "minimal"
)

dim(dat_raw)
## [1] 67018    15
head(dat_raw)
## # A tibble: 6 × 15
##   Crash_ID D:\\DAS_CODE\\AVISTA\\Bike_…¹ Light_Cond_ID Road_Type_ID Road_Algn_ID
##      <dbl> <chr>                         <chr>         <chr>        <chr>       
## 1 15517107 Rain                          Daylight      4 Or More L… Curve, Grade
## 2 15517293 Rain                          Daylight      <NA>         Straight, L…
## 3 15517363 Rain                          Dawn          2 Lane, 2 W… Straight, L…
## 4 15517363 Rain                          Dawn          2 Lane, 2 W… Straight, L…
## 5 15522606 Rain                          Dark, Not Li… 2 Lane, 2 W… Straight, L…
## 6 15522606 Rain                          Dark, Not Li… 2 Lane, 2 W… Straight, L…
## # ℹ abbreviated name: ¹​`D:\\DAS_CODE\\AVISTA\\Bike_HitnRun`
## # ℹ 10 more variables: Surf_Cond_ID <chr>, Traffic_Cntl_ID <chr>,
## #   Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Obj_Struck_ID <chr>,
## #   Road_Cls_ID <chr>, Crash_Sev_ID <chr>, Rural_Urban_Type_ID <chr>,
## #   Veh_Body_Styl_ID <chr>, Veh_Damage_Description1_Id <chr>

5 Column-Name Diagnostic

This diagnostic identifies blank, missing, and duplicated column names. Blank-name columns are removed after this step.

original_names <- names(dat_raw)

blank_col_index <- which(is.na(original_names) | str_squish(original_names) == "")

diagnostic_columns <- tibble(
  Column_Position = seq_along(original_names),
  Original_Name = original_names,
  Is_Blank_Name = is.na(original_names) | str_squish(original_names) == "",
  Is_Duplicated_Name = duplicated(original_names) | duplicated(original_names, fromLast = TRUE)
)

diagnostic_columns
## # A tibble: 15 × 4
##    Column_Position Original_Name                Is_Blank_Name Is_Duplicated_Name
##              <int> <chr>                        <lgl>         <lgl>             
##  1               1 "Crash_ID"                   FALSE         FALSE             
##  2               2 "D:\\DAS_CODE\\AVISTA\\Bike… FALSE         FALSE             
##  3               3 "Light_Cond_ID"              FALSE         FALSE             
##  4               4 "Road_Type_ID"               FALSE         FALSE             
##  5               5 "Road_Algn_ID"               FALSE         FALSE             
##  6               6 "Surf_Cond_ID"               FALSE         FALSE             
##  7               7 "Traffic_Cntl_ID"            FALSE         FALSE             
##  8               8 "Intrsct_Relat_ID"           FALSE         FALSE             
##  9               9 "FHE_Collsn_ID"              FALSE         FALSE             
## 10              10 "Obj_Struck_ID"              FALSE         FALSE             
## 11              11 "Road_Cls_ID"                FALSE         FALSE             
## 12              12 "Crash_Sev_ID"               FALSE         FALSE             
## 13              13 "Rural_Urban_Type_ID"        FALSE         FALSE             
## 14              14 "Veh_Body_Styl_ID"           FALSE         FALSE             
## 15              15 "Veh_Damage_Description1_Id" FALSE         FALSE

6 Remove Blank-Name Columns

Blank-name columns are removed after the diagnostic table. Remaining column names are cleaned and made unique.

dat <- dat_raw

if (length(blank_col_index) > 0) {
  dat <- dat[, -blank_col_index, drop = FALSE]
}

names(dat) <- names(dat) %>%
  str_squish() %>%
  make.unique(sep = "_")

dim(dat)
## [1] 67018    15
names(dat)
##  [1] "Crash_ID"                           "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun"
##  [3] "Light_Cond_ID"                      "Road_Type_ID"                      
##  [5] "Road_Algn_ID"                       "Surf_Cond_ID"                      
##  [7] "Traffic_Cntl_ID"                    "Intrsct_Relat_ID"                  
##  [9] "FHE_Collsn_ID"                      "Obj_Struck_ID"                     
## [11] "Road_Cls_ID"                        "Crash_Sev_ID"                      
## [13] "Rural_Urban_Type_ID"                "Veh_Body_Styl_ID"                  
## [15] "Veh_Damage_Description1_Id"

7 Helper Functions

clean_text <- function(x) {
  x %>%
    str_replace_all("_", " ") %>%
    str_replace_all("\\s+", " ") %>%
    str_squish()
}

norm_text <- function(x) {
  x %>%
    str_to_lower() %>%
    str_replace_all("\\([^\\)]*\\)", " ") %>%
    str_replace_all("[^a-z0-9 ]", " ") %>%
    str_replace_all("\\s+", " ") %>%
    str_squish()
}

make_ngrams <- function(words, n = 2) {
  if (length(words) < n) return(character(0))
  map_chr(
    seq_len(length(words) - n + 1),
    ~ paste(words[.x:(.x + n - 1)], collapse = " ")
  )
}

8 Fast Grouping Function

This function groups a categorical column into a maximum of seven categories. If Not Reported exists, it is preserved as its own category and is not merged into Other.

fast_group_column <- function(x, max_cat = 7, min_group_n = 2) {
  
  x0 <- as.character(x)
  
  # Blank cells become Not Reported, not Other
  x0 <- ifelse(is.na(x0) | str_squish(x0) == "", "Not Reported", x0)
  x0 <- clean_text(x0)
  
  tab <- tibble(original = x0) %>%
    count(original, sort = TRUE) %>%
    mutate(norm = norm_text(original))
  
  # Already small enough
  if (nrow(tab) <= max_cat) {
    return(x0)
  }
  
  has_not_reported <- any(tab$original == "Not Reported")
  
  # If Not Reported exists, available slots are:
  # Not Reported + Other + top/common groups
  main_slots <- ifelse(has_not_reported, max_cat - 2, max_cat - 1)
  main_slots <- max(main_slots, 1)
  
  stop_words <- c(
    "and", "or", "the", "of", "to", "in", "on", "at", "with",
    "not", "id", "type", "description", "one", "both",
    "other", "unknown", "reported", "applicable", "damage", "impact"
  )
  
  # Do not use Not Reported to discover common phrases
  tab_phrase <- tab %>%
    filter(original != "Not Reported")
  
  phrase_df <- tab_phrase %>%
    mutate(
      words = str_split(norm, " "),
      words = map(words, ~ setdiff(.x, stop_words)),
      phrases = map(words, ~ c(
        make_ngrams(.x, 3),
        make_ngrams(.x, 2),
        .x
      ))
    ) %>%
    select(original, n, phrases) %>%
    unnest(phrases) %>%
    filter(!is.na(phrases), str_length(phrases) >= 3) %>%
    group_by(phrases) %>%
    summarise(
      total_n = sum(n),
      category_n = n_distinct(original),
      .groups = "drop"
    ) %>%
    filter(category_n >= min_group_n) %>%
    arrange(desc(total_n), desc(category_n))
  
  selected <- phrase_df %>%
    slice_head(n = main_slots) %>%
    pull(phrases)
  
  # If no phrase is useful, keep top exact categories
  if (length(selected) == 0) {
    top_exact <- tab %>%
      filter(original != "Not Reported") %>%
      slice_head(n = main_slots) %>%
      pull(original)
    
    return(case_when(
      x0 == "Not Reported" ~ "Not Reported",
      x0 %in% top_exact ~ x0,
      TRUE ~ "Other"
    ))
  }
  
  lookup <- tab %>%
    mutate(
      Grouped = map_chr(norm, function(v) {
        matched <- selected[str_detect(v, fixed(selected))]
        if (length(matched) > 0) str_to_title(matched[1]) else "Other"
      }),
      Grouped = ifelse(original == "Not Reported", "Not Reported", Grouped)
    ) %>%
    select(original, Grouped)
  
  other_share <- tab %>%
    left_join(lookup, by = "original") %>%
    filter(original != "Not Reported") %>%
    summarise(p = sum(n[Grouped == "Other"]) / sum(n)) %>%
    pull(p)
  
  # If phrase grouping is too broad, use top exact categories instead
  if (is.na(other_share) | other_share > 0.50) {
    top_exact <- tab %>%
      filter(original != "Not Reported") %>%
      slice_head(n = main_slots) %>%
      pull(original)
    
    lookup <- tab %>%
      mutate(
        Grouped = case_when(
          original == "Not Reported" ~ "Not Reported",
          original %in% top_exact ~ original,
          TRUE ~ "Other"
        )
      ) %>%
      select(original, Grouped)
  }
  
  lookup_vec <- setNames(lookup$Grouped, lookup$original)
  grouped <- unname(lookup_vec[x0])
  
  # Safety check: any unexpected missing group becomes Not Reported
  grouped <- ifelse(is.na(grouped), "Not Reported", grouped)
  
  grouped
}

9 Identify Categorical Columns

Only character and factor columns are grouped. Blank-name columns are not included because they were removed earlier.

cat_cols <- names(dat)[sapply(dat, function(x) is.character(x) | is.factor(x))]
cat_cols <- cat_cols[!is.na(cat_cols) & str_squish(cat_cols) != ""]

cat_cols
##  [1] "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" "Light_Cond_ID"                     
##  [3] "Road_Type_ID"                       "Road_Algn_ID"                      
##  [5] "Surf_Cond_ID"                       "Traffic_Cntl_ID"                   
##  [7] "Intrsct_Relat_ID"                   "FHE_Collsn_ID"                     
##  [9] "Obj_Struck_ID"                      "Road_Cls_ID"                       
## [11] "Crash_Sev_ID"                       "Rural_Urban_Type_ID"               
## [13] "Veh_Body_Styl_ID"                   "Veh_Damage_Description1_Id"
length(cat_cols)
## [1] 14

10 Apply Grouping

if (length(cat_cols) == 0) {
  dat_grouped <- dat
  message("No character or factor columns were found. No grouped columns were created.")
} else {
  dat_grouped <- dat %>%
    mutate(
      across(
        all_of(cat_cols),
        ~ fast_group_column(.x, max_cat = 7),
        .names = "{.col}_Grp"
      )
    )
}

dim(dat_grouped)
## [1] 67018    29
head(dat_grouped)
## # A tibble: 6 × 29
##   Crash_ID D:\\DAS_CODE\\AVISTA\\Bike_…¹ Light_Cond_ID Road_Type_ID Road_Algn_ID
##      <dbl> <chr>                         <chr>         <chr>        <chr>       
## 1 15517107 Rain                          Daylight      4 Or More L… Curve, Grade
## 2 15517293 Rain                          Daylight      <NA>         Straight, L…
## 3 15517363 Rain                          Dawn          2 Lane, 2 W… Straight, L…
## 4 15517363 Rain                          Dawn          2 Lane, 2 W… Straight, L…
## 5 15522606 Rain                          Dark, Not Li… 2 Lane, 2 W… Straight, L…
## 6 15522606 Rain                          Dark, Not Li… 2 Lane, 2 W… Straight, L…
## # ℹ abbreviated name: ¹​`D:\\DAS_CODE\\AVISTA\\Bike_HitnRun`
## # ℹ 24 more variables: Surf_Cond_ID <chr>, Traffic_Cntl_ID <chr>,
## #   Intrsct_Relat_ID <chr>, FHE_Collsn_ID <chr>, Obj_Struck_ID <chr>,
## #   Road_Cls_ID <chr>, Crash_Sev_ID <chr>, Rural_Urban_Type_ID <chr>,
## #   Veh_Body_Styl_ID <chr>, Veh_Damage_Description1_Id <chr>,
## #   `D:\\DAS_CODE\\AVISTA\\Bike_HitnRun_Grp` <chr>, Light_Cond_ID_Grp <chr>,
## #   Road_Type_ID_Grp <chr>, Road_Algn_ID_Grp <chr>, Surf_Cond_ID_Grp <chr>, …

11 Create Grouping Summary

if (length(cat_cols) == 0) {
  summary_table <- tibble(
    Variable = character(),
    Grouped_Category = character(),
    n = integer(),
    Percent = numeric()
  )
} else {
  summary_table <- map_dfr(cat_cols, function(col) {
    new_col <- paste0(col, "_Grp")
    
    dat_grouped %>%
      count(.data[[new_col]], sort = TRUE) %>%
      mutate(
        Variable = col,
        Grouped_Category = .data[[new_col]],
        Percent = round(100 * n / sum(n), 2)
      ) %>%
      select(Variable, Grouped_Category, n, Percent)
  })
}

summary_table
## # A tibble: 72 × 4
##    Variable                             Grouped_Category      n Percent
##    <chr>                                <chr>             <int>   <dbl>
##  1 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Rain              60087   89.7 
##  2 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Cloudy             4245    6.33
##  3 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Clear              1951    2.91
##  4 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Sleet/Hail          340    0.51
##  5 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Fog                 162    0.24
##  6 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Snow                150    0.22
##  7 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun" Other                83    0.12
##  8 "Light_Cond_ID"                      Daylight          44188   65.9 
##  9 "Light_Cond_ID"                      Dark, Lighted     10018   15.0 
## 10 "Light_Cond_ID"                      Dark, Not Lighted  9990   14.9 
## # ℹ 62 more rows

12 Check Maximum Category Count

This table checks whether each grouped column has seven or fewer categories.

if (length(cat_cols) == 0) {
  category_check <- tibble(
    Variable = character(),
    Number_of_Grouped_Categories = integer(),
    Pass_Max_7_Check = logical()
  )
} else {
  category_check <- map_dfr(cat_cols, function(col) {
    new_col <- paste0(col, "_Grp")
    tibble(
      Variable = col,
      Number_of_Grouped_Categories = n_distinct(dat_grouped[[new_col]]),
      Pass_Max_7_Check = n_distinct(dat_grouped[[new_col]]) <= 7
    )
  })
}

category_check
## # A tibble: 14 × 3
##    Variable                             Number_of_Grouped_Cat…¹ Pass_Max_7_Check
##    <chr>                                                  <int> <lgl>           
##  1 "D:\\DAS_CODE\\AVISTA\\Bike_HitnRun"                       7 TRUE            
##  2 "Light_Cond_ID"                                            7 TRUE            
##  3 "Road_Type_ID"                                             4 TRUE            
##  4 "Road_Algn_ID"                                             5 TRUE            
##  5 "Surf_Cond_ID"                                             7 TRUE            
##  6 "Traffic_Cntl_ID"                                          7 TRUE            
##  7 "Intrsct_Relat_ID"                                         4 TRUE            
##  8 "FHE_Collsn_ID"                                            3 TRUE            
##  9 "Obj_Struck_ID"                                            2 TRUE            
## 10 "Road_Cls_ID"                                              7 TRUE            
## 11 "Crash_Sev_ID"                                             5 TRUE            
## 12 "Rural_Urban_Type_ID"                                      5 TRUE            
## 13 "Veh_Body_Styl_ID"                                         4 TRUE            
## 14 "Veh_Damage_Description1_Id"                               5 TRUE            
## # ℹ abbreviated name: ¹​Number_of_Grouped_Categories

13 Save Outputs

write_csv(dat_grouped, output_csv)
write_csv(summary_table, output_summary)

write_xlsx(
  list(
    Grouped_Data = dat_grouped,
    Grouping_Summary = summary_table,
    Column_Diagnostic = diagnostic_columns,
    Category_Check = category_check
  ),
  output_xlsx
)

output_csv
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped.csv"
output_summary
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped_summary.csv"
output_xlsx
## [1] "D:/Data_Archive/Crash Data/TX/2025Analysis/Data/2025/Hydroplanning_2017_2025c_grouped.xlsx"

14 Final Notes

The original categorical columns are preserved. The grouped versions are added using the _Grp suffix. Blank cells inside valid columns are coded as Not Reported. Blank-name columns are included in the diagnostic sheet but removed from the grouped data.