library(readxl)
library(dplyr)
library(stringr)
library(forcats)
library(DT)
library(writexl)

1. File path and data import

data_dir <- "D:/DAS_CODE/Tools/data"
setwd(data_dir)

dat <- read_excel("Ped_Bike_Esc_2016_2024_All.xlsx", sheet = "Ped_Lim")

dim(dat)
## [1] 60741    81
names(dat)
##  [1] "STATENAME"         "YEAR"              "CRASH_NUM1"       
##  [4] "CRASH_VEH_NUM1"    "CRASH_VehPer_NUM1" "PER_TYPNAME"      
##  [7] "INJ_SEVNAME"       "LATITUDENAME"      "LONGITUDNAME"     
## [10] "VE_TOTAL"          "FATALS"            "DRUNK_DR"         
## [13] "HIT_RUNNAME_2"     "WRK_ZONENAME"      "REL_ROADNAME"     
## [16] "WEATHER1NAME"      "LGT_CONDNAME"      "BODY_TYPNAME"     
## [19] "RUR_URBNAME"       "FUNC_SYSNAME"      "PBAGENAME"        
## [22] "PBSEXNAME"         "PEDCTYPENAME"      "PEDPOSNAME"       
## [25] "MOTMANNAME"        "PEDLEGNAME"        "PEDSNRNAME"       
## [28] "PEDCGPNAME"        "INJ_SEVNAMEDr"     "AGENAMEDr"        
## [31] "SEXNAMEDr"         "DRINKINGNAMEDr"    "ROUTENAME"        
## [34] "TYP_INTNAME"       "DRINKINGNAME"      "ALC_RESNAME"      
## [37] "DRUGSNAME"         "DRUGRES1NAME"      "WORK_INJNAME"     
## [40] "RACENAME"          "LOCATIONNAME"      "NUMOCCS_2"        
## [43] "MAKENAME_2"        "MAK_MODNAME_2"     "MOD_YEAR_2"       
## [46] "HAZ_INVNAME_2"     "BUS_USENAME_2"     "EMER_USENAME_2"   
## [49] "TRAV_SPNAME_2"     "UNDERIDENAME_2"    "ROLLOVERNAME_2"   
## [52] "DEFORMEDNAME_2"    "VTRAFWAYNAME_2"    "VNUM_LANNAME_2"   
## [55] "VSPD_LIMNAME_2"    "VALIGNNAME_2"      "VPROFILENAME_2"   
## [58] "VPAVETYPNAME_2"    "VSURCONDNAME_2"    "P_CRASH1NAME_2"   
## [61] "ACC_TYPENAME_2"    "L_STATUSNAME_2"    "L_TYPENAME_2"     
## [64] "CDL_STATNAME_2"    "MDRDSTRDNAME_2"    "DRIMPAIRNAME_2"   
## [67] "MDRMANAVNAME_2"    "MVISOBSCNAME_2"    "INJ_SEVNAME_2"    
## [70] "ALC_RESNAME_2"     "DRUGSNAME_2"       "COUNTYNAME"       
## [73] "CITYNAME"          "DAY"               "MONTHNAME"        
## [76] "DAY_WEEKNAME"      "HOUR"              "HOURNAME"         
## [79] "MAN_COLLNAME"      "WEATHERNAME"       "SCH_BUSNAME"

2. Select analysis variables

dat1 <- dat[, c(1, 2, 3, 39, 8:32, 46:67, 76)]

dim(dat1)
## [1] 60741    52
names(dat1)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "BODY_TYPNAME"   "RUR_URBNAME"   
## [17] "FUNC_SYSNAME"   "PBAGENAME"      "PBSEXNAME"      "PEDCTYPENAME"  
## [21] "PEDPOSNAME"     "MOTMANNAME"     "PEDLEGNAME"     "PEDSNRNAME"    
## [25] "PEDCGPNAME"     "INJ_SEVNAMEDr"  "AGENAMEDr"      "SEXNAMEDr"     
## [29] "DRINKINGNAMEDr" "HAZ_INVNAME_2"  "BUS_USENAME_2"  "EMER_USENAME_2"
## [33] "TRAV_SPNAME_2"  "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2"
## [37] "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"  
## [41] "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2"
## [45] "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"   "CDL_STATNAME_2"
## [49] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME"
datatable(
  head(dat1, 25),
  options = list(scrollX = TRUE, pageLength = 10),
  caption = "Selected variables"
)

3. Create pedestrian and driver age groups

dat1 <- dat1 %>%
  mutate(
    age_num = as.numeric(str_extract(PBAGENAME, "\\d+")),
    Ped_Age = case_when(
      age_num <= 4  ~ "0-4",
      age_num <= 9  ~ "5-9",
      age_num <= 14 ~ "10-14",
      age_num <= 20 ~ "15-20",
      age_num <= 24 ~ "21-24",
      age_num <= 34 ~ "25-34",
      age_num <= 44 ~ "35-44",
      age_num <= 54 ~ "45-54",
      age_num <= 64 ~ "55-64",
      age_num <= 74 ~ "65-74",
      age_num >= 75 ~ "75+",
      TRUE ~ "Unknown"
    ),
    age_num1 = as.numeric(str_extract(AGENAMEDr, "\\d+")),
    Dr_Age = case_when(
      age_num1 <= 4  ~ "0-4",
      age_num1 <= 9  ~ "5-9",
      age_num1 <= 14 ~ "10-14",
      age_num1 <= 20 ~ "15-20",
      age_num1 <= 24 ~ "21-24",
      age_num1 <= 34 ~ "25-34",
      age_num1 <= 44 ~ "35-44",
      age_num1 <= 54 ~ "45-54",
      age_num1 <= 64 ~ "55-64",
      age_num1 <= 74 ~ "65-74",
      age_num1 >= 75 ~ "75+",
      TRUE ~ "Unknown"
    )
  )

table(dat1$Ped_Age, useNA = "ifany")
## 
##     0-4   10-14   15-20   21-24   25-34   35-44   45-54     5-9   55-64   65-74 
##     552     667    2581    3077    9797   10098    9599     468   11021    7015 
##     75+ Unknown 
##    5130     736
table(dat1$Dr_Age, useNA = "ifany")
## 
##     0-4   10-14   15-20   21-24   25-34   35-44   45-54     5-9   55-64   65-74 
##       8      33    4178    5333   11846    9211    8105       9    7215    3924 
##     75+ Unknown 
##    1886    8993

4. Create weekday/weekend variable

dat1 <- dat1 %>%
  mutate(
    Day_Type = case_when(
      DAY_WEEKNAME %in% c("Saturday", "Sunday") ~ "Weekend",
      !is.na(DAY_WEEKNAME) ~ "Weekday",
      TRUE ~ "Unknown"
    )
  )

table(dat1$Day_Type, useNA = "ifany")
## 
## Weekday Weekend 
##   42325   18416

5. Create major vehicle type

dat2 <- dat1 %>%
  mutate(
    Veh_Type = case_when(
      str_detect(BODY_TYPNAME, regex("sedan|coupe|hatchback|convertible|station wagon|automobile", ignore_case = TRUE)) ~ "Car",
      str_detect(BODY_TYPNAME, regex("pickup|utility|light truck|cab chassis based", ignore_case = TRUE)) ~ "Light_Truck",
      str_detect(BODY_TYPNAME, regex("truck-tractor|single-unit|heavy truck|medium/heavy|construction equipment|farm equipment|motorhome", ignore_case = TRUE)) ~ "Large_Truck",
      str_detect(BODY_TYPNAME, regex("bus", ignore_case = TRUE)) ~ "Bus",
      str_detect(BODY_TYPNAME, regex("van|minivan|step-van|walk-in van", ignore_case = TRUE)) ~ "Van",
      str_detect(BODY_TYPNAME, regex("motorcycle|moped|scooter|motored cycle", ignore_case = TRUE)) ~ "Motorcycle",
      str_detect(BODY_TYPNAME, regex("ATV|ATC|golf cart|off-highway|recreational|three-wheel|go-cart|fork-lift|street sweeper", ignore_case = TRUE)) ~ "Off_Road_Other",
      is.na(BODY_TYPNAME) |
        BODY_TYPNAME %in% c("NA", "Not Reported", "Unknown body type") |
        str_detect(BODY_TYPNAME, regex("unknown|not reported", ignore_case = TRUE)) ~ "Unknown",
      TRUE ~ "Off_Road_Other"
    )
  )

table(dat2$Veh_Type, useNA = "ifany")
## 
##            Bus            Car    Large_Truck    Light_Truck     Motorcycle 
##           1205          24241           3394          22579            368 
## Off_Road_Other        Unknown            Van 
##             30           7036           1888

6. Remove unused source variables

names(dat2)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "BODY_TYPNAME"   "RUR_URBNAME"   
## [17] "FUNC_SYSNAME"   "PBAGENAME"      "PBSEXNAME"      "PEDCTYPENAME"  
## [21] "PEDPOSNAME"     "MOTMANNAME"     "PEDLEGNAME"     "PEDSNRNAME"    
## [25] "PEDCGPNAME"     "INJ_SEVNAMEDr"  "AGENAMEDr"      "SEXNAMEDr"     
## [29] "DRINKINGNAMEDr" "HAZ_INVNAME_2"  "BUS_USENAME_2"  "EMER_USENAME_2"
## [33] "TRAV_SPNAME_2"  "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2"
## [37] "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"  
## [41] "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2"
## [45] "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"   "CDL_STATNAME_2"
## [49] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME"  
## [53] "age_num"        "Ped_Age"        "age_num1"       "Dr_Age"        
## [57] "Day_Type"       "Veh_Type"
dat2 <- dat2[, -c(18, 27, 52, 53, 55, 15)]

dim(dat2)
## [1] 60741    52
names(dat2)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "RUR_URBNAME"    "FUNC_SYSNAME"  
## [17] "PBSEXNAME"      "PEDCTYPENAME"   "PEDPOSNAME"     "MOTMANNAME"    
## [21] "PEDLEGNAME"     "PEDSNRNAME"     "PEDCGPNAME"     "INJ_SEVNAMEDr" 
## [25] "SEXNAMEDr"      "DRINKINGNAMEDr" "HAZ_INVNAME_2"  "BUS_USENAME_2" 
## [29] "EMER_USENAME_2" "TRAV_SPNAME_2"  "UNDERIDENAME_2" "ROLLOVERNAME_2"
## [33] "DEFORMEDNAME_2" "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2"
## [37] "VALIGNNAME_2"   "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2"
## [41] "P_CRASH1NAME_2" "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"  
## [45] "CDL_STATNAME_2" "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2"
## [49] "Ped_Age"        "Dr_Age"         "Day_Type"       "Veh_Type"

7. Create posted speed limit group

dat2 <- dat2 %>%
  mutate(
    spd_num = as.numeric(str_extract(VSPD_LIMNAME_2, "\\d+")),
    PSL = case_when(
      spd_num <= 25 ~ "25_or_less",
      spd_num <= 35 ~ "30_35",
      spd_num <= 45 ~ "40_45",
      spd_num <= 55 ~ "50_55",
      spd_num >= 60 ~ "60_plus",
      TRUE ~ "Unknown"
    )
  )

table(dat2$PSL, useNA = "ifany")
## 
## 25_or_less      30_35      40_45      50_55    60_plus    Unknown 
##       5427      14474      18063       9752       8789       4236
dat3 <- dat2[, -c(53)]

dim(dat3)
## [1] 60741    53
names(dat3)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "RUR_URBNAME"    "FUNC_SYSNAME"  
## [17] "PBSEXNAME"      "PEDCTYPENAME"   "PEDPOSNAME"     "MOTMANNAME"    
## [21] "PEDLEGNAME"     "PEDSNRNAME"     "PEDCGPNAME"     "INJ_SEVNAMEDr" 
## [25] "SEXNAMEDr"      "DRINKINGNAMEDr" "HAZ_INVNAME_2"  "BUS_USENAME_2" 
## [29] "EMER_USENAME_2" "TRAV_SPNAME_2"  "UNDERIDENAME_2" "ROLLOVERNAME_2"
## [33] "DEFORMEDNAME_2" "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2"
## [37] "VALIGNNAME_2"   "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2"
## [41] "P_CRASH1NAME_2" "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"  
## [45] "CDL_STATNAME_2" "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2"
## [49] "Ped_Age"        "Dr_Age"         "Day_Type"       "Veh_Type"      
## [53] "PSL"
str(dat3)
## tibble [60,741 × 53] (S3: tbl_df/tbl/data.frame)
##  $ STATENAME     : chr [1:60741] "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ YEAR          : num [1:60741] 2016 2016 2016 2016 2016 ...
##  $ CRASH_NUM1    : chr [1:60741] "2016_10013" "2016_10024" "2016_10036" "2016_10039" ...
##  $ WORK_INJNAME  : chr [1:60741] "Yes" "No" "No" "No" ...
##  $ LATITUDENAME  : chr [1:60741] "33.96818889" "32.422077780000002" "33.166269440000001" "33.997655559999998" ...
##  $ LONGITUDNAME  : chr [1:60741] "-86.120808330000003" "-87.033072219999994" "-87.525480560000005" "-86.934538889999999" ...
##  $ VE_TOTAL      : num [1:60741] 1 2 1 1 1 1 1 1 1 2 ...
##  $ FATALS        : num [1:60741] 1 1 1 1 1 1 1 1 1 1 ...
##  $ DRUNK_DR      : num [1:60741] 0 0 0 0 0 0 0 0 0 0 ...
##  $ HIT_RUNNAME_2 : chr [1:60741] "No" "No" "No" "No" ...
##  $ WRK_ZONENAME  : chr [1:60741] "Construction" "None" "None" "None" ...
##  $ REL_ROADNAME  : chr [1:60741] "On Shoulder" "Outside Trafficway" "On Roadway" "On Roadway" ...
##  $ WEATHER1NAME  : chr [1:60741] "Cloudy" "Clear" "Clear" "Clear" ...
##  $ LGT_CONDNAME  : chr [1:60741] "Dusk" "Daylight" "Dark - Lighted" "Daylight" ...
##  $ RUR_URBNAME   : chr [1:60741] "Rural" "Rural" "Urban" "Rural" ...
##  $ FUNC_SYSNAME  : chr [1:60741] "Interstate" "Local" "Principal Arterial - Other" "Local" ...
##  $ PBSEXNAME     : chr [1:60741] "Male" "Male" "Female" "Male" ...
##  $ PEDCTYPENAME  : chr [1:60741] "Working in Roadway" "Motor Vehicle Loss of Control" "Pedestrian Failed to Yield" "Mailbox-Related" ...
##  $ PEDPOSNAME    : chr [1:60741] "Paved Shoulder / Bicycle Lane / Parking Lane" "Non-Trafficway - Parking Lot/Other" "Travel Lane" "Travel Lane" ...
##  $ MOTMANNAME    : chr [1:60741] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ PEDLEGNAME    : chr [1:60741] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ PEDSNRNAME    : chr [1:60741] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ PEDCGPNAME    : chr [1:60741] "Working or Playing in Roadway" "Unusual Circumstances" "Crossing Roadway - Vehicle Not Turning" "Unique Midblock" ...
##  $ INJ_SEVNAMEDr : chr [1:60741] "No Apparent Injury (O)" "Suspected Serious Injury(A)" "No Apparent Injury (O)" "No Apparent Injury (O)" ...
##  $ SEXNAMEDr     : chr [1:60741] "Male" "Female" "Male" "Female" ...
##  $ DRINKINGNAMEDr: chr [1:60741] "No (Alcohol Not Involved)" "No (Alcohol Not Involved)" "No (Alcohol Not Involved)" "No (Alcohol Not Involved)" ...
##  $ HAZ_INVNAME_2 : chr [1:60741] "No" "No" "No" "No" ...
##  $ BUS_USENAME_2 : chr [1:60741] "Not a Bus" "Not a Bus" "Not a Bus" "Not a Bus" ...
##  $ EMER_USENAME_2: chr [1:60741] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ TRAV_SPNAME_2 : chr [1:60741] "070 MPH" "Unknown" "050 MPH" "050 MPH" ...
##  $ UNDERIDENAME_2: chr [1:60741] "No Underride or Override Noted" "No Underride or Override Noted" "No Underride or Override Noted" "No Underride or Override Noted" ...
##  $ ROLLOVERNAME_2: chr [1:60741] "No Rollover" "No Rollover" "No Rollover" "No Rollover" ...
##  $ DEFORMEDNAME_2: chr [1:60741] "Disabling Damage" "Disabling Damage" "Minor Damage" "Functional Damage" ...
##  $ VTRAFWAYNAME_2: chr [1:60741] "Two-Way,  Divided, Unprotected (Painted > 4 Feet ) Median" "Two-Way, Not Divided" "Two-Way,  Divided, Unprotected (Painted > 4 Feet ) Median" "Two-Way, Not Divided" ...
##  $ VNUM_LANNAME_2: chr [1:60741] "Two lanes" "Two lanes" "Two lanes" "Two lanes" ...
##  $ VSPD_LIMNAME_2: chr [1:60741] "55 MPH" "15 MPH" "45 MPH" "45 MPH" ...
##  $ VALIGNNAME_2  : chr [1:60741] "Straight" "Straight" "Straight" "Straight" ...
##  $ VPROFILENAME_2: chr [1:60741] "Level" "Downhill" "Downhill" "Hillcrest" ...
##  $ VPAVETYPNAME_2: chr [1:60741] "Blacktop, Bituminous, or Asphalt" "Blacktop, Bituminous, or Asphalt" "Blacktop, Bituminous, or Asphalt" "Blacktop, Bituminous, or Asphalt" ...
##  $ VSURCONDNAME_2: chr [1:60741] "Dry" "Dry" "Dry" "Dry" ...
##  $ P_CRASH1NAME_2: chr [1:60741] "Going Straight" "Going Straight" "Going Straight" "Going Straight" ...
##  $ ACC_TYPENAME_2: chr [1:60741] "C13-Single Driver-Forward Impact-Pedestrian/ Animal" "A2-Single Driver-Right Roadside Departure-Control/Traction Loss" "C13-Single Driver-Forward Impact-Pedestrian/ Animal" "C13-Single Driver-Forward Impact-Pedestrian/ Animal" ...
##  $ L_STATUSNAME_2: chr [1:60741] "Valid" "Valid" "Valid" "Suspended" ...
##  $ L_TYPENAME_2  : chr [1:60741] "Full Driver License" "Full Driver License" "Full Driver License" "Full Driver License" ...
##  $ CDL_STATNAME_2: chr [1:60741] "No (CDL)" "No (CDL)" "No (CDL)" "No (CDL)" ...
##  $ MDRDSTRDNAME_2: chr [1:60741] "Unknown if Distracted" "Unknown if Distracted" "Not Distracted" "Not Distracted" ...
##  $ DRIMPAIRNAME_2: chr [1:60741] "Under the Influence of Alcohol, Drugs or Medication" "Unknown if Impaired" "None/Apparently Normal" "None/Apparently Normal" ...
##  $ MDRMANAVNAME_2: chr [1:60741] "Driver Did Not Maneuver to Avoid" "Not Reported" "Driver Did Not Maneuver to Avoid" "Pedestrian, Pedalcyclist or Other Non-Motorist" ...
##  $ Ped_Age       : chr [1:60741] "35-44" "45-54" "25-34" "75+" ...
##  $ Dr_Age        : chr [1:60741] "65-74" "65-74" "35-44" "25-34" ...
##  $ Day_Type      : chr [1:60741] "Weekday" "Weekday" "Weekend" "Weekday" ...
##  $ Veh_Type      : chr [1:60741] "Car" "Car" "Car" "Van" ...
##  $ PSL           : chr [1:60741] "50_55" "25_or_less" "40_45" "40_45" ...

8. Standardize missing and unknown categories

dat3 <- dat3 %>%
  mutate(
    across(
      where(~ is.character(.) | is.factor(.)),
      ~ case_when(
        is.na(.) ~ "Not Reported",
        str_trim(as.character(.)) %in% c(
          "NA", "N/A", "Unknown", "Reported as Unknown",
          " Reported as Unknown",
          "Reported as Unknown if Distracted ",
          "Reported as Unknown if Impaired",
          "Unknown body type", "Not Reported"
        ) ~ "Not Reported",
        str_detect(as.character(.), regex("^unknown|not reported$", ignore_case = TRUE)) ~ "Not Reported",
        TRUE ~ as.character(.)
      )
    )
  )

9. Lump variables with more than five categories

keep_cols <- c("CRASH_NUM1", "Ped_Age", "Dr_Age", "Veh_Type", "LATITUDENAME", "LONGITUDNAME", "STATENAME")

dat3_lumped <- dat3 %>%
  mutate(
    across(
      .cols = setdiff(
        names(select(., where(~ is.character(.) | is.factor(.)))),
        keep_cols
      ),
      .fns = ~ if (n_distinct(., na.rm = TRUE) > 5) {
        fct_lump_n(as.factor(.), n = 5, other_level = "Other")
      } else {
        as.factor(.)
      }
    )
  )

dim(dat3_lumped)
## [1] 60741    53
category_summary <- data.frame(
  Variable = names(dat3_lumped),
  Categories = sapply(dat3_lumped, function(x) n_distinct(x, na.rm = FALSE))
) %>%
  arrange(desc(Categories))

datatable(
  category_summary,
  options = list(pageLength = 25),
  caption = "Number of categories after lumping"
)
freq_tables <- lapply(
  dat3_lumped %>% select(-all_of(keep_cols)),
  table,
  useNA = "ifany"
)

freq_tables
## $YEAR
## 
## 2016 2017 2018 2019 2020 2021 2022 2023 2024 
## 6080 6075 6374 6272 6565 7388 7593 7314 7080 
## 
## $WORK_INJNAME
## 
##           No Not Reported     Redacted          Yes 
##        53105         6460          133         1043 
## 
## $VE_TOTAL
## 
##     1     2     3     4     5     6     7     8     9    10    11    12    14 
## 52229  6160  1645   452   155    48    23    16     4     2     1     3     1 
##   130 
##     2 
## 
## $FATALS
## 
##     1     2     3     4     5     6     8    20 
## 58940  1531   188    57     6     9     8     2 
## 
## $DRUNK_DR
## 
##     0     1     2  <NA> 
## 29063  2289    14 29375 
## 
## $HIT_RUNNAME_2
## 
##           No Not Reported          Yes 
##        46422          988        13331 
## 
## $WRK_ZONENAME
## 
##            Construction             Maintenance                    None 
##                     705                     106                   59455 
##                 Utility Work Zone, Type Unknown 
##                      33                     442 
## 
## $REL_ROADNAME
## 
##       Not Reported        On Roadside         On Roadway        On Shoulder 
##                295               1928              55468               1956 
## Outside Trafficway              Other 
##                320                774 
## 
## $WEATHER1NAME
## 
##            Clear           Cloudy Fog, Smog, Smoke     Not Reported 
##            17338             3484              274            37373 
##             Rain            Other 
##             2044              228 
## 
## $LGT_CONDNAME
## 
##     Dark - Lighted Dark - Not Lighted               Dawn           Daylight 
##              23917              21172               1014              12078 
##               Dusk              Other 
##               1186               1374 
## 
## $RUR_URBNAME
## 
##                      Not Reported                             Rural 
##                               163                             10237 
## Trafficway Not in State Inventory                             Urban 
##                               326                             50015 
## 
## $FUNC_SYSNAME
## 
##                 Interstate                      Local 
##                       7231                       6119 
##            Major Collector             Minor Arterial 
##                       5856                      14137 
## Principal Arterial - Other                      Other 
##                      17820                       9578 
## 
## $PBSEXNAME
## 
##       Female         Male Not Reported 
##        17933        42482          326 
## 
## $PEDCTYPENAME
## 
##                                   Crossing an Expressway 
##                                                     3168 
##                                 Disabled Vehicle-Related 
##                                                     2693 
##                    Not At Intersection - Other / Unknown 
##                                                     5356 
##                               Pedestrian Failed to Yield 
##                                                    19768 
## Walking/Running Along Roadway With Traffic - From Behind 
##                                                     4768 
##                                                    Other 
##                                                    24988 
## 
## $PEDPOSNAME
## 
##                               Crosswalk Area 
##                                         7700 
##                            Intersection Area 
##                                         2518 
##                                Other/Unknown 
##                                         1802 
## Paved Shoulder / Bicycle Lane / Parking Lane 
##                                         2543 
##                                  Travel Lane 
##                                        43644 
##                                        Other 
##                                         2534 
## 
## $MOTMANNAME
## 
##        Left Turn   Not Applicable     Not Reported       Right Turn 
##             1983            45795              359              798 
## Straight through 
##            11806 
## 
## $PEDLEGNAME
## 
##        Farside       Nearside Not Applicable   Not Reported 
##           7608           5600          45836           1697 
## 
## $PEDSNRNAME
## 
##                                    Motorist traveling straight through - Crash Occurred on Far Side of Intersection / Pedestrian outside crosswalk area, other 
##                                                                                                                                                            456 
##             Motorist traveling straight through - Crash Occurred on Far Side of Intersection / Pedestrian within crosswalk area, traveled from motorist’s left 
##                                                                                                                                                            365 
## Motorist traveling straight through - Crash Occurred on Near (Approach) Side of Intersection / Pedestrian within crosswalk area, traveled from motorist’s left 
##                                                                                                                                                            364 
##                                                                                                                                                 Not Applicable 
##                                                                                                                                                          46065 
##                                                                                                                                                   Not Reported 
##                                                                                                                                                           1555 
##                                                                                                                                                          Other 
##                                                                                                                                                          11936 
## 
## $PEDCGPNAME
## 
##        Crossing Roadway - Vehicle Not Turning 
##                                         22317 
##        Other / Unknown - Insufficient Details 
##                                          7457 
## Pedestrian in Roadway - Circumstances Unknown 
##                                          4374 
##                         Unusual Circumstances 
##                                          5886 
##                 Walking/Running Along Roadway 
##                                          8272 
##                                         Other 
##                                         12435 
## 
## $INJ_SEVNAMEDr
## 
##       No Apparent Injury (O)                 Not Reported 
##                        48413                         6826 
##          Possible Injury (C)   Suspected Minor Injury (B) 
##                         2473                         1943 
## Suspected Serious Injury (A)                        Other 
##                          607                          479 
## 
## $SEXNAMEDr
## 
##       Female         Male Not Reported 
##        14830        37170         8741 
## 
## $DRINKINGNAMEDr
## 
## No (Alcohol Not Involved)              Not Reported    Yes (Alcohol Involved) 
##                     35594                     21161                      3986 
## 
## $HAZ_INVNAME_2
## 
##           No Not Reported          Yes 
##        59732          936           73 
## 
## $BUS_USENAME_2
## 
##      Charter/Tour         Not a Bus      Not Reported            School 
##                39             54506              5804               104 
## Transit/ Commuter             Other 
##               241                47 
## 
## $EMER_USENAME_2
## 
##         Emergency Operation, Emergency Warning Equipment in Use 
##                                                              57 
## Emergency Operation, Emergency Warning Equipment in Use Unknown 
##                                                              43 
##                                    Non-Emergency, Non-Transport 
##                                                              53 
##                                                  Not Applicable 
##                                                           58212 
##                                                    Not Reported 
##                                                            2351 
##                                                           Other 
##                                                              25 
## 
## $TRAV_SPNAME_2
## 
##      035 MPH      040 MPH      045 MPH      055 MPH Not Reported        Other 
##         2333         2527         3414         1903        38258        12306 
## 
## $UNDERIDENAME_2
## 
##                                                         No Underride or Override Noted 
##                                                                                  30840 
##                                                                           Not Reported 
##                                                                                  29877 
##                                                Overriding a Motor Vehicle In-Transport 
##                                                                                      4 
##                                            Overriding a Motor Vehicle Not In-Transport 
##                                                                                      7 
##     Underriding a Motor Vehicle In-Transport, Underride, Compartment Intrusion Unknown 
##                                                                                      3 
## Underriding a Motor Vehicle Not In-Transport, Underride, Compartment Intrusion Unknown 
##                                                                                      3 
##                                                                                  Other 
##                                                                                      7 
## 
## $ROLLOVERNAME_2
## 
##                         No Rollover                      Not Applicable 
##                               59261                                 125 
##                        Not Reported                            Rollover 
##                                 936                                 155 
## Rollover, Tripped by Object/Vehicle                               Other 
##                                 227                                  37 
## 
## $DEFORMEDNAME_2
## 
## Damage Reported, Extent Unknown                Disabling Damage 
##                            4028                           13269 
##               Functional Damage                    Minor Damage 
##                           15432                           12081 
##                    Not Reported                           Other 
##                           12483                            3448 
## 
## $VTRAFWAYNAME_2
## 
##                                          Not Reported 
##                                                  1599 
##           Two-Way,  Divided, Positive  Median Barrier 
##                                                  9525 
##                  Two-Way, Divided, Unprotected Median 
##                                                 13168 
##                                  Two-Way, Not Divided 
##                                                 24466 
## Two-Way, Not Divided With a Continuous Left-Turn Lane 
##                                                  7725 
##                                                 Other 
##                                                  4258 
## 
## $VNUM_LANNAME_2
## 
##   Five lanes   Four lanes Not Reported  Three lanes    Two lanes        Other 
##         8514         9016         2029        10293        26179         4710 
## 
## $VSPD_LIMNAME_2
## 
## 25 MPH 35 MPH 40 MPH 45 MPH 55 MPH  Other 
##   4874   9993   7268  10795   7212  20599 
## 
## $VALIGNNAME_2
## 
## Curve - Left   Curve Left  Curve Right Not Reported     Straight        Other 
##          708          927          944         2682        53922         1558 
## 
## $VPROFILENAME_2
## 
##             Downhill Grade, Unknown Slope                Level 
##                 1837                 3695                44259 
##         Not Reported               Uphill                Other 
##                 7903                 1580                 1467 
## 
## $VPAVETYPNAME_2
## 
##  Blacktop, Bituminous, or Asphalt                          Concrete 
##                             36107                              4603 
## Non-Trafficway or Driveway Access                      Not Reported 
##                               457                             19416 
##             Slag, Gravel or Stone                             Other 
##                                69                                89 
## 
## $VSURCONDNAME_2
## 
##                               Dry                         Ice/Frost 
##                             50381                               236 
## Non-Trafficway or Driveway Access                      Not Reported 
##                               457                              1828 
##                               Wet                             Other 
##                              7490                               349 
## 
## $P_CRASH1NAME_2
## 
##      Going Straight Negotiating a Curve        Not Reported        Turning Left 
##               48441                3146                2633                2355 
##       Turning Right               Other 
##                 982                3184 
## 
## $ACC_TYPENAME_2
## 
## A1-Single Driver-Right Roadside Departure-Drive Off Road 
##                                                     1004 
##  B6-Single Driver-Left Roadside Departure-Drive Off Road 
##                                                      407 
##      C13-Single Driver-Forward Impact-Pedestrian/ Animal 
##                                                    47624 
##                                     M98-Other Crash Type 
##                                                      884 
##                                             Not Reported 
##                                                     8136 
##                                                    Other 
##                                                     2686 
## 
## $L_STATUSNAME_2
## 
##      Expired Not licensed Not Reported    Suspended        Valid        Other 
##          636         2501         9226         2399        45437          542 
## 
## $L_TYPENAME_2
## 
##         Full Driver License Intermediate Driver License 
##                       47828                         721 
##            Learner's Permit                Not Licensed 
##                         213                        2501 
##                Not Reported                       Other 
##                        9218                         260 
## 
## $CDL_STATNAME_2
## 
## Disqualified     No (CDL) Not Reported    Suspended        Valid        Other 
##          149        45875         9248          131         5014          324 
## 
## $MDRDSTRDNAME_2
## 
##                    Distraction/Inattention 
##                                        276 
## Inattention (Inattentive), Details Unknown 
##                                        533 
##                             Not Distracted 
##                                      11548 
##                               Not Reported 
##                                      45743 
##          Reported as Unknown if Distracted 
##                                       1372 
##                                      Other 
##                                       1269 
## 
## $DRIMPAIRNAME_2
## 
##                                  Asleep or Fatigued 
##                                                 305 
##                                       Ill, Blackout 
##                                                 253 
##                              None/Apparently Normal 
##                                               27034 
##                                        Not Reported 
##                                               28584 
## Under the Influence of Alcohol, Drugs or Medication 
##                                                3971 
##                                               Other 
##                                                 594 
## 
## $MDRMANAVNAME_2
## 
##               Driver Did Not Maneuver to Avoid 
##                                           7082 
##                                  Motor Vehicle 
##                                            160 
##                                   Not Reported 
##                                          50283 
## Pedestrian, Pedalcyclist or Other Non-Motorist 
##                                           2952 
##              Phantom/Non-Contact Motor Vehicle 
##                                            179 
##                                          Other 
##                                             85 
## 
## $Day_Type
## 
## Weekday Weekend 
##   42325   18416 
## 
## $PSL
## 
## 25_or_less      30_35      40_45      50_55    60_plus      Other 
##       5427      14474      18063       9752       8789       4236

10. Shorten variable names

keep_names <- c("CRASH_NUM1")

old_names <- names(dat3_lumped)

new_names <- ifelse(
  old_names %in% keep_names,
  old_names,
  str_to_title(str_sub(str_replace_all(old_names, "_", ""), 1, 4))
)

new_names <- make.unique(new_names, sep = "")

name_key <- data.frame(
  old_name = old_names,
  new_name = new_names
)

names(dat3_lumped) <- new_names

names(dat3_lumped)
##  [1] "Stat"       "Year"       "CRASH_NUM1" "Work"       "Lati"      
##  [6] "Long"       "Veto"       "Fata"       "Drun"       "Hitr"      
## [11] "Wrkz"       "Relr"       "Weat"       "Lgtc"       "Ruru"      
## [16] "Func"       "Pbse"       "Pedc"       "Pedp"       "Motm"      
## [21] "Pedl"       "Peds"       "Pedc1"      "Injs"       "Sexn"      
## [26] "Drin"       "Hazi"       "Busu"       "Emer"       "Trav"      
## [31] "Unde"       "Roll"       "Defo"       "Vtra"       "Vnum"      
## [36] "Vspd"       "Vali"       "Vpro"       "Vpav"       "Vsur"      
## [41] "Pcra"       "Acct"       "Lsta"       "Ltyp"       "Cdls"      
## [46] "Mdrd"       "Drim"       "Mdrm"       "Peda"       "Drag"      
## [51] "Dayt"       "Veht"       "Psl"
datatable(
  name_key,
  options = list(pageLength = 25, scrollX = TRUE),
  caption = "Variable name key"
)

11. Final data preview

datatable(
  head(dat3_lumped, 50),
  options = list(scrollX = TRUE, pageLength = 10),
  caption = "Final processed dataset"
)

12. Export CSV and XLSX files

write.csv(dat3_lumped, "Ped01.csv", row.names = FALSE)
write_xlsx(dat3_lumped, "Ped01.xlsx")

write.csv(name_key, "Ped01_variable_name_key.csv", row.names = FALSE)
write_xlsx(name_key, "Ped01_variable_name_key.xlsx")

write.csv(category_summary, "Ped01_category_summary.csv", row.names = FALSE)
write_xlsx(category_summary, "Ped01_category_summary.xlsx")

13. Output files

The following files are created in the working directory:

  • Ped01.csv
  • Ped01.xlsx
  • Ped01_variable_name_key.csv
  • Ped01_variable_name_key.xlsx
  • Ped01_category_summary.csv
  • Ped01_category_summary.xlsx