library(readxl)
library(dplyr)
library(stringr)
library(forcats)
library(DT)
library(writexl)
library(tidyr)

1. File path and data import

data_dir <- "D:/DAS_CODE/Tools/data"
setwd(data_dir)

dat <- read_excel("Ped_Bike_Esc_2016_2024_All.xlsx", sheet = "Ped_Lim")

dim(dat)
## [1] 60741    81
names(dat)
##  [1] "STATENAME"         "YEAR"              "CRASH_NUM1"       
##  [4] "CRASH_VEH_NUM1"    "CRASH_VehPer_NUM1" "PER_TYPNAME"      
##  [7] "INJ_SEVNAME"       "LATITUDENAME"      "LONGITUDNAME"     
## [10] "VE_TOTAL"          "FATALS"            "DRUNK_DR"         
## [13] "HIT_RUNNAME_2"     "WRK_ZONENAME"      "REL_ROADNAME"     
## [16] "WEATHER1NAME"      "LGT_CONDNAME"      "BODY_TYPNAME"     
## [19] "RUR_URBNAME"       "FUNC_SYSNAME"      "PBAGENAME"        
## [22] "PBSEXNAME"         "PEDCTYPENAME"      "PEDPOSNAME"       
## [25] "MOTMANNAME"        "PEDLEGNAME"        "PEDSNRNAME"       
## [28] "PEDCGPNAME"        "INJ_SEVNAMEDr"     "AGENAMEDr"        
## [31] "SEXNAMEDr"         "DRINKINGNAMEDr"    "ROUTENAME"        
## [34] "TYP_INTNAME"       "DRINKINGNAME"      "ALC_RESNAME"      
## [37] "DRUGSNAME"         "DRUGRES1NAME"      "WORK_INJNAME"     
## [40] "RACENAME"          "LOCATIONNAME"      "NUMOCCS_2"        
## [43] "MAKENAME_2"        "MAK_MODNAME_2"     "MOD_YEAR_2"       
## [46] "HAZ_INVNAME_2"     "BUS_USENAME_2"     "EMER_USENAME_2"   
## [49] "TRAV_SPNAME_2"     "UNDERIDENAME_2"    "ROLLOVERNAME_2"   
## [52] "DEFORMEDNAME_2"    "VTRAFWAYNAME_2"    "VNUM_LANNAME_2"   
## [55] "VSPD_LIMNAME_2"    "VALIGNNAME_2"      "VPROFILENAME_2"   
## [58] "VPAVETYPNAME_2"    "VSURCONDNAME_2"    "P_CRASH1NAME_2"   
## [61] "ACC_TYPENAME_2"    "L_STATUSNAME_2"    "L_TYPENAME_2"     
## [64] "CDL_STATNAME_2"    "MDRDSTRDNAME_2"    "DRIMPAIRNAME_2"   
## [67] "MDRMANAVNAME_2"    "MVISOBSCNAME_2"    "INJ_SEVNAME_2"    
## [70] "ALC_RESNAME_2"     "DRUGSNAME_2"       "COUNTYNAME"       
## [73] "CITYNAME"          "DAY"               "MONTHNAME"        
## [76] "DAY_WEEKNAME"      "HOUR"              "HOURNAME"         
## [79] "MAN_COLLNAME"      "WEATHERNAME"       "SCH_BUSNAME"

2. Select analysis variables

dat1 <- dat[, c(1, 2, 3, 39, 8:32, 46:67, 76)]

dim(dat1)
## [1] 60741    52
names(dat1)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "BODY_TYPNAME"   "RUR_URBNAME"   
## [17] "FUNC_SYSNAME"   "PBAGENAME"      "PBSEXNAME"      "PEDCTYPENAME"  
## [21] "PEDPOSNAME"     "MOTMANNAME"     "PEDLEGNAME"     "PEDSNRNAME"    
## [25] "PEDCGPNAME"     "INJ_SEVNAMEDr"  "AGENAMEDr"      "SEXNAMEDr"     
## [29] "DRINKINGNAMEDr" "HAZ_INVNAME_2"  "BUS_USENAME_2"  "EMER_USENAME_2"
## [33] "TRAV_SPNAME_2"  "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2"
## [37] "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"  
## [41] "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2"
## [45] "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"   "CDL_STATNAME_2"
## [49] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME"
datatable(
  head(dat1, 25),
  options = list(scrollX = TRUE, pageLength = 10),
  caption = "Selected variables"
)

3. Create pedestrian and driver age groups

dat1 <- dat1 %>%
  mutate(
    age_num = as.numeric(str_extract(PBAGENAME, "\\d+")),
    Ped_Age = case_when(
      age_num <= 4  ~ "0-4",
      age_num <= 9  ~ "5-9",
      age_num <= 14 ~ "10-14",
      age_num <= 20 ~ "15-20",
      age_num <= 24 ~ "21-24",
      age_num <= 34 ~ "25-34",
      age_num <= 44 ~ "35-44",
      age_num <= 54 ~ "45-54",
      age_num <= 64 ~ "55-64",
      age_num <= 74 ~ "65-74",
      age_num >= 75 ~ "75+",
      TRUE ~ "Unknown"
    ),
    age_num1 = as.numeric(str_extract(AGENAMEDr, "\\d+")),
    Dr_Age = case_when(
      age_num1 <= 4  ~ "0-4",
      age_num1 <= 9  ~ "5-9",
      age_num1 <= 14 ~ "10-14",
      age_num1 <= 20 ~ "15-20",
      age_num1 <= 24 ~ "21-24",
      age_num1 <= 34 ~ "25-34",
      age_num1 <= 44 ~ "35-44",
      age_num1 <= 54 ~ "45-54",
      age_num1 <= 64 ~ "55-64",
      age_num1 <= 74 ~ "65-74",
      age_num1 >= 75 ~ "75+",
      TRUE ~ "Unknown"
    )
  )

table(dat1$Ped_Age, useNA = "ifany")
## 
##     0-4   10-14   15-20   21-24   25-34   35-44   45-54     5-9   55-64   65-74 
##     552     667    2581    3077    9797   10098    9599     468   11021    7015 
##     75+ Unknown 
##    5130     736
table(dat1$Dr_Age, useNA = "ifany")
## 
##     0-4   10-14   15-20   21-24   25-34   35-44   45-54     5-9   55-64   65-74 
##       8      33    4178    5333   11846    9211    8105       9    7215    3924 
##     75+ Unknown 
##    1886    8993

4. Create weekday/weekend variable

dat1 <- dat1 %>%
  mutate(
    Day_Type = case_when(
      DAY_WEEKNAME %in% c("Saturday", "Sunday") ~ "Weekend",
      !is.na(DAY_WEEKNAME) ~ "Weekday",
      TRUE ~ "Unknown"
    )
  )

table(dat1$Day_Type, useNA = "ifany")
## 
## Weekday Weekend 
##   42325   18416

5. Create major vehicle type

dat2 <- dat1 %>%
  mutate(
    Veh_Type = case_when(
      str_detect(BODY_TYPNAME, regex("sedan|coupe|hatchback|convertible|station wagon|automobile", ignore_case = TRUE)) ~ "Car",
      str_detect(BODY_TYPNAME, regex("pickup|utility|light truck|cab chassis based", ignore_case = TRUE)) ~ "Light_Truck",
      str_detect(BODY_TYPNAME, regex("truck-tractor|single-unit|heavy truck|medium/heavy|construction equipment|farm equipment|motorhome", ignore_case = TRUE)) ~ "Large_Truck",
      str_detect(BODY_TYPNAME, regex("bus", ignore_case = TRUE)) ~ "Bus",
      str_detect(BODY_TYPNAME, regex("van|minivan|step-van|walk-in van", ignore_case = TRUE)) ~ "Van",
      str_detect(BODY_TYPNAME, regex("motorcycle|moped|scooter|motored cycle", ignore_case = TRUE)) ~ "Motorcycle",
      str_detect(BODY_TYPNAME, regex("ATV|ATC|golf cart|off-highway|recreational|three-wheel|go-cart|fork-lift|street sweeper", ignore_case = TRUE)) ~ "Off_Road_Other",
      is.na(BODY_TYPNAME) |
        BODY_TYPNAME %in% c("NA", "Not Reported", "Unknown body type") |
        str_detect(BODY_TYPNAME, regex("unknown|not reported", ignore_case = TRUE)) ~ "Unknown",
      TRUE ~ "Off_Road_Other"
    )
  )

table(dat2$Veh_Type, useNA = "ifany")
## 
##            Bus            Car    Large_Truck    Light_Truck     Motorcycle 
##           1205          24241           3394          22579            368 
## Off_Road_Other        Unknown            Van 
##             30           7036           1888

6. Remove unused source variables

names(dat2)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "BODY_TYPNAME"   "RUR_URBNAME"   
## [17] "FUNC_SYSNAME"   "PBAGENAME"      "PBSEXNAME"      "PEDCTYPENAME"  
## [21] "PEDPOSNAME"     "MOTMANNAME"     "PEDLEGNAME"     "PEDSNRNAME"    
## [25] "PEDCGPNAME"     "INJ_SEVNAMEDr"  "AGENAMEDr"      "SEXNAMEDr"     
## [29] "DRINKINGNAMEDr" "HAZ_INVNAME_2"  "BUS_USENAME_2"  "EMER_USENAME_2"
## [33] "TRAV_SPNAME_2"  "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2"
## [37] "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"  
## [41] "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2"
## [45] "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"   "CDL_STATNAME_2"
## [49] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME"  
## [53] "age_num"        "Ped_Age"        "age_num1"       "Dr_Age"        
## [57] "Day_Type"       "Veh_Type"
dat2 <- dat2[, -c(18, 27, 52, 53, 55, 15)]

dim(dat2)
## [1] 60741    52
names(dat2)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "RUR_URBNAME"    "FUNC_SYSNAME"  
## [17] "PBSEXNAME"      "PEDCTYPENAME"   "PEDPOSNAME"     "MOTMANNAME"    
## [21] "PEDLEGNAME"     "PEDSNRNAME"     "PEDCGPNAME"     "INJ_SEVNAMEDr" 
## [25] "SEXNAMEDr"      "DRINKINGNAMEDr" "HAZ_INVNAME_2"  "BUS_USENAME_2" 
## [29] "EMER_USENAME_2" "TRAV_SPNAME_2"  "UNDERIDENAME_2" "ROLLOVERNAME_2"
## [33] "DEFORMEDNAME_2" "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2"
## [37] "VALIGNNAME_2"   "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2"
## [41] "P_CRASH1NAME_2" "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"  
## [45] "CDL_STATNAME_2" "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2"
## [49] "Ped_Age"        "Dr_Age"         "Day_Type"       "Veh_Type"

7. Create posted speed limit group

dat2 <- dat2 %>%
  mutate(
    spd_num = as.numeric(str_extract(VSPD_LIMNAME_2, "\\d+")),
    PSL = case_when(
      spd_num <= 25 ~ "25_or_less",
      spd_num <= 35 ~ "30_35",
      spd_num <= 45 ~ "40_45",
      spd_num <= 55 ~ "50_55",
      spd_num >= 60 ~ "60_plus",
      TRUE ~ "Unknown"
    )
  )

table(dat2$PSL, useNA = "ifany")
## 
## 25_or_less      30_35      40_45      50_55    60_plus    Unknown 
##       5427      14474      18063       9752       8789       4236
dat3 <- dat2[, -c(53)]

dim(dat3)
## [1] 60741    53

8. Standardize missing and unknown categories

dat3 <- dat3 %>%
  mutate(
    across(
      where(~ is.character(.) | is.factor(.)),
      ~ case_when(
        is.na(.) ~ "Not Reported",
        str_trim(as.character(.)) %in% c(
          "NA", "N/A", "Unknown", "Reported as Unknown",
          " Reported as Unknown",
          "Reported as Unknown if Distracted ",
          "Reported as Unknown if Impaired",
          "Unknown body type", "Not Reported"
        ) ~ "Not Reported",
        str_detect(as.character(.), regex("^unknown|not reported$", ignore_case = TRUE)) ~ "Not Reported",
        TRUE ~ as.character(.)
      )
    )
  )

cat_summary <- dat3 %>%
  summarise(across(
    where(~ is.character(.) | is.factor(.)),
    ~ n_distinct(., na.rm = TRUE)
  )) %>%
  pivot_longer(
    everything(),
    names_to = "variable",
    values_to = "n_categories"
  ) %>%
  arrange(desc(n_categories))

cat_summary
## # A tibble: 49 × 2
##    variable       n_categories
##    <chr>                 <int>
##  1 CRASH_NUM1            59912
##  2 LONGITUDNAME          59162
##  3 LATITUDENAME          58976
##  4 PEDSNRNAME              179
##  5 TRAV_SPNAME_2           123
##  6 ACC_TYPENAME_2           76
##  7 PEDCTYPENAME             60
##  8 STATENAME                51
##  9 MDRDSTRDNAME_2           27
## 10 PEDCGPNAME               21
## # ℹ 39 more rows
datatable(
  cat_summary,
  options = list(pageLength = 25),
  caption = "Number of categories after lumping"
)

9. PBCAT vairable recategorization

#### PEDCTYPENAME (From 60 to 12 Categories)

dat3 <- dat3 %>%
  mutate(
    ped_type_clean = PEDCTYPENAME %>%
      str_replace_all("–|–", "-") %>%
      str_squish(),
    
    PedCTyp = case_when(
      
      str_detect(ped_type_clean, regex("Pedestrian Failed|Dart|Dash", ignore_case = TRUE)) ~
        "Pedestrian Crossing / Dart-Out",
      
      str_detect(ped_type_clean, regex("Crossing an Expressway|Waiting to Cross", ignore_case = TRUE)) ~
        "Expressway / Waiting to Cross",
      
      str_detect(ped_type_clean, regex("Intersection", ignore_case = TRUE)) ~
        "Intersection Related",
      
      str_detect(ped_type_clean, regex("Walking/Running Along Roadway", ignore_case = TRUE)) ~
        "Walking / Running Along Roadway",
      
      str_detect(ped_type_clean, regex("Walking in Roadway|Lying|Standing|Playing|Working|Trapped", ignore_case = TRUE)) ~
        "Pedestrian in Roadway",
      
      str_detect(ped_type_clean, regex("Backing", ignore_case = TRUE)) ~
        "Backing Vehicle",
      
      str_detect(ped_type_clean, regex("Left Turn|Right Turn|Turn / Merge|Right Turn on Red", ignore_case = TRUE)) ~
        "Turning / Merging Vehicle",
      
      str_detect(ped_type_clean, regex("Driveway|Alley", ignore_case = TRUE)) ~
        "Driveway / Alley Conflict",
      
      str_detect(ped_type_clean, regex("Motorist Failed to Yield", ignore_case = TRUE)) ~
        "Motorist Failed to Yield",
      
      str_detect(ped_type_clean, regex("Disabled Vehicle|Driverless Vehicle|Loss of Control|Vehicle-Vehicle|Vehicle into|Object", ignore_case = TRUE)) ~
        "Vehicle / Object / Control Related",
      
      str_detect(ped_type_clean, regex("Bus|Transit|School Bus|Emergency|Vendor|Mailbox|Ice Cream", ignore_case = TRUE)) ~
        "Transit / Service Vehicle Related",
      
      TRUE ~
        "Other / Unknown"
    )
  )

dat3 %>%
  count(PedCTyp, sort = TRUE) %>%
  mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 12 × 3
##    PedCTyp                                n percent
##    <chr>                              <int>   <dbl>
##  1 Pedestrian Crossing / Dart-Out     23756   39.1 
##  2 Walking / Running Along Roadway     8272   13.6 
##  3 Intersection Related                7149   11.8 
##  4 Vehicle / Object / Control Related  5369    8.84
##  5 Pedestrian in Roadway               4869    8.02
##  6 Expressway / Waiting to Cross       3307    5.44
##  7 Turning / Merging Vehicle           2602    4.28
##  8 Motorist Failed to Yield            2549    4.2 
##  9 Other / Unknown                     1598    2.63
## 10 Transit / Service Vehicle Related    516    0.85
## 11 Backing Vehicle                      477    0.79
## 12 Driveway / Alley Conflict            277    0.46
#### PEDCTYPENAME (From 21 to 8 Categories)


dat3 <- dat3 %>%
  mutate(
    ped_cgp_clean = PEDCGPNAME %>%
      str_replace_all("–|–|\\?", "-") %>%
      str_squish(),
    
    PedCgp = case_when(
      
      str_detect(ped_cgp_clean, regex("Crossing Roadway|Crossing Expressway|Waiting to Cross", ignore_case = TRUE)) ~
        "Crossing Roadway / Expressway",
      
      str_detect(ped_cgp_clean, regex("Dash|Dart-Out", ignore_case = TRUE)) ~
        "Dash / Dart-Out",
      
      str_detect(ped_cgp_clean, regex("Walking/Running Along Roadway", ignore_case = TRUE)) ~
        "Walking / Running Along Roadway",
      
      str_detect(ped_cgp_clean, regex("Pedestrian in Roadway|Working or Playing", ignore_case = TRUE)) ~
        "Pedestrian in Roadway",
      
      str_detect(ped_cgp_clean, regex("Backing Vehicle|Driveway", ignore_case = TRUE)) ~
        "Backing / Driveway Related",
      
      str_detect(ped_cgp_clean, regex("Bus|Bus Stop", ignore_case = TRUE)) ~
        "Bus / Bus Stop Related",
      
      str_detect(ped_cgp_clean, regex("Non-Trafficway|Unique Midblock|Multiple Threat|Trapped", ignore_case = TRUE)) ~
        "Non-Trafficway / Midblock / Multiple Threat",
      
      TRUE ~
        "Other / Unknown / Unusual"
    )
  )

dat3 %>%
  count(PedCgp, sort = TRUE) %>%
  mutate(
    percent = round(100 * n / sum(n), 2)
  )
## # A tibble: 8 × 3
##   PedCgp                                          n percent
##   <chr>                                       <int>   <dbl>
## 1 Crossing Roadway / Expressway               28226   46.5 
## 2 Other / Unknown / Unusual                   13343   22.0 
## 3 Walking / Running Along Roadway              8272   13.6 
## 4 Pedestrian in Roadway                        4833    7.96
## 5 Dash / Dart-Out                              3988    6.57
## 6 Non-Trafficway / Midblock / Multiple Threat  1069    1.76
## 7 Backing / Driveway Related                    754    1.24
## 8 Bus / Bus Stop Related                        256    0.42
library(dplyr)
library(stringr)

dat3 <- dat3 %>%
  mutate(
    acc_type_clean = ACC_TYPENAME_2 %>%
      str_replace_all("–|–", "-") %>%
      str_remove("^[A-Z][0-9]+-") %>%   # removes A1-, B6-, C13-, M99-, etc.
      str_squish(),

    AccTyp = case_when(

      # 1-4. Roadside departure
      str_detect(acc_type_clean, regex("Right Roadside Departure-Drive Off Road", ignore_case = TRUE)) ~
        "Right Roadside Departure - Drive Off Road",

      str_detect(acc_type_clean, regex("Right Roadside Departure-Control/Traction Loss", ignore_case = TRUE)) ~
        "Right Roadside Departure - Control Loss",

      str_detect(acc_type_clean, regex("Left Roadside Departure-Drive Off Road", ignore_case = TRUE)) ~
        "Left Roadside Departure - Drive Off Road",

      str_detect(acc_type_clean, regex("Left Roadside Departure-Control/Traction Loss", ignore_case = TRUE)) ~
        "Left Roadside Departure - Control Loss",

      # 5. Other roadside departure
      str_detect(acc_type_clean, regex("Roadside Departure", ignore_case = TRUE)) ~
        "Other Roadside Departure",

      # 6-8. Forward impact
      str_detect(acc_type_clean, regex("Forward Impact-Pedestrian|Pedestrian/ Animal", ignore_case = TRUE)) ~
        "Forward Impact - Pedestrian/Animal",

      str_detect(acc_type_clean, regex("Forward Impact-Parked Veh", ignore_case = TRUE)) ~
        "Forward Impact - Parked Vehicle",

      str_detect(acc_type_clean, regex("Forward Impact-Sta|Sta\\. Object|End Departure", ignore_case = TRUE)) ~
        "Forward Impact - Object/End Departure",

      # 9-11. Rear-end
      str_detect(acc_type_clean, regex("Rear End-Stopped", ignore_case = TRUE)) ~
        "Rear-End - Stopped Vehicle",

      str_detect(acc_type_clean, regex("Rear End-Slower", ignore_case = TRUE)) ~
        "Rear-End - Slower Vehicle",

      str_detect(acc_type_clean, regex("Rear End-Decelerating|Rear End-Specifics", ignore_case = TRUE)) ~
        "Rear-End - Decelerating/Other",

      # 12-13. Same-direction sideswipe/angle
      str_detect(acc_type_clean, regex("Same Direction-Angle, Sideswipe.*Straight", ignore_case = TRUE)) ~
        "Same-Direction Sideswipe - Straight",

      str_detect(acc_type_clean, regex("Same Direction-Angle, Sideswipe.*Changing Lanes|Same Direction-Angle, Sideswipe-Specifics", ignore_case = TRUE)) ~
        "Same-Direction Sideswipe - Lane Change/Other",

      # 14-15. Opposite-direction conflict
      str_detect(acc_type_clean, regex("Opposite Direction-Head-On", ignore_case = TRUE)) ~
        "Opposite-Direction Head-On",

      str_detect(acc_type_clean, regex("Opposite Direction-Angle|Opposite Direction-Forward Impact", ignore_case = TRUE)) ~
        "Opposite-Direction Angle/Sideswipe",

      # 16-17. Turning conflicts
      str_detect(acc_type_clean, regex("Turn Across Path", ignore_case = TRUE)) ~
        "Turning Conflict - Turn Across Path",

      str_detect(acc_type_clean, regex("Turn Into Path", ignore_case = TRUE)) ~
        "Turning Conflict - Turn Into Path",

      # 18. Intersecting paths
      str_detect(acc_type_clean, regex("Intersecting Paths", ignore_case = TRUE)) ~
        "Intersecting-Path Conflict",

      # 19. Backing
      str_detect(acc_type_clean, regex("Backing", ignore_case = TRUE)) ~
        "Backing",

      # 20. Other / unknown
      TRUE ~
        "Other / Unknown / No Impact"
    )
  )

dat3 %>%
  count(AccTyp, sort = TRUE) %>%
  mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 20 × 3
##    AccTyp                                           n percent
##    <chr>                                        <int>   <dbl>
##  1 Forward Impact - Pedestrian/Animal           47624   78.4 
##  2 Other / Unknown / No Impact                   9211   15.2 
##  3 Right Roadside Departure - Drive Off Road     1004    1.65
##  4 Left Roadside Departure - Drive Off Road       407    0.67
##  5 Backing                                        384    0.63
##  6 Forward Impact - Parked Vehicle                356    0.59
##  7 Rear-End - Decelerating/Other                  298    0.49
##  8 Same-Direction Sideswipe - Lane Change/Other   276    0.45
##  9 Other Roadside Departure                       181    0.3 
## 10 Right Roadside Departure - Control Loss        170    0.28
## 11 Intersecting-Path Conflict                     162    0.27
## 12 Rear-End - Stopped Vehicle                     131    0.22
## 13 Turning Conflict - Turn Across Path            103    0.17
## 14 Left Roadside Departure - Control Loss          99    0.16
## 15 Forward Impact - Object/End Departure           86    0.14
## 16 Same-Direction Sideswipe - Straight             58    0.1 
## 17 Rear-End - Slower Vehicle                       57    0.09
## 18 Turning Conflict - Turn Into Path               50    0.08
## 19 Opposite-Direction Angle/Sideswipe              46    0.08
## 20 Opposite-Direction Head-On                      38    0.06
names(dat3)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "RUR_URBNAME"    "FUNC_SYSNAME"  
## [17] "PBSEXNAME"      "PEDCTYPENAME"   "PEDPOSNAME"     "MOTMANNAME"    
## [21] "PEDLEGNAME"     "PEDSNRNAME"     "PEDCGPNAME"     "INJ_SEVNAMEDr" 
## [25] "SEXNAMEDr"      "DRINKINGNAMEDr" "HAZ_INVNAME_2"  "BUS_USENAME_2" 
## [29] "EMER_USENAME_2" "TRAV_SPNAME_2"  "UNDERIDENAME_2" "ROLLOVERNAME_2"
## [33] "DEFORMEDNAME_2" "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2"
## [37] "VALIGNNAME_2"   "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2"
## [41] "P_CRASH1NAME_2" "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"  
## [45] "CDL_STATNAME_2" "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2"
## [49] "Ped_Age"        "Dr_Age"         "Day_Type"       "Veh_Type"      
## [53] "PSL"            "ped_type_clean" "PedCTyp"        "ped_cgp_clean" 
## [57] "PedCgp"         "acc_type_clean" "AccTyp"
dat3 <- dat3[, -c(58, 54, 56)]

dim(dat3)
## [1] 60741    56

9. Lump variables with more than five categories

keep_cols <- c("CRASH_NUM1", "Ped_Age", "Dr_Age", "Veh_Type", "LATITUDENAME", "LONGITUDNAME", "STATENAME", "LGT_CONDNAME", "PedCTyp", "PedCgp","AccTyp")

dat3_lumped <- dat3 %>%
  mutate(
    across(
      .cols = setdiff(
        names(select(., where(~ is.character(.) | is.factor(.)))),
        keep_cols
      ),
      .fns = ~ if (n_distinct(., na.rm = TRUE) > 5) {
        fct_lump_n(as.factor(.), n = 5, other_level = "Other")
      } else {
        as.factor(.)
      }
    )
  )

dim(dat3_lumped)
## [1] 60741    56
category_summary <- data.frame(
  Variable = names(dat3_lumped),
  Categories = sapply(dat3_lumped, function(x) n_distinct(x, na.rm = FALSE))
) %>%
  arrange(desc(Categories))

datatable(
  category_summary,
  options = list(pageLength = 25),
  caption = "Number of categories after lumping"
)
freq_tables <- lapply(
  dat3_lumped %>% select(-all_of(keep_cols)),
  table,
  useNA = "ifany"
)

freq_tables
## $YEAR
## 
## 2016 2017 2018 2019 2020 2021 2022 2023 2024 
## 6080 6075 6374 6272 6565 7388 7593 7314 7080 
## 
## $WORK_INJNAME
## 
##           No Not Reported     Redacted          Yes 
##        53105         6460          133         1043 
## 
## $VE_TOTAL
## 
##     1     2     3     4     5     6     7     8     9    10    11    12    14 
## 52229  6160  1645   452   155    48    23    16     4     2     1     3     1 
##   130 
##     2 
## 
## $FATALS
## 
##     1     2     3     4     5     6     8    20 
## 58940  1531   188    57     6     9     8     2 
## 
## $DRUNK_DR
## 
##     0     1     2  <NA> 
## 29063  2289    14 29375 
## 
## $HIT_RUNNAME_2
## 
##           No Not Reported          Yes 
##        46422          988        13331 
## 
## $WRK_ZONENAME
## 
##            Construction             Maintenance                    None 
##                     705                     106                   59455 
##                 Utility Work Zone, Type Unknown 
##                      33                     442 
## 
## $REL_ROADNAME
## 
##       Not Reported        On Roadside         On Roadway        On Shoulder 
##                295               1928              55468               1956 
## Outside Trafficway              Other 
##                320                774 
## 
## $WEATHER1NAME
## 
##            Clear           Cloudy Fog, Smog, Smoke     Not Reported 
##            17338             3484              274            37373 
##             Rain            Other 
##             2044              228 
## 
## $RUR_URBNAME
## 
##                      Not Reported                             Rural 
##                               163                             10237 
## Trafficway Not in State Inventory                             Urban 
##                               326                             50015 
## 
## $FUNC_SYSNAME
## 
##                 Interstate                      Local 
##                       7231                       6119 
##            Major Collector             Minor Arterial 
##                       5856                      14137 
## Principal Arterial - Other                      Other 
##                      17820                       9578 
## 
## $PBSEXNAME
## 
##       Female         Male Not Reported 
##        17933        42482          326 
## 
## $PEDCTYPENAME
## 
##                                   Crossing an Expressway 
##                                                     3168 
##                                 Disabled Vehicle-Related 
##                                                     2693 
##                    Not At Intersection - Other / Unknown 
##                                                     5356 
##                               Pedestrian Failed to Yield 
##                                                    19768 
## Walking/Running Along Roadway With Traffic - From Behind 
##                                                     4768 
##                                                    Other 
##                                                    24988 
## 
## $PEDPOSNAME
## 
##                               Crosswalk Area 
##                                         7700 
##                            Intersection Area 
##                                         2518 
##                                Other/Unknown 
##                                         1802 
## Paved Shoulder / Bicycle Lane / Parking Lane 
##                                         2543 
##                                  Travel Lane 
##                                        43644 
##                                        Other 
##                                         2534 
## 
## $MOTMANNAME
## 
##        Left Turn   Not Applicable     Not Reported       Right Turn 
##             1983            45795              359              798 
## Straight through 
##            11806 
## 
## $PEDLEGNAME
## 
##        Farside       Nearside Not Applicable   Not Reported 
##           7608           5600          45836           1697 
## 
## $PEDSNRNAME
## 
##                                    Motorist traveling straight through - Crash Occurred on Far Side of Intersection / Pedestrian outside crosswalk area, other 
##                                                                                                                                                            456 
##             Motorist traveling straight through - Crash Occurred on Far Side of Intersection / Pedestrian within crosswalk area, traveled from motorist’s left 
##                                                                                                                                                            365 
## Motorist traveling straight through - Crash Occurred on Near (Approach) Side of Intersection / Pedestrian within crosswalk area, traveled from motorist’s left 
##                                                                                                                                                            364 
##                                                                                                                                                 Not Applicable 
##                                                                                                                                                          46065 
##                                                                                                                                                   Not Reported 
##                                                                                                                                                           1555 
##                                                                                                                                                          Other 
##                                                                                                                                                          11936 
## 
## $PEDCGPNAME
## 
##        Crossing Roadway - Vehicle Not Turning 
##                                         22317 
##        Other / Unknown - Insufficient Details 
##                                          7457 
## Pedestrian in Roadway - Circumstances Unknown 
##                                          4374 
##                         Unusual Circumstances 
##                                          5886 
##                 Walking/Running Along Roadway 
##                                          8272 
##                                         Other 
##                                         12435 
## 
## $INJ_SEVNAMEDr
## 
##       No Apparent Injury (O)                 Not Reported 
##                        48413                         6826 
##          Possible Injury (C)   Suspected Minor Injury (B) 
##                         2473                         1943 
## Suspected Serious Injury (A)                        Other 
##                          607                          479 
## 
## $SEXNAMEDr
## 
##       Female         Male Not Reported 
##        14830        37170         8741 
## 
## $DRINKINGNAMEDr
## 
## No (Alcohol Not Involved)              Not Reported    Yes (Alcohol Involved) 
##                     35594                     21161                      3986 
## 
## $HAZ_INVNAME_2
## 
##           No Not Reported          Yes 
##        59732          936           73 
## 
## $BUS_USENAME_2
## 
##      Charter/Tour         Not a Bus      Not Reported            School 
##                39             54506              5804               104 
## Transit/ Commuter             Other 
##               241                47 
## 
## $EMER_USENAME_2
## 
##         Emergency Operation, Emergency Warning Equipment in Use 
##                                                              57 
## Emergency Operation, Emergency Warning Equipment in Use Unknown 
##                                                              43 
##                                    Non-Emergency, Non-Transport 
##                                                              53 
##                                                  Not Applicable 
##                                                           58212 
##                                                    Not Reported 
##                                                            2351 
##                                                           Other 
##                                                              25 
## 
## $TRAV_SPNAME_2
## 
##      035 MPH      040 MPH      045 MPH      055 MPH Not Reported        Other 
##         2333         2527         3414         1903        38258        12306 
## 
## $UNDERIDENAME_2
## 
##                                                         No Underride or Override Noted 
##                                                                                  30840 
##                                                                           Not Reported 
##                                                                                  29877 
##                                                Overriding a Motor Vehicle In-Transport 
##                                                                                      4 
##                                            Overriding a Motor Vehicle Not In-Transport 
##                                                                                      7 
##     Underriding a Motor Vehicle In-Transport, Underride, Compartment Intrusion Unknown 
##                                                                                      3 
## Underriding a Motor Vehicle Not In-Transport, Underride, Compartment Intrusion Unknown 
##                                                                                      3 
##                                                                                  Other 
##                                                                                      7 
## 
## $ROLLOVERNAME_2
## 
##                         No Rollover                      Not Applicable 
##                               59261                                 125 
##                        Not Reported                            Rollover 
##                                 936                                 155 
## Rollover, Tripped by Object/Vehicle                               Other 
##                                 227                                  37 
## 
## $DEFORMEDNAME_2
## 
## Damage Reported, Extent Unknown                Disabling Damage 
##                            4028                           13269 
##               Functional Damage                    Minor Damage 
##                           15432                           12081 
##                    Not Reported                           Other 
##                           12483                            3448 
## 
## $VTRAFWAYNAME_2
## 
##                                          Not Reported 
##                                                  1599 
##           Two-Way,  Divided, Positive  Median Barrier 
##                                                  9525 
##                  Two-Way, Divided, Unprotected Median 
##                                                 13168 
##                                  Two-Way, Not Divided 
##                                                 24466 
## Two-Way, Not Divided With a Continuous Left-Turn Lane 
##                                                  7725 
##                                                 Other 
##                                                  4258 
## 
## $VNUM_LANNAME_2
## 
##   Five lanes   Four lanes Not Reported  Three lanes    Two lanes        Other 
##         8514         9016         2029        10293        26179         4710 
## 
## $VSPD_LIMNAME_2
## 
## 25 MPH 35 MPH 40 MPH 45 MPH 55 MPH  Other 
##   4874   9993   7268  10795   7212  20599 
## 
## $VALIGNNAME_2
## 
## Curve - Left   Curve Left  Curve Right Not Reported     Straight        Other 
##          708          927          944         2682        53922         1558 
## 
## $VPROFILENAME_2
## 
##             Downhill Grade, Unknown Slope                Level 
##                 1837                 3695                44259 
##         Not Reported               Uphill                Other 
##                 7903                 1580                 1467 
## 
## $VPAVETYPNAME_2
## 
##  Blacktop, Bituminous, or Asphalt                          Concrete 
##                             36107                              4603 
## Non-Trafficway or Driveway Access                      Not Reported 
##                               457                             19416 
##             Slag, Gravel or Stone                             Other 
##                                69                                89 
## 
## $VSURCONDNAME_2
## 
##                               Dry                         Ice/Frost 
##                             50381                               236 
## Non-Trafficway or Driveway Access                      Not Reported 
##                               457                              1828 
##                               Wet                             Other 
##                              7490                               349 
## 
## $P_CRASH1NAME_2
## 
##      Going Straight Negotiating a Curve        Not Reported        Turning Left 
##               48441                3146                2633                2355 
##       Turning Right               Other 
##                 982                3184 
## 
## $ACC_TYPENAME_2
## 
## A1-Single Driver-Right Roadside Departure-Drive Off Road 
##                                                     1004 
##  B6-Single Driver-Left Roadside Departure-Drive Off Road 
##                                                      407 
##      C13-Single Driver-Forward Impact-Pedestrian/ Animal 
##                                                    47624 
##                                     M98-Other Crash Type 
##                                                      884 
##                                             Not Reported 
##                                                     8136 
##                                                    Other 
##                                                     2686 
## 
## $L_STATUSNAME_2
## 
##      Expired Not licensed Not Reported    Suspended        Valid        Other 
##          636         2501         9226         2399        45437          542 
## 
## $L_TYPENAME_2
## 
##         Full Driver License Intermediate Driver License 
##                       47828                         721 
##            Learner's Permit                Not Licensed 
##                         213                        2501 
##                Not Reported                       Other 
##                        9218                         260 
## 
## $CDL_STATNAME_2
## 
## Disqualified     No (CDL) Not Reported    Suspended        Valid        Other 
##          149        45875         9248          131         5014          324 
## 
## $MDRDSTRDNAME_2
## 
##                    Distraction/Inattention 
##                                        276 
## Inattention (Inattentive), Details Unknown 
##                                        533 
##                             Not Distracted 
##                                      11548 
##                               Not Reported 
##                                      45743 
##          Reported as Unknown if Distracted 
##                                       1372 
##                                      Other 
##                                       1269 
## 
## $DRIMPAIRNAME_2
## 
##                                  Asleep or Fatigued 
##                                                 305 
##                                       Ill, Blackout 
##                                                 253 
##                              None/Apparently Normal 
##                                               27034 
##                                        Not Reported 
##                                               28584 
## Under the Influence of Alcohol, Drugs or Medication 
##                                                3971 
##                                               Other 
##                                                 594 
## 
## $MDRMANAVNAME_2
## 
##               Driver Did Not Maneuver to Avoid 
##                                           7082 
##                                  Motor Vehicle 
##                                            160 
##                                   Not Reported 
##                                          50283 
## Pedestrian, Pedalcyclist or Other Non-Motorist 
##                                           2952 
##              Phantom/Non-Contact Motor Vehicle 
##                                            179 
##                                          Other 
##                                             85 
## 
## $Day_Type
## 
## Weekday Weekend 
##   42325   18416 
## 
## $PSL
## 
## 25_or_less      30_35      40_45      50_55    60_plus      Other 
##       5427      14474      18063       9752       8789       4236

10. Shorten variable names

keep_names <- c("CRASH_NUM1")

old_names <- names(dat3_lumped)

new_names <- ifelse(
  old_names %in% keep_names,
  old_names,
  str_to_title(str_sub(str_replace_all(old_names, "_", ""), 1, 4))
)

new_names <- make.unique(new_names, sep = "")

name_key <- data.frame(
  old_name = old_names,
  new_name = new_names
)

names(dat3_lumped) <- new_names

names(dat3_lumped)
##  [1] "Stat"       "Year"       "CRASH_NUM1" "Work"       "Lati"      
##  [6] "Long"       "Veto"       "Fata"       "Drun"       "Hitr"      
## [11] "Wrkz"       "Relr"       "Weat"       "Lgtc"       "Ruru"      
## [16] "Func"       "Pbse"       "Pedc"       "Pedp"       "Motm"      
## [21] "Pedl"       "Peds"       "Pedc1"      "Injs"       "Sexn"      
## [26] "Drin"       "Hazi"       "Busu"       "Emer"       "Trav"      
## [31] "Unde"       "Roll"       "Defo"       "Vtra"       "Vnum"      
## [36] "Vspd"       "Vali"       "Vpro"       "Vpav"       "Vsur"      
## [41] "Pcra"       "Acct"       "Lsta"       "Ltyp"       "Cdls"      
## [46] "Mdrd"       "Drim"       "Mdrm"       "Peda"       "Drag"      
## [51] "Dayt"       "Veht"       "Psl"        "Pedc2"      "Pedc3"     
## [56] "Acct1"
datatable(
  name_key,
  options = list(pageLength = 25, scrollX = TRUE),
  caption = "Variable name key"
)

11. Final data preview

datatable(
  head(dat3_lumped, 50),
  options = list(scrollX = TRUE, pageLength = 10),
  caption = "Final processed dataset"
)

12. Export CSV and XLSX files

write.csv(dat3_lumped, "Ped01.csv", row.names = FALSE)
write_xlsx(dat3_lumped, "Ped01.xlsx")

write.csv(name_key, "Ped01_variable_name_key.csv", row.names = FALSE)
write_xlsx(name_key, "Ped01_variable_name_key.xlsx")

write.csv(category_summary, "Ped01_category_summary.csv", row.names = FALSE)
write_xlsx(category_summary, "Ped01_category_summary.xlsx")

13. Output files

The following files are created in the working directory:

  • Ped01.csv
  • Ped01.xlsx
  • Ped01_variable_name_key.csv
  • Ped01_variable_name_key.xlsx
  • Ped01_category_summary.csv
  • Ped01_category_summary.xlsx