library(readxl)
library(dplyr)
library(stringr)
library(forcats)
library(DT)
library(writexl)
library(tidyr)

1. File path and data import

data_dir <- "D:/DAS_CODE/Tools/data"
setwd(data_dir)

dat <- read_excel("Ped_Bike_Esc_2016_2024_All.xlsx", sheet = "Bike_Lim")

dim(dat)
## [1] 8689  278
names(dat)
##   [1] "STATE"                   "STATENAME"              
##   [3] "ST_CASE"                 "VEH_NO"                 
##   [5] "PER_NO"                  "YEAR"                   
##   [7] "CRASH_NUM1"              "CRASH_VEH_NUM1"         
##   [9] "CRASH_VehPer_NUM1"       "LATITUDENAME"           
##  [11] "LONGITUDNAME"            "VE_TOTAL"               
##  [13] "HIT_RUNNAME_2"           "LGT_CONDNAME"           
##  [15] "BODY_TYPNAME"            "PBPTYPENAME"            
##  [17] "PBAGENAME"               "PBSEXNAME"              
##  [19] "PBCWALKNAME"             "PBSWALK"                
##  [21] "PBSWALKNAME"             "PBSZONENAME"            
##  [23] "BIKECTYPENAME"           "BIKELOCNAME"            
##  [25] "BIKEPOSNAME"             "BIKEDIRNAME"            
##  [27] "MOTDIRNAME"              "MOTMANNAME"             
##  [29] "BIKECGPNAME"             "CRASH_VEH_NUM2"         
##  [31] "CRASH_VehPer_NUM2"       "PER_TYPNAMEDr"          
##  [33] "INJ_SEVNAMEDr"           "AGENAMEDr"              
##  [35] "SEXNAMEDr"               "HARM_EVNAMEDr"          
##  [37] "DRINKINGNAMEDr"          "RACENAMEDr"             
##  [39] "HISPANICNAMEDr"          "COUNTY"                 
##  [41] "CITY"                    "MONTH"                  
##  [43] "NHSNAME"                 "ROUTENAME"              
##  [45] "TWAY_ID"                 "TWAY_ID2"               
##  [47] "RD_OWNERNAME"            "LATITUDE"               
##  [49] "LONGITUD"                "SP_JURNAME"             
##  [51] "RELJCT1NAME"             "RELJCT2NAME"            
##  [53] "TYP_INTNAME"             "WRK_ZONENAME"           
##  [55] "REL_ROADNAME"            "WEATHER1NAME"           
##  [57] "WEATHER2NAME"            "RAILNAME"               
##  [59] "NOT_HOURNAME"            "ARR_HOUR"               
##  [61] "ARR_HOURNAME"            "ARR_MIN"                
##  [63] "ARR_MINNAME"             "HOSP_HR"                
##  [65] "HOSP_HRNAME"             "CF1NAME"                
##  [67] "CF2NAME"                 "CF3NAME"                
##  [69] "FATALS"                  "DRUNK_DR"               
##  [71] "STR_VEH"                 "MAK_MOD"                
##  [73] "MOD_YEARNAME"            "AGENAME"                
##  [75] "SEXNAME"                 "PER_TYPNAME"            
##  [77] "INJ_SEVNAME"             "SEAT_POSNAME"           
##  [79] "REST_USENAME"            "REST_MISNAME"           
##  [81] "AIR_BAGNAME"             "EJECTIONNAME"           
##  [83] "EJ_PATHNAME"             "EXTRICATNAME"           
##  [85] "DRINKINGNAME"            "ALC_DETNAME"            
##  [87] "ALC_STATUSNAME"          "ATST_TYPNAME"           
##  [89] "ALC_RESNAME"             "DRUGSNAME"              
##  [91] "DRUG_DETNAME"            "DSTATUSNAME"            
##  [93] "DRUGTST1NAME"            "DRUGTST2NAME"           
##  [95] "DRUGTST3NAME"            "DRUGRES1NAME"           
##  [97] "DRUGRES2NAME"            "DRUGRES3NAME"           
##  [99] "HOSPITALNAME"            "DOANAME"                
## [101] "DEATH_DANAME"            "DEATH_MONAME"           
## [103] "DEATH_YRNAME"            "DEATH_HRNAME"           
## [105] "DEATH_MNNAME"            "DEATH_TMNAME"           
## [107] "LAG_HRSNAME"             "LAG_MINSNAME"           
## [109] "P_SF1NAME"               "P_SF2NAME"              
## [111] "P_SF3NAME"               "WORK_INJNAME"           
## [113] "HISPANICNAME"            "RACENAME"               
## [115] "LOCATIONNAME"            "HELM_USENAME"           
## [117] "HELM_MISNAME"            "VPICMAKENAME"           
## [119] "VPICMODEL"               "VPICMODELNAME"          
## [121] "VPICBODYCLASSNAME"       "ICFINALBODYNAME"        
## [123] "Other_VEH_NO"            "Other_PER_NO"           
## [125] "Other_CRASH_VehPer_NUM1" "CRASH_NUM1_2"           
## [127] "ST_CASE_2"               "VEH_NO_2"               
## [129] "CRASH_VEH_NUM1_2"        "PER_NO_2"               
## [131] "CRASH_VehPer_NUM1_2"     "NUMOCCS_2"              
## [133] "UNITTYPENAME_2"          "REG_STATNAME_2"         
## [135] "OWNERNAME_2"             "MAKENAME_2"             
## [137] "MAK_MODNAME_2"           "BODY_TYPNAME_2"         
## [139] "MOD_YEAR_2"              "VINNAME_2"              
## [141] "TOW_VEHNAME_2"           "J_KNIFENAME_2"          
## [143] "GVWRNAME_2"              "V_CONFIGNAME_2"         
## [145] "CARGO_BTNAME_2"          "HAZ_INVNAME_2"          
## [147] "BUS_USENAME_2"           "SPEC_USENAME_2"         
## [149] "EMER_USENAME_2"          "TRAV_SPNAME_2"          
## [151] "UNDERIDENAME_2"          "ROLLOVERNAME_2"         
## [153] "ROLINLOCNAME_2"          "IMPACT1NAME_2"          
## [155] "DEFORMEDNAME_2"          "TOWEDNAME_2"            
## [157] "M_HARMNAME_2"            "VEH_SC1NAME_2"          
## [159] "VEH_SC2NAME_2"           "FIRE_EXPNAME_2"         
## [161] "VTRAFWAYNAME_2"          "VNUM_LANNAME_2"         
## [163] "VSPD_LIMNAME_2"          "VALIGNNAME_2"           
## [165] "VPROFILENAME_2"          "VPAVETYPNAME_2"         
## [167] "VSURCONDNAME_2"          "VTRAFCONNAME_2"         
## [169] "VTCONT_FNAME_2"          "P_CRASH1NAME_2"         
## [171] "P_CRASH2NAME_2"          "P_CRASH3NAME_2"         
## [173] "PCRASH4NAME_2"           "PCRASH5NAME_2"          
## [175] "ACC_TYPENAME_2"          "DEATHS_2"               
## [177] "VEHICLECCNAME_2"         "VPICMAKENAME_2"         
## [179] "VPICMODEL_2"             "VPICMODELNAME_2"        
## [181] "VPICBODYCLASSNAME_2"     "ICFINALBODYNAME_2"      
## [183] "DR_PRESNAME_2"           "L_STATENAME_2"          
## [185] "DR_ZIPNAME_2"            "L_STATUSNAME_2"         
## [187] "L_TYPENAME_2"            "CDL_STATNAME_2"         
## [189] "L_ENDORSNAME_2"          "L_COMPLNAME_2"          
## [191] "L_RESTRINAME_2"          "DR_HGT_2"               
## [193] "DR_WGTNAME_2"            "PREV_ACCNAME_2"         
## [195] "PREV_SUSNAME_2"          "PREV_DWINAME_2"         
## [197] "PREV_SPDNAME_2"          "PREV_OTHNAME_2"         
## [199] "FIRST_MONAME_2"          "FIRST_YRNAME_2"         
## [201] "LAST_MONAME_2"           "LAST_YRNAME_2"          
## [203] "SPEEDRELNAME_2"          "DR_SF1NAME_2"           
## [205] "DR_SF2NAME_2"            "DR_SF3NAME_2"           
## [207] "DR_SF4NAME_2"            "DR_DRINKNAME_2"         
## [209] "MDRDSTRDNAME_2"          "DRDISTRACTNAME_2"       
## [211] "DRIMPAIRNAME_2"          "MDRMANAVNAME_2"         
## [213] "MANEUVERNAME_2"          "MFACTORNAME_2"          
## [215] "MVIOLATNNAME_2"          "VIOLATIONNAME_2"        
## [217] "MVISOBSCNAME_2"          "VISIONNAME_2"           
## [219] "STR_VEH_2"               "MAK_MOD_2"              
## [221] "MOD_YEARNAME_2"          "AGENAME_2"              
## [223] "SEXNAME_2"               "PER_TYPNAME_2"          
## [225] "INJ_SEVNAME_2"           "SEAT_POSNAME_2"         
## [227] "REST_USENAME_2"          "REST_MISNAME_2"         
## [229] "AIR_BAGNAME_2"           "EJECTIONNAME_2"         
## [231] "EJ_PATHNAME_2"           "EXTRICATNAME_2"         
## [233] "DRINKINGNAME_2"          "ALC_DETNAME_2"          
## [235] "ALC_STATUSNAME_2"        "ATST_TYPNAME_2"         
## [237] "ALC_RESNAME_2"           "DRUGSNAME_2"            
## [239] "DRUG_DETNAME_2"          "DSTATUSNAME_2"          
## [241] "DRUGTST1NAME_2"          "DRUGTST2NAME_2"         
## [243] "DRUGTST3NAME_2"          "DRUGRES1NAME_2"         
## [245] "DRUGRES2NAME_2"          "DRUGRES3NAME_2"         
## [247] "HOSPITALNAME_2"          "DOANAME_2"              
## [249] "DEATH_DANAME_2"          "DEATH_MONAME_2"         
## [251] "DEATH_YRNAME_2"          "DEATH_HRNAME_2"         
## [253] "DEATH_MNNAME_2"          "DEATH_TMNAME_2"         
## [255] "LAG_HRSNAME_2"           "LAG_MINSNAME_2"         
## [257] "P_SF1NAME_2"             "P_SF2NAME_2"            
## [259] "P_SF3NAME_2"             "WORK_INJNAME_2"         
## [261] "HISPANICNAME_2"          "RACENAME_2"             
## [263] "LOCATIONNAME_2"          "HELM_USENAME_2"         
## [265] "HELM_MISNAME_2"          "COUNTYNAME"             
## [267] "CITYNAME"                "DAY"                    
## [269] "MONTHNAME"               "DAY_WEEKNAME"           
## [271] "HOUR"                    "HOURNAME"               
## [273] "RUR_URBNAME"             "FUNC_SYSNAME"           
## [275] "HARM_EVNAME"             "MAN_COLLNAME"           
## [277] "WEATHERNAME"             "SCH_BUSNAME"

2. Select analysis variables

keep_vars <- c(
  "STATENAME",
  "YEAR",
  "CRASH_NUM1",
  "WORK_INJNAME",
  "LATITUDENAME",
  "LONGITUDNAME",
  "VE_TOTAL",
  "FATALS",
  "DRUNK_DR",
  "HIT_RUNNAME_2",
  "WRK_ZONENAME",
  "REL_ROADNAME",
  "WEATHER1NAME",
  "LGT_CONDNAME",
  "BODY_TYPNAME",
  "RUR_URBNAME",
  "FUNC_SYSNAME",
  "PBAGENAME",
  "PBSEXNAME",
  "BIKECTYPENAME",
  "BIKEPOSNAME",
  "MOTMANNAME",
  "BIKELOCNAME",
  "BIKECGPNAME",
  "INJ_SEVNAMEDr",
  "AGENAMEDr",
  "SEXNAMEDr",
  "DRINKINGNAMEDr",
  "HAZ_INVNAME_2",
  "BUS_USENAME_2",
  "EMER_USENAME_2",
  "TRAV_SPNAME_2",
  "UNDERIDENAME_2",
  "ROLLOVERNAME_2",
  "DEFORMEDNAME_2",
  "VTRAFWAYNAME_2",
  "VNUM_LANNAME_2",
  "VSPD_LIMNAME_2",
  "VALIGNNAME_2",
  "VPROFILENAME_2",
  "VPAVETYPNAME_2",
  "VSURCONDNAME_2",
  "P_CRASH1NAME_2",
  "ACC_TYPENAME_2",
  "L_STATUSNAME_2",
  "L_TYPENAME_2",
  "CDL_STATNAME_2",
  "MDRDSTRDNAME_2",
  "DRIMPAIRNAME_2",
  "MDRMANAVNAME_2",
  "DAY_WEEKNAME"
)

dat1 <- dat %>%
  select(all_of(keep_vars))

dim(dat1)
## [1] 8689   51
names(dat1)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "BODY_TYPNAME"   "RUR_URBNAME"   
## [17] "FUNC_SYSNAME"   "PBAGENAME"      "PBSEXNAME"      "BIKECTYPENAME" 
## [21] "BIKEPOSNAME"    "MOTMANNAME"     "BIKELOCNAME"    "BIKECGPNAME"   
## [25] "INJ_SEVNAMEDr"  "AGENAMEDr"      "SEXNAMEDr"      "DRINKINGNAMEDr"
## [29] "HAZ_INVNAME_2"  "BUS_USENAME_2"  "EMER_USENAME_2" "TRAV_SPNAME_2" 
## [33] "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2" "VTRAFWAYNAME_2"
## [37] "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"   "VPROFILENAME_2"
## [41] "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2" "ACC_TYPENAME_2"
## [45] "L_STATUSNAME_2" "L_TYPENAME_2"   "CDL_STATNAME_2" "MDRDSTRDNAME_2"
## [49] "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME"
datatable(
  head(dat1, 25),
  options = list(scrollX = TRUE, pageLength = 10),
  caption = "Selected variables"
)

3. Create pedestrian and driver age groups

dat1 <- dat1 %>%
  mutate(
    age_num = as.numeric(str_extract(PBAGENAME, "\\d+")),
    Ped_Age = case_when(
      age_num <= 4  ~ "0-4",
      age_num <= 9  ~ "5-9",
      age_num <= 14 ~ "10-14",
      age_num <= 20 ~ "15-20",
      age_num <= 24 ~ "21-24",
      age_num <= 34 ~ "25-34",
      age_num <= 44 ~ "35-44",
      age_num <= 54 ~ "45-54",
      age_num <= 64 ~ "55-64",
      age_num <= 74 ~ "65-74",
      age_num >= 75 ~ "75+",
      TRUE ~ "Unknown"
    ),
    age_num1 = as.numeric(str_extract(AGENAMEDr, "\\d+")),
    Dr_Age = case_when(
      age_num1 <= 4  ~ "0-4",
      age_num1 <= 9  ~ "5-9",
      age_num1 <= 14 ~ "10-14",
      age_num1 <= 20 ~ "15-20",
      age_num1 <= 24 ~ "21-24",
      age_num1 <= 34 ~ "25-34",
      age_num1 <= 44 ~ "35-44",
      age_num1 <= 54 ~ "45-54",
      age_num1 <= 64 ~ "55-64",
      age_num1 <= 74 ~ "65-74",
      age_num1 >= 75 ~ "75+",
      TRUE ~ "Unknown"
    )
  )

table(dat1$Ped_Age, useNA = "ifany")
## 
##     0-4   10-14   15-20   21-24   25-34   35-44   45-54     5-9   55-64   65-74 
##      40     282     493     300    1004    1228    1505     107    1981    1093 
##     75+ Unknown 
##     536     120
table(dat1$Dr_Age, useNA = "ifany")
## 
##   10-14   15-20   21-24   25-34   35-44   45-54   55-64   65-74     75+ Unknown 
##       4     749     763    1800    1401    1161     992     562     343     914

4. Create weekday/weekend variable

dat1 <- dat1 %>%
  mutate(
    Day_Type = case_when(
      DAY_WEEKNAME %in% c("Saturday", "Sunday") ~ "Weekend",
      !is.na(DAY_WEEKNAME) ~ "Weekday",
      TRUE ~ "Unknown"
    )
  )

table(dat1$Day_Type, useNA = "ifany")
## 
## Weekday Weekend 
##    6234    2455

5. Create major vehicle type

dat2 <- dat1 %>%
  mutate(
    Veh_Type = case_when(
      str_detect(BODY_TYPNAME, regex("sedan|coupe|hatchback|convertible|station wagon|automobile", ignore_case = TRUE)) ~ "Car",
      str_detect(BODY_TYPNAME, regex("pickup|utility|light truck|cab chassis based", ignore_case = TRUE)) ~ "Light_Truck",
      str_detect(BODY_TYPNAME, regex("truck-tractor|single-unit|heavy truck|medium/heavy|construction equipment|farm equipment|motorhome", ignore_case = TRUE)) ~ "Large_Truck",
      str_detect(BODY_TYPNAME, regex("bus", ignore_case = TRUE)) ~ "Bus",
      str_detect(BODY_TYPNAME, regex("van|minivan|step-van|walk-in van", ignore_case = TRUE)) ~ "Van",
      str_detect(BODY_TYPNAME, regex("motorcycle|moped|scooter|motored cycle", ignore_case = TRUE)) ~ "Motorcycle",
      str_detect(BODY_TYPNAME, regex("ATV|ATC|golf cart|off-highway|recreational|three-wheel|go-cart|fork-lift|street sweeper", ignore_case = TRUE)) ~ "Off_Road_Other",
      is.na(BODY_TYPNAME) |
        BODY_TYPNAME %in% c("NA", "Not Reported", "Unknown body type") |
        str_detect(BODY_TYPNAME, regex("unknown|not reported", ignore_case = TRUE)) ~ "Unknown",
      TRUE ~ "Off_Road_Other"
    )
  )

table(dat2$Veh_Type, useNA = "ifany")
## 
##            Bus            Car    Large_Truck    Light_Truck     Motorcycle 
##            218           3249            698           3465             80 
## Off_Road_Other        Unknown            Van 
##              8            695            276

6. Remove unused source variables

names(dat2)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "BODY_TYPNAME"   "RUR_URBNAME"   
## [17] "FUNC_SYSNAME"   "PBAGENAME"      "PBSEXNAME"      "BIKECTYPENAME" 
## [21] "BIKEPOSNAME"    "MOTMANNAME"     "BIKELOCNAME"    "BIKECGPNAME"   
## [25] "INJ_SEVNAMEDr"  "AGENAMEDr"      "SEXNAMEDr"      "DRINKINGNAMEDr"
## [29] "HAZ_INVNAME_2"  "BUS_USENAME_2"  "EMER_USENAME_2" "TRAV_SPNAME_2" 
## [33] "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2" "VTRAFWAYNAME_2"
## [37] "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"   "VPROFILENAME_2"
## [41] "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2" "ACC_TYPENAME_2"
## [45] "L_STATUSNAME_2" "L_TYPENAME_2"   "CDL_STATNAME_2" "MDRDSTRDNAME_2"
## [49] "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME"   "age_num"       
## [53] "Ped_Age"        "age_num1"       "Dr_Age"         "Day_Type"      
## [57] "Veh_Type"
dat2 <- dat2[, -c(52, 54, 18, 26, 15, 51)]

dim(dat2)
## [1] 8689   51
names(dat2)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "RUR_URBNAME"    "FUNC_SYSNAME"  
## [17] "PBSEXNAME"      "BIKECTYPENAME"  "BIKEPOSNAME"    "MOTMANNAME"    
## [21] "BIKELOCNAME"    "BIKECGPNAME"    "INJ_SEVNAMEDr"  "SEXNAMEDr"     
## [25] "DRINKINGNAMEDr" "HAZ_INVNAME_2"  "BUS_USENAME_2"  "EMER_USENAME_2"
## [29] "TRAV_SPNAME_2"  "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2"
## [33] "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"  
## [37] "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2"
## [41] "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"   "CDL_STATNAME_2"
## [45] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "Ped_Age"       
## [49] "Dr_Age"         "Day_Type"       "Veh_Type"

7. Create posted speed limit group

dat2 <- dat2 %>%
  mutate(
    spd_num = as.numeric(str_extract(VSPD_LIMNAME_2, "\\d+")),
    PSL = case_when(
      spd_num <= 25 ~ "25_or_less",
      spd_num <= 35 ~ "30_35",
      spd_num <= 45 ~ "40_45",
      spd_num <= 55 ~ "50_55",
      spd_num >= 60 ~ "60_plus",
      TRUE ~ "Unknown"
    )
  )

table(dat2$PSL, useNA = "ifany")
## 
## 25_or_less      30_35      40_45      50_55    60_plus    Unknown 
##        950       2205       2827       1781        447        479
dat3 <- dat2[, -c(35, 52)]

dim(dat3)
## [1] 8689   51

8. Standardize missing and unknown categories

dat3 <- dat3 %>%
  mutate(
    across(
      where(~ is.character(.) | is.factor(.)),
      ~ case_when(
        is.na(.) ~ "Not Reported",
        str_trim(as.character(.)) %in% c(
          "NA", "N/A", "Unknown", "Reported as Unknown",
          " Reported as Unknown",
          "Reported as Unknown if Distracted ",
          "Reported as Unknown if Impaired",
          "Unknown body type", "Not Reported"
        ) ~ "Not Reported",
        str_detect(as.character(.), regex("^unknown|not reported$", ignore_case = TRUE)) ~ "Not Reported",
        TRUE ~ as.character(.)
      )
    )
  )

cat_summary <- dat3 %>%
  summarise(across(
    where(~ is.character(.) | is.factor(.)),
    ~ n_distinct(., na.rm = TRUE)
  )) %>%
  pivot_longer(
    everything(),
    names_to = "variable",
    values_to = "n_categories"
  ) %>%
  arrange(desc(n_categories))

cat_summary
## # A tibble: 47 × 2
##    variable       n_categories
##    <chr>                 <int>
##  1 CRASH_NUM1             8650
##  2 LONGITUDNAME           8608
##  3 LATITUDENAME           8594
##  4 TRAV_SPNAME_2            93
##  5 BIKECTYPENAME            81
##  6 STATENAME                51
##  7 ACC_TYPENAME_2           36
##  8 MDRDSTRDNAME_2           27
##  9 BIKECGPNAME              22
## 10 P_CRASH1NAME_2           20
## # ℹ 37 more rows
datatable(
  cat_summary,
  options = list(pageLength = 25),
  caption = "Number of categories after lumping"
)

9. PBCAT vairable recategorization

#### BIKECTYPENAME (From 81 to 12 Categories)

dat3 <- dat3 %>%
   mutate(
    bike_type_clean = BIKECTYPENAME %>%
      str_replace_all("–|–|Â|ÿ", "-") %>%
      str_squish(),

    BikeCTyp = case_when(

      str_detect(bike_type_clean, regex("Ride[- ]?Out|Drive[- ]?Out|Drive-In/Out", ignore_case = TRUE)) ~
        "Ride-Out / Drive-Out",

      str_detect(bike_type_clean, regex("Ride Through|Drive Through", ignore_case = TRUE)) ~
        "Ride-Through / Drive-Through",

      str_detect(bike_type_clean, regex("Left Turn|Right Turn|Right Turn on Red|Turn / Merge|Turning Error", ignore_case = TRUE)) ~
        "Turning / Merging",

      str_detect(bike_type_clean, regex("Crossing Paths", ignore_case = TRUE)) ~
        "Crossing Paths",

      str_detect(bike_type_clean, regex("Overtaking|Passing|Extended Door", ignore_case = TRUE)) ~
        "Overtaking / Passing",

      str_detect(bike_type_clean, regex("Wrong-Way|Wrong-Side", ignore_case = TRUE)) ~
        "Wrong-Way / Wrong-Side",

      str_detect(bike_type_clean, regex("Lost Control", ignore_case = TRUE)) ~
        "Loss of Control",

      str_detect(bike_type_clean, regex("Backing Vehicle", ignore_case = TRUE)) ~
        "Backing Vehicle",

      str_detect(bike_type_clean, regex("Multiple Threat|Failed to Clear|Trapped", ignore_case = TRUE)) ~
        "Multiple Threat / Trapped",

      str_detect(bike_type_clean, regex("Parallel Paths", ignore_case = TRUE)) ~
        "Parallel Paths",

      str_detect(bike_type_clean, regex("Non-Roadway|Play Vehicle", ignore_case = TRUE)) ~
        "Non-Roadway / Play Vehicle",

      TRUE ~
        "Other / Unknown / Not Reported"
    )
  )


dat3 %>%
  count(BikeCTyp, sort = TRUE) %>%
  mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 12 × 3
##    BikeCTyp                           n percent
##    <chr>                          <int>   <dbl>
##  1 Overtaking / Passing            2606   30.0 
##  2 Turning / Merging               1441   16.6 
##  3 Ride-Out / Drive-Out            1279   14.7 
##  4 Ride-Through / Drive-Through     927   10.7 
##  5 Other / Unknown / Not Reported   723    8.32
##  6 Parallel Paths                   517    5.95
##  7 Crossing Paths                   504    5.8 
##  8 Wrong-Way / Wrong-Side           431    4.96
##  9 Loss of Control                  190    2.19
## 10 Multiple Threat / Trapped         28    0.32
## 11 Backing Vehicle                   22    0.25
## 12 Non-Roadway / Play Vehicle        21    0.24
### BIKECGPNAME (21 to 9)

dat3 <- dat3 %>%
  mutate(
    bike_cgp_clean = BIKECGPNAME %>%
      str_replace_all("–|–|Â|ÿ", "-") %>%
      str_squish(),

    BikeCgp = case_when(

      str_detect(bike_cgp_clean, regex("Bicyclist Failed to Yield", ignore_case = TRUE)) ~
        "Bicyclist Failed to Yield",

      str_detect(bike_cgp_clean, regex("Motorist Failed to Yield", ignore_case = TRUE)) ~
        "Motorist Failed to Yield",

      str_detect(bike_cgp_clean, regex("Bicyclist Left Turn|Bicyclist Right Turn|Bicyclist.*Merge", ignore_case = TRUE)) ~
        "Bicyclist Turn / Merge",

      str_detect(bike_cgp_clean, regex("Motorist Left Turn|Motorist Right Turn|Motorist.*Merge", ignore_case = TRUE)) ~
        "Motorist Turn / Merge",

      str_detect(bike_cgp_clean, regex("Overtaking", ignore_case = TRUE)) ~
        "Overtaking",

      str_detect(bike_cgp_clean, regex("Crossing Paths|Parallel Paths", ignore_case = TRUE)) ~
        "Crossing / Parallel Paths",

      str_detect(bike_cgp_clean, regex("Loss of Control|Turning Error", ignore_case = TRUE)) ~
        "Loss of Control / Turning Error",

      str_detect(bike_cgp_clean, regex("Backing|Wrong-Way|Wrong-Side|Non-Trafficway|Parking|Bus", ignore_case = TRUE)) ~
        "Backing / Wrong-Way / Non-Trafficway",

      TRUE ~
        "Other / Unknown / Unusual"
    )
  )


dat3 %>%
  count(BikeCgp, sort = TRUE) %>%
  mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 9 × 3
##   BikeCgp                                  n percent
##   <chr>                                <int>   <dbl>
## 1 Overtaking                            2606   30.0 
## 2 Bicyclist Failed to Yield             1934   22.3 
## 3 Crossing / Parallel Paths             1254   14.4 
## 4 Bicyclist Turn / Merge                 757    8.71
## 5 Other / Unknown / Unusual              597    6.87
## 6 Motorist Turn / Merge                  483    5.56
## 7 Backing / Wrong-Way / Non-Trafficway   464    5.34
## 8 Loss of Control / Turning Error        352    4.05
## 9 Motorist Failed to Yield               242    2.79
#####  ACC_TYPENAME_2


dat3 <- dat3 %>%
  mutate(
    acc_type_clean = ACC_TYPENAME_2 %>%
      str_replace_all("–|–", "-") %>%
      str_remove("^[A-Z][0-9]+-") %>%   # removes A1-, B6-, C13-, M99-, etc.
      str_squish(),

    AccTyp = case_when(

      # 1-4. Roadside departure
      str_detect(acc_type_clean, regex("Right Roadside Departure-Drive Off Road", ignore_case = TRUE)) ~
        "Right Roadside Departure - Drive Off Road",

      str_detect(acc_type_clean, regex("Right Roadside Departure-Control/Traction Loss", ignore_case = TRUE)) ~
        "Right Roadside Departure - Control Loss",

      str_detect(acc_type_clean, regex("Left Roadside Departure-Drive Off Road", ignore_case = TRUE)) ~
        "Left Roadside Departure - Drive Off Road",

      str_detect(acc_type_clean, regex("Left Roadside Departure-Control/Traction Loss", ignore_case = TRUE)) ~
        "Left Roadside Departure - Control Loss",

      # 5. Other roadside departure
      str_detect(acc_type_clean, regex("Roadside Departure", ignore_case = TRUE)) ~
        "Other Roadside Departure",

      # 6-8. Forward impact
      str_detect(acc_type_clean, regex("Forward Impact-Pedestrian|Pedestrian/ Animal", ignore_case = TRUE)) ~
        "Forward Impact - Pedestrian/Animal",

      str_detect(acc_type_clean, regex("Forward Impact-Parked Veh", ignore_case = TRUE)) ~
        "Forward Impact - Parked Vehicle",

      str_detect(acc_type_clean, regex("Forward Impact-Sta|Sta\\. Object|End Departure", ignore_case = TRUE)) ~
        "Forward Impact - Object/End Departure",

      # 9-11. Rear-end
      str_detect(acc_type_clean, regex("Rear End-Stopped", ignore_case = TRUE)) ~
        "Rear-End - Stopped Vehicle",

      str_detect(acc_type_clean, regex("Rear End-Slower", ignore_case = TRUE)) ~
        "Rear-End - Slower Vehicle",

      str_detect(acc_type_clean, regex("Rear End-Decelerating|Rear End-Specifics", ignore_case = TRUE)) ~
        "Rear-End - Decelerating/Other",

      # 12-13. Same-direction sideswipe/angle
      str_detect(acc_type_clean, regex("Same Direction-Angle, Sideswipe.*Straight", ignore_case = TRUE)) ~
        "Same-Direction Sideswipe - Straight",

      str_detect(acc_type_clean, regex("Same Direction-Angle, Sideswipe.*Changing Lanes|Same Direction-Angle, Sideswipe-Specifics", ignore_case = TRUE)) ~
        "Same-Direction Sideswipe - Lane Change/Other",

      # 14-15. Opposite-direction conflict
      str_detect(acc_type_clean, regex("Opposite Direction-Head-On", ignore_case = TRUE)) ~
        "Opposite-Direction Head-On",

      str_detect(acc_type_clean, regex("Opposite Direction-Angle|Opposite Direction-Forward Impact", ignore_case = TRUE)) ~
        "Opposite-Direction Angle/Sideswipe",

      # 16-17. Turning conflicts
      str_detect(acc_type_clean, regex("Turn Across Path", ignore_case = TRUE)) ~
        "Turning Conflict - Turn Across Path",

      str_detect(acc_type_clean, regex("Turn Into Path", ignore_case = TRUE)) ~
        "Turning Conflict - Turn Into Path",

      # 18. Intersecting paths
      str_detect(acc_type_clean, regex("Intersecting Paths", ignore_case = TRUE)) ~
        "Intersecting-Path Conflict",

      # 19. Backing
      str_detect(acc_type_clean, regex("Backing", ignore_case = TRUE)) ~
        "Backing",

      # 20. Other / unknown
      TRUE ~
        "Other / Unknown / No Impact"
    )
  )

dat3 %>%
  count(AccTyp, sort = TRUE) %>%
  mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 18 × 3
##    AccTyp                                           n percent
##    <chr>                                        <int>   <dbl>
##  1 Forward Impact - Pedestrian/Animal            7227   83.2 
##  2 Other / Unknown / No Impact                   1244   14.3 
##  3 Right Roadside Departure - Drive Off Road      103    1.19
##  4 Left Roadside Departure - Drive Off Road        21    0.24
##  5 Backing                                         20    0.23
##  6 Intersecting-Path Conflict                      11    0.13
##  7 Other Roadside Departure                        11    0.13
##  8 Right Roadside Departure - Control Loss         10    0.12
##  9 Turning Conflict - Turn Across Path             10    0.12
## 10 Left Roadside Departure - Control Loss           6    0.07
## 11 Rear-End - Slower Vehicle                        5    0.06
## 12 Forward Impact - Object/End Departure            4    0.05
## 13 Forward Impact - Parked Vehicle                  4    0.05
## 14 Opposite-Direction Angle/Sideswipe               4    0.05
## 15 Same-Direction Sideswipe - Lane Change/Other     3    0.03
## 16 Turning Conflict - Turn Into Path                3    0.03
## 17 Same-Direction Sideswipe - Straight              2    0.02
## 18 Opposite-Direction Head-On                       1    0.01
names(dat3)
##  [1] "STATENAME"       "YEAR"            "CRASH_NUM1"      "WORK_INJNAME"   
##  [5] "LATITUDENAME"    "LONGITUDNAME"    "VE_TOTAL"        "FATALS"         
##  [9] "DRUNK_DR"        "HIT_RUNNAME_2"   "WRK_ZONENAME"    "REL_ROADNAME"   
## [13] "WEATHER1NAME"    "LGT_CONDNAME"    "RUR_URBNAME"     "FUNC_SYSNAME"   
## [17] "PBSEXNAME"       "BIKECTYPENAME"   "BIKEPOSNAME"     "MOTMANNAME"     
## [21] "BIKELOCNAME"     "BIKECGPNAME"     "INJ_SEVNAMEDr"   "SEXNAMEDr"      
## [25] "DRINKINGNAMEDr"  "HAZ_INVNAME_2"   "BUS_USENAME_2"   "EMER_USENAME_2" 
## [29] "TRAV_SPNAME_2"   "UNDERIDENAME_2"  "ROLLOVERNAME_2"  "DEFORMEDNAME_2" 
## [33] "VTRAFWAYNAME_2"  "VNUM_LANNAME_2"  "VALIGNNAME_2"    "VPROFILENAME_2" 
## [37] "VPAVETYPNAME_2"  "VSURCONDNAME_2"  "P_CRASH1NAME_2"  "ACC_TYPENAME_2" 
## [41] "L_STATUSNAME_2"  "L_TYPENAME_2"    "CDL_STATNAME_2"  "MDRDSTRDNAME_2" 
## [45] "DRIMPAIRNAME_2"  "MDRMANAVNAME_2"  "Ped_Age"         "Dr_Age"         
## [49] "Day_Type"        "Veh_Type"        "PSL"             "bike_type_clean"
## [53] "BikeCTyp"        "bike_cgp_clean"  "BikeCgp"         "acc_type_clean" 
## [57] "AccTyp"
dat3 <- dat3[, -c(18, 22, 40, 52, 54, 56)]

dim(dat3)
## [1] 8689   51
names(dat3)
##  [1] "STATENAME"      "YEAR"           "CRASH_NUM1"     "WORK_INJNAME"  
##  [5] "LATITUDENAME"   "LONGITUDNAME"   "VE_TOTAL"       "FATALS"        
##  [9] "DRUNK_DR"       "HIT_RUNNAME_2"  "WRK_ZONENAME"   "REL_ROADNAME"  
## [13] "WEATHER1NAME"   "LGT_CONDNAME"   "RUR_URBNAME"    "FUNC_SYSNAME"  
## [17] "PBSEXNAME"      "BIKEPOSNAME"    "MOTMANNAME"     "BIKELOCNAME"   
## [21] "INJ_SEVNAMEDr"  "SEXNAMEDr"      "DRINKINGNAMEDr" "HAZ_INVNAME_2" 
## [25] "BUS_USENAME_2"  "EMER_USENAME_2" "TRAV_SPNAME_2"  "UNDERIDENAME_2"
## [29] "ROLLOVERNAME_2" "DEFORMEDNAME_2" "VTRAFWAYNAME_2" "VNUM_LANNAME_2"
## [33] "VALIGNNAME_2"   "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2"
## [37] "P_CRASH1NAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"   "CDL_STATNAME_2"
## [41] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "Ped_Age"       
## [45] "Dr_Age"         "Day_Type"       "Veh_Type"       "PSL"           
## [49] "BikeCTyp"       "BikeCgp"        "AccTyp"

10. Lump variables with more than five categories

keep_cols <- c("CRASH_NUM1", "Ped_Age", "Dr_Age", "Veh_Type", "LATITUDENAME", "LONGITUDNAME", "STATENAME", "LGT_CONDNAME", "BikeCTyp", "BikeCgp","AccTyp", "YEAR", "VE_TOTAL", "Veh_Type", "BIKEPOSNAME", "FUNC_SYSNAME")

dat3_lumped <- dat3 %>%
  mutate(
    across(
      .cols = setdiff(
        names(select(., where(~ is.character(.) | is.factor(.)))),
        keep_cols
      ),
      .fns = ~ if (n_distinct(., na.rm = TRUE) > 5) {
        fct_lump_n(as.factor(.), n = 5, other_level = "Other")
      } else {
        as.factor(.)
      }
    )
  )

dim(dat3_lumped)
## [1] 8689   51
category_summary <- data.frame(
  Variable = names(dat3_lumped),
  Categories = sapply(dat3_lumped, function(x) n_distinct(x, na.rm = FALSE))
) %>%
  arrange(desc(Categories))

datatable(
  category_summary,
  options = list(pageLength = 25),
  caption = "Number of categories after lumping"
)
freq_tables <- lapply(
  dat3_lumped %>% select(-all_of(keep_cols)),
  table,
  useNA = "ifany"
)

freq_tables
## $WORK_INJNAME
## 
##           No Not Reported     Redacted          Yes 
##         7865          773           34           17 
## 
## $FATALS
## 
##    1    2    3    5 
## 8585   90    4   10 
## 
## $DRUNK_DR
## 
##    0    1    2 <NA> 
## 3964  368    5 4352 
## 
## $HIT_RUNNAME_2
## 
##           No Not Reported          Yes 
##         6875           10         1804 
## 
## $WRK_ZONENAME
## 
##            Construction             Maintenance                    None 
##                      42                       6                    8596 
##                 Utility Work Zone, Type Unknown 
##                       1                      44 
## 
## $REL_ROADNAME
## 
## Continuous Left - Turn Lane                Not Reported 
##                          17                          46 
##                 On Roadside                  On Roadway 
##                         222                        8027 
##                 On Shoulder                       Other 
##                         340                          37 
## 
## $WEATHER1NAME
## 
##            Clear           Cloudy Fog, Smog, Smoke     Not Reported 
##             2611              417               13             5495 
##             Rain            Other 
##              145                8 
## 
## $RUR_URBNAME
## 
##                      Not Reported                             Rural 
##                                35                              1698 
## Trafficway Not in State Inventory                             Urban 
##                                12                              6944 
## 
## $PBSEXNAME
## 
##       Female         Male Not Reported 
##         1113         7503           73 
## 
## $MOTMANNAME
## 
## Not a Pedestrian 
##             8689 
## 
## $BIKELOCNAME
## 
##         At Intersection    Intersection-Related Non-Trafficway Location 
##                    2519                     708                       8 
##     Not At Intersection            Not Reported 
##                    5410                      44 
## 
## $INJ_SEVNAMEDr
## 
##       No Apparent Injury (O)                 Not Reported 
##                         7349                          661 
##          Possible Injury (C)   Suspected Minor Injury (B) 
##                          336                          212 
## Suspected Serious Injury (A)                        Other 
##                           63                           68 
## 
## $SEXNAMEDr
## 
##       Female         Male Not Reported 
##         2218         5596          875 
## 
## $DRINKINGNAMEDr
## 
## No (Alcohol Not Involved)              Not Reported    Yes (Alcohol Involved) 
##                      5449                      2577                       663 
## 
## $HAZ_INVNAME_2
## 
##           No Not Reported          Yes 
##         8670            5           14 
## 
## $BUS_USENAME_2
## 
##      Charter/Tour         Not a Bus      Not Reported            School 
##                 8              8080               531                26 
## Transit/ Commuter             Other 
##                37                 7 
## 
## $EMER_USENAME_2
## 
##         Emergency Operation, Emergency Warning Equipment in Use 
##                                                               5 
## Emergency Operation, Emergency Warning Equipment in Use Unknown 
##                                                               9 
##                                    Non-Emergency, Non-Transport 
##                                                               6 
##                                                  Not Applicable 
##                                                            8513 
##                                                    Not Reported 
##                                                             149 
##                                                           Other 
##                                                               7 
## 
## $TRAV_SPNAME_2
## 
##      035 MPH      040 MPH      045 MPH      055 MPH Not Reported        Other 
##          389          412          617          359         5007         1905 
## 
## $UNDERIDENAME_2
## 
##              No Underride or Override Noted 
##                                        4335 
##                                Not Reported 
##                                        4353 
## Overriding a Motor Vehicle Not In-Transport 
##                                           1 
## 
## $ROLLOVERNAME_2
## 
##                         No Rollover                      Not Applicable 
##                                8614                                  31 
##                        Not Reported                            Rollover 
##                                   5                                  16 
## Rollover, Tripped by Object/Vehicle                               Other 
##                                  20                                   3 
## 
## $DEFORMEDNAME_2
## 
## Damage Reported, Extent Unknown                Disabling Damage 
##                             580                            1723 
##               Functional Damage                    Minor Damage 
##                            2396                            2041 
##                    Not Reported                           Other 
##                            1434                             515 
## 
## $VTRAFWAYNAME_2
## 
##                                    One-Way Trafficway 
##                                                   265 
##           Two-Way,  Divided, Positive  Median Barrier 
##                                                   486 
##                  Two-Way, Divided, Unprotected Median 
##                                                  1965 
##                                  Two-Way, Not Divided 
##                                                  4633 
## Two-Way, Not Divided With a Continuous Left-Turn Lane 
##                                                   964 
##                                                 Other 
##                                                   376 
## 
## $VNUM_LANNAME_2
## 
##  Five lanes  Four lanes   Six lanes Three lanes   Two lanes       Other 
##        1009        1101         182        1161        4724         512 
## 
## $VALIGNNAME_2
## 
## Curve - Left   Curve Left  Curve Right Not Reported     Straight        Other 
##          113          163          150          257         7812          194 
## 
## $VPROFILENAME_2
## 
##             Downhill Grade, Unknown Slope                Level 
##                  228                  467                 6482 
##         Not Reported               Uphill                Other 
##                 1006                  304                  202 
## 
## $VPAVETYPNAME_2
## 
##  Blacktop, Bituminous, or Asphalt                          Concrete 
##                              4950                               508 
## Non-Trafficway or Driveway Access                      Not Reported 
##                                45                              3164 
##             Slag, Gravel or Stone                             Other 
##                                12                                10 
## 
## $VSURCONDNAME_2
## 
##                               Dry Non-Trafficway or Driveway Access 
##                              7835                                45 
##                      Not Reported                              Snow 
##                               127                                10 
##                               Wet                             Other 
##                               653                                19 
## 
## $P_CRASH1NAME_2
## 
##      Going Straight Negotiating a Curve        Not Reported        Turning Left 
##                6734                 480                 212                 427 
##       Turning Right               Other 
##                 428                 408 
## 
## $L_STATUSNAME_2
## 
##      Expired Not licensed Not Reported    Suspended        Valid        Other 
##           85          334          928          370         6901           71 
## 
## $L_TYPENAME_2
## 
##         Full Driver License Intermediate Driver License 
##                        7225                         141 
##            Learner's Permit                Not Licensed 
##                          39                         334 
##                Not Reported                       Other 
##                         928                          22 
## 
## $CDL_STATNAME_2
## 
## Disqualified     No (CDL) Not Reported    Suspended        Valid        Other 
##           21         6788          934           19          879           48 
## 
## $MDRDSTRDNAME_2
## 
##                    Distraction/Inattention 
##                                         47 
## Inattention (Inattentive), Details Unknown 
##                                        100 
##                             Not Distracted 
##                                       1659 
##                               Not Reported 
##                                       6521 
##          Reported as Unknown if Distracted 
##                                        180 
##                                      Other 
##                                        182 
## 
## $DRIMPAIRNAME_2
## 
##                                  Asleep or Fatigued 
##                                                  64 
##                                       Ill, Blackout 
##                                                  32 
##                              None/Apparently Normal 
##                                                3973 
##                                        Not Reported 
##                                                3842 
## Under the Influence of Alcohol, Drugs or Medication 
##                                                 685 
##                                               Other 
##                                                  93 
## 
## $MDRMANAVNAME_2
## 
##               Driver Did Not Maneuver to Avoid 
##                                            974 
##                                  Motor Vehicle 
##                                              3 
##                                   Not Reported 
##                                           7312 
## Pedestrian, Pedalcyclist or Other Non-Motorist 
##                                            387 
##              Phantom/Non-Contact Motor Vehicle 
##                                              8 
##                                          Other 
##                                              5 
## 
## $Day_Type
## 
## Weekday Weekend 
##    6234    2455 
## 
## $PSL
## 
##   25_or_less        30_35        40_45        50_55 Not Reported        Other 
##          950         2205         2827         1781          479          447

11. Shorten variable names

keep_names <- c("CRASH_NUM1")

old_names <- names(dat3_lumped)

new_names <- ifelse(
  old_names %in% keep_names,
  old_names,
  str_to_title(str_sub(str_replace_all(old_names, "_", ""), 1, 4))
)

new_names <- make.unique(new_names, sep = "")

name_key <- data.frame(
  old_name = old_names,
  new_name = new_names
)

names(dat3_lumped) <- new_names

names(dat3_lumped)
##  [1] "Stat"       "Year"       "CRASH_NUM1" "Work"       "Lati"      
##  [6] "Long"       "Veto"       "Fata"       "Drun"       "Hitr"      
## [11] "Wrkz"       "Relr"       "Weat"       "Lgtc"       "Ruru"      
## [16] "Func"       "Pbse"       "Bike"       "Motm"       "Bike1"     
## [21] "Injs"       "Sexn"       "Drin"       "Hazi"       "Busu"      
## [26] "Emer"       "Trav"       "Unde"       "Roll"       "Defo"      
## [31] "Vtra"       "Vnum"       "Vali"       "Vpro"       "Vpav"      
## [36] "Vsur"       "Pcra"       "Lsta"       "Ltyp"       "Cdls"      
## [41] "Mdrd"       "Drim"       "Mdrm"       "Peda"       "Drag"      
## [46] "Dayt"       "Veht"       "Psl"        "Bike2"      "Bike3"     
## [51] "Acct"
datatable(
  name_key,
  options = list(pageLength = 25, scrollX = TRUE),
  caption = "Variable name key"
)

12. Final data preview

datatable(
  head(dat3_lumped, 50),
  options = list(scrollX = TRUE, pageLength = 10),
  caption = "Final processed dataset"
)

13. Export CSV and XLSX files

write.csv(dat3_lumped, "Bike01.csv", row.names = FALSE)
write_xlsx(dat3_lumped, "Bike01.xlsx")

write.csv(name_key, "Bike01_variable_name_key.csv", row.names = FALSE)
write_xlsx(name_key, "Bike01_variable_name_key.xlsx")

write.csv(category_summary, "Bike01_category_summary.csv", row.names = FALSE)
write_xlsx(category_summary, "Bike01_category_summary.xlsx")

14. Output files

The following files are created in the working directory:

  • Bike01.csv
  • Bike01.xlsx
  • Bike01_variable_name_key.csv
  • Bike01_variable_name_key.xlsx
  • Bike01_category_summary.csv
  • Bike01_category_summary.xlsx