library(readxl)
library(dplyr)
library(stringr)
library(forcats)
library(DT)
library(writexl)
library(tidyr)
data_dir <- "D:/DAS_CODE/Tools/data"
setwd(data_dir)
dat <- read_excel("Ped_Bike_Esc_2016_2024_All.xlsx", sheet = "Bike_Lim")
dim(dat)
## [1] 8689 278
names(dat)
## [1] "STATE" "STATENAME"
## [3] "ST_CASE" "VEH_NO"
## [5] "PER_NO" "YEAR"
## [7] "CRASH_NUM1" "CRASH_VEH_NUM1"
## [9] "CRASH_VehPer_NUM1" "LATITUDENAME"
## [11] "LONGITUDNAME" "VE_TOTAL"
## [13] "HIT_RUNNAME_2" "LGT_CONDNAME"
## [15] "BODY_TYPNAME" "PBPTYPENAME"
## [17] "PBAGENAME" "PBSEXNAME"
## [19] "PBCWALKNAME" "PBSWALK"
## [21] "PBSWALKNAME" "PBSZONENAME"
## [23] "BIKECTYPENAME" "BIKELOCNAME"
## [25] "BIKEPOSNAME" "BIKEDIRNAME"
## [27] "MOTDIRNAME" "MOTMANNAME"
## [29] "BIKECGPNAME" "CRASH_VEH_NUM2"
## [31] "CRASH_VehPer_NUM2" "PER_TYPNAMEDr"
## [33] "INJ_SEVNAMEDr" "AGENAMEDr"
## [35] "SEXNAMEDr" "HARM_EVNAMEDr"
## [37] "DRINKINGNAMEDr" "RACENAMEDr"
## [39] "HISPANICNAMEDr" "COUNTY"
## [41] "CITY" "MONTH"
## [43] "NHSNAME" "ROUTENAME"
## [45] "TWAY_ID" "TWAY_ID2"
## [47] "RD_OWNERNAME" "LATITUDE"
## [49] "LONGITUD" "SP_JURNAME"
## [51] "RELJCT1NAME" "RELJCT2NAME"
## [53] "TYP_INTNAME" "WRK_ZONENAME"
## [55] "REL_ROADNAME" "WEATHER1NAME"
## [57] "WEATHER2NAME" "RAILNAME"
## [59] "NOT_HOURNAME" "ARR_HOUR"
## [61] "ARR_HOURNAME" "ARR_MIN"
## [63] "ARR_MINNAME" "HOSP_HR"
## [65] "HOSP_HRNAME" "CF1NAME"
## [67] "CF2NAME" "CF3NAME"
## [69] "FATALS" "DRUNK_DR"
## [71] "STR_VEH" "MAK_MOD"
## [73] "MOD_YEARNAME" "AGENAME"
## [75] "SEXNAME" "PER_TYPNAME"
## [77] "INJ_SEVNAME" "SEAT_POSNAME"
## [79] "REST_USENAME" "REST_MISNAME"
## [81] "AIR_BAGNAME" "EJECTIONNAME"
## [83] "EJ_PATHNAME" "EXTRICATNAME"
## [85] "DRINKINGNAME" "ALC_DETNAME"
## [87] "ALC_STATUSNAME" "ATST_TYPNAME"
## [89] "ALC_RESNAME" "DRUGSNAME"
## [91] "DRUG_DETNAME" "DSTATUSNAME"
## [93] "DRUGTST1NAME" "DRUGTST2NAME"
## [95] "DRUGTST3NAME" "DRUGRES1NAME"
## [97] "DRUGRES2NAME" "DRUGRES3NAME"
## [99] "HOSPITALNAME" "DOANAME"
## [101] "DEATH_DANAME" "DEATH_MONAME"
## [103] "DEATH_YRNAME" "DEATH_HRNAME"
## [105] "DEATH_MNNAME" "DEATH_TMNAME"
## [107] "LAG_HRSNAME" "LAG_MINSNAME"
## [109] "P_SF1NAME" "P_SF2NAME"
## [111] "P_SF3NAME" "WORK_INJNAME"
## [113] "HISPANICNAME" "RACENAME"
## [115] "LOCATIONNAME" "HELM_USENAME"
## [117] "HELM_MISNAME" "VPICMAKENAME"
## [119] "VPICMODEL" "VPICMODELNAME"
## [121] "VPICBODYCLASSNAME" "ICFINALBODYNAME"
## [123] "Other_VEH_NO" "Other_PER_NO"
## [125] "Other_CRASH_VehPer_NUM1" "CRASH_NUM1_2"
## [127] "ST_CASE_2" "VEH_NO_2"
## [129] "CRASH_VEH_NUM1_2" "PER_NO_2"
## [131] "CRASH_VehPer_NUM1_2" "NUMOCCS_2"
## [133] "UNITTYPENAME_2" "REG_STATNAME_2"
## [135] "OWNERNAME_2" "MAKENAME_2"
## [137] "MAK_MODNAME_2" "BODY_TYPNAME_2"
## [139] "MOD_YEAR_2" "VINNAME_2"
## [141] "TOW_VEHNAME_2" "J_KNIFENAME_2"
## [143] "GVWRNAME_2" "V_CONFIGNAME_2"
## [145] "CARGO_BTNAME_2" "HAZ_INVNAME_2"
## [147] "BUS_USENAME_2" "SPEC_USENAME_2"
## [149] "EMER_USENAME_2" "TRAV_SPNAME_2"
## [151] "UNDERIDENAME_2" "ROLLOVERNAME_2"
## [153] "ROLINLOCNAME_2" "IMPACT1NAME_2"
## [155] "DEFORMEDNAME_2" "TOWEDNAME_2"
## [157] "M_HARMNAME_2" "VEH_SC1NAME_2"
## [159] "VEH_SC2NAME_2" "FIRE_EXPNAME_2"
## [161] "VTRAFWAYNAME_2" "VNUM_LANNAME_2"
## [163] "VSPD_LIMNAME_2" "VALIGNNAME_2"
## [165] "VPROFILENAME_2" "VPAVETYPNAME_2"
## [167] "VSURCONDNAME_2" "VTRAFCONNAME_2"
## [169] "VTCONT_FNAME_2" "P_CRASH1NAME_2"
## [171] "P_CRASH2NAME_2" "P_CRASH3NAME_2"
## [173] "PCRASH4NAME_2" "PCRASH5NAME_2"
## [175] "ACC_TYPENAME_2" "DEATHS_2"
## [177] "VEHICLECCNAME_2" "VPICMAKENAME_2"
## [179] "VPICMODEL_2" "VPICMODELNAME_2"
## [181] "VPICBODYCLASSNAME_2" "ICFINALBODYNAME_2"
## [183] "DR_PRESNAME_2" "L_STATENAME_2"
## [185] "DR_ZIPNAME_2" "L_STATUSNAME_2"
## [187] "L_TYPENAME_2" "CDL_STATNAME_2"
## [189] "L_ENDORSNAME_2" "L_COMPLNAME_2"
## [191] "L_RESTRINAME_2" "DR_HGT_2"
## [193] "DR_WGTNAME_2" "PREV_ACCNAME_2"
## [195] "PREV_SUSNAME_2" "PREV_DWINAME_2"
## [197] "PREV_SPDNAME_2" "PREV_OTHNAME_2"
## [199] "FIRST_MONAME_2" "FIRST_YRNAME_2"
## [201] "LAST_MONAME_2" "LAST_YRNAME_2"
## [203] "SPEEDRELNAME_2" "DR_SF1NAME_2"
## [205] "DR_SF2NAME_2" "DR_SF3NAME_2"
## [207] "DR_SF4NAME_2" "DR_DRINKNAME_2"
## [209] "MDRDSTRDNAME_2" "DRDISTRACTNAME_2"
## [211] "DRIMPAIRNAME_2" "MDRMANAVNAME_2"
## [213] "MANEUVERNAME_2" "MFACTORNAME_2"
## [215] "MVIOLATNNAME_2" "VIOLATIONNAME_2"
## [217] "MVISOBSCNAME_2" "VISIONNAME_2"
## [219] "STR_VEH_2" "MAK_MOD_2"
## [221] "MOD_YEARNAME_2" "AGENAME_2"
## [223] "SEXNAME_2" "PER_TYPNAME_2"
## [225] "INJ_SEVNAME_2" "SEAT_POSNAME_2"
## [227] "REST_USENAME_2" "REST_MISNAME_2"
## [229] "AIR_BAGNAME_2" "EJECTIONNAME_2"
## [231] "EJ_PATHNAME_2" "EXTRICATNAME_2"
## [233] "DRINKINGNAME_2" "ALC_DETNAME_2"
## [235] "ALC_STATUSNAME_2" "ATST_TYPNAME_2"
## [237] "ALC_RESNAME_2" "DRUGSNAME_2"
## [239] "DRUG_DETNAME_2" "DSTATUSNAME_2"
## [241] "DRUGTST1NAME_2" "DRUGTST2NAME_2"
## [243] "DRUGTST3NAME_2" "DRUGRES1NAME_2"
## [245] "DRUGRES2NAME_2" "DRUGRES3NAME_2"
## [247] "HOSPITALNAME_2" "DOANAME_2"
## [249] "DEATH_DANAME_2" "DEATH_MONAME_2"
## [251] "DEATH_YRNAME_2" "DEATH_HRNAME_2"
## [253] "DEATH_MNNAME_2" "DEATH_TMNAME_2"
## [255] "LAG_HRSNAME_2" "LAG_MINSNAME_2"
## [257] "P_SF1NAME_2" "P_SF2NAME_2"
## [259] "P_SF3NAME_2" "WORK_INJNAME_2"
## [261] "HISPANICNAME_2" "RACENAME_2"
## [263] "LOCATIONNAME_2" "HELM_USENAME_2"
## [265] "HELM_MISNAME_2" "COUNTYNAME"
## [267] "CITYNAME" "DAY"
## [269] "MONTHNAME" "DAY_WEEKNAME"
## [271] "HOUR" "HOURNAME"
## [273] "RUR_URBNAME" "FUNC_SYSNAME"
## [275] "HARM_EVNAME" "MAN_COLLNAME"
## [277] "WEATHERNAME" "SCH_BUSNAME"
keep_vars <- c(
"STATENAME",
"YEAR",
"CRASH_NUM1",
"WORK_INJNAME",
"LATITUDENAME",
"LONGITUDNAME",
"VE_TOTAL",
"FATALS",
"DRUNK_DR",
"HIT_RUNNAME_2",
"WRK_ZONENAME",
"REL_ROADNAME",
"WEATHER1NAME",
"LGT_CONDNAME",
"BODY_TYPNAME",
"RUR_URBNAME",
"FUNC_SYSNAME",
"PBAGENAME",
"PBSEXNAME",
"BIKECTYPENAME",
"BIKEPOSNAME",
"MOTMANNAME",
"BIKELOCNAME",
"BIKECGPNAME",
"INJ_SEVNAMEDr",
"AGENAMEDr",
"SEXNAMEDr",
"DRINKINGNAMEDr",
"HAZ_INVNAME_2",
"BUS_USENAME_2",
"EMER_USENAME_2",
"TRAV_SPNAME_2",
"UNDERIDENAME_2",
"ROLLOVERNAME_2",
"DEFORMEDNAME_2",
"VTRAFWAYNAME_2",
"VNUM_LANNAME_2",
"VSPD_LIMNAME_2",
"VALIGNNAME_2",
"VPROFILENAME_2",
"VPAVETYPNAME_2",
"VSURCONDNAME_2",
"P_CRASH1NAME_2",
"ACC_TYPENAME_2",
"L_STATUSNAME_2",
"L_TYPENAME_2",
"CDL_STATNAME_2",
"MDRDSTRDNAME_2",
"DRIMPAIRNAME_2",
"MDRMANAVNAME_2",
"DAY_WEEKNAME"
)
dat1 <- dat %>%
select(all_of(keep_vars))
dim(dat1)
## [1] 8689 51
names(dat1)
## [1] "STATENAME" "YEAR" "CRASH_NUM1" "WORK_INJNAME"
## [5] "LATITUDENAME" "LONGITUDNAME" "VE_TOTAL" "FATALS"
## [9] "DRUNK_DR" "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [13] "WEATHER1NAME" "LGT_CONDNAME" "BODY_TYPNAME" "RUR_URBNAME"
## [17] "FUNC_SYSNAME" "PBAGENAME" "PBSEXNAME" "BIKECTYPENAME"
## [21] "BIKEPOSNAME" "MOTMANNAME" "BIKELOCNAME" "BIKECGPNAME"
## [25] "INJ_SEVNAMEDr" "AGENAMEDr" "SEXNAMEDr" "DRINKINGNAMEDr"
## [29] "HAZ_INVNAME_2" "BUS_USENAME_2" "EMER_USENAME_2" "TRAV_SPNAME_2"
## [33] "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2" "VTRAFWAYNAME_2"
## [37] "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2" "VPROFILENAME_2"
## [41] "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2" "ACC_TYPENAME_2"
## [45] "L_STATUSNAME_2" "L_TYPENAME_2" "CDL_STATNAME_2" "MDRDSTRDNAME_2"
## [49] "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME"
datatable(
head(dat1, 25),
options = list(scrollX = TRUE, pageLength = 10),
caption = "Selected variables"
)
dat1 <- dat1 %>%
mutate(
age_num = as.numeric(str_extract(PBAGENAME, "\\d+")),
Ped_Age = case_when(
age_num <= 4 ~ "0-4",
age_num <= 9 ~ "5-9",
age_num <= 14 ~ "10-14",
age_num <= 20 ~ "15-20",
age_num <= 24 ~ "21-24",
age_num <= 34 ~ "25-34",
age_num <= 44 ~ "35-44",
age_num <= 54 ~ "45-54",
age_num <= 64 ~ "55-64",
age_num <= 74 ~ "65-74",
age_num >= 75 ~ "75+",
TRUE ~ "Unknown"
),
age_num1 = as.numeric(str_extract(AGENAMEDr, "\\d+")),
Dr_Age = case_when(
age_num1 <= 4 ~ "0-4",
age_num1 <= 9 ~ "5-9",
age_num1 <= 14 ~ "10-14",
age_num1 <= 20 ~ "15-20",
age_num1 <= 24 ~ "21-24",
age_num1 <= 34 ~ "25-34",
age_num1 <= 44 ~ "35-44",
age_num1 <= 54 ~ "45-54",
age_num1 <= 64 ~ "55-64",
age_num1 <= 74 ~ "65-74",
age_num1 >= 75 ~ "75+",
TRUE ~ "Unknown"
)
)
table(dat1$Ped_Age, useNA = "ifany")
##
## 0-4 10-14 15-20 21-24 25-34 35-44 45-54 5-9 55-64 65-74
## 40 282 493 300 1004 1228 1505 107 1981 1093
## 75+ Unknown
## 536 120
table(dat1$Dr_Age, useNA = "ifany")
##
## 10-14 15-20 21-24 25-34 35-44 45-54 55-64 65-74 75+ Unknown
## 4 749 763 1800 1401 1161 992 562 343 914
dat1 <- dat1 %>%
mutate(
Day_Type = case_when(
DAY_WEEKNAME %in% c("Saturday", "Sunday") ~ "Weekend",
!is.na(DAY_WEEKNAME) ~ "Weekday",
TRUE ~ "Unknown"
)
)
table(dat1$Day_Type, useNA = "ifany")
##
## Weekday Weekend
## 6234 2455
dat2 <- dat1 %>%
mutate(
Veh_Type = case_when(
str_detect(BODY_TYPNAME, regex("sedan|coupe|hatchback|convertible|station wagon|automobile", ignore_case = TRUE)) ~ "Car",
str_detect(BODY_TYPNAME, regex("pickup|utility|light truck|cab chassis based", ignore_case = TRUE)) ~ "Light_Truck",
str_detect(BODY_TYPNAME, regex("truck-tractor|single-unit|heavy truck|medium/heavy|construction equipment|farm equipment|motorhome", ignore_case = TRUE)) ~ "Large_Truck",
str_detect(BODY_TYPNAME, regex("bus", ignore_case = TRUE)) ~ "Bus",
str_detect(BODY_TYPNAME, regex("van|minivan|step-van|walk-in van", ignore_case = TRUE)) ~ "Van",
str_detect(BODY_TYPNAME, regex("motorcycle|moped|scooter|motored cycle", ignore_case = TRUE)) ~ "Motorcycle",
str_detect(BODY_TYPNAME, regex("ATV|ATC|golf cart|off-highway|recreational|three-wheel|go-cart|fork-lift|street sweeper", ignore_case = TRUE)) ~ "Off_Road_Other",
is.na(BODY_TYPNAME) |
BODY_TYPNAME %in% c("NA", "Not Reported", "Unknown body type") |
str_detect(BODY_TYPNAME, regex("unknown|not reported", ignore_case = TRUE)) ~ "Unknown",
TRUE ~ "Off_Road_Other"
)
)
table(dat2$Veh_Type, useNA = "ifany")
##
## Bus Car Large_Truck Light_Truck Motorcycle
## 218 3249 698 3465 80
## Off_Road_Other Unknown Van
## 8 695 276
names(dat2)
## [1] "STATENAME" "YEAR" "CRASH_NUM1" "WORK_INJNAME"
## [5] "LATITUDENAME" "LONGITUDNAME" "VE_TOTAL" "FATALS"
## [9] "DRUNK_DR" "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [13] "WEATHER1NAME" "LGT_CONDNAME" "BODY_TYPNAME" "RUR_URBNAME"
## [17] "FUNC_SYSNAME" "PBAGENAME" "PBSEXNAME" "BIKECTYPENAME"
## [21] "BIKEPOSNAME" "MOTMANNAME" "BIKELOCNAME" "BIKECGPNAME"
## [25] "INJ_SEVNAMEDr" "AGENAMEDr" "SEXNAMEDr" "DRINKINGNAMEDr"
## [29] "HAZ_INVNAME_2" "BUS_USENAME_2" "EMER_USENAME_2" "TRAV_SPNAME_2"
## [33] "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2" "VTRAFWAYNAME_2"
## [37] "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2" "VPROFILENAME_2"
## [41] "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2" "ACC_TYPENAME_2"
## [45] "L_STATUSNAME_2" "L_TYPENAME_2" "CDL_STATNAME_2" "MDRDSTRDNAME_2"
## [49] "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME" "age_num"
## [53] "Ped_Age" "age_num1" "Dr_Age" "Day_Type"
## [57] "Veh_Type"
dat2 <- dat2[, -c(52, 54, 18, 26, 15, 51)]
dim(dat2)
## [1] 8689 51
names(dat2)
## [1] "STATENAME" "YEAR" "CRASH_NUM1" "WORK_INJNAME"
## [5] "LATITUDENAME" "LONGITUDNAME" "VE_TOTAL" "FATALS"
## [9] "DRUNK_DR" "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [13] "WEATHER1NAME" "LGT_CONDNAME" "RUR_URBNAME" "FUNC_SYSNAME"
## [17] "PBSEXNAME" "BIKECTYPENAME" "BIKEPOSNAME" "MOTMANNAME"
## [21] "BIKELOCNAME" "BIKECGPNAME" "INJ_SEVNAMEDr" "SEXNAMEDr"
## [25] "DRINKINGNAMEDr" "HAZ_INVNAME_2" "BUS_USENAME_2" "EMER_USENAME_2"
## [29] "TRAV_SPNAME_2" "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2"
## [33] "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"
## [37] "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2"
## [41] "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2" "CDL_STATNAME_2"
## [45] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "Ped_Age"
## [49] "Dr_Age" "Day_Type" "Veh_Type"
dat2 <- dat2 %>%
mutate(
spd_num = as.numeric(str_extract(VSPD_LIMNAME_2, "\\d+")),
PSL = case_when(
spd_num <= 25 ~ "25_or_less",
spd_num <= 35 ~ "30_35",
spd_num <= 45 ~ "40_45",
spd_num <= 55 ~ "50_55",
spd_num >= 60 ~ "60_plus",
TRUE ~ "Unknown"
)
)
table(dat2$PSL, useNA = "ifany")
##
## 25_or_less 30_35 40_45 50_55 60_plus Unknown
## 950 2205 2827 1781 447 479
dat3 <- dat2[, -c(35, 52)]
dim(dat3)
## [1] 8689 51
dat3 <- dat3 %>%
mutate(
across(
where(~ is.character(.) | is.factor(.)),
~ case_when(
is.na(.) ~ "Not Reported",
str_trim(as.character(.)) %in% c(
"NA", "N/A", "Unknown", "Reported as Unknown",
" Reported as Unknown",
"Reported as Unknown if Distracted ",
"Reported as Unknown if Impaired",
"Unknown body type", "Not Reported"
) ~ "Not Reported",
str_detect(as.character(.), regex("^unknown|not reported$", ignore_case = TRUE)) ~ "Not Reported",
TRUE ~ as.character(.)
)
)
)
cat_summary <- dat3 %>%
summarise(across(
where(~ is.character(.) | is.factor(.)),
~ n_distinct(., na.rm = TRUE)
)) %>%
pivot_longer(
everything(),
names_to = "variable",
values_to = "n_categories"
) %>%
arrange(desc(n_categories))
cat_summary
## # A tibble: 47 × 2
## variable n_categories
## <chr> <int>
## 1 CRASH_NUM1 8650
## 2 LONGITUDNAME 8608
## 3 LATITUDENAME 8594
## 4 TRAV_SPNAME_2 93
## 5 BIKECTYPENAME 81
## 6 STATENAME 51
## 7 ACC_TYPENAME_2 36
## 8 MDRDSTRDNAME_2 27
## 9 BIKECGPNAME 22
## 10 P_CRASH1NAME_2 20
## # ℹ 37 more rows
datatable(
cat_summary,
options = list(pageLength = 25),
caption = "Number of categories after lumping"
)
#### BIKECTYPENAME (From 81 to 12 Categories)
dat3 <- dat3 %>%
mutate(
bike_type_clean = BIKECTYPENAME %>%
str_replace_all("–|–|Â|ÿ", "-") %>%
str_squish(),
BikeCTyp = case_when(
str_detect(bike_type_clean, regex("Ride[- ]?Out|Drive[- ]?Out|Drive-In/Out", ignore_case = TRUE)) ~
"Ride-Out / Drive-Out",
str_detect(bike_type_clean, regex("Ride Through|Drive Through", ignore_case = TRUE)) ~
"Ride-Through / Drive-Through",
str_detect(bike_type_clean, regex("Left Turn|Right Turn|Right Turn on Red|Turn / Merge|Turning Error", ignore_case = TRUE)) ~
"Turning / Merging",
str_detect(bike_type_clean, regex("Crossing Paths", ignore_case = TRUE)) ~
"Crossing Paths",
str_detect(bike_type_clean, regex("Overtaking|Passing|Extended Door", ignore_case = TRUE)) ~
"Overtaking / Passing",
str_detect(bike_type_clean, regex("Wrong-Way|Wrong-Side", ignore_case = TRUE)) ~
"Wrong-Way / Wrong-Side",
str_detect(bike_type_clean, regex("Lost Control", ignore_case = TRUE)) ~
"Loss of Control",
str_detect(bike_type_clean, regex("Backing Vehicle", ignore_case = TRUE)) ~
"Backing Vehicle",
str_detect(bike_type_clean, regex("Multiple Threat|Failed to Clear|Trapped", ignore_case = TRUE)) ~
"Multiple Threat / Trapped",
str_detect(bike_type_clean, regex("Parallel Paths", ignore_case = TRUE)) ~
"Parallel Paths",
str_detect(bike_type_clean, regex("Non-Roadway|Play Vehicle", ignore_case = TRUE)) ~
"Non-Roadway / Play Vehicle",
TRUE ~
"Other / Unknown / Not Reported"
)
)
dat3 %>%
count(BikeCTyp, sort = TRUE) %>%
mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 12 × 3
## BikeCTyp n percent
## <chr> <int> <dbl>
## 1 Overtaking / Passing 2606 30.0
## 2 Turning / Merging 1441 16.6
## 3 Ride-Out / Drive-Out 1279 14.7
## 4 Ride-Through / Drive-Through 927 10.7
## 5 Other / Unknown / Not Reported 723 8.32
## 6 Parallel Paths 517 5.95
## 7 Crossing Paths 504 5.8
## 8 Wrong-Way / Wrong-Side 431 4.96
## 9 Loss of Control 190 2.19
## 10 Multiple Threat / Trapped 28 0.32
## 11 Backing Vehicle 22 0.25
## 12 Non-Roadway / Play Vehicle 21 0.24
### BIKECGPNAME (21 to 9)
dat3 <- dat3 %>%
mutate(
bike_cgp_clean = BIKECGPNAME %>%
str_replace_all("–|–|Â|ÿ", "-") %>%
str_squish(),
BikeCgp = case_when(
str_detect(bike_cgp_clean, regex("Bicyclist Failed to Yield", ignore_case = TRUE)) ~
"Bicyclist Failed to Yield",
str_detect(bike_cgp_clean, regex("Motorist Failed to Yield", ignore_case = TRUE)) ~
"Motorist Failed to Yield",
str_detect(bike_cgp_clean, regex("Bicyclist Left Turn|Bicyclist Right Turn|Bicyclist.*Merge", ignore_case = TRUE)) ~
"Bicyclist Turn / Merge",
str_detect(bike_cgp_clean, regex("Motorist Left Turn|Motorist Right Turn|Motorist.*Merge", ignore_case = TRUE)) ~
"Motorist Turn / Merge",
str_detect(bike_cgp_clean, regex("Overtaking", ignore_case = TRUE)) ~
"Overtaking",
str_detect(bike_cgp_clean, regex("Crossing Paths|Parallel Paths", ignore_case = TRUE)) ~
"Crossing / Parallel Paths",
str_detect(bike_cgp_clean, regex("Loss of Control|Turning Error", ignore_case = TRUE)) ~
"Loss of Control / Turning Error",
str_detect(bike_cgp_clean, regex("Backing|Wrong-Way|Wrong-Side|Non-Trafficway|Parking|Bus", ignore_case = TRUE)) ~
"Backing / Wrong-Way / Non-Trafficway",
TRUE ~
"Other / Unknown / Unusual"
)
)
dat3 %>%
count(BikeCgp, sort = TRUE) %>%
mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 9 × 3
## BikeCgp n percent
## <chr> <int> <dbl>
## 1 Overtaking 2606 30.0
## 2 Bicyclist Failed to Yield 1934 22.3
## 3 Crossing / Parallel Paths 1254 14.4
## 4 Bicyclist Turn / Merge 757 8.71
## 5 Other / Unknown / Unusual 597 6.87
## 6 Motorist Turn / Merge 483 5.56
## 7 Backing / Wrong-Way / Non-Trafficway 464 5.34
## 8 Loss of Control / Turning Error 352 4.05
## 9 Motorist Failed to Yield 242 2.79
##### ACC_TYPENAME_2
dat3 <- dat3 %>%
mutate(
acc_type_clean = ACC_TYPENAME_2 %>%
str_replace_all("–|–", "-") %>%
str_remove("^[A-Z][0-9]+-") %>% # removes A1-, B6-, C13-, M99-, etc.
str_squish(),
AccTyp = case_when(
# 1-4. Roadside departure
str_detect(acc_type_clean, regex("Right Roadside Departure-Drive Off Road", ignore_case = TRUE)) ~
"Right Roadside Departure - Drive Off Road",
str_detect(acc_type_clean, regex("Right Roadside Departure-Control/Traction Loss", ignore_case = TRUE)) ~
"Right Roadside Departure - Control Loss",
str_detect(acc_type_clean, regex("Left Roadside Departure-Drive Off Road", ignore_case = TRUE)) ~
"Left Roadside Departure - Drive Off Road",
str_detect(acc_type_clean, regex("Left Roadside Departure-Control/Traction Loss", ignore_case = TRUE)) ~
"Left Roadside Departure - Control Loss",
# 5. Other roadside departure
str_detect(acc_type_clean, regex("Roadside Departure", ignore_case = TRUE)) ~
"Other Roadside Departure",
# 6-8. Forward impact
str_detect(acc_type_clean, regex("Forward Impact-Pedestrian|Pedestrian/ Animal", ignore_case = TRUE)) ~
"Forward Impact - Pedestrian/Animal",
str_detect(acc_type_clean, regex("Forward Impact-Parked Veh", ignore_case = TRUE)) ~
"Forward Impact - Parked Vehicle",
str_detect(acc_type_clean, regex("Forward Impact-Sta|Sta\\. Object|End Departure", ignore_case = TRUE)) ~
"Forward Impact - Object/End Departure",
# 9-11. Rear-end
str_detect(acc_type_clean, regex("Rear End-Stopped", ignore_case = TRUE)) ~
"Rear-End - Stopped Vehicle",
str_detect(acc_type_clean, regex("Rear End-Slower", ignore_case = TRUE)) ~
"Rear-End - Slower Vehicle",
str_detect(acc_type_clean, regex("Rear End-Decelerating|Rear End-Specifics", ignore_case = TRUE)) ~
"Rear-End - Decelerating/Other",
# 12-13. Same-direction sideswipe/angle
str_detect(acc_type_clean, regex("Same Direction-Angle, Sideswipe.*Straight", ignore_case = TRUE)) ~
"Same-Direction Sideswipe - Straight",
str_detect(acc_type_clean, regex("Same Direction-Angle, Sideswipe.*Changing Lanes|Same Direction-Angle, Sideswipe-Specifics", ignore_case = TRUE)) ~
"Same-Direction Sideswipe - Lane Change/Other",
# 14-15. Opposite-direction conflict
str_detect(acc_type_clean, regex("Opposite Direction-Head-On", ignore_case = TRUE)) ~
"Opposite-Direction Head-On",
str_detect(acc_type_clean, regex("Opposite Direction-Angle|Opposite Direction-Forward Impact", ignore_case = TRUE)) ~
"Opposite-Direction Angle/Sideswipe",
# 16-17. Turning conflicts
str_detect(acc_type_clean, regex("Turn Across Path", ignore_case = TRUE)) ~
"Turning Conflict - Turn Across Path",
str_detect(acc_type_clean, regex("Turn Into Path", ignore_case = TRUE)) ~
"Turning Conflict - Turn Into Path",
# 18. Intersecting paths
str_detect(acc_type_clean, regex("Intersecting Paths", ignore_case = TRUE)) ~
"Intersecting-Path Conflict",
# 19. Backing
str_detect(acc_type_clean, regex("Backing", ignore_case = TRUE)) ~
"Backing",
# 20. Other / unknown
TRUE ~
"Other / Unknown / No Impact"
)
)
dat3 %>%
count(AccTyp, sort = TRUE) %>%
mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 18 × 3
## AccTyp n percent
## <chr> <int> <dbl>
## 1 Forward Impact - Pedestrian/Animal 7227 83.2
## 2 Other / Unknown / No Impact 1244 14.3
## 3 Right Roadside Departure - Drive Off Road 103 1.19
## 4 Left Roadside Departure - Drive Off Road 21 0.24
## 5 Backing 20 0.23
## 6 Intersecting-Path Conflict 11 0.13
## 7 Other Roadside Departure 11 0.13
## 8 Right Roadside Departure - Control Loss 10 0.12
## 9 Turning Conflict - Turn Across Path 10 0.12
## 10 Left Roadside Departure - Control Loss 6 0.07
## 11 Rear-End - Slower Vehicle 5 0.06
## 12 Forward Impact - Object/End Departure 4 0.05
## 13 Forward Impact - Parked Vehicle 4 0.05
## 14 Opposite-Direction Angle/Sideswipe 4 0.05
## 15 Same-Direction Sideswipe - Lane Change/Other 3 0.03
## 16 Turning Conflict - Turn Into Path 3 0.03
## 17 Same-Direction Sideswipe - Straight 2 0.02
## 18 Opposite-Direction Head-On 1 0.01
names(dat3)
## [1] "STATENAME" "YEAR" "CRASH_NUM1" "WORK_INJNAME"
## [5] "LATITUDENAME" "LONGITUDNAME" "VE_TOTAL" "FATALS"
## [9] "DRUNK_DR" "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [13] "WEATHER1NAME" "LGT_CONDNAME" "RUR_URBNAME" "FUNC_SYSNAME"
## [17] "PBSEXNAME" "BIKECTYPENAME" "BIKEPOSNAME" "MOTMANNAME"
## [21] "BIKELOCNAME" "BIKECGPNAME" "INJ_SEVNAMEDr" "SEXNAMEDr"
## [25] "DRINKINGNAMEDr" "HAZ_INVNAME_2" "BUS_USENAME_2" "EMER_USENAME_2"
## [29] "TRAV_SPNAME_2" "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2"
## [33] "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VALIGNNAME_2" "VPROFILENAME_2"
## [37] "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2" "ACC_TYPENAME_2"
## [41] "L_STATUSNAME_2" "L_TYPENAME_2" "CDL_STATNAME_2" "MDRDSTRDNAME_2"
## [45] "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "Ped_Age" "Dr_Age"
## [49] "Day_Type" "Veh_Type" "PSL" "bike_type_clean"
## [53] "BikeCTyp" "bike_cgp_clean" "BikeCgp" "acc_type_clean"
## [57] "AccTyp"
dat3 <- dat3[, -c(18, 22, 40, 52, 54, 56)]
dim(dat3)
## [1] 8689 51
names(dat3)
## [1] "STATENAME" "YEAR" "CRASH_NUM1" "WORK_INJNAME"
## [5] "LATITUDENAME" "LONGITUDNAME" "VE_TOTAL" "FATALS"
## [9] "DRUNK_DR" "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [13] "WEATHER1NAME" "LGT_CONDNAME" "RUR_URBNAME" "FUNC_SYSNAME"
## [17] "PBSEXNAME" "BIKEPOSNAME" "MOTMANNAME" "BIKELOCNAME"
## [21] "INJ_SEVNAMEDr" "SEXNAMEDr" "DRINKINGNAMEDr" "HAZ_INVNAME_2"
## [25] "BUS_USENAME_2" "EMER_USENAME_2" "TRAV_SPNAME_2" "UNDERIDENAME_2"
## [29] "ROLLOVERNAME_2" "DEFORMEDNAME_2" "VTRAFWAYNAME_2" "VNUM_LANNAME_2"
## [33] "VALIGNNAME_2" "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2"
## [37] "P_CRASH1NAME_2" "L_STATUSNAME_2" "L_TYPENAME_2" "CDL_STATNAME_2"
## [41] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "Ped_Age"
## [45] "Dr_Age" "Day_Type" "Veh_Type" "PSL"
## [49] "BikeCTyp" "BikeCgp" "AccTyp"
keep_cols <- c("CRASH_NUM1", "Ped_Age", "Dr_Age", "Veh_Type", "LATITUDENAME", "LONGITUDNAME", "STATENAME", "LGT_CONDNAME", "BikeCTyp", "BikeCgp","AccTyp", "YEAR", "VE_TOTAL", "Veh_Type", "BIKEPOSNAME", "FUNC_SYSNAME")
dat3_lumped <- dat3 %>%
mutate(
across(
.cols = setdiff(
names(select(., where(~ is.character(.) | is.factor(.)))),
keep_cols
),
.fns = ~ if (n_distinct(., na.rm = TRUE) > 5) {
fct_lump_n(as.factor(.), n = 5, other_level = "Other")
} else {
as.factor(.)
}
)
)
dim(dat3_lumped)
## [1] 8689 51
category_summary <- data.frame(
Variable = names(dat3_lumped),
Categories = sapply(dat3_lumped, function(x) n_distinct(x, na.rm = FALSE))
) %>%
arrange(desc(Categories))
datatable(
category_summary,
options = list(pageLength = 25),
caption = "Number of categories after lumping"
)
freq_tables <- lapply(
dat3_lumped %>% select(-all_of(keep_cols)),
table,
useNA = "ifany"
)
freq_tables
## $WORK_INJNAME
##
## No Not Reported Redacted Yes
## 7865 773 34 17
##
## $FATALS
##
## 1 2 3 5
## 8585 90 4 10
##
## $DRUNK_DR
##
## 0 1 2 <NA>
## 3964 368 5 4352
##
## $HIT_RUNNAME_2
##
## No Not Reported Yes
## 6875 10 1804
##
## $WRK_ZONENAME
##
## Construction Maintenance None
## 42 6 8596
## Utility Work Zone, Type Unknown
## 1 44
##
## $REL_ROADNAME
##
## Continuous Left - Turn Lane Not Reported
## 17 46
## On Roadside On Roadway
## 222 8027
## On Shoulder Other
## 340 37
##
## $WEATHER1NAME
##
## Clear Cloudy Fog, Smog, Smoke Not Reported
## 2611 417 13 5495
## Rain Other
## 145 8
##
## $RUR_URBNAME
##
## Not Reported Rural
## 35 1698
## Trafficway Not in State Inventory Urban
## 12 6944
##
## $PBSEXNAME
##
## Female Male Not Reported
## 1113 7503 73
##
## $MOTMANNAME
##
## Not a Pedestrian
## 8689
##
## $BIKELOCNAME
##
## At Intersection Intersection-Related Non-Trafficway Location
## 2519 708 8
## Not At Intersection Not Reported
## 5410 44
##
## $INJ_SEVNAMEDr
##
## No Apparent Injury (O) Not Reported
## 7349 661
## Possible Injury (C) Suspected Minor Injury (B)
## 336 212
## Suspected Serious Injury (A) Other
## 63 68
##
## $SEXNAMEDr
##
## Female Male Not Reported
## 2218 5596 875
##
## $DRINKINGNAMEDr
##
## No (Alcohol Not Involved) Not Reported Yes (Alcohol Involved)
## 5449 2577 663
##
## $HAZ_INVNAME_2
##
## No Not Reported Yes
## 8670 5 14
##
## $BUS_USENAME_2
##
## Charter/Tour Not a Bus Not Reported School
## 8 8080 531 26
## Transit/ Commuter Other
## 37 7
##
## $EMER_USENAME_2
##
## Emergency Operation, Emergency Warning Equipment in Use
## 5
## Emergency Operation, Emergency Warning Equipment in Use Unknown
## 9
## Non-Emergency, Non-Transport
## 6
## Not Applicable
## 8513
## Not Reported
## 149
## Other
## 7
##
## $TRAV_SPNAME_2
##
## 035 MPH 040 MPH 045 MPH 055 MPH Not Reported Other
## 389 412 617 359 5007 1905
##
## $UNDERIDENAME_2
##
## No Underride or Override Noted
## 4335
## Not Reported
## 4353
## Overriding a Motor Vehicle Not In-Transport
## 1
##
## $ROLLOVERNAME_2
##
## No Rollover Not Applicable
## 8614 31
## Not Reported Rollover
## 5 16
## Rollover, Tripped by Object/Vehicle Other
## 20 3
##
## $DEFORMEDNAME_2
##
## Damage Reported, Extent Unknown Disabling Damage
## 580 1723
## Functional Damage Minor Damage
## 2396 2041
## Not Reported Other
## 1434 515
##
## $VTRAFWAYNAME_2
##
## One-Way Trafficway
## 265
## Two-Way, Divided, Positive Median Barrier
## 486
## Two-Way, Divided, Unprotected Median
## 1965
## Two-Way, Not Divided
## 4633
## Two-Way, Not Divided With a Continuous Left-Turn Lane
## 964
## Other
## 376
##
## $VNUM_LANNAME_2
##
## Five lanes Four lanes Six lanes Three lanes Two lanes Other
## 1009 1101 182 1161 4724 512
##
## $VALIGNNAME_2
##
## Curve - Left Curve Left Curve Right Not Reported Straight Other
## 113 163 150 257 7812 194
##
## $VPROFILENAME_2
##
## Downhill Grade, Unknown Slope Level
## 228 467 6482
## Not Reported Uphill Other
## 1006 304 202
##
## $VPAVETYPNAME_2
##
## Blacktop, Bituminous, or Asphalt Concrete
## 4950 508
## Non-Trafficway or Driveway Access Not Reported
## 45 3164
## Slag, Gravel or Stone Other
## 12 10
##
## $VSURCONDNAME_2
##
## Dry Non-Trafficway or Driveway Access
## 7835 45
## Not Reported Snow
## 127 10
## Wet Other
## 653 19
##
## $P_CRASH1NAME_2
##
## Going Straight Negotiating a Curve Not Reported Turning Left
## 6734 480 212 427
## Turning Right Other
## 428 408
##
## $L_STATUSNAME_2
##
## Expired Not licensed Not Reported Suspended Valid Other
## 85 334 928 370 6901 71
##
## $L_TYPENAME_2
##
## Full Driver License Intermediate Driver License
## 7225 141
## Learner's Permit Not Licensed
## 39 334
## Not Reported Other
## 928 22
##
## $CDL_STATNAME_2
##
## Disqualified No (CDL) Not Reported Suspended Valid Other
## 21 6788 934 19 879 48
##
## $MDRDSTRDNAME_2
##
## Distraction/Inattention
## 47
## Inattention (Inattentive), Details Unknown
## 100
## Not Distracted
## 1659
## Not Reported
## 6521
## Reported as Unknown if Distracted
## 180
## Other
## 182
##
## $DRIMPAIRNAME_2
##
## Asleep or Fatigued
## 64
## Ill, Blackout
## 32
## None/Apparently Normal
## 3973
## Not Reported
## 3842
## Under the Influence of Alcohol, Drugs or Medication
## 685
## Other
## 93
##
## $MDRMANAVNAME_2
##
## Driver Did Not Maneuver to Avoid
## 974
## Motor Vehicle
## 3
## Not Reported
## 7312
## Pedestrian, Pedalcyclist or Other Non-Motorist
## 387
## Phantom/Non-Contact Motor Vehicle
## 8
## Other
## 5
##
## $Day_Type
##
## Weekday Weekend
## 6234 2455
##
## $PSL
##
## 25_or_less 30_35 40_45 50_55 Not Reported Other
## 950 2205 2827 1781 479 447
keep_names <- c("CRASH_NUM1")
old_names <- names(dat3_lumped)
new_names <- ifelse(
old_names %in% keep_names,
old_names,
str_to_title(str_sub(str_replace_all(old_names, "_", ""), 1, 4))
)
new_names <- make.unique(new_names, sep = "")
name_key <- data.frame(
old_name = old_names,
new_name = new_names
)
names(dat3_lumped) <- new_names
names(dat3_lumped)
## [1] "Stat" "Year" "CRASH_NUM1" "Work" "Lati"
## [6] "Long" "Veto" "Fata" "Drun" "Hitr"
## [11] "Wrkz" "Relr" "Weat" "Lgtc" "Ruru"
## [16] "Func" "Pbse" "Bike" "Motm" "Bike1"
## [21] "Injs" "Sexn" "Drin" "Hazi" "Busu"
## [26] "Emer" "Trav" "Unde" "Roll" "Defo"
## [31] "Vtra" "Vnum" "Vali" "Vpro" "Vpav"
## [36] "Vsur" "Pcra" "Lsta" "Ltyp" "Cdls"
## [41] "Mdrd" "Drim" "Mdrm" "Peda" "Drag"
## [46] "Dayt" "Veht" "Psl" "Bike2" "Bike3"
## [51] "Acct"
datatable(
name_key,
options = list(pageLength = 25, scrollX = TRUE),
caption = "Variable name key"
)
datatable(
head(dat3_lumped, 50),
options = list(scrollX = TRUE, pageLength = 10),
caption = "Final processed dataset"
)
write.csv(dat3_lumped, "Bike01.csv", row.names = FALSE)
write_xlsx(dat3_lumped, "Bike01.xlsx")
write.csv(name_key, "Bike01_variable_name_key.csv", row.names = FALSE)
write_xlsx(name_key, "Bike01_variable_name_key.xlsx")
write.csv(category_summary, "Bike01_category_summary.csv", row.names = FALSE)
write_xlsx(category_summary, "Bike01_category_summary.xlsx")
The following files are created in the working directory:
Bike01.csvBike01.xlsxBike01_variable_name_key.csvBike01_variable_name_key.xlsxBike01_category_summary.csvBike01_category_summary.xlsx