library(readxl)
library(dplyr)
library(stringr)
library(forcats)
library(DT)
library(writexl)
library(tidyr)
data_dir <- "D:/DAS_CODE/Tools/data"
setwd(data_dir)
dat <- read_excel("Ped_Bike_Esc_2016_2024_All.xlsx", sheet = "Ped_Lim")
dim(dat)
## [1] 60741 81
names(dat)
## [1] "STATENAME" "YEAR" "CRASH_NUM1"
## [4] "CRASH_VEH_NUM1" "CRASH_VehPer_NUM1" "PER_TYPNAME"
## [7] "INJ_SEVNAME" "LATITUDENAME" "LONGITUDNAME"
## [10] "VE_TOTAL" "FATALS" "DRUNK_DR"
## [13] "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [16] "WEATHER1NAME" "LGT_CONDNAME" "BODY_TYPNAME"
## [19] "RUR_URBNAME" "FUNC_SYSNAME" "PBAGENAME"
## [22] "PBSEXNAME" "PEDCTYPENAME" "PEDPOSNAME"
## [25] "MOTMANNAME" "PEDLEGNAME" "PEDSNRNAME"
## [28] "PEDCGPNAME" "INJ_SEVNAMEDr" "AGENAMEDr"
## [31] "SEXNAMEDr" "DRINKINGNAMEDr" "ROUTENAME"
## [34] "TYP_INTNAME" "DRINKINGNAME" "ALC_RESNAME"
## [37] "DRUGSNAME" "DRUGRES1NAME" "WORK_INJNAME"
## [40] "RACENAME" "LOCATIONNAME" "NUMOCCS_2"
## [43] "MAKENAME_2" "MAK_MODNAME_2" "MOD_YEAR_2"
## [46] "HAZ_INVNAME_2" "BUS_USENAME_2" "EMER_USENAME_2"
## [49] "TRAV_SPNAME_2" "UNDERIDENAME_2" "ROLLOVERNAME_2"
## [52] "DEFORMEDNAME_2" "VTRAFWAYNAME_2" "VNUM_LANNAME_2"
## [55] "VSPD_LIMNAME_2" "VALIGNNAME_2" "VPROFILENAME_2"
## [58] "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2"
## [61] "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"
## [64] "CDL_STATNAME_2" "MDRDSTRDNAME_2" "DRIMPAIRNAME_2"
## [67] "MDRMANAVNAME_2" "MVISOBSCNAME_2" "INJ_SEVNAME_2"
## [70] "ALC_RESNAME_2" "DRUGSNAME_2" "COUNTYNAME"
## [73] "CITYNAME" "DAY" "MONTHNAME"
## [76] "DAY_WEEKNAME" "HOUR" "HOURNAME"
## [79] "MAN_COLLNAME" "WEATHERNAME" "SCH_BUSNAME"
dat1 <- dat[, c(1, 2, 3, 39, 8:32, 46:67, 76)]
dim(dat1)
## [1] 60741 52
names(dat1)
## [1] "STATENAME" "YEAR" "CRASH_NUM1" "WORK_INJNAME"
## [5] "LATITUDENAME" "LONGITUDNAME" "VE_TOTAL" "FATALS"
## [9] "DRUNK_DR" "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [13] "WEATHER1NAME" "LGT_CONDNAME" "BODY_TYPNAME" "RUR_URBNAME"
## [17] "FUNC_SYSNAME" "PBAGENAME" "PBSEXNAME" "PEDCTYPENAME"
## [21] "PEDPOSNAME" "MOTMANNAME" "PEDLEGNAME" "PEDSNRNAME"
## [25] "PEDCGPNAME" "INJ_SEVNAMEDr" "AGENAMEDr" "SEXNAMEDr"
## [29] "DRINKINGNAMEDr" "HAZ_INVNAME_2" "BUS_USENAME_2" "EMER_USENAME_2"
## [33] "TRAV_SPNAME_2" "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2"
## [37] "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"
## [41] "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2"
## [45] "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2" "CDL_STATNAME_2"
## [49] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME"
datatable(
head(dat1, 25),
options = list(scrollX = TRUE, pageLength = 10),
caption = "Selected variables"
)
dat1 <- dat1 %>%
mutate(
age_num = as.numeric(str_extract(PBAGENAME, "\\d+")),
Ped_Age = case_when(
age_num <= 4 ~ "0-4",
age_num <= 9 ~ "5-9",
age_num <= 14 ~ "10-14",
age_num <= 20 ~ "15-20",
age_num <= 24 ~ "21-24",
age_num <= 34 ~ "25-34",
age_num <= 44 ~ "35-44",
age_num <= 54 ~ "45-54",
age_num <= 64 ~ "55-64",
age_num <= 74 ~ "65-74",
age_num >= 75 ~ "75+",
TRUE ~ "Unknown"
),
age_num1 = as.numeric(str_extract(AGENAMEDr, "\\d+")),
Dr_Age = case_when(
age_num1 <= 4 ~ "0-4",
age_num1 <= 9 ~ "5-9",
age_num1 <= 14 ~ "10-14",
age_num1 <= 20 ~ "15-20",
age_num1 <= 24 ~ "21-24",
age_num1 <= 34 ~ "25-34",
age_num1 <= 44 ~ "35-44",
age_num1 <= 54 ~ "45-54",
age_num1 <= 64 ~ "55-64",
age_num1 <= 74 ~ "65-74",
age_num1 >= 75 ~ "75+",
TRUE ~ "Unknown"
)
)
table(dat1$Ped_Age, useNA = "ifany")
##
## 0-4 10-14 15-20 21-24 25-34 35-44 45-54 5-9 55-64 65-74
## 552 667 2581 3077 9797 10098 9599 468 11021 7015
## 75+ Unknown
## 5130 736
table(dat1$Dr_Age, useNA = "ifany")
##
## 0-4 10-14 15-20 21-24 25-34 35-44 45-54 5-9 55-64 65-74
## 8 33 4178 5333 11846 9211 8105 9 7215 3924
## 75+ Unknown
## 1886 8993
dat1 <- dat1 %>%
mutate(
Day_Type = case_when(
DAY_WEEKNAME %in% c("Saturday", "Sunday") ~ "Weekend",
!is.na(DAY_WEEKNAME) ~ "Weekday",
TRUE ~ "Unknown"
)
)
table(dat1$Day_Type, useNA = "ifany")
##
## Weekday Weekend
## 42325 18416
dat2 <- dat1 %>%
mutate(
Veh_Type = case_when(
str_detect(BODY_TYPNAME, regex("sedan|coupe|hatchback|convertible|station wagon|automobile", ignore_case = TRUE)) ~ "Car",
str_detect(BODY_TYPNAME, regex("pickup|utility|light truck|cab chassis based", ignore_case = TRUE)) ~ "Light_Truck",
str_detect(BODY_TYPNAME, regex("truck-tractor|single-unit|heavy truck|medium/heavy|construction equipment|farm equipment|motorhome", ignore_case = TRUE)) ~ "Large_Truck",
str_detect(BODY_TYPNAME, regex("bus", ignore_case = TRUE)) ~ "Bus",
str_detect(BODY_TYPNAME, regex("van|minivan|step-van|walk-in van", ignore_case = TRUE)) ~ "Van",
str_detect(BODY_TYPNAME, regex("motorcycle|moped|scooter|motored cycle", ignore_case = TRUE)) ~ "Motorcycle",
str_detect(BODY_TYPNAME, regex("ATV|ATC|golf cart|off-highway|recreational|three-wheel|go-cart|fork-lift|street sweeper", ignore_case = TRUE)) ~ "Off_Road_Other",
is.na(BODY_TYPNAME) |
BODY_TYPNAME %in% c("NA", "Not Reported", "Unknown body type") |
str_detect(BODY_TYPNAME, regex("unknown|not reported", ignore_case = TRUE)) ~ "Unknown",
TRUE ~ "Off_Road_Other"
)
)
table(dat2$Veh_Type, useNA = "ifany")
##
## Bus Car Large_Truck Light_Truck Motorcycle
## 1205 24241 3394 22579 368
## Off_Road_Other Unknown Van
## 30 7036 1888
names(dat2)
## [1] "STATENAME" "YEAR" "CRASH_NUM1" "WORK_INJNAME"
## [5] "LATITUDENAME" "LONGITUDNAME" "VE_TOTAL" "FATALS"
## [9] "DRUNK_DR" "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [13] "WEATHER1NAME" "LGT_CONDNAME" "BODY_TYPNAME" "RUR_URBNAME"
## [17] "FUNC_SYSNAME" "PBAGENAME" "PBSEXNAME" "PEDCTYPENAME"
## [21] "PEDPOSNAME" "MOTMANNAME" "PEDLEGNAME" "PEDSNRNAME"
## [25] "PEDCGPNAME" "INJ_SEVNAMEDr" "AGENAMEDr" "SEXNAMEDr"
## [29] "DRINKINGNAMEDr" "HAZ_INVNAME_2" "BUS_USENAME_2" "EMER_USENAME_2"
## [33] "TRAV_SPNAME_2" "UNDERIDENAME_2" "ROLLOVERNAME_2" "DEFORMEDNAME_2"
## [37] "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2" "VALIGNNAME_2"
## [41] "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2" "P_CRASH1NAME_2"
## [45] "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2" "CDL_STATNAME_2"
## [49] "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2" "DAY_WEEKNAME"
## [53] "age_num" "Ped_Age" "age_num1" "Dr_Age"
## [57] "Day_Type" "Veh_Type"
dat2 <- dat2[, -c(18, 27, 52, 53, 55, 15)]
dim(dat2)
## [1] 60741 52
names(dat2)
## [1] "STATENAME" "YEAR" "CRASH_NUM1" "WORK_INJNAME"
## [5] "LATITUDENAME" "LONGITUDNAME" "VE_TOTAL" "FATALS"
## [9] "DRUNK_DR" "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [13] "WEATHER1NAME" "LGT_CONDNAME" "RUR_URBNAME" "FUNC_SYSNAME"
## [17] "PBSEXNAME" "PEDCTYPENAME" "PEDPOSNAME" "MOTMANNAME"
## [21] "PEDLEGNAME" "PEDSNRNAME" "PEDCGPNAME" "INJ_SEVNAMEDr"
## [25] "SEXNAMEDr" "DRINKINGNAMEDr" "HAZ_INVNAME_2" "BUS_USENAME_2"
## [29] "EMER_USENAME_2" "TRAV_SPNAME_2" "UNDERIDENAME_2" "ROLLOVERNAME_2"
## [33] "DEFORMEDNAME_2" "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2"
## [37] "VALIGNNAME_2" "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2"
## [41] "P_CRASH1NAME_2" "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"
## [45] "CDL_STATNAME_2" "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2"
## [49] "Ped_Age" "Dr_Age" "Day_Type" "Veh_Type"
dat2 <- dat2 %>%
mutate(
spd_num = as.numeric(str_extract(VSPD_LIMNAME_2, "\\d+")),
PSL = case_when(
spd_num <= 25 ~ "25_or_less",
spd_num <= 35 ~ "30_35",
spd_num <= 45 ~ "40_45",
spd_num <= 55 ~ "50_55",
spd_num >= 60 ~ "60_plus",
TRUE ~ "Unknown"
)
)
table(dat2$PSL, useNA = "ifany")
##
## 25_or_less 30_35 40_45 50_55 60_plus Unknown
## 5427 14474 18063 9752 8789 4236
dat3 <- dat2[, -c(53)]
dim(dat3)
## [1] 60741 53
dat3 <- dat3 %>%
mutate(
across(
where(~ is.character(.) | is.factor(.)),
~ case_when(
is.na(.) ~ "Not Reported",
str_trim(as.character(.)) %in% c(
"NA", "N/A", "Unknown", "Reported as Unknown",
" Reported as Unknown",
"Reported as Unknown if Distracted ",
"Reported as Unknown if Impaired",
"Unknown body type", "Not Reported"
) ~ "Not Reported",
str_detect(as.character(.), regex("^unknown|not reported$", ignore_case = TRUE)) ~ "Not Reported",
TRUE ~ as.character(.)
)
)
)
cat_summary <- dat3 %>%
summarise(across(
where(~ is.character(.) | is.factor(.)),
~ n_distinct(., na.rm = TRUE)
)) %>%
pivot_longer(
everything(),
names_to = "variable",
values_to = "n_categories"
) %>%
arrange(desc(n_categories))
cat_summary
## # A tibble: 49 × 2
## variable n_categories
## <chr> <int>
## 1 CRASH_NUM1 59912
## 2 LONGITUDNAME 59162
## 3 LATITUDENAME 58976
## 4 PEDSNRNAME 179
## 5 TRAV_SPNAME_2 123
## 6 ACC_TYPENAME_2 76
## 7 PEDCTYPENAME 60
## 8 STATENAME 51
## 9 MDRDSTRDNAME_2 27
## 10 PEDCGPNAME 21
## # ℹ 39 more rows
datatable(
cat_summary,
options = list(pageLength = 25),
caption = "Number of categories after lumping"
)
#### PEDCTYPENAME (From 60 to 12 Categories)
dat3 <- dat3 %>%
mutate(
ped_type_clean = PEDCTYPENAME %>%
str_replace_all("–|–", "-") %>%
str_squish(),
PedCTyp = case_when(
str_detect(ped_type_clean, regex("Pedestrian Failed|Dart|Dash", ignore_case = TRUE)) ~
"Pedestrian Crossing / Dart-Out",
str_detect(ped_type_clean, regex("Crossing an Expressway|Waiting to Cross", ignore_case = TRUE)) ~
"Expressway / Waiting to Cross",
str_detect(ped_type_clean, regex("Intersection", ignore_case = TRUE)) ~
"Intersection Related",
str_detect(ped_type_clean, regex("Walking/Running Along Roadway", ignore_case = TRUE)) ~
"Walking / Running Along Roadway",
str_detect(ped_type_clean, regex("Walking in Roadway|Lying|Standing|Playing|Working|Trapped", ignore_case = TRUE)) ~
"Pedestrian in Roadway",
str_detect(ped_type_clean, regex("Backing", ignore_case = TRUE)) ~
"Backing Vehicle",
str_detect(ped_type_clean, regex("Left Turn|Right Turn|Turn / Merge|Right Turn on Red", ignore_case = TRUE)) ~
"Turning / Merging Vehicle",
str_detect(ped_type_clean, regex("Driveway|Alley", ignore_case = TRUE)) ~
"Driveway / Alley Conflict",
str_detect(ped_type_clean, regex("Motorist Failed to Yield", ignore_case = TRUE)) ~
"Motorist Failed to Yield",
str_detect(ped_type_clean, regex("Disabled Vehicle|Driverless Vehicle|Loss of Control|Vehicle-Vehicle|Vehicle into|Object", ignore_case = TRUE)) ~
"Vehicle / Object / Control Related",
str_detect(ped_type_clean, regex("Bus|Transit|School Bus|Emergency|Vendor|Mailbox|Ice Cream", ignore_case = TRUE)) ~
"Transit / Service Vehicle Related",
TRUE ~
"Other / Unknown"
)
)
dat3 %>%
count(PedCTyp, sort = TRUE) %>%
mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 12 × 3
## PedCTyp n percent
## <chr> <int> <dbl>
## 1 Pedestrian Crossing / Dart-Out 23756 39.1
## 2 Walking / Running Along Roadway 8272 13.6
## 3 Intersection Related 7149 11.8
## 4 Vehicle / Object / Control Related 5369 8.84
## 5 Pedestrian in Roadway 4869 8.02
## 6 Expressway / Waiting to Cross 3307 5.44
## 7 Turning / Merging Vehicle 2602 4.28
## 8 Motorist Failed to Yield 2549 4.2
## 9 Other / Unknown 1598 2.63
## 10 Transit / Service Vehicle Related 516 0.85
## 11 Backing Vehicle 477 0.79
## 12 Driveway / Alley Conflict 277 0.46
#### PEDCTYPENAME (From 21 to 8 Categories)
dat3 <- dat3 %>%
mutate(
ped_cgp_clean = PEDCGPNAME %>%
str_replace_all("–|–|\\?", "-") %>%
str_squish(),
PedCgp = case_when(
str_detect(ped_cgp_clean, regex("Crossing Roadway|Crossing Expressway|Waiting to Cross", ignore_case = TRUE)) ~
"Crossing Roadway / Expressway",
str_detect(ped_cgp_clean, regex("Dash|Dart-Out", ignore_case = TRUE)) ~
"Dash / Dart-Out",
str_detect(ped_cgp_clean, regex("Walking/Running Along Roadway", ignore_case = TRUE)) ~
"Walking / Running Along Roadway",
str_detect(ped_cgp_clean, regex("Pedestrian in Roadway|Working or Playing", ignore_case = TRUE)) ~
"Pedestrian in Roadway",
str_detect(ped_cgp_clean, regex("Backing Vehicle|Driveway", ignore_case = TRUE)) ~
"Backing / Driveway Related",
str_detect(ped_cgp_clean, regex("Bus|Bus Stop", ignore_case = TRUE)) ~
"Bus / Bus Stop Related",
str_detect(ped_cgp_clean, regex("Non-Trafficway|Unique Midblock|Multiple Threat|Trapped", ignore_case = TRUE)) ~
"Non-Trafficway / Midblock / Multiple Threat",
TRUE ~
"Other / Unknown / Unusual"
)
)
dat3 %>%
count(PedCgp, sort = TRUE) %>%
mutate(
percent = round(100 * n / sum(n), 2)
)
## # A tibble: 8 × 3
## PedCgp n percent
## <chr> <int> <dbl>
## 1 Crossing Roadway / Expressway 28226 46.5
## 2 Other / Unknown / Unusual 13343 22.0
## 3 Walking / Running Along Roadway 8272 13.6
## 4 Pedestrian in Roadway 4833 7.96
## 5 Dash / Dart-Out 3988 6.57
## 6 Non-Trafficway / Midblock / Multiple Threat 1069 1.76
## 7 Backing / Driveway Related 754 1.24
## 8 Bus / Bus Stop Related 256 0.42
library(dplyr)
library(stringr)
dat3 <- dat3 %>%
mutate(
acc_type_clean = ACC_TYPENAME_2 %>%
str_replace_all("–|–", "-") %>%
str_remove("^[A-Z][0-9]+-") %>% # removes A1-, B6-, C13-, M99-, etc.
str_squish(),
AccTyp = case_when(
# 1-4. Roadside departure
str_detect(acc_type_clean, regex("Right Roadside Departure-Drive Off Road", ignore_case = TRUE)) ~
"Right Roadside Departure - Drive Off Road",
str_detect(acc_type_clean, regex("Right Roadside Departure-Control/Traction Loss", ignore_case = TRUE)) ~
"Right Roadside Departure - Control Loss",
str_detect(acc_type_clean, regex("Left Roadside Departure-Drive Off Road", ignore_case = TRUE)) ~
"Left Roadside Departure - Drive Off Road",
str_detect(acc_type_clean, regex("Left Roadside Departure-Control/Traction Loss", ignore_case = TRUE)) ~
"Left Roadside Departure - Control Loss",
# 5. Other roadside departure
str_detect(acc_type_clean, regex("Roadside Departure", ignore_case = TRUE)) ~
"Other Roadside Departure",
# 6-8. Forward impact
str_detect(acc_type_clean, regex("Forward Impact-Pedestrian|Pedestrian/ Animal", ignore_case = TRUE)) ~
"Forward Impact - Pedestrian/Animal",
str_detect(acc_type_clean, regex("Forward Impact-Parked Veh", ignore_case = TRUE)) ~
"Forward Impact - Parked Vehicle",
str_detect(acc_type_clean, regex("Forward Impact-Sta|Sta\\. Object|End Departure", ignore_case = TRUE)) ~
"Forward Impact - Object/End Departure",
# 9-11. Rear-end
str_detect(acc_type_clean, regex("Rear End-Stopped", ignore_case = TRUE)) ~
"Rear-End - Stopped Vehicle",
str_detect(acc_type_clean, regex("Rear End-Slower", ignore_case = TRUE)) ~
"Rear-End - Slower Vehicle",
str_detect(acc_type_clean, regex("Rear End-Decelerating|Rear End-Specifics", ignore_case = TRUE)) ~
"Rear-End - Decelerating/Other",
# 12-13. Same-direction sideswipe/angle
str_detect(acc_type_clean, regex("Same Direction-Angle, Sideswipe.*Straight", ignore_case = TRUE)) ~
"Same-Direction Sideswipe - Straight",
str_detect(acc_type_clean, regex("Same Direction-Angle, Sideswipe.*Changing Lanes|Same Direction-Angle, Sideswipe-Specifics", ignore_case = TRUE)) ~
"Same-Direction Sideswipe - Lane Change/Other",
# 14-15. Opposite-direction conflict
str_detect(acc_type_clean, regex("Opposite Direction-Head-On", ignore_case = TRUE)) ~
"Opposite-Direction Head-On",
str_detect(acc_type_clean, regex("Opposite Direction-Angle|Opposite Direction-Forward Impact", ignore_case = TRUE)) ~
"Opposite-Direction Angle/Sideswipe",
# 16-17. Turning conflicts
str_detect(acc_type_clean, regex("Turn Across Path", ignore_case = TRUE)) ~
"Turning Conflict - Turn Across Path",
str_detect(acc_type_clean, regex("Turn Into Path", ignore_case = TRUE)) ~
"Turning Conflict - Turn Into Path",
# 18. Intersecting paths
str_detect(acc_type_clean, regex("Intersecting Paths", ignore_case = TRUE)) ~
"Intersecting-Path Conflict",
# 19. Backing
str_detect(acc_type_clean, regex("Backing", ignore_case = TRUE)) ~
"Backing",
# 20. Other / unknown
TRUE ~
"Other / Unknown / No Impact"
)
)
dat3 %>%
count(AccTyp, sort = TRUE) %>%
mutate(percent = round(100 * n / sum(n), 2))
## # A tibble: 20 × 3
## AccTyp n percent
## <chr> <int> <dbl>
## 1 Forward Impact - Pedestrian/Animal 47624 78.4
## 2 Other / Unknown / No Impact 9211 15.2
## 3 Right Roadside Departure - Drive Off Road 1004 1.65
## 4 Left Roadside Departure - Drive Off Road 407 0.67
## 5 Backing 384 0.63
## 6 Forward Impact - Parked Vehicle 356 0.59
## 7 Rear-End - Decelerating/Other 298 0.49
## 8 Same-Direction Sideswipe - Lane Change/Other 276 0.45
## 9 Other Roadside Departure 181 0.3
## 10 Right Roadside Departure - Control Loss 170 0.28
## 11 Intersecting-Path Conflict 162 0.27
## 12 Rear-End - Stopped Vehicle 131 0.22
## 13 Turning Conflict - Turn Across Path 103 0.17
## 14 Left Roadside Departure - Control Loss 99 0.16
## 15 Forward Impact - Object/End Departure 86 0.14
## 16 Same-Direction Sideswipe - Straight 58 0.1
## 17 Rear-End - Slower Vehicle 57 0.09
## 18 Turning Conflict - Turn Into Path 50 0.08
## 19 Opposite-Direction Angle/Sideswipe 46 0.08
## 20 Opposite-Direction Head-On 38 0.06
names(dat3)
## [1] "STATENAME" "YEAR" "CRASH_NUM1" "WORK_INJNAME"
## [5] "LATITUDENAME" "LONGITUDNAME" "VE_TOTAL" "FATALS"
## [9] "DRUNK_DR" "HIT_RUNNAME_2" "WRK_ZONENAME" "REL_ROADNAME"
## [13] "WEATHER1NAME" "LGT_CONDNAME" "RUR_URBNAME" "FUNC_SYSNAME"
## [17] "PBSEXNAME" "PEDCTYPENAME" "PEDPOSNAME" "MOTMANNAME"
## [21] "PEDLEGNAME" "PEDSNRNAME" "PEDCGPNAME" "INJ_SEVNAMEDr"
## [25] "SEXNAMEDr" "DRINKINGNAMEDr" "HAZ_INVNAME_2" "BUS_USENAME_2"
## [29] "EMER_USENAME_2" "TRAV_SPNAME_2" "UNDERIDENAME_2" "ROLLOVERNAME_2"
## [33] "DEFORMEDNAME_2" "VTRAFWAYNAME_2" "VNUM_LANNAME_2" "VSPD_LIMNAME_2"
## [37] "VALIGNNAME_2" "VPROFILENAME_2" "VPAVETYPNAME_2" "VSURCONDNAME_2"
## [41] "P_CRASH1NAME_2" "ACC_TYPENAME_2" "L_STATUSNAME_2" "L_TYPENAME_2"
## [45] "CDL_STATNAME_2" "MDRDSTRDNAME_2" "DRIMPAIRNAME_2" "MDRMANAVNAME_2"
## [49] "Ped_Age" "Dr_Age" "Day_Type" "Veh_Type"
## [53] "PSL" "ped_type_clean" "PedCTyp" "ped_cgp_clean"
## [57] "PedCgp" "acc_type_clean" "AccTyp"
dat3 <- dat3[, -c(58, 54, 56)]
dim(dat3)
## [1] 60741 56
keep_cols <- c("CRASH_NUM1", "Ped_Age", "Dr_Age", "Veh_Type", "LATITUDENAME", "LONGITUDNAME", "STATENAME", "LGT_CONDNAME", "PedCTyp", "PedCgp","AccTyp")
dat3_lumped <- dat3 %>%
mutate(
across(
.cols = setdiff(
names(select(., where(~ is.character(.) | is.factor(.)))),
keep_cols
),
.fns = ~ if (n_distinct(., na.rm = TRUE) > 5) {
fct_lump_n(as.factor(.), n = 5, other_level = "Other")
} else {
as.factor(.)
}
)
)
dim(dat3_lumped)
## [1] 60741 56
category_summary <- data.frame(
Variable = names(dat3_lumped),
Categories = sapply(dat3_lumped, function(x) n_distinct(x, na.rm = FALSE))
) %>%
arrange(desc(Categories))
datatable(
category_summary,
options = list(pageLength = 25),
caption = "Number of categories after lumping"
)
freq_tables <- lapply(
dat3_lumped %>% select(-all_of(keep_cols)),
table,
useNA = "ifany"
)
freq_tables
## $YEAR
##
## 2016 2017 2018 2019 2020 2021 2022 2023 2024
## 6080 6075 6374 6272 6565 7388 7593 7314 7080
##
## $WORK_INJNAME
##
## No Not Reported Redacted Yes
## 53105 6460 133 1043
##
## $VE_TOTAL
##
## 1 2 3 4 5 6 7 8 9 10 11 12 14
## 52229 6160 1645 452 155 48 23 16 4 2 1 3 1
## 130
## 2
##
## $FATALS
##
## 1 2 3 4 5 6 8 20
## 58940 1531 188 57 6 9 8 2
##
## $DRUNK_DR
##
## 0 1 2 <NA>
## 29063 2289 14 29375
##
## $HIT_RUNNAME_2
##
## No Not Reported Yes
## 46422 988 13331
##
## $WRK_ZONENAME
##
## Construction Maintenance None
## 705 106 59455
## Utility Work Zone, Type Unknown
## 33 442
##
## $REL_ROADNAME
##
## Not Reported On Roadside On Roadway On Shoulder
## 295 1928 55468 1956
## Outside Trafficway Other
## 320 774
##
## $WEATHER1NAME
##
## Clear Cloudy Fog, Smog, Smoke Not Reported
## 17338 3484 274 37373
## Rain Other
## 2044 228
##
## $RUR_URBNAME
##
## Not Reported Rural
## 163 10237
## Trafficway Not in State Inventory Urban
## 326 50015
##
## $FUNC_SYSNAME
##
## Interstate Local
## 7231 6119
## Major Collector Minor Arterial
## 5856 14137
## Principal Arterial - Other Other
## 17820 9578
##
## $PBSEXNAME
##
## Female Male Not Reported
## 17933 42482 326
##
## $PEDCTYPENAME
##
## Crossing an Expressway
## 3168
## Disabled Vehicle-Related
## 2693
## Not At Intersection - Other / Unknown
## 5356
## Pedestrian Failed to Yield
## 19768
## Walking/Running Along Roadway With Traffic - From Behind
## 4768
## Other
## 24988
##
## $PEDPOSNAME
##
## Crosswalk Area
## 7700
## Intersection Area
## 2518
## Other/Unknown
## 1802
## Paved Shoulder / Bicycle Lane / Parking Lane
## 2543
## Travel Lane
## 43644
## Other
## 2534
##
## $MOTMANNAME
##
## Left Turn Not Applicable Not Reported Right Turn
## 1983 45795 359 798
## Straight through
## 11806
##
## $PEDLEGNAME
##
## Farside Nearside Not Applicable Not Reported
## 7608 5600 45836 1697
##
## $PEDSNRNAME
##
## Motorist traveling straight through - Crash Occurred on Far Side of Intersection / Pedestrian outside crosswalk area, other
## 456
## Motorist traveling straight through - Crash Occurred on Far Side of Intersection / Pedestrian within crosswalk area, traveled from motorist’s left
## 365
## Motorist traveling straight through - Crash Occurred on Near (Approach) Side of Intersection / Pedestrian within crosswalk area, traveled from motorist’s left
## 364
## Not Applicable
## 46065
## Not Reported
## 1555
## Other
## 11936
##
## $PEDCGPNAME
##
## Crossing Roadway - Vehicle Not Turning
## 22317
## Other / Unknown - Insufficient Details
## 7457
## Pedestrian in Roadway - Circumstances Unknown
## 4374
## Unusual Circumstances
## 5886
## Walking/Running Along Roadway
## 8272
## Other
## 12435
##
## $INJ_SEVNAMEDr
##
## No Apparent Injury (O) Not Reported
## 48413 6826
## Possible Injury (C) Suspected Minor Injury (B)
## 2473 1943
## Suspected Serious Injury (A) Other
## 607 479
##
## $SEXNAMEDr
##
## Female Male Not Reported
## 14830 37170 8741
##
## $DRINKINGNAMEDr
##
## No (Alcohol Not Involved) Not Reported Yes (Alcohol Involved)
## 35594 21161 3986
##
## $HAZ_INVNAME_2
##
## No Not Reported Yes
## 59732 936 73
##
## $BUS_USENAME_2
##
## Charter/Tour Not a Bus Not Reported School
## 39 54506 5804 104
## Transit/ Commuter Other
## 241 47
##
## $EMER_USENAME_2
##
## Emergency Operation, Emergency Warning Equipment in Use
## 57
## Emergency Operation, Emergency Warning Equipment in Use Unknown
## 43
## Non-Emergency, Non-Transport
## 53
## Not Applicable
## 58212
## Not Reported
## 2351
## Other
## 25
##
## $TRAV_SPNAME_2
##
## 035 MPH 040 MPH 045 MPH 055 MPH Not Reported Other
## 2333 2527 3414 1903 38258 12306
##
## $UNDERIDENAME_2
##
## No Underride or Override Noted
## 30840
## Not Reported
## 29877
## Overriding a Motor Vehicle In-Transport
## 4
## Overriding a Motor Vehicle Not In-Transport
## 7
## Underriding a Motor Vehicle In-Transport, Underride, Compartment Intrusion Unknown
## 3
## Underriding a Motor Vehicle Not In-Transport, Underride, Compartment Intrusion Unknown
## 3
## Other
## 7
##
## $ROLLOVERNAME_2
##
## No Rollover Not Applicable
## 59261 125
## Not Reported Rollover
## 936 155
## Rollover, Tripped by Object/Vehicle Other
## 227 37
##
## $DEFORMEDNAME_2
##
## Damage Reported, Extent Unknown Disabling Damage
## 4028 13269
## Functional Damage Minor Damage
## 15432 12081
## Not Reported Other
## 12483 3448
##
## $VTRAFWAYNAME_2
##
## Not Reported
## 1599
## Two-Way, Divided, Positive Median Barrier
## 9525
## Two-Way, Divided, Unprotected Median
## 13168
## Two-Way, Not Divided
## 24466
## Two-Way, Not Divided With a Continuous Left-Turn Lane
## 7725
## Other
## 4258
##
## $VNUM_LANNAME_2
##
## Five lanes Four lanes Not Reported Three lanes Two lanes Other
## 8514 9016 2029 10293 26179 4710
##
## $VSPD_LIMNAME_2
##
## 25 MPH 35 MPH 40 MPH 45 MPH 55 MPH Other
## 4874 9993 7268 10795 7212 20599
##
## $VALIGNNAME_2
##
## Curve - Left Curve Left Curve Right Not Reported Straight Other
## 708 927 944 2682 53922 1558
##
## $VPROFILENAME_2
##
## Downhill Grade, Unknown Slope Level
## 1837 3695 44259
## Not Reported Uphill Other
## 7903 1580 1467
##
## $VPAVETYPNAME_2
##
## Blacktop, Bituminous, or Asphalt Concrete
## 36107 4603
## Non-Trafficway or Driveway Access Not Reported
## 457 19416
## Slag, Gravel or Stone Other
## 69 89
##
## $VSURCONDNAME_2
##
## Dry Ice/Frost
## 50381 236
## Non-Trafficway or Driveway Access Not Reported
## 457 1828
## Wet Other
## 7490 349
##
## $P_CRASH1NAME_2
##
## Going Straight Negotiating a Curve Not Reported Turning Left
## 48441 3146 2633 2355
## Turning Right Other
## 982 3184
##
## $ACC_TYPENAME_2
##
## A1-Single Driver-Right Roadside Departure-Drive Off Road
## 1004
## B6-Single Driver-Left Roadside Departure-Drive Off Road
## 407
## C13-Single Driver-Forward Impact-Pedestrian/ Animal
## 47624
## M98-Other Crash Type
## 884
## Not Reported
## 8136
## Other
## 2686
##
## $L_STATUSNAME_2
##
## Expired Not licensed Not Reported Suspended Valid Other
## 636 2501 9226 2399 45437 542
##
## $L_TYPENAME_2
##
## Full Driver License Intermediate Driver License
## 47828 721
## Learner's Permit Not Licensed
## 213 2501
## Not Reported Other
## 9218 260
##
## $CDL_STATNAME_2
##
## Disqualified No (CDL) Not Reported Suspended Valid Other
## 149 45875 9248 131 5014 324
##
## $MDRDSTRDNAME_2
##
## Distraction/Inattention
## 276
## Inattention (Inattentive), Details Unknown
## 533
## Not Distracted
## 11548
## Not Reported
## 45743
## Reported as Unknown if Distracted
## 1372
## Other
## 1269
##
## $DRIMPAIRNAME_2
##
## Asleep or Fatigued
## 305
## Ill, Blackout
## 253
## None/Apparently Normal
## 27034
## Not Reported
## 28584
## Under the Influence of Alcohol, Drugs or Medication
## 3971
## Other
## 594
##
## $MDRMANAVNAME_2
##
## Driver Did Not Maneuver to Avoid
## 7082
## Motor Vehicle
## 160
## Not Reported
## 50283
## Pedestrian, Pedalcyclist or Other Non-Motorist
## 2952
## Phantom/Non-Contact Motor Vehicle
## 179
## Other
## 85
##
## $Day_Type
##
## Weekday Weekend
## 42325 18416
##
## $PSL
##
## 25_or_less 30_35 40_45 50_55 60_plus Other
## 5427 14474 18063 9752 8789 4236
keep_names <- c("CRASH_NUM1")
old_names <- names(dat3_lumped)
new_names <- ifelse(
old_names %in% keep_names,
old_names,
str_to_title(str_sub(str_replace_all(old_names, "_", ""), 1, 4))
)
new_names <- make.unique(new_names, sep = "")
name_key <- data.frame(
old_name = old_names,
new_name = new_names
)
names(dat3_lumped) <- new_names
names(dat3_lumped)
## [1] "Stat" "Year" "CRASH_NUM1" "Work" "Lati"
## [6] "Long" "Veto" "Fata" "Drun" "Hitr"
## [11] "Wrkz" "Relr" "Weat" "Lgtc" "Ruru"
## [16] "Func" "Pbse" "Pedc" "Pedp" "Motm"
## [21] "Pedl" "Peds" "Pedc1" "Injs" "Sexn"
## [26] "Drin" "Hazi" "Busu" "Emer" "Trav"
## [31] "Unde" "Roll" "Defo" "Vtra" "Vnum"
## [36] "Vspd" "Vali" "Vpro" "Vpav" "Vsur"
## [41] "Pcra" "Acct" "Lsta" "Ltyp" "Cdls"
## [46] "Mdrd" "Drim" "Mdrm" "Peda" "Drag"
## [51] "Dayt" "Veht" "Psl" "Pedc2" "Pedc3"
## [56] "Acct1"
datatable(
name_key,
options = list(pageLength = 25, scrollX = TRUE),
caption = "Variable name key"
)
datatable(
head(dat3_lumped, 50),
options = list(scrollX = TRUE, pageLength = 10),
caption = "Final processed dataset"
)
write.csv(dat3_lumped, "Ped01.csv", row.names = FALSE)
write_xlsx(dat3_lumped, "Ped01.xlsx")
write.csv(name_key, "Ped01_variable_name_key.csv", row.names = FALSE)
write_xlsx(name_key, "Ped01_variable_name_key.xlsx")
write.csv(category_summary, "Ped01_category_summary.csv", row.names = FALSE)
write_xlsx(category_summary, "Ped01_category_summary.xlsx")
The following files are created in the working directory:
Ped01.csvPed01.xlsxPed01_variable_name_key.csvPed01_variable_name_key.xlsxPed01_category_summary.csvPed01_category_summary.xlsx