# Variable Selection
## [1] "Stat" "Year" "CRASH_NUM1" "Work" "Lati"
## [6] "Long" "Veto" "Fata" "Drun" "Hitr"
## [11] "Wrkz" "Relr" "Weat" "Lgtc" "Ruru"
## [16] "Func" "Pbse" "Bike" "Motm" "Bike1"
## [21] "Injs" "Sexn" "Drin" "Hazi" "Busu"
## [26] "Emer" "Trav" "Unde" "Roll" "Defo"
## [31] "Vtra" "Vnum" "Vali" "Vpro" "Vpav"
## [36] "Vsur" "Pcra" "Lsta" "Ltyp" "Cdls"
## [41] "Mdrd" "Drim" "Mdrm" "Peda" "Drag"
## [46] "Dayt" "Veht" "Psl" "Bike2" "Bike3"
## [51] "Acct"
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
library(purrr)
library(forcats)
library(compareGroups)
library(randomForest)
library(xgboost)
library(patchwork)
# ---- Analysis data set ----
# Keep only crashes whose hit-and-run flag is an explicit "Yes" or "No";
# rows with any other value (including NA) are dropped.
dat1 <- dat[dat$Hitr %in% c("No", "Yes"), ]
# "No" is the first level so it acts as the reference outcome category.
dat1[["Hitr"]] <- factor(dat1[["Hitr"]], levels = c("No", "Yes"))
# Positional selection, per the column listing printed above:
#   10    -> Hitr (outcome)
#   13:20 -> Weat ... Bike1
#   22:51 -> Sexn ... Acct
# NOTE(review): positions are tied to the current column order of `dat`.
dat2 <- dat1[, c(10, 13:20, 22:51)]
dim(dat2) ## [1] 8679 39
# ---- Bivariate screening table ----
# Describe every column of dat2 by Hitr. max.xlev / max.ylev are raised to 30
# so factors with many levels are still tabulated.
res1 <- compareGroups(
  formula = Hitr ~ .,
  data = dat2,
  max.xlev = 30,
  max.ylev = 30,
  ref = 1
)
# show.ratio = TRUE adds the OR and p.ratio columns to the printed table.
mm <- createTable(x = res1, show.ratio = TRUE)
mm
##
## --------Summary descriptives table by 'Hitr'---------
##
## ________________________________________________________________________________________________________________________________
## No Yes OR p.ratio p.overall
## N=6875 N=1804
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Weat: .
## Clear 2099 (30.5%) 509 (28.2%) Ref. Ref.
## Cloudy 339 (4.93%) 77 (4.27%) 0.94 [0.71;1.22] 0.636
## Fog, Smog, Smoke 12 (0.17%) 1 (0.06%) 0.39 [0.02;1.99] 0.307
## Not Reported 4300 (62.5%) 1189 (65.9%) 1.14 [1.02;1.28] 0.026
## Other 7 (0.10%) 1 (0.06%) 0.66 [0.03;3.82] 0.694
## Rain 118 (1.72%) 27 (1.50%) 0.95 [0.60;1.43] 0.806
## Lgtc: .
## Dark - Lighted 1512 (22.0%) 693 (38.4%) Ref. Ref.
## Dark - Not Lighted 1400 (20.4%) 567 (31.4%) 0.88 [0.77;1.01] 0.068
## Dark - Unknown Lighting 62 (0.90%) 33 (1.83%) 1.16 [0.75;1.78] 0.496
## Dawn 128 (1.86%) 25 (1.39%) 0.43 [0.27;0.65] <0.001
## Daylight 3575 (52.0%) 414 (22.9%) 0.25 [0.22;0.29] 0.000
## Dusk 176 (2.56%) 43 (2.38%) 0.53 [0.37;0.75] <0.001
## Not Reported 21 (0.31%) 27 (1.50%) 2.80 [1.57;5.05] 0.001
## Other 1 (0.01%) 2 (0.11%) 4.10 [0.33;129] 0.266
## Ruru: <0.001
## Not Reported 29 (0.42%) 4 (0.22%) Ref. Ref.
## Rural 1430 (20.8%) 267 (14.8%) . [.;.] .
## Trafficway Not in State Inventory 12 (0.17%) 0 (0.00%) . [.;.] .
## Urban 5404 (78.6%) 1533 (85.0%) . [.;.] .
## Func: .
## Interstate 106 (1.54%) 38 (2.11%) Ref. Ref.
## Local 890 (12.9%) 277 (15.4%) . [.;.] .
## Major Collector 1080 (15.7%) 266 (14.7%) . [.;.] .
## Minor Arterial 1854 (27.0%) 515 (28.5%) . [.;.] .
## Minor Collector 208 (3.03%) 62 (3.44%) . [.;.] .
## Not Reported 23 (0.33%) 5 (0.28%) . [.;.] .
## Other Freeways and Expressways 26 (0.38%) 7 (0.39%) . [.;.] .
## Other Principal Arterial 662 (9.63%) 154 (8.54%) . [.;.] .
## Principal Arterial - Other 1930 (28.1%) 466 (25.8%) . [.;.] .
## Principal Arterial - Other Freeways and Expressways 84 (1.22%) 14 (0.78%) . [.;.] .
## Trafficway Not in State Inventory 12 (0.17%) 0 (0.00%) . [.;.] .
## Pbse: 0.058
## Female 908 (13.2%) 204 (11.3%) Ref. Ref.
## Male 5913 (86.0%) 1581 (87.6%) 1.19 [1.01;1.40] 0.033
## Not Reported 54 (0.79%) 19 (1.05%) 1.57 [0.89;2.67] 0.116
## Bike: .
## Bicycle Lane / Paved Shoulder / Parking Lane 438 (6.37%) 239 (13.2%) Ref. Ref.
## Non-Trafficway - Driveway 1 (0.01%) 0 (0.00%) . [.;.] .
## Non-Trafficway - Parking Lot/Other 5 (0.07%) 0 (0.00%) . [.;.] .
## Not Reported 63 (0.92%) 65 (3.60%) . [.;.] .
## Other 38 (0.55%) 16 (0.89%) . [.;.] .
## Shared-Use Path 3 (0.04%) 0 (0.00%) . [.;.] .
## Sidewalk / Crosswalk / Driveway Access 726 (10.6%) 120 (6.65%) . [.;.] .
## Sidewalk / Crosswalk / Driveway Access 116 (1.69%) 12 (0.67%) . [.;.] .
## Travel Lane 5485 (79.8%) 1352 (74.9%) . [.;.] .
## Motm: Not a Pedestrian 6875 (100%) 1804 (100%) Ref. Ref. .
## Bike1: .
## At Intersection 2195 (31.9%) 324 (18.0%) Ref. Ref.
## Intersection-Related 563 (8.19%) 143 (7.93%) . [.;.] .
## Non-Trafficway Location 8 (0.12%) 0 (0.00%) . [.;.] .
## Not At Intersection 4088 (59.5%) 1316 (72.9%) . [.;.] .
## Not Reported 21 (0.31%) 21 (1.16%) . [.;.] .
## Sexn: 0.000
## Female 2023 (29.4%) 191 (10.6%) Ref. Ref.
## Male 4828 (70.2%) 768 (42.6%) 1.68 [1.43;1.99] <0.001
## Not Reported 24 (0.35%) 845 (46.8%) 369 [245;586] 0.000
## Drin: 0.000
## No (Alcohol Not Involved) 5041 (73.3%) 406 (22.5%) Ref. Ref.
## Not Reported 1419 (20.6%) 1151 (63.8%) 10.1 [8.87;11.4] 0.000
## Yes (Alcohol Involved) 415 (6.04%) 247 (13.7%) 7.39 [6.12;8.91] 0.000
## Hazi: 0.748
## No 6863 (99.8%) 1802 (99.9%) Ref. Ref.
## Yes 12 (0.17%) 2 (0.11%) 0.68 [0.10;2.51] 0.598
## Busu: .
## Charter/Tour 7 (0.10%) 1 (0.06%) Ref. Ref.
## Not a Bus 6779 (98.6%) 1296 (71.8%) . [.;.] .
## Not Reported 20 (0.29%) 506 (28.0%) . [.;.] .
## Other 7 (0.10%) 0 (0.00%) . [.;.] .
## School 26 (0.38%) 0 (0.00%) . [.;.] .
## Transit/ Commuter 36 (0.52%) 1 (0.06%) . [.;.] .
## Emer: <0.001
## Emergency Operation, Emergency Warning Equipment in Use 5 (0.07%) 0 (0.00%) Ref. Ref.
## Emergency Operation, Emergency Warning Equipment in Use Unknown 9 (0.13%) 0 (0.00%) . [.;.] .
## Non-Emergency, Non-Transport 6 (0.09%) 0 (0.00%) . [.;.] .
## Not Applicable 6837 (99.4%) 1671 (92.6%) . [.;.] .
## Not Reported 11 (0.16%) 133 (7.37%) . [.;.] .
## Other 7 (0.10%) 0 (0.00%) . [.;.] .
## Trav: <0.001
## 035 MPH 337 (4.90%) 52 (2.88%) Ref. Ref.
## 040 MPH 360 (5.24%) 52 (2.88%) 0.94 [0.62;1.42] 0.754
## 045 MPH 497 (7.23%) 120 (6.65%) 1.56 [1.10;2.24] 0.012
## 055 MPH 306 (4.45%) 53 (2.94%) 1.12 [0.74;1.70] 0.585
## Not Reported 3689 (53.7%) 1309 (72.6%) 2.29 [1.72;3.13] <0.001
## Other 1686 (24.5%) 218 (12.1%) 0.84 [0.61;1.17] 0.287
## Unde: 0.002
## No Underride or Override Noted 3491 (50.8%) 839 (46.5%) Ref. Ref.
## Not Reported 3383 (49.2%) 965 (53.5%) . [.;.] .
## Overriding a Motor Vehicle Not In-Transport 1 (0.01%) 0 (0.00%) . [.;.] .
## Roll: 0.003
## No Rollover 6810 (99.1%) 1799 (99.7%) Ref. Ref.
## Not Applicable 30 (0.44%) 1 (0.06%) 0.14 [0.01;0.66] 0.007
## Other 1 (0.01%) 2 (0.11%) 7.11 [0.58;223] 0.122
## Rollover 15 (0.22%) 1 (0.06%) 0.29 [0.01;1.41] 0.147
## Rollover, Tripped by Object/Vehicle 19 (0.28%) 1 (0.06%) 0.23 [0.01;1.09] 0.067
## Defo: <0.001
## Damage Reported, Extent Unknown 479 (6.97%) 101 (5.60%) Ref. Ref.
## Disabling Damage 1572 (22.9%) 151 (8.37%) 0.46 [0.35;0.60] <0.001
## Functional Damage 1945 (28.3%) 449 (24.9%) 1.09 [0.87;1.39] 0.458
## Minor Damage 1799 (26.2%) 241 (13.4%) 0.63 [0.49;0.82] 0.001
## Not Reported 614 (8.93%) 813 (45.1%) 6.27 [4.95;7.99] 0.000
## Other 466 (6.78%) 49 (2.72%) 0.50 [0.34;0.72] <0.001
## Vtra: 0.020
## One-Way Trafficway 190 (2.76%) 75 (4.16%) Ref. Ref.
## Other 296 (4.31%) 73 (4.05%) 0.63 [0.43;0.91] 0.013
## Two-Way, Divided, Positive Median Barrier 393 (5.72%) 93 (5.16%) 0.60 [0.42;0.85] 0.005
## Two-Way, Divided, Unprotected Median 1581 (23.0%) 383 (21.2%) 0.61 [0.46;0.82] 0.001
## Two-Way, Not Divided 3667 (53.3%) 966 (53.5%) 0.67 [0.51;0.88] 0.005
## Two-Way, Not Divided With a Continuous Left-Turn Lane 748 (10.9%) 214 (11.9%) 0.72 [0.53;0.99] 0.043
## Vnum: 0.207
## Five lanes 799 (11.6%) 209 (11.6%) Ref. Ref.
## Four lanes 863 (12.6%) 238 (13.2%) 1.05 [0.86;1.30] 0.621
## Other 387 (5.63%) 118 (6.54%) 1.17 [0.90;1.50] 0.242
## Six lanes 150 (2.18%) 31 (1.72%) 0.79 [0.51;1.19] 0.266
## Three lanes 944 (13.7%) 217 (12.0%) 0.88 [0.71;1.09] 0.233
## Two lanes 3732 (54.3%) 991 (54.9%) 1.01 [0.86;1.20] 0.865
## Vali: <0.001
## Curve - Left 90 (1.31%) 23 (1.27%) Ref. Ref.
## Curve Left 132 (1.92%) 31 (1.72%) 0.92 [0.50;1.70] 0.782
## Curve Right 124 (1.80%) 26 (1.44%) 0.82 [0.44;1.54] 0.537
## Not Reported 166 (2.41%) 84 (4.66%) 1.97 [1.17;3.40] 0.010
## Other 167 (2.43%) 27 (1.50%) 0.63 [0.34;1.18] 0.148
## Straight 6196 (90.1%) 1613 (89.4%) 1.01 [0.65;1.65] 0.955
## Vpro: <0.001
## Downhill 184 (2.68%) 44 (2.44%) Ref. Ref.
## Grade, Unknown Slope 375 (5.45%) 92 (5.10%) 1.02 [0.69;1.54] 0.906
## Level 5166 (75.1%) 1314 (72.8%) 1.06 [0.77;1.50] 0.729
## Not Reported 737 (10.7%) 261 (14.5%) 1.48 [1.04;2.14] 0.029
## Other 171 (2.49%) 31 (1.72%) 0.76 [0.45;1.26] 0.285
## Uphill 242 (3.52%) 62 (3.44%) 1.07 [0.70;1.66] 0.758
## Vpav: .
## Blacktop, Bituminous, or Asphalt 3940 (57.3%) 1007 (55.8%) Ref. Ref.
## Concrete 427 (6.21%) 81 (4.49%) 0.74 [0.58;0.95] 0.016
## Non-Trafficway or Driveway Access 43 (0.63%) 2 (0.11%) 0.20 [0.03;0.63] 0.003
## Not Reported 2446 (35.6%) 711 (39.4%) 1.14 [1.02;1.27] 0.020
## Other 8 (0.12%) 2 (0.11%) 1.03 [0.14;4.24] 0.968
## Slag, Gravel or Stone 11 (0.16%) 1 (0.06%) 0.40 [0.02;2.09] 0.331
## Vsur: .
## Dry 6251 (90.9%) 1581 (87.6%) Ref. Ref.
## Non-Trafficway or Driveway Access 43 (0.63%) 2 (0.11%) 0.20 [0.03;0.64] 0.004
## Not Reported 69 (1.00%) 51 (2.83%) 2.92 [2.02;4.21] <0.001
## Other 16 (0.23%) 3 (0.17%) 0.77 [0.17;2.35] 0.678
## Snow 7 (0.10%) 3 (0.17%) 1.74 [0.36;6.45] 0.452
## Wet 489 (7.11%) 164 (9.09%) 1.33 [1.10;1.59] 0.003
## Pcra: <0.001
## Going Straight 5350 (77.8%) 1383 (76.7%) Ref. Ref.
## Negotiating a Curve 387 (5.63%) 93 (5.16%) 0.93 [0.73;1.17] 0.546
## Not Reported 27 (0.39%) 177 (9.81%) 25.2 [17.0;38.8] 0.000
## Other 341 (4.96%) 67 (3.71%) 0.76 [0.58;0.99] 0.041
## Turning Left 380 (5.53%) 46 (2.55%) 0.47 [0.34;0.63] <0.001
## Turning Right 390 (5.67%) 38 (2.11%) 0.38 [0.27;0.52] <0.001
## Lsta: 0.000
## Expired 64 (0.93%) 21 (1.16%) Ref. Ref.
## Not licensed 217 (3.16%) 117 (6.49%) 1.63 [0.96;2.87] 0.069
## Not Reported 54 (0.79%) 867 (48.1%) 48.2 [27.8;86.6] 0.000
## Other 44 (0.64%) 27 (1.50%) 1.86 [0.94;3.75] 0.077
## Suspended 226 (3.29%) 143 (7.93%) 1.92 [1.14;3.35] 0.014
## Valid 6270 (91.2%) 629 (34.9%) 0.30 [0.19;0.51] <0.001
## Ltyp: .
## Full Driver License 6424 (93.4%) 798 (44.2%) Ref. Ref.
## Intermediate Driver License 127 (1.85%) 14 (0.78%) 0.90 [0.49;1.51] 0.697
## Learner's Permit 37 (0.54%) 2 (0.11%) 0.47 [0.07;1.53] 0.241
## Not Licensed 217 (3.16%) 117 (6.49%) 4.34 [3.42;5.49] 0.000
## Not Reported 54 (0.79%) 867 (48.1%) 129 [97.8;174] 0.000
## Other 16 (0.23%) 6 (0.33%) 3.07 [1.08;7.55] 0.037
## Cdls: .
## Disqualified 15 (0.22%) 6 (0.33%) Ref. Ref.
## No (CDL) 5930 (86.3%) 855 (47.4%) 0.35 [0.14;1.01] 0.053
## Not Reported 58 (0.84%) 869 (48.2%) 36.5 [14.1;107] <0.001
## Other 39 (0.57%) 9 (0.50%) 0.58 [0.17;2.03] 0.383
## Suspended 12 (0.17%) 7 (0.39%) 1.44 [0.37;5.77] 0.598
## Valid 821 (11.9%) 58 (3.22%) 0.17 [0.07;0.51] 0.003
## Mdrd: <0.001
## Distraction/Inattention 42 (0.61%) 5 (0.28%) Ref. Ref.
## Inattention (Inattentive), Details Unknown 79 (1.15%) 21 (1.16%) 2.18 [0.81;7.04] 0.128
## Not Distracted 1506 (21.9%) 152 (8.43%) 0.83 [0.35;2.45] 0.699
## Not Reported 5003 (72.8%) 1510 (83.7%) 2.47 [1.07;7.25] 0.033
## Other 156 (2.27%) 26 (1.44%) 1.37 [0.53;4.32] 0.540
## Reported as Unknown if Distracted 89 (1.29%) 90 (4.99%) 8.22 [3.37;25.0] <0.001
## Drim: <0.001
## Asleep or Fatigued 51 (0.74%) 12 (0.67%) Ref. Ref.
## Ill, Blackout 30 (0.44%) 2 (0.11%) 0.30 [0.04;1.23] 0.101
## None/Apparently Normal 3746 (54.5%) 226 (12.5%) 0.25 [0.14;0.51] <0.001
## Not Reported 2509 (36.5%) 1325 (73.4%) 2.22 [1.22;4.40] 0.008
## Other 77 (1.12%) 16 (0.89%) 0.88 [0.38;2.07] 0.768
## Under the Influence of Alcohol, Drugs or Medication 462 (6.72%) 223 (12.4%) 2.03 [1.09;4.08] 0.024
## Mdrm: .
## Driver Did Not Maneuver to Avoid 873 (12.7%) 99 (5.49%) Ref. Ref.
## Motor Vehicle 2 (0.03%) 1 (0.06%) . [.;.] .
## Not Reported 5620 (81.7%) 1684 (93.3%) . [.;.] .
## Other 4 (0.06%) 1 (0.06%) . [.;.] .
## Pedestrian, Pedalcyclist or Other Non-Motorist 368 (5.35%) 19 (1.05%) . [.;.] .
## Phantom/Non-Contact Motor Vehicle 8 (0.12%) 0 (0.00%) . [.;.] .
## Peda: <0.001
## 0_4 33 (0.48%) 6 (0.33%) Ref. Ref.
## 10_14 245 (3.56%) 37 (2.05%) 0.82 [0.34;2.31] 0.679
## 15_20 390 (5.67%) 102 (5.65%) 1.41 [0.61;3.86] 0.442
## 21_24 231 (3.36%) 69 (3.82%) 1.61 [0.69;4.46] 0.289
## 25_34 764 (11.1%) 240 (13.3%) 1.69 [0.75;4.58] 0.220
## 35_44 917 (13.3%) 309 (17.1%) 1.81 [0.80;4.90] 0.160
## 45_54 1155 (16.8%) 348 (19.3%) 1.62 [0.72;4.38] 0.259
## 5_9 96 (1.40%) 11 (0.61%) 0.63 [0.22;1.98] 0.408
## 55_64 1538 (22.4%) 440 (24.4%) 1.54 [0.69;4.15] 0.315
## 65_74 932 (13.6%) 160 (8.87%) 0.92 [0.41;2.51] 0.865
## 75+ 481 (7.00%) 55 (3.05%) 0.62 [0.26;1.72] 0.329
## Not Reported 93 (1.35%) 27 (1.50%) 1.57 [0.62;4.57] 0.355
## Drag: .
## 10_14 2 (0.03%) 2 (0.11%) Ref. Ref.
## 15_20 677 (9.85%) 72 (3.99%) 0.11 [0.01;1.04] 0.054
## 21_24 648 (9.43%) 115 (6.37%) 0.18 [0.02;1.72] 0.125
## 25_34 1513 (22.0%) 286 (15.9%) 0.19 [0.02;1.82] 0.136
## 35_44 1197 (17.4%) 204 (11.3%) 0.17 [0.02;1.65] 0.116
## 45_54 1039 (15.1%) 122 (6.76%) 0.12 [0.01;1.14] 0.063
## 55_64 903 (13.1%) 86 (4.77%) 0.10 [0.01;0.93] 0.044
## 65_74 538 (7.83%) 24 (1.33%) 0.05 [0.00;0.45] 0.012
## 75+ 328 (4.77%) 15 (0.83%) 0.05 [0.00;0.48] 0.013
## Not Reported 30 (0.44%) 878 (48.7%) 28.8 [2.92;284] 0.007
## Dayt: <0.001
## Weekday 5066 (73.7%) 1161 (64.4%) Ref. Ref.
## Weekend 1809 (26.3%) 643 (35.6%) 1.55 [1.39;1.73] <0.001
## Veht: .
## Bus 193 (2.81%) 25 (1.39%) Ref. Ref.
## Car 2729 (39.7%) 518 (28.7%) 1.46 [0.97;2.29] 0.072
## Large_Truck 638 (9.28%) 60 (3.33%) 0.72 [0.45;1.21] 0.209
## Light_Truck 2967 (43.2%) 496 (27.5%) 1.28 [0.85;2.02] 0.240
## Motorcycle 76 (1.11%) 4 (0.22%) 0.42 [0.12;1.14] 0.091
## Not Reported 21 (0.31%) 668 (37.0%) 239 [134;452] 0.000
## Off_Road_Other 6 (0.09%) 2 (0.11%) 2.68 [0.34;12.8] 0.302
## Van 245 (3.56%) 31 (1.72%) 0.98 [0.56;1.72] 0.932
## Psl: <0.001
## 25_or_less 763 (11.1%) 187 (10.4%) Ref. Ref.
## 30_35 1695 (24.7%) 509 (28.2%) 1.22 [1.02;1.48] 0.033
## 40_45 2262 (32.9%) 564 (31.3%) 1.02 [0.85;1.23] 0.859
## 50_55 1456 (21.2%) 325 (18.0%) 0.91 [0.75;1.11] 0.360
## Not Reported 319 (4.64%) 153 (8.48%) 1.96 [1.52;2.51] <0.001
## Other 380 (5.53%) 66 (3.66%) 0.71 [0.52;0.96] 0.026
## Bike2: .
## Backing Vehicle 20 (0.29%) 2 (0.11%) Ref. Ref.
## Crossing Paths 409 (5.95%) 95 (5.27%) . [.;.] .
## Loss of Control 157 (2.28%) 32 (1.77%) . [.;.] .
## Multiple Threat / Trapped 26 (0.38%) 2 (0.11%) . [.;.] .
## Non-Roadway / Play Vehicle 21 (0.31%) 0 (0.00%) . [.;.] .
## Other / Unknown / Not Reported 498 (7.24%) 220 (12.2%) . [.;.] .
## Overtaking / Passing 1756 (25.5%) 848 (47.0%) . [.;.] .
## Parallel Paths 357 (5.19%) 159 (8.81%) . [.;.] .
## Ride-Out / Drive-Out 1146 (16.7%) 132 (7.32%) . [.;.] .
## Ride-Through / Drive-Through 842 (12.2%) 85 (4.71%) . [.;.] .
## Turning / Merging 1317 (19.2%) 124 (6.87%) . [.;.] .
## Wrong-Way / Wrong-Side 326 (4.74%) 105 (5.82%) . [.;.] .
## Bike3: <0.001
## Backing / Wrong-Way / Non-Trafficway 357 (5.19%) 107 (5.93%) Ref. Ref.
## Bicyclist Failed to Yield 1769 (25.7%) 165 (9.15%) 0.31 [0.24;0.41] <0.001
## Bicyclist Turn / Merge 705 (10.3%) 52 (2.88%) 0.25 [0.17;0.35] <0.001
## Crossing / Parallel Paths 962 (14.0%) 291 (16.1%) 1.01 [0.79;1.30] 0.947
## Loss of Control / Turning Error 306 (4.45%) 45 (2.49%) 0.49 [0.33;0.72] <0.001
## Motorist Failed to Yield 188 (2.73%) 53 (2.94%) 0.94 [0.64;1.36] 0.753
## Motorist Turn / Merge 430 (6.25%) 53 (2.94%) 0.41 [0.29;0.59] <0.001
## Other / Unknown / Unusual 402 (5.85%) 190 (10.5%) 1.58 [1.20;2.08] 0.001
## Overtaking 1756 (25.5%) 848 (47.0%) 1.61 [1.28;2.04] <0.001
## Acct: .
## Backing 18 (0.26%) 2 (0.11%) Ref. Ref.
## Forward Impact - Object/End Departure 3 (0.04%) 1 (0.06%) . [.;.] .
## Forward Impact - Parked Vehicle 3 (0.04%) 1 (0.06%) . [.;.] .
## Forward Impact - Pedestrian/Animal 5750 (83.6%) 1472 (81.6%) . [.;.] .
## Intersecting-Path Conflict 11 (0.16%) 0 (0.00%) . [.;.] .
## Left Roadside Departure - Control Loss 5 (0.07%) 1 (0.06%) . [.;.] .
## Left Roadside Departure - Drive Off Road 17 (0.25%) 4 (0.22%) . [.;.] .
## Opposite-Direction Angle/Sideswipe 4 (0.06%) 0 (0.00%) . [.;.] .
## Opposite-Direction Head-On 1 (0.01%) 0 (0.00%) . [.;.] .
## Other / Unknown / No Impact 954 (13.9%) 285 (15.8%) . [.;.] .
## Other Roadside Departure 8 (0.12%) 3 (0.17%) . [.;.] .
## Rear-End - Slower Vehicle 3 (0.04%) 2 (0.11%) . [.;.] .
## Right Roadside Departure - Control Loss 9 (0.13%) 1 (0.06%) . [.;.] .
## Right Roadside Departure - Drive Off Road 72 (1.05%) 31 (1.72%) . [.;.] .
## Same-Direction Sideswipe - Lane Change/Other 3 (0.04%) 0 (0.00%) . [.;.] .
## Same-Direction Sideswipe - Straight 1 (0.01%) 1 (0.06%) . [.;.] .
## Turning Conflict - Turn Across Path 10 (0.15%) 0 (0.00%) . [.;.] .
## Turning Conflict - Turn Into Path 3 (0.04%) 0 (0.00%) . [.;.] .
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# ---- Univariable p-value screening ----
# Pull the overall p-value of each predictor from the descriptive table.
pvals <- getResults(mm, "p.overall")
pdat <- data.frame(
  variable = names(pvals),
  p_value_raw = as.character(pvals),
  stringsAsFactors = FALSE
)
# Strip a leading "<" (e.g. "<0.001") before converting. Values that still
# cannot be parsed become NA; they are reported below instead of vanishing.
pdat$p_value <- suppressWarnings(
  as.numeric(gsub("<", "", pdat$p_value_raw))
)
# Predictors without an overall p-value (printed as "." in the table) were
# never tested. They are excluded by the filter below, so say so explicitly:
# being dropped here is not the same as having p > 0.05.
untested_vars <- pdat$variable[is.na(pdat$p_value)]
if (length(untested_vars) > 0) {
  message(
    "No overall p-value available; excluded from screening without a test: ",
    paste(untested_vars, collapse = ", ")
  )
}
sig_vars <- pdat %>%
  filter(!is.na(p_value), p_value <= 0.05) %>%
  pull(variable)
# Keep only names that are real columns of dat2, and never the outcome.
sig_vars <- intersect(sig_vars, names(dat2))
sig_vars <- setdiff(sig_vars, "Hitr")
if (length(sig_vars) == 0) {
  stop("No predictor passed the p <= 0.05 screening.", call. = FALSE)
}
# Modelling frame: outcome plus screened predictors, complete cases only.
dat_ml <- dat2[, c("Hitr", sig_vars)]
dat_ml <- na.omit(dat_ml)
x <- dat_ml[, sig_vars, drop = FALSE]
y <- dat_ml$Hitr
# ---- Random forest importance ----
# Bootstrap samples and split candidates are drawn at random; fix the seed so
# the importance ranking (and the variable selection built on it) can be
# reproduced from run to run.
set.seed(123)
rf_fit <- randomForest(
  x = x,
  y = y,
  importance = TRUE,
  ntree = 1000
)
rf_imp <- importance(rf_fit)
# One row per predictor, ordered from most to least important according to
# the "MeanDecreaseGini" column of the importance matrix.
rf_rank <- data.frame(
  variable = rownames(rf_imp),
  rf_importance = rf_imp[, "MeanDecreaseGini"],
  row.names = NULL
) %>%
  arrange(desc(rf_importance))
# ---- XGBoost importance ----
# Expand the predictors into a numeric design matrix; "- 1" drops the
# intercept column. Column names are the variable name followed by the level.
x_mm <- model.matrix(Hitr ~ . - 1, data = dat_ml)
# Binary label: 1 = hit-and-run ("Yes"), 0 otherwise.
y_xgb <- ifelse(dat_ml$Hitr == "Yes", 1, 0)
dtrain <- xgb.DMatrix(data = x_mm, label = y_xgb)
# subsample and colsample_bytree are below 1, so each boosting round samples
# rows and columns at random. Seed R's RNG so the Gain ranking is repeatable.
# NOTE(review): confirm the installed xgboost version takes its seed from
# set.seed() rather than a `seed` entry in `params`.
set.seed(123)
xgb_fit <- xgb.train(
  data = dtrain,
  params = list(
    objective = "binary:logistic",
    eval_metric = "logloss",
    max_depth = 3,
    eta = 0.05,
    subsample = 0.8,
    colsample_bytree = 0.8
  ),
  nrounds = 200,
  verbose = 0
)
# Per-column importance table, labelled with the design-matrix column names.
xgb_imp_raw <- xgb.importance(
  feature_names = colnames(x_mm),
  model = xgb_fit
)
# Map one design-matrix column name back to the variable it came from.
#
# A design-matrix column is named "<variable><level>", so the source variable
# is a literal prefix of the column name. When several candidate names are
# prefixes (e.g. "Bike" and "Bike3"), the longest one wins.
#
# feature: a single column name (character scalar).
# vars:    character vector of candidate variable names.
# Returns the matching variable name, or NA_character_ when `feature` is
# missing, `vars` is empty, or no candidate is a prefix of `feature`.
map_xgb_to_original <- function(feature, vars) {
  # Missing candidates can never match; drop them so they cannot inject NA
  # into the logical index below.
  vars <- vars[!is.na(vars)]
  if (length(vars) == 0L || anyNA(feature)) {
    return(NA_character_)
  }
  # startsWith() compares literally, so regex metacharacters in a variable
  # name are harmless.
  matched <- vars[startsWith(feature, vars)]
  if (length(matched) == 0L) {
    return(NA_character_)
  }
  matched[which.max(nchar(matched))]
}
# ---- Variable-level rankings ----
# Attribute each design-matrix column to its source variable, then add up the
# Gain of all columns belonging to the same variable.
xgb_rank <- tibble(
  variable = map_chr(xgb_imp_raw$Feature, map_xgb_to_original, vars = sig_vars),
  Gain = xgb_imp_raw$Gain
) %>%
  filter(!is.na(variable)) %>%
  group_by(variable) %>%
  summarise(xgb_importance = sum(Gain), .groups = "drop") %>%
  arrange(desc(xgb_importance))

# Both ranking tables are already sorted by decreasing importance, so the
# first 30 rows are the 30 best-ranked variables of each model.
rf_top <- head(rf_rank$variable, 30)
xgb_top <- head(xgb_rank$variable, 30)

# Plot data: 15 highest-scoring variables per model, with `variable` turned
# into a factor ordered by importance so the bars are drawn in rank order.
rf_plot_dat <- slice_max(rf_rank, rf_importance, n = 15)
rf_plot_dat$variable <- fct_reorder(
  rf_plot_dat$variable,
  rf_plot_dat$rf_importance
)
xgb_plot_dat <- slice_max(xgb_rank, xgb_importance, n = 15)
xgb_plot_dat$variable <- fct_reorder(
  xgb_plot_dat$variable,
  xgb_plot_dat$xgb_importance
)
# ---- Importance plots ----
# Horizontal bar chart of the random forest ranking.
p_rf <- ggplot(rf_plot_dat, aes(x = rf_importance, y = variable)) +
  geom_col(fill = "#2C7FB8", width = 0.72) +
  labs(
    title = "Random Forest",
    subtitle = "Top 15 variables by Mean Decrease Gini",
    x = "Importance",
    y = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 11, color = "gray35"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank()
  )
# Same layout for the XGBoost ranking, in a contrasting colour.
p_xgb <- ggplot(xgb_plot_dat, aes(x = xgb_importance, y = variable)) +
  geom_col(fill = "#D95F0E", width = 0.72) +
  labs(
    title = "XGBoost",
    subtitle = "Top 15 variables by Gain",
    x = "Importance",
    y = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 11, color = "gray35"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank()
  )
# Put the two panels side by side under a shared title.
p_rf + p_xgb +
  plot_annotation(
    title = "Variable Importance Comparison",
    subtitle = "Random forest and XGBoost models based on significant compareGroups variables",
    theme = theme(
      plot.title = element_text(face = "bold", size = 18),
      plot.subtitle = element_text(size = 12, color = "gray35")
    )
  )
# Variables ranked by both models. This statement must sit on its own line:
# fused onto the closing ")" above, it does not parse.
common_vars <- intersect(rf_top, xgb_top)
# ---- Consensus top 15 ----
# Attach both importance scores to the shared variables, rank them within
# each model (1 = most important, ties share the lowest rank), and keep the
# 15 variables with the best average rank.
top15_common <- tibble(variable = common_vars) %>%
  left_join(rf_rank, by = "variable") %>%
  left_join(xgb_rank, by = "variable") %>%
  mutate(
    rf_pos = rank(-rf_importance, ties.method = "min"),
    xgb_pos = rank(-xgb_importance, ties.method = "min"),
    mean_rank = rowMeans(cbind(rf_pos, xgb_pos), na.rm = TRUE)
  ) %>%
  arrange(mean_rank) %>%
  pull(variable) %>%
  head(15)
# Final analysis set: outcome plus the selected predictors, all rows of dat2.
dat_final <- dat2[, c("Hitr", top15_common)]
dim(dat_final) ## [1] 8679 16
## [1] 8679 39
## [1] "Hitr" "Sexn" "Lsta" "Drin" "Defo" "Drim" "Bike3" "Peda" "Psl"
## [10] "Pcra" "Trav" "Vtra" "Vpro" "Ruru" "Mdrd" "Dayt"
## [1] "Hitr" "Weat" "Lgtc" "Ruru" "Func" "Pbse" "Bike" "Motm" "Bike1"
## [10] "Sexn" "Drin" "Hazi" "Busu" "Emer" "Trav" "Unde" "Roll" "Defo"
## [19] "Vtra" "Vnum" "Vali" "Vpro" "Vpav" "Vsur" "Pcra" "Lsta" "Ltyp"
## [28] "Cdls" "Mdrd" "Drim" "Mdrm" "Peda" "Drag" "Dayt" "Veht" "Psl"
## [37] "Bike2" "Bike3" "Acct"
# ---- Descriptive table for the selected variables ----
# Same compareGroups settings as the screening table, restricted to the
# columns of dat_final.
res_final <- compareGroups(
  formula = Hitr ~ .,
  data = dat_final,
  max.xlev = 30,
  max.ylev = 30,
  ref = 1
)
# show.ratio = TRUE adds the OR and p.ratio columns to the printed table.
final_table <- createTable(x = res_final, show.ratio = TRUE)
final_table
##
## --------Summary descriptives table by 'Hitr'---------
##
## ______________________________________________________________________________________________________________________
## No Yes OR p.ratio p.overall
## N=6875 N=1804
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Sexn: 0.000
## Female 2023 (29.4%) 191 (10.6%) Ref. Ref.
## Male 4828 (70.2%) 768 (42.6%) 1.68 [1.43;1.99] <0.001
## Not Reported 24 (0.35%) 845 (46.8%) 369 [245;586] 0.000
## Lsta: 0.000
## Expired 64 (0.93%) 21 (1.16%) Ref. Ref.
## Not licensed 217 (3.16%) 117 (6.49%) 1.63 [0.96;2.87] 0.069
## Not Reported 54 (0.79%) 867 (48.1%) 48.2 [27.8;86.6] 0.000
## Other 44 (0.64%) 27 (1.50%) 1.86 [0.94;3.75] 0.077
## Suspended 226 (3.29%) 143 (7.93%) 1.92 [1.14;3.35] 0.014
## Valid 6270 (91.2%) 629 (34.9%) 0.30 [0.19;0.51] <0.001
## Drin: 0.000
## No (Alcohol Not Involved) 5041 (73.3%) 406 (22.5%) Ref. Ref.
## Not Reported 1419 (20.6%) 1151 (63.8%) 10.1 [8.87;11.4] 0.000
## Yes (Alcohol Involved) 415 (6.04%) 247 (13.7%) 7.39 [6.12;8.91] 0.000
## Defo: <0.001
## Damage Reported, Extent Unknown 479 (6.97%) 101 (5.60%) Ref. Ref.
## Disabling Damage 1572 (22.9%) 151 (8.37%) 0.46 [0.35;0.60] <0.001
## Functional Damage 1945 (28.3%) 449 (24.9%) 1.09 [0.87;1.39] 0.458
## Minor Damage 1799 (26.2%) 241 (13.4%) 0.63 [0.49;0.82] 0.001
## Not Reported 614 (8.93%) 813 (45.1%) 6.27 [4.95;7.99] 0.000
## Other 466 (6.78%) 49 (2.72%) 0.50 [0.34;0.72] <0.001
## Drim: <0.001
## Asleep or Fatigued 51 (0.74%) 12 (0.67%) Ref. Ref.
## Ill, Blackout 30 (0.44%) 2 (0.11%) 0.30 [0.04;1.23] 0.101
## None/Apparently Normal 3746 (54.5%) 226 (12.5%) 0.25 [0.14;0.51] <0.001
## Not Reported 2509 (36.5%) 1325 (73.4%) 2.22 [1.22;4.40] 0.008
## Other 77 (1.12%) 16 (0.89%) 0.88 [0.38;2.07] 0.768
## Under the Influence of Alcohol, Drugs or Medication 462 (6.72%) 223 (12.4%) 2.03 [1.09;4.08] 0.024
## Bike3: <0.001
## Backing / Wrong-Way / Non-Trafficway 357 (5.19%) 107 (5.93%) Ref. Ref.
## Bicyclist Failed to Yield 1769 (25.7%) 165 (9.15%) 0.31 [0.24;0.41] <0.001
## Bicyclist Turn / Merge 705 (10.3%) 52 (2.88%) 0.25 [0.17;0.35] <0.001
## Crossing / Parallel Paths 962 (14.0%) 291 (16.1%) 1.01 [0.79;1.30] 0.947
## Loss of Control / Turning Error 306 (4.45%) 45 (2.49%) 0.49 [0.33;0.72] <0.001
## Motorist Failed to Yield 188 (2.73%) 53 (2.94%) 0.94 [0.64;1.36] 0.753
## Motorist Turn / Merge 430 (6.25%) 53 (2.94%) 0.41 [0.29;0.59] <0.001
## Other / Unknown / Unusual 402 (5.85%) 190 (10.5%) 1.58 [1.20;2.08] 0.001
## Overtaking 1756 (25.5%) 848 (47.0%) 1.61 [1.28;2.04] <0.001
## Peda: <0.001
## 0_4 33 (0.48%) 6 (0.33%) Ref. Ref.
## 10_14 245 (3.56%) 37 (2.05%) 0.82 [0.34;2.31] 0.679
## 15_20 390 (5.67%) 102 (5.65%) 1.41 [0.61;3.86] 0.442
## 21_24 231 (3.36%) 69 (3.82%) 1.61 [0.69;4.46] 0.289
## 25_34 764 (11.1%) 240 (13.3%) 1.69 [0.75;4.58] 0.220
## 35_44 917 (13.3%) 309 (17.1%) 1.81 [0.80;4.90] 0.160
## 45_54 1155 (16.8%) 348 (19.3%) 1.62 [0.72;4.38] 0.259
## 5_9 96 (1.40%) 11 (0.61%) 0.63 [0.22;1.98] 0.408
## 55_64 1538 (22.4%) 440 (24.4%) 1.54 [0.69;4.15] 0.315
## 65_74 932 (13.6%) 160 (8.87%) 0.92 [0.41;2.51] 0.865
## 75+ 481 (7.00%) 55 (3.05%) 0.62 [0.26;1.72] 0.329
## Not Reported 93 (1.35%) 27 (1.50%) 1.57 [0.62;4.57] 0.355
## Psl: <0.001
## 25_or_less 763 (11.1%) 187 (10.4%) Ref. Ref.
## 30_35 1695 (24.7%) 509 (28.2%) 1.22 [1.02;1.48] 0.033
## 40_45 2262 (32.9%) 564 (31.3%) 1.02 [0.85;1.23] 0.859
## 50_55 1456 (21.2%) 325 (18.0%) 0.91 [0.75;1.11] 0.360
## Not Reported 319 (4.64%) 153 (8.48%) 1.96 [1.52;2.51] <0.001
## Other 380 (5.53%) 66 (3.66%) 0.71 [0.52;0.96] 0.026
## Pcra: <0.001
## Going Straight 5350 (77.8%) 1383 (76.7%) Ref. Ref.
## Negotiating a Curve 387 (5.63%) 93 (5.16%) 0.93 [0.73;1.17] 0.546
## Not Reported 27 (0.39%) 177 (9.81%) 25.2 [17.0;38.8] 0.000
## Other 341 (4.96%) 67 (3.71%) 0.76 [0.58;0.99] 0.041
## Turning Left 380 (5.53%) 46 (2.55%) 0.47 [0.34;0.63] <0.001
## Turning Right 390 (5.67%) 38 (2.11%) 0.38 [0.27;0.52] <0.001
## Trav: <0.001
## 035 MPH 337 (4.90%) 52 (2.88%) Ref. Ref.
## 040 MPH 360 (5.24%) 52 (2.88%) 0.94 [0.62;1.42] 0.754
## 045 MPH 497 (7.23%) 120 (6.65%) 1.56 [1.10;2.24] 0.012
## 055 MPH 306 (4.45%) 53 (2.94%) 1.12 [0.74;1.70] 0.585
## Not Reported 3689 (53.7%) 1309 (72.6%) 2.29 [1.72;3.13] <0.001
## Other 1686 (24.5%) 218 (12.1%) 0.84 [0.61;1.17] 0.287
## Vtra: 0.020
## One-Way Trafficway 190 (2.76%) 75 (4.16%) Ref. Ref.
## Other 296 (4.31%) 73 (4.05%) 0.63 [0.43;0.91] 0.013
## Two-Way, Divided, Positive Median Barrier 393 (5.72%) 93 (5.16%) 0.60 [0.42;0.85] 0.005
## Two-Way, Divided, Unprotected Median 1581 (23.0%) 383 (21.2%) 0.61 [0.46;0.82] 0.001
## Two-Way, Not Divided 3667 (53.3%) 966 (53.5%) 0.67 [0.51;0.88] 0.005
## Two-Way, Not Divided With a Continuous Left-Turn Lane 748 (10.9%) 214 (11.9%) 0.72 [0.53;0.99] 0.043
## Vpro: <0.001
## Downhill 184 (2.68%) 44 (2.44%) Ref. Ref.
## Grade, Unknown Slope 375 (5.45%) 92 (5.10%) 1.02 [0.69;1.54] 0.906
## Level 5166 (75.1%) 1314 (72.8%) 1.06 [0.77;1.50] 0.729
## Not Reported 737 (10.7%) 261 (14.5%) 1.48 [1.04;2.14] 0.029
## Other 171 (2.49%) 31 (1.72%) 0.76 [0.45;1.26] 0.285
## Uphill 242 (3.52%) 62 (3.44%) 1.07 [0.70;1.66] 0.758
## Ruru: <0.001
## Not Reported 29 (0.42%) 4 (0.22%) Ref. Ref.
## Rural 1430 (20.8%) 267 (14.8%) . [.;.] .
## Trafficway Not in State Inventory 12 (0.17%) 0 (0.00%) . [.;.] .
## Urban 5404 (78.6%) 1533 (85.0%) . [.;.] .
## Mdrd: <0.001
## Distraction/Inattention 42 (0.61%) 5 (0.28%) Ref. Ref.
## Inattention (Inattentive), Details Unknown 79 (1.15%) 21 (1.16%) 2.18 [0.81;7.04] 0.128
## Not Distracted 1506 (21.9%) 152 (8.43%) 0.83 [0.35;2.45] 0.699
## Not Reported 5003 (72.8%) 1510 (83.7%) 2.47 [1.07;7.25] 0.033
## Other 156 (2.27%) 26 (1.44%) 1.37 [0.53;4.32] 0.540
## Reported as Unknown if Distracted 89 (1.29%) 90 (4.99%) 8.22 [3.37;25.0] <0.001
## Dayt: <0.001
## Weekday 5066 (73.7%) 1161 (64.4%) Ref. Ref.
## Weekend 1809 (26.3%) 643 (35.6%) 1.55 [1.39;1.73] <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# ---- Selection bookkeeping ----
# Candidate predictors: every column of dat2 other than the outcome.
vars_dat2 <- setdiff(names(dat2), "Hitr")
# Predictors that survived the first compareGroups p-value screen.
vars_after_p <- sig_vars
# Predictors of the final data set (outcome excluded).
vars_final <- setdiff(names(dat_final), "Hitr")
# Dropped at each stage: first by the p-value screen, then by the
# RF/XGBoost importance ranking.
removed_by_pvalue <- vars_dat2[!vars_dat2 %in% vars_after_p]
removed_by_importance <- vars_after_p[!vars_after_p %in% vars_final]
# One row per stage; N is the number of variables in the matching set.
selection_summary <- data.frame(
  Step = c(
    "Starting variables in dat2",
    "Kept after p-value <= 0.05",
    "Removed by p-value",
    "Kept in final top 15 common RF/XGBoost",
    "Removed after variable importance"
  ),
  N = lengths(
    list(
      vars_dat2,
      vars_after_p,
      removed_by_pvalue,
      vars_final,
      removed_by_importance
    )
  )
)
selection_summary ## Step N
## 1 Starting variables in dat2 38
## 2 Kept after p-value <= 0.05 19
## 3 Removed by p-value 19
## 4 Kept in final top 15 common RF/XGBoost 15
## 5 Removed after variable importance 4
## [1] "Weat" "Lgtc" "Func" "Pbse" "Bike" "Motm" "Bike1" "Hazi" "Busu"
## [10] "Vnum" "Vpav" "Vsur" "Ltyp" "Cdls" "Mdrm" "Drag" "Veht" "Bike2"
## [19] "Acct"
## [1] "Emer" "Unde" "Roll" "Vali"
## [1] "Sexn" "Lsta" "Drin" "Defo" "Drim" "Bike3" "Peda" "Psl" "Pcra"
## [10] "Trav" "Vtra" "Vpro" "Ruru" "Mdrd" "Dayt"