Variable Selection

# Load Bike01.csv and list its column names.
# NOTE(review): the absolute path ties this script to one machine. setwd() is
# kept because code further down may rely on the working directory, but the
# read itself now goes through an explicit path and no longer depends on it.
data_dir <- "D:/DAS_CODE/Tools/data/PedFARS/ProbData"
setwd(data_dir)

dat <- read.csv(file.path(data_dir, "Bike01.csv"))
names(dat)
##  [1] "Stat"       "Year"       "CRASH_NUM1" "Work"       "Lati"      
##  [6] "Long"       "Veto"       "Fata"       "Drun"       "Hitr"      
## [11] "Wrkz"       "Relr"       "Weat"       "Lgtc"       "Ruru"      
## [16] "Func"       "Pbse"       "Bike"       "Motm"       "Bike1"     
## [21] "Injs"       "Sexn"       "Drin"       "Hazi"       "Busu"      
## [26] "Emer"       "Trav"       "Unde"       "Roll"       "Defo"      
## [31] "Vtra"       "Vnum"       "Vali"       "Vpro"       "Vpav"      
## [36] "Vsur"       "Pcra"       "Lsta"       "Ltyp"       "Cdls"      
## [41] "Mdrd"       "Drim"       "Mdrm"       "Peda"       "Drag"      
## [46] "Dayt"       "Veht"       "Psl"        "Bike2"      "Bike3"     
## [51] "Acct"
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
library(purrr)
library(forcats)
library(compareGroups)
library(randomForest)
library(xgboost)
library(patchwork)


# Keep only rows whose Hitr value is exactly "Yes" or "No"; rows with any other
# value (or NA) are dropped. Hitr presumably flags hit-and-run - TODO confirm.
dat1 <- subset(dat, Hitr %in% c("No", "Yes"))
# "No" is listed first so that it becomes the first (reference) level.
dat1$Hitr <- factor(dat1$Hitr, levels = c("No", "Yes"))

# Select the analysis columns by name rather than by position, so a reordered
# CSV cannot silently shift the selection. The excluded names are columns
# 1-9, 11-12 and 21 of the layout printed by names(dat), which makes this
# equivalent to the former dat1[, c(10, 13:20, 22:51)] (Hitr first, 39 columns).
excluded_cols <- c(
  "Stat", "Year", "CRASH_NUM1", "Work", "Lati", "Long", "Veto", "Fata",
  "Drun", "Wrkz", "Relr", "Injs"
)
# Fail loudly if the input layout changed instead of analysing wrong columns.
stopifnot(all(excluded_cols %in% names(dat1)))
# NOTE(review): the summary table shows near-duplicate category labels that
# differ only in spacing/punctuation (e.g. in Bike and Vali); consider
# normalising them before modelling - left unchanged here to keep results.
dat2 <- dat1[, setdiff(names(dat1), excluded_cols), drop = FALSE]
dim(dat2)
## [1] 8679   39
# Describe every other column of dat2 against the two Hitr groups.
# ref = 1 makes the first level of each categorical variable the reference
# category; the level limits are set to 30 so that variables with many
# categories (e.g. Peda with 12 levels) are still treated as categorical.
res1 <- compareGroups(
  Hitr ~ .,
  data = dat2,
  max.xlev = 30,
  max.ylev = 30,
  ref = 1
)

# Build the printable summary table; show.ratio = TRUE adds the OR and
# p.ratio columns next to the per-group descriptives.
mm <- createTable(res1, show.ratio = TRUE)

mm
## 
## --------Summary descriptives table by 'Hitr'---------
## 
## ________________________________________________________________________________________________________________________________ 
##                                                                          No          Yes             OR        p.ratio p.overall 
##                                                                        N=6875       N=1804                                       
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Weat:                                                                                                                      .     
##     Clear                                                           2099 (30.5%) 509 (28.2%)        Ref.        Ref.             
##     Cloudy                                                          339 (4.93%)   77 (4.27%)  0.94 [0.71;1.22]  0.636            
##     Fog, Smog, Smoke                                                 12 (0.17%)   1 (0.06%)   0.39 [0.02;1.99]  0.307            
##     Not Reported                                                    4300 (62.5%) 1189 (65.9%) 1.14 [1.02;1.28]  0.026            
##     Other                                                            7 (0.10%)    1 (0.06%)   0.66 [0.03;3.82]  0.694            
##     Rain                                                            118 (1.72%)   27 (1.50%)  0.95 [0.60;1.43]  0.806            
## Lgtc:                                                                                                                      .     
##     Dark - Lighted                                                  1512 (22.0%) 693 (38.4%)        Ref.        Ref.             
##     Dark - Not Lighted                                              1400 (20.4%) 567 (31.4%)  0.88 [0.77;1.01]  0.068            
##     Dark - Unknown Lighting                                          62 (0.90%)   33 (1.83%)  1.16 [0.75;1.78]  0.496            
##     Dawn                                                            128 (1.86%)   25 (1.39%)  0.43 [0.27;0.65] <0.001            
##     Daylight                                                        3575 (52.0%) 414 (22.9%)  0.25 [0.22;0.29]  0.000            
##     Dusk                                                            176 (2.56%)   43 (2.38%)  0.53 [0.37;0.75] <0.001            
##     Not Reported                                                     21 (0.31%)   27 (1.50%)  2.80 [1.57;5.05]  0.001            
##     Other                                                            1 (0.01%)    2 (0.11%)   4.10 [0.33;129]   0.266            
## Ruru:                                                                                                                   <0.001   
##     Not Reported                                                     29 (0.42%)   4 (0.22%)         Ref.        Ref.             
##     Rural                                                           1430 (20.8%) 267 (14.8%)      . [.;.]         .              
##     Trafficway Not in State Inventory                                12 (0.17%)   0 (0.00%)       . [.;.]         .              
##     Urban                                                           5404 (78.6%) 1533 (85.0%)     . [.;.]         .              
## Func:                                                                                                                      .     
##     Interstate                                                      106 (1.54%)   38 (2.11%)        Ref.        Ref.             
##     Local                                                           890 (12.9%)  277 (15.4%)      . [.;.]         .              
##     Major Collector                                                 1080 (15.7%) 266 (14.7%)      . [.;.]         .              
##     Minor Arterial                                                  1854 (27.0%) 515 (28.5%)      . [.;.]         .              
##     Minor Collector                                                 208 (3.03%)   62 (3.44%)      . [.;.]         .              
##     Not Reported                                                     23 (0.33%)   5 (0.28%)       . [.;.]         .              
##     Other Freeways and Expressways                                   26 (0.38%)   7 (0.39%)       . [.;.]         .              
##     Other Principal Arterial                                        662 (9.63%)  154 (8.54%)      . [.;.]         .              
##     Principal Arterial - Other                                      1930 (28.1%) 466 (25.8%)      . [.;.]         .              
##     Principal Arterial - Other Freeways and Expressways              84 (1.22%)   14 (0.78%)      . [.;.]         .              
##     Trafficway Not in State Inventory                                12 (0.17%)   0 (0.00%)       . [.;.]         .              
## Pbse:                                                                                                                    0.058   
##     Female                                                          908 (13.2%)  204 (11.3%)        Ref.        Ref.             
##     Male                                                            5913 (86.0%) 1581 (87.6%) 1.19 [1.01;1.40]  0.033            
##     Not Reported                                                     54 (0.79%)   19 (1.05%)  1.57 [0.89;2.67]  0.116            
## Bike:                                                                                                                      .     
##     Bicycle Lane / Paved Shoulder / Parking Lane                    438 (6.37%)  239 (13.2%)        Ref.        Ref.             
##     Non-Trafficway - Driveway                                        1 (0.01%)    0 (0.00%)       . [.;.]         .              
##     Non-Trafficway - Parking Lot/Other                               5 (0.07%)    0 (0.00%)       . [.;.]         .              
##     Not Reported                                                     63 (0.92%)   65 (3.60%)      . [.;.]         .              
##     Other                                                            38 (0.55%)   16 (0.89%)      . [.;.]         .              
##     Shared-Use Path                                                  3 (0.04%)    0 (0.00%)       . [.;.]         .              
##     Sidewalk  / Crosswalk / Driveway Access                         726 (10.6%)  120 (6.65%)      . [.;.]         .              
##     Sidewalk / Crosswalk / Driveway Access                          116 (1.69%)   12 (0.67%)      . [.;.]         .              
##     Travel Lane                                                     5485 (79.8%) 1352 (74.9%)     . [.;.]         .              
## Motm: Not a Pedestrian                                              6875 (100%)  1804 (100%)        Ref.        Ref.       .     
## Bike1:                                                                                                                     .     
##     At Intersection                                                 2195 (31.9%) 324 (18.0%)        Ref.        Ref.             
##     Intersection-Related                                            563 (8.19%)  143 (7.93%)      . [.;.]         .              
##     Non-Trafficway Location                                          8 (0.12%)    0 (0.00%)       . [.;.]         .              
##     Not At Intersection                                             4088 (59.5%) 1316 (72.9%)     . [.;.]         .              
##     Not Reported                                                     21 (0.31%)   21 (1.16%)      . [.;.]         .              
## Sexn:                                                                                                                    0.000   
##     Female                                                          2023 (29.4%) 191 (10.6%)        Ref.        Ref.             
##     Male                                                            4828 (70.2%) 768 (42.6%)  1.68 [1.43;1.99] <0.001            
##     Not Reported                                                     24 (0.35%)  845 (46.8%)   369 [245;586]    0.000            
## Drin:                                                                                                                    0.000   
##     No (Alcohol Not Involved)                                       5041 (73.3%) 406 (22.5%)        Ref.        Ref.             
##     Not Reported                                                    1419 (20.6%) 1151 (63.8%) 10.1 [8.87;11.4]  0.000            
##     Yes (Alcohol Involved)                                          415 (6.04%)  247 (13.7%)  7.39 [6.12;8.91]  0.000            
## Hazi:                                                                                                                    0.748   
##     No                                                              6863 (99.8%) 1802 (99.9%)       Ref.        Ref.             
##     Yes                                                              12 (0.17%)   2 (0.11%)   0.68 [0.10;2.51]  0.598            
## Busu:                                                                                                                      .     
##     Charter/Tour                                                     7 (0.10%)    1 (0.06%)         Ref.        Ref.             
##     Not a Bus                                                       6779 (98.6%) 1296 (71.8%)     . [.;.]         .              
##     Not Reported                                                     20 (0.29%)  506 (28.0%)      . [.;.]         .              
##     Other                                                            7 (0.10%)    0 (0.00%)       . [.;.]         .              
##     School                                                           26 (0.38%)   0 (0.00%)       . [.;.]         .              
##     Transit/ Commuter                                                36 (0.52%)   1 (0.06%)       . [.;.]         .              
## Emer:                                                                                                                   <0.001   
##     Emergency Operation, Emergency Warning Equipment in Use          5 (0.07%)    0 (0.00%)         Ref.        Ref.             
##     Emergency Operation, Emergency Warning Equipment in Use Unknown  9 (0.13%)    0 (0.00%)       . [.;.]         .              
##     Non-Emergency, Non-Transport                                     6 (0.09%)    0 (0.00%)       . [.;.]         .              
##     Not Applicable                                                  6837 (99.4%) 1671 (92.6%)     . [.;.]         .              
##     Not Reported                                                     11 (0.16%)  133 (7.37%)      . [.;.]         .              
##     Other                                                            7 (0.10%)    0 (0.00%)       . [.;.]         .              
## Trav:                                                                                                                   <0.001   
##     035 MPH                                                         337 (4.90%)   52 (2.88%)        Ref.        Ref.             
##     040 MPH                                                         360 (5.24%)   52 (2.88%)  0.94 [0.62;1.42]  0.754            
##     045 MPH                                                         497 (7.23%)  120 (6.65%)  1.56 [1.10;2.24]  0.012            
##     055 MPH                                                         306 (4.45%)   53 (2.94%)  1.12 [0.74;1.70]  0.585            
##     Not Reported                                                    3689 (53.7%) 1309 (72.6%) 2.29 [1.72;3.13] <0.001            
##     Other                                                           1686 (24.5%) 218 (12.1%)  0.84 [0.61;1.17]  0.287            
## Unde:                                                                                                                    0.002   
##     No Underride or Override Noted                                  3491 (50.8%) 839 (46.5%)        Ref.        Ref.             
##     Not Reported                                                    3383 (49.2%) 965 (53.5%)      . [.;.]         .              
##     Overriding a Motor Vehicle Not In-Transport                      1 (0.01%)    0 (0.00%)       . [.;.]         .              
## Roll:                                                                                                                    0.003   
##     No Rollover                                                     6810 (99.1%) 1799 (99.7%)       Ref.        Ref.             
##     Not Applicable                                                   30 (0.44%)   1 (0.06%)   0.14 [0.01;0.66]  0.007            
##     Other                                                            1 (0.01%)    2 (0.11%)   7.11 [0.58;223]   0.122            
##     Rollover                                                         15 (0.22%)   1 (0.06%)   0.29 [0.01;1.41]  0.147            
##     Rollover, Tripped by Object/Vehicle                              19 (0.28%)   1 (0.06%)   0.23 [0.01;1.09]  0.067            
## Defo:                                                                                                                   <0.001   
##     Damage Reported, Extent Unknown                                 479 (6.97%)  101 (5.60%)        Ref.        Ref.             
##     Disabling Damage                                                1572 (22.9%) 151 (8.37%)  0.46 [0.35;0.60] <0.001            
##     Functional Damage                                               1945 (28.3%) 449 (24.9%)  1.09 [0.87;1.39]  0.458            
##     Minor Damage                                                    1799 (26.2%) 241 (13.4%)  0.63 [0.49;0.82]  0.001            
##     Not Reported                                                    614 (8.93%)  813 (45.1%)  6.27 [4.95;7.99]  0.000            
##     Other                                                           466 (6.78%)   49 (2.72%)  0.50 [0.34;0.72] <0.001            
## Vtra:                                                                                                                    0.020   
##     One-Way Trafficway                                              190 (2.76%)   75 (4.16%)        Ref.        Ref.             
##     Other                                                           296 (4.31%)   73 (4.05%)  0.63 [0.43;0.91]  0.013            
##     Two-Way,  Divided, Positive  Median Barrier                     393 (5.72%)   93 (5.16%)  0.60 [0.42;0.85]  0.005            
##     Two-Way, Divided, Unprotected Median                            1581 (23.0%) 383 (21.2%)  0.61 [0.46;0.82]  0.001            
##     Two-Way, Not Divided                                            3667 (53.3%) 966 (53.5%)  0.67 [0.51;0.88]  0.005            
##     Two-Way, Not Divided With a Continuous Left-Turn Lane           748 (10.9%)  214 (11.9%)  0.72 [0.53;0.99]  0.043            
## Vnum:                                                                                                                    0.207   
##     Five lanes                                                      799 (11.6%)  209 (11.6%)        Ref.        Ref.             
##     Four lanes                                                      863 (12.6%)  238 (13.2%)  1.05 [0.86;1.30]  0.621            
##     Other                                                           387 (5.63%)  118 (6.54%)  1.17 [0.90;1.50]  0.242            
##     Six lanes                                                       150 (2.18%)   31 (1.72%)  0.79 [0.51;1.19]  0.266            
##     Three lanes                                                     944 (13.7%)  217 (12.0%)  0.88 [0.71;1.09]  0.233            
##     Two lanes                                                       3732 (54.3%) 991 (54.9%)  1.01 [0.86;1.20]  0.865            
## Vali:                                                                                                                   <0.001   
##     Curve - Left                                                     90 (1.31%)   23 (1.27%)        Ref.        Ref.             
##     Curve Left                                                      132 (1.92%)   31 (1.72%)  0.92 [0.50;1.70]  0.782            
##     Curve Right                                                     124 (1.80%)   26 (1.44%)  0.82 [0.44;1.54]  0.537            
##     Not Reported                                                    166 (2.41%)   84 (4.66%)  1.97 [1.17;3.40]  0.010            
##     Other                                                           167 (2.43%)   27 (1.50%)  0.63 [0.34;1.18]  0.148            
##     Straight                                                        6196 (90.1%) 1613 (89.4%) 1.01 [0.65;1.65]  0.955            
## Vpro:                                                                                                                   <0.001   
##     Downhill                                                        184 (2.68%)   44 (2.44%)        Ref.        Ref.             
##     Grade, Unknown Slope                                            375 (5.45%)   92 (5.10%)  1.02 [0.69;1.54]  0.906            
##     Level                                                           5166 (75.1%) 1314 (72.8%) 1.06 [0.77;1.50]  0.729            
##     Not Reported                                                    737 (10.7%)  261 (14.5%)  1.48 [1.04;2.14]  0.029            
##     Other                                                           171 (2.49%)   31 (1.72%)  0.76 [0.45;1.26]  0.285            
##     Uphill                                                          242 (3.52%)   62 (3.44%)  1.07 [0.70;1.66]  0.758            
## Vpav:                                                                                                                      .     
##     Blacktop, Bituminous, or Asphalt                                3940 (57.3%) 1007 (55.8%)       Ref.        Ref.             
##     Concrete                                                        427 (6.21%)   81 (4.49%)  0.74 [0.58;0.95]  0.016            
##     Non-Trafficway or Driveway Access                                43 (0.63%)   2 (0.11%)   0.20 [0.03;0.63]  0.003            
##     Not Reported                                                    2446 (35.6%) 711 (39.4%)  1.14 [1.02;1.27]  0.020            
##     Other                                                            8 (0.12%)    2 (0.11%)   1.03 [0.14;4.24]  0.968            
##     Slag, Gravel or Stone                                            11 (0.16%)   1 (0.06%)   0.40 [0.02;2.09]  0.331            
## Vsur:                                                                                                                      .     
##     Dry                                                             6251 (90.9%) 1581 (87.6%)       Ref.        Ref.             
##     Non-Trafficway or Driveway Access                                43 (0.63%)   2 (0.11%)   0.20 [0.03;0.64]  0.004            
##     Not Reported                                                     69 (1.00%)   51 (2.83%)  2.92 [2.02;4.21] <0.001            
##     Other                                                            16 (0.23%)   3 (0.17%)   0.77 [0.17;2.35]  0.678            
##     Snow                                                             7 (0.10%)    3 (0.17%)   1.74 [0.36;6.45]  0.452            
##     Wet                                                             489 (7.11%)  164 (9.09%)  1.33 [1.10;1.59]  0.003            
## Pcra:                                                                                                                   <0.001   
##     Going Straight                                                  5350 (77.8%) 1383 (76.7%)       Ref.        Ref.             
##     Negotiating a Curve                                             387 (5.63%)   93 (5.16%)  0.93 [0.73;1.17]  0.546            
##     Not Reported                                                     27 (0.39%)  177 (9.81%)  25.2 [17.0;38.8]  0.000            
##     Other                                                           341 (4.96%)   67 (3.71%)  0.76 [0.58;0.99]  0.041            
##     Turning Left                                                    380 (5.53%)   46 (2.55%)  0.47 [0.34;0.63] <0.001            
##     Turning Right                                                   390 (5.67%)   38 (2.11%)  0.38 [0.27;0.52] <0.001            
## Lsta:                                                                                                                    0.000   
##     Expired                                                          64 (0.93%)   21 (1.16%)        Ref.        Ref.             
##     Not licensed                                                    217 (3.16%)  117 (6.49%)  1.63 [0.96;2.87]  0.069            
##     Not Reported                                                     54 (0.79%)  867 (48.1%)  48.2 [27.8;86.6]  0.000            
##     Other                                                            44 (0.64%)   27 (1.50%)  1.86 [0.94;3.75]  0.077            
##     Suspended                                                       226 (3.29%)  143 (7.93%)  1.92 [1.14;3.35]  0.014            
##     Valid                                                           6270 (91.2%) 629 (34.9%)  0.30 [0.19;0.51] <0.001            
## Ltyp:                                                                                                                      .     
##     Full Driver License                                             6424 (93.4%) 798 (44.2%)        Ref.        Ref.             
##     Intermediate Driver License                                     127 (1.85%)   14 (0.78%)  0.90 [0.49;1.51]  0.697            
##     Learner's Permit                                                 37 (0.54%)   2 (0.11%)   0.47 [0.07;1.53]  0.241            
##     Not Licensed                                                    217 (3.16%)  117 (6.49%)  4.34 [3.42;5.49]  0.000            
##     Not Reported                                                     54 (0.79%)  867 (48.1%)   129 [97.8;174]   0.000            
##     Other                                                            16 (0.23%)   6 (0.33%)   3.07 [1.08;7.55]  0.037            
## Cdls:                                                                                                                      .     
##     Disqualified                                                     15 (0.22%)   6 (0.33%)         Ref.        Ref.             
##     No (CDL)                                                        5930 (86.3%) 855 (47.4%)  0.35 [0.14;1.01]  0.053            
##     Not Reported                                                     58 (0.84%)  869 (48.2%)  36.5 [14.1;107]  <0.001            
##     Other                                                            39 (0.57%)   9 (0.50%)   0.58 [0.17;2.03]  0.383            
##     Suspended                                                        12 (0.17%)   7 (0.39%)   1.44 [0.37;5.77]  0.598            
##     Valid                                                           821 (11.9%)   58 (3.22%)  0.17 [0.07;0.51]  0.003            
## Mdrd:                                                                                                                   <0.001   
##     Distraction/Inattention                                          42 (0.61%)   5 (0.28%)         Ref.        Ref.             
##     Inattention (Inattentive), Details Unknown                       79 (1.15%)   21 (1.16%)  2.18 [0.81;7.04]  0.128            
##     Not Distracted                                                  1506 (21.9%) 152 (8.43%)  0.83 [0.35;2.45]  0.699            
##     Not Reported                                                    5003 (72.8%) 1510 (83.7%) 2.47 [1.07;7.25]  0.033            
##     Other                                                           156 (2.27%)   26 (1.44%)  1.37 [0.53;4.32]  0.540            
##     Reported as Unknown if Distracted                                89 (1.29%)   90 (4.99%)  8.22 [3.37;25.0] <0.001            
## Drim:                                                                                                                   <0.001   
##     Asleep or Fatigued                                               51 (0.74%)   12 (0.67%)        Ref.        Ref.             
##     Ill, Blackout                                                    30 (0.44%)   2 (0.11%)   0.30 [0.04;1.23]  0.101            
##     None/Apparently Normal                                          3746 (54.5%) 226 (12.5%)  0.25 [0.14;0.51] <0.001            
##     Not Reported                                                    2509 (36.5%) 1325 (73.4%) 2.22 [1.22;4.40]  0.008            
##     Other                                                            77 (1.12%)   16 (0.89%)  0.88 [0.38;2.07]  0.768            
##     Under the Influence of Alcohol, Drugs or Medication             462 (6.72%)  223 (12.4%)  2.03 [1.09;4.08]  0.024            
## Mdrm:                                                                                                                      .     
##     Driver Did Not Maneuver to Avoid                                873 (12.7%)   99 (5.49%)        Ref.        Ref.             
##     Motor Vehicle                                                    2 (0.03%)    1 (0.06%)       . [.;.]         .              
##     Not Reported                                                    5620 (81.7%) 1684 (93.3%)     . [.;.]         .              
##     Other                                                            4 (0.06%)    1 (0.06%)       . [.;.]         .              
##     Pedestrian, Pedalcyclist or Other Non-Motorist                  368 (5.35%)   19 (1.05%)      . [.;.]         .              
##     Phantom/Non-Contact Motor Vehicle                                8 (0.12%)    0 (0.00%)       . [.;.]         .              
## Peda:                                                                                                                   <0.001   
##     0_4                                                              33 (0.48%)   6 (0.33%)         Ref.        Ref.             
##     10_14                                                           245 (3.56%)   37 (2.05%)  0.82 [0.34;2.31]  0.679            
##     15_20                                                           390 (5.67%)  102 (5.65%)  1.41 [0.61;3.86]  0.442            
##     21_24                                                           231 (3.36%)   69 (3.82%)  1.61 [0.69;4.46]  0.289            
##     25_34                                                           764 (11.1%)  240 (13.3%)  1.69 [0.75;4.58]  0.220            
##     35_44                                                           917 (13.3%)  309 (17.1%)  1.81 [0.80;4.90]  0.160            
##     45_54                                                           1155 (16.8%) 348 (19.3%)  1.62 [0.72;4.38]  0.259            
##     5_9                                                              96 (1.40%)   11 (0.61%)  0.63 [0.22;1.98]  0.408            
##     55_64                                                           1538 (22.4%) 440 (24.4%)  1.54 [0.69;4.15]  0.315            
##     65_74                                                           932 (13.6%)  160 (8.87%)  0.92 [0.41;2.51]  0.865            
##     75+                                                             481 (7.00%)   55 (3.05%)  0.62 [0.26;1.72]  0.329            
##     Not Reported                                                     93 (1.35%)   27 (1.50%)  1.57 [0.62;4.57]  0.355            
## Drag:                                                                                                                      .     
##     10_14                                                            2 (0.03%)    2 (0.11%)         Ref.        Ref.             
##     15_20                                                           677 (9.85%)   72 (3.99%)  0.11 [0.01;1.04]  0.054            
##     21_24                                                           648 (9.43%)  115 (6.37%)  0.18 [0.02;1.72]  0.125            
##     25_34                                                           1513 (22.0%) 286 (15.9%)  0.19 [0.02;1.82]  0.136            
##     35_44                                                           1197 (17.4%) 204 (11.3%)  0.17 [0.02;1.65]  0.116            
##     45_54                                                           1039 (15.1%) 122 (6.76%)  0.12 [0.01;1.14]  0.063            
##     55_64                                                           903 (13.1%)   86 (4.77%)  0.10 [0.01;0.93]  0.044            
##     65_74                                                           538 (7.83%)   24 (1.33%)  0.05 [0.00;0.45]  0.012            
##     75+                                                             328 (4.77%)   15 (0.83%)  0.05 [0.00;0.48]  0.013            
##     Not Reported                                                     30 (0.44%)  878 (48.7%)  28.8 [2.92;284]   0.007            
## Dayt:                                                                                                                   <0.001   
##     Weekday                                                         5066 (73.7%) 1161 (64.4%)       Ref.        Ref.             
##     Weekend                                                         1809 (26.3%) 643 (35.6%)  1.55 [1.39;1.73] <0.001            
## Veht:                                                                                                                      .     
##     Bus                                                             193 (2.81%)   25 (1.39%)        Ref.        Ref.             
##     Car                                                             2729 (39.7%) 518 (28.7%)  1.46 [0.97;2.29]  0.072            
##     Large_Truck                                                     638 (9.28%)   60 (3.33%)  0.72 [0.45;1.21]  0.209            
##     Light_Truck                                                     2967 (43.2%) 496 (27.5%)  1.28 [0.85;2.02]  0.240            
##     Motorcycle                                                       76 (1.11%)   4 (0.22%)   0.42 [0.12;1.14]  0.091            
##     Not Reported                                                     21 (0.31%)  668 (37.0%)   239 [134;452]    0.000            
##     Off_Road_Other                                                   6 (0.09%)    2 (0.11%)   2.68 [0.34;12.8]  0.302            
##     Van                                                             245 (3.56%)   31 (1.72%)  0.98 [0.56;1.72]  0.932            
## Psl:                                                                                                                    <0.001   
##     25_or_less                                                      763 (11.1%)  187 (10.4%)        Ref.        Ref.             
##     30_35                                                           1695 (24.7%) 509 (28.2%)  1.22 [1.02;1.48]  0.033            
##     40_45                                                           2262 (32.9%) 564 (31.3%)  1.02 [0.85;1.23]  0.859            
##     50_55                                                           1456 (21.2%) 325 (18.0%)  0.91 [0.75;1.11]  0.360            
##     Not Reported                                                    319 (4.64%)  153 (8.48%)  1.96 [1.52;2.51] <0.001            
##     Other                                                           380 (5.53%)   66 (3.66%)  0.71 [0.52;0.96]  0.026            
## Bike2:                                                                                                                     .     
##     Backing Vehicle                                                  20 (0.29%)   2 (0.11%)         Ref.        Ref.             
##     Crossing Paths                                                  409 (5.95%)   95 (5.27%)      . [.;.]         .              
##     Loss of Control                                                 157 (2.28%)   32 (1.77%)      . [.;.]         .              
##     Multiple Threat / Trapped                                        26 (0.38%)   2 (0.11%)       . [.;.]         .              
##     Non-Roadway / Play Vehicle                                       21 (0.31%)   0 (0.00%)       . [.;.]         .              
##     Other / Unknown / Not Reported                                  498 (7.24%)  220 (12.2%)      . [.;.]         .              
##     Overtaking / Passing                                            1756 (25.5%) 848 (47.0%)      . [.;.]         .              
##     Parallel Paths                                                  357 (5.19%)  159 (8.81%)      . [.;.]         .              
##     Ride-Out / Drive-Out                                            1146 (16.7%) 132 (7.32%)      . [.;.]         .              
##     Ride-Through / Drive-Through                                    842 (12.2%)   85 (4.71%)      . [.;.]         .              
##     Turning / Merging                                               1317 (19.2%) 124 (6.87%)      . [.;.]         .              
##     Wrong-Way / Wrong-Side                                          326 (4.74%)  105 (5.82%)      . [.;.]         .              
## Bike3:                                                                                                                  <0.001   
##     Backing / Wrong-Way / Non-Trafficway                            357 (5.19%)  107 (5.93%)        Ref.        Ref.             
##     Bicyclist Failed to Yield                                       1769 (25.7%) 165 (9.15%)  0.31 [0.24;0.41] <0.001            
##     Bicyclist Turn / Merge                                          705 (10.3%)   52 (2.88%)  0.25 [0.17;0.35] <0.001            
##     Crossing / Parallel Paths                                       962 (14.0%)  291 (16.1%)  1.01 [0.79;1.30]  0.947            
##     Loss of Control / Turning Error                                 306 (4.45%)   45 (2.49%)  0.49 [0.33;0.72] <0.001            
##     Motorist Failed to Yield                                        188 (2.73%)   53 (2.94%)  0.94 [0.64;1.36]  0.753            
##     Motorist Turn / Merge                                           430 (6.25%)   53 (2.94%)  0.41 [0.29;0.59] <0.001            
##     Other / Unknown / Unusual                                       402 (5.85%)  190 (10.5%)  1.58 [1.20;2.08]  0.001            
##     Overtaking                                                      1756 (25.5%) 848 (47.0%)  1.61 [1.28;2.04] <0.001            
## Acct:                                                                                                                      .     
##     Backing                                                          18 (0.26%)   2 (0.11%)         Ref.        Ref.             
##     Forward Impact - Object/End Departure                            3 (0.04%)    1 (0.06%)       . [.;.]         .              
##     Forward Impact - Parked Vehicle                                  3 (0.04%)    1 (0.06%)       . [.;.]         .              
##     Forward Impact - Pedestrian/Animal                              5750 (83.6%) 1472 (81.6%)     . [.;.]         .              
##     Intersecting-Path Conflict                                       11 (0.16%)   0 (0.00%)       . [.;.]         .              
##     Left Roadside Departure - Control Loss                           5 (0.07%)    1 (0.06%)       . [.;.]         .              
##     Left Roadside Departure - Drive Off Road                         17 (0.25%)   4 (0.22%)       . [.;.]         .              
##     Opposite-Direction Angle/Sideswipe                               4 (0.06%)    0 (0.00%)       . [.;.]         .              
##     Opposite-Direction Head-On                                       1 (0.01%)    0 (0.00%)       . [.;.]         .              
##     Other / Unknown / No Impact                                     954 (13.9%)  285 (15.8%)      . [.;.]         .              
##     Other Roadside Departure                                         8 (0.12%)    3 (0.17%)       . [.;.]         .              
##     Rear-End - Slower Vehicle                                        3 (0.04%)    2 (0.11%)       . [.;.]         .              
##     Right Roadside Departure - Control Loss                          9 (0.13%)    1 (0.06%)       . [.;.]         .              
##     Right Roadside Departure - Drive Off Road                        72 (1.05%)   31 (1.72%)      . [.;.]         .              
##     Same-Direction Sideswipe - Lane Change/Other                     3 (0.04%)    0 (0.00%)       . [.;.]         .              
##     Same-Direction Sideswipe - Straight                              1 (0.01%)    1 (0.06%)       . [.;.]         .              
##     Turning Conflict - Turn Across Path                              10 (0.15%)   0 (0.00%)       . [.;.]         .              
##     Turning Conflict - Turn Into Path                                3 (0.04%)    0 (0.00%)       . [.;.]         .              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Screen predictors on the overall p-value of the first compareGroups table.
pvals <- getResults(mm, "p.overall")

# Keep the raw text next to a numeric version of each p-value. A leading "<"
# (as in "<0.001") is stripped before conversion; entries that still cannot be
# parsed become NA, hence the suppressWarnings() around this single conversion.
pdat <- data.frame(
  variable = names(pvals),
  p_value_raw = as.character(pvals),
  stringsAsFactors = FALSE
)

pdat$p_value <- suppressWarnings(
  as.numeric(gsub("<", "", pdat$p_value_raw, fixed = TRUE))
)

# A missing p-value means "no overall test result", not "not significant".
# These variables are still excluded by the filter below (unchanged behaviour),
# but they are reported so the exclusion is visible instead of silent.
untested_vars <- pdat$variable[is.na(pdat$p_value)]
if (length(untested_vars) > 0) {
  message(
    "No overall p-value available; excluded from screening: ",
    paste(untested_vars, collapse = ", ")
  )
}

sig_vars <- pdat %>%
  filter(!is.na(p_value), p_value <= 0.05) %>%
  pull(variable)

# Keep only names that are real columns of dat2, and never the outcome itself.
sig_vars <- intersect(sig_vars, names(dat2))
sig_vars <- setdiff(sig_vars, "Hitr")

# Fail early with a clear message rather than deep inside the model fits.
if (length(sig_vars) == 0) {
  stop("No predictor passed the p-value <= 0.05 screen.", call. = FALSE)
}

# Modelling data: outcome plus screened predictors, complete cases only.
dat_ml <- dat2[, c("Hitr", sig_vars), drop = FALSE]
dat_ml <- na.omit(dat_ml)

# Random forest importance ranking of the screened predictors.
x <- dat_ml[, sig_vars, drop = FALSE]
# With the x/y interface, randomForest() treats only factor columns as
# categorical; character columns would be turned into integer codes and split
# as if they were ordered numbers. Convert them explicitly (no-op for columns
# that are already factors or numeric).
x[] <- lapply(x, function(col) if (is.character(col)) factor(col) else col)
y <- dat_ml$Hitr

# Fix the RNG so the bootstrap samples, and therefore the ranking, are
# reproducible from run to run.
set.seed(123)
rf_fit <- randomForest(
  x = x,
  y = y,
  importance = TRUE,
  ntree = 1000
)

# One row per predictor, ordered from most to least important by
# Mean Decrease Gini.
rf_imp <- importance(rf_fit)
rf_rank <- data.frame(
  variable = rownames(rf_imp),
  rf_importance = rf_imp[, "MeanDecreaseGini"],
  row.names = NULL
) %>%
  arrange(desc(rf_importance))

# XGBoost needs a numeric design matrix: expand the predictors into indicator
# columns (no intercept) and code the outcome as 1 = "Yes", 0 = otherwise.
x_mm <- model.matrix(Hitr ~ . - 1, data = dat_ml)
y_xgb <- ifelse(dat_ml$Hitr == "Yes", 1, 0)

dtrain <- xgb.DMatrix(data = x_mm, label = y_xgb)

# Row and column subsampling (both 0.8) make the fit random; fix R's RNG so
# the resulting importance table is reproducible.
set.seed(123)
xgb_fit <- xgb.train(
  data = dtrain,
  params = list(
    objective = "binary:logistic",
    eval_metric = "logloss",
    max_depth = 3,
    eta = 0.05,
    subsample = 0.8,
    colsample_bytree = 0.8
  ),
  nrounds = 200,
  verbose = 0
)

# Importance per design-matrix column (i.e. per indicator, not per variable).
xgb_imp_raw <- xgb.importance(
  feature_names = colnames(x_mm),
  model = xgb_fit
)

# Map one design-matrix column name back to the variable it was built from.
#
# feature: a single column name, e.g. "Bike3Overtaking".
# vars:    character vector of candidate variable names.
#
# Returns the longest element of `vars` that is a literal prefix of `feature`
# (so "Bike3..." resolves to "Bike3" rather than "Bike"), or NA_character_
# when nothing matches. The result is always a length-1 character, including
# for a missing `feature` or missing entries in `vars`.
map_xgb_to_original <- function(feature, vars) {
  # which() drops NA comparisons, so missing values can never be selected.
  hits <- vars[which(startsWith(as.character(feature), as.character(vars)))]
  if (length(hits) == 0) {
    return(NA_character_)
  }
  hits[which.max(nchar(hits))]
}

# Collapse the per-indicator XGBoost gains to one total per original variable,
# ordered from highest to lowest total gain. Indicator columns that cannot be
# traced back to a screened variable are discarded.
xgb_rank <- xgb_imp_raw %>%
  mutate(
    variable = map_chr(
      Feature,
      function(feat) map_xgb_to_original(feat, vars = sig_vars)
    )
  ) %>%
  filter(!is.na(variable)) %>%
  group_by(variable) %>%
  summarise(xgb_importance = sum(Gain), .groups = "drop") %>%
  arrange(desc(xgb_importance))

# Both rankings are already sorted, so the top list is simply the first
# (up to) 30 variable names of each.
rf_top <- head(rf_rank$variable, 30)

xgb_top <- head(xgb_rank$variable, 30)


# Top 15 variables per model. fct_reorder() takes the factor level order from
# the importance values, which sets the order of the bars on the y-axis.
rf_plot_dat <- slice_max(rf_rank, rf_importance, n = 15)
rf_plot_dat$variable <- fct_reorder(
  rf_plot_dat$variable,
  rf_plot_dat$rf_importance
)

xgb_plot_dat <- slice_max(xgb_rank, xgb_importance, n = 15)
xgb_plot_dat$variable <- fct_reorder(
  xgb_plot_dat$variable,
  xgb_plot_dat$xgb_importance
)

# Theme shared by both panels, added to each plot as a list of components.
importance_style <- list(
  theme_minimal(base_size = 13),
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(size = 11, color = "gray35"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# Random forest panel.
p_rf <- ggplot(rf_plot_dat, aes(x = rf_importance, y = variable)) +
  geom_col(fill = "#2C7FB8", width = 0.72) +
  labs(
    title = "Random Forest",
    subtitle = "Top 15 variables by Mean Decrease Gini",
    x = "Importance",
    y = NULL
  ) +
  importance_style

# XGBoost panel.
p_xgb <- ggplot(xgb_plot_dat, aes(x = xgb_importance, y = variable)) +
  geom_col(fill = "#D95F0E", width = 0.72) +
  labs(
    title = "XGBoost",
    subtitle = "Top 15 variables by Gain",
    x = "Importance",
    y = NULL
  ) +
  importance_style

# Side-by-side layout (patchwork) with an overall title.
comparison_titles <- plot_annotation(
  title = "Variable Importance Comparison",
  subtitle = "Random forest and XGBoost models based on significant compareGroups variables",
  theme = theme(
    plot.title = element_text(face = "bold", size = 18),
    plot.subtitle = element_text(size = 12, color = "gray35")
  )
)

p_rf + p_xgb + comparison_titles

# Variables present in both models' top lists.
common_vars <- intersect(rf_top, xgb_top)

# Rank the shared variables within each model (1 = most important, ties share
# the smaller rank), average the two ranks, and keep the 15 variables with the
# best average.
top15_common <- tibble(variable = common_vars) %>%
  left_join(rf_rank, by = "variable") %>%
  left_join(xgb_rank, by = "variable") %>%
  mutate(
    rank_rf = rank(-rf_importance, ties.method = "min"),
    rank_xgb = rank(-xgb_importance, ties.method = "min")
  ) %>%
  mutate(mean_rank = rowMeans(cbind(rank_rf, rank_xgb), na.rm = TRUE)) %>%
  arrange(mean_rank) %>%
  pull(variable) %>%
  head(15)

# Restrict the data to the outcome plus the variables selected above.
dat_final <- dat2[, c("Hitr", top15_common)]
# Compare dimensions and column names of the reduced and the full data set.
dim(dat_final)
## [1] 8679   16
dim(dat2)
## [1] 8679   39
names(dat_final)
##  [1] "Hitr"  "Sexn"  "Lsta"  "Drin"  "Defo"  "Drim"  "Bike3" "Peda"  "Psl"  
## [10] "Pcra"  "Trav"  "Vtra"  "Vpro"  "Ruru"  "Mdrd"  "Dayt"
names(dat2)
##  [1] "Hitr"  "Weat"  "Lgtc"  "Ruru"  "Func"  "Pbse"  "Bike"  "Motm"  "Bike1"
## [10] "Sexn"  "Drin"  "Hazi"  "Busu"  "Emer"  "Trav"  "Unde"  "Roll"  "Defo" 
## [19] "Vtra"  "Vnum"  "Vali"  "Vpro"  "Vpav"  "Vsur"  "Pcra"  "Lsta"  "Ltyp" 
## [28] "Cdls"  "Mdrd"  "Drim"  "Mdrm"  "Peda"  "Drag"  "Dayt"  "Veht"  "Psl"  
## [37] "Bike2" "Bike3" "Acct"
# Descriptive comparison by Hitr on the reduced data set, using the same
# settings as the first table (reference level 1, up to 30 levels allowed).
res_final <- compareGroups(
  formula = Hitr ~ .,
  data = dat_final,
  max.xlev = 30,
  max.ylev = 30,
  ref = 1
)

# Formatted table including the ratio column and its p-value.
final_table <- createTable(x = res_final, show.ratio = TRUE)
final_table
## 
## --------Summary descriptives table by 'Hitr'---------
## 
## ______________________________________________________________________________________________________________________ 
##                                                                No          Yes             OR        p.ratio p.overall 
##                                                              N=6875       N=1804                                       
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Sexn:                                                                                                          0.000   
##     Female                                                2023 (29.4%) 191 (10.6%)        Ref.        Ref.             
##     Male                                                  4828 (70.2%) 768 (42.6%)  1.68 [1.43;1.99] <0.001            
##     Not Reported                                           24 (0.35%)  845 (46.8%)   369 [245;586]    0.000            
## Lsta:                                                                                                          0.000   
##     Expired                                                64 (0.93%)   21 (1.16%)        Ref.        Ref.             
##     Not licensed                                          217 (3.16%)  117 (6.49%)  1.63 [0.96;2.87]  0.069            
##     Not Reported                                           54 (0.79%)  867 (48.1%)  48.2 [27.8;86.6]  0.000            
##     Other                                                  44 (0.64%)   27 (1.50%)  1.86 [0.94;3.75]  0.077            
##     Suspended                                             226 (3.29%)  143 (7.93%)  1.92 [1.14;3.35]  0.014            
##     Valid                                                 6270 (91.2%) 629 (34.9%)  0.30 [0.19;0.51] <0.001            
## Drin:                                                                                                          0.000   
##     No (Alcohol Not Involved)                             5041 (73.3%) 406 (22.5%)        Ref.        Ref.             
##     Not Reported                                          1419 (20.6%) 1151 (63.8%) 10.1 [8.87;11.4]  0.000            
##     Yes (Alcohol Involved)                                415 (6.04%)  247 (13.7%)  7.39 [6.12;8.91]  0.000            
## Defo:                                                                                                         <0.001   
##     Damage Reported, Extent Unknown                       479 (6.97%)  101 (5.60%)        Ref.        Ref.             
##     Disabling Damage                                      1572 (22.9%) 151 (8.37%)  0.46 [0.35;0.60] <0.001            
##     Functional Damage                                     1945 (28.3%) 449 (24.9%)  1.09 [0.87;1.39]  0.458            
##     Minor Damage                                          1799 (26.2%) 241 (13.4%)  0.63 [0.49;0.82]  0.001            
##     Not Reported                                          614 (8.93%)  813 (45.1%)  6.27 [4.95;7.99]  0.000            
##     Other                                                 466 (6.78%)   49 (2.72%)  0.50 [0.34;0.72] <0.001            
## Drim:                                                                                                         <0.001   
##     Asleep or Fatigued                                     51 (0.74%)   12 (0.67%)        Ref.        Ref.             
##     Ill, Blackout                                          30 (0.44%)   2 (0.11%)   0.30 [0.04;1.23]  0.101            
##     None/Apparently Normal                                3746 (54.5%) 226 (12.5%)  0.25 [0.14;0.51] <0.001            
##     Not Reported                                          2509 (36.5%) 1325 (73.4%) 2.22 [1.22;4.40]  0.008            
##     Other                                                  77 (1.12%)   16 (0.89%)  0.88 [0.38;2.07]  0.768            
##     Under the Influence of Alcohol, Drugs or Medication   462 (6.72%)  223 (12.4%)  2.03 [1.09;4.08]  0.024            
## Bike3:                                                                                                        <0.001   
##     Backing / Wrong-Way / Non-Trafficway                  357 (5.19%)  107 (5.93%)        Ref.        Ref.             
##     Bicyclist Failed to Yield                             1769 (25.7%) 165 (9.15%)  0.31 [0.24;0.41] <0.001            
##     Bicyclist Turn / Merge                                705 (10.3%)   52 (2.88%)  0.25 [0.17;0.35] <0.001            
##     Crossing / Parallel Paths                             962 (14.0%)  291 (16.1%)  1.01 [0.79;1.30]  0.947            
##     Loss of Control / Turning Error                       306 (4.45%)   45 (2.49%)  0.49 [0.33;0.72] <0.001            
##     Motorist Failed to Yield                              188 (2.73%)   53 (2.94%)  0.94 [0.64;1.36]  0.753            
##     Motorist Turn / Merge                                 430 (6.25%)   53 (2.94%)  0.41 [0.29;0.59] <0.001            
##     Other / Unknown / Unusual                             402 (5.85%)  190 (10.5%)  1.58 [1.20;2.08]  0.001            
##     Overtaking                                            1756 (25.5%) 848 (47.0%)  1.61 [1.28;2.04] <0.001            
## Peda:                                                                                                         <0.001   
##     0_4                                                    33 (0.48%)   6 (0.33%)         Ref.        Ref.             
##     10_14                                                 245 (3.56%)   37 (2.05%)  0.82 [0.34;2.31]  0.679            
##     15_20                                                 390 (5.67%)  102 (5.65%)  1.41 [0.61;3.86]  0.442            
##     21_24                                                 231 (3.36%)   69 (3.82%)  1.61 [0.69;4.46]  0.289            
##     25_34                                                 764 (11.1%)  240 (13.3%)  1.69 [0.75;4.58]  0.220            
##     35_44                                                 917 (13.3%)  309 (17.1%)  1.81 [0.80;4.90]  0.160            
##     45_54                                                 1155 (16.8%) 348 (19.3%)  1.62 [0.72;4.38]  0.259            
##     5_9                                                    96 (1.40%)   11 (0.61%)  0.63 [0.22;1.98]  0.408            
##     55_64                                                 1538 (22.4%) 440 (24.4%)  1.54 [0.69;4.15]  0.315            
##     65_74                                                 932 (13.6%)  160 (8.87%)  0.92 [0.41;2.51]  0.865            
##     75+                                                   481 (7.00%)   55 (3.05%)  0.62 [0.26;1.72]  0.329            
##     Not Reported                                           93 (1.35%)   27 (1.50%)  1.57 [0.62;4.57]  0.355            
## Psl:                                                                                                          <0.001   
##     25_or_less                                            763 (11.1%)  187 (10.4%)        Ref.        Ref.             
##     30_35                                                 1695 (24.7%) 509 (28.2%)  1.22 [1.02;1.48]  0.033            
##     40_45                                                 2262 (32.9%) 564 (31.3%)  1.02 [0.85;1.23]  0.859            
##     50_55                                                 1456 (21.2%) 325 (18.0%)  0.91 [0.75;1.11]  0.360            
##     Not Reported                                          319 (4.64%)  153 (8.48%)  1.96 [1.52;2.51] <0.001            
##     Other                                                 380 (5.53%)   66 (3.66%)  0.71 [0.52;0.96]  0.026            
## Pcra:                                                                                                         <0.001   
##     Going Straight                                        5350 (77.8%) 1383 (76.7%)       Ref.        Ref.             
##     Negotiating a Curve                                   387 (5.63%)   93 (5.16%)  0.93 [0.73;1.17]  0.546            
##     Not Reported                                           27 (0.39%)  177 (9.81%)  25.2 [17.0;38.8]  0.000            
##     Other                                                 341 (4.96%)   67 (3.71%)  0.76 [0.58;0.99]  0.041            
##     Turning Left                                          380 (5.53%)   46 (2.55%)  0.47 [0.34;0.63] <0.001            
##     Turning Right                                         390 (5.67%)   38 (2.11%)  0.38 [0.27;0.52] <0.001            
## Trav:                                                                                                         <0.001   
##     035 MPH                                               337 (4.90%)   52 (2.88%)        Ref.        Ref.             
##     040 MPH                                               360 (5.24%)   52 (2.88%)  0.94 [0.62;1.42]  0.754            
##     045 MPH                                               497 (7.23%)  120 (6.65%)  1.56 [1.10;2.24]  0.012            
##     055 MPH                                               306 (4.45%)   53 (2.94%)  1.12 [0.74;1.70]  0.585            
##     Not Reported                                          3689 (53.7%) 1309 (72.6%) 2.29 [1.72;3.13] <0.001            
##     Other                                                 1686 (24.5%) 218 (12.1%)  0.84 [0.61;1.17]  0.287            
## Vtra:                                                                                                          0.020   
##     One-Way Trafficway                                    190 (2.76%)   75 (4.16%)        Ref.        Ref.             
##     Other                                                 296 (4.31%)   73 (4.05%)  0.63 [0.43;0.91]  0.013            
##     Two-Way,  Divided, Positive  Median Barrier           393 (5.72%)   93 (5.16%)  0.60 [0.42;0.85]  0.005            
##     Two-Way, Divided, Unprotected Median                  1581 (23.0%) 383 (21.2%)  0.61 [0.46;0.82]  0.001            
##     Two-Way, Not Divided                                  3667 (53.3%) 966 (53.5%)  0.67 [0.51;0.88]  0.005            
##     Two-Way, Not Divided With a Continuous Left-Turn Lane 748 (10.9%)  214 (11.9%)  0.72 [0.53;0.99]  0.043            
## Vpro:                                                                                                         <0.001   
##     Downhill                                              184 (2.68%)   44 (2.44%)        Ref.        Ref.             
##     Grade, Unknown Slope                                  375 (5.45%)   92 (5.10%)  1.02 [0.69;1.54]  0.906            
##     Level                                                 5166 (75.1%) 1314 (72.8%) 1.06 [0.77;1.50]  0.729            
##     Not Reported                                          737 (10.7%)  261 (14.5%)  1.48 [1.04;2.14]  0.029            
##     Other                                                 171 (2.49%)   31 (1.72%)  0.76 [0.45;1.26]  0.285            
##     Uphill                                                242 (3.52%)   62 (3.44%)  1.07 [0.70;1.66]  0.758            
## Ruru:                                                                                                         <0.001   
##     Not Reported                                           29 (0.42%)   4 (0.22%)         Ref.        Ref.             
##     Rural                                                 1430 (20.8%) 267 (14.8%)      . [.;.]         .              
##     Trafficway Not in State Inventory                      12 (0.17%)   0 (0.00%)       . [.;.]         .              
##     Urban                                                 5404 (78.6%) 1533 (85.0%)     . [.;.]         .              
## Mdrd:                                                                                                         <0.001   
##     Distraction/Inattention                                42 (0.61%)   5 (0.28%)         Ref.        Ref.             
##     Inattention (Inattentive), Details Unknown             79 (1.15%)   21 (1.16%)  2.18 [0.81;7.04]  0.128            
##     Not Distracted                                        1506 (21.9%) 152 (8.43%)  0.83 [0.35;2.45]  0.699            
##     Not Reported                                          5003 (72.8%) 1510 (83.7%) 2.47 [1.07;7.25]  0.033            
##     Other                                                 156 (2.27%)   26 (1.44%)  1.37 [0.53;4.32]  0.540            
##     Reported as Unknown if Distracted                      89 (1.29%)   90 (4.99%)  8.22 [3.37;25.0] <0.001            
## Dayt:                                                                                                         <0.001   
##     Weekday                                               5066 (73.7%) 1161 (64.4%)       Ref.        Ref.             
##     Weekend                                               1809 (26.3%) 643 (35.6%)  1.55 [1.39;1.73] <0.001            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Bookkeeping of the two selection steps; the outcome Hitr is never counted.

# Candidate predictors that entered the first compareGroups table
vars_dat2 <- setdiff(names(dat2), "Hitr")

# Survivors of the p-value screen, and the predictors it removed
vars_after_p <- sig_vars
removed_by_pvalue <- setdiff(vars_dat2, vars_after_p)

# Survivors of the RF/XGBoost selection, and the predictors it removed
vars_final <- setdiff(names(dat_final), "Hitr")
removed_by_importance <- setdiff(vars_after_p, vars_final)

# One row per step with the number of variables involved
selection_summary <- data.frame(
  Step = c(
    "Starting variables in dat2",
    "Kept after p-value <= 0.05",
    "Removed by p-value",
    "Kept in final top 15 common RF/XGBoost",
    "Removed after variable importance"
  ),
  N = lengths(
    list(
      vars_dat2,
      vars_after_p,
      removed_by_pvalue,
      vars_final,
      removed_by_importance
    ),
    use.names = FALSE
  )
)

selection_summary
##                                     Step  N
## 1             Starting variables in dat2 38
## 2             Kept after p-value <= 0.05 19
## 3                     Removed by p-value 19
## 4 Kept in final top 15 common RF/XGBoost 15
## 5      Removed after variable importance  4
# Predictors dropped at the p-value screening step.
removed_by_pvalue
##  [1] "Weat"  "Lgtc"  "Func"  "Pbse"  "Bike"  "Motm"  "Bike1" "Hazi"  "Busu" 
## [10] "Vnum"  "Vpav"  "Vsur"  "Ltyp"  "Cdls"  "Mdrm"  "Drag"  "Veht"  "Bike2"
## [19] "Acct"
# Predictors that passed the p-value screen but are not in the final set.
removed_by_importance
## [1] "Emer" "Unde" "Roll" "Vali"
# Predictors kept in the final data set.
vars_final
##  [1] "Sexn"  "Lsta"  "Drin"  "Defo"  "Drim"  "Bike3" "Peda"  "Psl"   "Pcra" 
## [10] "Trav"  "Vtra"  "Vpro"  "Ruru"  "Mdrd"  "Dayt"