Modelling

Classification

For the classification task, our primary objective was to predict the binary outcome of whether a traffic accident resulted in an injury. The feature selection process began with an initial set of predictors chosen from our exploratory analysis, including first_crash_type, traffic_control_device, weather_condition, and lighting_condition. To enhance the model’s predictive capabilities, we then engineered several new, more descriptive features. The numeric crash_hour_of_day was transformed into a categorical time_of_day feature (e.g., “Morning Rush”, “Night”) to better capture time-based traffic patterns. Similarly, crash_day_of_week was simplified into a binary is_weekend feature to distinguish between weekday and weekend driving behaviour.

if (!require("caret")) install.packages("caret", dependencies = TRUE)
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
if (!require("rpart")) install.packages("rpart")
## Loading required package: rpart
if (!require("rpart.plot")) install.packages("rpart.plot")
## Loading required package: rpart.plot
if (!require("randomForest")) install.packages("randomForest")
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
if (!require("xgboost")) install.packages("xgboost")
## Loading required package: xgboost
suppressPackageStartupMessages({
  library(caret)
  library(rpart)
  library(rpart.plot)
  library(randomForest)
  library(xgboost)
  library(dplyr)
  library(lubridate)
})
accidents <- read.csv("traffic_accidents.csv")
accidents <- accidents %>% filter(!is.na(crash_date))
glimpse(accidents)
## Rows: 209,306
## Columns: 24
## $ crash_date                    <chr> "07/29/2023 01:00:00 PM", "08/13/2023 12…
## $ traffic_control_device        <chr> "TRAFFIC SIGNAL", "TRAFFIC SIGNAL", "TRA…
## $ weather_condition             <chr> "CLEAR", "CLEAR", "CLEAR", "CLEAR", "CLE…
## $ lighting_condition            <chr> "DAYLIGHT", "DARKNESS, LIGHTED ROAD", "D…
## $ first_crash_type              <chr> "TURNING", "TURNING", "REAR END", "ANGLE…
## $ trafficway_type               <chr> "NOT DIVIDED", "FOUR WAY", "T-INTERSECTI…
## $ alignment                     <chr> "STRAIGHT AND LEVEL", "STRAIGHT AND LEVE…
## $ roadway_surface_cond          <chr> "UNKNOWN", "DRY", "DRY", "DRY", "UNKNOWN…
## $ road_defect                   <chr> "UNKNOWN", "NO DEFECTS", "NO DEFECTS", "…
## $ crash_type                    <chr> "NO INJURY / DRIVE AWAY", "NO INJURY / D…
## $ intersection_related_i        <chr> "Y", "Y", "Y", "Y", "Y", "N", "Y", "Y", …
## $ damage                        <chr> "$501 - $1,500", "OVER $1,500", "$501 - …
## $ prim_contributory_cause       <chr> "UNABLE TO DETERMINE", "IMPROPER TURNING…
## $ num_units                     <int> 2, 2, 3, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2…
## $ most_severe_injury            <chr> "NO INDICATION OF INJURY", "NO INDICATIO…
## $ injuries_total                <dbl> 0, 0, 0, 5, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0…
## $ injuries_fatal                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ injuries_incapacitating       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ injuries_non_incapacitating   <dbl> 0, 0, 0, 5, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0…
## $ injuries_reported_not_evident <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ injuries_no_indication        <dbl> 3, 2, 3, 0, 3, 0, 2, 1, 3, 4, 1, 2, 2, 2…
## $ crash_hour                    <int> 13, 0, 10, 19, 14, 0, 11, 14, 18, 17, 20…
## $ crash_day_of_week             <int> 7, 1, 5, 4, 7, 4, 3, 4, 2, 5, 5, 4, 5, 4…
## $ crash_month                   <int> 7, 8, 12, 8, 8, 9, 12, 9, 6, 9, 9, 11, 7…
# Convert date to proper datetime format
accidents <- accidents %>%
  mutate(crash_date = parse_date_time(crash_date, orders = c("mdy HM", "mdy HMS", "ymd HM")))

# Extract additional time features
accidents <- accidents %>%
  mutate(
    crash_hour_of_day = hour(crash_date),
    crash_day_of_week = wday(crash_date, label = TRUE, abbr = FALSE),
    crash_month = month(crash_date, label = TRUE, abbr = FALSE),
    crash_year = year(crash_date)
  )

# Clean up categorical variables by converting to factors
accidents <- accidents %>%
  mutate(
    traffic_control_device = factor(traffic_control_device),
    weather_condition = factor(weather_condition),
    lighting_condition = factor(lighting_condition),
    first_crash_type = factor(first_crash_type),
    trafficway_type = factor(trafficway_type),
    alignment = factor(alignment),
    roadway_surface_cond = factor(roadway_surface_cond),
    road_defect = factor(road_defect),
    crash_type = factor(crash_type),
    intersection_related_i = factor(intersection_related_i),
    damage = factor(damage, levels = c("$500 OR LESS", "$501 - $1,500", "OVER $1,500")),
    prim_contributory_cause = factor(prim_contributory_cause),
    most_severe_injury = factor(most_severe_injury)
  )

# Check for missing values
colSums(is.na(accidents))
##                    crash_date        traffic_control_device 
##                             0                             0 
##             weather_condition            lighting_condition 
##                             0                             0 
##              first_crash_type               trafficway_type 
##                             0                             0 
##                     alignment          roadway_surface_cond 
##                             0                             0 
##                   road_defect                    crash_type 
##                             0                             0 
##        intersection_related_i                        damage 
##                             0                             0 
##       prim_contributory_cause                     num_units 
##                             0                             0 
##            most_severe_injury                injuries_total 
##                             0                             0 
##                injuries_fatal       injuries_incapacitating 
##                             0                             0 
##   injuries_non_incapacitating injuries_reported_not_evident 
##                             0                             0 
##        injuries_no_indication                    crash_hour 
##                             0                             0 
##             crash_day_of_week                   crash_month 
##                             0                             0 
##             crash_hour_of_day                    crash_year 
##                             0                             0
# Handle missing values if necessary (example for one column)
# accidents$roadway_surface_cond[is.na(accidents$roadway_surface_cond)] <- "UNKNOWN"

# Create severity indicator
accidents <- accidents %>%
  mutate(
    severity = case_when(
      injuries_fatal > 0 ~ "Fatal",
      injuries_incapacitating > 0 ~ "Severe",
      injuries_non_incapacitating > 0 ~ "Moderate",
      injuries_reported_not_evident > 0 ~ "Minor",
      TRUE ~ "No_Injury"
    ),
    severity = factor(severity, levels = c("No_Injury", "Minor", "Moderate", "Severe", "Fatal"))
  )

Label encoding is used here to convert categorical (factor) variables into numerical representations. Since most of the machine learning algorithms require numerical input and cannot directly process text-based categorical data. By converting factors to numeric codes, we enable these algorithms to work with our dataset while preserving the distinct categories.

cat("\n--- Label Encoding for Categorical (Factor) Columns ---\n")
## 
## --- Label Encoding for Categorical (Factor) Columns ---
categorical_columns_to_encode <- names(accidents)[sapply(accidents, is.factor) & names(accidents) != "crash_type"]

original_factor_levels <- list()

for (col_name in categorical_columns_to_encode) {
  cat(paste0("\nColumn: ", col_name, " Encoding and Decoding:\n"))

  # Store original levels
  original_factor_levels[[col_name]] <- levels(accidents[[col_name]])

  # Convert factor to numeric (this uses the internal integer codes)
  # The first level will be 1, second 2, etc.
  accidents[[col_name]] <- as.numeric(accidents[[col_name]])

  # Print the mapping: original level -> new numeric code
  for (i in seq_along(original_factor_levels[[col_name]])) {
    cat(paste0(original_factor_levels[[col_name]][i], " -> ", i, "\n"))
  }
}
## 
## Column: traffic_control_device Encoding and Decoding:
## BICYCLE CROSSING SIGN -> 1
## DELINEATORS -> 2
## FLASHING CONTROL SIGNAL -> 3
## LANE USE MARKING -> 4
## NO CONTROLS -> 5
## NO PASSING -> 6
## OTHER -> 7
## OTHER RAILROAD CROSSING -> 8
## OTHER REG. SIGN -> 9
## OTHER WARNING SIGN -> 10
## PEDESTRIAN CROSSING SIGN -> 11
## POLICE/FLAGMAN -> 12
## RAILROAD CROSSING GATE -> 13
## RR CROSSING SIGN -> 14
## SCHOOL ZONE -> 15
## STOP SIGN/FLASHER -> 16
## TRAFFIC SIGNAL -> 17
## UNKNOWN -> 18
## YIELD -> 19
## 
## Column: weather_condition Encoding and Decoding:
## BLOWING SAND, SOIL, DIRT -> 1
## BLOWING SNOW -> 2
## CLEAR -> 3
## CLOUDY/OVERCAST -> 4
## FOG/SMOKE/HAZE -> 5
## FREEZING RAIN/DRIZZLE -> 6
## OTHER -> 7
## RAIN -> 8
## SEVERE CROSS WIND GATE -> 9
## SLEET/HAIL -> 10
## SNOW -> 11
## UNKNOWN -> 12
## 
## Column: lighting_condition Encoding and Decoding:
## DARKNESS -> 1
## DARKNESS, LIGHTED ROAD -> 2
## DAWN -> 3
## DAYLIGHT -> 4
## DUSK -> 5
## UNKNOWN -> 6
## 
## Column: first_crash_type Encoding and Decoding:
## ANGLE -> 1
## ANIMAL -> 2
## FIXED OBJECT -> 3
## HEAD ON -> 4
## OTHER NONCOLLISION -> 5
## OTHER OBJECT -> 6
## OVERTURNED -> 7
## PARKED MOTOR VEHICLE -> 8
## PEDALCYCLIST -> 9
## PEDESTRIAN -> 10
## REAR END -> 11
## REAR TO FRONT -> 12
## REAR TO REAR -> 13
## REAR TO SIDE -> 14
## SIDESWIPE OPPOSITE DIRECTION -> 15
## SIDESWIPE SAME DIRECTION -> 16
## TRAIN -> 17
## TURNING -> 18
## 
## Column: trafficway_type Encoding and Decoding:
## ALLEY -> 1
## CENTER TURN LANE -> 2
## DIVIDED - W/MEDIAN (NOT RAISED) -> 3
## DIVIDED - W/MEDIAN BARRIER -> 4
## DRIVEWAY -> 5
## FIVE POINT, OR MORE -> 6
## FOUR WAY -> 7
## L-INTERSECTION -> 8
## NOT DIVIDED -> 9
## NOT REPORTED -> 10
## ONE-WAY -> 11
## OTHER -> 12
## PARKING LOT -> 13
## RAMP -> 14
## ROUNDABOUT -> 15
## T-INTERSECTION -> 16
## TRAFFIC ROUTE -> 17
## UNKNOWN -> 18
## UNKNOWN INTERSECTION TYPE -> 19
## Y-INTERSECTION -> 20
## 
## Column: alignment Encoding and Decoding:
## CURVE ON GRADE -> 1
## CURVE ON HILLCREST -> 2
## CURVE, LEVEL -> 3
## STRAIGHT AND LEVEL -> 4
## STRAIGHT ON GRADE -> 5
## STRAIGHT ON HILLCREST -> 6
## 
## Column: roadway_surface_cond Encoding and Decoding:
## DRY -> 1
## ICE -> 2
## OTHER -> 3
## SAND, MUD, DIRT -> 4
## SNOW OR SLUSH -> 5
## UNKNOWN -> 6
## WET -> 7
## 
## Column: road_defect Encoding and Decoding:
## DEBRIS ON ROADWAY -> 1
## NO DEFECTS -> 2
## OTHER -> 3
## RUT, HOLES -> 4
## SHOULDER DEFECT -> 5
## UNKNOWN -> 6
## WORN SURFACE -> 7
## 
## Column: intersection_related_i Encoding and Decoding:
## N -> 1
## Y -> 2
## 
## Column: damage Encoding and Decoding:
## $500 OR LESS -> 1
## $501 - $1,500 -> 2
## OVER $1,500 -> 3
## 
## Column: prim_contributory_cause Encoding and Decoding:
## ANIMAL -> 1
## BICYCLE ADVANCING LEGALLY ON RED LIGHT -> 2
## CELL PHONE USE OTHER THAN TEXTING -> 3
## DISREGARDING OTHER TRAFFIC SIGNS -> 4
## DISREGARDING ROAD MARKINGS -> 5
## DISREGARDING STOP SIGN -> 6
## DISREGARDING TRAFFIC SIGNALS -> 7
## DISREGARDING YIELD SIGN -> 8
## DISTRACTION - FROM INSIDE VEHICLE -> 9
## DISTRACTION - FROM OUTSIDE VEHICLE -> 10
## DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.) -> 11
## DRIVING ON WRONG SIDE/WRONG WAY -> 12
## DRIVING SKILLS/KNOWLEDGE/EXPERIENCE -> 13
## EQUIPMENT - VEHICLE CONDITION -> 14
## EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST -> 15
## EXCEEDING AUTHORIZED SPEED LIMIT -> 16
## EXCEEDING SAFE SPEED FOR CONDITIONS -> 17
## FAILING TO REDUCE SPEED TO AVOID CRASH -> 18
## FAILING TO YIELD RIGHT-OF-WAY -> 19
## FOLLOWING TOO CLOSELY -> 20
## HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE) -> 21
## IMPROPER BACKING -> 22
## IMPROPER LANE USAGE -> 23
## IMPROPER OVERTAKING/PASSING -> 24
## IMPROPER TURNING/NO SIGNAL -> 25
## MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT -> 26
## NOT APPLICABLE -> 27
## OBSTRUCTED CROSSWALKS -> 28
## OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER -> 29
## PASSING STOPPED SCHOOL BUS -> 30
## PHYSICAL CONDITION OF DRIVER -> 31
## RELATED TO BUS STOP -> 32
## ROAD CONSTRUCTION/MAINTENANCE -> 33
## ROAD ENGINEERING/SURFACE/MARKING DEFECTS -> 34
## TEXTING -> 35
## TURNING RIGHT ON RED -> 36
## UNABLE TO DETERMINE -> 37
## UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED) -> 38
## VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.) -> 39
## WEATHER -> 40
## 
## Column: most_severe_injury Encoding and Decoding:
## FATAL -> 1
## INCAPACITATING INJURY -> 2
## NO INDICATION OF INJURY -> 3
## NONINCAPACITATING INJURY -> 4
## REPORTED, NOT EVIDENT -> 5
## 
## Column: crash_day_of_week Encoding and Decoding:
## Sunday -> 1
## Monday -> 2
## Tuesday -> 3
## Wednesday -> 4
## Thursday -> 5
## Friday -> 6
## Saturday -> 7
## 
## Column: crash_month Encoding and Decoding:
## January -> 1
## February -> 2
## March -> 3
## April -> 4
## May -> 5
## June -> 6
## July -> 7
## August -> 8
## September -> 9
## October -> 10
## November -> 11
## December -> 12
## 
## Column: severity Encoding and Decoding:
## No_Injury -> 1
## Minor -> 2
## Moderate -> 3
## Severe -> 4
## Fatal -> 5
cat("---\n")
## ---
accidents <- accidents %>% select(-'crash_date')

X <- accidents %>% select(-'crash_type')
y <- accidents[['crash_type']]

Before modeling, we examined the initial distribution of the target variable, crash_type, which captures whether an accident resulted in injury or required a tow. The dataset comprised 91,930 cases (43.9%) labeled as “INJURY AND / OR TOW DUE TO CRASH”, and 117,376 cases (56.1%) labeled as “NO INJURY / DRIVE AWAY”. This indicates a moderately imbalanced class distribution, with non-injury cases slightly more prevalent.

cat("\n--- Initial Class Distribution of 'crash_type' ---\n")
## 
## --- Initial Class Distribution of 'crash_type' ---
print(table(y))
## y
## INJURY AND / OR TOW DUE TO CRASH           NO INJURY / DRIVE AWAY 
##                            91930                           117376
cat("\n--- Initial Class Proportion of 'crash_type' ---\n")
## 
## --- Initial Class Proportion of 'crash_type' ---
print(prop.table(table(y)))
## y
## INJURY AND / OR TOW DUE TO CRASH           NO INJURY / DRIVE AWAY 
##                        0.4392134                        0.5607866

Before proceeding with model training, we inspected the target variable crash_type and observed that its factor levels contained characters such as slashes (/) and spaces, which can cause issues during model fitting or result interpretation—especially when using algorithms or libraries that require syntactically valid variable names. To address this, we renamed the factor levels by first replacing spaces with underscores and then applying make.names() to ensure all labels were syntactically valid.

current_target_levels <- levels(accidents[['crash_type']])

levels_with_underscores <- gsub(" ", "_", current_target_levels)

valid_target_levels <- make.names(levels_with_underscores)

accidents[['crash_type']] <- factor(accidents[['crash_type']],
                                       levels = current_target_levels,
                                       labels = valid_target_levels)

y <- accidents[['crash_type']]
cat("\n--- Initial Class Distribution of 'crash_type' ---\n")
## 
## --- Initial Class Distribution of 'crash_type' ---
print(table(y))
## y
## INJURY_AND_._OR_TOW_DUE_TO_CRASH           NO_INJURY_._DRIVE_AWAY 
##                            91930                           117376
cat("\n--- Initial Class Proportion of 'crash_type' ---\n")
## 
## --- Initial Class Proportion of 'crash_type' ---
print(prop.table(table(y)))
## y
## INJURY_AND_._OR_TOW_DUE_TO_CRASH           NO_INJURY_._DRIVE_AWAY 
##                        0.4392134                        0.5607866

Data Splitting

set.seed(42)

train_index <- createDataPartition(y, p = 0.7, list = FALSE, times = 1)

X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]

train_data <- data.frame(X_train, crash_type = y_train)
plot_confusion_matrix_heatmap <- function(conf_matrix, model_name) {
  cm_table <- as.table(conf_matrix)
  cm_df <- as.data.frame(cm_table)
  names(cm_df) <- c("Prediction", "Reference", "Frequency")

  ggplot(data = cm_df, aes(x = Reference, y = Prediction, fill = Frequency)) +
    geom_tile(color = "white") +
    scale_fill_gradient(low = "white", high = "steelblue") +
    geom_text(aes(label = sprintf("%d", Frequency)), vjust = 1, color = "black") +
    labs(
      title = paste("Confusion Matrix ", model_name),
      x = "Actual",
      y = "Predicted"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
      axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
      axis.text.y = element_text(size = 10),
      axis.title = element_text(size = 12),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      legend.position = "right"
    ) +
    coord_fixed()
}

We trained two different classification models and evaluated them on a separate test set to find the most effective one. The comparison focused on three key metrics: overall accuracy, sensitivity(recall), and F1 score which is a fairer measure when dealing with imbalanced data.

Random Forest Model

train_control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = multiClassSummary,

)
set.seed(42)
rf_model <- train(
  crash_type ~ .,
  data = train_data,
  method = "rf",
  trControl = train_control,
  tuneLength = 3
)
cat("--- Random Forest Model Summary ---\n")
## --- Random Forest Model Summary ---
print(rf_model)
## Random Forest 
## 
## 146515 samples
##     25 predictor
##      2 classes: 'INJURY_AND_._OR_TOW_DUE_TO_CRASH', 'NO_INJURY_._DRIVE_AWAY' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 117213, 117212, 117212, 117212, 117211 
## Resampling results across tuning parameters:
## 
##   mtry  logLoss    AUC        prAUC      Accuracy   Kappa      F1       
##    2    0.5471718  0.9114362  0.7470384  0.8337508  0.6491500  0.7704025
##   13    0.3400071  0.9187443  0.6079218  0.8394226  0.6696509  0.8053004
##   25    0.3466631  0.9170241  0.5971850  0.8378733  0.6667464  0.8042275
##   Sensitivity  Specificity  Pos_Pred_Value  Neg_Pred_Value  Precision
##   0.6350951    0.9893384    0.9790172       0.7758796       0.9790172
##   0.7561189    0.9046663    0.8613374       0.8256784       0.8613374
##   0.7582167    0.9002605    0.8561954       0.8262183       0.8561954
##   Recall     Detection_Rate  Balanced_Accuracy
##   0.6350951  0.2789407       0.8122168        
##   0.7561189  0.3320957       0.8303926        
##   0.7582167  0.3330171       0.8292386        
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 13.
y_pred <- predict(rf_model, newdata = X_test)

rf_conf_matrix <- confusionMatrix(y_pred, y_test, mode = "everything")

cat("\n--- Random Forest Model Evaluation: Classification Report ---\n")
## 
## --- Random Forest Model Evaluation: Classification Report ---
print(rf_conf_matrix)
## Confusion Matrix and Statistics
## 
##                                   Reference
## Prediction                         INJURY_AND_._OR_TOW_DUE_TO_CRASH
##   INJURY_AND_._OR_TOW_DUE_TO_CRASH                            20866
##   NO_INJURY_._DRIVE_AWAY                                       6713
##                                   Reference
## Prediction                         NO_INJURY_._DRIVE_AWAY
##   INJURY_AND_._OR_TOW_DUE_TO_CRASH                   3480
##   NO_INJURY_._DRIVE_AWAY                            31732
##                                                           
##                Accuracy : 0.8377                          
##                  95% CI : (0.8348, 0.8405)                
##     No Information Rate : 0.5608                          
##     P-Value [Acc > NIR] : < 2.2e-16                       
##                                                           
##                   Kappa : 0.6662                          
##                                                           
##  Mcnemar's Test P-Value : < 2.2e-16                       
##                                                           
##             Sensitivity : 0.7566                          
##             Specificity : 0.9012                          
##          Pos Pred Value : 0.8571                          
##          Neg Pred Value : 0.8254                          
##               Precision : 0.8571                          
##                  Recall : 0.7566                          
##                      F1 : 0.8037                          
##              Prevalence : 0.4392                          
##          Detection Rate : 0.3323                          
##    Detection Prevalence : 0.3877                          
##       Balanced Accuracy : 0.8289                          
##                                                           
##        'Positive' Class : INJURY_AND_._OR_TOW_DUE_TO_CRASH
## 
plot_confusion_matrix_heatmap(rf_conf_matrix, "(Random Forest)")

XGBoost Model

X_train_matrix <- model.matrix(~ . -1, data = X_train)
X_test_matrix <- model.matrix(~ . -1, data = X_test)


y_train_bin <- as.numeric(y_train) - 1
y_test_bin <- as.numeric(y_test) - 1

dtrain <- xgb.DMatrix(data = X_train_matrix, label = y_train_bin)
dtest <- xgb.DMatrix(data = X_test_matrix, label = y_test_bin)
params <- list(
  objective = "binary:logistic",
  eval_metric = "logloss"
)

xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  watchlist = list(train = dtrain),
  verbose = 1
)
## [1]  train-logloss:0.547584 
## [2]  train-logloss:0.467789 
## [3]  train-logloss:0.419169 
## [4]  train-logloss:0.387830 
## [5]  train-logloss:0.366302 
## [6]  train-logloss:0.351629 
## [7]  train-logloss:0.341321 
## [8]  train-logloss:0.333911 
## [9]  train-logloss:0.328446 
## [10] train-logloss:0.324356 
## [11] train-logloss:0.321032 
## [12] train-logloss:0.318676 
## [13] train-logloss:0.316785 
## [14] train-logloss:0.314899 
## [15] train-logloss:0.313734 
## [16] train-logloss:0.312740 
## [17] train-logloss:0.311817 
## [18] train-logloss:0.310911 
## [19] train-logloss:0.310186 
## [20] train-logloss:0.309508 
## [21] train-logloss:0.308978 
## [22] train-logloss:0.308284 
## [23] train-logloss:0.307944 
## [24] train-logloss:0.307539 
## [25] train-logloss:0.307181 
## [26] train-logloss:0.306570 
## [27] train-logloss:0.306257 
## [28] train-logloss:0.305557 
## [29] train-logloss:0.305248 
## [30] train-logloss:0.304535 
## [31] train-logloss:0.304242 
## [32] train-logloss:0.303759 
## [33] train-logloss:0.303426 
## [34] train-logloss:0.303032 
## [35] train-logloss:0.302747 
## [36] train-logloss:0.302373 
## [37] train-logloss:0.302179 
## [38] train-logloss:0.301790 
## [39] train-logloss:0.301169 
## [40] train-logloss:0.300843 
## [41] train-logloss:0.300361 
## [42] train-logloss:0.299990 
## [43] train-logloss:0.299623 
## [44] train-logloss:0.299125 
## [45] train-logloss:0.298876 
## [46] train-logloss:0.298467 
## [47] train-logloss:0.298294 
## [48] train-logloss:0.298053 
## [49] train-logloss:0.297867 
## [50] train-logloss:0.297542 
## [51] train-logloss:0.297392 
## [52] train-logloss:0.296869 
## [53] train-logloss:0.296544 
## [54] train-logloss:0.296154 
## [55] train-logloss:0.295736 
## [56] train-logloss:0.295417 
## [57] train-logloss:0.295328 
## [58] train-logloss:0.294973 
## [59] train-logloss:0.294491 
## [60] train-logloss:0.294201 
## [61] train-logloss:0.293883 
## [62] train-logloss:0.293696 
## [63] train-logloss:0.293350 
## [64] train-logloss:0.293144 
## [65] train-logloss:0.292838 
## [66] train-logloss:0.292609 
## [67] train-logloss:0.292371 
## [68] train-logloss:0.292072 
## [69] train-logloss:0.291838 
## [70] train-logloss:0.291621 
## [71] train-logloss:0.291272 
## [72] train-logloss:0.291091 
## [73] train-logloss:0.290759 
## [74] train-logloss:0.290526 
## [75] train-logloss:0.290355 
## [76] train-logloss:0.290271 
## [77] train-logloss:0.290132 
## [78] train-logloss:0.289889 
## [79] train-logloss:0.289739 
## [80] train-logloss:0.289632 
## [81] train-logloss:0.289268 
## [82] train-logloss:0.288967 
## [83] train-logloss:0.288630 
## [84] train-logloss:0.288518 
## [85] train-logloss:0.288248 
## [86] train-logloss:0.288108 
## [87] train-logloss:0.288058 
## [88] train-logloss:0.287947 
## [89] train-logloss:0.287733 
## [90] train-logloss:0.287466 
## [91] train-logloss:0.287303 
## [92] train-logloss:0.287028 
## [93] train-logloss:0.286882 
## [94] train-logloss:0.286717 
## [95] train-logloss:0.286430 
## [96] train-logloss:0.286103 
## [97] train-logloss:0.285827 
## [98] train-logloss:0.285619 
## [99] train-logloss:0.285357 
## [100]    train-logloss:0.285123
xgb_probs <- predict(xgb_model, newdata = dtest)

xgb_preds <- ifelse(xgb_probs > 0.5, 1, 0)

predicted_factor <- factor(xgb_preds, levels = c(0,1), labels = levels(y_test))
actual_factor <- factor(y_test_bin, levels = c(0,1), labels = levels(y_test))
xgb_conf_matrix <- confusionMatrix(predicted_factor, actual_factor, mode = "everything")
cat("\n--- XGBoost Model Evaluation: Classification Report ---\n")
## 
## --- XGBoost Model Evaluation: Classification Report ---
print(xgb_conf_matrix)
## Confusion Matrix and Statistics
## 
##                                   Reference
## Prediction                         INJURY_AND_._OR_TOW_DUE_TO_CRASH
##   INJURY_AND_._OR_TOW_DUE_TO_CRASH                            20718
##   NO_INJURY_._DRIVE_AWAY                                       6861
##                                   Reference
## Prediction                         NO_INJURY_._DRIVE_AWAY
##   INJURY_AND_._OR_TOW_DUE_TO_CRASH                   2894
##   NO_INJURY_._DRIVE_AWAY                            32318
##                                                           
##                Accuracy : 0.8446                          
##                  95% CI : (0.8418, 0.8475)                
##     No Information Rate : 0.5608                          
##     P-Value [Acc > NIR] : < 2.2e-16                       
##                                                           
##                   Kappa : 0.6796                          
##                                                           
##  Mcnemar's Test P-Value : < 2.2e-16                       
##                                                           
##             Sensitivity : 0.7512                          
##             Specificity : 0.9178                          
##          Pos Pred Value : 0.8774                          
##          Neg Pred Value : 0.8249                          
##               Precision : 0.8774                          
##                  Recall : 0.7512                          
##                      F1 : 0.8094                          
##              Prevalence : 0.4392                          
##          Detection Rate : 0.3300                          
##    Detection Prevalence : 0.3760                          
##       Balanced Accuracy : 0.8345                          
##                                                           
##        'Positive' Class : INJURY_AND_._OR_TOW_DUE_TO_CRASH
## 
plot_confusion_matrix_heatmap(xgb_conf_matrix, "XGBoost")

Between the two models evaluated, XGBoost slightly outperformed Random Forest in terms of overall classification performance. XGBoost achieved a higher accuracy (84.5%) compared to Random Forest’s 83.8%, and also recorded a better F1 score (0.8094 vs. 0.8037), indicating a more balanced performance between precision and recall. Although Random Forest showed marginally higher recall (75.7% vs. 75.1%), XGBoost’s superior F1 score and accuracy suggest it is the more effective model overall for predicting injury-related crashes.