Pre_Crash_Mode Classification Analysis

Task 1: Developing Several Machine Learning Models to Classify Pre_Crash_Mode from the Structured Data (without text or crash narrative column)

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# Load necessary libraries
library(caret)
library(dplyr)
library(randomForest)
library(e1071)
library(gbm)
library(ggplot2)

Dataset Loading

# Read the crash-report sample from disk. The path is machine-specific:
# point it at your own copy of the CSV (or make it relative to the working
# directory) before running.
crash_csv <- "C:/Users/amlan/Downloads/CA_AV_CrashReportsSample.csv"
data <- read.csv(file = crash_csv)

Data Preprocessing

# Columns excluded from the structured-data models (Task 1 deliberately
# leaves out the crash narrative). all_of() fails loudly if any name is
# missing, just as the original bare-name selection did.
drop_cols <- c(
  "CrashNarrative", "File_name", "Manufacturer_Name", "Business_Name",
  "Date", "Time", "Location_Street", "Location_City", "Location_County",
  "Location_State", "Location_Zip", "Latitude", "Longitude",
  "Name_Injury.Death", "Bicyclist_Injury.Death", "Property_Damage",
  "Property_Owner_Name", "Otherparty_Vehicle_Year",
  "Otherparty_Vehicle_Model", "Otherparty_Vehicle_State",
  "Vehicle_was_OtherParty", "Involved_Party_Otherparty", "Driver",
  "Passenger", "Vehicle_Year_3rdparty", "Model_3rdParty",
  "X3rdparty_Vehicle_State", "Vehicle_was_3rdParty",
  "Involve_in_Accident_3rdParty", "Driver_3rdParty", "Passenger_3rdParty"
)
data <- data %>% select(-all_of(drop_cols))

# Handle missing values by dropping every row that contains an NA.
# Note: read.csv() keeps blank character fields as "" rather than NA, so
# blank strings are NOT removed by this step.
data <- na.omit(data)

# Convert the outcome and all remaining character columns to factors.
# The outcome is converted explicitly in case it was not read as character.
data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data <- data %>% mutate(across(where(is.character), as.factor))

Feature Selection (optional, based on initial data examination)

# Remove low-variance predictors with caret's nearZeroVar().
# The outcome column is excluded from the screen: it is not a predictor,
# and an imbalanced outcome could otherwise be flagged and silently dropped.
predictor_cols <- setdiff(names(data), "Pre_Crash_Mode")
nzv <- nearZeroVar(data[, predictor_cols, drop = FALSE], saveMetrics = TRUE)

# Row names of the metrics table are the screened column names.
nzv_predictors <- rownames(nzv)[nzv$nzv]

# drop = FALSE keeps `data` a data frame even if a single column survives.
data <- data[, setdiff(names(data), nzv_predictors), drop = FALSE]

Model Development

# Stratified 80/20 split on the outcome. The seed fixes both the split and
# the resample indices generated below.
set.seed(123)
trainIndex <- createDataPartition(data$Pre_Crash_Mode, p = 0.8, list = FALSE)
# Note: `train` (data frame) shares its name with caret::train(); calls such
# as train(...) still resolve to the function.
train <- data[trainIndex, ]
test <- data[-trainIndex, ]

# One set of bootstrap resamples shared by every model. This keeps caret's
# default scheme (bootstrap, 25 repetitions) but guarantees that all models
# are evaluated on identical resamples, which is what the later resamples()
# comparison assumes. Without it each train() call draws its own samples.
shared_control <- trainControl(
  method = "boot",
  number = 25,
  index = createResample(train$Pre_Crash_Mode, times = 25)
)

# Train models
# Logistic Regression
log_model <- train(Pre_Crash_Mode ~ ., data = train, method = "glm",
                   trControl = shared_control)

# Decision Tree
tree_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rpart",
                    trControl = shared_control)

# Random Forest
rf_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rf",
                  trControl = shared_control)

# GBM (verbose = FALSE is forwarded to gbm to silence per-iteration output)
gbm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "gbm",
                   trControl = shared_control, verbose = FALSE)

# SVM
svm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "svmRadial",
                   trControl = shared_control)

# kNN
knn_model <- train(Pre_Crash_Mode ~ ., data = train, method = "knn",
                   trControl = shared_control)

Model Comparison

# Gather the fitted caret models under display names for comparison
models <- list(
  Logistic = log_model,
  Tree = tree_model,
  RandomForest = rf_model,
  GBM = gbm_model,
  SVM = svm_model,
  kNN = knn_model
)

# Collect the resampling metrics of all models and print their summary
results <- resamples(models)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Logistic, Tree, RandomForest, GBM, SVM, kNN 
## Number of resamples: 25 
## 
## Accuracy 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Logistic     0.2916667 0.4666667 0.5283019 0.5201006 0.5769231 0.6938776    0
## Tree         0.5531915 0.6222222 0.6666667 0.6680400 0.7058824 0.8181818    0
## RandomForest 0.5555556 0.6521739 0.6792453 0.6827927 0.7090909 0.7884615    0
## GBM          0.6222222 0.6734694 0.7254902 0.7198410 0.7547170 0.8510638    0
## SVM          0.5714286 0.6200000 0.6666667 0.6672720 0.7000000 0.7708333    0
## kNN          0.4318182 0.6041667 0.6458333 0.6364959 0.6666667 0.7826087    0
## 
## Kappa 
##                     Min.     1st Qu.     Median       Mean   3rd Qu.      Max.
## Logistic     -0.38071066 -0.08695652 0.03361345 0.02947798 0.1428571 0.3839061
## Tree          0.02868526  0.23728814 0.28954424 0.32030619 0.4163265 0.6206897
## RandomForest  0.04255319  0.27969349 0.34567901 0.34252017 0.4206897 0.5731343
## GBM           0.18219038  0.31406045 0.43884892 0.42126062 0.4877323 0.6984418
## SVM           0.09961686  0.22857143 0.32432432 0.31581797 0.3862520 0.5164835
## kNN          -0.11788618  0.20689655 0.24014023 0.24650917 0.3030853 0.5418327
##              NA's
## Logistic        0
## Tree            0
## RandomForest    0
## GBM             0
## SVM             0
## kNN             0
# Hold-out accuracy: share of test rows whose predicted Pre_Crash_Mode
# equals the observed one
holdout_accuracy <- function(model) {
  predicted <- predict(model, newdata = test)
  mean(predicted == test$Pre_Crash_Mode)
}

log_acc <- holdout_accuracy(log_model)
tree_acc <- holdout_accuracy(tree_model)
rf_acc <- holdout_accuracy(rf_model)
gbm_acc <- holdout_accuracy(gbm_model)
svm_acc <- holdout_accuracy(svm_model)
knn_acc <- holdout_accuracy(knn_model)

Task 2: Developing Several Machine Learning Models to Classify Pre_Crash_Mode Using Only the Text Data (Crash Narrative Column), with Explainable AI Results

# Load necessary libraries
library(tidyverse)
library(tidytext)
library(caret)
library(tm)
library(glmnet)  # For regularized logistic regression
library(e1071)   # For Naive Bayes
library(randomForest)
library(lime)    # For model explainability
library(ggplot2)

Dataset Loading

# Keep only the narrative text and the label for the text-only models.
# read.csv() leaves blank character fields as "" rather than NA, so
# drop_na() alone would keep rows with an empty narrative or an empty
# label; those rows are filtered out explicitly.
data <- read.csv("C:/Users/amlan/Downloads/CA_AV_CrashReportsSample.csv")
data <- data %>%
  select(CrashNarrative, Pre_Crash_Mode) %>%
  drop_na() %>%
  filter(
    nzchar(trimws(CrashNarrative)),
    nzchar(trimws(Pre_Crash_Mode))
  )

Text Preprocessing

# Build a corpus with one document per crash narrative and normalise it.
corpus <- VCorpus(VectorSource(data$CrashNarrative))
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words BEFORE stripping punctuation: stopwords("en") contains
# contractions such as "don't", which would no longer match once the
# apostrophe has been removed.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Collapse the gaps left behind by the removals above.
corpus <- tm_map(corpus, stripWhitespace)

# TF-IDF Vectorization: one row per narrative, one column per term.
# NOTE(review): the IDF weights are computed over every document, including
# rows that are later assigned to the test split, so the test set
# influences the feature scaling. Consider weighting after the split.
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
tfidf_data <- as.data.frame(as.matrix(dtm))

# Store the label as a factor so every classifier sees an explicit class
# outcome (read.csv() returns character), matching the Task 1 pipeline.
tfidf_data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)

# Train-Test Split: stratified 80/20 partition on the label
set.seed(123)
trainIndex <- createDataPartition(tfidf_data$Pre_Crash_Mode, p = 0.8, list = FALSE)
train_data <- tfidf_data[trainIndex, , drop = FALSE]
test_data <- tfidf_data[-trainIndex, , drop = FALSE]

# Terms must be syntactically valid names for the formula interface.
# unique = TRUE prevents two different terms from collapsing onto the same
# column name. The names are computed once so both splits stay aligned.
safe_names <- make.names(colnames(tfidf_data), unique = TRUE)
colnames(train_data) <- safe_names
colnames(test_data) <- safe_names

Model Training

# 5-fold cross-validation with up-sampling inside each resample to balance
# the classes
train_control <- trainControl(method = "cv", number = 5, sampling = "up")

# Lasso-penalised logistic regression (alpha = 1) tuned over a fixed grid of
# lambda values
lasso_grid <- expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01))
log_model <- train(
  Pre_Crash_Mode ~ .,
  data = train_data,
  method = "glmnet",
  tuneGrid = lasso_grid,
  trControl = train_control
)
  
# Random forest fitted through caret: 200 trees, 3 candidate variables per
# split, evaluated with plain 5-fold cross-validation
rf_control <- trainControl(method = "cv", number = 5)
rf_grid <- data.frame(mtry = 3)
rf_model <- train(
  Pre_Crash_Mode ~ .,
  data = train_data,
  method = "rf",
  trControl = rf_control,
  tuneGrid = rf_grid,
  ntree = 200
)
# Gradient boosting machine, fitted through caret so the model object has
# the same interface as the others (the original author's stated reason:
# compatibility with lime)
gbm_control <- trainControl(method = "cv", number = 5)

# Single fixed configuration: 100 trees of depth 3, learning rate 0.1,
# at least 10 observations per terminal node
gbm_grid <- expand.grid(
  n.trees = 100,
  interaction.depth = 3,
  shrinkage = 0.1,
  n.minobsinnode = 10
)

gbm_model <- train(
  Pre_Crash_Mode ~ .,
  data = train_data,
  method = "gbm",
  trControl = gbm_control,
  tuneGrid = gbm_grid,
  verbose = FALSE  # silence gbm's per-iteration output
)

Model Evaluation

# Predicted classes for the held-out narratives, one vector per model
log_pred <- predict(log_model, newdata = test_data)
gbm_pred <- predict(gbm_model, newdata = test_data)
rf_pred <- predict(rf_model, newdata = test_data)

# Test-set accuracy: fraction of predictions that equal the observed label
observed_mode <- test_data$Pre_Crash_Mode
log_acc <- mean(log_pred == observed_mode)
gbm_acc <- mean(gbm_pred == observed_mode)
rf_acc <- mean(rf_pred == observed_mode)