Pre_Crash_Mode Classification Analysis

Task 1: Developing Several Machine Learning Models to Classify Pre_Crash_Mode from the Structured Data (without text or crash narrative column)

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# Load necessary libraries
library(caret)
library(dplyr)
library(randomForest)
library(e1071)
library(gbm)
library(ggplot2)

Dataset Loading

# Load the file with updated path or after confirming the working directory
data <- read.csv("C:/Users/amlan/Downloads/CA_AV_CrashReportsSample.csv")

Data Preprocessing

# Remove unnecessary columns
data <- data %>% select(-CrashNarrative, -File_name, -Manufacturer_Name, -Business_Name, 
                        -Date, -Time, -Location_Street, -Location_City, -Location_County,
                        -Location_State, -Location_Zip, -Latitude, -Longitude,
                        -Name_Injury.Death, -Bicyclist_Injury.Death, -Property_Damage,
                        -Property_Owner_Name, -Otherparty_Vehicle_Year, -Otherparty_Vehicle_Model,
                        -Otherparty_Vehicle_State, -Vehicle_was_OtherParty,
                        -Involved_Party_Otherparty, -Driver, -Passenger, -Vehicle_Year_3rdparty,
                        -Model_3rdParty, -X3rdparty_Vehicle_State, -Vehicle_was_3rdParty,
                        -Involve_in_Accident_3rdParty, -Driver_3rdParty, -Passenger_3rdParty)

# Handle missing values (example: remove rows with NA values)
data <- na.omit(data)

# Convert categorical variables to factors
data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data <- data %>% mutate_if(is.character, as.factor)

Feature Selection (optional, based on initial data examination)

# Example using caret's nearZeroVar to remove low-variance predictors
nzv <- nearZeroVar(data, saveMetrics = TRUE)
data <- data[, !nzv$nzv]

Model Development

set.seed(123)
trainIndex <- createDataPartition(data$Pre_Crash_Mode, p = 0.8, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]

# Train models
# Logistic Regression
log_model <- train(Pre_Crash_Mode ~ ., data = train, method = "glm")

# Decision Tree
tree_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rpart")

# Random Forest
rf_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rf")

# GBM
gbm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "gbm", verbose = FALSE)

# SVM
svm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "svmRadial")

# kNN
knn_model <- train(Pre_Crash_Mode ~ ., data = train, method = "knn")

Model Comparison

# Predict and evaluate each model
models <- list(Logistic = log_model, Tree = tree_model, RandomForest = rf_model,
               GBM = gbm_model, SVM = svm_model, kNN = knn_model)

# Summarize resampling results
results <- resamples(models)
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: Logistic, Tree, RandomForest, GBM, SVM, kNN 
## Number of resamples: 25 
## 
## Accuracy 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Logistic     0.2916667 0.4666667 0.5283019 0.5201006 0.5769231 0.6938776    0
## Tree         0.5531915 0.6222222 0.6666667 0.6680400 0.7058824 0.8181818    0
## RandomForest 0.5555556 0.6521739 0.6792453 0.6827927 0.7090909 0.7884615    0
## GBM          0.6222222 0.6734694 0.7254902 0.7198410 0.7547170 0.8510638    0
## SVM          0.5714286 0.6200000 0.6666667 0.6672720 0.7000000 0.7708333    0
## kNN          0.4318182 0.6041667 0.6458333 0.6364959 0.6666667 0.7826087    0
## 
## Kappa 
##                     Min.     1st Qu.     Median       Mean   3rd Qu.      Max.
## Logistic     -0.38071066 -0.08695652 0.03361345 0.02947798 0.1428571 0.3839061
## Tree          0.02868526  0.23728814 0.28954424 0.32030619 0.4163265 0.6206897
## RandomForest  0.04255319  0.27969349 0.34567901 0.34252017 0.4206897 0.5731343
## GBM           0.18219038  0.31406045 0.43884892 0.42126062 0.4877323 0.6984418
## SVM           0.09961686  0.22857143 0.32432432 0.31581797 0.3862520 0.5164835
## kNN          -0.11788618  0.20689655 0.24014023 0.24650917 0.3030853 0.5418327
##              NA's
## Logistic        0
## Tree            0
## RandomForest    0
## GBM             0
## SVM             0
## kNN             0

# Accuracy for each model on test data
log_acc <- mean(predict(log_model, test) == test$Pre_Crash_Mode)
tree_acc <- mean(predict(tree_model, test) == test$Pre_Crash_Mode)
rf_acc <- mean(predict(rf_model, test) == test$Pre_Crash_Mode)
gbm_acc <- mean(predict(gbm_model, test) == test$Pre_Crash_Mode)
svm_acc <- mean(predict(svm_model, test) == test$Pre_Crash_Mode)
knn_acc <- mean(predict(knn_model, test) == test$Pre_Crash_Mode)

Print Accuracy Results

cat("Logistic Regression Accuracy:", log_acc, "\n")

## Logistic Regression Accuracy: 0.6060606

cat("Decision Tree Accuracy:", tree_acc, "\n")

## Decision Tree Accuracy: 0.6060606

cat("Random Forest Accuracy:", rf_acc, "\n")

## Random Forest Accuracy: 0.4545455

cat("GBM Accuracy:", gbm_acc, "\n")

## GBM Accuracy: 0.5757576

cat("SVM Accuracy:", svm_acc, "\n")

## SVM Accuracy: 0.5151515

cat("kNN Accuracy:", knn_acc, "\n")

## kNN Accuracy: 0.6363636

# Plotting model accuracy from cross-validation results
dotplot(results)

# Bar plot of accuracy results on test data
accuracy_data <- data.frame(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest", "GBM", "SVM", "kNN"),
  Accuracy = c(log_acc, tree_acc, rf_acc, gbm_acc, svm_acc, knn_acc)
)

ggplot(accuracy_data, aes(x = Model, y = Accuracy)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Test Accuracy of Various Models", x = "Model", y = "Accuracy")

Task 2: Developing Several Machine Learning Models to Classify Pre_Crash_Mode using only Text Data or Crash Narrative cColumn with Explainable AI Results.

# Load necessary libraries
library(tidyverse)
library(tidytext)
library(caret)
library(tm)
library(glmnet)  # For regularized logistic regression
library(e1071)   # For Naive Bayes
library(randomForest)
library(lime)    # For model explainability
library(ggplot2)

Dataset Loading

data <- read.csv("C:/Users/amlan/Downloads/CA_AV_CrashReportsSample.csv")
data <- data %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()

Text Preprocessing

corpus <- VCorpus(VectorSource(data$CrashNarrative))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)

# TF-IDF Vectorization
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
tfidf_data <- as.data.frame(as.matrix(dtm))
tfidf_data$Pre_Crash_Mode <- data$Pre_Crash_Mode

# Train-Test Split
set.seed(123)
trainIndex <- createDataPartition(tfidf_data$Pre_Crash_Mode, p = 0.8, list = FALSE)
train_data <- tfidf_data[trainIndex,]
test_data <- tfidf_data[-trainIndex,]
colnames(train_data) <- make.names(colnames(train_data))
colnames(test_data) <- make.names(colnames(test_data))

Model Training

# Apply class balancing
train_control <- trainControl(method = "cv", number = 5, sampling = "up")

# Train Logistic Regression with class balancing
log_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "glmnet", 
  trControl = train_control,
  tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01))
)
  
# Train Random Forest model using caret
rf_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "rf",         # Random Forest method in caret
  ntree = 200,           # Number of trees
  tuneGrid = data.frame(mtry = 3),  # Number of variables tried at each split
  trControl = trainControl(method = "cv", number = 5)  # Cross-validation
)
# Train a Gradient Boosting Machine (GBM) Model
# Using caret to ensure compatibility with lime
gbm_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "gbm",            # Specify GBM as the model
  trControl = trainControl(method = "cv", number = 5),  # 5-fold cross-validation
  tuneGrid = expand.grid(
    n.trees = 100,          # Number of trees
    interaction.depth = 3,  # Depth of each tree
    shrinkage = 0.1,        # Learning rate
    n.minobsinnode = 10     # Minimum number of observations in nodes
  ),
  verbose = FALSE            # Suppress printing
)

Model Evaluation

log_pred <- predict(log_model, test_data)
gbm_pred <- predict(gbm_model, test_data)
rf_pred <- predict(rf_model, test_data)

log_acc <- mean(log_pred == test_data$Pre_Crash_Mode)
gbm_acc <- mean(gbm_pred == test_data$Pre_Crash_Mode)
rf_acc <- mean(rf_pred == test_data$Pre_Crash_Mode)

Print Accuracy Results

cat("Regularized Logistic Regression Accuracy:", log_acc, "\n")

## Regularized Logistic Regression Accuracy: 0.9056604

cat("Gradient Boosting Machine Accuracy:", gbm_acc, "\n")

## Gradient Boosting Machine Accuracy: 0.9245283

cat("Random Forest Accuracy with tuned parameters:", rf_acc, "\n")

## Random Forest Accuracy with tuned parameters: 0.6037736

# Accuracy comparison bar plot
accuracy_data <- data.frame(
  Model = c("Logistic Regression", "Random Forest", "GBM"),
  Accuracy = c(log_acc, rf_acc, gbm_acc)
)

# Plot model accuracy on test data with colors
ggplot(accuracy_data, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(values = c("skyblue", "orange", "green")) +
  theme_minimal() +
  labs(title = "Test Accuracy of Text Classification Models", x = "Model", y = "Accuracy") +
  geom_text(aes(label = round(Accuracy, 2)), vjust = -0.3, size = 5) +
  theme(legend.position = "none")

# Explainable AI with LIME for Logistic Regression and SVM
explainer_log <- lime(train_data[, -ncol(train_data)], log_model)
explainer_rf <- lime(train_data[, -ncol(train_data)], rf_model)
explainer_gbm <- lime(train_data[, -ncol(train_data)], gbm_model)

# LIME Explanations with more features and a smaller subset
explanation_log <- explain(test_data[1:2, -ncol(test_data)], explainer_log, n_labels = 1, n_features = 10)
explanation_rf <- explain(test_data[1:5, -ncol(test_data)], explainer_rf, n_labels = 1, n_features = 12)
explanation_gbm <- explain(test_data[1:5, -ncol(test_data)], explainer_gbm, n_labels = 1, n_features = 5)

# Print explanations
print(explanation_log)

## # A tibble: 20 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 3     Conv…      0.984    0.536           0.722            0.984
##  2 classificat… 3     Conv…      0.984    0.536           0.722            0.984
##  3 classificat… 3     Conv…      0.984    0.536           0.722            0.984
##  4 classificat… 3     Conv…      0.984    0.536           0.722            0.984
##  5 classificat… 3     Conv…      0.984    0.536           0.722            0.984
##  6 classificat… 3     Conv…      0.984    0.536           0.722            0.984
##  7 classificat… 3     Conv…      0.984    0.536           0.722            0.984
##  8 classificat… 3     Conv…      0.984    0.536           0.722            0.984
##  9 classificat… 3     Conv…      0.984    0.536           0.722            0.984
## 10 classificat… 3     Conv…      0.984    0.536           0.722            0.984
## 11 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## 12 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## 13 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## 14 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## 15 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## 16 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## 17 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## 18 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## 19 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## 20 classificat… 4     Conv…      0.938    0.524           1.34             0.960
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>

print(explanation_rf)

## # A tibble: 60 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
##  2 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
##  3 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
##  4 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
##  5 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
##  6 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
##  7 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
##  8 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
##  9 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
## 10 classificat… 3     Auto…       0.67   0.0138           0.722            0.670
## # ℹ 50 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>

print(explanation_gbm)

## # A tibble: 25 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 3     Conv…      0.984    0.226           0.607             1.11
##  2 classificat… 3     Conv…      0.984    0.226           0.607             1.11
##  3 classificat… 3     Conv…      0.984    0.226           0.607             1.11
##  4 classificat… 3     Conv…      0.984    0.226           0.607             1.11
##  5 classificat… 3     Conv…      0.984    0.226           0.607             1.11
##  6 classificat… 4     Conv…      0.998    0.226           0.686             1.07
##  7 classificat… 4     Conv…      0.998    0.226           0.686             1.07
##  8 classificat… 4     Conv…      0.998    0.226           0.686             1.07
##  9 classificat… 4     Conv…      0.998    0.226           0.686             1.07
## 10 classificat… 4     Conv…      0.998    0.226           0.686             1.07
## # ℹ 15 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>