Task 1 : Classify Pre Crash Mode Using Structured Data ( Excluding Text Columns)

Load Libraries

library(caret)
library(dplyr)
library(randomForest)
library(e1071)
library(gbm)
library(glmnet)
library(knitr)
library(ggplot2)

Load and Process Data

# Load the dataset
data <- read.csv("C:/AI in CE/CA_AV_CrashReportsSample (1).csv")

# Step 1: Data Preprocessing
# Select only the specified columns
data <- data %>% select(Year, Veh1_Year, Veh1_Make, Veh1_Model, Veh1_Register_State, Vehicle_Was,Involved_in_the_Accident, IS_VRU_FromNarra, IS_VRU, No_Vehicle_Involved,
Otherparty_Vehicle_Year, Vehicle_was_OtherParty, Involved_Party_Otherparty,Pre_Crash_Mode, Weather, Lighting, Roadway_Surface, Roadway.Conditions,Movement_Bef_Veh1, Movement_Bef_Veh2, Movement_Bef_Veh3, Collision_Type,Collision_Type1)

# Handle missing values (example: remove rows with NA values)
data <- na.omit(data)

# Convert categorical variables to factors
data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data <- data %>% mutate_if(is.character, as.factor)

# Step 2: PCA Transformation
# Scale the data and apply PCA
preProc <- preProcess(data[, -ncol(data)], method = c("center", "scale", "pca"), pcaComp = 15)  # Adjust pcaComp as needed
data_pca <- predict(preProc, data)

# Add the target variable back to the PCA-transformed data
data_pca$Pre_Crash_Mode <- data$Pre_Crash_Mode

# Step 3: Train-Test Split
set.seed(123)
trainIndex <- createDataPartition(data_pca$Pre_Crash_Mode, p = 0.8, list = FALSE)
train <- data_pca[trainIndex, ]
test <- data_pca[-trainIndex, ]

# Define training control with cross-validation
train_control <- trainControl(method = "cv", number = 5)

Apply ML Models

# Step 4: Model Training

# Logistic Regression with PCA-transformed data
log_model <- train(
  Pre_Crash_Mode ~ ., data = train, 
  method = "glmnet", 
  trControl = train_control,
  tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01))
)

# Random Forest with PCA-transformed data
rf_model <- train(
  Pre_Crash_Mode ~ ., data = train, 
  method = "rf", 
  trControl = train_control,
  tuneGrid = expand.grid(mtry = c(2, 3, 4, 5))
)

# GBM with PCA-transformed data
gbm_model <- train(
  Pre_Crash_Mode ~ ., data = train, 
  method = "gbm", 
  trControl = train_control,
  tuneGrid = expand.grid(
    n.trees = c(50, 100),
    interaction.depth = c(1, 3),
    shrinkage = c(0.1, 0.05),
    n.minobsinnode = c(10, 20)
  ),
  verbose = FALSE
)

# SVM with PCA-transformed data
svm_model <- train(
  Pre_Crash_Mode ~ ., data = train,
  method = "svmRadial",
  trControl = train_control,
  tuneGrid = expand.grid(C = c(0.1, 1, 10), sigma = c(0.01, 0.05, 0.1))
)

Model Evaluation

# Predict on the test set and calculate accuracy and Kappa
log_pred <- predict(log_model, test)
rf_pred <- predict(rf_model, test)
gbm_pred <- predict(gbm_model, test)
svm_pred <- predict(svm_model, test)

# Calculate accuracy
log_acc <- mean(log_pred == test$Pre_Crash_Mode)
rf_acc <- mean(rf_pred == test$Pre_Crash_Mode)
gbm_acc <- mean(gbm_pred == test$Pre_Crash_Mode)
svm_acc <- mean(svm_pred == test$Pre_Crash_Mode)

# Calculate Kappa
log_kappa <- confusionMatrix(log_pred, test$Pre_Crash_Mode)$overall['Kappa']
rf_kappa <- confusionMatrix(rf_pred, test$Pre_Crash_Mode)$overall['Kappa']
gbm_kappa <- confusionMatrix(gbm_pred, test$Pre_Crash_Mode)$overall['Kappa']
svm_kappa <- confusionMatrix(svm_pred, test$Pre_Crash_Mode)$overall['Kappa']

# Print accuracies and Kappa values
cat("Logistic Regression (PCA) Accuracy:", log_acc, "Kappa:", log_kappa, "\n")
## Logistic Regression (PCA) Accuracy: 0.6956522 Kappa: 0.3878327
cat("Random Forest (PCA) Accuracy:", rf_acc, "Kappa:", rf_kappa, "\n")
## Random Forest (PCA) Accuracy: 0.7391304 Kappa: 0.4297521
cat("Gradient Boosting Machine (PCA) Accuracy:", gbm_acc, "Kappa:", gbm_kappa, "\n")
## Gradient Boosting Machine (PCA) Accuracy: 0.8695652 Kappa: 0.7376426
cat("SVM (PCA) Accuracy:", svm_acc, "Kappa:", svm_kappa, "\n")
## SVM (PCA) Accuracy: 0.8695652 Kappa: 0.7315175

Visualization of Results

# View best tuning parameters for each model
print(log_model$bestTune)
##   alpha lambda
## 1     1  0.001
print(rf_model$bestTune)
##   mtry
## 2    3
print(gbm_model$bestTune)
##   n.trees interaction.depth shrinkage n.minobsinnode
## 9      50                 1       0.1             10
print(svm_model$bestTune)
##   sigma  C
## 7  0.01 10
# Create a summary table
model_performance <- data.frame(
  Model = c("Logistic Regression", "Random Forest", "Gradient Boosting Machine", "SVM"),
  Accuracy = c(log_acc, rf_acc, gbm_acc, svm_acc),
  Kappa = c(log_kappa, rf_kappa, gbm_kappa, svm_kappa)
)

# Print the summary table
print(model_performance)
##                       Model  Accuracy     Kappa
## 1       Logistic Regression 0.6956522 0.3878327
## 2             Random Forest 0.7391304 0.4297521
## 3 Gradient Boosting Machine 0.8695652 0.7376426
## 4                       SVM 0.8695652 0.7315175
# View the table in a nicer format
kable(model_performance, caption = "Model Performance Summary with Accuracy and Kappa")
Model Performance Summary with Accuracy and Kappa
Model Accuracy Kappa
Logistic Regression 0.6956522 0.3878327
Random Forest 0.7391304 0.4297521
Gradient Boosting Machine 0.8695652 0.7376426
SVM 0.8695652 0.7315175
# Visualization: Graphical Representation of Model Performance (Accuracy and Kappa)

# Accuracy plot
ggplot(model_performance, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity", width = 0.6) +
  theme_minimal() +
  labs(
    title = "Model Performance Comparison (Accuracy)",
    x = "Model",
    y = "Accuracy"
  ) +
  geom_text(aes(label = round(Accuracy, 3)), vjust = -0.3) +
  theme(legend.position = "none")

# Kappa plot
ggplot(model_performance, aes(x = Model, y = Kappa, fill = Model)) +
  geom_bar(stat = "identity", width = 0.6) +
  theme_minimal() +
  labs(
    title = "Model Performance Comparison (Kappa)",
    x = "Model",
    y = "Kappa"
  ) +
  geom_text(aes(label = round(Kappa, 3)), vjust = -0.3) +
  theme(legend.position = "none")

Task 2: Classifying Pre_Cash_Mode using Crash Narrative Data with Explainable AI

Load Libraries

library(tidyverse)
library(caret)
library(tm)
library(glmnet)
library(e1071)
library(randomForest)
library(gbm)
library(lime)

Load and Process Data

# Load dataset
data <- read.csv("C:/AI in CE/CA_AV_CrashReportsSample (1).csv")
data <- data %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()

# Text Preprocessing
corpus <- VCorpus(VectorSource(data$CrashNarrative))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)

# TF-IDF Vectorization
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
tfidf_data <- as.data.frame(as.matrix(dtm))
tfidf_data$Pre_Crash_Mode <- data$Pre_Crash_Mode

# Train-Test Split
set.seed(123)
trainIndex <- createDataPartition(tfidf_data$Pre_Crash_Mode, p = 0.8, list = FALSE)
train_data <- tfidf_data[trainIndex,]
test_data <- tfidf_data[-trainIndex,]
colnames(train_data) <- make.names(colnames(train_data))
colnames(test_data) <- make.names(colnames(test_data))

Apply ML Models

# Model Training
# Apply class balancing
train_control <- trainControl(method = "cv", number = 5, sampling = "up")

# Train Logistic Regression
log_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "glmnet", 
  trControl = train_control,
  tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01))
)

# Train Random Forest model
rf_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "rf",
  ntree = 200,
  tuneGrid = data.frame(mtry = 3),
  trControl = train_control
)

# Train Gradient Boosting Machine (GBM) model
gbm_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "gbm",
  trControl = train_control,
  tuneGrid = expand.grid(
    n.trees = 100,
    interaction.depth = 3,
    shrinkage = 0.1,
    n.minobsinnode = 10
  ),
  verbose = FALSE
)

Model Evaluation and Visualization of Results

# Model Evaluation
log_pred <- predict(log_model, test_data)
gbm_pred <- predict(gbm_model, test_data)
rf_pred <- predict(rf_model, test_data)

log_acc <- mean(log_pred == test_data$Pre_Crash_Mode)
gbm_acc <- mean(gbm_pred == test_data$Pre_Crash_Mode)
rf_acc <- mean(rf_pred == test_data$Pre_Crash_Mode)

cat("Logistic Regression Accuracy:", log_acc, "\n")
## Logistic Regression Accuracy: 0.9056604
cat("GBM Accuracy:", gbm_acc, "\n")
## GBM Accuracy: 0.9245283
cat("Random Forest Accuracy:", rf_acc, "\n")
## Random Forest Accuracy: 0.7169811
# Explainable AI with LIME
explainer_log <- lime(train_data[, -ncol(train_data)], log_model)
explainer_rf <- lime(train_data[, -ncol(train_data)], rf_model)
explainer_gbm <- lime(train_data[, -ncol(train_data)], gbm_model)

# LIME Explanations
explanation_log <- explain(test_data[1:5, -ncol(test_data)], explainer_log, n_labels = 1, n_features = 20)
explanation_rf <- explain(test_data[1:5, -ncol(test_data)], explainer_rf, n_labels = 1, n_features = 20)
explanation_gbm <- explain(test_data[1:5, -ncol(test_data)], explainer_gbm, n_labels = 1, n_features = 20)

# Print LIME explanations for Logistic Regression
cat("\nLIME Explanation for Logistic Regression:\n")
## 
## LIME Explanation for Logistic Regression:
print(explanation_log)
## # A tibble: 100 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 3     Conv…      0.984    0.522            1.13            0.984
##  2 classificat… 3     Conv…      0.984    0.522            1.13            0.984
##  3 classificat… 3     Conv…      0.984    0.522            1.13            0.984
##  4 classificat… 3     Conv…      0.984    0.522            1.13            0.984
##  5 classificat… 3     Conv…      0.984    0.522            1.13            0.984
##  6 classificat… 3     Conv…      0.984    0.522            1.13            0.984
##  7 classificat… 3     Conv…      0.984    0.522            1.13            0.984
##  8 classificat… 3     Conv…      0.984    0.522            1.13            0.984
##  9 classificat… 3     Conv…      0.984    0.522            1.13            0.984
## 10 classificat… 3     Conv…      0.984    0.522            1.13            0.984
## # ℹ 90 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
# Print LIME explanations for Random Forest
cat("\nLIME Explanation for Random Forest:\n")
## 
## LIME Explanation for Random Forest:
print(explanation_rf)
## # A tibble: 100 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 3     Conv…        0.7    0.181           0.327            0.700
##  2 classificat… 3     Conv…        0.7    0.181           0.327            0.700
##  3 classificat… 3     Conv…        0.7    0.181           0.327            0.700
##  4 classificat… 3     Conv…        0.7    0.181           0.327            0.700
##  5 classificat… 3     Conv…        0.7    0.181           0.327            0.700
##  6 classificat… 3     Conv…        0.7    0.181           0.327            0.700
##  7 classificat… 3     Conv…        0.7    0.181           0.327            0.700
##  8 classificat… 3     Conv…        0.7    0.181           0.327            0.700
##  9 classificat… 3     Conv…        0.7    0.181           0.327            0.700
## 10 classificat… 3     Conv…        0.7    0.181           0.327            0.700
## # ℹ 90 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
# Print LIME explanations for GBM
cat("\nLIME Explanation for GBM:\n")
## 
## LIME Explanation for GBM:
print(explanation_gbm)
## # A tibble: 100 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 3     Conv…      0.988    0.171           0.813            0.988
##  2 classificat… 3     Conv…      0.988    0.171           0.813            0.988
##  3 classificat… 3     Conv…      0.988    0.171           0.813            0.988
##  4 classificat… 3     Conv…      0.988    0.171           0.813            0.988
##  5 classificat… 3     Conv…      0.988    0.171           0.813            0.988
##  6 classificat… 3     Conv…      0.988    0.171           0.813            0.988
##  7 classificat… 3     Conv…      0.988    0.171           0.813            0.988
##  8 classificat… 3     Conv…      0.988    0.171           0.813            0.988
##  9 classificat… 3     Conv…      0.988    0.171           0.813            0.988
## 10 classificat… 3     Conv…      0.988    0.171           0.813            0.988
## # ℹ 90 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
# Extract Feature Importances for Top 20 Features
importance_log <- varImp(log_model)$importance
importance_rf <- varImp(rf_model)$importance
importance_gbm <- varImp(gbm_model)$importance

top_20_log <- importance_log %>% arrange(desc(Overall)) %>% head(20)
top_20_rf <- importance_rf %>% arrange(desc(Overall)) %>% head(20)
top_20_gbm <- importance_gbm %>% arrange(desc(Overall)) %>% head(20)

# Plot Top 20 Feature Importances with Different Colors
ggplot(top_20_log, aes(x = reorder(rownames(top_20_log), Overall), y = Overall, fill = "Logistic Regression")) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = "blue") +
  ggtitle("Top 20 Features - Logistic Regression") +
  xlab("Feature") +
  ylab("Importance")

ggplot(top_20_rf, aes(x = reorder(rownames(top_20_rf), Overall), y = Overall, fill = "Random Forest")) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = "green") +
  ggtitle("Top 20 Features - Random Forest") +
  xlab("Feature") +
  ylab("Importance")

ggplot(top_20_gbm, aes(x = reorder(rownames(top_20_gbm), Overall), y = Overall, fill = "GBM")) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = "pink") +
  ggtitle("Top 20 Features - GBM") +
  xlab("Feature") +
  ylab("Importance")