Task1 : Classifying Precrash mode using structured data

Loading necessary libraries

library(caret)
library(dplyr)
library(randomForest)
library(e1071)
library(gbm)
library(glmnet)

Loading and Preprocessing the dataset

data <- read.csv("C:/Users/rupes/Downloads/CA_AV_CrashReportsSample.csv")

# Step 1: Data Preprocessing
# Select only the specified columns
data <- data %>% select(Year, Veh1_Year, Veh1_Make, Veh1_Model, Veh1_Register_State, Vehicle_Was,
                        Involved_in_the_Accident, IS_VRU_FromNarra, IS_VRU, No_Vehicle_Involved,
                        Otherparty_Vehicle_Year, Vehicle_was_OtherParty, Involved_Party_Otherparty,
                        Pre_Crash_Mode, Weather, Lighting, Roadway_Surface, Roadway.Conditions,
                        Movement_Bef_Veh1, Movement_Bef_Veh2, Movement_Bef_Veh3, Collision_Type,
                        Collision_Type1)

# Handling the missing values (example: remove rows with NA values)
data <- na.omit(data)

# Converting categorical variables to factors
data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data <- data %>% mutate_if(is.character, as.factor)

# Step 2: PCA Transformation
# Scaling the data and applying PCA
preProc <- preProcess(data[, -ncol(data)], method = c("center", "scale", "pca"), pcaComp = 15)  # Adjust pcaComp as needed
data_pca <- predict(preProc, data)

# Adding the target variable back to the PCA-transformed data
data_pca$Pre_Crash_Mode <- data$Pre_Crash_Mode

# Step 3: Train-Test Splitting
set.seed(123)
trainIndex <- createDataPartition(data_pca$Pre_Crash_Mode, p = 0.75, list = FALSE)
train <- data_pca[trainIndex, ]
test <- data_pca[-trainIndex, ]

# Defining the training control with cross-validation
train_control <- trainControl(method = "cv", number =8)

ML Modeling

# Step 4: Model Training

# Logistic Regression with PCA-transformed data
log_model <- train(
  Pre_Crash_Mode ~ ., data = train, 
  method = "glmnet", 
  trControl = train_control,
  tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.3, by = 0.01))
)

# Random Forest with PCA-transformed data
rf_model <- train(
  Pre_Crash_Mode ~ ., data = train, 
  method = "rf", 
  trControl = train_control,
  tuneGrid = expand.grid(mtry = c(4, 5,6 , 7))
)

# GBM with PCA-transformed data
gbm_model <- train(
  Pre_Crash_Mode ~ ., data = train, 
  method = "gbm", 
  trControl = train_control,
  tuneGrid = expand.grid(
    n.trees = c(100, 120),
    interaction.depth = c(1, 4),
    shrinkage = c(0.2, 0.05),
    n.minobsinnode = c(10, 30)
  ),
  verbose = FALSE
)

# SVM with PCA-transformed data
svm_model <- train(
  Pre_Crash_Mode ~ ., data = train,
  method = "svmRadial",
  trControl = train_control,
  tuneGrid = expand.grid(C = c(0.1, 2, 10), sigma = c(0.01, 0.05, 0.1))
)

Evaluation of the model and its visualisation

# Step 5: Model Evaluation

# Prediction on the test set and calculating accuracy
log_pred <- predict(log_model, test)
rf_pred <- predict(rf_model, test)
gbm_pred <- predict(gbm_model, test)
svm_pred <- predict(svm_model, test)

log_acc <- mean(log_pred == test$Pre_Crash_Mode)
rf_acc <- mean(rf_pred == test$Pre_Crash_Mode)
gbm_acc <- mean(gbm_pred == test$Pre_Crash_Mode)
svm_acc <- mean(svm_pred == test$Pre_Crash_Mode)

# Calculating Kappa
log_kappa <- confusionMatrix(log_pred, test$Pre_Crash_Mode)$overall['Kappa']
rf_kappa <- confusionMatrix(rf_pred, test$Pre_Crash_Mode)$overall['Kappa']
gbm_kappa <- confusionMatrix(gbm_pred, test$Pre_Crash_Mode)$overall['Kappa']
svm_kappa <- confusionMatrix(svm_pred, test$Pre_Crash_Mode)$overall['Kappa']

# Printing accuracies and Kappa values
cat("Logistic Regression (PCA) Accuracy:", log_acc, "Kappa:", log_kappa, "\n")
## Logistic Regression (PCA) Accuracy: 0.7666667 Kappa: 0.5023697
cat("Random Forest (PCA) Accuracy:", rf_acc, "Kappa:", rf_kappa, "\n")
## Random Forest (PCA) Accuracy: 0.7333333 Kappa: 0.4146341
cat("Gradient Boosting Machine (PCA) Accuracy:", gbm_acc, "Kappa:", gbm_kappa, "\n")
## Gradient Boosting Machine (PCA) Accuracy: 0.7333333 Kappa: 0.4366197
cat("SVM (PCA) Accuracy:", svm_acc, "Kappa:", svm_kappa, "\n")
## SVM (PCA) Accuracy: 0.6666667 Kappa: 0.2537313
# View best tuning parameters for each model
print(log_model$bestTune)
##   alpha lambda
## 6     1  0.051
print(rf_model$bestTune)
##   mtry
## 2    5
print(gbm_model$bestTune)
##    n.trees interaction.depth shrinkage n.minobsinnode
## 10     120                 1       0.2             10
print(svm_model$bestTune)
##   sigma C
## 4  0.01 2
# Creating a summary table
model_performance <- data.frame(
  Model = c("Logistic Regression", "Random Forest", "Gradient Boosting Machine", "SVM"),
  Accuracy = c(log_acc, rf_acc, gbm_acc, svm_acc),Kappa = c(log_kappa, rf_kappa, gbm_kappa, svm_kappa)
)

# Printing the summary table
print(model_performance)
##                       Model  Accuracy     Kappa
## 1       Logistic Regression 0.7666667 0.5023697
## 2             Random Forest 0.7333333 0.4146341
## 3 Gradient Boosting Machine 0.7333333 0.4366197
## 4                       SVM 0.6666667 0.2537313
#  Tabulation in decent format
library(knitr)
kable(model_performance, caption = "Model Performance Summary")
Model Performance Summary
Model Accuracy Kappa
Logistic Regression 0.7666667 0.5023697
Random Forest 0.7333333 0.4146341
Gradient Boosting Machine 0.7333333 0.4366197
SVM 0.6666667 0.2537313
# # Loading ggplot2 for graphical representation
library(ggplot2)

# Visualization: Graphical Representation of Model Performance
ggplot(model_performance, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity", width = 0.7) +
  theme_minimal() +
  labs(
    title = "Model Performance Comparison",
    x = "Model",
    y = "Accuracy"
  ) +
  geom_text(aes(label = round(Accuracy, 2)), vjust = -0.3) +
  theme(legend.position = "none")

# For Kappa
ggplot(model_performance, aes(x = Model, y = Kappa, fill = Model)) +
  geom_bar(stat = "identity", width = 0.7) +
  theme_minimal() +
  labs(
    title = "Model Performance Comparison",
    x = "Model",
    y = "Kappa"
  ) +
  geom_text(aes(label = round(Kappa, 2)), vjust = -0.3) +
  theme(legend.position = "none")

##Task 2: Classifying Precrash mode using Crash Narrative Data with explainable AI

# Loading necessary libraries
library(tidyverse)
library(caret)
library(tm)
library(glmnet)
library(e1071)
library(randomForest)
library(gbm)
library(lime)

# Loading dataset
data <- read.csv("C:/Users/rupes/Downloads/CA_AV_CrashReportsSample.csv")
data <- data %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()

# Text Preprocessing
corpus <- VCorpus(VectorSource(data$CrashNarrative))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)

# TF-IDF Vectorization
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
tfidf_data <- as.data.frame(as.matrix(dtm))
tfidf_data$Pre_Crash_Mode <- data$Pre_Crash_Mode

# Train-Test Splitting
set.seed(123)
trainIndex <- createDataPartition(tfidf_data$Pre_Crash_Mode, p = 0.75, list = FALSE)
train_data <- tfidf_data[trainIndex,]
test_data <- tfidf_data[-trainIndex,]
colnames(train_data) <- make.names(colnames(train_data))
colnames(test_data) <- make.names(colnames(test_data))

#Model Training and Evaluating

# Application of class balancing
train_control <- trainControl(method = "cv", number = 6, sampling = "up")

# Training Logistic Regression
log_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "glmnet", 
  trControl = train_control,
  tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01))
)

# Training Random Forest model
rf_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "rf",
  ntree = 150,
  tuneGrid = data.frame(mtry = 5),
  trControl = train_control
)

# Training Gradient Boosting Machine (GBM) model
gbm_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "gbm",
  trControl = train_control,
  tuneGrid = expand.grid(
    n.trees = 130,
    interaction.depth = 4,
    shrinkage = 0.3,
    n.minobsinnode = 10
  ),
  verbose = FALSE
)

# Model Evaluation
log_pred <- predict(log_model, test_data)
gbm_pred <- predict(gbm_model, test_data)
rf_pred <- predict(rf_model, test_data)

log_acc <- mean(log_pred == test_data$Pre_Crash_Mode)
gbm_acc <- mean(gbm_pred == test_data$Pre_Crash_Mode)
rf_acc <- mean(rf_pred == test_data$Pre_Crash_Mode)

cat("Logistic Regression Accuracy:", log_acc, "\n")
## Logistic Regression Accuracy: 0.8970588
cat("GBM Accuracy:", gbm_acc, "\n")
## GBM Accuracy: 0.8823529
cat("Random Forest Accuracy:", rf_acc, "\n")
## Random Forest Accuracy: 0.6911765
# Explainable AI with LIME
explainer_log <- lime(train_data[, -ncol(train_data)], log_model)
explainer_rf <- lime(train_data[, -ncol(train_data)], rf_model)
explainer_gbm <- lime(train_data[, -ncol(train_data)], gbm_model)

# LIME Explanations
explanation_log <- explain(test_data[1:9, -ncol(test_data)], explainer_log, n_labels = 1, n_features = 13)
explanation_rf <- explain(test_data[1:9, -ncol(test_data)], explainer_rf, n_labels = 1, n_features = 13)
explanation_gbm <- explain(test_data[1:9, -ncol(test_data)], explainer_gbm, n_labels = 1, n_features = 13)


# Extracting Feature Importances for Top 13 Features
importance_log <- varImp(log_model)$importance
importance_rf <- varImp(rf_model)$importance
importance_gbm <- varImp(gbm_model)$importance

top_13_log <- importance_log %>% arrange(desc(Overall)) %>% head(13)
top_13_rf <- importance_rf %>% arrange(desc(Overall)) %>% head(13)
top_13_gbm <- importance_gbm %>% arrange(desc(Overall)) %>% head(13)

# Plotting Top 13 Feature Importances with Different Colors
ggplot(top_13_log, aes(x = reorder(rownames(top_13_log), Overall), y = Overall, fill = "Logistic Regression")) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = "lightgreen") +
  ggtitle("Top 13 Features - Logistic Regression") +
  xlab("Feature") +
  ylab("Importance")

ggplot(top_13_rf, aes(x = reorder(rownames(top_13_rf), Overall), y = Overall, fill = "Random Forest")) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = "brown") +
  ggtitle("Top 13 Features - Random Forest") +
  xlab("Feature") +
  ylab("Importance")

ggplot(top_13_gbm, aes(x = reorder(rownames(top_13_gbm), Overall), y = Overall, fill = "GBM")) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = "lightpink") +
  ggtitle("Top 13 Features - GBM") +
  xlab("Feature") +
  ylab("Importance")