AI application in CE HW#4

TASK 1

Loading the libraries

library(pROC)

library(knitr)

library(caret)

library(dplyr)

library(randomForest)

library(e1071)

library(gbm)

library(tidyverse)

library(tidytext)

library(tm)

library(glmnet)  

library(e1071)

library(randomForest)

library(lime)   

setwd("C:/Users/10207/Downloads")

Loading the data

data <- read.csv("C:/Users/10207/Downloads/CA_AV_CrashReportsSample.csv")

Data preprocessing

# Remove unnecessary Data

data <- data %>% select(-CrashNarrative, -File_name, -Manufacturer_Name, -Business_Name,-Date, -Time, -Location_Street, -Location_City,
-Location_County,-Location_State, -Location_Zip, -Latitude, -Longitude,-Name_Injury.Death, -Bicyclist_Injury.Death, -Property_Damage,
-Property_Owner_Name, -Otherparty_Vehicle_Year, -Otherparty_Vehicle_Model,
-Otherparty_Vehicle_State, -Vehicle_was_OtherParty,
-Involved_Party_Otherparty, -Driver, -Passenger, -Vehicle_Year_3rdparty,
-Model_3rdParty, -X3rdparty_Vehicle_State, -Vehicle_was_3rdParty,
-Involve_in_Accident_3rdParty, -Driver_3rdParty, -Passenger_3rdParty)

# Handling missing values

data <- na.omit(data)

# Converting categorical variables to factors

data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data <- data %>% mutate_if(is.character, as.factor)

Feature Selection

nzv <- nearZeroVar(data, saveMetrics = TRUE)
data <- data[, !nzv$nzv]

Model development and training

# Model development
set.seed(123)
trainIndex <- createDataPartition(data$Pre_Crash_Mode, p = 0.6, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]

# Training the model
# Decision Tree

tree_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rpart")

# Random Forest

rf_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rf",tuneGrid = expand.grid(mtry = c(3, 4, 5, 6)))

# GBM

gbm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "gbm", verbose = FALSE)

# SVM

svm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "svmRadial")

# kNN

knn_model <- train(Pre_Crash_Mode ~ ., data = train, method = "knn")

# GLM

log_model <- train(Pre_Crash_Mode ~ ., data = train, method = "glm")

Determing the best tuning parameter

print(log_model$bestTune)
##   parameter
## 1      none
print(rf_model$bestTune)
##   mtry
## 3    5
print(gbm_model$bestTune)
##   n.trees interaction.depth shrinkage n.minobsinnode
## 4      50                 2       0.1             10
print(svm_model$bestTune)
##       sigma C
## 3 0.0657767 1
print(knn_model$bestTune)
##   k
## 3 9
print(tree_model$bestTune)
##   cp
## 1  0

Predictions on the test set

pred_rf <- predict(rf_model, newdata = test)
pred_tree <- predict(tree_model, newdata = test)
pred_svm <- predict(svm_model, newdata = test)
pred_gbm <- predict(gbm_model, newdata = test)
pred_knn <- predict(knn_model, newdata = test)
pred_log <- predict(log_model, newdata = test)

Accuracy and Kappa for each model

# Initializing an empty data frame to store model performance metrics

model_metrics <- data.frame(Model = character(), Accuracy = numeric(), Kappa = numeric(), stringsAsFactors = FALSE)

# Generating confusion matrices and extracting Accuracy and Kappa for each model

# Random Forest

cm_rf <- confusionMatrix(pred_rf, test$Pre_Crash_Mode)
accuracy_rf <- cm_rf$overall["Accuracy"]
kappa_rf <- cm_rf$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "Random Forest", Accuracy = as.numeric(accuracy_rf), Kappa = as.numeric(kappa_rf)))

# Decision Tree

cm_tree <- confusionMatrix(pred_tree, test$Pre_Crash_Mode)
accuracy_tree <- cm_tree$overall["Accuracy"]
kappa_tree <- cm_tree$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "Decision Tree", Accuracy = as.numeric(accuracy_tree), Kappa = as.numeric(kappa_tree)))

# SVM

cm_svm <- confusionMatrix(pred_svm, test$Pre_Crash_Mode)
accuracy_svm <- cm_svm$overall["Accuracy"]
kappa_svm <- cm_svm$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "SVM", Accuracy = as.numeric(accuracy_svm), Kappa = as.numeric(kappa_svm)))

# GBM

cm_gbm <- confusionMatrix(pred_gbm, test$Pre_Crash_Mode)
accuracy_gbm <- cm_gbm$overall["Accuracy"]
kappa_gbm <- cm_gbm$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "GBM", Accuracy = as.numeric(accuracy_gbm), Kappa = as.numeric(kappa_gbm)))

# GLM

cm_log = confusionMatrix(pred_log, test$Pre_Crash_Mode)
accuracy_log <- cm_log$overall["Accuracy"]
kappa_log <- cm_log$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "GLM", Accuracy = as.numeric(accuracy_log), Kappa = as.numeric(kappa_log)))

# kNN

cm_knn <- confusionMatrix(pred_knn, test$Pre_Crash_Mode)
accuracy_knn <- cm_knn$overall["Accuracy"]
kappa_knn <- cm_knn$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "kNN", Accuracy = as.numeric(accuracy_knn), Kappa = as.numeric(kappa_knn)))

# Convert accuracy and kappa columns to numeric

model_metrics$Accuracy <- as.numeric(model_metrics$Accuracy)
model_metrics$Kappa <- as.numeric(model_metrics$Kappa)

Displaying accuracy and kappa in table

kable(model_metrics, caption = "Model Performance Metrics for Accuracy and Kappa")
Model Performance Metrics for Accuracy and Kappa
Model Accuracy Kappa
Random Forest 0.6865672 0.3258265
Decision Tree 0.6567164 0.2921452
SVM 0.6865672 0.3315914
GBM 0.6119403 0.1897674
GLM 0.5522388 0.0880218
kNN 0.6567164 0.2551957

Plotting the results using ggplot2

# Accuracy plot

ggplot(model_metrics, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  theme_bw()

# Kappa plot

ggplot(model_metrics, aes(x = Model, y = Kappa, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Model Kappa Comparison", x = "Model", y = "Kappa") +
  theme_minimal() +
  theme(legend.position = "none")

TASK 2

Loading the dataset and selecting required variables

data <- read.csv("C:/Users/10207/Downloads/CA_AV_CrashReportsSample.csv")

data <- data %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()

Text preprocessing

corpus <- VCorpus(VectorSource(data$CrashNarrative))

corpus <- tm_map(corpus, content_transformer(tolower))

corpus <- tm_map(corpus, removePunctuation)

corpus <- tm_map(corpus, removeNumbers)

corpus <- tm_map(corpus, removeWords, stopwords("en"))

corpus <- tm_map(corpus, stripWhitespace)

TF-IDF vectorization

document <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))

tfidfData <- as.data.frame(as.matrix(document))

tfidfData$Pre_Crash_Mode <- data$Pre_Crash_Mode

Splitting into train and test set

set.seed(123)
trainIndex <- createDataPartition(tfidfData$Pre_Crash_Mode, p = 0.75, list = FALSE)

train_data <- tfidfData[trainIndex,]

test_data <- tfidfData[-trainIndex,]

colnames(train_data) <- make.names(colnames(train_data))

colnames(test_data) <- make.names(colnames(test_data))

Applying class balancing and training model

# Class balancing

train_control <- trainControl(method = "cv", number = 7, sampling = "up")

# Training Logistic Regression with class balancing

log_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "glmnet", 
  trControl = train_control,
  tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.2, by = 0.01))
)

# Training Random Forest model using caret

rf_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "rf",         
  ntree = 150,           
  tuneGrid = data.frame(mtry = 4),  
  trControl = trainControl(method = "cv", number = 6)
)
#  Training a Gradient Boosting Machine (GBM) Model using caret to ensure compatibility with lime
gbm_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "gbm",   
  trControl = trainControl(method = "cv", number = 7),
  tuneGrid = expand.grid(
    n.trees = 100,          
    interaction.depth = 4,
    shrinkage = 0.2,        
    n.minobsinnode = 15)
  ,verbose = FALSE            
)

Predictions and accuracy

# Predictions

log_pred <- predict(log_model, test_data)

gbm_pred <- predict(gbm_model, test_data)

rf_pred <- predict(rf_model, test_data)

# Accuracy extraction
log_acc <- mean(log_pred == test_data$Pre_Crash_Mode)

gbm_acc <- mean(gbm_pred == test_data$Pre_Crash_Mode)

rf_acc <- mean(rf_pred == test_data$Pre_Crash_Mode)

cat("Accuracy of Regularized Logistic Regression:", log_acc, "\n")
## Accuracy of Regularized Logistic Regression: 0.9117647
cat("Accuracy of Gradient Boosting Machine:", gbm_acc, "\n")
## Accuracy of Gradient Boosting Machine: 0.9264706
cat("Random Forest Accuracy with tuned parameters:", rf_acc, "\n")
## Random Forest Accuracy with tuned parameters: 0.6470588

Explainable AI using LIME

explainer_rf <- lime(train_data[, -ncol(train_data)], rf_model)

explainer_gbm <- lime(train_data[, -ncol(train_data)], gbm_model)

explainer_log <- lime(train_data[, -ncol(train_data)], log_model)

Generating LIME explanations table

# Creating LIME Interpreter Objects
explainer_log <- lime::lime(train_data[, -ncol(train_data)], log_model)
explainer_rf <- lime::lime(train_data[, -ncol(train_data)], rf_model)
explainer_gbm <- lime::lime(train_data[, -ncol(train_data)], gbm_model)

# Interpreting Test Data with Interpreter Objects
# The first 8 samples were interpreted and the 12 most important features were extracted.

# Interpretation of the GLM model
explanation_log <- lime::explain(
  x = test_data[1:8, -ncol(test_data)],  # Test data, remove target columns
  explainer = explainer_log,
  n_labels = 1,
  n_features = 12
)

# Interpretation of the Random Forest Model
explanation_rf <- lime::explain(
  x = test_data[1:8, -ncol(test_data)],
  explainer = explainer_rf,
  n_labels = 1,
  n_features = 12
)

# Explanation of the GBM model
explanation_gbm <- lime::explain(
  x = test_data[1:8, -ncol(test_data)],
  explainer = explainer_gbm,
  n_labels = 1,
  n_features = 12
)

# Output Interpretation Results
cat("GLM Model Explanations:\n")
## GLM Model Explanations:
print(explanation_log)
## # A tibble: 96 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
##  2 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
##  3 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
##  4 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
##  5 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
##  6 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
##  7 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
##  8 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
##  9 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
## 10 classificat… 2     Auto…      0.655    0.702          0.0931            0.655
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
cat("\nRandom Forest Model Explanations:\n")
## 
## Random Forest Model Explanations:
print(explanation_rf)
## # A tibble: 96 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
##  2 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
##  3 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
##  4 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
##  5 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
##  6 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
##  7 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
##  8 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
##  9 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
## 10 classificat… 2     Auto…       0.86   0.0371           0.512            0.860
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
cat("\nGBM Model Explanations:\n")
## 
## GBM Model Explanations:
print(explanation_gbm)
## # A tibble: 96 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
##  2 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
##  3 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
##  4 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
##  5 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
##  6 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
##  7 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
##  8 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
##  9 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
## 10 classificat… 2     Auto…       1.00    0.130          -0.420             1.00
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>

Extracting and preparing the top 12 important features for all three models

# Extract and prepare the first 12 important features

# Logistic Regression
importance_log <- varImp(log_model)$importance %>%
  rownames_to_column(var = "feature") %>%
  arrange(desc(Overall)) %>%
  slice_head(n = 12)

# Random Forest
importance_rf <- varImp(rf_model)$importance %>%
  rownames_to_column(var = "feature") %>%
  arrange(desc(Overall)) %>%
  slice_head(n = 12)

# GBM
importance_gbm <- varImp(gbm_model)$importance %>%
  rownames_to_column(var = "feature") %>%
  arrange(desc(Overall)) %>%
  slice_head(n = 12)

# Visualisation functions to avoid duplicate code
plot_feature_importance <- function(importance_data, model_name, fill_color) {
  ggplot(importance_data, aes(x = reorder(feature, Overall), y = Overall, fill = model_name)) +
    geom_bar(stat = "identity", color = "#501214") +
    coord_flip() +
    scale_fill_manual(values = fill_color, name = "Model") +
    ggtitle(paste("Top 12 Features -", model_name)) +
    xlab("Feature") +
    ylab("Importance") +
    theme_minimal()
}

# Charting important features

# Logistic Regression
plot_feature_importance(importance_log, "Logistic Regression", "#AC9155")

# Random Forest
plot_feature_importance(importance_rf, "Random Forest", "#501214")

# GBM
plot_feature_importance(importance_gbm, "GBM", "#AC9155")