HW 4 AI

TASK 1

Loading the libraries

library(caret)

library(dplyr)

library(randomForest)

library(e1071)

library(gbm)

library(pROC)

library(knitr)

library(tidyverse)

library(tidytext)

library(tm)

library(glmnet)  

library(e1071)

library(randomForest)

library(lime)   

Loading the data

testdata <- read.csv("C:/Users/gyc22/Documents/AI/Files/CA_AV_CrashReportsSample.csv")

Data preprocessing

# Remove unnecessary columns

testdata <- testdata %>% select(-CrashNarrative, -File_name, -Manufacturer_Name, -Business_Name,-Date, -Time, -Location_Street, -Location_City,
-Location_County,-Location_State, -Location_Zip, -Latitude, -Longitude,-Name_Injury.Death, -Bicyclist_Injury.Death, -Property_Damage,
-Property_Owner_Name, -Otherparty_Vehicle_Year, -Otherparty_Vehicle_Model,
-Otherparty_Vehicle_State, -Vehicle_was_OtherParty,
-Involved_Party_Otherparty, -Driver, -Passenger, -Vehicle_Year_3rdparty,
-Model_3rdParty, -X3rdparty_Vehicle_State, -Vehicle_was_3rdParty,
-Involve_in_Accident_3rdParty, -Driver_3rdParty, -Passenger_3rdParty)

# Handling missing values

testdata <- na.omit(testdata)

# Converting categorical variables to factors

testdata$Pre_Crash_Mode <- as.factor(testdata$Pre_Crash_Mode)
testdata <- testdata %>% mutate_if(is.character, as.factor)

Feature Selection

nzvr <- nearZeroVar(testdata, saveMetrics = TRUE)
testdata <- testdata[, !nzvr$nzv]

Model development and training

# Model development
set.seed(123)
trainingIndex <- createDataPartition(testdata$Pre_Crash_Mode, p = 0.65, list = FALSE)
train <- testdata[trainingIndex, ]
test <- testdata[-trainingIndex, ]

# Training the model
# Decision Tree

tree_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rpart")

# Random Forest

rf_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rf",tuneGrid = expand.grid(mtry = c(3, 4, 5, 6)))

# GBM

gbm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "gbm", verbose = FALSE)

# SVM

svm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "svmRadial")

# kNN

knn_model <- train(Pre_Crash_Mode ~ ., data = train, method = "knn")

# GLM

log_model <- train(Pre_Crash_Mode ~ ., data = train, method = "glm")

Determing the best tuning parameter

print(log_model$bestTune)
##   parameter
## 1      none
print(rf_model$bestTune)
##   mtry
## 4    6
print(gbm_model$bestTune)
##   n.trees interaction.depth shrinkage n.minobsinnode
## 3     150                 1       0.1             10
print(svm_model$bestTune)
##        sigma C
## 3 0.06299541 1
print(knn_model$bestTune)
##   k
## 3 9
print(tree_model$bestTune)
##           cp
## 1 0.02083333

Predictions on the test set

predict_rf <- predict(rf_model, newdata = test)
predict_tree <- predict(tree_model, newdata = test)
predict_svm <- predict(svm_model, newdata = test)
predict_gbm <- predict(gbm_model, newdata = test)
predict_knn <- predict(knn_model, newdata = test)
predict_log <- predict(log_model, newdata = test)

Accuracy and Kappa for each model

# Initializing an empty data frame to store model performance metrics

model_metrics <- data.frame(Model = character(), Accuracy = numeric(), Kappa = numeric(), stringsAsFactors = FALSE)

# Generating confusion matrices and extracting Accuracy and Kappa for each model

# Random Forest

cm_rf <- confusionMatrix(predict_rf, test$Pre_Crash_Mode)
accuracy_rf <- cm_rf$overall["Accuracy"]
kappa_rf <- cm_rf$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "Random Forest", Accuracy = as.numeric(accuracy_rf), Kappa = as.numeric(kappa_rf)))

# Decision Tree

cm_tree <- confusionMatrix(predict_tree, test$Pre_Crash_Mode)
accuracy_tree <- cm_tree$overall["Accuracy"]
kappa_tree <- cm_tree$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "Decision Tree", Accuracy = as.numeric(accuracy_tree), Kappa = as.numeric(kappa_tree)))

# SVM

cm_svm <- confusionMatrix(predict_svm, test$Pre_Crash_Mode)
accuracy_svm <- cm_svm$overall["Accuracy"]
kappa_svm <- cm_svm$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "SVM", Accuracy = as.numeric(accuracy_svm), Kappa = as.numeric(kappa_svm)))

# GBM

cm_gbm <- confusionMatrix(predict_gbm, test$Pre_Crash_Mode)
accuracy_gbm <- cm_gbm$overall["Accuracy"]
kappa_gbm <- cm_gbm$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "GBM", Accuracy = as.numeric(accuracy_gbm), Kappa = as.numeric(kappa_gbm)))

# GLM

cm_log = confusionMatrix(predict_log, test$Pre_Crash_Mode)
accuracy_log <- cm_log$overall["Accuracy"]
kappa_log <- cm_log$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "GLM", Accuracy = as.numeric(accuracy_log), Kappa = as.numeric(kappa_log)))

# kNN

cm_knn <- confusionMatrix(predict_knn, test$Pre_Crash_Mode)
accuracy_knn <- cm_knn$overall["Accuracy"]
kappa_knn <- cm_knn$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "kNN", Accuracy = as.numeric(accuracy_knn), Kappa = as.numeric(kappa_knn)))

# Convert accuracy and kappa columns to numeric

model_metrics$Accuracy <- as.numeric(model_metrics$Accuracy)
model_metrics$Kappa <- as.numeric(model_metrics$Kappa)

Displaying accuracy and kappa in table

kable(model_metrics, caption = "Model Performance Metrics for Accuracy and Kappa")
Model Performance Metrics for Accuracy and Kappa
Model Accuracy Kappa
Random Forest 0.6896552 0.3140604
Decision Tree 0.6724138 0.3288672
SVM 0.6206897 0.1954603
GBM 0.6724138 0.3288672
GLM 0.5517241 0.0679852
kNN 0.7068966 0.3555556

Plotting the results using ggplot2

# Accuracy plot

ggplot(model_metrics, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  theme_bw()

# Kappa plot

ggplot(model_metrics, aes(x = Model, y = Kappa, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Model Kappa Comparison", x = "Model", y = "Kappa") +
  theme_minimal() +
  theme(legend.position = "none")

TASK 2

Loading the dataset and selecting required variables

newdata <- read.csv("C:/Users/gyc22/Documents/AI/Files/CA_AV_CrashReportsSample.csv")

newdata <- newdata %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()

Text preprocessing

corpusset <- VCorpus(VectorSource(newdata$CrashNarrative))

corpusset <- tm_map(corpusset, content_transformer(tolower))

corpusset <- tm_map(corpusset, removePunctuation)

corpusset <- tm_map(corpusset, removeNumbers)

corpusset <- tm_map(corpusset, removeWords, stopwords("en"))

corpusset <- tm_map(corpusset, stripWhitespace)

TF-IDF vectorization

document <- DocumentTermMatrix(corpusset, control = list(weighting = weightTfIdf))

tfidfData <- as.data.frame(as.matrix(document))

tfidfData$Pre_Crash_Mode <- newdata$Pre_Crash_Mode

Splitting into train and test set

set.seed(123)
trainIndex <- createDataPartition(tfidfData$Pre_Crash_Mode, p = 0.75, list = FALSE)

train_data <- tfidfData[trainIndex,]

test_data <- tfidfData[-trainIndex,]

colnames(train_data) <- make.names(colnames(train_data))

colnames(test_data) <- make.names(colnames(test_data))

Applying class balancing and training model

# Class balancing

train_control <- trainControl(method = "cv", number = 4, sampling = "up")

# Training Random Forest model using caret

rf_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "rf",         
  ntree = 150,           
  tuneGrid = data.frame(mtry = 4),  
  trControl = trainControl(method = "cv", number = 6)
)

# Training Logistic Regression with class balancing

log_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "glmnet", 
  trControl = train_control,
  tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.15, by = 0.01))
)

#  Training a Gradient Boosting Machine (GBM) Model using caret to ensure compatibility with lime

gbm_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "gbm",   
  trControl = trainControl(method = "cv", number = 7),
  tuneGrid = expand.grid(
    n.trees = 100,          
    interaction.depth = 4,
    shrinkage = 0.2,        
    n.minobsinnode = 15)
  ,verbose = FALSE            
)

Predictions and accuracy

# Predictions

log_predict <- predict(log_model, test_data)

rf_predict <- predict(rf_model, test_data)

gbm_predict <- predict(gbm_model, test_data)


# Accuracy extraction
log_acc <- mean(log_predict == test_data$Pre_Crash_Mode)

gbm_acc <- mean(gbm_predict == test_data$Pre_Crash_Mode)

rf_acc <- mean(rf_predict == test_data$Pre_Crash_Mode)

cat("Accuracy of Regularized Logistic Regression:", log_acc, "\n")
## Accuracy of Regularized Logistic Regression: 0.9264706
cat("Accuracy of Gradient Boosting Machine:", gbm_acc, "\n")
## Accuracy of Gradient Boosting Machine: 0.8970588
cat("Random Forest Accuracy with tuned parameters:", rf_acc, "\n")
## Random Forest Accuracy with tuned parameters: 0.6470588

Explainable AI using LIME

explain_rf <- lime(train_data[, -ncol(train_data)], rf_model)

explain_gbm <- lime(train_data[, -ncol(train_data)], gbm_model)

explain_log <- lime(train_data[, -ncol(train_data)], log_model)

Generating LIME explanations table

# LIME Explanations

explaining_log <- explain(test_data[1:8, -ncol(test_data)], explain_log, n_labels = 1, n_features = 12)

explaining_rf <- explain(test_data[1:8, -ncol(test_data)], explain_rf, n_labels = 1, n_features = 12)

explaining_gbm <- explain(test_data[1:8, -ncol(test_data)], explain_gbm, n_labels = 1, n_features = 12)

# Print explanations for GLM

print(explaining_log)
## # A tibble: 96 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
##  2 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
##  3 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
##  4 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
##  5 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
##  6 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
##  7 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
##  8 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
##  9 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
## 10 classificat… 2     Auto…      0.914    0.389          0.0874            0.914
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
# Explanations for RF

print(explaining_rf)
## # A tibble: 96 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
##  2 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
##  3 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
##  4 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
##  5 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
##  6 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
##  7 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
##  8 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
##  9 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
## 10 classificat… 2     Auto…       0.84  0.00951           0.794            0.840
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
# Explanations for GBM

print(explaining_gbm)
## # A tibble: 96 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 2     Auto…       1.00    0.161           0.606             1.00
##  2 classificat… 2     Auto…       1.00    0.161           0.606             1.00
##  3 classificat… 2     Auto…       1.00    0.161           0.606             1.00
##  4 classificat… 2     Auto…       1.00    0.161           0.606             1.00
##  5 classificat… 2     Auto…       1.00    0.161           0.606             1.00
##  6 classificat… 2     Auto…       1.00    0.161           0.606             1.00
##  7 classificat… 2     Auto…       1.00    0.161           0.606             1.00
##  8 classificat… 2     Auto…       1.00    0.161           0.606             1.00
##  9 classificat… 2     Auto…       1.00    0.161           0.606             1.00
## 10 classificat… 2     Auto…       1.00    0.161           0.606             1.00
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>

Extracting and preparing the top 12 important features for all three models

# Extract and prepare the top 12 important features for Random Forest

importance_rf <- varImp(rf_model)$importance %>%
  rownames_to_column(var = "feature") %>%
  arrange(desc(Overall)) %>%
  slice_head(n = 12)

# Extract and prepare the top 12 important features for GBM

importance_gbm <- varImp(gbm_model)$importance %>%
  rownames_to_column(var = "feature") %>%
  arrange(desc(Overall)) %>%
  slice_head(n = 12)

# Extract and prepare the top 12 important features for Logistic Regression

importance_log <- varImp(log_model)$importance %>%
  rownames_to_column(var = "feature") %>%
  arrange(desc(Overall)) %>%
  slice_head(n = 12)

# Plot for Random Forest
ggplot(importance_rf, aes(x = reorder(feature, Overall), y = Overall, fill = "Random Forest")) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = "red", name = "Model") +
  ggtitle("Top 12 Features - Random Forest") +
  xlab("Feature") +
  ylab("Importance") +
  theme_minimal()

# Plot for Logistic Regression

ggplot(importance_log, aes(x = reorder(feature, Overall), y = Overall, fill = "Logistic Regression")) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = "pink", name = "Model") +
  ggtitle("Top 12 Features - Logistic Regression") +
  xlab("Feature") +
  ylab("Importance") +
  theme_minimal()

# Plot for GBM

ggplot(importance_gbm, aes(x = reorder(feature, Overall), y = Overall, fill = "GBM")) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  scale_fill_manual(values = "green", name = "Model") +
  ggtitle("Top 12 Features - GBM") +
  xlab("Feature") +
  ylab("Importance") +
  theme_minimal()