HW04

knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  fig.height = 8,
  fig.width = 18
)

Task 1: ML model without crash narratives

# Load necessary libraries
library(caret)
library(randomForest)
library(dplyr)
library(ggplot2)
library(naivebayes)  # For Naive Bayes
library(gbm)         # For GBM
library(pROC)        # For ROC and AUC

# Load and preprocess data
data <- read.csv("C:/Users/xuw12/OneDrive - Texas State University/Fall 2024/CE 7393/HW4/CA_AV_CrashReportsSample_v2.csv")
data <- subset(data, select = -c(CrashNarrative))  # Remove CrashNarrative column

# Handle missing values
data <- na.omit(data)

# Separate predictors and target
predictors <- subset(data, select = -Pre_Crash_Mode)  # Exclude Pre_Crash_Mode from predictors
target <- data$Pre_Crash_Mode  # Save target variable

# Encode categorical predictors only
dummies <- dummyVars("~ .", data = predictors)
predictors_encoded <- predict(dummies, newdata = predictors)
predictors_encoded <- as.data.frame(predictors_encoded)

# Combine encoded predictors and target back into one dataset
data_encoded <- cbind(predictors_encoded, Pre_Crash_Mode = target)
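Tree-based learners handle the full one-hot encoding above, but if perfect collinearity among dummy columns ever mattered (e.g., for a linear model), dummyVars() also accepts fullRank = TRUE to drop one reference level per factor. A minimal sketch, not used in the rest of this pipeline:

# Optional full-rank encoding (sketch; the pipeline below keeps the full encoding)
dummies_fr <- dummyVars("~ .", data = predictors, fullRank = TRUE)
predictors_fullrank <- as.data.frame(predict(dummies_fr, newdata = predictors))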

# Split data into training and testing sets
set.seed(123)
trainIndex <- createDataPartition(data_encoded$Pre_Crash_Mode, p = 0.7, list = FALSE)
trainData <- data_encoded[trainIndex,]
testData <- data_encoded[-trainIndex,]

# Define control for cross-validation
control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = defaultSummary)
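defaultSummary tunes on Accuracy and Kappa; since classProbs = TRUE is already set and the target is two-class, caret could instead tune on ROC with twoClassSummary. A sketch of that alternative (not used below):

# Alternative control that tunes on ROC instead of Accuracy (sketch)
control_roc <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
# e.g. train(Pre_Crash_Mode ~ ., data = trainData, method = "rf", trControl = control_roc, metric = "ROC")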

# Initialize results data frame
results <- data.frame(Model = character(), Accuracy = numeric(), Precision = numeric(), Recall = numeric(), F1 = numeric(), AUC = numeric(), stringsAsFactors = FALSE)

# Function to calculate evaluation metrics
evaluate_model <- function(pred, prob, true_labels) {
  conf <- confusionMatrix(pred, true_labels)
  accuracy <- conf$overall['Accuracy']
  precision <- mean(conf$byClass['Pos Pred Value'], na.rm = TRUE)
  recall <- mean(conf$byClass['Sensitivity'], na.rm = TRUE)
  f1 <- 2 * (precision * recall) / (precision + recall)
  auc <- auc(true_labels, prob[,2])  # Calculate AUC for the second class
  return(c(accuracy, precision, recall, f1, auc))
}
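evaluate_model() assumes a binary target: conf$byClass is then a named vector and prob has two columns. If Pre_Crash_Mode ever had more than two classes, conf$byClass would become a matrix and pROC::multiclass.roc() would replace auc(). A hypothetical multiclass variant, for illustration only:

# Hypothetical multiclass variant (sketch; the target in this assignment is binary)
evaluate_model_multiclass <- function(pred, prob, true_labels) {
  conf <- confusionMatrix(pred, true_labels)
  accuracy <- conf$overall['Accuracy']
  precision <- mean(conf$byClass[, 'Pos Pred Value'], na.rm = TRUE)  # macro-averaged over classes
  recall <- mean(conf$byClass[, 'Sensitivity'], na.rm = TRUE)        # macro-averaged over classes
  f1 <- 2 * (precision * recall) / (precision + recall)
  # recent pROC versions accept a matrix of class probabilities (columns named by class)
  auc <- as.numeric(multiclass.roc(true_labels, prob)$auc)
  return(c(accuracy, precision, recall, f1, auc))
}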

Gradient boosting machine

# Gradient Boosting Machine (GBM) Model
gbm_grid <- expand.grid(interaction.depth = c(1, 3, 5), n.trees = c(50, 100, 150), shrinkage = 0.1, n.minobsinnode = 10)
gbm_model <- train(Pre_Crash_Mode ~ ., data = trainData, method = "gbm", trControl = control, tuneGrid = gbm_grid, verbose = FALSE)
gbm_pred <- predict(gbm_model, testData)
gbm_prob <- predict(gbm_model, testData, type = "prob")

# Ensure levels match for GBM predictions and actual labels
combined_levels <- union(levels(factor(gbm_pred)), levels(factor(testData$Pre_Crash_Mode)))
gbm_pred <- factor(gbm_pred, levels = combined_levels)
testData$Pre_Crash_Mode <- factor(testData$Pre_Crash_Mode, levels = combined_levels)

# Store GBM results
gbm_metrics <- evaluate_model(gbm_pred, gbm_prob, testData$Pre_Crash_Mode)
results <- rbind(results, data.frame(Model = "GBM", Accuracy = gbm_metrics[1], Precision = gbm_metrics[2], Recall = gbm_metrics[3], F1 = gbm_metrics[4], AUC = gbm_metrics[5]))

Random forest

# Random Forest Model
rf_model <- train(Pre_Crash_Mode ~ ., data = trainData, method = "rf", trControl = control)
rf_pred <- predict(rf_model, testData)
rf_prob <- predict(rf_model, testData, type = "prob")

# Ensure levels match
combined_levels <- union(levels(factor(rf_pred)), levels(factor(testData$Pre_Crash_Mode)))
rf_pred <- factor(rf_pred, levels = combined_levels)
testData$Pre_Crash_Mode <- factor(testData$Pre_Crash_Mode, levels = combined_levels)

# Store Random Forest results
rf_metrics <- evaluate_model(rf_pred, rf_prob, testData$Pre_Crash_Mode)
results <- rbind(results, data.frame(Model = "Random Forest", Accuracy = rf_metrics[1], Precision = rf_metrics[2], Recall = rf_metrics[3], F1 = rf_metrics[4], AUC = rf_metrics[5]))

Naive Bayes

# Naive Bayes Model
nb_model <- train(Pre_Crash_Mode ~ ., data = trainData, method = "naive_bayes", trControl = control)
nb_pred <- predict(nb_model, testData)
nb_prob <- predict(nb_model, testData, type = "prob")

# Ensure levels match
nb_pred <- factor(nb_pred, levels = combined_levels)

# Store Naive Bayes results
nb_metrics <- evaluate_model(nb_pred, nb_prob, testData$Pre_Crash_Mode)
results <- rbind(results, data.frame(Model = "Naive Bayes", Accuracy = nb_metrics[1], Precision = nb_metrics[2], Recall = nb_metrics[3], F1 = nb_metrics[4], AUC = nb_metrics[5]))

# Convert Model to factor
results$Model <- as.factor(results$Model)
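Before plotting, the metric table itself can be printed for the report; a small sketch using knitr (already available from the setup chunk):

# Optional: tabulate the test-set metrics (sketch)
knitr::kable(results, digits = 3, row.names = FALSE)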

Model comparison

# Plotting model comparison
custom_colors <- c("Random Forest" = "azure3", "GBM" = "darkkhaki", "Naive Bayes" = "salmon")

# Accuracy Comparison Plot
ggplot(results, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Model Comparison: Accuracy") +
  xlab("Model") +
  ylab("Accuracy") +
  scale_fill_manual(values = custom_colors) +
  theme_bw(base_size = 20)

# Precision Comparison Plot
ggplot(results, aes(x = Model, y = Precision, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Model Comparison: Precision") +
  xlab("Model") +
  ylab("Precision") +
  scale_fill_manual(values = custom_colors) +
  theme_bw(base_size = 20)

# Recall Comparison Plot
ggplot(results, aes(x = Model, y = Recall, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Model Comparison: Recall") +
  xlab("Model") +
  ylab("Recall") +
  scale_fill_manual(values = custom_colors) +
  theme_bw(base_size = 20)

# F1 Score Comparison Plot
ggplot(results, aes(x = Model, y = F1, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Model Comparison: F1 Score") +
  xlab("Model") +
  ylab("F1 Score") +
  scale_fill_manual(values = custom_colors) +
  theme_bw(base_size = 20)

# AUC Comparison Plot
ggplot(results, aes(x = Model, y = AUC, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Model Comparison: AUC") +
  xlab("Model") +
  ylab("AUC") +
  scale_fill_manual(values = custom_colors) +
  theme_bw(base_size = 20)

# Plot ROC Curves for Each Model (pROC uses base graphics, so no ggplot theme is added)
par(mfrow = c(2, 2))   # Set layout for 2x2 plots

roc_rf <- roc(testData$Pre_Crash_Mode, rf_prob[,2])
plot(roc_rf, main = "Random Forest ROC", col = "azure3")
roc_gbm <- roc(testData$Pre_Crash_Mode, gbm_prob[,2])
plot(roc_gbm, main = "GBM ROC", col = "darkkhaki")
roc_nb <- roc(testData$Pre_Crash_Mode, nb_prob[,2])
plot(roc_nb, main = "Naive Bayes ROC", col = "salmon")

Task 2: Explainable AI with crash narratives

# Load necessary libraries
library(caret)
library(text2vec)
library(glmnet)  # For logistic regression
library(lime)    # For explainability
library(wordcloud)
library(tm)
library(ggplot2)

# Load the data
data <- read.csv("C:/Users/xuw12/OneDrive - Texas State University/Fall 2024/CE 7393/HW4/CA_AV_CrashReportsSample_v2.csv")
text_data <- data[c("CrashNarrative", "Pre_Crash_Mode")]

# Ensure no missing values in the relevant columns
text_data <- na.omit(text_data)

# Preprocess text data
text_data$CrashNarrative <- tolower(text_data$CrashNarrative)  # Lowercase the text
text_data$CrashNarrative <- gsub("[^[:alnum:] ]", " ", text_data$CrashNarrative)  # Remove punctuation

# Split the data into training and testing sets
set.seed(123)
trainIndex <- createDataPartition(text_data$Pre_Crash_Mode, p = 0.7, list = FALSE)
trainData <- text_data[trainIndex, ]
testData <- text_data[-trainIndex, ]

# Text vectorization: build the document-term matrix (raw term counts; TF-IDF weighting is sketched below)
it <- itoken(trainData$CrashNarrative, progressbar = FALSE)
vocab <- create_vocabulary(it)
vectorizer <- vocab_vectorizer(vocab)
dtm_train <- create_dtm(it, vectorizer)

# Create test data matrix
it_test <- itoken(testData$CrashNarrative, progressbar = FALSE)
dtm_test <- create_dtm(it_test, vectorizer)

# Convert dtm_train and dtm_test to data frames for compatibility with lime
dtm_train_df <- as.data.frame(as.matrix(dtm_train))
dtm_test_df <- as.data.frame(as.matrix(dtm_test))
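Note that create_dtm() produces raw term counts; in text2vec, TF-IDF weighting is applied with a separate TfIdf transformer. A minimal sketch of that step, in case weighted features are preferred (the model below is fit on the raw counts):

# Optional TF-IDF weighting with text2vec (sketch)
tfidf <- TfIdf$new()
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)   # fit IDF weights on the training documents only
dtm_test_tfidf <- transform(dtm_test, tfidf)         # reuse the fitted weights on the test set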

# Train a regularized (multinomial) logistic regression model on the document-term features
model <- cv.glmnet(dtm_train, as.factor(trainData$Pre_Crash_Mode), family = "multinomial")
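Pre_Crash_Mode has only two levels here (Autonomous vs. Conventional, per the confusion matrix below), so a binomial fit would also work and yields a single probability column; a sketch, not used downstream:

# Equivalent two-class fit (sketch; the multinomial fit above is what the rest of the code uses)
model_bin <- cv.glmnet(dtm_train, as.factor(trainData$Pre_Crash_Mode), family = "binomial")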

# ---- Training Process Plots ----

# 1. Cross-Validation Error Plot
plot(model)  # Cross-validated multinomial deviance (the default type.measure) across lambda values

# 2. Coefficient Path Plot
plot(model$glmnet.fit, xvar = "lambda", label = TRUE)  # Coefficient paths across lambda, one panel per class

# 3. Custom Cross-Validation Error Plot
# Extract the cross-validated mean error (multinomial deviance) and its standard error
cv_results <- data.frame(
  lambda = model$lambda,
  mean_error = model$cvm,
  error_std = model$cvsd
)

# Plot the cross-validated mean deviance with error bars across lambda values
ggplot(cv_results, aes(x = log(lambda), y = mean_error)) +
  geom_line(color = "blue") +
  geom_point() +
  geom_errorbar(aes(ymin = mean_error - error_std, ymax = mean_error + error_std), width = 0.1) +
  labs(title = "Cross-Validated Multinomial Deviance across Lambda",
       x = "Log(Lambda)",
       y = "Mean Cross-Validated Deviance") +
  theme_bw(base_size = 22)

# ---- Additional Analysis ----

# Predict on test data
pred <- predict(model, dtm_test, s = "lambda.min", type = "class")
conf_matrix <- confusionMatrix(as.factor(pred), as.factor(testData$Pre_Crash_Mode))
print("Logistic Regression Results for Text Data:")
## [1] "Logistic Regression Results for Text Data:"
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Autonomous Conventional
##   Autonomous           36            2
##   Conventional          2           19
##                                           
##                Accuracy : 0.9322          
##                  95% CI : (0.8354, 0.9812)
##     No Information Rate : 0.6441          
##     P-Value [Acc > NIR] : 2.586e-07       
##                                           
##                   Kappa : 0.8521          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9474          
##             Specificity : 0.9048          
##          Pos Pred Value : 0.9474          
##          Neg Pred Value : 0.9048          
##              Prevalence : 0.6441          
##          Detection Rate : 0.6102          
##    Detection Prevalence : 0.6441          
##       Balanced Accuracy : 0.9261          
##                                           
##        'Positive' Class : Autonomous      
## 
# ---- Additional Plots ----

# 1. Word Cloud of Important Words in CrashNarrative
text_corpus <- Corpus(VectorSource(text_data$CrashNarrative))
text_corpus <- tm_map(text_corpus, content_transformer(tolower))
text_corpus <- tm_map(text_corpus, removePunctuation)
text_corpus <- tm_map(text_corpus, removeNumbers)
text_corpus <- tm_map(text_corpus, removeWords, stopwords("en"))

# Generate word cloud
wordcloud(text_corpus, max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))

# 2. Confusion Matrix Heatmap
conf_matrix_table <- as.data.frame(conf_matrix$table)
ggplot(conf_matrix_table, aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(title = "Confusion Matrix Heatmap", x = "Predicted", y = "Actual") +
  theme_bw(base_size = 22)

# ---- Explainable AI with LIME ----

# Custom model_type function for glmnet
model_type.cv.glmnet <- function(x, ...) {
  return("classification")
}

# Custom predict_model function for glmnet
predict_model.cv.glmnet <- function(x, newdata, type, ...) {
  # For a multinomial fit, predict(type = "response") returns an n x classes x 1 array;
  # drop the trailing lambda dimension so lime receives a plain probability data frame
  pred_probs <- predict(x, as.matrix(newdata), s = "lambda.min", type = "response")
  pred_probs <- pred_probs[, , 1, drop = FALSE]
  dim(pred_probs) <- dim(pred_probs)[1:2]
  colnames(pred_probs) <- levels(as.factor(trainData$Pre_Crash_Mode))  # Ensure column names match class labels
  as.data.frame(pred_probs)
}
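Before handing the wrapper to lime, a quick direct call (a sketch) can confirm it returns one probability column per class, with rows summing to 1:

# Sanity check of the custom wrapper (sketch)
head(predict_model.cv.glmnet(model, dtm_test_df[1:3, ], type = "prob"))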

# lime dispatches on the model's class, so the model_type()/predict_model() methods above are picked up
class(model) <- unique(c("cv.glmnet", class(model)))  # keep "cv.glmnet" first without duplicating it
explainer <- lime(dtm_train_df, model, bin_continuous = TRUE)

# Explain the predictions for the first 5 instances in the test set
explanation <- explain(dtm_test_df[1:5, ], explainer, n_labels = 1, n_features = 5)

# Plot the explanations
plot_features(explanation) + theme_bw(base_size = 16) +
  labs(title = "Feature Contributions for Predictions")