HW_04 by Momin

Submitted by Md Shah Mominul Islam Momin; NetID: mif37

Task_01: Develop Machine Learning Models To Classify Pre_Crash_Mode From The Structured Data

Installation of the Packages

#Install Packages
#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("gbm")
#install.packages("class")
#install.packages("e1071")
#install.packages("caret")
#install.packages("rpart")
#install.packages("randomForest")
#install.packages("reshape2")
#install.packages("tidyverse")
#install.packages("tidytext")
#install.packages("tm")
#install.packages("glmnet")
#install.packages("lime")

Loading the Packages

# Load necessary libraries
library(ggplot2)
library(lattice)
library(dplyr)
library(tidyr)
library(gbm)
library(class) # For KNN Model
library(e1071) # For SVM Model
library(caret)  # For model training and evaluation
library(rpart)  # For decision trees
library(randomForest)  # For random forests
library(reshape2)
library(tidyverse)
library(tidytext)
library(tm)
library(glmnet)
library(lime) #For Explainable AI Results

Data Processing

# Read the CSV file
data <- read.csv("D:/TXST/OneDrive - Texas State University/1_TXST/CE7393/HW4/CA_AV_CrashReportsSample.csv" , stringsAsFactors = FALSE)

# Specify the columns to remove
columns_to_remove <- c(
  "CrashNarrative", "Manufacturer_Name", "File_name", "Business_Name", "Year", 
  "Date", "Time", "Veh1_Year", "Veh1_Make", "Veh1_Model", "Veh1_Register_State", 
  "Location_Street", "Location_City", "Location_County", "Location_State", 
  "Location_Zip", "Latitude", "Longitude", "Involved_in_the_Accident", 
  "IS_VRU_FromNarra", "IS_VRU", "Otherparty_Vehicle_Year", 
  "Otherparty_Vehicle_Model", "Otherparty_Vehicle_State", "Vehicle_was_OtherParty", 
  "Involved_Party_Otherparty", "Driver", "Passenger", "Vehicle_Year_3rdparty", 
  "Model_3rdParty", "X3rdparty_Vehicle_State", "Vehicle_was_3rdParty", 
  "Involve_in_Accident_3rdParty", "Driver_3rdParty", "Passenger_3rdParty", 
  "Movement_Bef_Veh3", "Collision_Type", "Name_Injury.Death", 
  "Bicyclist_Injury.Death", "Property_Damage", "Property_Owner_Name" , "Vehicle_Was" , 
  "Roadway.Conditions"
)

# Remove the specified columns
cleaned_data <- data %>% select(-all_of(columns_to_remove))

# Check for missing data
missing_data <- colSums(is.na(cleaned_data))

# Print columns with missing data and their counts
missing_data <- missing_data[missing_data > 0]
if (length(missing_data) > 0) {
  print("Columns with missing data and their counts:")
  print(missing_data)
} else {
  print("No missing data found in the cleaned dataset.")
}
## [1] "Columns with missing data and their counts:"
##     No_Vehicle_Involved Describe_Vehicle_Damage                 Weather 
##                       1                       1                       2 
##                Lighting         Roadway_Surface       Movement_Bef_Veh1 
##                       1                       2                       2 
##       Movement_Bef_Veh2         Collision_Type1 
##                      26                      40
# Replace blank cells with NA to standardize missing values
# (the NA counts above cover only true NAs; blanks are converted here, before rows are dropped)
cleaned_data[cleaned_data == ""] <- NA

# Remove rows with any NA values
# This will remove all rows where there is at least one NA
final_data <- cleaned_data %>% drop_na()

# Check the cleaned dataset
head(final_data)
##   No_Vehicle_Involved Describe_Vehicle_Damage Pre_Crash_Mode Weather
## 1                   2                Moderate   Conventional   Clear
## 2                   2                   Minor     Autonomous   Clear
## 3                   2                    None   Conventional  Cloudy
## 4                   2                Moderate   Conventional   Clear
## 5                   2                Moderate     Autonomous   Clear
## 6                   2                Moderate   Conventional   Clear
##               Lighting Roadway_Surface   Movement_Bef_Veh1
## 1 Dark - Street Lights             Dry Proceeding Straight
## 2             Daylight             Dry    Slowing/Stopping
## 3             Daylight             Wet             Stopped
## 4             Daylight             Dry Proceeding Straight
## 5             Daylight             Dry             Stopped
## 6             Daylight             Dry Proceeding Straight
##                           Movement_Bef_Veh2 Collision_Type1
## 1 Making Right Turn/Xing into opposing lane      Side Swipe
## 2                       Proceeding Straight        Rear End
## 3                                   Backing         Head-On
## 4                            Changing Lanes      Side Swipe
## 5                       Proceeding Straight        Rear End
## 6                       Proceeding Straight       Broadside
# Check if any NA values remain
remaining_na <- colSums(is.na(final_data))
print("Remaining NA values after cleaning (should be zero if removed/replaced):")
## [1] "Remaining NA values after cleaning (should be zero if removed/replaced):"
print(remaining_na[remaining_na > 0])
## named numeric(0)
# Remove rows containing singleton values (levels that occur only once in a column);
# such one-off categories would land in only one of the train/test sets and break prediction
for (col in names(final_data)) {
  # Identify values that occur exactly once in this column
  unique_values <- final_data %>%
    group_by(!!sym(col)) %>%
    filter(n() == 1) %>%
    pull(!!sym(col))
  
  # Drop rows whose value in this column is one of those singletons
  final_data <- final_data %>%
    filter(!(!!sym(col) %in% unique_values))
}
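As a quick sanity check, the sketch below (assuming final_data from the loop above) counts the values that still occur exactly once in each column. Because removals in later columns can reintroduce singletons in earlier ones, small nonzero counts are possible.

# Count remaining singleton values per column (should be zero or near zero)
sapply(final_data, function(x) sum(table(x) == 1))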


# Save the cleaned data to a new CSV file
write.csv(final_data, "Final_Data.csv", row.names = FALSE)


# Identify low-variance predictors
low_variance_cols <- nearZeroVar(final_data, saveMetrics = TRUE)

# View which columns are low-variance
print(low_variance_cols)
##                         freqRatio percentUnique zeroVar   nzv
## No_Vehicle_Involved     31.000000      1.522843   FALSE  TRUE
## Describe_Vehicle_Damage  4.516129      2.030457   FALSE FALSE
## Pre_Crash_Mode           1.984848      1.015228   FALSE FALSE
## Weather                 11.600000      1.522843   FALSE FALSE
## Lighting                 1.750000      2.030457   FALSE FALSE
## Roadway_Surface         18.500000      1.522843   FALSE FALSE
## Movement_Bef_Veh1        1.649123      4.568528   FALSE FALSE
## Movement_Bef_Veh2        6.250000      7.106599   FALSE FALSE
## Collision_Type1          1.407407      2.538071   FALSE FALSE
# Remove the low-variance columns (only No_Vehicle_Involved qualifies here);
# the filtered copy is saved for reference, while the modeling below continues with final_data
data_filtered <- final_data[, -nearZeroVar(final_data)]



# Save the cleaned data to a new CSV file
write.csv(data_filtered, "Filtered_Data.csv", row.names = FALSE)
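For reference, nearZeroVar() flags a predictor when its freqRatio (frequency of the most common value over the second most common) exceeds freqCut and its percentUnique falls below uniqueCut. In the metrics above, No_Vehicle_Involved is the only column meeting both criteria (freqRatio 31 > 19 and percentUnique 1.52 < 10). The call above uses caret's defaults, made explicit in this equivalent sketch:

# Equivalent call with caret's default thresholds spelled out
nearZeroVar(final_data, freqCut = 95/5, uniqueCut = 10)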


# Convert target variable to factor if not already done
final_data$Pre_Crash_Mode <- as.factor(final_data$Pre_Crash_Mode)

Data Splitting for Training and Test

# Split the data into training and testing sets
set.seed(123)
train_index <- createDataPartition(final_data$Pre_Crash_Mode, p = 0.7, list = FALSE)
train_data <- final_data[train_index, ]
test_data <- final_data[-train_index, ]
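Because createDataPartition() samples within each class, the split is stratified. A quick check (a sketch, reusing the objects above) is to compare class proportions, which should be nearly identical:

# Class proportions in the training and test sets
prop.table(table(train_data$Pre_Crash_Mode))
prop.table(table(test_data$Pre_Crash_Mode))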

Logistic Regression Model

# Logistic Regression Model
logistic_model <- glm(Pre_Crash_Mode ~ ., data = train_data, family = binomial)
summary(logistic_model)

# Predictions
logistic_pred <- predict(logistic_model, test_data, type = "response")
logistic_pred_class <- ifelse(logistic_pred > 0.5, levels(train_data$Pre_Crash_Mode)[2], levels(train_data$Pre_Crash_Mode)[1])

# Confusion Matrix
confusionMatrix(factor(logistic_pred_class, levels = levels(train_data$Pre_Crash_Mode)), test_data$Pre_Crash_Mode)
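Note that glm() models the probability of the second factor level, so the 0.5 cutoff above maps high probabilities to that level by default. A minimal sketch of sweeping the cutoff (reusing logistic_pred from above; the grid is illustrative, and a proper cutoff search would use a validation split rather than the test set):

# Test accuracy at a few candidate probability cutoffs
sapply(seq(0.3, 0.7, by = 0.1), function(t) {
  pred <- ifelse(logistic_pred > t,
                 levels(train_data$Pre_Crash_Mode)[2],
                 levels(train_data$Pre_Crash_Mode)[1])
  mean(pred == test_data$Pre_Crash_Mode)
})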

Decision Tree Model

# Decision Tree Model
tree_model <- rpart(Pre_Crash_Mode ~ ., data = train_data)
print(tree_model)

# Predictions
tree_pred <- predict(tree_model, test_data, type = "class")

# Confusion Matrix
confusionMatrix(tree_pred, test_data$Pre_Crash_Mode)

Random Forest Model

# Random Forest Model
rf_model <- randomForest(Pre_Crash_Mode ~ ., data = train_data)
print(rf_model)

# Predictions
rf_pred <- predict(rf_model, test_data)

# Confusion Matrix
confusionMatrix(rf_pred, test_data$Pre_Crash_Mode)

SVM Radial Model

# SVM Radial Model
svm_model <- svm(Pre_Crash_Mode ~ ., data = train_data, kernel = "radial")

# Predictions
svm_pred <- predict(svm_model, test_data)

# Confusion Matrix
confusionMatrix(svm_pred, test_data$Pre_Crash_Mode)
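The radial kernel scores similarity as K(x, x') = exp(-gamma * ||x - x'||^2); in e1071, gamma defaults to one over the number of model-matrix columns and cost to 1. A sketch of tuning both with the package's built-in 10-fold cross-validated grid search (the grids are illustrative, not tuned values):

# Grid search over gamma and cost (e1071::tune.svm)
svm_tune <- tune.svm(Pre_Crash_Mode ~ ., data = train_data,
                     gamma = c(0.01, 0.1, 1), cost = c(1, 10, 100))
summary(svm_tune)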

Model Accuracy

# Initialize a data frame to store the results
accuracy_results <- data.frame(
  Model = character(),
  Accuracy = numeric(),
  stringsAsFactors = FALSE
)

# Logistic Regression Model
logistic_model <- glm(Pre_Crash_Mode ~ ., data = train_data, family = binomial)
logistic_pred <- predict(logistic_model, test_data, type = "response")
logistic_pred_class <- ifelse(logistic_pred > 0.5, levels(train_data$Pre_Crash_Mode)[2], levels(train_data$Pre_Crash_Mode)[1])
logistic_cm <- confusionMatrix(factor(logistic_pred_class, levels = levels(train_data$Pre_Crash_Mode)), test_data$Pre_Crash_Mode)
#Kappa
logistic_kappa <- logistic_cm$overall['Kappa']

# Store Logistic Regression accuracy
accuracy_results <- rbind(accuracy_results, data.frame(
  Model = "Logistic Regression",
  Accuracy = logistic_cm$overall['Accuracy']
))

# Decision Tree Model
tree_model <- rpart(Pre_Crash_Mode ~ ., data = train_data)
tree_pred <- predict(tree_model, test_data, type = "class")
tree_cm <- confusionMatrix(tree_pred, test_data$Pre_Crash_Mode)
#Kappa
tree_kappa <- tree_cm$overall['Kappa']

# Store Decision Tree accuracy
accuracy_results <- rbind(accuracy_results, data.frame(
  Model = "Decision Tree",
  Accuracy = tree_cm$overall['Accuracy']
))

# Random Forest Model
rf_model <- randomForest(Pre_Crash_Mode ~ ., data = train_data)
rf_pred <- predict(rf_model, test_data)
rf_cm <- confusionMatrix(rf_pred, test_data$Pre_Crash_Mode)
#Kappa
rf_kappa <- rf_cm$overall['Kappa']

# Store Random Forest accuracy
accuracy_results <- rbind(accuracy_results, data.frame(
  Model = "Random Forest",
  Accuracy = rf_cm$overall['Accuracy']
))

# Support Vector Machine (SVM Radial)
svm_model <- svm(Pre_Crash_Mode ~ ., data = train_data, kernel = "radial")
svm_pred <- predict(svm_model, test_data)
svm_cm <- confusionMatrix(svm_pred, test_data$Pre_Crash_Mode)
#Kappa
svm_kappa <- svm_cm$overall['Kappa']

# Store SVM accuracy
accuracy_results <- rbind(accuracy_results, data.frame(
  Model = "SVM Radial",
  Accuracy = svm_cm$overall['Accuracy']
))

# Print the accuracy results
print("Model Accuracy Results:")
## [1] "Model Accuracy Results:"
print(accuracy_results)
##                         Model  Accuracy
## Accuracy  Logistic Regression 0.6379310
## Accuracy1       Decision Tree 0.6206897
## Accuracy2       Random Forest 0.5517241
## Accuracy3          SVM Radial 0.6724138
# Rebuild the accuracy data frame with clean row names for plotting
accuracy_results <- data.frame(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest", "SVM Radial"),
  Accuracy = c(logistic_cm$overall['Accuracy'], tree_cm$overall['Accuracy'], rf_cm$overall['Accuracy'], svm_cm$overall['Accuracy'])
)

Model Accuracy Comparison Plot

ggplot(accuracy_results, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity") +
  labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Class Distribution Plot

ggplot(final_data, aes(x = Pre_Crash_Mode)) +
  geom_bar(fill = "blue") +
  labs(title = "Class Distribution in Pre_Crash_Mode", x = "Class", y = "Count") +
  theme_minimal()

Confusion Matrix Heatmap Plot

confusion_data <- function(cm) {
  as.data.frame(cm$table) %>%
    mutate(Reference = factor(Reference, levels = unique(test_data$Pre_Crash_Mode)),
           Prediction = factor(Prediction, levels = unique(test_data$Pre_Crash_Mode)))
}

# Plot function for confusion matrix heatmaps
plot_confusion_matrix <- function(cm_data, model_name) {
  ggplot(cm_data, aes(x = Reference, y = Prediction)) +
    geom_tile(aes(fill = Freq), color = "white") +
    scale_fill_gradient(low = "lightblue", high = "red") +
    labs(title = paste(model_name, "Confusion Matrix"), x = "Actual", y = "Predicted") +
    theme_minimal()
}

# Plot for each model
plot_confusion_matrix(confusion_data(logistic_cm), "Logistic Regression")

plot_confusion_matrix(confusion_data(tree_cm), "Decision Tree")

plot_confusion_matrix(confusion_data(rf_cm), "Random Forest")

plot_confusion_matrix(confusion_data(svm_cm), "SVM Radial")

Plot of Kappa Values

# Kappa values from the models
logistic_kappa <- logistic_cm$overall['Kappa']
tree_kappa <- tree_cm$overall['Kappa']
rf_kappa <- rf_cm$overall['Kappa']
svm_kappa <- svm_cm$overall['Kappa']

# Create a data frame from the Kappa values
kappa_values <- data.frame(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest", "SVM"),
  Kappa = c(logistic_kappa, tree_kappa, rf_kappa, svm_kappa)
)

# Print the Kappa data frame
print(kappa_values)
##                 Model      Kappa
## 1 Logistic Regression 0.14345992
## 2       Decision Tree 0.03479576
## 3       Random Forest 0.10344828
## 4                 SVM 0.00000000
# Plot the Kappa values
ggplot(kappa_values, aes(x = Model, y = Kappa, fill = Model)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("blue", "green", "red", "purple")) +
  labs(title = "Kappa Values for Different Models", x = "Model", y = "Kappa") +
  ylim(0, 1) +
  theme_minimal()
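For context, Cohen's kappa corrects raw accuracy for chance agreement: kappa = (p_o - p_e) / (1 - p_e), where p_o is the observed accuracy and p_e is the accuracy expected from the row and column marginals alone. A manual sketch, reproducing svm_cm$overall['Kappa'] from its confusion table:

# Cohen's kappa computed by hand from the SVM confusion table
tab <- svm_cm$table
p_o <- sum(diag(tab)) / sum(tab)                      # observed agreement
p_e <- sum(rowSums(tab) * colSums(tab)) / sum(tab)^2  # chance agreement
(p_o - p_e) / (1 - p_e)

The SVM's kappa of exactly 0 despite the highest accuracy suggests it predicts the majority class for nearly every case, adding nothing over a no-information baseline.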

Task_02: Develop Machine Learning Models To Classify Pre_Crash_Mode Using The Crash Narrative Column With Explainable AI Results

Read and Factorize the Data

data <- read.csv("D:/TXST/OneDrive - Texas State University/1_TXST/CE7393/HW4/CA_AV_CrashReportsSample.csv")
data <- data %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()

# Convert the 'Pre_Crash_Mode' and 'CrashNarrative' columns to factors
# (VectorSource() below coerces the narrative back to character for the corpus)
data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data$CrashNarrative <- as.factor(data$CrashNarrative)

Text Preprocessing

# Text Preprocessing
corpus <- VCorpus(VectorSource(data$CrashNarrative))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)

# TF-IDF Vectorization
text_data <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
text_data <- as.data.frame(as.matrix(text_data))
text_data$Pre_Crash_Mode <- data$Pre_Crash_Mode
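weightTfIdf in tm weights each term by its (length-normalized, by default) within-document frequency times log2(N / df), where N is the number of documents and df is the number of documents containing the term, so ubiquitous words contribute little and distinctive words a lot. A sketch for peeking at a corner of the weighted matrix (the indices are illustrative):

# Inspect TF-IDF weights for the first few documents and terms
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
inspect(dtm[1:3, 1:5])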

Splitting the Data for Training and Test

#Data Splitting
set.seed(123)
trainIndex <- createDataPartition(text_data$Pre_Crash_Mode, p = 0.7, list = FALSE)
train_data <- text_data[trainIndex,]
test_data <- text_data[-trainIndex,]

Random Forest Model

# Train Random Forest model
rf_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "rf",         # Random Forest method
  ntree = 100,           # Number of trees
  trControl = trainControl(method = "cv", number = 8)  # Cross-validation
)

# Evaluate Random Forest model
rf_pred <- predict(rf_model, test_data)
rf_acc <- mean(rf_pred == test_data$Pre_Crash_Mode)
rf_kappa <- confusionMatrix(rf_pred, test_data$Pre_Crash_Mode)$overall['Kappa']
cat("Random Forest Accuracy:", rf_acc, "\n")
## Random Forest Accuracy: 0.9382716
cat("Random Forest Kappa:", rf_kappa, "\n")
## Random Forest Kappa: 0.8672566
rf_cm <- confusionMatrix(rf_pred, test_data$Pre_Crash_Mode)
rf_cm_data <- as.data.frame(rf_cm$table)
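Here caret passes ntree through to randomForest() and, by default, tunes mtry (the number of predictors sampled at each split) over a small grid using the cross-validation specified in trControl. A sketch of supplying the grid explicitly (the candidate values are illustrative):

# Explicit mtry grid for the Random Forest (a sketch; values are arbitrary)
rf_grid <- expand.grid(mtry = c(10, 30, 100))
rf_model_tuned <- train(
  Pre_Crash_Mode ~ ., data = train_data,
  method = "rf", ntree = 100, tuneGrid = rf_grid,
  trControl = trainControl(method = "cv", number = 8)
)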

SVM Model

# Train SVM model
svm_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "svmRadial",  # SVM with radial kernel
  trControl = trainControl(method = "cv", number = 4)  # Cross-validation
)

# Evaluate SVM model
svm_pred <- predict(svm_model, test_data)
svm_acc <- mean(svm_pred == test_data$Pre_Crash_Mode)
svm_kappa <- confusionMatrix(svm_pred, test_data$Pre_Crash_Mode)$overall['Kappa']
cat("SVM Accuracy:", svm_acc, "\n")
## SVM Accuracy: 0.6296296
cat("SVM Kappa:", svm_kappa, "\n")
## SVM Kappa: 0.1309013
svm_cm <- confusionMatrix(svm_pred, test_data$Pre_Crash_Mode)
svm_cm_data <- as.data.frame(svm_cm$table)
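With method = "svmRadial", caret fits kernlab's SVM, tuning the cost C over a short default grid while fixing the kernel width sigma via kernlab's sigest() estimate. Both can be searched explicitly (a sketch; the values are illustrative):

# Explicit sigma/C grid for svmRadial; pass as tuneGrid = svm_grid in train()
svm_grid <- expand.grid(sigma = c(0.001, 0.01), C = c(0.25, 1, 4))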

Logistic Regression Model

# Train Logistic Regression model
log_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "glmnet",     # Regularized logistic regression
  trControl = trainControl(method = "cv", number = 12)  # Cross-validation
)

# Evaluate Logistic Regression model
log_pred <- predict(log_model, test_data)
log_acc <- mean(log_pred == test_data$Pre_Crash_Mode)
log_kappa <- confusionMatrix(log_pred, test_data$Pre_Crash_Mode)$overall['Kappa']
cat("Logistic Regression Accuracy:", log_acc, "\n")
## Logistic Regression Accuracy: 0.9259259
cat("Logistic Regression Kappa:", log_kappa, "\n")
## Logistic Regression Kappa: 0.8398154
log_cm <- confusionMatrix(log_pred, test_data$Pre_Crash_Mode)
log_cm_data <- as.data.frame(log_cm$table)
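method = "glmnet" fits an elastic-net logistic regression, minimizing the negative log-likelihood plus lambda * ((1 - alpha)/2 * ||beta||_2^2 + alpha * ||beta||_1); caret tunes the mixing parameter alpha and the penalty strength lambda by cross-validation. The L1 component zeroes out most TF-IDF term weights, which suits wide, sparse text features. An explicit grid (a sketch; values are illustrative):

# Explicit alpha/lambda grid for glmnet; pass as tuneGrid = glmnet_grid in train()
glmnet_grid <- expand.grid(alpha = c(0, 0.5, 1),
                           lambda = 10^seq(-4, -1, length.out = 5))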

KNN Model

# Train KNN model
knn_model <- train(
  Pre_Crash_Mode ~ ., data = train_data, 
  method = "knn",        # KNN method
  trControl = trainControl(method = "cv", number = 15)  # Cross-validation
)

# Evaluate KNN model
knn_pred <- predict(knn_model, test_data)
knn_acc <- mean(knn_pred == test_data$Pre_Crash_Mode)
knn_kappa <- confusionMatrix(knn_pred, test_data$Pre_Crash_Mode)$overall['Kappa']
cat("KNN Accuracy:", knn_acc, "\n")
## KNN Accuracy: 0.617284
cat("KNN Kappa:", knn_kappa, "\n")
## KNN Kappa: 0.04994325
knn_cm <- confusionMatrix(knn_pred, test_data$Pre_Crash_Mode)
knn_cm_data <- as.data.frame(knn_cm$table)
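caret tunes the neighborhood size k by cross-validation; odd values avoid voting ties in a two-class problem. KNN's Euclidean distances are spread thin across the very wide TF-IDF matrix, which is one plausible reason for its weak kappa here. An explicit grid (a sketch; values are illustrative):

# Explicit k grid for KNN; pass as tuneGrid = knn_grid in train()
knn_grid <- expand.grid(k = seq(3, 15, by = 2))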

Model Performance Data Frame

# Create a data frame for model performance
model_performance <- data.frame(
  Model = c("Random Forest", "SVM", "Logistic Regression", "KNN"),
  Accuracy = c(rf_acc, svm_acc, log_acc, knn_acc),
  Kappa = c(rf_kappa, svm_kappa, log_kappa, knn_kappa)
)

Class Distribution Plot

ggplot(data, aes(x = Pre_Crash_Mode)) +
  geom_bar(fill = "green") +
  labs(title = "Distribution of Pre_Crash_Mode Classes", x = "Pre_Crash_Mode", y = "Count") +
  theme_minimal()

Accuracy Plot

# Plot Accuracy
ggplot(model_performance, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_bar(stat = "identity") +
  ggtitle("Model Accuracy Comparison") +
  ylab("Accuracy") +
  theme_minimal()

Plot of Kappa Values

# Plot Kappa
ggplot(model_performance, aes(x = Model, y = Kappa, fill = Model)) +
  geom_bar(stat = "identity") +
  ggtitle("Model Kappa Comparison") +
  ylab("Kappa") +
  theme_minimal()

Confusion Matrix Plot

# Plot function for confusion matrix heatmaps
plot_confusion_matrix <- function(cm_data, model_name) {
  ggplot(cm_data, aes(x = Reference, y = Prediction)) +
    geom_tile(aes(fill = Freq), color = "white") +
    scale_fill_gradient(low = "lightblue", high = "red") +
    labs(title = paste(model_name, "Confusion Matrix"), x = "Actual", y = "Predicted") +
    theme_minimal()
}


# Plot confusion matrices for each model
plot_confusion_matrix(rf_cm_data, "Random Forest")

plot_confusion_matrix(svm_cm_data, "SVM Radial")

plot_confusion_matrix(log_cm_data, "Logistic Regression")

plot_confusion_matrix(knn_cm_data, "KNN")

Explainable AI Using LIME

# Define a helper that generates LIME explanations for the first five test cases
generate_lime_explanation <- function(test_data, explainer, n_labels, n_features) {
  # Drop the target column before explaining
  explain(test_data[1:5, -ncol(test_data)], explainer, n_labels = n_labels, n_features = n_features)
}

# Create explainers for each model
explainer_rf <- lime(train_data[, -ncol(train_data)], rf_model)
explainer_svm <- lime(train_data[, -ncol(train_data)], svm_model)
explainer_log <- lime(train_data[, -ncol(train_data)], log_model)
explainer_knn <- lime(train_data[, -ncol(train_data)], knn_model)

# Generate explanations using the function
explanation_rf <- generate_lime_explanation(test_data, explainer_rf, n_labels = 1, n_features = 8)
explanation_log <- generate_lime_explanation(test_data, explainer_log, n_labels = 1, n_features = 15)
explanation_knn <- generate_lime_explanation(test_data, explainer_knn, n_labels = 1, n_features = 12)

## The SVM model raised a "Response is constant across permutations" error in lime, so it is excluded here.
## SVMs can produce uniform or overly simplified responses across LIME's permutations when the data have characteristics that make the model behave rigidly.
# Define a function to print LIME explanations
print_lime_explanations <- function(...) {
  explanations <- list(...)
  for (i in seq_along(explanations)) {
    print(explanations[[i]])
  }
}

# Print explanations for each model
print_lime_explanations(explanation_rf, explanation_log, explanation_knn)
## # A tibble: 40 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 2     Auto…       0.99    0.116           0.155            0.990
##  2 classificat… 2     Auto…       0.99    0.116           0.155            0.990
##  3 classificat… 2     Auto…       0.99    0.116           0.155            0.990
##  4 classificat… 2     Auto…       0.99    0.116           0.155            0.990
##  5 classificat… 2     Auto…       0.99    0.116           0.155            0.990
##  6 classificat… 2     Auto…       0.99    0.116           0.155            0.990
##  7 classificat… 2     Auto…       0.99    0.116           0.155            0.990
##  8 classificat… 2     Auto…       0.99    0.116           0.155            0.990
##  9 classificat… 3     Conv…       0.97    0.122           1.13             0.970
## 10 classificat… 3     Conv…       0.97    0.122           1.13             0.970
## # ℹ 30 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
## # A tibble: 75 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 2     Auto…      0.953    0.215           0.295            0.951
##  2 classificat… 2     Auto…      0.953    0.215           0.295            0.951
##  3 classificat… 2     Auto…      0.953    0.215           0.295            0.951
##  4 classificat… 2     Auto…      0.953    0.215           0.295            0.951
##  5 classificat… 2     Auto…      0.953    0.215           0.295            0.951
##  6 classificat… 2     Auto…      0.953    0.215           0.295            0.951
##  7 classificat… 2     Auto…      0.953    0.215           0.295            0.951
##  8 classificat… 2     Auto…      0.953    0.215           0.295            0.951
##  9 classificat… 2     Auto…      0.953    0.215           0.295            0.951
## 10 classificat… 2     Auto…      0.953    0.215           0.295            0.951
## # ℹ 65 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
## # A tibble: 60 × 13
##    model_type   case  label label_prob model_r2 model_intercept model_prediction
##    <chr>        <chr> <chr>      <dbl>    <dbl>           <dbl>            <dbl>
##  1 classificat… 2     Auto…          1   0.0353           0.817             1.00
##  2 classificat… 2     Auto…          1   0.0353           0.817             1.00
##  3 classificat… 2     Auto…          1   0.0353           0.817             1.00
##  4 classificat… 2     Auto…          1   0.0353           0.817             1.00
##  5 classificat… 2     Auto…          1   0.0353           0.817             1.00
##  6 classificat… 2     Auto…          1   0.0353           0.817             1.00
##  7 classificat… 2     Auto…          1   0.0353           0.817             1.00
##  8 classificat… 2     Auto…          1   0.0353           0.817             1.00
##  9 classificat… 2     Auto…          1   0.0353           0.817             1.00
## 10 classificat… 2     Auto…          1   0.0353           0.817             1.00
## # ℹ 50 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## #   feature_desc <chr>, data <list>, prediction <list>
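LIME perturbs each case, queries the black-box model on the perturbed copies, and fits a simple weighted linear model locally, so feature_weight above is that local model's coefficient for the feature. The lime package also ships a plotting helper; a sketch (assuming explanation_rf from above):

# Per-case bar chart of local feature contributions for the Random Forest model
plot_features(explanation_rf)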

Distribution of Feature Weights by Model

# Combine explanations into one data frame
explanation_all <- rbind(
  explanation_rf %>% mutate(Model = "Random Forest"),
  explanation_log %>% mutate(Model = "Logistic Regression"),
  explanation_knn %>% mutate(Model = "KNN")
)


ggplot(explanation_all, aes(x = Model, y = feature_weight, fill = Model)) +
  geom_violin(trim = FALSE, scale = "width") +
  geom_boxplot(width = 0.1, fill = "white", color = "black") +
  labs(title = "Distribution of Feature Weights by Model",
       x = "Model", y = "Feature Weight") +
  theme_minimal()