HW_04 by Momin
Submitted by Md Shah Mominul Islam Momin; NetID: mif37
Task_01: Develop Machine Learning Models To Classify Pre_Crash_Mode From The Structured Data
Installation of the Packages
Loading the Packages
# Load necessary libraries
library(ggplot2)
library(lattice)
library(dplyr)
library(tidyr)
library(gbm)
library(class) # For KNN Model
library(e1071) # For SVM Model
library(tidyr)
library(caret) # For model training and evaluation
library(rpart) # For decision trees
library(randomForest) # For random forests
library(reshape2)
library(tidyverse)
library(tidytext)
library(tm)
library(glmnet)
library(lime) #For Explainable AI Results
Data Processing
# Read the CSV file
data <- read.csv("D:/TXST/OneDrive - Texas State University/1_TXST/CE7393/HW4/CA_AV_CrashReportsSample.csv" , stringsAsFactors = FALSE)
# Specify the columns to remove
columns_to_remove <- c(
"CrashNarrative", "Manufacturer_Name", "File_name", "Business_Name", "Year",
"Date", "Time", "Veh1_Year", "Veh1_Make", "Veh1_Model", "Veh1_Register_State",
"Location_Street", "Location_City", "Location_County", "Location_State",
"Location_Zip", "Latitude", "Longitude", "Involved_in_the_Accident",
"IS_VRU_FromNarra", "IS_VRU", "Otherparty_Vehicle_Year",
"Otherparty_Vehicle_Model", "Otherparty_Vehicle_State", "Vehicle_was_OtherParty",
"Involved_Party_Otherparty", "Driver", "Passenger", "Vehicle_Year_3rdparty",
"Model_3rdParty", "X3rdparty_Vehicle_State", "Vehicle_was_3rdParty",
"Involve_in_Accident_3rdParty", "Driver_3rdParty", "Passenger_3rdParty",
"Movement_Bef_Veh3", "Collision_Type", "Name_Injury.Death",
"Bicyclist_Injury.Death", "Property_Damage", "Property_Owner_Name" , "Vehicle_Was" ,
"Roadway.Conditions"
)
# Remove the specified columns
cleaned_data <- data %>% select(-all_of(columns_to_remove))
# Check for missing data
missing_data <- colSums(is.na(cleaned_data))
# Print columns with missing data and their counts
missing_data <- missing_data[missing_data > 0]
if (length(missing_data) > 0) {
print("Columns with missing data and their counts:")
print(missing_data)
} else {
print("No missing data found in the cleaned dataset.")
}
## [1] "Columns with missing data and their counts:"
## No_Vehicle_Involved Describe_Vehicle_Damage Weather
## 1 1 2
## Lighting Roadway_Surface Movement_Bef_Veh1
## 1 2 2
## Movement_Bef_Veh2 Collision_Type1
## 26 40
# Replace blank cells with NA to standardize missing values
cleaned_data[cleaned_data == ""] <- NA
# Remove rows with any NA values
# This will remove all rows where there is at least one NA
final_data <- cleaned_data %>% drop_na()
# Check the cleaned dataset
head(final_data)
## No_Vehicle_Involved Describe_Vehicle_Damage Pre_Crash_Mode Weather
## 1 2 Moderate Conventional Clear
## 2 2 Minor Autonomous Clear
## 3 2 None Conventional Cloudy
## 4 2 Moderate Conventional Clear
## 5 2 Moderate Autonomous Clear
## 6 2 Moderate Conventional Clear
## Lighting Roadway_Surface Movement_Bef_Veh1
## 1 Dark - Street Lights Dry Proceeding Straight
## 2 Daylight Dry Slowing/Stopping
## 3 Daylight Wet Stopped
## 4 Daylight Dry Proceeding Straight
## 5 Daylight Dry Stopped
## 6 Daylight Dry Proceeding Straight
## Movement_Bef_Veh2 Collision_Type1
## 1 Making Right Turn/Xing into opposing lane Side Swipe
## 2 Proceeding Straight Rear End
## 3 Backing Head-On
## 4 Changing Lanes Side Swipe
## 5 Proceeding Straight Rear End
## 6 Proceeding Straight Broadside
# Check if any NA values remain
remaining_na <- colSums(is.na(final_data))
print("Remaining NA values after cleaning (should be zero if removed/replaced):")
## [1] "Remaining NA values after cleaning (should be zero if removed/replaced):"
## named numeric(0)
# Remove all unique values from the final_data
for (col in names(final_data)) {
# Identify unique values
unique_values <- final_data %>%
group_by(!!sym(col)) %>%
filter(n() == 1) %>%
pull(!!sym(col))
# Remove rows with unique values in the current column
final_data <- final_data %>%
filter(!(!!sym(col) %in% unique_values))
}
# Save the cleaned data to a new CSV file
write.csv(final_data, "Final_Data.csv", row.names = FALSE)
# Identify low-variance predictors
low_variance_cols <- nearZeroVar(final_data, saveMetrics = TRUE)
# View which columns are low-variance
print(low_variance_cols)
## freqRatio percentUnique zeroVar nzv
## No_Vehicle_Involved 31.000000 1.522843 FALSE TRUE
## Describe_Vehicle_Damage 4.516129 2.030457 FALSE FALSE
## Pre_Crash_Mode 1.984848 1.015228 FALSE FALSE
## Weather 11.600000 1.522843 FALSE FALSE
## Lighting 1.750000 2.030457 FALSE FALSE
## Roadway_Surface 18.500000 1.522843 FALSE FALSE
## Movement_Bef_Veh1 1.649123 4.568528 FALSE FALSE
## Movement_Bef_Veh2 6.250000 7.106599 FALSE FALSE
## Collision_Type1 1.407407 2.538071 FALSE FALSE
# Remove the low-variance columns from the dataset
data_filtered <- final_data[, -nearZeroVar(final_data)]
# Save the cleaned data to a new CSV file
write.csv(data_filtered, "Filtered_Data.csv", row.names = FALSE)
# Convert target variable to factor if not already done
final_data$Pre_Crash_Mode <- as.factor(final_data$Pre_Crash_Mode)
Data Splitting for Training and Test
Logistic Regression Model
# Logistic Regression Model
logistic_model <- glm(Pre_Crash_Mode ~ ., data = train_data, family = binomial)
summary(logistic_model)
# Predictions
logistic_pred <- predict(logistic_model, test_data, type = "response")
logistic_pred_class <- ifelse(logistic_pred > 0.5, levels(train_data$Pre_Crash_Mode)[2], levels(train_data$Pre_Crash_Mode)[1])
# Confusion Matrix
confusionMatrix(factor(logistic_pred_class, levels = levels(train_data$Pre_Crash_Mode)), test_data$Pre_Crash_Mode)
Decision Tree Model
Random Forest Model
SVM Radial Model
Model Accuracy
# Initialize a data frame to store the results
accuracy_results <- data.frame(
Model = character(),
Accuracy = numeric(),
stringsAsFactors = FALSE
)
# Logistic Regression Model
logistic_model <- glm(Pre_Crash_Mode ~ ., data = train_data, family = binomial)
logistic_pred <- predict(logistic_model, test_data, type = "response")
logistic_pred_class <- ifelse(logistic_pred > 0.5, levels(train_data$Pre_Crash_Mode)[2], levels(train_data$Pre_Crash_Mode)[1])
logistic_cm <- confusionMatrix(factor(logistic_pred_class, levels = levels(train_data$Pre_Crash_Mode)), test_data$Pre_Crash_Mode)
#Kappa
logistic_kappa <- logistic_cm$overall['Kappa']
# Store Logistic Regression accuracy
accuracy_results <- rbind(accuracy_results, data.frame(
Model = "Logistic Regression",
Accuracy = logistic_cm$overall['Accuracy']
))
# Decision Tree Model
tree_model <- rpart(Pre_Crash_Mode ~ ., data = train_data)
tree_pred <- predict(tree_model, test_data, type = "class")
tree_cm <- confusionMatrix(tree_pred, test_data$Pre_Crash_Mode)
#Kappa
tree_kappa <- tree_cm$overall['Kappa']
# Store Decision Tree accuracy
accuracy_results <- rbind(accuracy_results, data.frame(
Model = "Decision Tree",
Accuracy = tree_cm$overall['Accuracy']
))
# Random Forest Model
rf_model <- randomForest(Pre_Crash_Mode ~ ., data = train_data)
rf_pred <- predict(rf_model, test_data)
rf_cm <- confusionMatrix(rf_pred, test_data$Pre_Crash_Mode)
#Kappa
rf_kappa <- rf_cm$overall['Kappa']
# Store Random Forest accuracy
accuracy_results <- rbind(accuracy_results, data.frame(
Model = "Random Forest",
Accuracy = rf_cm$overall['Accuracy']
))
# Support Vector Machine (SVM Radial)
svm_model <- svm(Pre_Crash_Mode ~ ., data = train_data, kernel = "radial")
svm_pred <- predict(svm_model, test_data)
svm_cm <- confusionMatrix(svm_pred, test_data$Pre_Crash_Mode)
#Kappa
svm_kappa <- svm_cm$overall['Kappa']
# Store SVM accuracy
accuracy_results <- rbind(accuracy_results, data.frame(
Model = "SVM Radial",
Accuracy = svm_cm$overall['Accuracy']
))
# Print the accuracy results
print("Model Accuracy Results:")
## [1] "Model Accuracy Results:"
## Model Accuracy
## Accuracy Logistic Regression 0.6379310
## Accuracy1 Decision Tree 0.6206897
## Accuracy2 Random Forest 0.5517241
## Accuracy3 SVM Radial 0.6724138
Model Accuracy Comparison Plot
ggplot(accuracy_results, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
scale_fill_brewer(palette = "Set1") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Class Distribution Plot
ggplot(final_data, aes(x = Pre_Crash_Mode)) +
geom_bar(fill = "blue") +
labs(title = "Class Distribution in Pre_Crash_Mode", x = "Class", y = "Count") +
theme_minimal()
Confusion Matrix Heatmap Plot
confusion_data <- function(cm) {
as.data.frame(cm$table) %>%
mutate(Reference = factor(Reference, levels = unique(test_data$Pre_Crash_Mode)),
Prediction = factor(Prediction, levels = unique(test_data$Pre_Crash_Mode)))
}
# Plot function for confusion matrix heatmaps
plot_confusion_matrix <- function(cm_data, model_name) {
ggplot(cm_data, aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = Freq), color = "white") +
scale_fill_gradient(low = "lightblue", high = "red") +
labs(title = paste(model_name, "Confusion Matrix"), x = "Actual", y = "Predicted") +
theme_minimal()
}
# Plot for each model
plot_confusion_matrix(confusion_data(logistic_cm), "Logistic Regression")
Plot of Kappa Values
# Kappa values from the models
logistic_kappa <- logistic_cm$overall['Kappa']
tree_kappa <- tree_cm$overall['Kappa']
rf_kappa <- rf_cm$overall['Kappa']
svm_kappa <- svm_cm$overall['Kappa']
# Create a data frame from the Kappa values
kappa_values <- data.frame(
Model = c("Logistic Regression", "Decision Tree", "Random Forest", "SVM"),
Kappa = c(logistic_kappa, tree_kappa, rf_kappa, svm_kappa)
)
# Print the Kappa data frame
print(kappa_values)
## Model Kappa
## 1 Logistic Regression 0.14345992
## 2 Decision Tree 0.03479576
## 3 Random Forest 0.10344828
## 4 SVM 0.00000000
# Plot the Kappa values
ggplot(kappa_values, aes(x = Model, y = Kappa, fill = Model)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("blue", "green", "red", "purple")) +
labs(title = "Kappa Values for Different Models", x = "Model", y = "Kappa") +
ylim(0, 1) +
theme_minimal()
Task_02: Develop Machine Learning Models To Classify Pre_Crash_Mode Using The Crash Narrative Column With Explainable AI Results
Read and Factorize the Data
data <- read.csv("D:/TXST/OneDrive - Texas State University/1_TXST/CE7393/HW4/CA_AV_CrashReportsSample.csv")
data <- data %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()
# Factorize the 'Pre_Crash_Mode' and 'CrashNarrative' column
data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data$CrashNarrative <- as.factor(data$CrashNarrative)
Text Preprocessing
# Text Preprocessing
corpus <- VCorpus(VectorSource(data$CrashNarrative))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)
# TF-IDF Vectorization
text_data <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
text_data <- as.data.frame(as.matrix(text_data))
text_data$Pre_Crash_Mode <- data$Pre_Crash_Mode
Splitting the Data for Training and Test
Random Forest Model
# Train Random Forest model
rf_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "rf", # Random Forest method
ntree = 100, # Number of trees
trControl = trainControl(method = "cv", number = 8) # Cross-validation
)
# Evaluate Random Forest model
rf_pred <- predict(rf_model, test_data)
rf_acc <- mean(rf_pred == test_data$Pre_Crash_Mode)
rf_kappa <- confusionMatrix(rf_pred, test_data$Pre_Crash_Mode)$overall['Kappa']
cat("Random Forest Accuracy:", rf_acc, "\n")
## Random Forest Accuracy: 0.9382716
## Random Forest Kappa: 0.8672566
SVM Model
# Train SVM model
svm_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "svmRadial", # SVM with radial kernel
trControl = trainControl(method = "cv", number = 4) # Cross-validation
)
# Evaluate SVM model
svm_pred <- predict(svm_model, test_data)
svm_acc <- mean(svm_pred == test_data$Pre_Crash_Mode)
svm_kappa <- confusionMatrix(svm_pred, test_data$Pre_Crash_Mode)$overall['Kappa']
cat("SVM Accuracy:", svm_acc, "\n")
## SVM Accuracy: 0.6296296
## SVM Kappa: 0.1309013
Logistic Regression Model
# Train Logistic Regression model
log_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "glmnet", # Regularized logistic regression
trControl = trainControl(method = "cv", number = 12) # Cross-validation
)
# Evaluate Logistic Regression model
log_pred <- predict(log_model, test_data)
log_acc <- mean(log_pred == test_data$Pre_Crash_Mode)
log_kappa <- confusionMatrix(log_pred, test_data$Pre_Crash_Mode)$overall['Kappa']
cat("Logistic Regression Accuracy:", log_acc, "\n")
## Logistic Regression Accuracy: 0.9259259
## Logistic Regression Kappa: 0.8398154
KNN Model
# Train KNN model
knn_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "knn", # KNN method
trControl = trainControl(method = "cv", number = 15) # Cross-validation
)
# Evaluate KNN model
knn_pred <- predict(knn_model, test_data)
knn_acc <- mean(knn_pred == test_data$Pre_Crash_Mode)
knn_kappa <- confusionMatrix(knn_pred, test_data$Pre_Crash_Mode)$overall['Kappa']
cat("KNN Accuracy:", knn_acc, "\n")
## KNN Accuracy: 0.617284
## KNN Kappa: 0.04994325
Model Performance Data Frame
Distribution Class Plot
ggplot(data, aes(x = Pre_Crash_Mode)) +
geom_bar(fill = "green") +
labs(title = "Distribution of Pre_Crash_Mode Classes", x = "Pre_Crash_Mode", y = "Count") +
theme_minimal()
Accuracy Plot
# Plot Accuracy
ggplot(model_performance, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
ggtitle("Model Accuracy Comparison") +
ylab("Accuracy") +
theme_minimal()
Plot of Kappa Values
# Plot Kappa
ggplot(model_performance, aes(x = Model, y = Kappa, fill = Model)) +
geom_bar(stat = "identity") +
ggtitle("Model Kappa Comparison") +
ylab("Kappa") +
theme_minimal()
Confusion Matrix Plot
# Plot function for confusion matrix heatmaps
plot_confusion_matrix <- function(cm_data, model_name) {
ggplot(cm_data, aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = Freq), color = "white") +
scale_fill_gradient(low = "lightblue", high = "red") +
labs(title = paste(model_name, "Confusion Matrix"), x = "Actual", y = "Predicted") +
theme_minimal()
}
# Plot confusion matrices for each model
plot_confusion_matrix(rf_cm_data, "Random Forest")
Explainable AI using Lime
# Define a function to generate LIME explanations
generate_lime_explanation <- function(test_data, explainer, n_labels, n_features) {
explain(test_data[1:5, -ncol(test_data)], explainer, n_labels = n_labels, n_features = n_features)
}
# Create explainers for each model
explainer_rf <- lime(train_data[, -ncol(train_data)], rf_model)
explainer_svm <- lime(train_data[, -ncol(train_data)], svm_model)
explainer_log <- lime(train_data[, -ncol(train_data)], log_model)
explainer_knn <- lime(train_data[, -ncol(train_data)], knn_model)
# Generate explanations using the function
explanation_rf <- generate_lime_explanation(test_data, explainer_rf, n_labels = 1, n_features = 8)
explanation_log <- generate_lime_explanation(test_data, explainer_log, n_labels = 1, n_features = 15)
explanation_knn <- generate_lime_explanation(test_data, explainer_knn, n_labels = 1, n_features = 12)
##The SVM Model showed "Response is constant across permutations" Error. So decided to ignore it.
##SVMs can sometimes produce uniform or overly simplified responses as the data has certain characteristics that make the model behave more rigidly.
# Define a function to print LIME explanations
print_lime_explanations <- function(...) {
explanations <- list(...)
for (i in seq_along(explanations)) {
print(explanations[[i]])
}
}
# Print explanations for each model
print_lime_explanations(explanation_rf, explanation_log, explanation_knn)
## # A tibble: 40 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 2 Auto… 0.99 0.116 0.155 0.990
## 2 classificat… 2 Auto… 0.99 0.116 0.155 0.990
## 3 classificat… 2 Auto… 0.99 0.116 0.155 0.990
## 4 classificat… 2 Auto… 0.99 0.116 0.155 0.990
## 5 classificat… 2 Auto… 0.99 0.116 0.155 0.990
## 6 classificat… 2 Auto… 0.99 0.116 0.155 0.990
## 7 classificat… 2 Auto… 0.99 0.116 0.155 0.990
## 8 classificat… 2 Auto… 0.99 0.116 0.155 0.990
## 9 classificat… 3 Conv… 0.97 0.122 1.13 0.970
## 10 classificat… 3 Conv… 0.97 0.122 1.13 0.970
## # ℹ 30 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
## # A tibble: 75 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## 2 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## 3 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## 4 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## 5 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## 6 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## 7 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## 8 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## 9 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## 10 classificat… 2 Auto… 0.953 0.215 0.295 0.951
## # ℹ 65 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
## # A tibble: 60 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 2 Auto… 1 0.0353 0.817 1.00
## 2 classificat… 2 Auto… 1 0.0353 0.817 1.00
## 3 classificat… 2 Auto… 1 0.0353 0.817 1.00
## 4 classificat… 2 Auto… 1 0.0353 0.817 1.00
## 5 classificat… 2 Auto… 1 0.0353 0.817 1.00
## 6 classificat… 2 Auto… 1 0.0353 0.817 1.00
## 7 classificat… 2 Auto… 1 0.0353 0.817 1.00
## 8 classificat… 2 Auto… 1 0.0353 0.817 1.00
## 9 classificat… 2 Auto… 1 0.0353 0.817 1.00
## 10 classificat… 2 Auto… 1 0.0353 0.817 1.00
## # ℹ 50 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
Distribution of Feature Weights by Model
# Combine explanations into one data frame
explanation_all <- rbind(
explanation_rf %>% mutate(Model = "Random Forest"),
explanation_log %>% mutate(Model = "Logistic Regression"),
explanation_knn %>% mutate(Model = "KNN")
)
ggplot(explanation_all, aes(x = Model, y = feature_weight, fill = Model)) +
geom_violin(trim = FALSE, scale = "width") +
geom_boxplot(width = 0.1, fill = "white", color = "black") +
labs(title = "Distribution of Feature Weights by Model",
x = "Model", y = "Feature Weight") +
theme_minimal()