Pre_Crash_Mode Classification Analysis
Task 1: Developing Several Machine Learning Models to Classify Pre_Crash_Mode from the Structured Data (without text or crash narrative column)
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
# Load necessary libraries
library(caret)
library(dplyr)
library(randomForest)
library(e1071)
library(gbm)
library(ggplot2)Dataset Loading
Data Preprocessing
# Remove unnecessary columns
data <- data %>% select(-CrashNarrative, -File_name, -Manufacturer_Name, -Business_Name,
-Date, -Time, -Location_Street, -Location_City, -Location_County,
-Location_State, -Location_Zip, -Latitude, -Longitude,
-Name_Injury.Death, -Bicyclist_Injury.Death, -Property_Damage,
-Property_Owner_Name, -Otherparty_Vehicle_Year, -Otherparty_Vehicle_Model,
-Otherparty_Vehicle_State, -Vehicle_was_OtherParty,
-Involved_Party_Otherparty, -Driver, -Passenger, -Vehicle_Year_3rdparty,
-Model_3rdParty, -X3rdparty_Vehicle_State, -Vehicle_was_3rdParty,
-Involve_in_Accident_3rdParty, -Driver_3rdParty, -Passenger_3rdParty)
# Handle missing values (example: remove rows with NA values)
data <- na.omit(data)
# Convert categorical variables to factors
data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data <- data %>% mutate_if(is.character, as.factor)Feature Selection (optional, based on initial data examination)
Model Development
set.seed(123)
trainIndex <- createDataPartition(data$Pre_Crash_Mode, p = 0.8, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]
# Train models
# Logistic Regression
log_model <- train(Pre_Crash_Mode ~ ., data = train, method = "glm")
# Decision Tree
tree_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rpart")
# Random Forest
rf_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rf")
# GBM
gbm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "gbm", verbose = FALSE)
# SVM
svm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "svmRadial")
# kNN
knn_model <- train(Pre_Crash_Mode ~ ., data = train, method = "knn")Model Comparison
# Predict and evaluate each model
models <- list(Logistic = log_model, Tree = tree_model, RandomForest = rf_model,
GBM = gbm_model, SVM = svm_model, kNN = knn_model)
# Summarize resampling results
results <- resamples(models)
summary(results)##
## Call:
## summary.resamples(object = results)
##
## Models: Logistic, Tree, RandomForest, GBM, SVM, kNN
## Number of resamples: 25
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Logistic 0.2916667 0.4666667 0.5283019 0.5201006 0.5769231 0.6938776 0
## Tree 0.5531915 0.6222222 0.6666667 0.6680400 0.7058824 0.8181818 0
## RandomForest 0.5555556 0.6521739 0.6792453 0.6827927 0.7090909 0.7884615 0
## GBM 0.6222222 0.6734694 0.7254902 0.7198410 0.7547170 0.8510638 0
## SVM 0.5714286 0.6200000 0.6666667 0.6672720 0.7000000 0.7708333 0
## kNN 0.4318182 0.6041667 0.6458333 0.6364959 0.6666667 0.7826087 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Logistic -0.38071066 -0.08695652 0.03361345 0.02947798 0.1428571 0.3839061
## Tree 0.02868526 0.23728814 0.28954424 0.32030619 0.4163265 0.6206897
## RandomForest 0.04255319 0.27969349 0.34567901 0.34252017 0.4206897 0.5731343
## GBM 0.18219038 0.31406045 0.43884892 0.42126062 0.4877323 0.6984418
## SVM 0.09961686 0.22857143 0.32432432 0.31581797 0.3862520 0.5164835
## kNN -0.11788618 0.20689655 0.24014023 0.24650917 0.3030853 0.5418327
## NA's
## Logistic 0
## Tree 0
## RandomForest 0
## GBM 0
## SVM 0
## kNN 0
# Accuracy for each model on test data
log_acc <- mean(predict(log_model, test) == test$Pre_Crash_Mode)
tree_acc <- mean(predict(tree_model, test) == test$Pre_Crash_Mode)
rf_acc <- mean(predict(rf_model, test) == test$Pre_Crash_Mode)
gbm_acc <- mean(predict(gbm_model, test) == test$Pre_Crash_Mode)
svm_acc <- mean(predict(svm_model, test) == test$Pre_Crash_Mode)
knn_acc <- mean(predict(knn_model, test) == test$Pre_Crash_Mode)Print Accuracy Results
## Logistic Regression Accuracy: 0.6060606
## Decision Tree Accuracy: 0.6060606
## Random Forest Accuracy: 0.4545455
## GBM Accuracy: 0.5757576
## SVM Accuracy: 0.5151515
## kNN Accuracy: 0.6363636
# Bar plot of accuracy results on test data
accuracy_data <- data.frame(
Model = c("Logistic Regression", "Decision Tree", "Random Forest", "GBM", "SVM", "kNN"),
Accuracy = c(log_acc, tree_acc, rf_acc, gbm_acc, svm_acc, knn_acc)
)
ggplot(accuracy_data, aes(x = Model, y = Accuracy)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_minimal() +
labs(title = "Test Accuracy of Various Models", x = "Model", y = "Accuracy")Task 2: Developing Several Machine Learning Models to Classify Pre_Crash_Mode using only Text Data or Crash Narrative cColumn with Explainable AI Results.
# Load necessary libraries
library(tidyverse)
library(tidytext)
library(caret)
library(tm)
library(glmnet) # For regularized logistic regression
library(e1071) # For Naive Bayes
library(randomForest)
library(lime) # For model explainability
library(ggplot2)Dataset Loading
Text Preprocessing
corpus <- VCorpus(VectorSource(data$CrashNarrative))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)
# TF-IDF Vectorization
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
tfidf_data <- as.data.frame(as.matrix(dtm))
tfidf_data$Pre_Crash_Mode <- data$Pre_Crash_Mode
# Train-Test Split
set.seed(123)
trainIndex <- createDataPartition(tfidf_data$Pre_Crash_Mode, p = 0.8, list = FALSE)
train_data <- tfidf_data[trainIndex,]
test_data <- tfidf_data[-trainIndex,]
colnames(train_data) <- make.names(colnames(train_data))
colnames(test_data) <- make.names(colnames(test_data))Model Training
# Apply class balancing
train_control <- trainControl(method = "cv", number = 5, sampling = "up")
# Train Logistic Regression with class balancing
log_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01))
)
# Train Random Forest model using caret
rf_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "rf", # Random Forest method in caret
ntree = 200, # Number of trees
tuneGrid = data.frame(mtry = 3), # Number of variables tried at each split
trControl = trainControl(method = "cv", number = 5) # Cross-validation
)
# Train a Gradient Boosting Machine (GBM) Model
# Using caret to ensure compatibility with lime
gbm_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "gbm", # Specify GBM as the model
trControl = trainControl(method = "cv", number = 5), # 5-fold cross-validation
tuneGrid = expand.grid(
n.trees = 100, # Number of trees
interaction.depth = 3, # Depth of each tree
shrinkage = 0.1, # Learning rate
n.minobsinnode = 10 # Minimum number of observations in nodes
),
verbose = FALSE # Suppress printing
)Model Evaluation
Print Accuracy Results
## Regularized Logistic Regression Accuracy: 0.9056604
## Gradient Boosting Machine Accuracy: 0.9245283
## Random Forest Accuracy with tuned parameters: 0.6037736
# Accuracy comparison bar plot
accuracy_data <- data.frame(
Model = c("Logistic Regression", "Random Forest", "GBM"),
Accuracy = c(log_acc, rf_acc, gbm_acc)
)
# Plot model accuracy on test data with colors
ggplot(accuracy_data, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity", color = "black") +
scale_fill_manual(values = c("skyblue", "orange", "green")) +
theme_minimal() +
labs(title = "Test Accuracy of Text Classification Models", x = "Model", y = "Accuracy") +
geom_text(aes(label = round(Accuracy, 2)), vjust = -0.3, size = 5) +
theme(legend.position = "none")# Explainable AI with LIME for Logistic Regression and SVM
explainer_log <- lime(train_data[, -ncol(train_data)], log_model)
explainer_rf <- lime(train_data[, -ncol(train_data)], rf_model)
explainer_gbm <- lime(train_data[, -ncol(train_data)], gbm_model)
# LIME Explanations with more features and a smaller subset
explanation_log <- explain(test_data[1:2, -ncol(test_data)], explainer_log, n_labels = 1, n_features = 10)
explanation_rf <- explain(test_data[1:5, -ncol(test_data)], explainer_rf, n_labels = 1, n_features = 12)
explanation_gbm <- explain(test_data[1:5, -ncol(test_data)], explainer_gbm, n_labels = 1, n_features = 5)
# Print explanations
print(explanation_log)## # A tibble: 20 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 2 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 3 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 4 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 5 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 6 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 7 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 8 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 9 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 10 classificat… 3 Conv… 0.984 0.536 0.722 0.984
## 11 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## 12 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## 13 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## 14 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## 15 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## 16 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## 17 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## 18 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## 19 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## 20 classificat… 4 Conv… 0.938 0.524 1.34 0.960
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
## # A tibble: 60 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## 2 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## 3 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## 4 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## 5 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## 6 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## 7 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## 8 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## 9 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## 10 classificat… 3 Auto… 0.67 0.0138 0.722 0.670
## # ℹ 50 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
## # A tibble: 25 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 3 Conv… 0.984 0.226 0.607 1.11
## 2 classificat… 3 Conv… 0.984 0.226 0.607 1.11
## 3 classificat… 3 Conv… 0.984 0.226 0.607 1.11
## 4 classificat… 3 Conv… 0.984 0.226 0.607 1.11
## 5 classificat… 3 Conv… 0.984 0.226 0.607 1.11
## 6 classificat… 4 Conv… 0.998 0.226 0.686 1.07
## 7 classificat… 4 Conv… 0.998 0.226 0.686 1.07
## 8 classificat… 4 Conv… 0.998 0.226 0.686 1.07
## 9 classificat… 4 Conv… 0.998 0.226 0.686 1.07
## 10 classificat… 4 Conv… 0.998 0.226 0.686 1.07
## # ℹ 15 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>