Loading the libraries
library(pROC)
library(knitr)
library(caret)
library(dplyr)
library(randomForest)
library(e1071)
library(gbm)
library(tidyverse)
library(tidytext)
library(tm)
library(glmnet)
library(e1071)
library(randomForest)
library(lime)
setwd("C:/Users/10207/Downloads")
Loading the data
data <- read.csv("C:/Users/10207/Downloads/CA_AV_CrashReportsSample.csv")
Data preprocessing
# Remove unnecessary Data
data <- data %>% select(-CrashNarrative, -File_name, -Manufacturer_Name, -Business_Name,-Date, -Time, -Location_Street, -Location_City,
-Location_County,-Location_State, -Location_Zip, -Latitude, -Longitude,-Name_Injury.Death, -Bicyclist_Injury.Death, -Property_Damage,
-Property_Owner_Name, -Otherparty_Vehicle_Year, -Otherparty_Vehicle_Model,
-Otherparty_Vehicle_State, -Vehicle_was_OtherParty,
-Involved_Party_Otherparty, -Driver, -Passenger, -Vehicle_Year_3rdparty,
-Model_3rdParty, -X3rdparty_Vehicle_State, -Vehicle_was_3rdParty,
-Involve_in_Accident_3rdParty, -Driver_3rdParty, -Passenger_3rdParty)
# Handling missing values
data <- na.omit(data)
# Converting categorical variables to factors
data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data <- data %>% mutate_if(is.character, as.factor)
Feature Selection
nzv <- nearZeroVar(data, saveMetrics = TRUE)
data <- data[, !nzv$nzv]
Model development and training
# Model development
set.seed(123)
trainIndex <- createDataPartition(data$Pre_Crash_Mode, p = 0.6, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]
# Training the model
# Decision Tree
tree_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rpart")
# Random Forest
rf_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rf",tuneGrid = expand.grid(mtry = c(3, 4, 5, 6)))
# GBM
gbm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "gbm", verbose = FALSE)
# SVM
svm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "svmRadial")
# kNN
knn_model <- train(Pre_Crash_Mode ~ ., data = train, method = "knn")
# GLM
log_model <- train(Pre_Crash_Mode ~ ., data = train, method = "glm")
Determing the best tuning parameter
print(log_model$bestTune)
## parameter
## 1 none
print(rf_model$bestTune)
## mtry
## 3 5
print(gbm_model$bestTune)
## n.trees interaction.depth shrinkage n.minobsinnode
## 4 50 2 0.1 10
print(svm_model$bestTune)
## sigma C
## 3 0.0657767 1
print(knn_model$bestTune)
## k
## 3 9
print(tree_model$bestTune)
## cp
## 1 0
Predictions on the test set
pred_rf <- predict(rf_model, newdata = test)
pred_tree <- predict(tree_model, newdata = test)
pred_svm <- predict(svm_model, newdata = test)
pred_gbm <- predict(gbm_model, newdata = test)
pred_knn <- predict(knn_model, newdata = test)
pred_log <- predict(log_model, newdata = test)
Accuracy and Kappa for each model
# Initializing an empty data frame to store model performance metrics
model_metrics <- data.frame(Model = character(), Accuracy = numeric(), Kappa = numeric(), stringsAsFactors = FALSE)
# Generating confusion matrices and extracting Accuracy and Kappa for each model
# Random Forest
cm_rf <- confusionMatrix(pred_rf, test$Pre_Crash_Mode)
accuracy_rf <- cm_rf$overall["Accuracy"]
kappa_rf <- cm_rf$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "Random Forest", Accuracy = as.numeric(accuracy_rf), Kappa = as.numeric(kappa_rf)))
# Decision Tree
cm_tree <- confusionMatrix(pred_tree, test$Pre_Crash_Mode)
accuracy_tree <- cm_tree$overall["Accuracy"]
kappa_tree <- cm_tree$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "Decision Tree", Accuracy = as.numeric(accuracy_tree), Kappa = as.numeric(kappa_tree)))
# SVM
cm_svm <- confusionMatrix(pred_svm, test$Pre_Crash_Mode)
accuracy_svm <- cm_svm$overall["Accuracy"]
kappa_svm <- cm_svm$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "SVM", Accuracy = as.numeric(accuracy_svm), Kappa = as.numeric(kappa_svm)))
# GBM
cm_gbm <- confusionMatrix(pred_gbm, test$Pre_Crash_Mode)
accuracy_gbm <- cm_gbm$overall["Accuracy"]
kappa_gbm <- cm_gbm$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "GBM", Accuracy = as.numeric(accuracy_gbm), Kappa = as.numeric(kappa_gbm)))
# GLM
cm_log = confusionMatrix(pred_log, test$Pre_Crash_Mode)
accuracy_log <- cm_log$overall["Accuracy"]
kappa_log <- cm_log$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "GLM", Accuracy = as.numeric(accuracy_log), Kappa = as.numeric(kappa_log)))
# kNN
cm_knn <- confusionMatrix(pred_knn, test$Pre_Crash_Mode)
accuracy_knn <- cm_knn$overall["Accuracy"]
kappa_knn <- cm_knn$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "kNN", Accuracy = as.numeric(accuracy_knn), Kappa = as.numeric(kappa_knn)))
# Convert accuracy and kappa columns to numeric
model_metrics$Accuracy <- as.numeric(model_metrics$Accuracy)
model_metrics$Kappa <- as.numeric(model_metrics$Kappa)
Displaying accuracy and kappa in table
kable(model_metrics, caption = "Model Performance Metrics for Accuracy and Kappa")
| Model | Accuracy | Kappa |
|---|---|---|
| Random Forest | 0.6865672 | 0.3258265 |
| Decision Tree | 0.6567164 | 0.2921452 |
| SVM | 0.6865672 | 0.3315914 |
| GBM | 0.6119403 | 0.1897674 |
| GLM | 0.5522388 | 0.0880218 |
| kNN | 0.6567164 | 0.2551957 |
Plotting the results using ggplot2
# Accuracy plot
ggplot(model_metrics, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
theme_bw()
# Kappa plot
ggplot(model_metrics, aes(x = Model, y = Kappa, fill = Model)) +
geom_bar(stat = "identity") +
labs(title = "Model Kappa Comparison", x = "Model", y = "Kappa") +
theme_minimal() +
theme(legend.position = "none")
TASK 2
Loading the dataset and selecting required variables
data <- read.csv("C:/Users/10207/Downloads/CA_AV_CrashReportsSample.csv")
data <- data %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()
Text preprocessing
corpus <- VCorpus(VectorSource(data$CrashNarrative))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)
TF-IDF vectorization
document <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
tfidfData <- as.data.frame(as.matrix(document))
tfidfData$Pre_Crash_Mode <- data$Pre_Crash_Mode
Splitting into train and test set
set.seed(123)
trainIndex <- createDataPartition(tfidfData$Pre_Crash_Mode, p = 0.75, list = FALSE)
train_data <- tfidfData[trainIndex,]
test_data <- tfidfData[-trainIndex,]
colnames(train_data) <- make.names(colnames(train_data))
colnames(test_data) <- make.names(colnames(test_data))
Applying class balancing and training model
# Class balancing
train_control <- trainControl(method = "cv", number = 7, sampling = "up")
# Training Logistic Regression with class balancing
log_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.2, by = 0.01))
)
# Training Random Forest model using caret
rf_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "rf",
ntree = 150,
tuneGrid = data.frame(mtry = 4),
trControl = trainControl(method = "cv", number = 6)
)
# Training a Gradient Boosting Machine (GBM) Model using caret to ensure compatibility with lime
gbm_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "gbm",
trControl = trainControl(method = "cv", number = 7),
tuneGrid = expand.grid(
n.trees = 100,
interaction.depth = 4,
shrinkage = 0.2,
n.minobsinnode = 15)
,verbose = FALSE
)
Predictions and accuracy
# Predictions
log_pred <- predict(log_model, test_data)
gbm_pred <- predict(gbm_model, test_data)
rf_pred <- predict(rf_model, test_data)
# Accuracy extraction
log_acc <- mean(log_pred == test_data$Pre_Crash_Mode)
gbm_acc <- mean(gbm_pred == test_data$Pre_Crash_Mode)
rf_acc <- mean(rf_pred == test_data$Pre_Crash_Mode)
cat("Accuracy of Regularized Logistic Regression:", log_acc, "\n")
## Accuracy of Regularized Logistic Regression: 0.9117647
cat("Accuracy of Gradient Boosting Machine:", gbm_acc, "\n")
## Accuracy of Gradient Boosting Machine: 0.9264706
cat("Random Forest Accuracy with tuned parameters:", rf_acc, "\n")
## Random Forest Accuracy with tuned parameters: 0.6470588
Explainable AI using LIME
explainer_rf <- lime(train_data[, -ncol(train_data)], rf_model)
explainer_gbm <- lime(train_data[, -ncol(train_data)], gbm_model)
explainer_log <- lime(train_data[, -ncol(train_data)], log_model)
Generating LIME explanations table
# Creating LIME Interpreter Objects
explainer_log <- lime::lime(train_data[, -ncol(train_data)], log_model)
explainer_rf <- lime::lime(train_data[, -ncol(train_data)], rf_model)
explainer_gbm <- lime::lime(train_data[, -ncol(train_data)], gbm_model)
# Interpreting Test Data with Interpreter Objects
# The first 8 samples were interpreted and the 12 most important features were extracted.
# Interpretation of the GLM model
explanation_log <- lime::explain(
x = test_data[1:8, -ncol(test_data)], # Test data, remove target columns
explainer = explainer_log,
n_labels = 1,
n_features = 12
)
# Interpretation of the Random Forest Model
explanation_rf <- lime::explain(
x = test_data[1:8, -ncol(test_data)],
explainer = explainer_rf,
n_labels = 1,
n_features = 12
)
# Explanation of the GBM model
explanation_gbm <- lime::explain(
x = test_data[1:8, -ncol(test_data)],
explainer = explainer_gbm,
n_labels = 1,
n_features = 12
)
# Output Interpretation Results
cat("GLM Model Explanations:\n")
## GLM Model Explanations:
print(explanation_log)
## # A tibble: 96 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## 2 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## 3 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## 4 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## 5 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## 6 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## 7 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## 8 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## 9 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## 10 classificat… 2 Auto… 0.655 0.702 0.0931 0.655
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
cat("\nRandom Forest Model Explanations:\n")
##
## Random Forest Model Explanations:
print(explanation_rf)
## # A tibble: 96 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## 2 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## 3 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## 4 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## 5 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## 6 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## 7 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## 8 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## 9 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## 10 classificat… 2 Auto… 0.86 0.0371 0.512 0.860
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
cat("\nGBM Model Explanations:\n")
##
## GBM Model Explanations:
print(explanation_gbm)
## # A tibble: 96 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## 2 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## 3 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## 4 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## 5 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## 6 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## 7 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## 8 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## 9 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## 10 classificat… 2 Auto… 1.00 0.130 -0.420 1.00
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
Extracting and preparing the top 12 important features for all three models
# Extract and prepare the first 12 important features
# Logistic Regression
importance_log <- varImp(log_model)$importance %>%
rownames_to_column(var = "feature") %>%
arrange(desc(Overall)) %>%
slice_head(n = 12)
# Random Forest
importance_rf <- varImp(rf_model)$importance %>%
rownames_to_column(var = "feature") %>%
arrange(desc(Overall)) %>%
slice_head(n = 12)
# GBM
importance_gbm <- varImp(gbm_model)$importance %>%
rownames_to_column(var = "feature") %>%
arrange(desc(Overall)) %>%
slice_head(n = 12)
# Visualisation functions to avoid duplicate code
plot_feature_importance <- function(importance_data, model_name, fill_color) {
ggplot(importance_data, aes(x = reorder(feature, Overall), y = Overall, fill = model_name)) +
geom_bar(stat = "identity", color = "#501214") +
coord_flip() +
scale_fill_manual(values = fill_color, name = "Model") +
ggtitle(paste("Top 12 Features -", model_name)) +
xlab("Feature") +
ylab("Importance") +
theme_minimal()
}
# Charting important features
# Logistic Regression
plot_feature_importance(importance_log, "Logistic Regression", "#AC9155")
# Random Forest
plot_feature_importance(importance_rf, "Random Forest", "#501214")
# GBM
plot_feature_importance(importance_gbm, "GBM", "#AC9155")