Loading the libraries
library(caret)
library(dplyr)
library(randomForest)
library(e1071)
library(gbm)
library(pROC)
library(knitr)
library(tidyverse)
library(tidytext)
library(tm)
library(glmnet)
library(e1071)
library(randomForest)
library(lime)
Loading the data
testdata <- read.csv("C:/Users/gyc22/Documents/AI/Files/CA_AV_CrashReportsSample.csv")
Data preprocessing
# Remove unnecessary columns
testdata <- testdata %>% select(-CrashNarrative, -File_name, -Manufacturer_Name, -Business_Name,-Date, -Time, -Location_Street, -Location_City,
-Location_County,-Location_State, -Location_Zip, -Latitude, -Longitude,-Name_Injury.Death, -Bicyclist_Injury.Death, -Property_Damage,
-Property_Owner_Name, -Otherparty_Vehicle_Year, -Otherparty_Vehicle_Model,
-Otherparty_Vehicle_State, -Vehicle_was_OtherParty,
-Involved_Party_Otherparty, -Driver, -Passenger, -Vehicle_Year_3rdparty,
-Model_3rdParty, -X3rdparty_Vehicle_State, -Vehicle_was_3rdParty,
-Involve_in_Accident_3rdParty, -Driver_3rdParty, -Passenger_3rdParty)
# Handling missing values
testdata <- na.omit(testdata)
# Converting categorical variables to factors
testdata$Pre_Crash_Mode <- as.factor(testdata$Pre_Crash_Mode)
testdata <- testdata %>% mutate_if(is.character, as.factor)
Feature Selection
nzvr <- nearZeroVar(testdata, saveMetrics = TRUE)
testdata <- testdata[, !nzvr$nzv]
Model development and training
# Model development
set.seed(123)
trainingIndex <- createDataPartition(testdata$Pre_Crash_Mode, p = 0.65, list = FALSE)
train <- testdata[trainingIndex, ]
test <- testdata[-trainingIndex, ]
# Training the model
# Decision Tree
tree_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rpart")
# Random Forest
rf_model <- train(Pre_Crash_Mode ~ ., data = train, method = "rf",tuneGrid = expand.grid(mtry = c(3, 4, 5, 6)))
# GBM
gbm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "gbm", verbose = FALSE)
# SVM
svm_model <- train(Pre_Crash_Mode ~ ., data = train, method = "svmRadial")
# kNN
knn_model <- train(Pre_Crash_Mode ~ ., data = train, method = "knn")
# GLM
log_model <- train(Pre_Crash_Mode ~ ., data = train, method = "glm")
Determing the best tuning parameter
print(log_model$bestTune)
## parameter
## 1 none
print(rf_model$bestTune)
## mtry
## 4 6
print(gbm_model$bestTune)
## n.trees interaction.depth shrinkage n.minobsinnode
## 3 150 1 0.1 10
print(svm_model$bestTune)
## sigma C
## 3 0.06299541 1
print(knn_model$bestTune)
## k
## 3 9
print(tree_model$bestTune)
## cp
## 1 0.02083333
Predictions on the test set
predict_rf <- predict(rf_model, newdata = test)
predict_tree <- predict(tree_model, newdata = test)
predict_svm <- predict(svm_model, newdata = test)
predict_gbm <- predict(gbm_model, newdata = test)
predict_knn <- predict(knn_model, newdata = test)
predict_log <- predict(log_model, newdata = test)
Accuracy and Kappa for each model
# Initializing an empty data frame to store model performance metrics
model_metrics <- data.frame(Model = character(), Accuracy = numeric(), Kappa = numeric(), stringsAsFactors = FALSE)
# Generating confusion matrices and extracting Accuracy and Kappa for each model
# Random Forest
cm_rf <- confusionMatrix(predict_rf, test$Pre_Crash_Mode)
accuracy_rf <- cm_rf$overall["Accuracy"]
kappa_rf <- cm_rf$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "Random Forest", Accuracy = as.numeric(accuracy_rf), Kappa = as.numeric(kappa_rf)))
# Decision Tree
cm_tree <- confusionMatrix(predict_tree, test$Pre_Crash_Mode)
accuracy_tree <- cm_tree$overall["Accuracy"]
kappa_tree <- cm_tree$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "Decision Tree", Accuracy = as.numeric(accuracy_tree), Kappa = as.numeric(kappa_tree)))
# SVM
cm_svm <- confusionMatrix(predict_svm, test$Pre_Crash_Mode)
accuracy_svm <- cm_svm$overall["Accuracy"]
kappa_svm <- cm_svm$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "SVM", Accuracy = as.numeric(accuracy_svm), Kappa = as.numeric(kappa_svm)))
# GBM
cm_gbm <- confusionMatrix(predict_gbm, test$Pre_Crash_Mode)
accuracy_gbm <- cm_gbm$overall["Accuracy"]
kappa_gbm <- cm_gbm$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "GBM", Accuracy = as.numeric(accuracy_gbm), Kappa = as.numeric(kappa_gbm)))
# GLM
cm_log = confusionMatrix(predict_log, test$Pre_Crash_Mode)
accuracy_log <- cm_log$overall["Accuracy"]
kappa_log <- cm_log$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "GLM", Accuracy = as.numeric(accuracy_log), Kappa = as.numeric(kappa_log)))
# kNN
cm_knn <- confusionMatrix(predict_knn, test$Pre_Crash_Mode)
accuracy_knn <- cm_knn$overall["Accuracy"]
kappa_knn <- cm_knn$overall["Kappa"]
model_metrics <- rbind(model_metrics, data.frame(Model = "kNN", Accuracy = as.numeric(accuracy_knn), Kappa = as.numeric(kappa_knn)))
# Convert accuracy and kappa columns to numeric
model_metrics$Accuracy <- as.numeric(model_metrics$Accuracy)
model_metrics$Kappa <- as.numeric(model_metrics$Kappa)
Displaying accuracy and kappa in table
kable(model_metrics, caption = "Model Performance Metrics for Accuracy and Kappa")
| Model | Accuracy | Kappa |
|---|---|---|
| Random Forest | 0.6896552 | 0.3140604 |
| Decision Tree | 0.6724138 | 0.3288672 |
| SVM | 0.6206897 | 0.1954603 |
| GBM | 0.6724138 | 0.3288672 |
| GLM | 0.5517241 | 0.0679852 |
| kNN | 0.7068966 | 0.3555556 |
Plotting the results using ggplot2
# Accuracy plot
ggplot(model_metrics, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
labs(title = "Model Accuracy Comparison", x = "Model", y = "Accuracy") +
theme_bw()
# Kappa plot
ggplot(model_metrics, aes(x = Model, y = Kappa, fill = Model)) +
geom_bar(stat = "identity") +
labs(title = "Model Kappa Comparison", x = "Model", y = "Kappa") +
theme_minimal() +
theme(legend.position = "none")
TASK 2
Loading the dataset and selecting required variables
newdata <- read.csv("C:/Users/gyc22/Documents/AI/Files/CA_AV_CrashReportsSample.csv")
newdata <- newdata %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()
Text preprocessing
corpusset <- VCorpus(VectorSource(newdata$CrashNarrative))
corpusset <- tm_map(corpusset, content_transformer(tolower))
corpusset <- tm_map(corpusset, removePunctuation)
corpusset <- tm_map(corpusset, removeNumbers)
corpusset <- tm_map(corpusset, removeWords, stopwords("en"))
corpusset <- tm_map(corpusset, stripWhitespace)
TF-IDF vectorization
document <- DocumentTermMatrix(corpusset, control = list(weighting = weightTfIdf))
tfidfData <- as.data.frame(as.matrix(document))
tfidfData$Pre_Crash_Mode <- newdata$Pre_Crash_Mode
Splitting into train and test set
set.seed(123)
trainIndex <- createDataPartition(tfidfData$Pre_Crash_Mode, p = 0.75, list = FALSE)
train_data <- tfidfData[trainIndex,]
test_data <- tfidfData[-trainIndex,]
colnames(train_data) <- make.names(colnames(train_data))
colnames(test_data) <- make.names(colnames(test_data))
Applying class balancing and training model
# Class balancing
train_control <- trainControl(method = "cv", number = 4, sampling = "up")
# Training Random Forest model using caret
rf_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "rf",
ntree = 150,
tuneGrid = data.frame(mtry = 4),
trControl = trainControl(method = "cv", number = 6)
)
# Training Logistic Regression with class balancing
log_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.15, by = 0.01))
)
# Training a Gradient Boosting Machine (GBM) Model using caret to ensure compatibility with lime
gbm_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "gbm",
trControl = trainControl(method = "cv", number = 7),
tuneGrid = expand.grid(
n.trees = 100,
interaction.depth = 4,
shrinkage = 0.2,
n.minobsinnode = 15)
,verbose = FALSE
)
Predictions and accuracy
# Predictions
log_predict <- predict(log_model, test_data)
rf_predict <- predict(rf_model, test_data)
gbm_predict <- predict(gbm_model, test_data)
# Accuracy extraction
log_acc <- mean(log_predict == test_data$Pre_Crash_Mode)
gbm_acc <- mean(gbm_predict == test_data$Pre_Crash_Mode)
rf_acc <- mean(rf_predict == test_data$Pre_Crash_Mode)
cat("Accuracy of Regularized Logistic Regression:", log_acc, "\n")
## Accuracy of Regularized Logistic Regression: 0.9264706
cat("Accuracy of Gradient Boosting Machine:", gbm_acc, "\n")
## Accuracy of Gradient Boosting Machine: 0.8970588
cat("Random Forest Accuracy with tuned parameters:", rf_acc, "\n")
## Random Forest Accuracy with tuned parameters: 0.6470588
Explainable AI using LIME
explain_rf <- lime(train_data[, -ncol(train_data)], rf_model)
explain_gbm <- lime(train_data[, -ncol(train_data)], gbm_model)
explain_log <- lime(train_data[, -ncol(train_data)], log_model)
Generating LIME explanations table
# LIME Explanations
explaining_log <- explain(test_data[1:8, -ncol(test_data)], explain_log, n_labels = 1, n_features = 12)
explaining_rf <- explain(test_data[1:8, -ncol(test_data)], explain_rf, n_labels = 1, n_features = 12)
explaining_gbm <- explain(test_data[1:8, -ncol(test_data)], explain_gbm, n_labels = 1, n_features = 12)
# Print explanations for GLM
print(explaining_log)
## # A tibble: 96 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## 2 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## 3 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## 4 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## 5 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## 6 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## 7 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## 8 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## 9 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## 10 classificat… 2 Auto… 0.914 0.389 0.0874 0.914
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
# Explanations for RF
print(explaining_rf)
## # A tibble: 96 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## 2 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## 3 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## 4 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## 5 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## 6 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## 7 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## 8 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## 9 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## 10 classificat… 2 Auto… 0.84 0.00951 0.794 0.840
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
# Explanations for GBM
print(explaining_gbm)
## # A tibble: 96 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## 2 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## 3 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## 4 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## 5 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## 6 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## 7 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## 8 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## 9 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## 10 classificat… 2 Auto… 1.00 0.161 0.606 1.00
## # ℹ 86 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
Extracting and preparing the top 12 important features for all three models
# Extract and prepare the top 12 important features for Random Forest
importance_rf <- varImp(rf_model)$importance %>%
rownames_to_column(var = "feature") %>%
arrange(desc(Overall)) %>%
slice_head(n = 12)
# Extract and prepare the top 12 important features for GBM
importance_gbm <- varImp(gbm_model)$importance %>%
rownames_to_column(var = "feature") %>%
arrange(desc(Overall)) %>%
slice_head(n = 12)
# Extract and prepare the top 12 important features for Logistic Regression
importance_log <- varImp(log_model)$importance %>%
rownames_to_column(var = "feature") %>%
arrange(desc(Overall)) %>%
slice_head(n = 12)
# Plot for Random Forest
ggplot(importance_rf, aes(x = reorder(feature, Overall), y = Overall, fill = "Random Forest")) +
geom_bar(stat = "identity", color = "black") +
coord_flip() +
scale_fill_manual(values = "red", name = "Model") +
ggtitle("Top 12 Features - Random Forest") +
xlab("Feature") +
ylab("Importance") +
theme_minimal()
# Plot for Logistic Regression
ggplot(importance_log, aes(x = reorder(feature, Overall), y = Overall, fill = "Logistic Regression")) +
geom_bar(stat = "identity", color = "black") +
coord_flip() +
scale_fill_manual(values = "pink", name = "Model") +
ggtitle("Top 12 Features - Logistic Regression") +
xlab("Feature") +
ylab("Importance") +
theme_minimal()
# Plot for GBM
ggplot(importance_gbm, aes(x = reorder(feature, Overall), y = Overall, fill = "GBM")) +
geom_bar(stat = "identity", color = "black") +
coord_flip() +
scale_fill_manual(values = "green", name = "Model") +
ggtitle("Top 12 Features - GBM") +
xlab("Feature") +
ylab("Importance") +
theme_minimal()