Load Libraries
library(caret)
library(dplyr)
library(randomForest)
library(e1071)
library(gbm)
library(glmnet)
library(knitr)
library(ggplot2)
Load and Process Data
# Load the dataset
data <- read.csv("C:/AI in CE/CA_AV_CrashReportsSample (1).csv")
# Step 1: Data Preprocessing
# Select only the specified columns
data <- data %>% select(Year, Veh1_Year, Veh1_Make, Veh1_Model, Veh1_Register_State, Vehicle_Was,Involved_in_the_Accident, IS_VRU_FromNarra, IS_VRU, No_Vehicle_Involved,
Otherparty_Vehicle_Year, Vehicle_was_OtherParty, Involved_Party_Otherparty,Pre_Crash_Mode, Weather, Lighting, Roadway_Surface, Roadway.Conditions,Movement_Bef_Veh1, Movement_Bef_Veh2, Movement_Bef_Veh3, Collision_Type,Collision_Type1)
# Handle missing values (example: remove rows with NA values)
data <- na.omit(data)
# Convert categorical variables to factors
data$Pre_Crash_Mode <- as.factor(data$Pre_Crash_Mode)
data <- data %>% mutate_if(is.character, as.factor)
# Step 2: PCA Transformation
# Scale the data and apply PCA
preProc <- preProcess(data[, -ncol(data)], method = c("center", "scale", "pca"), pcaComp = 15) # Adjust pcaComp as needed
data_pca <- predict(preProc, data)
# Add the target variable back to the PCA-transformed data
data_pca$Pre_Crash_Mode <- data$Pre_Crash_Mode
# Step 3: Train-Test Split
set.seed(123)
trainIndex <- createDataPartition(data_pca$Pre_Crash_Mode, p = 0.8, list = FALSE)
train <- data_pca[trainIndex, ]
test <- data_pca[-trainIndex, ]
# Define training control with cross-validation
train_control <- trainControl(method = "cv", number = 5)
Apply ML Models
# Step 4: Model Training
# Logistic Regression with PCA-transformed data
log_model <- train(
Pre_Crash_Mode ~ ., data = train,
method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01))
)
# Random Forest with PCA-transformed data
rf_model <- train(
Pre_Crash_Mode ~ ., data = train,
method = "rf",
trControl = train_control,
tuneGrid = expand.grid(mtry = c(2, 3, 4, 5))
)
# GBM with PCA-transformed data
gbm_model <- train(
Pre_Crash_Mode ~ ., data = train,
method = "gbm",
trControl = train_control,
tuneGrid = expand.grid(
n.trees = c(50, 100),
interaction.depth = c(1, 3),
shrinkage = c(0.1, 0.05),
n.minobsinnode = c(10, 20)
),
verbose = FALSE
)
# SVM with PCA-transformed data
svm_model <- train(
Pre_Crash_Mode ~ ., data = train,
method = "svmRadial",
trControl = train_control,
tuneGrid = expand.grid(C = c(0.1, 1, 10), sigma = c(0.01, 0.05, 0.1))
)
Model Evaluation
# Predict on the test set and calculate accuracy and Kappa
log_pred <- predict(log_model, test)
rf_pred <- predict(rf_model, test)
gbm_pred <- predict(gbm_model, test)
svm_pred <- predict(svm_model, test)
# Calculate accuracy
log_acc <- mean(log_pred == test$Pre_Crash_Mode)
rf_acc <- mean(rf_pred == test$Pre_Crash_Mode)
gbm_acc <- mean(gbm_pred == test$Pre_Crash_Mode)
svm_acc <- mean(svm_pred == test$Pre_Crash_Mode)
# Calculate Kappa
log_kappa <- confusionMatrix(log_pred, test$Pre_Crash_Mode)$overall['Kappa']
rf_kappa <- confusionMatrix(rf_pred, test$Pre_Crash_Mode)$overall['Kappa']
gbm_kappa <- confusionMatrix(gbm_pred, test$Pre_Crash_Mode)$overall['Kappa']
svm_kappa <- confusionMatrix(svm_pred, test$Pre_Crash_Mode)$overall['Kappa']
# Print accuracies and Kappa values
cat("Logistic Regression (PCA) Accuracy:", log_acc, "Kappa:", log_kappa, "\n")
## Logistic Regression (PCA) Accuracy: 0.6956522 Kappa: 0.3878327
cat("Random Forest (PCA) Accuracy:", rf_acc, "Kappa:", rf_kappa, "\n")
## Random Forest (PCA) Accuracy: 0.7391304 Kappa: 0.4297521
cat("Gradient Boosting Machine (PCA) Accuracy:", gbm_acc, "Kappa:", gbm_kappa, "\n")
## Gradient Boosting Machine (PCA) Accuracy: 0.8695652 Kappa: 0.7376426
cat("SVM (PCA) Accuracy:", svm_acc, "Kappa:", svm_kappa, "\n")
## SVM (PCA) Accuracy: 0.8695652 Kappa: 0.7315175
Visualization of Results
# View best tuning parameters for each model
print(log_model$bestTune)
## alpha lambda
## 1 1 0.001
print(rf_model$bestTune)
## mtry
## 2 3
print(gbm_model$bestTune)
## n.trees interaction.depth shrinkage n.minobsinnode
## 9 50 1 0.1 10
print(svm_model$bestTune)
## sigma C
## 7 0.01 10
# Create a summary table
model_performance <- data.frame(
Model = c("Logistic Regression", "Random Forest", "Gradient Boosting Machine", "SVM"),
Accuracy = c(log_acc, rf_acc, gbm_acc, svm_acc),
Kappa = c(log_kappa, rf_kappa, gbm_kappa, svm_kappa)
)
# Print the summary table
print(model_performance)
## Model Accuracy Kappa
## 1 Logistic Regression 0.6956522 0.3878327
## 2 Random Forest 0.7391304 0.4297521
## 3 Gradient Boosting Machine 0.8695652 0.7376426
## 4 SVM 0.8695652 0.7315175
# View the table in a nicer format
kable(model_performance, caption = "Model Performance Summary with Accuracy and Kappa")
Model | Accuracy | Kappa |
---|---|---|
Logistic Regression | 0.6956522 | 0.3878327 |
Random Forest | 0.7391304 | 0.4297521 |
Gradient Boosting Machine | 0.8695652 | 0.7376426 |
SVM | 0.8695652 | 0.7315175 |
# Visualization: Graphical Representation of Model Performance (Accuracy and Kappa)
# Accuracy plot
ggplot(model_performance, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity", width = 0.6) +
theme_minimal() +
labs(
title = "Model Performance Comparison (Accuracy)",
x = "Model",
y = "Accuracy"
) +
geom_text(aes(label = round(Accuracy, 3)), vjust = -0.3) +
theme(legend.position = "none")
# Kappa plot
ggplot(model_performance, aes(x = Model, y = Kappa, fill = Model)) +
geom_bar(stat = "identity", width = 0.6) +
theme_minimal() +
labs(
title = "Model Performance Comparison (Kappa)",
x = "Model",
y = "Kappa"
) +
geom_text(aes(label = round(Kappa, 3)), vjust = -0.3) +
theme(legend.position = "none")
Load Libraries
library(tidyverse)
library(caret)
library(tm)
library(glmnet)
library(e1071)
library(randomForest)
library(gbm)
library(lime)
Load and Process Data
# Load dataset
data <- read.csv("C:/AI in CE/CA_AV_CrashReportsSample (1).csv")
data <- data %>% select(CrashNarrative, Pre_Crash_Mode) %>% drop_na()
# Text Preprocessing
corpus <- VCorpus(VectorSource(data$CrashNarrative))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)
# TF-IDF Vectorization
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
tfidf_data <- as.data.frame(as.matrix(dtm))
tfidf_data$Pre_Crash_Mode <- data$Pre_Crash_Mode
# Train-Test Split
set.seed(123)
trainIndex <- createDataPartition(tfidf_data$Pre_Crash_Mode, p = 0.8, list = FALSE)
train_data <- tfidf_data[trainIndex,]
test_data <- tfidf_data[-trainIndex,]
colnames(train_data) <- make.names(colnames(train_data))
colnames(test_data) <- make.names(colnames(test_data))
Apply ML Models
# Model Training
# Apply class balancing
train_control <- trainControl(method = "cv", number = 5, sampling = "up")
# Train Logistic Regression
log_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01))
)
# Train Random Forest model
rf_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "rf",
ntree = 200,
tuneGrid = data.frame(mtry = 3),
trControl = train_control
)
# Train Gradient Boosting Machine (GBM) model
gbm_model <- train(
Pre_Crash_Mode ~ ., data = train_data,
method = "gbm",
trControl = train_control,
tuneGrid = expand.grid(
n.trees = 100,
interaction.depth = 3,
shrinkage = 0.1,
n.minobsinnode = 10
),
verbose = FALSE
)
Model Evaluation and Visualization of Results
# Model Evaluation
log_pred <- predict(log_model, test_data)
gbm_pred <- predict(gbm_model, test_data)
rf_pred <- predict(rf_model, test_data)
log_acc <- mean(log_pred == test_data$Pre_Crash_Mode)
gbm_acc <- mean(gbm_pred == test_data$Pre_Crash_Mode)
rf_acc <- mean(rf_pred == test_data$Pre_Crash_Mode)
cat("Logistic Regression Accuracy:", log_acc, "\n")
## Logistic Regression Accuracy: 0.9056604
cat("GBM Accuracy:", gbm_acc, "\n")
## GBM Accuracy: 0.9245283
cat("Random Forest Accuracy:", rf_acc, "\n")
## Random Forest Accuracy: 0.7169811
# Explainable AI with LIME
explainer_log <- lime(train_data[, -ncol(train_data)], log_model)
explainer_rf <- lime(train_data[, -ncol(train_data)], rf_model)
explainer_gbm <- lime(train_data[, -ncol(train_data)], gbm_model)
# LIME Explanations
explanation_log <- explain(test_data[1:5, -ncol(test_data)], explainer_log, n_labels = 1, n_features = 20)
explanation_rf <- explain(test_data[1:5, -ncol(test_data)], explainer_rf, n_labels = 1, n_features = 20)
explanation_gbm <- explain(test_data[1:5, -ncol(test_data)], explainer_gbm, n_labels = 1, n_features = 20)
# Print LIME explanations for Logistic Regression
cat("\nLIME Explanation for Logistic Regression:\n")
##
## LIME Explanation for Logistic Regression:
print(explanation_log)
## # A tibble: 100 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## 2 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## 3 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## 4 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## 5 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## 6 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## 7 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## 8 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## 9 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## 10 classificat… 3 Conv… 0.984 0.522 1.13 0.984
## # ℹ 90 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
# Print LIME explanations for Random Forest
cat("\nLIME Explanation for Random Forest:\n")
##
## LIME Explanation for Random Forest:
print(explanation_rf)
## # A tibble: 100 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## 2 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## 3 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## 4 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## 5 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## 6 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## 7 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## 8 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## 9 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## 10 classificat… 3 Conv… 0.7 0.181 0.327 0.700
## # ℹ 90 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
# Print LIME explanations for GBM
cat("\nLIME Explanation for GBM:\n")
##
## LIME Explanation for GBM:
print(explanation_gbm)
## # A tibble: 100 × 13
## model_type case label label_prob model_r2 model_intercept model_prediction
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## 2 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## 3 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## 4 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## 5 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## 6 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## 7 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## 8 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## 9 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## 10 classificat… 3 Conv… 0.988 0.171 0.813 0.988
## # ℹ 90 more rows
## # ℹ 6 more variables: feature <chr>, feature_value <dbl>, feature_weight <dbl>,
## # feature_desc <chr>, data <list>, prediction <list>
# Extract Feature Importances for Top 20 Features
importance_log <- varImp(log_model)$importance
importance_rf <- varImp(rf_model)$importance
importance_gbm <- varImp(gbm_model)$importance
top_20_log <- importance_log %>% arrange(desc(Overall)) %>% head(20)
top_20_rf <- importance_rf %>% arrange(desc(Overall)) %>% head(20)
top_20_gbm <- importance_gbm %>% arrange(desc(Overall)) %>% head(20)
# Plot Top 20 Feature Importances with Different Colors
ggplot(top_20_log, aes(x = reorder(rownames(top_20_log), Overall), y = Overall, fill = "Logistic Regression")) +
geom_bar(stat = "identity", color = "black") +
coord_flip() +
scale_fill_manual(values = "blue") +
ggtitle("Top 20 Features - Logistic Regression") +
xlab("Feature") +
ylab("Importance")
ggplot(top_20_rf, aes(x = reorder(rownames(top_20_rf), Overall), y = Overall, fill = "Random Forest")) +
geom_bar(stat = "identity", color = "black") +
coord_flip() +
scale_fill_manual(values = "green") +
ggtitle("Top 20 Features - Random Forest") +
xlab("Feature") +
ylab("Importance")
ggplot(top_20_gbm, aes(x = reorder(rownames(top_20_gbm), Overall), y = Overall, fill = "GBM")) +
geom_bar(stat = "identity", color = "black") +
coord_flip() +
scale_fill_manual(values = "pink") +
ggtitle("Top 20 Features - GBM") +
xlab("Feature") +
ylab("Importance")