Training and evaluating three supervised machine learning models (Logistic Regression, Random Forest, and XGBoost) on historical insurance claim data to detect fraudulent claims.
Dataset: Vehicle Claim Fraud Detection on Kaggle
library(tidyverse)      # data manipulation and plotting (attaches dplyr and ggplot2)
library(ggplot2)        # attached by tidyverse; loaded explicitly for clarity
library(reshape2)       # melt() for reshaping the correlation matrix
library(corrplot)       # correlation visualizations
library(dplyr)          # attached by tidyverse; loaded explicitly for clarity
library(caret)          # unified model training and cross-validation
library(caTools)        # stratified train/test splitting
library(ROSE)           # class balancing via synthetic resampling
library(xgboost)        # gradient-boosted trees
library(randomForest)   # random forests
library(pROC)           # ROC curves and AUC
library(recipes)        # preprocessing: dummy encoding, centering, scaling
library(knitr)          # table formatting
set.seed(123)
data <- read.csv("fraud_oracle.csv", stringsAsFactors = TRUE)
fraud_counts <- table(data$FraudFound_P)
barplot(fraud_counts, names.arg = c("Non-Fraud", "Fraud"), col = c("skyblue", "red"), main = "Distribution of Fraud within the Dataset", ylab = "Count")
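To quantify the imbalance, the same counts can be expressed as percentages (a quick check; the exact rate depends on the dataset snapshot):
round(prop.table(fraud_counts) * 100, 2)  # share of non-fraud vs. fraud claims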
Employing a correlation heatmap as a visual aid for feature selection.
df_all <- data
df_all[] <- lapply(df_all, function(x) if (is.factor(x)) as.numeric(x) else x)  # coerce factors to integer codes so cor() can be applied
corr_all <- cor(df_all, use = "pairwise.complete.obs")
corr_melt <- melt(corr_all)
ggplot(corr_melt, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value, 2)), size = 3) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 8),
axis.text.y = element_text(size = 8),
axis.title = element_blank()) +
labs(title = "Correlation Heatmap")
Eliminating variables whose absolute pairwise correlation exceeds 0.2 to reduce multicollinearity while retaining predictive power; the surviving predictors are listed below.
features <- c("FraudFound_P", "AgeOfPolicyHolder", "Fault", "VehiclePrice",
"Days_Policy_Accident", "PastNumberOfClaims", "AgeOfVehicle",
"PoliceReportFiled", "AgentType", "NumberOfSuppliments",
"AddressChange_Claim", "WitnessPresent", "Sex", "AccidentArea",
"BasePolicy", "MaritalStatus", "DriverRating", "NumberOfCars",
"Deductible", "MonthClaimed", "Make", "Month", "Year")
df <- data %>% select(all_of(features))
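As a sanity check on the selection, the maximum absolute pairwise correlation among the retained predictors can be recomputed, using the same factor-to-integer coercion as the heatmap (a minimal sketch; df_num is an illustrative name):
df_num <- df %>% mutate(across(where(is.factor), as.numeric))  # same coercion as above
corr_kept <- cor(df_num[, names(df_num) != "FraudFound_P"], use = "pairwise.complete.obs")
max(abs(corr_kept[upper.tri(corr_kept)]))  # ideally at or below the 0.2 threshold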
Allocating 70% of the data for training the models and reserving the remaining 30% for testing.
sample_split <- sample.split(df$FraudFound_P, SplitRatio = 0.7)
train_data <- subset(df, sample_split == TRUE)
test_data <- subset(df, sample_split == FALSE)
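Since sample.split stratifies on the response, both partitions should preserve the original fraud rate; a quick verification:
round(prop.table(table(train_data$FraudFound_P)), 3)  # train class proportions
round(prop.table(table(test_data$FraudFound_P)), 3)   # test class proportions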
ROSE (Random Over-Sampling Examples) balances the classes by generating synthetic examples via a smoothed bootstrap: it resamples observations and perturbs them using a kernel density estimate, rather than using SMOTE's nearest-neighbour interpolation. Setting p = 0.5 targets a roughly even class split.
traindata <- ROSE(FraudFound_P ~ ., data = train_data, p = 0.5, seed = 123)
train_balanced <- traindata$data
balanced_fraud_counts <- table(train_balanced$FraudFound_P)
barplot(balanced_fraud_counts, names.arg = c("Non-Fraud", "Fraud"), col = c("skyblue", "red"),
main = "Distribution of Fraud within the Dataset after Balancing", ylab = "Count")
Performing one-hot encoding to transform categorical variables into binary indicator columns, then centering and scaling so features with larger numeric ranges do not dominate the model's learning.
rec <- recipe(FraudFound_P ~ ., data = train_balanced) %>%
step_dummy(all_nominal_predictors()) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
prep(training = train_balanced)
train_processed <- bake(rec, new_data = train_balanced)
test_processed <- bake(rec, new_data = test_data)
train_processed$FraudFound_P <- factor(train_balanced$FraudFound_P)
test_processed$FraudFound_P <- factor(test_data$FraudFound_P)
# caret requires syntactically valid level names (e.g., X0/X1) when classProbs = TRUE
levels(train_processed$FraudFound_P) <- make.names(levels(train_processed$FraudFound_P))
levels(test_processed$FraudFound_P) <- make.names(levels(test_processed$FraudFound_P))
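Because each categorical variable expands into several indicator columns, the processed data is considerably wider than the 23 raw features; a quick look confirms the expansion:
dim(train_processed)  # rows x (dummy-encoded, scaled) columns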
Training with 5-fold cross-validation and selecting models by the ROC metric (the area under the ROC curve, as reported by twoClassSummary).
ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
log_model <- train(FraudFound_P ~ ., data = train_processed,
method = "glm", family = "binomial",
trControl = ctrl, metric = "ROC")
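The cross-validated performance of the fitted logistic regression can be inspected directly from the caret object (a quick check):
log_model$results[, c("ROC", "Sens", "Spec")]  # mean CV AUC, sensitivity, specificity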
Specifying an ensemble of 500 trees, randomly sampling roughly the square root of the number of predictors at each split, and computing feature importance.
rf_model <- randomForest(FraudFound_P ~ ., data = train_processed, ntree = 500,
mtry = floor(sqrt(ncol(train_processed)-1)), importance = TRUE)
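randomForest also reports an out-of-bag (OOB) error estimate, a built-in validation signal that does not touch the held-out test set:
print(rf_model)  # OOB error rate and per-class confusion matrix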
Training for 200 boosting rounds, comparing shallow versus moderate tree depth (3 and 6), using small learning rates (0.05 and 0.1) for stable learning, growing each tree on 80% of the rows and 80% of the columns, and allowing splits as long as a child node carries at least weight 1.
xgb_grid <- expand.grid(
nrounds = 200,
max_depth = c(3,6),
eta = c(0.05, 0.1),
gamma = 0,
colsample_bytree = 0.8,
min_child_weight = 1,
subsample = 0.8
)
xgb_model <- train(FraudFound_P ~ ., data = train_processed,
method = "xgbTree",
trControl = ctrl,
tuneGrid = xgb_grid,
metric = "ROC"
)
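caret evaluates all four grid combinations (two depths times two learning rates) with the 5-fold CV defined above and retains the one with the highest ROC; the winning combination can be inspected:
xgb_model$bestTune  # hyperparameters selected by cross-validated ROC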
Defining a function that returns the confusion matrix, accuracy, precision, recall, F1 score, and AUC for each model.
evaluate_model <- function(model, test_data, type_prob = TRUE) {
if (type_prob) {
# probability of the positive (fraud) class
probs <- predict(model, newdata = test_data, type = "prob")[, 2]
} else {
# class predictions: map the factor to 0/1 so the threshold below still applies
preds <- predict(model, newdata = test_data, type = "raw")
probs <- as.numeric(preds == levels(preds)[2])
}
actual <- ifelse(test_data$FraudFound_P == levels(test_data$FraudFound_P)[2], 1, 0)
pred_labels <- ifelse(probs > 0.5, 1, 0)
# force both levels so the matrix is always 2x2, even if a class is never predicted
cm <- table(Predicted = factor(pred_labels, levels = c(0, 1)),
Actual = factor(actual, levels = c(0, 1)))
accuracy <- sum(diag(cm)) / sum(cm)
precision <- cm[2,2] / sum(cm[2,])  # TP / (TP + FP)
recall <- cm[2,2] / sum(cm[,2])     # TP / (TP + FN)
f1 <- 2*(precision*recall)/(precision+recall)
roc_obj <- roc(actual, probs)
auc_val <- auc(roc_obj)
return(list(
confusion_matrix = cm,
accuracy = accuracy,
precision = precision,
recall = recall,
f1 = f1,
roc_obj = roc_obj,
auc = auc_val
))
}
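To make the formulas concrete, here is a worked example on a hypothetical 2x2 confusion matrix (numbers invented purely for illustration):
# Rows are predictions, columns are actuals: 90 TN, 10 FN, 5 FP, 20 TP
cm_demo <- matrix(c(90, 10, 5, 20), nrow = 2, byrow = TRUE,
dimnames = list(Predicted = c(0, 1), Actual = c(0, 1)))
cm_demo[2, 2] / sum(cm_demo[2, ])  # precision = TP/(TP+FP) = 20/25 = 0.8
cm_demo[2, 2] / sum(cm_demo[, 2])  # recall    = TP/(TP+FN) = 20/30 ~ 0.667
2 * (0.8 * 20/30) / (0.8 + 20/30)  # F1 ~ 0.727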
# Helper to draw an ROC curve beside the test-set metrics for one model
plot_eval <- function(eval, model_name, color) {
par(mfrow = c(1, 2))
plot(eval$roc_obj, col = color, lwd = 2, main = paste("ROC -", model_name))
plot.new()
cm_text <- apply(eval$confusion_matrix, 1, function(x) paste(x, collapse = " "))
cm_text <- paste(names(cm_text), cm_text, sep = ": ", collapse = "\n")
text(0.3, 0.9, paste("Accuracy:", round(eval$accuracy, 3)))
text(0.3, 0.8, paste("Precision:", round(eval$precision, 3)))
text(0.3, 0.7, paste("Recall:", round(eval$recall, 3)))
text(0.3, 0.6, paste("F1 Score:", round(eval$f1, 3)))
text(0.3, 0.5, paste("AUC:", round(eval$auc, 3)))
text(0.3, 0.3, labels = "Confusion Matrix:", font = 2, cex = 1)
text(0.3, 0.1, labels = cm_text, adj = 0)
par(mfrow = c(1, 1))
}
log_eval <- evaluate_model(log_model, test_processed, type_prob = TRUE)
plot_eval(log_eval, "Logistic Regression", "blue")
rf_eval <- evaluate_model(rf_model, test_processed, type_prob = TRUE)
plot_eval(rf_eval, "Random Forest", "green")
xgb_eval <- evaluate_model(xgb_model, test_processed, type_prob = TRUE)
plot_eval(xgb_eval, "XGBoost", "red")
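With knitr already loaded, the three sets of test metrics can be gathered into one comparison table (a convenience sketch reusing the evaluation objects above):
comparison <- data.frame(
Model = c("Logistic Regression", "Random Forest", "XGBoost"),
Accuracy = c(log_eval$accuracy, rf_eval$accuracy, xgb_eval$accuracy),
Precision = c(log_eval$precision, rf_eval$precision, xgb_eval$precision),
Recall = c(log_eval$recall, rf_eval$recall, xgb_eval$recall),
F1 = c(log_eval$f1, rf_eval$f1, xgb_eval$f1),
AUC = as.numeric(c(log_eval$auc, rf_eval$auc, xgb_eval$auc))
)
kable(comparison, digits = 3)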
Quantifying how much each input variable contributes to a model's predicted probability that a claim is fraudulent.
par(mfrow = c(1, 1))
coef_vals <- summary(log_model$finalModel)$coefficients
coef_df <- data.frame(
Feature = rownames(coef_vals),
Coefficient = coef_vals[, "Estimate"]
)
coef_df <- coef_df[coef_df$Feature != "(Intercept)", ]
coef_df <- coef_df[order(abs(coef_df$Coefficient), decreasing = TRUE), ]
top_n <- 10
top_features <- coef_df[1:top_n, ]
ggplot(top_features, aes(x = reorder(Feature, abs(Coefficient)), y = Coefficient)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = paste("Top", top_n, "Features - Logistic Regression"),
x = "Feature",
y = "Coefficient") +
theme_minimal(base_size = 12)
top_n <- 10
importance_vals <- importance(rf_model)
importance_sorted <- importance_vals[order(importance_vals[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE]
top_features <- importance_sorted[1:top_n, , drop = FALSE]
top_features_df <- data.frame(
Feature = rownames(top_features),
MeanDecreaseGini = round(top_features[, "MeanDecreaseGini"], 3)
)
ggplot(top_features_df, aes(x = reorder(Feature, MeanDecreaseGini), y = MeanDecreaseGini)) +
geom_bar(stat = "identity", fill = "forestgreen") +
coord_flip() +
labs(title = paste("Top", top_n, "Features - Random Forest"),
x = "Feature",
y = "Mean Decrease in Gini") +
theme_minimal(base_size = 12)
feature_names <- colnames(train_processed)[colnames(train_processed) != "FraudFound_P"]
xgb_imp <- xgb.importance(feature_names = feature_names, model = xgb_model$finalModel)
top_n <- 10
top_features <- xgb_imp[1:top_n, ]
ggplot(top_features, aes(x = reorder(Feature, Gain), y = Gain)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Top 10 Features - XGBoost", x = "Feature", y = "Gain") +
theme_minimal(base_size = 12)
The XGBoost model outperformed the other models with an accuracy of 90.1%. Saving the model to disk, then loading it and applying it to new records to estimate each record's probability of being fraudulent.
new_data <- read.csv("newrecords.csv") %>%
select(all_of(features))
new_data_processed <- bake(rec, new_data = new_data)
saveRDS(xgb_model, "XGB_Fraud_Model.rds")
xgb_loaded <- readRDS("XGB_Fraud_Model.rds")
pred_probs <- predict(xgb_loaded, newdata = new_data_processed, type = "prob")[,2]
str(new_data)
## 'data.frame': 5 obs. of 23 variables:
## $ FraudFound_P : int 0 0 0 0 0
## $ AgeOfPolicyHolder : chr "21 to 25" "41 to 50" "36 to 40" "31 to 35" ...
## $ Fault : chr "Third Party" "Policy Holder" "Policy Holder" "Policy Holder" ...
## $ VehiclePrice : chr "more than 69000" "20000 to 29000" "more than 69000" "20000 to 29000" ...
## $ Days_Policy_Accident: chr "1 to 7" "more than 30" "more than 30" "more than 30" ...
## $ PastNumberOfClaims : chr "none" "2 to 4" "none" "1" ...
## $ AgeOfVehicle : chr "5 years" "more than 7" "7 years" "7 years" ...
## $ PoliceReportFiled : chr "No" "Yes" "No" "No" ...
## $ AgentType : chr "External" "External" "Internal" "External" ...
## $ NumberOfSuppliments : chr "3 to 5" "none" "none" "none" ...
## $ AddressChange_Claim : chr "no change" "no change" "no change" "2 to 3 years" ...
## $ WitnessPresent : chr "No" "No" "No" "Yes" ...
## $ Sex : chr "Male" "Male" "Female" "Male" ...
## $ AccidentArea : chr "Urban" "Urban" "Urban" "Rural" ...
## $ BasePolicy : chr "Collision" "Liability" "All Perils" "Liability" ...
## $ MaritalStatus : chr "Single" "Married" "Married" "Single" ...
## $ DriverRating : int 3 1 4 1 2
## $ NumberOfCars : chr "1 vehicle" "2 vehicles" "1 vehicle" "1 vehicle" ...
## $ Deductible : int 400 400 400 400 400
## $ MonthClaimed : chr "Nov" "Mar" "May" "May" ...
## $ Make : chr "Honda" "Honda" "Ford" "Pontiac" ...
## $ Month : chr "Oct" "Mar" "Apr" "May" ...
## $ Year : int 1994 1994 1994 1994 1994
pred_probs
## [1] 0.0004897118 0.0085116625 0.2984132767 0.0069570541 0.0008995533
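In a deployment setting, the probabilities would be attached back to the claims and the riskiest ones routed for manual review; a minimal sketch, with a hypothetical 0.25 review threshold:
scored <- new_data %>%
mutate(FraudProbability = pred_probs,
FlagForReview = pred_probs > 0.25)  # 0.25 is an illustrative cutoff, not tuned
scored %>% filter(FlagForReview) %>% select(Make, MonthClaimed, FraudProbability)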