Training and evaluating supervised machine learning models (Logistic Regression, Random Forest and XGBoost) on historical insurance data to detect fraudulent claims.

Dataset: Vehicle Claim Fraud Detection on Kaggle

library(tidyverse)
library(ggplot2)
library(reshape2)
library(corrplot)
library(dplyr)
library(caret)
library(caTools)
library(ROSE)
library(xgboost)
library(randomForest)
library(pROC)
library(recipes)
library(knitr)
set.seed(123)

data <- read.csv("fraud_oracle.csv", stringsAsFactors = TRUE)

fraud_counts <- table(data$FraudFound_P)
barplot(fraud_counts, names.arg = c("Non-Fraud", "Fraud"), col = c("skyblue", "red"), main = "Distribution of Fraud within the Dataset", ylab = "Count")

1. Data Preparation and Pre-processing

1.1 Feature Selection

Employing a correlation heatmap as a visual aid for feature selection.

df_all <- data
df_all[] <- lapply(df_all, function(x) if(is.factor(x)) as.numeric(x) else x)

corr_all <- cor(df_all, use = "pairwise.complete.obs")
corr_melt <- melt(corr_all)

ggplot(corr_melt, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), size = 3) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 8),
        axis.text.y = element_text(size = 8),
        axis.title = element_blank()) +
  labs(title = "Correlation Heatmap")

Eliminating variables with correlation > 0.2 to reduce multicollinearity while retaining predictive power.

features <- c("FraudFound_P", "AgeOfPolicyHolder", "Fault", "VehiclePrice",
"Days_Policy_Accident", "PastNumberOfClaims", "AgeOfVehicle",
"PoliceReportFiled", "AgentType", "NumberOfSuppliments",
"AddressChange_Claim", "WitnessPresent", "Sex", "AccidentArea",
"BasePolicy", "MaritalStatus", "DriverRating", "NumberOfCars",
"Deductible", "MonthClaimed", "Make", "Month", "Year")
df <- data %>% select(all_of(features))

1.2 Train-Test Split

Allocating 70% of the data for training the models and the remaining 30% reserved for testing.

sample_split <- sample.split(df$FraudFound_P, SplitRatio = 0.7)
train_data <- subset(df, sample_split == TRUE)
test_data <- subset(df, sample_split == FALSE)

1.3 Data Balancing using Rose

ROSE (Random Over-Sampling Examples) uses Synthetic Minority Over-sampling Technique (SMOTE) to generate synthetic instances of the minority class and randomly selects examples from the majority class.

traindata <- ROSE(FraudFound_P ~ ., data = train_data, p = 0.5, seed = 123)
train_balanced <- traindata$data
balanced_fraud_counts <- table(train_balanced$FraudFound_P)
balanced_non_fraud_count <- balanced_fraud_counts[1]
balanced_fraud_count <- balanced_fraud_counts[2]
barplot(balanced_fraud_counts, names.arg = c("Non-Fraud", "Fraud"), col = c("skyblue", "red"), 
        main = "Distribution of Fraud within the Dataset after Balancing", ylab = "Count")

1.4 Encoding and Scaling

Performing one-hot encoding to transform categorical variables into binary vectors.

Scaling to prevent features with larger numeric ranges from dominating the model’s learning.

rec <- recipe(FraudFound_P ~ ., data = train_balanced) %>%
  step_dummy(all_nominal_predictors()) %>% 
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  prep(training = train_balanced)

train_processed <- bake(rec, new_data = train_balanced)
test_processed  <- bake(rec, new_data = test_data)


train_processed$FraudFound_P <- factor(train_balanced$FraudFound_P)
test_processed$FraudFound_P <- factor(test_data$FraudFound_P)
levels(train_processed$FraudFound_P) <- make.names(levels(train_processed$FraudFound_P))
levels(test_processed$FraudFound_P)  <- make.names(levels(test_processed$FraudFound_P))

3. Model Training with Hyperparameter Tuning

3.1 Logistic Regression

Training with 5-fold cross-validation and optimizing using the ROC metric.

ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
log_model <- train(FraudFound_P ~ ., data = train_processed, 
                   method = "glm", family = "binomial", 
                   trControl = ctrl, metric = "ROC")

3.2 Random Forest

Specifying an ensemble of 500 trees, randomly selecting a subset of predictors at each split, and computing feature importance.

rf_model <- randomForest(FraudFound_P ~ ., data = train_processed, ntree = 500,
                         mtry = floor(sqrt(ncol(train_processed)-1)), importance = TRUE)

3.3 XGBoost

Training on 200 rounds, testing shallow vs moderate tree depth, small steps for stable learning, training each tree on 80% of the data, and allowing splits as long as a child has at least 1 instance.

xgb_grid <- expand.grid(
  nrounds = 200,
  max_depth = c(3,6),
  eta = c(0.05, 0.1),
  gamma = 0,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample = 0.8
)
xgb_model <- train(FraudFound_P ~ ., data = train_processed,
  method = "xgbTree",
  trControl = ctrl,
  tuneGrid = xgb_grid,
  metric = "ROC"
)

4. Model Performance Evaluation

Defining a function to return the Confusion Matrix, Accuracy, Precision, Recall and F1 Score of each model.

evaluate_model <- function(model, test_data, type_prob = TRUE) {
    if (type_prob) {
      probs <- predict(model, newdata = test_data, type = "prob")[,2]
    } else {
      probs <- as.numeric(predict(model, newdata = test_data, type = "raw"))
    }
    actual <- ifelse(test_data$FraudFound_P == levels(test_data$FraudFound_P)[2], 1, 0)
    pred_labels <- ifelse(probs > 0.5, 1, 0)
    cm <- table(Predicted = pred_labels, Actual = actual)
    accuracy <- sum(diag(cm)) / sum(cm)
    precision <- cm[2,2] / sum(cm[2,])
    recall <- cm[2,2] / sum(cm[,2])
    f1 <- 2*(precision*recall)/(precision+recall)
    roc_obj <- roc(actual, probs)
    auc_val <- auc(roc_obj)
    return(list(
      confusion_matrix = cm,
      accuracy = accuracy,
      precision = precision,
      recall = recall,
      f1 = f1,
      roc_obj = roc_obj,
      auc = auc_val
    ))
}

4.1 Logistic Regression

log_eval <- evaluate_model(log_model, test_processed, type_prob=TRUE)

par(mfrow = c(1, 2))

plot(log_eval$roc_obj, col="blue", lwd=2, main="ROC - Logistic Regression")

plot.new()

cm_text <- apply(log_eval$confusion_matrix, 1, function(x) paste(x, collapse = " "))
cm_text <- paste(names(cm_text), cm_text, sep = ": ", collapse = "\n")

text(0.3, 0.9, paste("Accuracy:", round(log_eval$accuracy, 3)))
text(0.3, 0.8, paste("Precision:", round(log_eval$precision, 3)))
text(0.3, 0.7, paste("Recall:", round(log_eval$recall, 3)))
text(0.3, 0.6, paste("F1 Score:", round(log_eval$f1, 3)))
text(0.3, 0.5, paste("AUC:", round(log_eval$auc, 3)))
text(0.3, 0.3, labels = "Confusion Matrix:", font = 2, cex = 1)
text(0.3, 0.1, labels = cm_text, adj = 0)

4.2 Random Forest

par(mfrow = c(1, 1))

rf_eval <- evaluate_model(rf_model, test_processed, type_prob=TRUE)

par(mfrow = c(1, 2))

plot(rf_eval$roc_obj, col="green", main="ROC - Random Forest")

plot.new()

cm_text <- apply(rf_eval$confusion_matrix, 1, function(x) paste(x, collapse = " "))
cm_text <- paste(names(cm_text), cm_text, sep = ": ", collapse = "\n")

text(0.3, 0.9, paste("Accuracy:", round(rf_eval$accuracy, 3)))
text(0.3, 0.8, paste("Precision:", round(rf_eval$precision, 3)))
text(0.3, 0.7, paste("Recall:", round(rf_eval$recall, 3)))
text(0.3, 0.6, paste("F1 Score:", round(rf_eval$f1, 3)))
text(0.3, 0.5, paste("AUC:", round(rf_eval$auc, 3)))
text(0.3, 0.3, labels = "Confusion Matrix:", font = 2, cex = 1)
text(0.3, 0.1, labels = cm_text, adj = 0)

4.3 XGBoost

par(mfrow = c(1, 1))

xgb_eval <- evaluate_model(xgb_model, test_processed)

par(mfrow = c(1, 2))

plot(xgb_eval$roc_obj, col="red", main="ROC - XGBoost")

plot.new()

cm_text <- apply(xgb_eval$confusion_matrix, 1, function(x) paste(x, collapse = " "))
cm_text <- paste(names(cm_text), cm_text, sep = ": ", collapse = "\n")

text(0.3, 0.9, paste("Accuracy:", round(xgb_eval$accuracy, 3)))
text(0.3, 0.8, paste("Precision:", round(xgb_eval$precision, 3)))
text(0.3, 0.7, paste("Recall:", round(xgb_eval$recall, 3)))
text(0.3, 0.6, paste("F1 Score:", round(xgb_eval$f1, 3)))
text(0.3, 0.5, paste("AUC:", round(xgb_eval$auc, 3)))
text(0.3, 0.3, labels = "Confusion Matrix:", font = 2, cex = 1)
text(0.3, 0.1, labels = cm_text, adj = 0)

5. Feature Importance

Quantifying the contribution of each input variable to the model’s predictions regarding the probability of a claim being classified as fraudulent.

5.1 Logistic Regression

par(mfrow = c(1, 1))

coef_vals <- summary(log_model$finalModel)$coefficients
coef_df <- data.frame(
  Feature = rownames(coef_vals),
  Coefficient = coef_vals[, "Estimate"]
)
coef_df <- coef_df[coef_df$Feature != "(Intercept)", ]
coef_df <- coef_df[order(abs(coef_df$Coefficient), decreasing = TRUE), ]

top_n <- 10
top_features <- coef_df[1:top_n, ]

ggplot(top_features, aes(x = reorder(Feature, abs(Coefficient)), y = Coefficient)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = paste("Top", top_n, "Features - Logistic Regression"),
       x = "Feature",
       y = "Coefficient") +
  theme_minimal(base_size = 12)

5.2 Random Forest

top_n <- 10
importance_vals <- importance(rf_model)
importance_sorted <- importance_vals[order(importance_vals[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE]
top_features <- importance_sorted[1:top_n, , drop = FALSE]
top_features_df <- data.frame(
  Feature = rownames(top_features),
  MeanDecreaseGini = round(top_features[, "MeanDecreaseGini"], 3)
)

ggplot(top_features_df, aes(x = reorder(Feature, MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_bar(stat = "identity", fill = "forestgreen") +
  coord_flip() +
  labs(title = paste("Top", top_n, "Features - Random Forest"),
       x = "Feature",
       y = "Mean Decrease in Gini") +
  theme_minimal(base_size = 12)

5.3 XGBoost

feature_names <- colnames(train_processed)[colnames(train_processed) != "FraudFound_P"]
xgb_imp <- xgb.importance(feature_names = feature_names, model = xgb_model$finalModel)
top_n <- 10
top_features <- xgb_imp[1:top_n, ]
ggplot(top_features, aes(x = reorder(Feature, Gain), y = Gain)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Features - XGBoost", x = "Feature", y = "Gain") +
  theme_minimal(base_size = 12)

6. Model Deployment

The XGBoost model outperformed the other models with an accuracy of 90.1%. Saving and leveraging the model on new data to generate the probability of each of the records being fraudulent.

new_data <- read.csv("newrecords.csv") %>%
  select(all_of(features))
new_data_processed <- bake(rec, new_data = new_data)

saveRDS(xgb_model, "XGB_Fraud_Model.rds")
xgb_loaded <- readRDS("XGB_Fraud_Model.rds")

pred_probs <- predict(xgb_loaded, newdata = new_data_processed, type = "prob")[,2]

str(new_data)
## 'data.frame':    5 obs. of  23 variables:
##  $ FraudFound_P        : int  0 0 0 0 0
##  $ AgeOfPolicyHolder   : chr  "21 to 25" "41 to 50" "36 to 40" "31 to 35" ...
##  $ Fault               : chr  "Third Party" "Policy Holder" "Policy Holder" "Policy Holder" ...
##  $ VehiclePrice        : chr  "more than 69000" "20000 to 29000" "more than 69000" "20000 to 29000" ...
##  $ Days_Policy_Accident: chr  "1 to 7" "more than 30" "more than 30" "more than 30" ...
##  $ PastNumberOfClaims  : chr  "none" "2 to 4" "none" "1" ...
##  $ AgeOfVehicle        : chr  "5 years" "more than 7" "7 years" "7 years" ...
##  $ PoliceReportFiled   : chr  "No" "Yes" "No" "No" ...
##  $ AgentType           : chr  "External" "External" "Internal" "External" ...
##  $ NumberOfSuppliments : chr  "3 to 5" "none" "none" "none" ...
##  $ AddressChange_Claim : chr  "no change" "no change" "no change" "2 to 3 years" ...
##  $ WitnessPresent      : chr  "No" "No" "No" "Yes" ...
##  $ Sex                 : chr  "Male" "Male" "Female" "Male" ...
##  $ AccidentArea        : chr  "Urban" "Urban" "Urban" "Rural" ...
##  $ BasePolicy          : chr  "Collision" "Liability" "All Perils" "Liability" ...
##  $ MaritalStatus       : chr  "Single" "Married" "Married" "Single" ...
##  $ DriverRating        : int  3 1 4 1 2
##  $ NumberOfCars        : chr  "1 vehicle" "2 vehicles" "1 vehicle" "1 vehicle" ...
##  $ Deductible          : int  400 400 400 400 400
##  $ MonthClaimed        : chr  "Nov" "Mar" "May" "May" ...
##  $ Make                : chr  "Honda" "Honda" "Ford" "Pontiac" ...
##  $ Month               : chr  "Oct" "Mar" "Apr" "May" ...
##  $ Year                : int  1994 1994 1994 1994 1994
pred_probs
## [1] 0.0004897118 0.0085116625 0.2984132767 0.0069570541 0.0008995533