1 Data Importing

library(readr)
data <- read_csv("C:/Users/Fakudze/Desktop/.archivetempsynthetic_financial_data.csv")
View(data)

2 Data Review

library(tidyverse)
str(data)

## spc_tbl_ [10,000 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ transaction_id         : num [1:10000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ customer_id            : num [1:10000] 1082 1015 1004 1095 1036 ...
##  $ merchant_id            : num [1:10000] 2027 2053 2035 2037 2083 ...
##  $ amount                 : num [1:10000] 5759 1902 1249 7619 1890 ...
##  $ transaction_time       : POSIXct[1:10000], format: "2023-01-01 00:00:00" "2023-01-01 00:00:01" ...
##  $ is_fraudulent          : num [1:10000] 0 1 1 1 1 1 1 0 0 0 ...
##  $ card_type              : chr [1:10000] "MasterCard" "Visa" "MasterCard" "Discover" ...
##  $ location               : chr [1:10000] "City-30" "City-47" "City-6" "City-6" ...
##  $ purchase_category      : chr [1:10000] "Gas Station" "Online Shopping" "Gas Station" "Travel" ...
##  $ customer_age           : num [1:10000] 43 61 57 59 36 43 19 40 43 55 ...
##  $ transaction_description: chr [1:10000] "Purchase at Merchant-2027" "Purchase at Merchant-2053" "Purchase at Merchant-2035" "Purchase at Merchant-2037" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   transaction_id = col_double(),
##   ..   customer_id = col_double(),
##   ..   merchant_id = col_double(),
##   ..   amount = col_double(),
##   ..   transaction_time = col_datetime(format = ""),
##   ..   is_fraudulent = col_double(),
##   ..   card_type = col_character(),
##   ..   location = col_character(),
##   ..   purchase_category = col_character(),
##   ..   customer_age = col_double(),
##   ..   transaction_description = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

3 Data cleaning

3.1 Feature Transformation

library(dplyr)
data <- data %>% 
  mutate(across(c(is_fraudulent, purchase_category),
                as.factor)) %>% 
  mutate(date = as.Date(data$transaction_time))

3.2 Extracting sample data for further analysis

data <- data %>% 
  select(is_fraudulent, amount, card_type, location, purchase_category, customer_age)

3.3 Checking for Missing data

colSums(is.na(data))

##     is_fraudulent            amount         card_type          location 
##                 0                 0                 0                 0 
## purchase_category      customer_age 
##                 0                 0

4 Exploratory Data Analysis

4.1 Descriptive Statistics (Numeric variables Distribution)

library(summarytools)
data %>% 
  select(c(amount, customer_age)) %>% 
  descr(order = "preserve",
        stats = c("min", "q1", "med", "mean", "q3", "max")
    
  )

## Descriptive Statistics  
## data  
## N: 10000  
## 
##                 amount   customer_age
## ------------ --------- --------------
##          Min     10.61          18.00
##           Q1   2438.07          31.00
##       Median   4943.94          44.00
##         Mean   4958.38          44.05
##           Q3   7499.49          57.00
##          Max   9999.75          70.00

4.2 Distribution of Categorical variables

library(ggplot2)

factor_vars <- c("is_fraudulent", "purchase_category")
 
plot_data <- data %>% 
  select(all_of(factor_vars)) %>% 
  pivot_longer(cols = everything(),
               names_to = "Variable",
               values_to = "Category") %>% 
  count(Variable, Category)

ggplot(plot_data, aes(x = Category,
y = n, fill = Category))+
    geom_bar(stat = "identity")+
    geom_text(aes(label = n), vjust = -0.3, size = 3)+
  facet_wrap(~ Variable, scales = "free_x")+
    labs(title = paste("Distribution of Factor Variables"),
         x = "Category",
         y = "Count")+
    theme_minimal()+
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        legend.position = "none")

4.3 Distribution of Character Variables

ggplot(data, aes(x = card_type, fill = card_type))+
    geom_bar()+
    geom_text(
      stat = "count", aes(label = after_stat(count)), 
      vjust = -0.3)+
    labs(title = paste("Distribution of card_type"),
         x = "card_type",
         y = "Count")+
    theme_minimal()+
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        legend.position = "none")

5 Coloniarity Assessment

5.1 Coloniarity between purchase category with Study variable (is.fraudulent)

tbl <- table(data$is_fraudulent,
             data$purchase_category)
  
View(tbl)
  
chisq.test(tbl)

## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 13.292, df = 5, p-value = 0.02079

5.2 Coloniarity between card (Character variable) type with Study variable (is.fraudulent)

chisq.test(table(data$is_fraudulent,
                 data$card_type))

## 
##  Pearson's Chi-squared test
## 
## data:  table(data$is_fraudulent, data$card_type)
## X-squared = 3.7812, df = 3, p-value = 0.2861

5.3 Coloniarity between numeric variables with Study variable (is.fraudulent)

ggplot(data, aes(x = is_fraudulent, y = amount, fill = is_fraudulent))+
  geom_boxplot()

ggplot(data, aes(x = is_fraudulent, y = customer_age, fill = is_fraudulent))+
  geom_boxplot()

6 Extracting New data sample for Predictive Modeling (Only purchase category and is.fraudulent)

sample_data <- data %>% 
  select(is_fraudulent, purchase_category)

7 Checking for class imbalance

table(sample_data$is_fraudulent)

## 
##    0    1 
## 4932 5068

The class is balanced in predicted variable

8 loading libraries

library(tidymodels)
library(vip)
library(rpart.plot)
library(xgboost)
library(ranger)

set.seed(100)

9 Preparing data

sample_data$is_fraudulent <- relevel(sample_data$is_fraudulent, ref = "1")

10 Train and test split

data_split <- initial_split(sample_data, prop = 0.8, 
                            strata = is_fraudulent)

train_data <- training(data_split)
test_data <- testing(data_split)

11 Create data preprocessing recipe

fraud_recipe <- recipe(is_fraudulent ~ purchase_category, data = train_data) %>% 
  step_dummy(all_nominal_predictors())

12 Cross validation folds

folds <- vfold_cv(train_data,
                  v = 5,
                  strata = is_fraudulent)

==================================================================================================================== MODEL 1 : LOGISTIC REGRESSION

====================================================================================================================

log_model <- logistic_reg() %>% 
  set_engine("glm") %>% 
  set_mode("classification")

log_workflow <- workflow() %>% 
  add_recipe(fraud_recipe) %>% 
  add_model(log_model)

log_fit <- fit(log_workflow, data = train_data)

log_resamples <- fit_resamples(
  log_workflow,
  resamples = folds,
  metrics = metric_set(roc_auc, accuracy)
)

collect_metrics(log_resamples)

==================================================================================================================== MODEL 2 : DECISION TREE

====================================================================================================================

tree_model <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>% 
  set_engine("rpart") %>% 
  set_mode("classification")

tree_worflow <- workflow() %>% 
  add_recipe(fraud_recipe) %>% 
  add_model(tree_model)

tree_grid <- grid_regular(
  cost_complexity(),
  tree_depth(),
  min_n(),
  levels = 3
  
)

tree_tune <- tune_grid(
  tree_worflow,
  resamples = folds,
  grid = tree_grid,
  metrics = metric_set(roc_auc, accuracy)
)

collect_metrics(tree_tune)

best_tree <- select_best(
  tree_tune,
  metric = "roc_auc")
  
final_tree_workflow <- finalize_workflow(
  tree_worflow,
  best_tree
)

tree_fit <- fit(final_tree_workflow, data = train_data)

==============================================================================================================================

MODEL 3 : RANDOM FOREST

=============================================================================================================================

rf_model <- rand_forest(
  trees = 300,
  mtry = tune(),
  min_n = tune()
) %>% 
  set_engine("ranger",
             importance = "impurity") %>% 
  set_mode("classification")


rf_workflow <- workflow() %>% 
  add_recipe(fraud_recipe) %>% 
  add_model(rf_model)

rf_grid <- grid_regular(
  mtry(range = c(1, 2)),
  min_n(),
  levels = 3
)
  rf_tune <- tune_grid(
    rf_workflow,
    resamples = folds,
    grid = rf_grid,
    metrics = metric_set(roc_auc, accuracy)
  )
  
  collect_metrics(rf_tune)

  best_rf <- select_best(rf_tune, metric = "roc_auc")
  
  final_rf_workflow <- finalize_workflow(
    rf_workflow,
    best_rf
  )
  
  rf_fit <- fit(final_rf_workflow,
                data = train_data)

==============================================================================================================================

MODEL 4 : XGBOOST

=============================================================================================================================

xgb_model <- boost_tree(
  trees = 300,
  tree_depth = tune(),
  learn_rate = tune(),
  min_n = tune()
) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

xgb_workflow <- workflow() %>% 
  add_recipe(fraud_recipe) %>% 
  add_model(xgb_model)

xgb_grid <- grid_regular(
  tree_depth(),
  learn_rate(),
  min_n(),
  levels = 3
)

xgb_tune <- tune_grid(
  xgb_workflow,
  resamples = folds,
  grid = xgb_grid,
  metric = metric_set(roc_auc, accuracy)
)

collect_metrics(xgb_tune)

best_xgb <- select_best(
  xgb_tune,
  metric = "roc_auc"
)

final_xgb_workflow <- finalize_workflow(
  xgb_workflow,
  best_xgb
)

xgb_fit <- fit(final_xgb_workflow,
               data = train_data)

========================================================================================================================

COMPARE MODEL PERFORMANCE

========================================================================================================================

log_metrics <- collect_metrics(log_resamples) %>% 
  filter(.metric == "roc_auc") %>% 
  mutate(model = "Logistic Regression")

tree_metrics <- collect_metrics(tree_tune) %>% 
  filter(.metric == "roc_auc") %>% 
  slice_max(mean, n = 1) %>% 
  mutate(model = "Decision Tree")

rf_metrics <- collect_metrics(rf_tune) %>% 
  filter(.metric == "roc_auc") %>% 
  slice_max(mean, n = 1) %>% 
  mutate(model = "Random Forest")

xgb_metrics <- collect_metrics(xgb_tune) %>% 
  filter(.metric == "roc_auc") %>% 
  slice_max(mean, n = 1) %>% 
  mutate(model = "XGBoost")

all_results <- bind_rows(
  log_metrics,
  tree_metrics,
  rf_metrics,
  xgb_metrics
)

all_results %>% 
  select(model, mean, std_err)

=========================================================================================================================

CHOOSE THE BEST MODEL

=========================================================================================================================

best_model <- all_results %>% 
  arrange(desc(mean))

best_model

========================================================================================================================

MAKE PREDICTIONS

========================================================================================================================

logistic_predictions <- predict(
  log_fit, new_data = test_data, type = "prob"
)

logistic_predictions

log_class <- predict(
  log_fit,
  new_data = test_data,
  type = "class"
)

log_class

final_predictions <- bind_cols(
  test_data,
  logistic_predictions,
  log_class
)

head(final_predictions)

==========================================================================================================================

SAVE BEST MODEL

==========================================================================================================================

saveRDS(log_fit, "best_fraud_model.rds")

==========================================================================================================================

LOAD SAVED MODEL LATER

==========================================================================================================================

loaded_model <- readRDS("best_fraud_model.rds")

==========================================================================================================================

USE SAVED MODEL FOR NEW PREDICTIONS

=========================================================================================================================

new_customer <- data.frame(
  purchase_category = factor(
    c("Online Shopping", "Restaurant"),
    levels = 
      levels(sample_data$purchase_category)
  )
)

predict(loaded_model,
        new_data = new_customer,
        type = "prob")

Fraud Detection (Probalility Analysis)

Celumusa Fakudze

2026-05-26

1 Data Importing

2 Data Review

3 Data cleaning

3.1 Feature Transformation

3.2 Extracting sample data for further analysis

3.3 Checking for Missing data

4 Exploratory Data Analysis

4.1 Descriptive Statistics (Numeric variables Distribution)

4.2 Distribution of Categorical variables

4.3 Distribution of Character Variables

5 Coloniarity Assessment

5.1 Coloniarity between purchase category with Study variable (is.fraudulent)

5.2 Coloniarity between card (Character variable) type with Study variable (is.fraudulent)

5.3 Coloniarity between numeric variables with Study variable (is.fraudulent)

6 Extracting New data sample for Predictive Modeling (Only purchase category and is.fraudulent)

7 Checking for class imbalance

8 loading libraries

9 Preparing data

10 Train and test split

11 Create data preprocessing recipe

12 Cross validation folds