1 Data Importing

library(readr)
data <- read_csv("C:/Users/Fakudze/Desktop/.archivetempsynthetic_financial_data.csv")
View(data)

2 Data Review

library(tidyverse)
str(data)
## spc_tbl_ [10,000 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ transaction_id         : num [1:10000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ customer_id            : num [1:10000] 1082 1015 1004 1095 1036 ...
##  $ merchant_id            : num [1:10000] 2027 2053 2035 2037 2083 ...
##  $ amount                 : num [1:10000] 5759 1902 1249 7619 1890 ...
##  $ transaction_time       : POSIXct[1:10000], format: "2023-01-01 00:00:00" "2023-01-01 00:00:01" ...
##  $ is_fraudulent          : num [1:10000] 0 1 1 1 1 1 1 0 0 0 ...
##  $ card_type              : chr [1:10000] "MasterCard" "Visa" "MasterCard" "Discover" ...
##  $ location               : chr [1:10000] "City-30" "City-47" "City-6" "City-6" ...
##  $ purchase_category      : chr [1:10000] "Gas Station" "Online Shopping" "Gas Station" "Travel" ...
##  $ customer_age           : num [1:10000] 43 61 57 59 36 43 19 40 43 55 ...
##  $ transaction_description: chr [1:10000] "Purchase at Merchant-2027" "Purchase at Merchant-2053" "Purchase at Merchant-2035" "Purchase at Merchant-2037" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   transaction_id = col_double(),
##   ..   customer_id = col_double(),
##   ..   merchant_id = col_double(),
##   ..   amount = col_double(),
##   ..   transaction_time = col_datetime(format = ""),
##   ..   is_fraudulent = col_double(),
##   ..   card_type = col_character(),
##   ..   location = col_character(),
##   ..   purchase_category = col_character(),
##   ..   customer_age = col_double(),
##   ..   transaction_description = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

3 Data cleaning

3.1 Feature Transformation

library(dplyr)
data <- data %>% 
  mutate(across(c(is_fraudulent, purchase_category),
                as.factor)) %>% 
  mutate(date = as.Date(data$transaction_time))

3.2 Extracting sample data for further analysis

data <- data %>% 
  select(is_fraudulent, amount, card_type, location, purchase_category, customer_age)

3.3 Checking for Missing data

colSums(is.na(data))
##     is_fraudulent            amount         card_type          location 
##                 0                 0                 0                 0 
## purchase_category      customer_age 
##                 0                 0

4 Exploratory Data Analysis

4.1 Descriptive Statistics (Numeric variables Distribution)

library(summarytools)
data %>% 
  select(c(amount, customer_age)) %>% 
  descr(order = "preserve",
        stats = c("min", "q1", "med", "mean", "q3", "max")
    
  )
## Descriptive Statistics  
## data  
## N: 10000  
## 
##                 amount   customer_age
## ------------ --------- --------------
##          Min     10.61          18.00
##           Q1   2438.07          31.00
##       Median   4943.94          44.00
##         Mean   4958.38          44.05
##           Q3   7499.49          57.00
##          Max   9999.75          70.00

4.2 Distribution of Categorical variables

library(ggplot2)

factor_vars <- c("is_fraudulent", "purchase_category")
 
plot_data <- data %>% 
  select(all_of(factor_vars)) %>% 
  pivot_longer(cols = everything(),
               names_to = "Variable",
               values_to = "Category") %>% 
  count(Variable, Category)

ggplot(plot_data, aes(x = Category,
y = n, fill = Category))+
    geom_bar(stat = "identity")+
    geom_text(aes(label = n), vjust = -0.3, size = 3)+
  facet_wrap(~ Variable, scales = "free_x")+
    labs(title = paste("Distribution of Factor Variables"),
         x = "Category",
         y = "Count")+
    theme_minimal()+
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        legend.position = "none")

4.3 Distribution of Character Variables

ggplot(data, aes(x = card_type, fill = card_type))+
    geom_bar()+
    geom_text(
      stat = "count", aes(label = after_stat(count)), 
      vjust = -0.3)+
    labs(title = paste("Distribution of card_type"),
         x = "card_type",
         y = "Count")+
    theme_minimal()+
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        legend.position = "none")

5 Coloniarity Assessment

5.1 Coloniarity between purchase category with Study variable (is.fraudulent)

tbl <- table(data$is_fraudulent,
             data$purchase_category)
  
View(tbl)
  
chisq.test(tbl)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 13.292, df = 5, p-value = 0.02079

5.2 Coloniarity between card (Character variable) type with Study variable (is.fraudulent)

chisq.test(table(data$is_fraudulent,
                 data$card_type))
## 
##  Pearson's Chi-squared test
## 
## data:  table(data$is_fraudulent, data$card_type)
## X-squared = 3.7812, df = 3, p-value = 0.2861

5.3 Coloniarity between numeric variables with Study variable (is.fraudulent)

ggplot(data, aes(x = is_fraudulent, y = amount, fill = is_fraudulent))+
  geom_boxplot()

ggplot(data, aes(x = is_fraudulent, y = customer_age, fill = is_fraudulent))+
  geom_boxplot()

6 Extracting New data sample for Predictive Modeling (Only purchase category and is.fraudulent)

sample_data <- data %>% 
  select(is_fraudulent, purchase_category)

7 Checking for class imbalance

table(sample_data$is_fraudulent)
## 
##    0    1 
## 4932 5068

The class is balanced in predicted variable

8 loading libraries

library(tidymodels)
library(vip)
library(rpart.plot)
library(xgboost)
library(ranger)

set.seed(100)

9 Preparing data

sample_data$is_fraudulent <- relevel(sample_data$is_fraudulent, ref = "1")

10 Train and test split

data_split <- initial_split(sample_data, prop = 0.8, 
                            strata = is_fraudulent)

train_data <- training(data_split)
test_data <- testing(data_split)

11 Create data preprocessing recipe

fraud_recipe <- recipe(is_fraudulent ~ purchase_category, data = train_data) %>% 
  step_dummy(all_nominal_predictors())

12 Cross validation folds

folds <- vfold_cv(train_data,
                  v = 5,
                  strata = is_fraudulent)

==================================================================================================================== MODEL 1 : LOGISTIC REGRESSION

====================================================================================================================

log_model <- logistic_reg() %>% 
  set_engine("glm") %>% 
  set_mode("classification")

log_workflow <- workflow() %>% 
  add_recipe(fraud_recipe) %>% 
  add_model(log_model)

log_fit <- fit(log_workflow, data = train_data)

log_resamples <- fit_resamples(
  log_workflow,
  resamples = folds,
  metrics = metric_set(roc_auc, accuracy)
)

collect_metrics(log_resamples)

==================================================================================================================== MODEL 2 : DECISION TREE

====================================================================================================================

tree_model <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>% 
  set_engine("rpart") %>% 
  set_mode("classification")

tree_worflow <- workflow() %>% 
  add_recipe(fraud_recipe) %>% 
  add_model(tree_model)

tree_grid <- grid_regular(
  cost_complexity(),
  tree_depth(),
  min_n(),
  levels = 3
  
)

tree_tune <- tune_grid(
  tree_worflow,
  resamples = folds,
  grid = tree_grid,
  metrics = metric_set(roc_auc, accuracy)
)

collect_metrics(tree_tune)
best_tree <- select_best(
  tree_tune,
  metric = "roc_auc")
  
final_tree_workflow <- finalize_workflow(
  tree_worflow,
  best_tree
)

tree_fit <- fit(final_tree_workflow, data = train_data)

==============================================================================================================================

MODEL 3 : RANDOM FOREST

=============================================================================================================================

rf_model <- rand_forest(
  trees = 300,
  mtry = tune(),
  min_n = tune()
) %>% 
  set_engine("ranger",
             importance = "impurity") %>% 
  set_mode("classification")


rf_workflow <- workflow() %>% 
  add_recipe(fraud_recipe) %>% 
  add_model(rf_model)

rf_grid <- grid_regular(
  mtry(range = c(1, 2)),
  min_n(),
  levels = 3
)
  rf_tune <- tune_grid(
    rf_workflow,
    resamples = folds,
    grid = rf_grid,
    metrics = metric_set(roc_auc, accuracy)
  )
  
  collect_metrics(rf_tune)
  best_rf <- select_best(rf_tune, metric = "roc_auc")
  
  final_rf_workflow <- finalize_workflow(
    rf_workflow,
    best_rf
  )
  
  rf_fit <- fit(final_rf_workflow,
                data = train_data)

==============================================================================================================================

MODEL 4 : XGBOOST

=============================================================================================================================

xgb_model <- boost_tree(
  trees = 300,
  tree_depth = tune(),
  learn_rate = tune(),
  min_n = tune()
) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

xgb_workflow <- workflow() %>% 
  add_recipe(fraud_recipe) %>% 
  add_model(xgb_model)

xgb_grid <- grid_regular(
  tree_depth(),
  learn_rate(),
  min_n(),
  levels = 3
)

xgb_tune <- tune_grid(
  xgb_workflow,
  resamples = folds,
  grid = xgb_grid,
  metric = metric_set(roc_auc, accuracy)
)

collect_metrics(xgb_tune)
best_xgb <- select_best(
  xgb_tune,
  metric = "roc_auc"
)

final_xgb_workflow <- finalize_workflow(
  xgb_workflow,
  best_xgb
)

xgb_fit <- fit(final_xgb_workflow,
               data = train_data)

========================================================================================================================

COMPARE MODEL PERFORMANCE

========================================================================================================================

log_metrics <- collect_metrics(log_resamples) %>% 
  filter(.metric == "roc_auc") %>% 
  mutate(model = "Logistic Regression")

tree_metrics <- collect_metrics(tree_tune) %>% 
  filter(.metric == "roc_auc") %>% 
  slice_max(mean, n = 1) %>% 
  mutate(model = "Decision Tree")

rf_metrics <- collect_metrics(rf_tune) %>% 
  filter(.metric == "roc_auc") %>% 
  slice_max(mean, n = 1) %>% 
  mutate(model = "Random Forest")

xgb_metrics <- collect_metrics(xgb_tune) %>% 
  filter(.metric == "roc_auc") %>% 
  slice_max(mean, n = 1) %>% 
  mutate(model = "XGBoost")

all_results <- bind_rows(
  log_metrics,
  tree_metrics,
  rf_metrics,
  xgb_metrics
)

all_results %>% 
  select(model, mean, std_err)

=========================================================================================================================

CHOOSE THE BEST MODEL

=========================================================================================================================

best_model <- all_results %>% 
  arrange(desc(mean))

best_model

========================================================================================================================

MAKE PREDICTIONS

========================================================================================================================

logistic_predictions <- predict(
  log_fit, new_data = test_data, type = "prob"
)

logistic_predictions
log_class <- predict(
  log_fit,
  new_data = test_data,
  type = "class"
)

log_class
final_predictions <- bind_cols(
  test_data,
  logistic_predictions,
  log_class
)

head(final_predictions)

==========================================================================================================================

SAVE BEST MODEL

==========================================================================================================================

saveRDS(log_fit, "best_fraud_model.rds")

==========================================================================================================================

LOAD SAVED MODEL LATER

==========================================================================================================================

loaded_model <- readRDS("best_fraud_model.rds")

==========================================================================================================================

USE SAVED MODEL FOR NEW PREDICTIONS

=========================================================================================================================

new_customer <- data.frame(
  purchase_category = factor(
    c("Online Shopping", "Restaurant"),
    levels = 
      levels(sample_data$purchase_category)
  )
)

predict(loaded_model,
        new_data = new_customer,
        type = "prob")