1. Installing and Loading Packages

# Set global chunk options for consistent document behavior
knitr::opts_chunk$set(
  echo = TRUE,           # Show all code in the output document
  warning = FALSE,       # Suppress warning messages for cleaner output
  message = FALSE        # Suppress package loading messages
)

# Configure CRAN mirror for reliable package installation
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Load core data manipulation and visualization libraries
library(tidyverse)     # Comprehensive data science collection (ggplot2, dplyr, etc.)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)         # Advanced data manipulation (already attached via tidyverse)
library(data.table)    # High-performance data manipulation for large datasets
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggplot2)       # Grammar of graphics for complex visualizations (already attached via tidyverse)
library(corrplot)      # Specialized correlation matrix visualization
## corrplot 0.95 loaded
library(lubridate)     # Simplified date-time manipulation and arithmetic
library(VIM)           # Visualization and imputation of missing values
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
library(scales)        # Scale functions for axes and color aesthetics
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(viridis)       # Perceptually uniform color scales
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## 
## The following object is masked from 'package:scales':
## 
##     viridis_pal
# Load specialized machine learning libraries
library(igraph)        # Comprehensive network analysis and graph theory
## 
## Attaching package: 'igraph'
## 
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## 
## The following object is masked from 'package:tidyr':
## 
##     crossing
## 
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union
library(networkD3)     # Interactive network visualizations for web
library(cluster)       # Classical cluster analysis algorithms
library(factoextra)    # Extract and visualize multivariate analysis results
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dbscan)        # Density-based spatial clustering for anomaly detection
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:VIM':
## 
##     kNN
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(caret)         # Classification and regression training framework
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)  # Ensemble learning algorithm for classification
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(plotly)        # Interactive web-based data visualization
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:igraph':
## 
##     groups
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Introduction

This analysis implements a comprehensive real-time fraud detection system for digital payments in Nigeria. The approach combines multiple advanced techniques: statistical anomaly detection, density-based clustering (DBSCAN), and customer-merchant network analytics, developed in the sections that follow.

2. Data Import and Initial Exploration

Purpose: Load the NIBSS fraud dataset and perform initial data structure analysis to understand the available features and data quality.

Next step: import the fraud dataset from CSV and examine its structure, dimensions, data types, and basic properties to inform the analysis strategy.

# Import the fraud dataset from CSV file
# This dataset contains transaction records with fraud labels and engineered features
Fraud <- read.csv("C:/Users/HomePC/OneDrive/Desktop/Fraud Detection/nibss_fraud_dataset.csv")

# Display the structure of the dataset to understand data types and columns
# This reveals the number of observations, variables, and their data types
str(Fraud)  # str() prints directly; wrapping it in print() only adds a spurious NULL
## 'data.frame':    1000000 obs. of  38 variables:
##  $ transaction_id      : chr  "TXN_F08A86FFD87C" "TXN_C2D08134EC83" "TXN_B9499111901D" "TXN_48DB1D526A3B" ...
##  $ customer_id         : chr  "CUST_0002AED1" "CUST_0002AED1" "CUST_0002AED1" "CUST_0002AED1" ...
##  $ timestamp           : chr  "2023-01-14 04:31:09" "2023-01-17 11:20:13" "2023-01-22 02:17:46" "2023-01-24 08:18:23" ...
##  $ amount              : num  32267 72530 168153 16440 9923 ...
##  $ channel             : chr  "Mobile" "Web" "Mobile" "Mobile" ...
##  $ merchant_category   : chr  "Grocery" "Entertainment" "Transport" "Entertainment" ...
##  $ bank                : chr  "Sterling" "UBA" "Wema" "FCMB" ...
##  $ location            : chr  "Other" "Other" "Other" "Other" ...
##  $ age_group           : chr  "30-39" "30-39" "30-39" "30-39" ...
##  $ hour                : int  4 11 2 8 15 16 13 16 10 19 ...
##  $ day_of_week         : int  5 1 6 1 2 2 2 2 3 0 ...
##  $ month               : int  1 1 1 1 2 2 2 2 2 2 ...
##  $ is_weekend          : chr  "True" "False" "True" "False" ...
##  $ is_peak_hour        : chr  "False" "True" "False" "False" ...
##  $ tx_count_24h        : num  1 1 1 1 1 1 1 2 3 1 ...
##  $ amount_sum_24h      : num  32267 72530 168153 16440 9923 ...
##  $ amount_mean_7d      : num  32267 52399 120342 85708 9923 ...
##  $ amount_std_7d       : num  0 20132 47811 62634 0 ...
##  $ tx_count_total      : int  107 107 107 107 107 107 107 107 107 107 ...
##  $ amount_mean_total   : num  170390 170390 170390 170390 170390 ...
##  $ amount_std_total    : num  365916 365916 365916 365916 365916 ...
##  $ channel_diversity   : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ location_diversity  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ amount_vs_mean_ratio: num  0.1894 0.4257 0.9869 0.0965 0.0582 ...
##  $ online_channel_ratio: num  0.776 0.776 0.776 0.776 0.776 ...
##  $ is_fraud            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fraud_technique     : chr  "" "" "" "" ...
##  $ hour_sin            : num  0.866 0.259 0.5 0.866 -0.707 ...
##  $ hour_cos            : num  0.5 -0.966 0.866 -0.5 -0.707 ...
##  $ day_sin             : num  -0.975 0.782 -0.782 0.782 0.975 ...
##  $ day_cos             : num  -0.223 0.623 0.623 0.623 -0.223 ...
##  $ month_sin           : num  0.5 0.5 0.5 0.5 0.866 ...
##  $ month_cos           : num  0.866 0.866 0.866 0.866 0.5 ...
##  $ amount_log          : num  10.38 11.19 12.03 9.71 9.2 ...
##  $ amount_rounded      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ velocity_score      : num  0.1894 0.4257 0.9869 0.0965 0.0582 ...
##  $ merchant_risk_score : num  0.215 0.877 0.44 0.877 0.231 ...
##  $ composite_risk      : num  0.0706 0.2768 0.1636 0.2663 0.0713 ...
# Show dataset dimensions for size assessment
nrow(Fraud)
## [1] 1000000
ncol(Fraud)
## [1] 38
# Display first few rows to understand the actual data format and values
head(Fraud)
##     transaction_id   customer_id           timestamp    amount channel
## 1 TXN_F08A86FFD87C CUST_0002AED1 2023-01-14 04:31:09  32266.83  Mobile
## 2 TXN_C2D08134EC83 CUST_0002AED1 2023-01-17 11:20:13  72530.49     Web
## 3 TXN_B9499111901D CUST_0002AED1 2023-01-22 02:17:46 168152.87  Mobile
## 4 TXN_48DB1D526A3B CUST_0002AED1 2023-01-24 08:18:23  16439.93  Mobile
## 5 TXN_56DB1E28B758 CUST_0002AED1 2023-02-01 15:39:53   9922.68     POS
## 6 TXN_8CB46D78CEED CUST_0002AED1 2023-02-08 16:27:19  80685.56     Web
##   merchant_category      bank location age_group hour day_of_week month
## 1           Grocery  Sterling    Other     30-39    4           5     1
## 2     Entertainment       UBA    Other     30-39   11           1     1
## 3         Transport      Wema    Other     30-39    2           6     1
## 4     Entertainment      FCMB    Other     30-39    8           1     1
## 5         Education FirstBank    Other     30-39   15           2     2
## 6        Restaurant    GTBank    Other     30-39   16           2     2
##   is_weekend is_peak_hour tx_count_24h amount_sum_24h amount_mean_7d
## 1       True        False            1       32266.83       32266.83
## 2      False         True            1       72530.49       52398.66
## 3       True        False            1      168152.87      120341.68
## 4      False        False            1       16439.93       85707.76
## 5      False         True            1        9922.68        9922.68
## 6      False         True            1       80685.56       80685.56
##   amount_std_7d tx_count_total amount_mean_total amount_std_total
## 1          0.00            107          170389.9         365915.9
## 2      20131.83            107          170389.9         365915.9
## 3      47811.19            107          170389.9         365915.9
## 4      62633.51            107          170389.9         365915.9
## 5          0.00            107          170389.9         365915.9
## 6          0.00            107          170389.9         365915.9
##   channel_diversity location_diversity amount_vs_mean_ratio
## 1                 5                  1           0.18936948
## 2                 5                  1           0.42567123
## 3                 5                  1           0.98686550
## 4                 5                  1           0.09648363
## 5                 5                  1           0.05823481
## 6                 5                  1           0.47353218
##   online_channel_ratio is_fraud fraud_technique   hour_sin   hour_cos
## 1            0.7757009        0                  0.8660254  0.5000000
## 2            0.7757009        0                  0.2588190 -0.9659258
## 3            0.7757009        0                  0.5000000  0.8660254
## 4            0.7757009        0                  0.8660254 -0.5000000
## 5            0.7757009        0                 -0.7071068 -0.7071068
## 6            0.7757009        0                 -0.8660254 -0.5000000
##      day_sin    day_cos month_sin month_cos amount_log amount_rounded
## 1 -0.9749279 -0.2225209 0.5000000 0.8660254  10.381826              0
## 2  0.7818315  0.6234898 0.5000000 0.8660254  11.191776              0
## 3 -0.7818315  0.6234898 0.5000000 0.8660254  12.032635              0
## 4  0.7818315  0.6234898 0.5000000 0.8660254   9.707529              0
## 5  0.9749279 -0.2225209 0.8660254 0.5000000   9.202679              0
## 6  0.9749279 -0.2225209 0.8660254 0.5000000  11.298327              0
##   velocity_score merchant_risk_score composite_risk
## 1     0.18936948           0.2149999     0.07055978
## 2     0.42567123           0.8774244     0.27684880
## 3     0.98686550           0.4402304     0.16364883
## 4     0.09648363           0.8774244     0.26631480
## 5     0.05823481           0.2312907     0.07125073
## 6     0.47353218           0.6084928     0.19770088
# Display column names for reference
names(Fraud)
##  [1] "transaction_id"       "customer_id"          "timestamp"           
##  [4] "amount"               "channel"              "merchant_category"   
##  [7] "bank"                 "location"             "age_group"           
## [10] "hour"                 "day_of_week"          "month"               
## [13] "is_weekend"           "is_peak_hour"         "tx_count_24h"        
## [16] "amount_sum_24h"       "amount_mean_7d"       "amount_std_7d"       
## [19] "tx_count_total"       "amount_mean_total"    "amount_std_total"    
## [22] "channel_diversity"    "location_diversity"   "amount_vs_mean_ratio"
## [25] "online_channel_ratio" "is_fraud"             "fraud_technique"     
## [28] "hour_sin"             "hour_cos"             "day_sin"             
## [31] "day_cos"              "month_sin"            "month_cos"           
## [34] "amount_log"           "amount_rounded"       "velocity_score"      
## [37] "merchant_risk_score"  "composite_risk"

Data Import Results: The dataset loaded successfully. Its structure shows transaction-level records with pre-engineered fraud-detection features, including temporal patterns, behavioral metrics, and risk scores.

3. Data Quality Assessment and Summary Statistics

Purpose: Evaluate data completeness, identify missing values, and generate comprehensive summary statistics to understand data distribution and quality.

What is being done: Systematically checking for missing values across all columns and generating descriptive statistics to identify potential data quality issues, outliers, and distribution characteristics that will inform our preprocessing strategy.

# Check for missing values in each column to assess data completeness
missing_values <- colSums(is.na(Fraud))  # Count missing values per column
print(missing_values)
##       transaction_id          customer_id            timestamp 
##                    0                    0                    0 
##               amount              channel    merchant_category 
##                    0                    0                    0 
##                 bank             location            age_group 
##                    0                    0                    0 
##                 hour          day_of_week                month 
##                    0                    0                    0 
##           is_weekend         is_peak_hour         tx_count_24h 
##                    0                    0                    0 
##       amount_sum_24h       amount_mean_7d        amount_std_7d 
##                    0                    0                    0 
##       tx_count_total    amount_mean_total     amount_std_total 
##                    0                    0                    0 
##    channel_diversity   location_diversity amount_vs_mean_ratio 
##                    0                    0                    0 
## online_channel_ratio             is_fraud      fraud_technique 
##                    0                    0                    0 
##             hour_sin             hour_cos              day_sin 
##                    0                    0                    0 
##              day_cos            month_sin            month_cos 
##                    0                    0                    0 
##           amount_log       amount_rounded       velocity_score 
##                    0                    0                    0 
##  merchant_risk_score       composite_risk 
##                    0                    0
# Generate comprehensive summary statistics for all numeric columns
# This provides mean, median, quartiles, min, max, and range for each variable
print(summary(Fraud))
##  transaction_id     customer_id         timestamp             amount         
##  Length:1000000     Length:1000000     Length:1000000     Min.   :1.686e+02  
##  Class :character   Class :character   Class :character   1st Qu.:2.800e+04  
##  Mode  :character   Mode  :character   Mode  :character   Median :6.668e+04  
##                                                           Mean   :1.570e+05  
##                                                           3rd Qu.:1.595e+05  
##                                                           Max.   :1.793e+07  
##    channel          merchant_category      bank             location        
##  Length:1000000     Length:1000000     Length:1000000     Length:1000000    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   age_group              hour        day_of_week        month       
##  Length:1000000     Min.   : 0.00   Min.   :0.000   Min.   : 1.000  
##  Class :character   1st Qu.: 9.00   1st Qu.:1.000   1st Qu.: 3.000  
##  Mode  :character   Median :12.00   Median :3.000   Median : 7.000  
##                     Mean   :12.23   Mean   :3.013   Mean   : 6.609  
##                     3rd Qu.:15.00   3rd Qu.:5.000   3rd Qu.:10.000  
##                     Max.   :23.00   Max.   :6.000   Max.   :12.000  
##   is_weekend        is_peak_hour        tx_count_24h   amount_sum_24h     
##  Length:1000000     Length:1000000     Min.   :1.000   Min.   :1.686e+02  
##  Class :character   Class :character   1st Qu.:1.000   1st Qu.:3.617e+04  
##  Mode  :character   Mode  :character   Median :1.000   Median :8.972e+04  
##                                        Mean   :1.282   Mean   :1.992e+05  
##                                        3rd Qu.:1.000   3rd Qu.:2.151e+05  
##                                        Max.   :6.000   Max.   :1.326e+07  
##  amount_mean_7d      amount_std_7d     tx_count_total amount_mean_total
##  Min.   :2.128e+02   Min.   :      0   Min.   : 61    Min.   : 79531   
##  1st Qu.:5.327e+04   1st Qu.:  14216   1st Qu.: 94    1st Qu.:133244   
##  Median :9.779e+04   Median :  51220   Median :101    Median :151366   
##  Mean   :1.558e+05   Mean   : 111295   Mean   :101    Mean   :156224   
##  3rd Qu.:1.795e+05   3rd Qu.: 124608   3rd Qu.:107    3rd Qu.:174290   
##  Max.   :1.000e+07   Max.   :4998794   Max.   :140    Max.   :380509   
##  amount_std_total  channel_diversity location_diversity amount_vs_mean_ratio
##  Min.   :  76295   Min.   :4.000     Min.   :1          Min.   : 0.001013   
##  1st Qu.: 198098   1st Qu.:6.000     1st Qu.:1          1st Qu.: 0.181893   
##  Median : 252854   Median :6.000     Median :1          Median : 0.437315   
##  Mean   : 290158   Mean   :5.864     Mean   :1          Mean   : 0.999993   
##  3rd Qu.: 337364   3rd Qu.:6.000     3rd Qu.:1          3rd Qu.: 1.049607   
##  Max.   :1149815   Max.   :6.000     Max.   :1          Max.   :54.904026   
##  online_channel_ratio    is_fraud     fraud_technique       hour_sin       
##  Min.   :0.5244       Min.   :0.000   Length:1000000     Min.   :-1.00000  
##  1st Qu.:0.6700       1st Qu.:0.000   Class :character   1st Qu.:-0.70711  
##  Median :0.7016       Median :0.000   Mode  :character   Median : 0.00000  
##  Mean   :0.7002       Mean   :0.003                      Mean   :-0.08592  
##  3rd Qu.:0.7308       3rd Qu.:0.000                      3rd Qu.: 0.50000  
##  Max.   :0.8929       Max.   :1.000                      Max.   : 1.00000  
##     hour_cos          day_sin             day_cos            month_sin       
##  Min.   :-1.0000   Min.   :-0.974928   Min.   :-0.900969   Min.   :-1.00000  
##  1st Qu.:-0.8660   1st Qu.:-0.781831   1st Qu.:-0.900969   1st Qu.:-0.50000  
##  Median :-0.7071   Median : 0.000000   Median :-0.222521   Median : 0.00000  
##  Mean   :-0.4664   Mean   :-0.001601   Mean   :-0.003539   Mean   : 0.03398  
##  3rd Qu.:-0.2588   3rd Qu.: 0.781831   3rd Qu.: 0.623490   3rd Qu.: 0.86603  
##  Max.   : 1.0000   Max.   : 0.974928   Max.   : 1.000000   Max.   : 1.00000  
##    month_cos          amount_log     amount_rounded    velocity_score     
##  Min.   :-1.00000   Min.   : 5.133   Min.   :0.0e+00   Min.   :1.077e-03  
##  1st Qu.:-0.50000   1st Qu.:10.240   1st Qu.:0.0e+00   1st Qu.:2.467e-01  
##  Median : 0.00000   Median :11.108   Median :0.0e+00   Median :6.812e-01  
##  Mean   : 0.01273   Mean   :11.111   Mean   :7.9e-05   Mean   :1.905e+00  
##  3rd Qu.: 0.50000   3rd Qu.:11.980   3rd Qu.:0.0e+00   3rd Qu.:1.916e+00  
##  Max.   : 1.00000   Max.   :16.702   Max.   :1.0e+00   Max.   :1.915e+02  
##  merchant_risk_score composite_risk   
##  Min.   :0.1213      Min.   :0.03643  
##  1st Qu.:0.2027      1st Qu.:0.07808  
##  Median :0.3696      Median :0.13563  
##  Mean   :0.3956      Mean   :0.15129  
##  3rd Qu.:0.5424      3rd Qu.:0.19646  
##  Max.   :0.8774      Max.   :0.73458
# Check for duplicate transactions that might affect analysis
duplicates <- sum(duplicated(Fraud))
cat("Number of duplicate rows:", duplicates)
## Number of duplicate rows: 0
# Examine data types to ensure proper formatting
data_types <- sapply(Fraud, class)  # sapply() applies class() to each column and simplifies the result to a vector
print(data_types)
##       transaction_id          customer_id            timestamp 
##          "character"          "character"          "character" 
##               amount              channel    merchant_category 
##            "numeric"          "character"          "character" 
##                 bank             location            age_group 
##          "character"          "character"          "character" 
##                 hour          day_of_week                month 
##            "integer"            "integer"            "integer" 
##           is_weekend         is_peak_hour         tx_count_24h 
##          "character"          "character"            "numeric" 
##       amount_sum_24h       amount_mean_7d        amount_std_7d 
##            "numeric"            "numeric"            "numeric" 
##       tx_count_total    amount_mean_total     amount_std_total 
##            "integer"            "numeric"            "numeric" 
##    channel_diversity   location_diversity amount_vs_mean_ratio 
##            "integer"            "integer"            "numeric" 
## online_channel_ratio             is_fraud      fraud_technique 
##            "numeric"            "integer"          "character" 
##             hour_sin             hour_cos              day_sin 
##            "numeric"            "numeric"            "numeric" 
##              day_cos            month_sin            month_cos 
##            "numeric"            "numeric"            "numeric" 
##           amount_log       amount_rounded       velocity_score 
##            "numeric"            "integer"            "numeric" 
##  merchant_risk_score       composite_risk 
##            "numeric"            "numeric"

Data Quality Assessment Results: The dataset is complete, with no missing values and no duplicate rows. The type check shows a mix of variable classes. Columns such as transaction_id, customer_id, and channel are stored as character data, representing text or categorical values. Columns such as amount, velocity_score, and merchant_risk_score are numeric, representing continuous features. Integer columns such as hour, day_of_week, and is_fraud store whole numbers, with is_fraud being the target variable indicating whether a transaction is fraudulent. Overall, the dataset contains a mix of categorical variables, numeric features, and the fraud label. Note that is_weekend and is_peak_hour hold the text values "True"/"False" rather than logicals, which matters for later arithmetic.
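Because is_weekend and is_peak_hour are stored as "True"/"False" text, functions like mean() applied to them return NA. A small sketch adding logical copies; the _lgl column names are illustrative choices, kept separate so the original columns stay untouched:

Fraud$is_weekend_lgl   <- Fraud$is_weekend == "True"    # logical copy of the text flag
Fraud$is_peak_hour_lgl <- Fraud$is_peak_hour == "True"
mean(Fraud$is_weekend_lgl)   # now computes the weekend share directly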

4. Exploratory Data Analysis (EDA)

4.1 Transaction Amount Distribution Analysis

Purpose: Analyze the distribution of transaction amounts to understand spending patterns.

What is being done: Creating a histogram to visualize the transaction amount distribution, which typically follows a highly skewed pattern in financial data. This analysis helps establish baseline patterns and informs threshold-setting for statistical anomaly detection methods.

# Create comprehensive histogram of transaction amounts
# Using optimal bin count to visualize the highly skewed distribution typical in financial data
ggplot(Fraud, aes(x = amount)) +                                    # Set amount as x-axis variable
  geom_histogram(bins = 50,                                         # Use 50 bins for detailed distribution view
                 alpha = 0.7,                                       # Set transparency for better visualization
                 fill = "steelblue",                                # Use professional blue color
                 color = "white") +                                 # Add white borders for bin separation
  labs(title = "Distribution of Transaction Amounts",               # Add descriptive title
       subtitle = "Frequency distribution showing typical financial data skewness",  # Explanatory subtitle
       x = "Transaction Amount (Nigerian Naira)",                   # Clear x-axis label with currency
       y = "Frequency (Number of Transactions)") +                  # Clear y-axis label
  scale_x_continuous(labels = scales::comma_format()) +             # Format x-axis with comma separators
  theme_minimal() +                                                 # Apply clean, professional theme
  theme(plot.title = element_text(size = 14, face = "bold"),        # Bold title formatting
        plot.subtitle = element_text(size = 10, color = "gray50"))  # Subtitle styling

# Calculate key statistics for amount distribution

cat("\nMean amount:", format(mean(Fraud$amount), big.mark = ",", scientific = FALSE))
## 
## Mean amount: 156,951.4
cat("\nMedian amount:", format(median(Fraud$amount), big.mark = ",", scientific = FALSE))
## 
## Median amount: 66,679.98
cat("\nStandard deviation:", format(sd(Fraud$amount), big.mark = ",", scientific = FALSE))
## 
## Standard deviation: 326,359
cat("\nMinimum amount:", format(min(Fraud$amount), big.mark = ",", scientific = FALSE))
## 
## Minimum amount: 168.55
cat("\nMaximum amount:", format(max(Fraud$amount), big.mark = ",", scientific = FALSE))
## 
## Maximum amount: 17,926,808

Transaction Amount Distribution Insights: The histogram reveals a highly right-skewed distribution typical of financial transaction data, where most transactions involve relatively small amounts with a long tail of high-value transactions. This pattern is crucial for understanding normal spending behavior.
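Given this degree of skew, the pre-computed amount_log feature often gives a more readable picture. A quick sketch of the same histogram on the log scale:

ggplot(Fraud, aes(x = amount_log)) +
  geom_histogram(bins = 50, alpha = 0.7, fill = "steelblue", color = "white") +
  labs(title = "Distribution of Log-Transformed Transaction Amounts",
       x = "log(Transaction Amount)", y = "Frequency") +
  theme_minimal()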

4.2 Class Imbalance Assessment and Fraud Rate Analysis

Purpose: Evaluate the fundamental challenge of class imbalance in fraud detection and calculate key fraud statistics that will guide the modeling approach.

Approach: computing fraud prevalence rates, assessing the magnitude of class imbalance, and determining appropriate evaluation metrics and sampling strategies for the machine learning models.

# Calculate comprehensive fraud statistics to understand class distribution
# This analysis is crucial for determining appropriate modeling strategies
fraud_table <- table(Fraud$is_fraud)                    # Create frequency table of fraud labels
names(fraud_table) <- c("Legitimate", "Fraudulent")     # Apply descriptive names for clarity

# Calculate key fraud rate metrics
total_transactions <- nrow(Fraud)                       # Total number of transactions in dataset
fraudulent_count <- sum(Fraud$is_fraud == 1)           # Count of fraudulent transactions
legitimate_count <- sum(Fraud$is_fraud == 0)           # Count of legitimate transactions
fraud_rate <- round((fraudulent_count / total_transactions) * 100, 4)  # Calculate fraud percentage

# Calculate class imbalance ratio (important for model training)
if(fraudulent_count > 0) {
  imbalance_ratio <- round(legitimate_count / fraudulent_count, 1)  # Ratio of majority to minority class
} else {
  imbalance_ratio <- Inf  # Handle edge case of no fraud
}

# Display comprehensive fraud statistics
cat("\nTotal transactions:", format(total_transactions, big.mark = ","))
## 
## Total transactions: 1,000,000
cat("\nLegitimate transactions:", format(legitimate_count, big.mark = ","))
## 
## Legitimate transactions: 997,000
cat("\nFraudulent transactions:", format(fraudulent_count, big.mark = ","))
## 
## Fraudulent transactions: 3,000
cat("\nOverall fraud rate:", fraud_rate, "%")
## 
## Overall fraud rate: 0.3 %
cat("\nClass imbalance ratio (Legitimate:Fraudulent):", imbalance_ratio, ":1")
## 
## Class imbalance ratio (Legitimate:Fraudulent): 332.3 :1
# Create enhanced fraud distribution visualization
ggplot(Fraud, aes(x = factor(is_fraud, labels = c("Legitimate", "Fraudulent")))) +  # Convert to factor with labels
  geom_bar(fill = c("lightblue", "red"),                 # Use distinct colors for each class
           alpha = 0.8,                                   # Set transparency for visual appeal
           color = "white",                               # Add white borders to bars
           linewidth = 1) +                               # Set border thickness ("size" is deprecated for borders in recent ggplot2)
  labs(title = "Transaction Class Distribution Analysis", # Descriptive title
       subtitle = paste("Fraud Rate:", fraud_rate, "% | Imbalance Ratio:", imbalance_ratio, ":1"), # Key statistics
       x = "Transaction Classification",                 # Clear x-axis label
       y = "Number of Transactions") +                   # Clear y-axis label
  scale_y_continuous(labels = scales::comma_format()) +   # Format y-axis with comma separators
  theme_minimal() +                                       # Clean, professional theme
  theme(plot.title = element_text(size = 14, face = "bold"),      # Bold title formatting
        plot.subtitle = element_text(size = 12, color = "gray50"), # Subtitle styling
        axis.title = element_text(size = 12, face = "bold"))       # Bold axis labels

# Determine modeling implications based on imbalance severity
if(imbalance_ratio > 100) {
  imbalance_category <- "Extreme"
  modeling_strategy <- "Requires specialized techniques (SMOTE, cost-sensitive learning)"
} else if(imbalance_ratio > 50) {
  imbalance_category <- "Severe" 
  modeling_strategy <- "Needs oversampling or ensemble methods"
} else if(imbalance_ratio > 10) {
  imbalance_category <- "Moderate"
  modeling_strategy <- "Standard techniques with stratified sampling"
} else {
  imbalance_category <- "Manageable"
  modeling_strategy <- "Regular classification methods applicable"
}

cat("\nClass imbalance severity:", imbalance_category)
## 
## Class imbalance severity: Extreme
cat("\nRecommended strategy:", modeling_strategy)
## 
## Recommended strategy: Requires specialized techniques (SMOTE, cost-sensitive learning)

Class Distribution Analysis Results: This code checks how many fraudulent and legitimate transactions the dataset contains. It counts both classes, calculates the fraud rate (the percentage of fraud), and works out the imbalance ratio (how many legitimate transactions there are for every fraudulent one).

It also produces a bar chart of legitimate vs. fraudulent transaction counts, with legitimate transactions shown in light blue and fraud in red.

Finally, it grades how severe the imbalance is and suggests what kind of modeling techniques should be used.

Outcome Explanation:

This dataset is highly imbalanced: there are far more legitimate transactions than fraudulent ones. The imbalance ratio is 332.3:1, which means that for every 332 legitimate transactions there is roughly one fraudulent one.

This is a problem because:

  • A model could just predict “legitimate” every time and still be 99.7% accurate, yet it would completely miss every fraud case.
  • Fraudulent transactions are rare but extremely important to detect.

So, we need special techniques that make the model pay more attention to the minority class (fraud).

🔹 What the recommendation means

  1. SMOTE (Synthetic Minority Oversampling Technique)

    • SMOTE creates synthetic (artificial) fraudulent transactions by slightly modifying existing fraud cases.
    • This balances the dataset so the model doesn’t ignore fraud.
    • Instead of duplicating fraud cases, it generates new, realistic ones.

    Example: here we have 997,000 legitimate transactions and just 3,000 fraudulent ones; SMOTE can generate more fraud-like examples so the model sees, say, 10,000 frauds instead of 3,000.

  2. Cost-Sensitive Learning

    • This means telling the model: “If you miss a fraud case, that’s much worse than wrongly flagging a legitimate one.”
    • You give a higher penalty (cost) for misclassifying fraud.
    • This forces the algorithm to take fraud detection more seriously.

    Example: A false negative (missing fraud) could cost the bank $1,000, while a false positive (flagging a normal transaction) might just annoy a customer. The model should prefer catching fraud even if it means more false alarms.

In simple terms

The recommended strategy is saying: “Because fraud is so rare compared to legitimate transactions, you can’t just train a model normally. You either need to create more fraud examples artificially (SMOTE) or tell the model that missing fraud is way more costly than a false alarm (cost-sensitive learning).”

Both methods help the model focus on catching fraud instead of just predicting ‘legitimate’ for everything.
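For illustration, here is a minimal cost-sensitive sketch using the randomForest package loaded earlier. The feature list, the 50,000-row subsample, and the 1:300 class weights are assumptions chosen for speed and emphasis, not tuned values:

# A cost-sensitive random forest sketch: weight the fraud class heavily
set.seed(42)
rf_weighted <- randomForest(
  factor(is_fraud) ~ amount_log + velocity_score + merchant_risk_score + composite_risk,
  data    = Fraud[sample(nrow(Fraud), 50000), ],  # subsample for speed (illustrative)
  ntree   = 200,
  classwt = c(1, 300)   # missing a fraud is treated as far more costly than a false alarm
)
print(rf_weighted)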


4.3 Transaction Channel Analysis

Understanding channel usage patterns helps identify which platforms are most vulnerable to fraud attacks.

# Create bar chart of transaction channels
# This shows the relative usage of different payment channels
ggplot(Fraud, aes(x=channel)) + 
  geom_bar(fill="steelblue", alpha=0.8) +                 # Consistent color scheme
  labs(title="Distribution of Transaction Channels",       # Descriptive title
       x="Channel", y="Count") +                         # Clear labels
  theme_minimal()                                         # Clean theme

Channel Usage Insights

Mobile transactions are the most frequent, followed by Web and POS. ATM, ECOM, and IB (Internet Banking) have fewer transactions. This distribution is important for risk assessment as high-volume channels may be prime targets for fraudsters.
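Raw counts alone can mislead here, since the busiest channel naturally accumulates the most fraud. A quick sketch of the per-channel fraud rate (the fraction of each channel's own transactions that are fraudulent):

Fraud %>%
  group_by(channel) %>%
  summarise(transactions   = n(),
            fraud_rate_pct = mean(is_fraud) * 100) %>%   # percentage of that channel's volume
  arrange(desc(fraud_rate_pct))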

4.4 Distribution of Merchant Categories

Observation: The distribution across merchant categories appears relatively even, with no single category dominating the transaction volume.

In-depth explanation: When we say “the distribution across merchant categories appears relatively even, with no single category dominating the transaction volume”, it means that the transactions in the dataset are spread fairly equally across the different merchant categories.

In other words, no single category (like fuel, fashion, or airtime) has an overwhelming majority of transactions compared to the others. This is useful because:

  • It suggests there isn’t a strong bias toward one type of merchant.
  • Fraud detection models won’t be skewed by a single category dominating the data.
  • You’ll be able to compare fraud rates across categories more fairly.

If the distribution were uneven (say, 80% of all transactions in “airtime”), the model might learn mostly about that one category and perform poorly on the others. But since it is balanced, each merchant category contributes roughly equally to the dataset.

ggplot(Fraud, aes(x=merchant_category)) + 
  geom_bar() + 
  labs(title="Distribution of Merchant Categories", x="Merchant Category", y="Count") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
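To quantify the evenness claim, a one-line tabulation of each category's share of total transactions (a sketch):

# Share of transactions per merchant category, largest first
round(sort(prop.table(table(Fraud$merchant_category)), decreasing = TRUE), 4)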

4.5 Feature Analysis and Fraud Pattern Investigation

Distribution of Fraud Techniques

Observation: ‘SOCIAL_ENGINEERING’ is the most prevalent fraud technique in the dataset, followed by ‘ROBBERY’, ‘CARD_THEFT’, and ‘OTHER’. This insight can be valuable for developing targeted fraud prevention strategies.

This analysis shows that fraud isn’t evenly spread across techniques; instead, Social Engineering dominates as the leading method. This insight is important because:

Fraud prevention efforts might need to focus more on detecting and preventing social engineering scams, since that’s where the biggest risk lies.

At the same time, the other categories, though smaller, still matter because they may exploit different weaknesses (card theft, phishing, PIN compromise).

What is social engineering? Manipulating people into giving up their account details.

Fraud %>%
  filter(is_fraud == 1) %>%
  count(fraud_technique)
##      fraud_technique    n
## 1         CARD_THEFT  220
## 2              OTHER  220
## 3           PHISHING  132
## 4     PIN_COMPROMISE  158
## 5            ROBBERY  341
## 6 SOCIAL_ENGINEERING 1929

Average Transaction Amount for Fraudulent vs. Non-Fraudulent Transactions

Observation: Fraudulent transactions (is_fraud = 1) have a significantly higher average amount (384,958.7) compared to non-fraudulent transactions (156,265.3). This suggests that transaction amount is a strong indicator of fraud.

Fraud %>%
  group_by(is_fraud) %>%
  summarise(AvgAmount = mean(amount, na.rm = TRUE))
## # A tibble: 2 × 2
##   is_fraud AvgAmount
##      <int>     <dbl>
## 1        0   156265.
## 2        1   384959.
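A quick visual check of this gap, plotted on a log scale because of the skew shown earlier (a sketch):

ggplot(Fraud, aes(x = factor(is_fraud, labels = c("Legitimate", "Fraudulent")), y = amount)) +
  geom_boxplot(fill = c("lightblue", "red"), alpha = 0.7) +   # one box per class
  scale_y_log10(labels = scales::comma_format()) +            # log scale tames the long tail
  labs(title = "Transaction Amount by Fraud Status",
       x = "Class", y = "Amount (Naira, log scale)") +
  theme_minimal()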

Fraudulent Transactions by Channel

Observation: Mobile transactions account for the highest number of fraudulent activities, which aligns with it being the most frequent transaction channel. Web and POS also show a considerable number of fraudulent transactions.

ggplot(Fraud[Fraud$is_fraud == 1, ], aes(x=channel)) + 
  geom_bar() + 
  labs(title="Fraudulent Transactions by Channel", x="Channel", y="Count")

Fraudulent Transactions by Merchant Category

Observation: Fraudulent transactions are distributed across various merchant categories, with ‘ATM_Withdrawal’, ‘Bill_Payment’, ‘Electronics’, ‘Retail’, and ‘Transfer’ showing higher counts. This indicates that fraud is not concentrated in a single merchant type.

Fraud %>%
  filter(is_fraud == 1) %>%
  ggplot(aes(x = merchant_category)) +
  geom_bar() +
  labs(title = "Fraudulent Transactions by Merchant Category",
       x = "Merchant Category", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Summary of Key Findings:

  • The dataset is clean with no missing values.
  • Transaction amounts are highly skewed, with fraudulent transactions generally involving larger amounts.
  • There is a significant class imbalance, with very few fraudulent transactions.
  • Mobile, Web, and POS channels are most susceptible to fraud.
  • ‘SOCIAL_ENGINEERING’ is the most common fraud technique.
  • Fraudulent activities are spread across various merchant categories.

ADVANCED FRAUD DETECTION MODELING

Enhanced Data Preprocessing

Creating customer behavioral profiles and temporal features that will be essential for the advanced fraud detection techniques that follow.

# Convert timestamp to proper datetime format for time-series analysis
# This enables extraction of temporal patterns and sequences
Fraud$timestamp <- as.POSIXct(Fraud$timestamp)

# Calculate overall fraud rate for context
# This baseline helps evaluate model performance improvements
fraud_rate <- mean(Fraud$is_fraud) * 100
cat("Overall Fraud Rate:", round(fraud_rate, 3), "%\n")
## Overall Fraud Rate: 0.3 %
# Create customer-level behavioral profiles
# These aggregate metrics capture customer behavior patterns for anomaly detection
customer_profiles <- Fraud %>%
  group_by(customer_id) %>%           # Group by customer for aggregation
  summarise(
    total_transactions = n(),          # Total number of transactions per customer
    fraud_count = sum(is_fraud),       # Number of fraudulent transactions
    fraud_rate = mean(is_fraud),       # Customer-specific fraud rate
    avg_amount = mean(amount),         # Average transaction amount
    std_amount = sd(amount),           # Standard deviation of amounts (variability)
    channel_diversity = n_distinct(channel),        # Number of different channels used
    merchant_diversity = n_distinct(merchant_category), # Merchant category diversity
    avg_velocity = mean(velocity_score, na.rm = TRUE), # Average velocity score
    night_transaction_rate = mean(ifelse(hour >= 22 | hour <= 6, 1, 0)), # Proportion of night transactions
    weekend_transaction_rate = mean(is_weekend == "True"), # Proportion of weekend transactions (the column is "True"/"False" text, so a bare mean() returns NA)
    .groups = 'drop'                  # Remove grouping after summarization
  )

# Display summary of customer profiles
print(summary(customer_profiles))
##  customer_id        total_transactions  fraud_count    fraud_rate      
##  Length:10000       Min.   : 61        Min.   :0.0   Min.   :0.000000  
##  Class :character   1st Qu.: 93        1st Qu.:0.0   1st Qu.:0.000000  
##  Mode  :character   Median :100        Median :0.0   Median :0.000000  
##                     Mean   :100        Mean   :0.3   Mean   :0.003005  
##                     3rd Qu.:107        3rd Qu.:1.0   3rd Qu.:0.008547  
##                     Max.   :140        Max.   :4.0   Max.   :0.043956  
##                                                                        
##    avg_amount       std_amount      channel_diversity merchant_diversity
##  Min.   : 79531   Min.   :  76295   Min.   :4.000     Min.   :12.00     
##  1st Qu.:133701   1st Qu.: 198532   1st Qu.:6.000     1st Qu.:14.00     
##  Median :151998   Median : 253788   Median :6.000     Median :14.00     
##  Mean   :156954   Mean   : 291808   Mean   :5.862     Mean   :13.99     
##  3rd Qu.:175009   3rd Qu.: 339275   3rd Qu.:6.000     3rd Qu.:14.00     
##  Max.   :380509   Max.   :1840761   Max.   :6.000     Max.   :14.00     
##                                                                         
##   avg_velocity   night_transaction_rate weekend_transaction_rate
##  Min.   :1.119   Min.   :0.02381        Min.   : NA             
##  1st Qu.:1.658   1st Qu.:0.11650        1st Qu.: NA             
##  Median :1.849   Median :0.13913        Median : NA             
##  Mean   :1.896   Mean   :0.14051        Mean   :NaN             
##  3rd Qu.:2.075   3rd Qu.:0.16346        3rd Qu.: NA             
##  Max.   :5.199   Max.   :0.28846        Max.   : NA             
##                                         NA's   :10000
head(customer_profiles)
## # A tibble: 6 × 11
##   customer_id   total_transactions fraud_count fraud_rate avg_amount std_amount
##   <chr>                      <int>       <int>      <dbl>      <dbl>      <dbl>
## 1 CUST_0002AED1                107           0    0          170390.    365916.
## 2 CUST_000888F9                101           1    0.00990    174489.    269289.
## 3 CUST_000C45C0                104           0    0          174831.    250649.
## 4 CUST_000EF02D                106           0    0           98335.    109784.
## 5 CUST_0019D768                105           2    0.0190     200093.    723111.
## 6 CUST_001CEEE8                 89           0    0          138737.    272240.
## # ℹ 5 more variables: channel_diversity <int>, merchant_diversity <int>,
## #   avg_velocity <dbl>, night_transaction_rate <dbl>,
## #   weekend_transaction_rate <dbl>

Customer Profiling Results: The customer behavioral profiles reveal patterns in transaction frequency, amounts, channel usage, and timing preferences that will be crucial for detecting anomalous behavior.
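As a first use of these profiles, a sketch ranking customers by their personal fraud rate, using only the columns created above:

# Surface the customers with the highest personal fraud rates
customer_profiles %>%
  arrange(desc(fraud_rate)) %>%
  select(customer_id, total_transactions, fraud_count, fraud_rate) %>%
  head(10)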

1. ANOMALY DETECTION ALGORITHMS

1.1 Statistical Anomaly Detection

# Z-score based anomaly detection
Fraud$amount_zscore <- abs(as.numeric(scale(Fraud$amount)))  # as.numeric() drops the matrix wrapper scale() returns, keeping the column a plain vector
Fraud$amount_anomaly <- ifelse(Fraud$amount_zscore > 3, 1, 0)

# Velocity-based anomalies
velocity_threshold <- quantile(Fraud$velocity_score, 0.95, na.rm = TRUE)
Fraud$velocity_anomaly <- ifelse(Fraud$velocity_score > velocity_threshold, 1, 0)

# Analyze anomaly detection performance
anomalies_analysis <- Fraud %>%
  group_by(amount_anomaly, is_fraud) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(amount_anomaly) %>%
  mutate(fraud_rate = count / sum(count) * 100)

print("Statistical Anomaly Detection Results:")
## [1] "Statistical Anomaly Detection Results:"
print(anomalies_analysis)
## # A tibble: 4 × 4
## # Groups:   amount_anomaly [2]
##   amount_anomaly[,1] is_fraud  count fraud_rate
##                <dbl>    <int>  <int>      <dbl>
## 1                  0        0 981664     99.7  
## 2                  0        1   2815      0.286
## 3                  1        0  15336     98.8  
## 4                  1        1    185      1.19
# Visualization
ggplot(Fraud, aes(x = factor(amount_anomaly), fill = factor(is_fraud))) +
  geom_bar(position = "fill") +
  labs(title = "Amount Anomaly Detection vs Fraud",
       x = "Amount Anomaly (0 = Normal, 1 = Anomaly)",
       y = "Proportion", fill = "Fraud Status") +
  scale_fill_manual(values = c("lightblue", "red"), labels = c("Legitimate", "Fraudulent")) +
  theme_minimal()

1.2 DBSCAN Clustering for Density-based Anomaly Detection

# Prepare features for DBSCAN
dbscan_features <- Fraud %>%
  select(amount_log, velocity_score, merchant_risk_score, hour, tx_count_24h) %>%
  na.omit() %>%
  scale() %>%
  as.matrix()

# Apply DBSCAN
library(dbscan)
set.seed(123)
db_result <- dbscan(dbscan_features[1:5000, ], eps = 0.5, minPts = 30)
db_result
## DBSCAN clustering for 5000 objects.
## Parameters: eps = 0.5, minPts = 30
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 4 cluster(s) and 1781 noise points.
## 
##    0    1    2    3    4 
## 1781  424 2652   93   50 
## 
## Available fields: cluster, eps, minPts, metric, borderPoints
# Add cluster labels and identify anomalies. DBSCAN was fitted on only the
# first 5,000 complete rows, so assign its labels to exactly those rows
# (indexing all complete cases would silently recycle the 5,000 labels)
feature_cols <- c("amount_log", "velocity_score", "merchant_risk_score", "hour", "tx_count_24h")
used_rows <- which(complete.cases(Fraud[, feature_cols]))[1:5000]
Fraud$cluster <- -1
Fraud$cluster[used_rows] <- db_result$cluster
Fraud$dbscan_anomaly <- ifelse(Fraud$cluster == 0, 1, 0)

# Analyze DBSCAN performance
dbscan_analysis <- Fraud %>%
  group_by(dbscan_anomaly, is_fraud) %>%
  summarise(count = n(), .groups = 'drop') %>%
  group_by(dbscan_anomaly) %>%
  mutate(fraud_rate = count / sum(count) * 100)

print("DBSCAN Anomaly Detection Results:")
## [1] "DBSCAN Anomaly Detection Results:"
print(dbscan_analysis)
## # A tibble: 4 × 4
## # Groups:   dbscan_anomaly [2]
##   dbscan_anomaly is_fraud  count fraud_rate
##            <dbl>    <int>  <int>      <dbl>
## 1              0        0 641893     99.7  
## 2              0        1   1907      0.296
## 3              1        0 355107     99.7  
## 4              1        1   1093      0.307
ggplot(Fraud, aes(x = factor(dbscan_anomaly), fill = factor(is_fraud))) +
  geom_bar(position = "fill") +
  labs(title = "DBSCAN Anomaly Detection vs Fraud",
       x = "DBSCAN Anomaly (0 = Normal, 1 = Anomaly)",
       y = "Proportion", fill = "Fraud Status") +
  scale_fill_manual(values = c("lightblue", "red"), labels = c("Legitimate", "Fraudulent")) +
  theme_minimal()
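The eps = 0.5 used above is a judgment call. A common heuristic is to look for the “elbow” in a k-nearest-neighbor distance plot, with k = minPts - 1; a sketch on the same 5,000-row sample:

kNNdistplot(dbscan_features[1:5000, ], k = 29)   # k = minPts - 1 heuristic
abline(h = 0.5, lty = 2)                         # the eps value actually used above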

Explanation of what is happening here

This step is all about finding unusual or suspicious transactions that could be fraud. It does this in two different ways. The first way is statistical anomaly detection, which basically means it looks at the numbers and tries to see if anything stands out as “weird.” For example, it looks at the transaction amounts and checks if some amounts are way higher or lower than what’s normal. If a transaction is too far from the average, it marks it as suspicious. It does the same thing for transaction speed, where it looks at the top 5% of fastest transactions and flags those as unusual too. After flagging these, it compares them with the actual fraud labels in the dataset to see if the unusual transactions really were fraud. Then, it creates a chart to show you the proportions of fraud versus legitimate within the flagged anomalies, so you can visually see if the method is working.

The second way it checks for fraud is by using DBSCAN (Density-based Anomaly Detection), which is a clustering method. Instead of just looking at amounts or speed separately, this approach looks at several features together, like amount, speed, merchant risk score, and transaction time, to understand the overall behavior of a transaction. DBSCAN groups together transactions that look similar, and anything that doesn’t fit into a group is marked as an outlier or anomaly. Those outliers are then compared against the fraud labels in the dataset to see if they are actually fraud. Like the first method, it also shows the results in a chart, where we can see how many flagged anomalies were truly fraudulent.

Overall, the whole analysis tries out two different ways to catch fraud. The first method is very straightforward: it looks for extreme values that don’t seem normal. The second method is more advanced and uses clustering to find transactions that don’t belong anywhere. At the end, both methods are checked against the real fraud cases in the dataset to see how accurate they are.
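Since both flags are checked against the real labels, it helps to make that comparison explicit. A minimal sketch computing precision and recall for each rule; flag_metrics is a small helper defined here for illustration, not a package function:

# Precision and recall of each rule-based flag against the true labels
flag_metrics <- function(flag, truth) {
  c(precision = sum(flag == 1 & truth == 1) / sum(flag == 1),
    recall    = sum(flag == 1 & truth == 1) / sum(truth == 1))
}
rbind(
  amount_zscore = flag_metrics(Fraud$amount_anomaly,   Fraud$is_fraud),
  velocity      = flag_metrics(Fraud$velocity_anomaly, Fraud$is_fraud),
  dbscan        = flag_metrics(Fraud$dbscan_anomaly,   Fraud$is_fraud)
)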

2. GRAPH THEORY AND NETWORK ANALYTICS

2.1 Transaction Network Analysis

This step builds a network to study the relationships between customers and merchants. Instead of treating transactions as separate rows, it creates a “graph” where customers and merchants are nodes (points) and each transaction between them is an edge (a connection). To make the network more meaningful, it keeps only customer-merchant pairs with at least two transactions. For each customer-merchant pair, it calculates useful statistics: how many transactions happened, how many were fraudulent, the fraud rate (percentage of fraud), and the total amount spent.

Once this network is built, the analysis counts how many customer-merchant connections exist, how many unique customers there are, and how many unique merchant categories appear. This gives a sense of the size and diversity of the transaction network.

The next part focuses on merchants specifically. It groups the network data by merchant category and calculates how risky each category is: for each category, how many customers interacted with it, how many customer-merchant relationships involved fraud, and the average fraud rate. It then creates a “risk score” that combines the fraud rate with the size of the customer base; categories with higher fraud rates and larger customer bases get higher risk scores, meaning greater fraud exposure. The results are sorted to show the riskiest merchant categories at the top.

Finally, it makes a bar chart of the top 10 riskiest merchant categories. The higher the bar, the more risky that merchant category is considered. This visualization makes it easier to see which merchant types are most strongly associated with fraud in the dataset.

# Create customer-merchant transaction network
customer_merchant_edges <- Fraud %>%
  group_by(customer_id, merchant_category) %>%
  summarise(
    transaction_count = n(),
    fraud_count = sum(is_fraud),
    fraud_rate = mean(is_fraud),
    total_amount = sum(amount),
    .groups = 'drop'
  ) %>%
  filter(transaction_count >= 2)

# Network statistics
cat("Customer-Merchant edges:", nrow(customer_merchant_edges), "\n")
## Customer-Merchant edges: 139102
cat("Unique customers:", n_distinct(customer_merchant_edges$customer_id), "\n")
## Unique customers: 10000
cat("Unique merchants:", n_distinct(customer_merchant_edges$merchant_category), "\n")
## Unique merchants: 14
# Analyze fraud concentration by merchant
merchant_risk <- customer_merchant_edges %>%
  group_by(merchant_category) %>%
  summarise(
    total_customers = n(),
    fraudulent_relationships = sum(fraud_count > 0),
    avg_fraud_rate = mean(fraud_rate),
    risk_score = avg_fraud_rate * log(total_customers + 1),
    .groups = 'drop'
  ) %>%
  arrange(desc(risk_score))

print("High-Risk Merchant Categories:")
## [1] "High-Risk Merchant Categories:"
print(head(merchant_risk, 10))
## # A tibble: 10 × 5
##    merchant_category total_customers fraudulent_relationships avg_fraud_rate
##    <chr>                       <int>                    <int>          <dbl>
##  1 ATM_Withdrawal               9929                      236        0.00349
##  2 Retail                       9940                      231        0.00332
##  3 Electronics                  9924                      233        0.00325
##  4 Bill_Payment                 9948                      224        0.00320
##  5 Medical                      9941                      208        0.00309
##  6 Transfer                     9932                      221        0.00306
##  7 Fuel                         9931                      205        0.00301
##  8 Restaurant                   9915                      213        0.00300
##  9 Airtime                      9939                      221        0.00299
## 10 Entertainment                9941                      203        0.00287
## # ℹ 1 more variable: risk_score <dbl>
# Visualization of merchant risk
ggplot(head(merchant_risk, 10), aes(x = reorder(merchant_category, risk_score), y = risk_score)) +
  geom_bar(stat = "identity", fill = "red", alpha = 0.7) +
  coord_flip() +
  labs(title = "Merchant Risk Scores (Top 10)",
       x = "Merchant Category", y = "Risk Score") +
  theme_minimal()
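The edge table above can also be materialized as an actual igraph object, which unlocks the graph-theory toolkit (degree, centrality, components) this section's title promises. A minimal sketch, assuming customer_merchant_edges as built above:

# First two columns define the edges; the remaining columns become edge attributes
g <- graph_from_data_frame(
  customer_merchant_edges[, c("customer_id", "merchant_category",
                              "transaction_count", "fraud_rate")],
  directed = FALSE
)
V(g)$type <- V(g)$name %in% unique(Fraud$merchant_category)  # TRUE marks merchant nodes (bipartite convention)
summary(g)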

To be continued…