# Set global chunk options for consistent document behavior
knitr::opts_chunk$set(
echo = TRUE, # Show all code in the output document
warning = FALSE, # Suppress warning messages for cleaner output
message = FALSE # Suppress package loading messages
)
# Configure CRAN mirror for reliable package installation
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Load core data manipulation and visualization libraries
library(tidyverse) # Comprehensive data science meta-package (ggplot2, dplyr, readr, etc.)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr) # Advanced data manipulation (also attached as part of tidyverse)
library(data.table) # High-performance data manipulation for large datasets
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(ggplot2) # Grammar of graphics for creating complex visualizations
library(corrplot) # Specialized correlation matrix visualization
## corrplot 0.95 loaded
library(lubridate) # Simplified date-time manipulation and arithmetic
library(VIM) # Visualization and imputation of missing values
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
##
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:datasets':
##
## sleep
library(scales) # Scale functions for axes and color aesthetics
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(viridis) # Perceptually uniform color scales
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
##
## The following object is masked from 'package:scales':
##
## viridis_pal
# Load specialized machine learning libraries
library(igraph) # Comprehensive network analysis and graph theory
##
## Attaching package: 'igraph'
##
## The following objects are masked from 'package:lubridate':
##
## %--%, union
##
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
##
## The following objects are masked from 'package:purrr':
##
## compose, simplify
##
## The following object is masked from 'package:tidyr':
##
## crossing
##
## The following object is masked from 'package:tibble':
##
## as_data_frame
##
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
##
## The following object is masked from 'package:base':
##
## union
library(networkD3) # Interactive network visualizations for web
library(cluster) # Classical cluster analysis algorithms
library(factoextra) # Extract and visualize multivariate analysis results
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dbscan) # Density-based spatial clustering for anomaly detection
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:VIM':
##
## kNN
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(caret) # Classification and regression training framework
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest) # Ensemble learning algorithm for classification
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(plotly) # Interactive web-based data visualization
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:igraph':
##
## groups
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
Introduction
This analysis implements a comprehensive real-time fraud detection system for digital payments in Nigeria. The approach combines multiple advanced techniques: statistical anomaly detection, density-based clustering (DBSCAN), and graph-based network analytics, with supervised machine learning models to follow.
2. Data Import and Initial Exploration
Purpose: Load the NIBSS fraud dataset and perform initial data structure analysis to understand the available features and data quality.
What is being done: Importing the fraud dataset from CSV format and examining its structure, dimensions, data types, and basic properties to inform the analysis strategy.
# Import the fraud dataset from CSV file
# This dataset contains transaction records with fraud labels and engineered features
Fraud <- read.csv("C:/Users/HomePC/OneDrive/Desktop/Fraud Detection/nibss_fraud_dataset.csv")
# Display the structure of the dataset to understand data types and columns
# This reveals the number of observations, variables, and their data types
str(Fraud) # str() prints on its own; wrapping it in print() just adds a NULL
## 'data.frame': 1000000 obs. of 38 variables:
## $ transaction_id : chr "TXN_F08A86FFD87C" "TXN_C2D08134EC83" "TXN_B9499111901D" "TXN_48DB1D526A3B" ...
## $ customer_id : chr "CUST_0002AED1" "CUST_0002AED1" "CUST_0002AED1" "CUST_0002AED1" ...
## $ timestamp : chr "2023-01-14 04:31:09" "2023-01-17 11:20:13" "2023-01-22 02:17:46" "2023-01-24 08:18:23" ...
## $ amount : num 32267 72530 168153 16440 9923 ...
## $ channel : chr "Mobile" "Web" "Mobile" "Mobile" ...
## $ merchant_category : chr "Grocery" "Entertainment" "Transport" "Entertainment" ...
## $ bank : chr "Sterling" "UBA" "Wema" "FCMB" ...
## $ location : chr "Other" "Other" "Other" "Other" ...
## $ age_group : chr "30-39" "30-39" "30-39" "30-39" ...
## $ hour : int 4 11 2 8 15 16 13 16 10 19 ...
## $ day_of_week : int 5 1 6 1 2 2 2 2 3 0 ...
## $ month : int 1 1 1 1 2 2 2 2 2 2 ...
## $ is_weekend : chr "True" "False" "True" "False" ...
## $ is_peak_hour : chr "False" "True" "False" "False" ...
## $ tx_count_24h : num 1 1 1 1 1 1 1 2 3 1 ...
## $ amount_sum_24h : num 32267 72530 168153 16440 9923 ...
## $ amount_mean_7d : num 32267 52399 120342 85708 9923 ...
## $ amount_std_7d : num 0 20132 47811 62634 0 ...
## $ tx_count_total : int 107 107 107 107 107 107 107 107 107 107 ...
## $ amount_mean_total : num 170390 170390 170390 170390 170390 ...
## $ amount_std_total : num 365916 365916 365916 365916 365916 ...
## $ channel_diversity : int 5 5 5 5 5 5 5 5 5 5 ...
## $ location_diversity : int 1 1 1 1 1 1 1 1 1 1 ...
## $ amount_vs_mean_ratio: num 0.1894 0.4257 0.9869 0.0965 0.0582 ...
## $ online_channel_ratio: num 0.776 0.776 0.776 0.776 0.776 ...
## $ is_fraud : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fraud_technique : chr "" "" "" "" ...
## $ hour_sin : num 0.866 0.259 0.5 0.866 -0.707 ...
## $ hour_cos : num 0.5 -0.966 0.866 -0.5 -0.707 ...
## $ day_sin : num -0.975 0.782 -0.782 0.782 0.975 ...
## $ day_cos : num -0.223 0.623 0.623 0.623 -0.223 ...
## $ month_sin : num 0.5 0.5 0.5 0.5 0.866 ...
## $ month_cos : num 0.866 0.866 0.866 0.866 0.5 ...
## $ amount_log : num 10.38 11.19 12.03 9.71 9.2 ...
## $ amount_rounded : int 0 0 0 0 0 0 0 0 0 0 ...
## $ velocity_score : num 0.1894 0.4257 0.9869 0.0965 0.0582 ...
## $ merchant_risk_score : num 0.215 0.877 0.44 0.877 0.231 ...
## $ composite_risk : num 0.0706 0.2768 0.1636 0.2663 0.0713 ...
# Show dataset dimensions for size assessment
nrow(Fraud)
## [1] 1000000
ncol(Fraud)
## [1] 38
# Display first few rows to understand the actual data format and values
head(Fraud)
## transaction_id customer_id timestamp amount channel
## 1 TXN_F08A86FFD87C CUST_0002AED1 2023-01-14 04:31:09 32266.83 Mobile
## 2 TXN_C2D08134EC83 CUST_0002AED1 2023-01-17 11:20:13 72530.49 Web
## 3 TXN_B9499111901D CUST_0002AED1 2023-01-22 02:17:46 168152.87 Mobile
## 4 TXN_48DB1D526A3B CUST_0002AED1 2023-01-24 08:18:23 16439.93 Mobile
## 5 TXN_56DB1E28B758 CUST_0002AED1 2023-02-01 15:39:53 9922.68 POS
## 6 TXN_8CB46D78CEED CUST_0002AED1 2023-02-08 16:27:19 80685.56 Web
## merchant_category bank location age_group hour day_of_week month
## 1 Grocery Sterling Other 30-39 4 5 1
## 2 Entertainment UBA Other 30-39 11 1 1
## 3 Transport Wema Other 30-39 2 6 1
## 4 Entertainment FCMB Other 30-39 8 1 1
## 5 Education FirstBank Other 30-39 15 2 2
## 6 Restaurant GTBank Other 30-39 16 2 2
## is_weekend is_peak_hour tx_count_24h amount_sum_24h amount_mean_7d
## 1 True False 1 32266.83 32266.83
## 2 False True 1 72530.49 52398.66
## 3 True False 1 168152.87 120341.68
## 4 False False 1 16439.93 85707.76
## 5 False True 1 9922.68 9922.68
## 6 False True 1 80685.56 80685.56
## amount_std_7d tx_count_total amount_mean_total amount_std_total
## 1 0.00 107 170389.9 365915.9
## 2 20131.83 107 170389.9 365915.9
## 3 47811.19 107 170389.9 365915.9
## 4 62633.51 107 170389.9 365915.9
## 5 0.00 107 170389.9 365915.9
## 6 0.00 107 170389.9 365915.9
## channel_diversity location_diversity amount_vs_mean_ratio
## 1 5 1 0.18936948
## 2 5 1 0.42567123
## 3 5 1 0.98686550
## 4 5 1 0.09648363
## 5 5 1 0.05823481
## 6 5 1 0.47353218
## online_channel_ratio is_fraud fraud_technique hour_sin hour_cos
## 1 0.7757009 0 0.8660254 0.5000000
## 2 0.7757009 0 0.2588190 -0.9659258
## 3 0.7757009 0 0.5000000 0.8660254
## 4 0.7757009 0 0.8660254 -0.5000000
## 5 0.7757009 0 -0.7071068 -0.7071068
## 6 0.7757009 0 -0.8660254 -0.5000000
## day_sin day_cos month_sin month_cos amount_log amount_rounded
## 1 -0.9749279 -0.2225209 0.5000000 0.8660254 10.381826 0
## 2 0.7818315 0.6234898 0.5000000 0.8660254 11.191776 0
## 3 -0.7818315 0.6234898 0.5000000 0.8660254 12.032635 0
## 4 0.7818315 0.6234898 0.5000000 0.8660254 9.707529 0
## 5 0.9749279 -0.2225209 0.8660254 0.5000000 9.202679 0
## 6 0.9749279 -0.2225209 0.8660254 0.5000000 11.298327 0
## velocity_score merchant_risk_score composite_risk
## 1 0.18936948 0.2149999 0.07055978
## 2 0.42567123 0.8774244 0.27684880
## 3 0.98686550 0.4402304 0.16364883
## 4 0.09648363 0.8774244 0.26631480
## 5 0.05823481 0.2312907 0.07125073
## 6 0.47353218 0.6084928 0.19770088
# Display column names for reference
names(Fraud)
## [1] "transaction_id" "customer_id" "timestamp"
## [4] "amount" "channel" "merchant_category"
## [7] "bank" "location" "age_group"
## [10] "hour" "day_of_week" "month"
## [13] "is_weekend" "is_peak_hour" "tx_count_24h"
## [16] "amount_sum_24h" "amount_mean_7d" "amount_std_7d"
## [19] "tx_count_total" "amount_mean_total" "amount_std_total"
## [22] "channel_diversity" "location_diversity" "amount_vs_mean_ratio"
## [25] "online_channel_ratio" "is_fraud" "fraud_technique"
## [28] "hour_sin" "hour_cos" "day_sin"
## [31] "day_cos" "month_sin" "month_cos"
## [34] "amount_log" "amount_rounded" "velocity_score"
## [37] "merchant_risk_score" "composite_risk"
Data Import Results: The dataset loaded successfully. The structure shows transaction-level records with pre-engineered features for fraud detection, including temporal patterns, behavioral metrics, and risk scores.
Purpose: Evaluate data completeness, identify missing values, and generate comprehensive summary statistics to understand data distribution and quality.
What is being done: Systematically checking for missing values across all columns and generating descriptive statistics to identify potential data quality issues, outliers, and distribution characteristics that will inform our preprocessing strategy.
# Check for missing values in each column to assess data completeness
missing_values <- colSums(is.na(Fraud)) # Count missing values per column
print(missing_values)
## transaction_id customer_id timestamp
## 0 0 0
## amount channel merchant_category
## 0 0 0
## bank location age_group
## 0 0 0
## hour day_of_week month
## 0 0 0
## is_weekend is_peak_hour tx_count_24h
## 0 0 0
## amount_sum_24h amount_mean_7d amount_std_7d
## 0 0 0
## tx_count_total amount_mean_total amount_std_total
## 0 0 0
## channel_diversity location_diversity amount_vs_mean_ratio
## 0 0 0
## online_channel_ratio is_fraud fraud_technique
## 0 0 0
## hour_sin hour_cos day_sin
## 0 0 0
## day_cos month_sin month_cos
## 0 0 0
## amount_log amount_rounded velocity_score
## 0 0 0
## merchant_risk_score composite_risk
## 0 0
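Since VIM is attached above, the same completeness check can also be inspected graphically. A minimal sketch, assuming a 10,000-row random sample is representative (plotting all one million rows would be slow):
# Visualize missingness patterns with VIM::aggr on a random sample
set.seed(42) # reproducible sample
aggr(Fraud[sample(nrow(Fraud), 10000), ],
prop = TRUE, # show proportions of missing values
numbers = TRUE, # print the proportions on the plot
cex.axis = 0.5) # shrink axis labels so all 38 columns fit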
# Generate comprehensive summary statistics for all numeric columns
# This provides mean, median, quartiles, min, max, and range for each variable
print(summary(Fraud))
## transaction_id customer_id timestamp amount
## Length:1000000 Length:1000000 Length:1000000 Min. :1.686e+02
## Class :character Class :character Class :character 1st Qu.:2.800e+04
## Mode :character Mode :character Mode :character Median :6.668e+04
## Mean :1.570e+05
## 3rd Qu.:1.595e+05
## Max. :1.793e+07
## channel merchant_category bank location
## Length:1000000 Length:1000000 Length:1000000 Length:1000000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## age_group hour day_of_week month
## Length:1000000 Min. : 0.00 Min. :0.000 Min. : 1.000
## Class :character 1st Qu.: 9.00 1st Qu.:1.000 1st Qu.: 3.000
## Mode :character Median :12.00 Median :3.000 Median : 7.000
## Mean :12.23 Mean :3.013 Mean : 6.609
## 3rd Qu.:15.00 3rd Qu.:5.000 3rd Qu.:10.000
## Max. :23.00 Max. :6.000 Max. :12.000
## is_weekend is_peak_hour tx_count_24h amount_sum_24h
## Length:1000000 Length:1000000 Min. :1.000 Min. :1.686e+02
## Class :character Class :character 1st Qu.:1.000 1st Qu.:3.617e+04
## Mode :character Mode :character Median :1.000 Median :8.972e+04
## Mean :1.282 Mean :1.992e+05
## 3rd Qu.:1.000 3rd Qu.:2.151e+05
## Max. :6.000 Max. :1.326e+07
## amount_mean_7d amount_std_7d tx_count_total amount_mean_total
## Min. :2.128e+02 Min. : 0 Min. : 61 Min. : 79531
## 1st Qu.:5.327e+04 1st Qu.: 14216 1st Qu.: 94 1st Qu.:133244
## Median :9.779e+04 Median : 51220 Median :101 Median :151366
## Mean :1.558e+05 Mean : 111295 Mean :101 Mean :156224
## 3rd Qu.:1.795e+05 3rd Qu.: 124608 3rd Qu.:107 3rd Qu.:174290
## Max. :1.000e+07 Max. :4998794 Max. :140 Max. :380509
## amount_std_total channel_diversity location_diversity amount_vs_mean_ratio
## Min. : 76295 Min. :4.000 Min. :1 Min. : 0.001013
## 1st Qu.: 198098 1st Qu.:6.000 1st Qu.:1 1st Qu.: 0.181893
## Median : 252854 Median :6.000 Median :1 Median : 0.437315
## Mean : 290158 Mean :5.864 Mean :1 Mean : 0.999993
## 3rd Qu.: 337364 3rd Qu.:6.000 3rd Qu.:1 3rd Qu.: 1.049607
## Max. :1149815 Max. :6.000 Max. :1 Max. :54.904026
## online_channel_ratio is_fraud fraud_technique hour_sin
## Min. :0.5244 Min. :0.000 Length:1000000 Min. :-1.00000
## 1st Qu.:0.6700 1st Qu.:0.000 Class :character 1st Qu.:-0.70711
## Median :0.7016 Median :0.000 Mode :character Median : 0.00000
## Mean :0.7002 Mean :0.003 Mean :-0.08592
## 3rd Qu.:0.7308 3rd Qu.:0.000 3rd Qu.: 0.50000
## Max. :0.8929 Max. :1.000 Max. : 1.00000
## hour_cos day_sin day_cos month_sin
## Min. :-1.0000 Min. :-0.974928 Min. :-0.900969 Min. :-1.00000
## 1st Qu.:-0.8660 1st Qu.:-0.781831 1st Qu.:-0.900969 1st Qu.:-0.50000
## Median :-0.7071 Median : 0.000000 Median :-0.222521 Median : 0.00000
## Mean :-0.4664 Mean :-0.001601 Mean :-0.003539 Mean : 0.03398
## 3rd Qu.:-0.2588 3rd Qu.: 0.781831 3rd Qu.: 0.623490 3rd Qu.: 0.86603
## Max. : 1.0000 Max. : 0.974928 Max. : 1.000000 Max. : 1.00000
## month_cos amount_log amount_rounded velocity_score
## Min. :-1.00000 Min. : 5.133 Min. :0.0e+00 Min. :1.077e-03
## 1st Qu.:-0.50000 1st Qu.:10.240 1st Qu.:0.0e+00 1st Qu.:2.467e-01
## Median : 0.00000 Median :11.108 Median :0.0e+00 Median :6.812e-01
## Mean : 0.01273 Mean :11.111 Mean :7.9e-05 Mean :1.905e+00
## 3rd Qu.: 0.50000 3rd Qu.:11.980 3rd Qu.:0.0e+00 3rd Qu.:1.916e+00
## Max. : 1.00000 Max. :16.702 Max. :1.0e+00 Max. :1.915e+02
## merchant_risk_score composite_risk
## Min. :0.1213 Min. :0.03643
## 1st Qu.:0.2027 1st Qu.:0.07808
## Median :0.3696 Median :0.13563
## Mean :0.3956 Mean :0.15129
## 3rd Qu.:0.5424 3rd Qu.:0.19646
## Max. :0.8774 Max. :0.73458
# Check for duplicate transactions that might affect analysis
duplicates <- sum(duplicated(Fraud))
cat("Number of duplicate rows:", duplicates)
## Number of duplicate rows: 0
# Examine data types to ensure proper formatting
data_types <- sapply(Fraud, class) # sapply applies class() to every column and simplifies the result to a vector
print(data_types)
## transaction_id customer_id timestamp
## "character" "character" "character"
## amount channel merchant_category
## "numeric" "character" "character"
## bank location age_group
## "character" "character" "character"
## hour day_of_week month
## "integer" "integer" "integer"
## is_weekend is_peak_hour tx_count_24h
## "character" "character" "numeric"
## amount_sum_24h amount_mean_7d amount_std_7d
## "numeric" "numeric" "numeric"
## tx_count_total amount_mean_total amount_std_total
## "integer" "numeric" "numeric"
## channel_diversity location_diversity amount_vs_mean_ratio
## "integer" "integer" "numeric"
## online_channel_ratio is_fraud fraud_technique
## "numeric" "integer" "character"
## hour_sin hour_cos day_sin
## "numeric" "numeric" "numeric"
## day_cos month_sin month_cos
## "numeric" "numeric" "numeric"
## amount_log amount_rounded velocity_score
## "numeric" "integer" "numeric"
## merchant_risk_score composite_risk
## "numeric" "numeric"
Data Quality Assessment Results: The dataset is complete: there are no missing values and no duplicate rows. The type check shows a mix of column classes. Columns such as transaction_id, customer_id, and channel are stored as character data, representing text or categorical values. Columns such as amount, velocity_score, and merchant_risk_score are numeric, representing continuous features. Integer columns such as hour, day_of_week, and is_fraud store whole numbers, with is_fraud being the target variable that indicates whether a transaction is fraudulent. Overall, this confirms that the dataset contains a mix of categorical variables, numeric features, and the fraud label.
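One practical consequence: is_weekend and is_peak_hour hold the text values "True"/"False", so arithmetic on them (such as a mean) only works after comparing to "True". A minimal sketch:
# Text flags must be compared to "True" before averaging;
# mean() on the raw character column would yield NA
weekend_share <- mean(Fraud$is_weekend == "True") # proportion of weekend transactions
peak_share <- mean(Fraud$is_peak_hour == "True") # proportion of peak-hour transactions
cat("Weekend share:", round(weekend_share, 3), "| Peak-hour share:", round(peak_share, 3))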
Purpose: Analyze the distribution of transaction amounts to understand spending patterns.
What is being done: Creating a histogram to visualize the transaction amount distribution, which typically follows a highly skewed pattern in financial data. This analysis helps establish baseline patterns and informs threshold-setting for statistical anomaly detection methods.
# Create comprehensive histogram of transaction amounts
# Using optimal bin count to visualize the highly skewed distribution typical in financial data
ggplot(Fraud, aes(x = amount)) + # Set amount as x-axis variable
geom_histogram(bins = 50, # Use 50 bins for detailed distribution view
alpha = 0.7, # Set transparency for better visualization
fill = "steelblue", # Use professional blue color
color = "white") + # Add white borders for bin separation
labs(title = "Distribution of Transaction Amounts", # Add descriptive title
subtitle = "Frequency distribution showing typical financial data skewness", # Explanatory subtitle
x = "Transaction Amount (Nigerian Naira)", # Clear x-axis label with currency
y = "Frequency (Number of Transactions)") + # Clear y-axis label
scale_x_continuous(labels = scales::comma_format()) + # Format x-axis with comma separators
theme_minimal() + # Apply clean, professional theme
theme(plot.title = element_text(size = 14, face = "bold"), # Bold title formatting
plot.subtitle = element_text(size = 10, color = "gray50")) # Subtitle styling
# Calculate key statistics for amount distribution
cat("\nMean amount:", format(mean(Fraud$amount), big.mark = ",", scientific = FALSE))
##
## Mean amount: 156,951.4
cat("\nMedian amount:", format(median(Fraud$amount), big.mark = ",", scientific = FALSE))
##
## Median amount: 66,679.98
cat("\nStandard deviation:", format(sd(Fraud$amount), big.mark = ",", scientific = FALSE))
##
## Standard deviation: 326,359
cat("\nMinimum amount:", format(min(Fraud$amount), big.mark = ",", scientific = FALSE))
##
## Minimum amount: 168.55
cat("\nMaximum amount:", format(max(Fraud$amount), big.mark = ",", scientific = FALSE))
##
## Maximum amount: 17,926,808
Transaction Amount Distribution Insights: The histogram reveals a highly right-skewed distribution typical of financial transaction data, where most transactions involve relatively small amounts with a long tail of high-value transactions. This pattern is crucial for understanding normal spending behavior.
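Given this skew, re-plotting on a logarithmic axis (the dataset's pre-engineered amount_log column encodes the same idea) spreads out the long right tail. A minimal sketch reusing the styling above:
# Re-plot the amount distribution on a log10 axis to tame the right skew
ggplot(Fraud, aes(x = amount)) +
geom_histogram(bins = 50, alpha = 0.7, fill = "steelblue", color = "white") +
scale_x_log10(labels = scales::comma_format()) + # log10 axis spreads the long tail
labs(title = "Distribution of Transaction Amounts (Log Scale)",
x = "Transaction Amount (Nigerian Naira, log10 axis)",
y = "Frequency (Number of Transactions)") +
theme_minimal()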
Purpose: Evaluate the fundamental challenge of class imbalance in fraud detection and calculate key fraud statistics that will guide the modeling approach.
Approach: Computing fraud prevalence rates, assessing the magnitude of class imbalance, and determining appropriate evaluation metrics and sampling strategies for machine learning models.
# Calculate comprehensive fraud statistics to understand class distribution
# This analysis is crucial for determining appropriate modeling strategies
fraud_table <- table(Fraud$is_fraud) # Create frequency table of fraud labels
names(fraud_table) <- c("Legitimate", "Fraudulent") # Apply descriptive names for clarity
# Calculate key fraud rate metrics
total_transactions <- nrow(Fraud) # Total number of transactions in dataset
fraudulent_count <- sum(Fraud$is_fraud == 1) # Count of fraudulent transactions
legitimate_count <- sum(Fraud$is_fraud == 0) # Count of legitimate transactions
fraud_rate <- round((fraudulent_count / total_transactions) * 100, 4) # Calculate fraud percentage
# Calculate class imbalance ratio (important for model training)
if(fraudulent_count > 0) {
imbalance_ratio <- round(legitimate_count / fraudulent_count, 1) # Ratio of majority to minority class
} else {
imbalance_ratio <- Inf # Handle edge case of no fraud
}
# Display comprehensive fraud statistics
cat("\nTotal transactions:", format(total_transactions, big.mark = ","))
##
## Total transactions: 1,000,000
cat("\nLegitimate transactions:", format(legitimate_count, big.mark = ","))
##
## Legitimate transactions: 997,000
cat("\nFraudulent transactions:", format(fraudulent_count, big.mark = ","))
##
## Fraudulent transactions: 3,000
cat("\nOverall fraud rate:", fraud_rate, "%")
##
## Overall fraud rate: 0.3 %
cat("\nClass imbalance ratio (Legitimate:Fraudulent):", imbalance_ratio, ":1")
##
## Class imbalance ratio (Legitimate:Fraudulent): 332.3 :1
# Create enhanced fraud distribution visualization
ggplot(Fraud, aes(x = factor(is_fraud, labels = c("Legitimate", "Fraudulent")))) + # Convert to factor with labels
geom_bar(fill = c("lightblue", "red"), # Use distinct colors for each class
alpha = 0.8, # Set transparency for visual appeal
color = "white", # Add white borders to bars
linewidth = 1) + # Border thickness (ggplot2 3.4+ uses linewidth instead of size)
labs(title = "Transaction Class Distribution Analysis", # Descriptive title
subtitle = paste("Fraud Rate:", fraud_rate, "% | Imbalance Ratio:", imbalance_ratio, ":1"), # Key statistics
x = "Transaction Classification", # Clear x-axis label
y = "Number of Transactions") + # Clear y-axis label
scale_y_continuous(labels = scales::comma_format()) + # Format y-axis with comma separators
theme_minimal() + # Clean, professional theme
theme(plot.title = element_text(size = 14, face = "bold"), # Bold title formatting
plot.subtitle = element_text(size = 12, color = "gray50"), # Subtitle styling
axis.title = element_text(size = 12, face = "bold")) # Bold axis labels
# Determine modeling implications based on imbalance severity
if(imbalance_ratio > 100) {
imbalance_category <- "Extreme"
modeling_strategy <- "Requires specialized techniques (SMOTE, cost-sensitive learning)"
} else if(imbalance_ratio > 50) {
imbalance_category <- "Severe"
modeling_strategy <- "Needs oversampling or ensemble methods"
} else if(imbalance_ratio > 10) {
imbalance_category <- "Moderate"
modeling_strategy <- "Standard techniques with stratified sampling"
} else {
imbalance_category <- "Manageable"
modeling_strategy <- "Regular classification methods applicable"
}
cat("\nClass imbalance severity:", imbalance_category)
##
## Class imbalance severity: Extreme
cat("\nRecommended strategy:", modeling_strategy)
##
## Recommended strategy: Requires specialized techniques (SMOTE, cost-sensitive learning)
Class Distribution Analysis Results: This code is checking how many fraudulent and legitimate transactions there are in the fraud dataset. It counts both classes, calculates the fraud rate (percentage of fraud), and then works out the imbalance ratio (how many legitimate transactions there are for every 1 fraudulent one).
It also makes a bar chart that shows the number of legitimate vs. fraudulent transactions. Legitimate ones are shown in light blue, fraud in red.
Finally, it assesses how severe the imbalance is and suggests what kind of modeling techniques should be used.
Outcome Explanation:
This dataset is highly imbalanced: there are far more legitimate transactions than fraudulent ones. The imbalance ratio is 332.3:1, meaning that for every 332 legitimate transactions there is only 1 fraudulent one.
This is a problem because a model trained naively on such data can label every transaction "legitimate" and still score 99.7% accuracy while catching no fraud at all.
So, we need special techniques that make the model pay more attention to the minority class (fraud).
SMOTE (Synthetic Minority Oversampling Technique)
Example: here we have 997,000 legitimate transactions and just 3,000 fraudulent ones; SMOTE can generate more fraud-like examples so the model sees, say, 30,000 frauds alongside the 997,000 legitimate transactions.
Cost-Sensitive Learning
Example: A false negative (missing fraud) could cost the bank $1,000, while a false positive (flagging a normal transaction) might just annoy a customer. The model should prefer catching fraud even if it means more false alarms.
In simple terms
The recommended strategy is saying: “Because fraud is so rare compared to legitimate transactions, you can’t just train a model normally. You either need to create more fraud examples artificially (SMOTE) or tell the model that missing fraud is way more costly than a false alarm (cost-sensitive learning).”
Both methods help the model focus on catching fraud instead of just predicting ‘legitimate’ for everything.
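Since randomForest is already attached, the cost-sensitive idea can be sketched directly (SMOTE itself would need an additional package such as smotefamily, which is not loaded here). A minimal, hedged sketch on a small subsample; the 1:300 weighting and the three chosen features are illustrative, not a tuned model:
# Cost-sensitive sketch: weight the rare fraud class heavily via classwt
set.seed(123)
demo_idx <- c(sample(which(Fraud$is_fraud == 0), 5000), # subsample legitimates
which(Fraud$is_fraud == 1)) # keep every fraud case
rf_demo <- randomForest(
x = Fraud[demo_idx, c("amount_log", "velocity_score", "merchant_risk_score")],
y = factor(Fraud$is_fraud[demo_idx]),
ntree = 100, # small forest keeps the sketch fast
classwt = c(1, 300)) # class order "0","1": missing fraud is ~300x costlier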
Understanding channel usage patterns helps identify which platforms are most vulnerable to fraud attacks.
# Create bar chart of transaction channels
# This shows the relative usage of different payment channels
ggplot(Fraud, aes(x=channel)) +
geom_bar(fill="steelblue", alpha=0.8) + # Consistent color scheme
labs(title="Distribution of Transaction Channels", # Descriptive title
x="Channel", y="Count") + # Clear labels
theme_minimal() # Clean theme
Mobile transactions are the most frequent, followed by Web and POS. ATM, ECOM, and IB (Internet Banking) have fewer transactions. This distribution is important for risk assessment as high-volume channels may be prime targets for fraudsters.
Distribution of Merchant Categories
Observation: The distribution across merchant categories appears relatively even, with no single category dominating the transaction volume.
In-depth Explanation: When we say "the distribution across merchant categories appears relatively even, with no single category dominating the transaction volume", it means that the transactions in the dataset are spread fairly equally across different merchant categories.
In other words, no single category (like fuel, fashion, or airtime) has an overwhelming majority of transactions compared to the others. This is useful because:
If the distribution were uneven (say, 80% of all transactions in "airtime"), the model might learn more about that one category and perform poorly on the others. But since it is balanced, each merchant category contributes roughly equally to the dataset.
ggplot(Fraud, aes(x=merchant_category)) +
geom_bar() +
labs(title="Distribution of Merchant Categories", x="Merchant Category", y="Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Feature Analysis and Fraud Pattern Investigation
Distribution of Fraud Techniques
Observation: ‘SOCIAL_ENGINEERING’ is the most prevalent fraud technique in the dataset, followed by ‘ROBBERY’, ‘CARD_THEFT’, and ‘OTHER’. This insight can be valuable for developing targeted fraud prevention strategies.
This analysis shows that fraud isn't evenly spread across techniques; instead, Social Engineering dominates as the leading method used. This insight is important because:
Fraud prevention efforts might need to focus more on detecting and preventing social engineering scams, since that’s where the biggest risk lies.
At the same time, the other categories, though smaller, still matter because they may exploit different weaknesses (card theft, phishing, PIN compromise).
What is social engineering? Manipulating people into disclosing their account details.
Fraud %>%
filter(is_fraud == 1) %>%
count(fraud_technique)
## fraud_technique n
## 1 CARD_THEFT 220
## 2 OTHER 220
## 3 PHISHING 132
## 4 PIN_COMPROMISE 158
## 5 ROBBERY 341
## 6 SOCIAL_ENGINEERING 1929
Average Transaction Amount for Fraudulent vs. Non-Fraudulent Transactions
Observation: Fraudulent transactions (is_fraud = 1) have a significantly higher average amount (384,958.7) compared to non-fraudulent transactions (156,265.3). This suggests that transaction amount is a strong indicator of fraud.
Fraud %>%
group_by(is_fraud) %>%
summarise(AvgAmount = mean(amount, na.rm = TRUE))
## # A tibble: 2 × 2
## is_fraud AvgAmount
## <int> <dbl>
## 1 0 156265.
## 2 1 384959.
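The gap is easier to appreciate visually; a minimal sketch comparing the two amount distributions on a log axis:
# Compare amount distributions for legitimate vs fraudulent transactions
ggplot(Fraud, aes(x = factor(is_fraud, labels = c("Legitimate", "Fraudulent")),
y = amount, fill = factor(is_fraud))) +
geom_boxplot(alpha = 0.7, show.legend = FALSE) + # one box per class
scale_fill_manual(values = c("lightblue", "red")) +
scale_y_log10(labels = scales::comma_format()) + # log axis handles the skew
labs(title = "Transaction Amount by Fraud Status",
x = "Transaction Classification", y = "Amount (log10 axis, Naira)") +
theme_minimal()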
Fraudulent Transactions by Channel
Observation: Mobile transactions account for the highest number of fraudulent activities, which aligns with it being the most frequent transaction channel. Web and POS also show a considerable number of fraudulent transactions.
ggplot(Fraud[Fraud$is_fraud == 1, ], aes(x=channel)) +
geom_bar() +
labs(title="Fraudulent Transactions by Channel", x="Channel", y="Count")
Fraudulent Transactions by Merchant Category
Observation: Fraudulent transactions are distributed across various merchant categories, with ‘ATM_Withdrawal’, ‘Bill_Payment’, ‘Electronics’, ‘Retail’, and ‘Transfer’ showing higher counts. This indicates that fraud is not concentrated in a single merchant type.
Fraud %>%
filter(is_fraud == 1) %>%
ggplot(aes(x = merchant_category)) +
geom_bar() +
labs(title = "Fraudulent Transactions by Merchant Category",
x = "Merchant Category", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Summary of Key Findings:
- The dataset is clean with no missing values.
- Transaction amounts are highly skewed, with fraudulent transactions generally involving larger amounts.
- There is a significant class imbalance, with very few fraudulent transactions.
- Mobile, Web, and POS channels are most susceptible to fraud.
- ‘SOCIAL_ENGINEERING’ is the most common fraud technique.
- Fraudulent activities are spread across various merchant categories.
What is being done: Creating customer behavioral profiles and temporal features that will be essential for advanced fraud detection techniques.
# Convert timestamp to proper datetime format for time-series analysis
# This enables extraction of temporal patterns and sequences
Fraud$timestamp <- as.POSIXct(Fraud$timestamp)
# Calculate overall fraud rate for context
# This baseline helps evaluate model performance improvements
fraud_rate <- mean(Fraud$is_fraud) * 100
cat("Overall Fraud Rate:", round(fraud_rate, 3), "%\n")
## Overall Fraud Rate: 0.3 %
# Create customer-level behavioral profiles
# These aggregate metrics capture customer behavior patterns for anomaly detection
customer_profiles <- Fraud %>%
group_by(customer_id) %>% # Group by customer for aggregation
summarise(
total_transactions = n(), # Total number of transactions per customer
fraud_count = sum(is_fraud), # Number of fraudulent transactions
fraud_rate = mean(is_fraud), # Customer-specific fraud rate
avg_amount = mean(amount), # Average transaction amount
std_amount = sd(amount), # Standard deviation of amounts (variability)
channel_diversity = n_distinct(channel), # Number of different channels used
merchant_diversity = n_distinct(merchant_category), # Merchant category diversity
avg_velocity = mean(velocity_score, na.rm = TRUE), # Average velocity score
night_transaction_rate = mean(ifelse(hour >= 22 | hour <= 6, 1, 0)), # Proportion of night transactions
weekend_transaction_rate = mean(is_weekend == "True"), # Proportion of weekend transactions (is_weekend is "True"/"False" text, so compare before averaging)
.groups = 'drop' # Remove grouping after summarization
)
# Display summary of customer profiles
print(summary(customer_profiles))
## customer_id total_transactions fraud_count fraud_rate
## Length:10000 Min. : 61 Min. :0.0 Min. :0.000000
## Class :character 1st Qu.: 93 1st Qu.:0.0 1st Qu.:0.000000
## Mode :character Median :100 Median :0.0 Median :0.000000
## Mean :100 Mean :0.3 Mean :0.003005
## 3rd Qu.:107 3rd Qu.:1.0 3rd Qu.:0.008547
## Max. :140 Max. :4.0 Max. :0.043956
##
## avg_amount std_amount channel_diversity merchant_diversity
## Min. : 79531 Min. : 76295 Min. :4.000 Min. :12.00
## 1st Qu.:133701 1st Qu.: 198532 1st Qu.:6.000 1st Qu.:14.00
## Median :151998 Median : 253788 Median :6.000 Median :14.00
## Mean :156954 Mean : 291808 Mean :5.862 Mean :13.99
## 3rd Qu.:175009 3rd Qu.: 339275 3rd Qu.:6.000 3rd Qu.:14.00
## Max. :380509 Max. :1840761 Max. :6.000 Max. :14.00
##
## avg_velocity night_transaction_rate weekend_transaction_rate
## Min. :1.119 Min. :0.02381 Min. : NA
## 1st Qu.:1.658 1st Qu.:0.11650 1st Qu.: NA
## Median :1.849 Median :0.13913 Median : NA
## Mean :1.896 Mean :0.14051 Mean :NaN
## 3rd Qu.:2.075 3rd Qu.:0.16346 3rd Qu.: NA
## Max. :5.199 Max. :0.28846 Max. : NA
## NA's :10000
head(customer_profiles)
## # A tibble: 6 × 11
## customer_id total_transactions fraud_count fraud_rate avg_amount std_amount
## <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 CUST_0002AED1 107 0 0 170390. 365916.
## 2 CUST_000888F9 101 1 0.00990 174489. 269289.
## 3 CUST_000C45C0 104 0 0 174831. 250649.
## 4 CUST_000EF02D 106 0 0 98335. 109784.
## 5 CUST_0019D768 105 2 0.0190 200093. 723111.
## 6 CUST_001CEEE8 89 0 0 138737. 272240.
## # ℹ 5 more variables: channel_diversity <int>, merchant_diversity <int>,
## # avg_velocity <dbl>, night_transaction_rate <dbl>,
## # weekend_transaction_rate <dbl>
Customer Profiling Results: The customer behavioral profiles reveal patterns in transaction frequency, amounts, channel usage, and timing preferences that will be crucial for detecting anomalous behavior.
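As one illustration of how these profiles can feed anomaly detection, customers in the top 1% of average velocity could be shortlisted for review. A minimal sketch; the 99th-percentile cutoff is an arbitrary illustrative threshold:
# Shortlist customers whose average velocity score sits in the top 1%
suspicious_customers <- customer_profiles %>%
filter(avg_velocity > quantile(avg_velocity, 0.99)) %>% # illustrative cutoff
arrange(desc(avg_velocity))
head(suspicious_customers[, c("customer_id", "avg_velocity", "fraud_rate")])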
1. ANOMALY DETECTION ALGORITHMS
1.1 Statistical Anomaly Detection
# Z-score based anomaly detection
Fraud$amount_zscore <- abs(as.numeric(scale(Fraud$amount))) # coerce scale()'s matrix output to a plain numeric vector
Fraud$amount_anomaly <- ifelse(Fraud$amount_zscore > 3, 1, 0)
# Velocity-based anomalies
velocity_threshold <- quantile(Fraud$velocity_score, 0.95, na.rm = TRUE)
Fraud$velocity_anomaly <- ifelse(Fraud$velocity_score > velocity_threshold, 1, 0)
# Analyze anomaly detection performance
anomalies_analysis <- Fraud %>%
group_by(amount_anomaly, is_fraud) %>%
summarise(count = n(), .groups = 'drop') %>%
group_by(amount_anomaly) %>%
mutate(fraud_rate = count / sum(count) * 100)
print("Statistical Anomaly Detection Results:")
## [1] "Statistical Anomaly Detection Results:"
print(anomalies_analysis)
## # A tibble: 4 × 4
## # Groups: amount_anomaly [2]
## amount_anomaly is_fraud count fraud_rate
## <dbl> <int> <int> <dbl>
## 1 0 0 981664 99.7
## 2 0 1 2815 0.286
## 3 1 0 15336 98.8
## 4 1 1 185 1.19
# Visualization
ggplot(Fraud, aes(x = factor(amount_anomaly), fill = factor(is_fraud))) +
geom_bar(position = "fill") +
labs(title = "Amount Anomaly Detection vs Fraud",
x = "Amount Anomaly (0 = Normal, 1 = Anomaly)",
y = "Proportion", fill = "Fraud Status") +
scale_fill_manual(values = c("lightblue", "red"), labels = c("Legitimate", "Fraudulent")) +
theme_minimal()
# Prepare features for DBSCAN
dbscan_features <- Fraud %>%
select(amount_log, velocity_score, merchant_risk_score, hour, tx_count_24h) %>%
na.omit() %>%
scale() %>%
as.matrix()
# Apply DBSCAN
library(dbscan)
set.seed(123)
db_result <- dbscan(dbscan_features[1:5000, ], eps = 0.5, minPts = 30)
db_result
## DBSCAN clustering for 5000 objects.
## Parameters: eps = 0.5, minPts = 30
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 4 cluster(s) and 1781 noise points.
##
## 0 1 2 3 4
## 1781 424 2652 93 50
##
## Available fields: cluster, eps, minPts, metric, borderPoints
# Add cluster labels for the rows that were actually clustered and flag
# DBSCAN noise points (cluster 0) as anomalies
scored_rows <- 1:5000 # dbscan() above was fitted on these rows only
Fraud$cluster <- NA_integer_
Fraud$cluster[scored_rows] <- db_result$cluster
Fraud$dbscan_anomaly <- ifelse(!is.na(Fraud$cluster) & Fraud$cluster == 0, 1, 0)
# Analyze DBSCAN performance
dbscan_analysis <- Fraud %>%
group_by(dbscan_anomaly, is_fraud) %>%
summarise(count = n(), .groups = 'drop') %>%
group_by(dbscan_anomaly) %>%
mutate(fraud_rate = count / sum(count) * 100)
print("DBSCAN Anomaly Detection Results:")
## [1] "DBSCAN Anomaly Detection Results:"
print(dbscan_analysis)
## # A tibble: 4 × 4
## # Groups: dbscan_anomaly [2]
## dbscan_anomaly is_fraud count fraud_rate
## <dbl> <int> <int> <dbl>
## 1 0 0 641893 99.7
## 2 0 1 1907 0.296
## 3 1 0 355107 99.7
## 4 1 1 1093 0.307
ggplot(Fraud, aes(x = factor(dbscan_anomaly), fill = factor(is_fraud))) +
geom_bar(position = "fill") +
labs(title = "DBSCAN Anomaly Detection vs Fraud",
x = "DBSCAN Anomaly (0 = Normal, 1 = Anomaly)",
y = "Proportion", fill = "Fraud Status") +
scale_fill_manual(values = c("lightblue", "red"), labels = c("Legitimate", "Fraudulent")) +
theme_minimal()
Explanation of what is happening here
This step is all about finding unusual or suspicious transactions that could be fraud. It does this in two different ways. The first way is statistical anomaly detection, which basically means it looks at the numbers and tries to see if anything stands out as “weird.” For example, it looks at the transaction amounts and checks if some amounts are way higher or lower than what’s normal. If a transaction is too far from the average, it marks it as suspicious. It does the same thing for transaction speed, where it looks at the top 5% of fastest transactions and flags those as unusual too. After flagging these, it compares them with the actual fraud labels in the dataset to see if the unusual transactions really were fraud. Then, it creates a chart to show you the proportions of fraud versus legitimate within the flagged anomalies, so you can visually see if the method is working.
The second way it checks for fraud is by using DBSCAN (Density-Based Spatial Clustering of Applications with Noise), a clustering method. Instead of just looking at amounts or speed separately, this approach looks at several features together, like amount, speed, merchant risk score, and transaction time, to understand the overall behavior of a transaction. DBSCAN groups together transactions that look similar, and anything that doesn't fit into a group is marked as an outlier or anomaly. Those outliers are then compared against the fraud labels in the dataset to see if they are actually fraud. Like the first method, it also shows the results in a chart, where we can see how many flagged anomalies were truly fraudulent.
Overall, the whole analysis tries out two different ways to catch fraud. The first method is very straightforward: it just looks for extreme values that don't seem normal. The second method is more advanced and uses clustering to find transactions that don't belong anywhere. At the end, both methods are checked against the real fraud cases in the dataset to see how accurate they are.
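To make "how accurate they are" concrete, precision (what share of flagged transactions are truly fraud) and recall (what share of all fraud gets flagged) can be computed from the flags created above. A minimal sketch using a small hypothetical helper, flag_quality:
# Precision and recall of each anomaly flag against the true fraud labels
flag_quality <- function(flag, truth) {
hits <- sum(flag == 1 & truth == 1) # flagged AND actually fraudulent
c(precision = hits / sum(flag == 1), # share of flags that are fraud
recall = hits / sum(truth == 1)) # share of fraud that was flagged
}
round(flag_quality(Fraud$amount_anomaly, Fraud$is_fraud), 4)
round(flag_quality(Fraud$dbscan_anomaly, Fraud$is_fraud), 4)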
2. GRAPH THEORY AND NETWORK ANALYTICS
2.1 Transaction Network Analysis
This step builds a network to study the relationships between customers and merchants. Instead of just looking at transactions as separate rows, it creates a "graph" where customers and merchants are nodes (points) and each transaction between them is an edge (a connection). To make the network more meaningful, it only keeps customer-merchant pairs with at least two transactions. For each customer-merchant pair, it calculates useful statistics: how many transactions happened, how many were fraudulent, the fraud rate (percentage of fraud), and the total amount spent.
Once this network is built, the code counts how many customer-merchant connections exist, how many unique customers there are, and how many unique merchant categories appear. This gives a sense of the size and diversity of the transaction network.
The next part focuses on merchants specifically. It groups the network data by merchant category and calculates how risky each category is. For each merchant category, it looks at how many customers interacted with it, how many customer merchant relationships involved fraud, and the average fraud rate. Then it creates a “risk score,” which combines fraud rate with the size of the customer base. Categories with higher fraud rates and larger customer bases get higher risk scores, meaning they are more dangerous in terms of fraud exposure. The results are then sorted to show the riskiest merchant categories at the top.
Finally, it makes a bar chart of the top 10 riskiest merchant categories. The higher the bar, the more risky that merchant category is considered. This visualization makes it easier to see which merchant types are most strongly associated with fraud in the dataset.
# Create customer-merchant transaction network
customer_merchant_edges <- Fraud %>%
group_by(customer_id, merchant_category) %>%
summarise(
transaction_count = n(),
fraud_count = sum(is_fraud),
fraud_rate = mean(is_fraud),
total_amount = sum(amount),
.groups = 'drop'
) %>%
filter(transaction_count >= 2)
# Network statistics
cat("Customer-Merchant edges:", nrow(customer_merchant_edges), "\n")
## Customer-Merchant edges: 139102
cat("Unique customers:", n_distinct(customer_merchant_edges$customer_id), "\n")
## Unique customers: 10000
cat("Unique merchants:", n_distinct(customer_merchant_edges$merchant_category), "\n")
## Unique merchants: 14
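The edge list above can also be turned into an actual igraph object (igraph is attached) so that standard graph metrics become available. A minimal sketch treating customers and merchant categories as the two node types of a bipartite graph:
# Build a bipartite customer-merchant graph from the edge list
g <- graph_from_data_frame(
customer_merchant_edges[, c("customer_id", "merchant_category")],
directed = FALSE)
# Mark node types: TRUE for merchant categories, FALSE for customers
V(g)$type <- V(g)$name %in% unique(customer_merchant_edges$merchant_category)
# Merchant degree = number of distinct customers connected to each category
sort(degree(g, V(g)[type]), decreasing = TRUE)[1:5]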
# Analyze fraud concentration by merchant
merchant_risk <- customer_merchant_edges %>%
group_by(merchant_category) %>%
summarise(
total_customers = n(),
fraudulent_relationships = sum(fraud_count > 0),
avg_fraud_rate = mean(fraud_rate),
risk_score = avg_fraud_rate * log(total_customers + 1),
.groups = 'drop'
) %>%
arrange(desc(risk_score))
print("High-Risk Merchant Categories:")
## [1] "High-Risk Merchant Categories:"
print(head(merchant_risk, 10))
## # A tibble: 10 × 5
## merchant_category total_customers fraudulent_relationships avg_fraud_rate
## <chr> <int> <int> <dbl>
## 1 ATM_Withdrawal 9929 236 0.00349
## 2 Retail 9940 231 0.00332
## 3 Electronics 9924 233 0.00325
## 4 Bill_Payment 9948 224 0.00320
## 5 Medical 9941 208 0.00309
## 6 Transfer 9932 221 0.00306
## 7 Fuel 9931 205 0.00301
## 8 Restaurant 9915 213 0.00300
## 9 Airtime 9939 221 0.00299
## 10 Entertainment 9941 203 0.00287
## # ℹ 1 more variable: risk_score <dbl>
# Visualization of merchant risk
ggplot(head(merchant_risk, 10), aes(x = reorder(merchant_category, risk_score), y = risk_score)) +
geom_bar(stat = "identity", fill = "red", alpha = 0.7) +
coord_flip() +
labs(title = "Merchant Risk Scores (Top 10)",
x = "Merchant Category", y = "Risk Score") +
theme_minimal()
To be continued…