Full R codebase (scripts/functions)
The following is a complete, runnable script set.
Copy each block into the specified file names.
Mermaid for repository layout:
flowchart LR
A[main_run.R] --> B[R/00_packages.R]
A --> C[R/01_box7.R]
A --> D[R/02_simulate.R]
A --> E[R/03_features.R]
A --> F[R/04_distances.R]
A --> G[R/05_clustering.R]
A --> H[R/06_evaluation_plots.R]
A --> I[output/data + output/figures + output/metrics]
R/00_packages.R
# R/00_packages.R
# R version: tested conceptually on R >= 4.2.0 (no strict requirement stated).
install_if_missing <- function(pkgs) {
missing <- pkgs[!vapply(pkgs, requireNamespace, FUN.VALUE = logical(1), quietly = TRUE)]
if (length(missing) > 0) {
message("Installing missing packages: ", paste(missing, collapse = ", "))
install.packages(missing, repos = "https://cloud.r-project.org")
}
}
PKGS <- c(
"data.table", "lubridate", "ggplot2",
"dplyr", "tidyr", "purrr", "stringr",
"cluster", "dbscan",
"robustbase", "robustcov",
"RSpectra",
"pROC", "PRROC",
"energy"
)
install_if_missing(PKGS)
suppressPackageStartupMessages({
library(data.table)
library(lubridate)
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
library(cluster)
library(dbscan)
library(robustbase)
library(robustcov)
library(RSpectra)
library(pROC)
library(PRROC)
library(energy)
})
dir.create("output", showWarnings = FALSE)
dir.create(file.path("output", "data"), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path("output", "figures"), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path("output", "metrics"), showWarnings = FALSE, recursive = TRUE)
set.seed(42)
R/01_box7.R
# R/01_box7.R
# Seven-point summary:
# (min, lower_fence, q1, median, q3, upper_fence, max)
# Helper robust scales
mad_robust <- function(x, constant = 1.4826, na.rm = TRUE) {
stats::mad(x, constant = constant, na.rm = na.rm)
}
# Tukey fences
tukey_fences <- function(q1, q3, k = 1.5) {
iqr <- q3 - q1
list(lf = q1 - k * iqr, uf = q3 + k * iqr, iqr = iqr)
}
# Adjusted boxplot fences (Hubert & Vandervieren):
# uses medcouple robust skewness (robustbase::mc)
# This implementation mirrors the commonly used exponential adjustment.
adjusted_fences <- function(x, q1, q3, k = 1.5) {
iqr <- q3 - q1
mc <- robustbase::mc(x) # medcouple
if (is.na(mc)) mc <- 0
if (mc >= 0) {
lf <- q1 - k * iqr * exp(-4 * mc)
uf <- q3 + k * iqr * exp( 3 * mc)
} else {
lf <- q1 - k * iqr * exp(-3 * mc)
uf <- q3 + k * iqr * exp( 4 * mc)
}
list(lf = lf, uf = uf, iqr = iqr, mc = mc)
}
# MAD-based fences around median
mad_fences <- function(med, madv, c = 3.0) {
list(lf = med - c * madv, uf = med + c * madv, mad = madv)
}
# Main 7-point function with rule switching:
# - default Tukey
# - adjusted if skewed enough and sample size adequate
# - MAD fallback if too few points or IQR ~ 0
box7 <- function(x,
rule = c("auto", "tukey", "adjusted", "mad"),
k = 1.5,
medcouple_threshold = 0.10,
n_min_adjusted = 20,
n_min_any = 5,
mad_c = 3.0) {
rule <- match.arg(rule)
x <- x[is.finite(x)]
n <- length(x)
if (n == 0) {
return(list(min = NA_real_, lf = NA_real_, q1 = NA_real_, med = NA_real_,
q3 = NA_real_, uf = NA_real_, max = NA_real_,
rule_used = NA_character_, n = 0))
}
# For tiny n, quartiles + MC are unstable; use MAD fallback
if (n < n_min_any) {
med <- stats::median(x)
madv <- mad_robust(x)
f <- mad_fences(med, madv, c = mad_c)
return(list(min = min(x), lf = f$lf, q1 = NA_real_, med = med,
q3 = NA_real_, uf = f$uf, max = max(x),
rule_used = "mad_small_n", n = n))
}
q1 <- as.numeric(stats::quantile(x, 0.25, type = 7, names = FALSE))
med <- as.numeric(stats::quantile(x, 0.50, type = 7, names = FALSE))
q3 <- as.numeric(stats::quantile(x, 0.75, type = 7, names = FALSE))
iqr <- q3 - q1
# If IQR collapses, MAD is safer
if (!is.finite(iqr) || iqr < .Machine$double.eps) {
madv <- mad_robust(x)
f <- mad_fences(med, madv, c = mad_c)
return(list(min = min(x), lf = f$lf, q1 = q1, med = med,
q3 = q3, uf = f$uf, max = max(x),
rule_used = "mad_iqr_zero", n = n))
}
if (rule == "tukey") {
f <- tukey_fences(q1, q3, k = k)
return(list(min = min(x), lf = f$lf, q1 = q1, med = med,
q3 = q3, uf = f$uf, max = max(x),
rule_used = "tukey", n = n))
}
if (rule == "adjusted") {
f <- adjusted_fences(x, q1, q3, k = k)
return(list(min = min(x), lf = f$lf, q1 = q1, med = med,
q3 = q3, uf = f$uf, max = max(x),
rule_used = "adjusted", n = n, mc = f$mc))
}
if (rule == "mad") {
madv <- mad_robust(x)
f <- mad_fences(med, madv, c = mad_c)
return(list(min = min(x), lf = f$lf, q1 = q1, med = med,
q3 = q3, uf = f$uf, max = max(x),
rule_used = "mad_forced", n = n))
}
# auto:
if (n >= n_min_adjusted) {
mc <- robustbase::mc(x)
if (is.finite(mc) && abs(mc) >= medcouple_threshold) {
f <- adjusted_fences(x, q1, q3, k = k)
return(list(min = min(x), lf = f$lf, q1 = q1, med = med,
q3 = q3, uf = f$uf, max = max(x),
rule_used = "auto_adjusted", n = n, mc = f$mc))
}
}
f <- tukey_fences(q1, q3, k = k)
list(min = min(x), lf = f$lf, q1 = q1, med = med, q3 = q3, uf = f$uf, max = max(x),
rule_used = "auto_tukey", n = n)
}
# Utility: convert list output to named numeric vector
box7_vec <- function(x, ...) {
b <- box7(x, ...)
c(min = b$min, lf = b$lf, q1 = b$q1, med = b$med, q3 = b$q3, uf = b$uf, max = b$max)
}
# Visualization helper: 7-point boxplot from precomputed summaries
# Expects df with columns:
# group, lf, q1, med, q3, uf, min, max
plot_box7 <- function(df, x_col = "group", title = "Seven-point boxplot", out_path = NULL) {
p <- ggplot(df, aes_string(x = x_col)) +
geom_boxplot(
aes(lower = q1, middle = med, upper = q3, ymin = lf, ymax = uf),
stat = "identity", width = 0.5
) +
geom_point(aes(y = min), shape = 21, size = 2) +
geom_point(aes(y = max), shape = 21, size = 2) +
labs(title = title, x = x_col, y = "Value") +
theme_minimal()
if (!is.null(out_path)) {
ggsave(out_path, p, width = 10, height = 5, dpi = 150)
}
p
}
R/02_simulate.R
# R/02_simulate.R
default_sim_params <- function() {
list(
n_customers = 20000,
n_merchants = 5000,
days = 120,
start_date = as.POSIXct("2026-01-01 00:00:00", tz = "Africa/Lagos"),
# Channel mixture (tune to your logs)
channel_probs = c(mobile = 0.55, ussd = 0.20, web = 0.15, pos = 0.10),
# Customer segments
segment_probs = c(retail = 0.78, sme = 0.16, vip = 0.03, agent = 0.03),
# Baseline activity
tx_mean_per_day = 1.8,
tx_dispersion = 1.2, # NegBin size param proxy
# Device/IP churn (baseline)
p_new_device = 0.02,
p_new_ip = 0.03,
# Amount model: lognormal mixtures per segment
# Each entry: list(w = mixture weights, mu = means on log scale, sigma = sds on log scale)
amount_mix = list(
retail = list(w = c(0.85, 0.15), mu = c(log(2500), log(12000)), sigma = c(0.7, 0.9)),
sme = list(w = c(0.70, 0.30), mu = c(log(12000), log(80000)), sigma = c(0.8, 1.1)),
vip = list(w = c(0.50, 0.50), mu = c(log(50000), log(300000)), sigma = c(0.9, 1.1)),
agent = list(w = c(0.80, 0.20), mu = c(log(4000), log(20000)), sigma = c(0.6, 0.8))
),
# Fraud controls
fraud_tx_rate_target = 0.004,
ato_customer_rate = 0.004,
ato_burst_range = c(6, 25),
ato_amount_mult = list(mu = log(2.2), sigma = 0.5),
ato_p_new_device = 0.65,
ato_p_new_ip = 0.70,
ato_night_rate = 0.45,
smurf_customer_rate = 0.006,
smurf_burst_range = c(12, 80),
smurf_amount = list(mu = log(1800), sigma = 0.6),
smurf_many_payees = TRUE,
collusive_merchant_rate = 0.004,
collusion_tx_per_day_range = c(8, 40),
collusion_amount_mult = list(mu = log(1.5), sigma = 0.4),
# label delay (days) applied to confirmed fraud label visibility
label_delay_days = c(0, 7)
)
}
.sample_lognorm_mix <- function(n, mix) {
k <- length(mix$w)
z <- sample.int(k, size = n, replace = TRUE, prob = mix$w)
rlnorm(n, meanlog = mix$mu[z], sdlog = mix$sigma[z])
}
# Helper: sample time-of-day with diurnal pattern
.sample_time_in_day <- function(n, tz = "Africa/Lagos") {
# Mixture of peaks: morning, afternoon, evening, and small night tail
comps <- sample.int(4, n, replace = TRUE, prob = c(0.25, 0.35, 0.30, 0.10))
hour <- numeric(n)
hour[comps == 1] <- pmin(pmax(rnorm(sum(comps == 1), 9, 2), 0), 23)
hour[comps == 2] <- pmin(pmax(rnorm(sum(comps == 2), 14, 2), 0), 23)
hour[comps == 3] <- pmin(pmax(rnorm(sum(comps == 3), 19, 2), 0), 23)
hour[comps == 4] <- pmin(pmax(rnorm(sum(comps == 4), 2, 1.5), 0), 23)
minute <- sample(0:59, n, replace = TRUE)
second <- sample(0:59, n, replace = TRUE)
as.difftime(floor(hour) * 3600 + minute * 60 + second, units = "secs")
}
simulate_transactions <- function(params = default_sim_params()) {
p <- params
tz <- "Africa/Lagos"
# Customers
customers <- data.table(
customer_id = sprintf("C%07d", 1:p$n_customers),
segment = sample(names(p$segment_probs), p$n_customers, replace = TRUE, prob = p$segment_probs)
)
# Activity rate heterogeneity (Gamma)
customers[, activity_weight := rgamma(.N, shape = 1.5, rate = 1.5)]
customers[, rate_per_day := p$tx_mean_per_day * activity_weight / mean(activity_weight)]
# Merchants
merchants <- data.table(
merchant_id = sprintf("M%06d", 1:p$n_merchants),
m_profile = sample(c("normal", "normal", "normal", "high_risk_tail"), p$n_merchants, replace = TRUE)
)
# Total baseline tx count
expected_total <- round(sum(customers$rate_per_day) * p$days)
n_base <- expected_total
# Assign customer per tx according to activity
cust_idx <- sample.int(p$n_customers, size = n_base, replace = TRUE, prob = customers$rate_per_day)
base <- data.table(customer_id = customers$customer_id[cust_idx])
base <- merge(base, customers[, .(customer_id, segment)], by = "customer_id", all.x = TRUE)
# Assign day and time
base[, day := sample.int(p$days, .N, replace = TRUE)]
base[, ts := p$start_date + days(day - 1) + .sample_time_in_day(.N, tz = tz)]
# Channel
base[, channel := sample(names(p$channel_probs), .N, replace = TRUE, prob = p$channel_probs)]
# Merchant choice: long tail (Zipf-like approximation)
merchant_probs <- (1 / (1:p$n_merchants))
merchant_probs <- merchant_probs / sum(merchant_probs)
base[, merchant_id := merchants$merchant_id[
sample.int(p$n_merchants, .N, replace = TRUE, prob = merchant_probs)
]]
# Amount
base[, amount := {
mix <- p$amount_mix[[segment[1]]]
.sample_lognorm_mix(.N, mix)
}, by = segment]
# Devices / IPs: baseline churn with per-customer pools
base <- base[order(customer_id, ts)]
base[, device_seq := 0L]
base[, ip_seq := 0L]
base[, `:=`(
device_id = {
dev <- character(.N)
pool <- character(0)
for (i in seq_len(.N)) {
if (length(pool) == 0 || runif(1) < p$p_new_device) {
pool <- c(pool, paste0("D", sample.int(9e9, 1)))
}
dev[i] <- sample(pool, 1)
}
dev
},
ip_subnet = {
ip <- character(.N)
pool <- character(0)
for (i in seq_len(.N)) {
if (length(pool) == 0 || runif(1) < p$p_new_ip) {
pool <- c(pool, paste0("IP", sample.int(1e6, 1)))
}
ip[i] <- sample(pool, 1)
}
ip
}
), by = customer_id]
base[, `:=`(
is_fraud = 0L,
fraud_type = NA_character_,
fraud_event_id = NA_character_
)]
# Fraud injection helpers
.mark_fraud <- function(dt, idx, ftype, eid) {
dt[idx, `:=`(is_fraud = 1L, fraud_type = ftype, fraud_event_id = eid)]
}
# ATO events
n_ato_cust <- max(1L, round(p$n_customers * p$ato_customer_rate))
ato_customers <- sample(customers$customer_id, n_ato_cust)
for (cid in ato_customers) {
burst_n <- sample(p$ato_burst_range[1]:p$ato_burst_range[2], 1)
day0 <- sample.int(p$days, 1)
t0 <- p$start_date + days(day0 - 1) + hours(sample(0:23, 1))
evt_id <- paste0("ATO_", cid, "_", day0)
# create burst tx rows
dt <- data.table(
customer_id = cid,
segment = customers[customer_id == cid, segment][1],
day = day0,
ts = sort(t0 + seconds(sample(0:(3600 * 6), burst_n, replace = TRUE))), # within 6h
channel = sample(c("mobile", "web"), burst_n, replace = TRUE, prob = c(0.8, 0.2)),
merchant_id = merchants$merchant_id[sample.int(p$n_merchants, burst_n, replace = TRUE, prob = merchant_probs)],
amount = NA_real_,
device_id = NA_character_,
ip_subnet = NA_character_,
is_fraud = 1L,
fraud_type = "ATO",
fraud_event_id = evt_id
)
# amounts: baseline * multiplier
base_mix <- p$amount_mix[[dt$segment[1]]]
dt[, amount := .sample_lognorm_mix(.N, base_mix) * rlnorm(.N, p$ato_amount_mult$mu, p$ato_amount_mult$sigma)]
# forced novelty
dt[, device_id := ifelse(runif(.N) < p$ato_p_new_device, paste0("D", sample.int(9e9, .N)), base[customer_id == cid, device_id][1])]
dt[, ip_subnet := ifelse(runif(.N) < p$ato_p_new_ip, paste0("IP", sample.int(1e6, .N)), base[customer_id == cid, ip_subnet][1])]
# shift to night sometimes
if (runif(1) < p$ato_night_rate) {
dt[, ts := as.POSIXct(floor_date(ts, "day"), tz = tz) + hours(sample(0:5, .N, replace = TRUE)) + minutes(sample(0:59, .N, TRUE))]
}
base <- rbind(base, dt, fill = TRUE)
}
# Smurfing events
n_smurf_cust <- max(1L, round(p$n_customers * p$smurf_customer_rate))
smurf_customers <- sample(setdiff(customers$customer_id, ato_customers), n_smurf_cust)
for (cid in smurf_customers) {
burst_n <- sample(p$smurf_burst_range[1]:p$smurf_burst_range[2], 1)
day0 <- sample.int(p$days, 1)
t0 <- p$start_date + days(day0 - 1) + hours(sample(8:22, 1))
evt_id <- paste0("SMURF_", cid, "_", day0)
dt <- data.table(
customer_id = cid,
segment = customers[customer_id == cid, segment][1],
day = day0,
ts = sort(t0 + seconds(sample(0:(3600 * 3), burst_n, replace = TRUE))), # within 3h
channel = sample(c("mobile", "ussd"), burst_n, replace = TRUE, prob = c(0.6, 0.4)),
merchant_id = NA_character_,
amount = rlnorm(burst_n, p$smurf_amount$mu, p$smurf_amount$sigma),
device_id = base[customer_id == cid, device_id][1],
ip_subnet = base[customer_id == cid, ip_subnet][1],
is_fraud = 1L,
fraud_type = "SMURF",
fraud_event_id = evt_id
)
if (isTRUE(p$smurf_many_payees)) {
# many distinct payees
dt[, merchant_id := merchants$merchant_id[sample.int(p$n_merchants, .N, replace = TRUE, prob = merchant_probs)]]
} else {
dt[, merchant_id := merchants$merchant_id[sample.int(p$n_merchants, 1, prob = merchant_probs)]]
}
base <- rbind(base, dt, fill = TRUE)
}
# Collusion merchants
n_coll_m <- max(1L, round(p$n_merchants * p$collusive_merchant_rate))
coll_merchants <- sample(merchants$merchant_id, n_coll_m)
for (mid in coll_merchants) {
day0 <- sample.int(p$days, 1)
n_tx <- sample(p$collusion_tx_per_day_range[1]:p$collusion_tx_per_day_range[2], 1)
# many customers
cid <- sample(customers$customer_id, n_tx, replace = TRUE, prob = customers$rate_per_day)
evt_id <- paste0("COLL_", mid, "_", day0)
dt <- data.table(
customer_id = cid,
segment = customers[match(cid, customer_id), segment],
day = day0,
ts = p$start_date + days(day0 - 1) + .sample_time_in_day(n_tx, tz = tz),
channel = "pos",
merchant_id = mid,
amount = NA_real_,
device_id = paste0("D", sample.int(9e9, n_tx)),
ip_subnet = paste0("IP", sample.int(1e6, n_tx)),
is_fraud = 1L,
fraud_type = "COLLUSION",
fraud_event_id = evt_id
)
# amounts: uplift
dt[, amount := .sample_lognorm_mix(.N, p$amount_mix[[segment[1]]]) *
rlnorm(.N, p$collusion_amount_mult$mu, p$collusion_amount_mult$sigma)
, by = segment]
base <- rbind(base, dt, fill = TRUE)
}
# Re-sort and finalize ids
base <- base[order(customer_id, ts)]
base[, tx_id := sprintf("T%010d", .I)]
setcolorder(base, c("tx_id", "ts", "day", "customer_id", "segment", "channel",
"merchant_id", "amount", "device_id", "ip_subnet",
"is_fraud", "fraud_type", "fraud_event_id"))
# Optional: simulate label delay (visibility of fraud confirmation)
# - confirmed_fraud_time = event start + delay
# Used later for detection latency
delays <- sample(seq(p$label_delay_days[1], p$label_delay_days[2]), 1)
base[, confirmed_fraud_ts := {
if (is.na(fraud_event_id[1])) return(as.POSIXct(NA, tz = tz))
evt_start <- min(ts)
evt_start + days(delays)
}, by = fraud_event_id]
list(
tx = base,
customers = customers,
merchants = merchants,
params = p
)
}
R/03_features.R
# R/03_features.R
# Customer-day aggregation (default). You can change to customer-hour by changing the window key.
make_customer_day_windows <- function(tx_dt) {
dt <- copy(tx_dt)
dt[, date := as.Date(ts, tz = "Africa/Lagos")]
setorder(dt, customer_id, ts)
# Inter-transaction gap per customer
dt[, itx_sec := as.numeric(difftime(ts, shift(ts), units = "secs")), by = customer_id]
# Night flag (local)
dt[, hour := lubridate::hour(ts)]
dt[, is_night := as.integer(hour %in% 0:5)]
# Windowed aggregation
agg <- dt[, .(
n_tx = .N,
n_merchants = uniqueN(merchant_id),
n_devices = uniqueN(device_id),
n_ips = uniqueN(ip_subnet),
night_rate = mean(is_night, na.rm = TRUE),
# channel mix (numeric proportions)
p_mobile = mean(channel == "mobile"),
p_ussd = mean(channel == "ussd"),
p_web = mean(channel == "web"),
p_pos = mean(channel == "pos"),
# novelty proxies inside same day
# (in real systems you’d compute "new vs last 30d"; here we approximate)
device_entropy = {
tab <- table(device_id)
probs <- tab / sum(tab)
-sum(probs * log(pmax(probs, 1e-12)))
},
ip_entropy = {
tab <- table(ip_subnet)
probs <- tab / sum(tab)
-sum(probs * log(pmax(probs, 1e-12)))
},
# labels (any fraud txn in window)
y_fraud = as.integer(any(is_fraud == 1L)),
y_fraud_type = {
ft <- unique(na.omit(fraud_type))
if (length(ft) == 0) NA_character_ else ft[1]
},
first_fraud_ts = {
if (any(is_fraud == 1L)) min(ts[is_fraud == 1L]) else as.POSIXct(NA, tz = "Africa/Lagos")
},
confirmed_fraud_ts = {
# if multiple fraud tx, these should align by event in simulation
suppressWarnings(min(confirmed_fraud_ts, na.rm = TRUE))
}
), by = .(customer_id, segment, date)]
# Seven-point summaries for numeric streams
# amount (raw and log), itx (inter-tx sec), velocity proxy (amount per tx)
agg_num <- dt[, .(
amount_vec = list(amount),
log_amount_vec = list(log1p(amount)),
itx_vec = list(itx_sec[is.finite(itx_sec) & itx_sec >= 0]),
amt_per_tx_vec = list(amount / pmax(1, .N))
), by = .(customer_id, date)]
agg <- merge(agg, agg_num, by = c("customer_id", "date"), all.x = TRUE)
# Apply box7 to each vector; expand columns
box_cols <- function(vec_col, prefix) {
tmp <- agg[[vec_col]]
bmat <- t(vapply(tmp, function(v) box7_vec(unlist(v), rule = "auto"), numeric(7)))
colnames(bmat) <- paste0(prefix, "_", colnames(bmat))
as.data.table(bmat)
}
agg <- cbind(
agg,
box_cols("amount_vec", "amt"),
box_cols("log_amount_vec", "logamt"),
box_cols("itx_vec", "itx"),
box_cols("amt_per_tx_vec", "aptx")
)
# Drop list columns
agg[, c("amount_vec", "log_amount_vec", "itx_vec", "amt_per_tx_vec") := NULL]
agg[]
}
# Build modeling matrix and metadata
prepare_model_frame <- function(win_dt) {
dt <- copy(win_dt)
# Categorical columns (keep as factors for Gower)
dt[, segment := as.factor(segment)]
# You can add more categorical modes if desired:
dt[, dominant_channel := factor(
c("mobile","ussd","web","pos")[max.col(as.matrix(dt[, .(p_mobile, p_ussd, p_web, p_pos)]), ties.method = "first")],
levels = c("mobile","ussd","web","pos")
)]
# Define features: exclude label/time id columns
id_cols <- c("customer_id", "date")
label_cols <- c("y_fraud", "y_fraud_type", "first_fraud_ts", "confirmed_fraud_ts")
feature_cols <- setdiff(names(dt), c(id_cols, label_cols))
list(
dt = dt,
id_cols = id_cols,
label_cols = label_cols,
feature_cols = feature_cols
)
}
R/04_distances.R
# R/04_distances.R
# Robust scaling helpers
robust_center_scale <- function(x, method = c("iqr", "mad")) {
method <- match.arg(method)
med <- median(x, na.rm = TRUE)
scl <- if (method == "iqr") IQR(x, na.rm = TRUE) else mad_robust(x)
if (!is.finite(scl) || scl < .Machine$double.eps) scl <- 1.0
list(center = med, scale = scl)
}
# Fence-based unit scaling (0..1) for robust Gower preprocessing
robust_unit_scale <- function(x, rule = c("auto", "tukey", "adjusted", "mad"), k = 1.5) {
b <- box7(x, rule = rule, k = k)
lf <- b$lf
uf <- b$uf
if (!is.finite(lf) || !is.finite(uf) || abs(uf - lf) < .Machine$double.eps) {
# fallback to median+MAD
med <- median(x, na.rm = TRUE)
madv <- mad_robust(x)
lf <- med - 3 * madv
uf <- med + 3 * madv
if (!is.finite(lf) || !is.finite(uf) || abs(uf - lf) < .Machine$double.eps) {
return(rep(0.5, length(x)))
}
}
xc <- pmin(pmax(x, lf), uf)
(xc - lf) / (uf - lf)
}
# Prepare data for Gower:
# - numeric columns scaled 0..1 via robust_unit_scale
# - factors kept as factors
preprocess_for_robust_gower <- function(df, numeric_cols, factor_cols) {
out <- df
for (cn in numeric_cols) {
out[[cn]] <- robust_unit_scale(out[[cn]], rule = "auto")
}
for (fc in factor_cols) {
out[[fc]] <- as.factor(out[[fc]])
}
out
}
# Robust Gower distance (returns dist object)
dist_robust_gower <- function(df, numeric_cols, factor_cols, weights = NULL) {
x <- preprocess_for_robust_gower(df, numeric_cols, factor_cols)
# daisy supports weights; metric="gower" generalizes Gower for mixed types
cluster::daisy(x, metric = "gower", weights = weights)
}
# L1 / Manhattan distance on robustly centered-scaled numeric features
dist_l1_robust <- function(df_num, scale_method = c("iqr", "mad")) {
scale_method <- match.arg(scale_method)
X <- as.matrix(df_num)
for (j in seq_len(ncol(X))) {
cs <- robust_center_scale(X[, j], method = scale_method)
X[, j] <- (X[, j] - cs$center) / cs$scale
}
stats::dist(X, method = "manhattan")
}
# Rank-based L1 distance (robust to monotone transforms)
dist_rank_l1 <- function(df_num) {
X <- as.matrix(df_num)
Xr <- apply(X, 2, rank, ties.method = "average", na.last = "keep")
stats::dist(Xr, method = "manhattan")
}
# Robust Mahalanobis pairwise distance:
# d(i,j) = sqrt( (xi-xj)' Sigma^{-1} (xi-xj) )
# Use for smaller n due to O(n^2).
dist_mahalanobis_robust <- function(df_num, cov_method = c("mcd", "ogk"), scale_first = TRUE) {
cov_method <- match.arg(cov_method)
X <- as.matrix(df_num)
if (scale_first) {
for (j in seq_len(ncol(X))) {
cs <- robust_center_scale(X[, j], method = "iqr")
X[, j] <- (X[, j] - cs$center) / cs$scale
}
}
cov_fit <- if (cov_method == "mcd") robustbase::covMcd(X) else robustbase::covOGK(X)
Sigma <- cov_fit$cov
Sinv <- solve(Sigma)
n <- nrow(X)
D <- matrix(0, n, n)
for (i in 1:(n-1)) {
xi <- X[i, ]
for (j in (i+1):n) {
d <- X[j, ] - xi
D[i, j] <- sqrt(drop(t(d) %*% Sinv %*% d))
D[j, i] <- D[i, j]
}
}
as.dist(D)
}
# Optional: distance-correlation-based feature weighting for numeric features.
# We compute a dCor matrix among features and downweight redundant features.
feature_weights_dcor <- function(df_num, eps = 1e-6) {
X <- as.matrix(df_num)
p <- ncol(X)
dc <- matrix(0, p, p)
for (i in 1:p) {
for (j in i:p) {
dc[i, j] <- energy::dcor(X[, i], X[, j])
dc[j, i] <- dc[i, j]
}
}
# Redundancy score: sum of dependencies with others
red <- rowSums(dc) - diag(dc)
w <- 1 / (eps + red)
w / sum(w)
}
R/05_clustering.R
# R/05_clustering.R
# Standardized clustering output:
# list(cluster = integer labels (0 = noise), score = numeric anomaly score (higher = more anomalous),
# model = raw fitted object, method = string)
cluster_hdbscan <- function(x, minPts = 15) {
# x can be a numeric matrix (Euclidean) or a dist object (arbitrary metric)
fit <- dbscan::hdbscan(x, minPts = minPts)
# dbscan's hdbscan returns outlier_scores (GLOSH computed automatically)
list(
cluster = fit$cluster,
score = fit$outlier_scores,
model = fit,
method = "hdbscan_glosh"
)
}
cluster_dbscan <- function(x, eps, minPts = 15, score_method = c("lof", "knn_dist")) {
score_method <- match.arg(score_method)
fit <- dbscan::dbscan(x, eps = eps, minPts = minPts)
# Outlier scoring proxies
score <- rep(NA_real_, length(fit$cluster))
if (score_method == "lof") {
# LOF expects numeric matrix; for dist objects, we skip LOF and use kNNdist if possible.
if (!inherits(x, "dist")) {
score <- dbscan::lof(x, minPts = minPts)
} else {
score <- as.numeric(fit$cluster == 0L) # crude fallback
}
} else {
if (!inherits(x, "dist")) {
score <- dbscan::kNNdist(x, k = minPts)
} else {
score <- as.numeric(fit$cluster == 0L)
}
}
list(cluster = fit$cluster, score = score, model = fit, method = paste0("dbscan_", score_method))
}
cluster_hierarchical <- function(d, k = 8, linkage = c("average", "complete")) {
linkage <- match.arg(linkage)
hc <- stats::hclust(d, method = linkage)
cl <- stats::cutree(hc, k = k)
# Anomaly score proxy: distance to cluster medoid (by distance matrix)
D <- as.matrix(d)
score <- rep(0, nrow(D))
for (g in unique(cl)) {
idx <- which(cl == g)
subD <- D[idx, idx, drop = FALSE]
medoid_local <- idx[which.min(rowSums(subD))]
score[idx] <- D[idx, medoid_local]
}
list(cluster = cl, score = score, model = hc, method = "hierarchical_medoid_dist")
}
# Spectral clustering with self-tuning (local scaling) affinity (Zelnik-Manor & Perona).
# - start from a robust distance matrix d
# - uses kNN-based local scales sigma_i
cluster_spectral_selftuning <- function(d, k = 8, knn_k = 10) {
D <- as.matrix(d)
n <- nrow(D)
# local scale sigma_i = distance to knn_k-th neighbor
sigma <- apply(D, 1, function(row) {
vv <- sort(row, partial = knn_k + 1)[knn_k + 1] # +1 for self-distance 0
if (!is.finite(vv) || vv <= 0) vv <- median(row[row > 0], na.rm = TRUE)
vv
})
W <- matrix(0, n, n)
for (i in 1:n) {
for (j in i:n) {
if (i == j) next
w <- exp(- (D[i, j]^2) / (sigma[i] * sigma[j] + 1e-12))
W[i, j] <- w
W[j, i] <- w
}
}
# normalized Laplacian L = I - D^{-1/2} W D^{-1/2}
deg <- rowSums(W)
deg[deg == 0] <- 1e-12
Dm12 <- diag(1 / sqrt(deg))
L <- diag(n) - Dm12 %*% W %*% Dm12
eig <- RSpectra::eigs_sym(L, k = k, which = "SM")
U <- eig$vectors
# normalize rows (Ng-Jordan-Weiss style normalization is common; we keep it simple)
U <- U / sqrt(rowSums(U^2) + 1e-12)
km <- stats::kmeans(U, centers = k, nstart = 20)
cl <- km$cluster
# anomaly proxy: low max affinity to assigned cluster centroid in U-space
cent <- km$centers[cl, , drop = FALSE]
score <- rowSums((U - cent)^2)
list(cluster = cl, score = score, model = list(W = W, L = L, km = km, eig = eig), method = "spectral_selftuning")
}
# Contaminated mixture clustering (numeric only) using ContaminatedMixt::CNmixt
cluster_contaminated_mixture <- function(df_num, G = 2:10, criterion = "BIC") {
X <- as.matrix(df_num)
fit <- ContaminatedMixt::CNmixt(X = X, G = G, verbose = FALSE)
best <- ContaminatedMixt::getBestModel(fit, criterion = criterion)
cl <- ContaminatedMixt::getCluster(best)
# Detection output (posterior probabilities of bad points are available via getDetection)
det <- tryCatch(ContaminatedMixt::getDetection(best), error = function(e) NULL)
# Score: if detection structure exists, use it; else proxy by Mahalanobis to component mean
score <- if (!is.null(det) && "D" %in% names(det)) {
as.numeric(det$D)
} else {
rep(0, length(cl))
}
list(cluster = cl, score = score, model = best, method = "contaminatedmixt")
}
R/06_evaluation_plots.R
# R/06_evaluation_plots.R
# Metrics:
# - AUROC via pROC
# - AUPRC via PRROC
# - precision@k
# - detection latency (per fraud event or per window, depending on labels)
precision_at_k <- function(y, score, k) {
ord <- order(score, decreasing = TRUE)
topk <- ord[seq_len(min(k, length(ord)))]
sum(y[topk] == 1L) / length(topk)
}
recall_at_k <- function(y, score, k) {
ord <- order(score, decreasing = TRUE)
topk <- ord[seq_len(min(k, length(ord)))]
tp <- sum(y[topk] == 1L)
pos <- sum(y == 1L)
if (pos == 0) return(NA_real_)
tp / pos
}
compute_auc_metrics <- function(y, score) {
y <- as.integer(y)
# AUROC
roc_obj <- pROC::roc(response = y, predictor = score, quiet = TRUE, direction = "auto")
auroc <- as.numeric(pROC::auc(roc_obj))
# AUPRC
pos <- score[y == 1L]
neg <- score[y == 0L]
pr_obj <- PRROC::pr.curve(scores.class0 = pos, scores.class1 = neg, curve = TRUE)
auprc <- as.numeric(pr_obj$auc.integral)
list(auroc = auroc, auprc = auprc, roc_obj = roc_obj, pr_obj = pr_obj)
}
plot_roc_pr <- function(metrics, prefix = "model", out_dir = file.path("output", "figures")) {
# ROC data
roc_df <- data.frame(
fpr = 1 - metrics$roc_obj$specificities,
tpr = metrics$roc_obj$sensitivities
)
p_roc <- ggplot(roc_df, aes(x = fpr, y = tpr)) +
geom_line() +
geom_abline(intercept = 0, slope = 1, linetype = 2) +
theme_minimal() +
labs(title = paste0("ROC: ", prefix), x = "False Positive Rate", y = "True Positive Rate")
ggsave(file.path(out_dir, paste0(prefix, "_roc.png")), p_roc, width = 7, height = 5, dpi = 150)
# PR data from PRROC object
pr_curve <- metrics$pr_obj$curve
pr_df <- data.frame(recall = pr_curve[, 1], precision = pr_curve[, 2])
p_pr <- ggplot(pr_df, aes(x = recall, y = precision)) +
geom_line() +
theme_minimal() +
labs(title = paste0("PR: ", prefix), x = "Recall", y = "Precision")
ggsave(file.path(out_dir, paste0(prefix, "_pr.png")), p_pr, width = 7, height = 5, dpi = 150)
list(p_roc = p_roc, p_pr = p_pr)
}
plot_cluster_pca <- function(df_num, cluster, score, prefix = "cluster", out_dir = file.path("output", "figures")) {
X <- as.matrix(df_num)
# robust-ish: replace NA with column medians
for (j in seq_len(ncol(X))) {
med <- median(X[, j], na.rm = TRUE)
X[is.na(X[, j]), j] <- med
}
pca <- prcomp(X, center = TRUE, scale. = TRUE)
z <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2],
cluster = factor(cluster),
score = score)
p <- ggplot(z, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.6) +
theme_minimal() +
labs(title = paste0("PCA cluster view: ", prefix))
ggsave(file.path(out_dir, paste0(prefix, "_pca.png")), p, width = 7, height = 5, dpi = 150)
p
}
# Detection latency at window granularity:
# - if you have first_fraud_ts and an alert time, latency can be computed.
# Here: define alert if score >= threshold; alert_time = window end time (date + 1 day)
compute_latency <- function(win_dt, score, threshold) {
dt <- copy(win_dt)
dt[, score := score]
dt[, alert := as.integer(score >= threshold)]
dt[, window_end := as.POSIXct(date + 1, tz = "Africa/Lagos")]
# latency for windows that contain fraud and are alerted
dt_f <- dt[!is.na(first_fraud_ts)]
dt_f[, latency_hours := as.numeric(difftime(window_end, first_fraud_ts, units = "hours"))]
dt_f_alert <- dt_f[alert == 1L]
list(
n_fraud_windows = nrow(dt_f),
n_alerted_fraud_windows = nrow(dt_f_alert),
latency_median_h = median(dt_f_alert$latency_hours, na.rm = TRUE),
latency_p95_h = quantile(dt_f_alert$latency_hours, 0.95, na.rm = TRUE)
)
}
main_run.R
# main_run.R
source("R/00_packages.R")
source("R/01_box7.R")
source("R/02_simulate.R")
source("R/03_features.R")
source("R/04_distances.R")
source("R/05_clustering.R")
source("R/06_evaluation_plots.R")
# 1) Simulate data
sim <- simulate_transactions(default_sim_params())
tx <- sim$tx
fwrite(tx, file.path("output", "data", "transactions.csv.gz"))
# 2) Build customer-day windows
win <- make_customer_day_windows(tx)
fwrite(win, file.path("output", "data", "customer_day_windows.csv.gz"))
# 3) Prepare modeling frame
mf <- prepare_model_frame(win)
dt <- mf$dt
# Identify numeric/factor columns for distance selection
feature_df <- dt[, ..mf$feature_cols]
factor_cols <- names(feature_df)[vapply(feature_df, is.factor, logical(1))]
numeric_cols <- setdiff(names(feature_df), factor_cols)
# For numeric-only variants
df_num <- feature_df[, ..numeric_cols]
# 4) Choose distance + clustering defaults
# Default: robust Gower + HDBSCAN (minPts tuned)
max_precompute <- 6000
n <- nrow(feature_df)
if (n > max_precompute) {
message("n=", n, " exceeds max_precompute=", max_precompute, "; sampling for demonstration.")
idx <- sample.int(n, max_precompute)
feature_df_run <- feature_df[idx, ]
dt_run <- dt[idx, ]
df_num_run <- df_num[idx, ]
} else {
feature_df_run <- feature_df
dt_run <- dt
df_num_run <- df_num
}
# Robust Gower distance (dist object)
d_gower <- dist_robust_gower(feature_df_run, numeric_cols = numeric_cols, factor_cols = factor_cols)
# HDBSCAN + GLOSH scoring
fit <- cluster_hdbscan(d_gower, minPts = 20)
# 5) Evaluate performance on simulated labels
y <- dt_run$y_fraud
score <- fit$score
metrics <- compute_auc_metrics(y, score)
# precision@k
k_list <- c(50, 100, 200, 500)
pk <- sapply(k_list, function(k) precision_at_k(y, score, k))
rk <- sapply(k_list, function(k) recall_at_k(y, score, k))
# latency (choose threshold at top 1% alerts)
thr <- quantile(score, 0.99, na.rm = TRUE)
lat <- compute_latency(dt_run, score, threshold = thr)
# Save metrics
out_metrics <- data.table(
method = fit$method,
n = length(y),
fraud_rate = mean(y == 1L),
auroc = metrics$auroc,
auprc = metrics$auprc,
threshold_p99 = thr,
latency_median_h = lat$latency_median_h,
latency_p95_h = lat$latency_p95_h
)
for (i in seq_along(k_list)) {
out_metrics[[paste0("precision@", k_list[i])]] <- pk[i]
out_metrics[[paste0("recall@", k_list[i])]] <- rk[i]
}
fwrite(out_metrics, file.path("output", "metrics", "metrics_summary.csv"))
# 6) Plots to disk
plot_roc_pr(metrics, prefix = fit$method)
plot_cluster_pca(df_num_run, cluster = fit$cluster, score = score, prefix = fit$method)
# 7) Example 7-point boxplots by cluster for log amount median summaries (diagnostic)
# Build cluster-level summary of the window-level logamt_med (as an example numeric variable)
diag_dt <- data.table(cluster = as.factor(fit$cluster), logamt_med = dt_run$logamt_med)
box_by_cluster <- diag_dt[, {
b <- box7(logamt_med, rule = "auto")
.(min = b$min, lf = b$lf, q1 = b$q1, med = b$med, q3 = b$q3, uf = b$uf, max = b$max)
}, by = cluster]
plot_box7(
box_by_cluster,
x_col = "cluster",
title = "Seven-point boxplot of window log(amount) medians by cluster",
out_path = file.path("output", "figures", paste0(fit$method, "_box7_logamt_med_by_cluster.png"))
)
message("Done. Outputs in output/ directory.")