# Packages this analysis depends on.
packages <- c(
  "tidyverse", # data wrangling + plots
  "readr",     # fast CSV reader
  "janitor",   # clean_names()
  "FNN",       # kNN distances
  "isotree",   # Isolation Forest
  "scales"     # percent formatting
)

# Install only what is missing. system.file() returns "" for a package that is
# not installed, so there is no need to scan the whole library with
# installed.packages(), which R documents as slow and not intended for
# checking whether a named package is installed.
installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1),
  USE.NAMES = FALSE
)
if (any(!installed)) {
  install.packages(packages[!installed])
}
library(tidyverse) # data wrangling + plots
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr) # fast CSV reader
library(janitor) # clean_names()
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(FNN) # kNN distances
library(isotree) # Isolation Forest
library(scales) # percent formatting
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Load the transactions and standardise the column names
# (e.g. `Class` becomes `class`).
raw <- clean_names(read_csv("cardanomaly.csv"))
## Rows: 71201 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (30): V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Additional packages: installed on demand, then attached.
more_pkgs <- c("patchwork", "GGally", "pROC", "PRROC")

# system.file() returns "" for a package that is not installed; this avoids
# the slow full-library scan done by installed.packages().
to_install <- more_pkgs[
  !vapply(
    more_pkgs,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1),
    USE.NAMES = FALSE
  )
]
if (length(to_install) > 0) {
  install.packages(to_install)
}

# Attach every package; invisible() hides the list that lapply() returns.
invisible(lapply(more_pkgs, library, character.only = TRUE))
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
# Detect the fraud label column (here it's 'class' after clean_names())
if (!"class" %in% names(raw)) {
  stop("Expected a 'Class' column (becomes 'class' after clean_names()).")
}

# Keep the label ONLY for later evaluation, recoded to integer 0/1
# (1 when the original value is 1 or TRUE, 0 otherwise).
df <- raw %>%
  rename(.label = class) %>%
  mutate(.label = as.integer(.label %in% c(1, TRUE)))

# ----- FEATURES FOR MODELING: V1..V28 + amount (NO label used) -----
# After clean_names(), these are v1..v28 and amount.
feature_cols <- c(
  grep("^v\\d+$", names(df), value = TRUE, perl = TRUE), # v1, v2, ..., v28
  "amount"
)
# Keep only features that truly exist (robust to slight schema diffs)
feature_cols <- feature_cols[feature_cols %in% names(df)]
num_features <- df %>%
  select(all_of(feature_cols))

# Basic stats
n_obs <- nrow(df)
fraud_rate <- mean(df$.label == 1)
# The fraud rate is far below 1%, and the default formatting rendered it as
# "0%"; accuracy = 0.01 prints it with two decimals instead.
cat(
  "Rows:", n_obs,
  "\nFraud rate:", scales::percent(fraud_rate, accuracy = 0.01), "\n"
)
## Rows: 71201
## Fraud rate: 0.15%
summary(num_features)
## v1 v2 v3
## Min. :-41.928738 Min. :-40.80398 Min. :-48.325589
## 1st Qu.: -0.919575 1st Qu.: -0.59911 1st Qu.: -0.896589
## Median : 0.009672 Median : 0.07157 Median : 0.173557
## Mean : -0.003489 Mean : 0.00243 Mean : -0.001004
## 3rd Qu.: 1.315882 3rd Qu.: 0.80898 3rd Qu.: 1.024471
## Max. : 2.422508 Max. : 18.18363 Max. : 4.040465
## v4 v5 v6
## Min. :-5.60061 Min. :-1.137e+02 Min. :-21.929312
## 1st Qu.:-0.85531 1st Qu.:-6.919e-01 1st Qu.: -0.772195
## Median :-0.02900 Median :-5.109e-02 Median : -0.281348
## Mean :-0.01412 Mean : 3.232e-03 Mean : -0.000844
## 3rd Qu.: 0.72739 3rd Qu.: 6.151e-01 3rd Qu.: 0.399624
## Max. :15.30418 Max. : 3.291e+01 Max. : 73.301626
## v7 v8 v9
## Min. :-24.377116 Min. :-41.484823 Min. :-8.739670
## 1st Qu.: -0.554153 1st Qu.: -0.210382 1st Qu.:-0.641300
## Median : 0.045554 Median : 0.021421 Median :-0.048818
## Mean : 0.006614 Mean : -0.006191 Mean : 0.004188
## 3rd Qu.: 0.578940 3rd Qu.: 0.327346 3rd Qu.: 0.593613
## Max. :120.589494 Max. : 18.748872 Max. :10.348407
## v10 v11 v12
## Min. :-18.271168 Min. :-4.682931 Min. :-17.769143
## 1st Qu.: -0.537971 1st Qu.:-0.767305 1st Qu.: -0.409474
## Median : -0.095508 Median :-0.040311 Median : 0.134437
## Mean : 0.002199 Mean :-0.005878 Mean : -0.001198
## 3rd Qu.: 0.445837 3rd Qu.: 0.730657 3rd Qu.: 0.614638
## Max. : 15.331742 Max. :12.018913 Max. : 4.846452
## v13 v14 v15
## Min. :-3.8489431 Min. :-19.214326 Min. :-4.025373
## 1st Qu.:-0.6491476 1st Qu.: -0.429421 1st Qu.:-0.575685
## Median :-0.0162865 Median : 0.049514 Median : 0.051620
## Mean : 0.0008819 Mean : -0.005272 Mean : 0.002312
## 3rd Qu.: 0.6619683 3rd Qu.: 0.488973 3rd Qu.: 0.648783
## Max. : 7.1268830 Max. : 7.421944 Max. : 8.877742
## v16 v17 v18
## Min. :-1.239e+01 Min. :-22.541652 Min. :-7.9867207
## 1st Qu.:-4.701e-01 1st Qu.: -0.484179 1st Qu.:-0.4990526
## Median : 6.392e-02 Median : -0.063811 Median :-0.0053602
## Mean : 2.887e-04 Mean : 0.003883 Mean : 0.0005761
## 3rd Qu.: 5.243e-01 3rd Qu.: 0.403678 3rd Qu.: 0.4983144
## Max. : 1.732e+01 Max. : 7.140627 Max. : 3.8642284
## v19 v20 v21
## Min. :-4.932733 Min. :-54.497720 Min. :-2.162e+01
## 1st Qu.:-0.458371 1st Qu.: -0.210592 1st Qu.:-2.297e-01
## Median :-0.002111 Median : -0.061306 Median :-3.044e-02
## Mean :-0.003221 Mean : 0.002924 Mean :-3.893e-04
## 3rd Qu.: 0.454514 3rd Qu.: 0.134799 3rd Qu.: 1.854e-01
## Max. : 5.591971 Max. : 23.649095 Max. : 2.720e+01
## v22 v23 v24
## Min. :-8.887017 Min. :-44.807735 Min. :-2.8078970
## 1st Qu.:-0.542334 1st Qu.: -0.160811 1st Qu.:-0.3554761
## Median : 0.005080 Median : -0.011084 Median : 0.0427464
## Mean :-0.003415 Mean : 0.003038 Mean : 0.0005425
## 3rd Qu.: 0.523246 3rd Qu.: 0.148437 3rd Qu.: 0.4392306
## Max. : 8.272233 Max. : 18.946734 Max. : 4.5845491
## v25 v26 v27
## Min. :-1.030e+01 Min. :-2.241620 Min. :-8.878665
## 1st Qu.:-3.178e-01 1st Qu.:-0.327995 1st Qu.:-0.070851
## Median : 1.481e-02 Median :-0.052520 Median : 0.002269
## Mean :-1.108e-04 Mean :-0.001554 Mean : 0.002377
## 3rd Qu.: 3.523e-01 3rd Qu.: 0.236142 3rd Qu.: 0.092484
## Max. : 5.521e+00 Max. : 3.415636 Max. :31.612198
## v28 amount
## Min. :-15.430084 Min. : 0.00
## 1st Qu.: -0.053209 1st Qu.: 5.49
## Median : 0.011437 Median : 22.00
## Mean : -0.001304 Mean : 88.24
## 3rd Qu.: 0.079782 3rd Qu.: 77.06
## Max. : 15.415925 Max. :25691.16
# Scale numeric features (training remains UNSUPERVISED; label unused here).
# Every column is coerced to numeric, then centred and scaled by scale();
# the resulting matrix is converted back to a data frame.
X <- num_features %>%
  mutate(across(everything(), as.numeric)) %>%
  scale() %>%
  as.data.frame()

# Use observed fraud rate only to pick a threshold for evaluation.
# The share is clamped to the range [1e-4, 0.5].
contamination <- max(1e-4, min(0.5, mean(df$.label == 1)))
# accuracy = 0.01 keeps two decimals; the default formatting rendered this
# sub-1% share as "0%".
cat(
  "Using contamination (threshold share) =",
  scales::percent(contamination, accuracy = 0.01), "\n"
)
## Using contamination (threshold share) = 0.15%
# ---- kNN outlier score (k-distance) ----
# Each transaction is scored by the distance to its k-th nearest neighbour:
# the larger that distance, the more isolated (anomalous) the point.
k <- 10 # small, simple default; can try 5~20
knn_out <- get.knn(data = X, k = k)

# Row-wise maximum over the k neighbour distances = the k-distance.
knn_score <- apply(knn_out$nn.dist, 1, max)

# The cut-off is the (1 - contamination) quantile, so the top `contamination`
# share of scores is flagged as outliers (1) and the rest as inliers (0).
knn_cut <- quantile(knn_score, probs = 1 - contamination, na.rm = TRUE)
knn_pred_outlier <- as.integer(knn_score >= knn_cut)

# Scores and flags side by side, one row per transaction.
res_knn <- tibble(knn_score = knn_score, knn_is_outlier = knn_pred_outlier)
summary(res_knn$knn_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.190 1.872 2.121 2.671 193.474
#------
# Score histogram; the dashed vertical line marks the outlier cut-off.
ggplot(tibble(score = res_knn$knn_score), aes(x = score)) +
  geom_histogram(bins = 60) +
  geom_vline(xintercept = knn_cut, linetype = 2) +
  labs(title = "kNN Outlier Score (k-distance)", x = "Score", y = "Count")
# ---- Isolation Forest outlier score ----
# Fit on the scaled features only (unsupervised); the seed fixes the forest.
iso <- isolation.forest(X, ntrees = 200, seed = 42)
## Warning in isolation.forest(X, ntrees = 200, seed = 42): Attempting to use more
## than 1 thread, but package was compiled without OpenMP support. See
## https://github.com/david-cortes/installing-optimized-libraries#4-macos-install-and-enable-openmp

# Anomaly score per transaction (higher = more anomalous).
iso_score <- predict(iso, X, type = "score")

# Same rule as for kNN: flag the top `contamination` share of scores.
iso_cut <- quantile(iso_score, probs = 1 - contamination, na.rm = TRUE)
iso_pred_outlier <- as.integer(iso_score >= iso_cut)

# Scores and flags side by side, one row per transaction.
res_iso <- tibble(iso_score = iso_score, iso_is_outlier = iso_pred_outlier)
summary(res_iso$iso_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3154 0.3273 0.3363 0.3438 0.3496 0.8532
#----
# Score histogram; the dashed vertical line marks the outlier cut-off.
ggplot(tibble(score = res_iso$iso_score), aes(x = score)) +
  geom_histogram(bins = 60) +
  geom_vline(xintercept = iso_cut, linetype = 2) +
  labs(title = "Isolation Forest Outlier Score", x = "Score", y = "Count")
Interpretation:
- Both histograms are heavily right-skewed: most transactions have very low outlier scores.
- The vertical dashed lines mark the chosen thresholds. Only the small portion to the right of each line is flagged as “outliers,” which matches the very low fraud rate in this data (106 of 71,201 transactions, about 0.15%).
- In anomaly detection, most data points are normal, and the tail of the distribution corresponds to potential frauds. kNN scores (distance-based) vary more widely; a few transactions are very isolated. Isolation Forest scores are typically bounded between 0 and 1, showing a clear separation around 0.65.
# One evaluation table: the true label next to each model's score and flag.
# The three inputs are bound column-wise, so they are matched by row position.
eval_df <- bind_cols(select(df, .label), res_knn, res_iso)
# Summarise binary-classification performance as a one-row table.
#
# truth: vector of true labels coded 0/1 (1 = positive class).
# pred:  vector of predicted labels coded 0/1, same length as `truth`.
#
# Returns a one-row tibble with the confusion-matrix counts (TP, FP, FN, TN)
# and Accuracy, Precision, Recall and F1. A metric whose denominator is zero
# (or NA) is reported as NA.
#
# Stops with an error when `truth` and `pred` differ in length.
conf_mat_simple <- function(truth, pred) {
  # Fail loudly instead of letting `&` silently recycle the shorter vector.
  if (length(truth) != length(pred)) {
    stop("`truth` and `pred` must have the same length.", call. = FALSE)
  }

  tp <- sum(truth == 1 & pred == 1)
  tn <- sum(truth == 0 & pred == 0)
  fp <- sum(truth == 0 & pred == 1)
  fn <- sum(truth == 1 & pred == 0)

  # Scalar division guarded with plain if/else; NA_real_ (not logical NA)
  # keeps every metric column of type double.
  safe_ratio <- function(num, den) {
    if (is.na(den) || den == 0) NA_real_ else num / den
  }

  acc <- (tp + tn) / length(truth)
  prec <- safe_ratio(tp, tp + fp)
  rec <- safe_ratio(tp, tp + fn) # recall = TPR (sensitivity)
  f1 <- if (is.na(prec) || is.na(rec) || (prec + rec) == 0) {
    NA_real_
  } else {
    2 * prec * rec / (prec + rec)
  }

  tibble(
    TP = tp, FP = fp, FN = fn, TN = tn,
    Accuracy = acc, Precision = prec, Recall = rec, F1 = f1
  )
}
# Score each model's outlier flags against the true label.
knn_cm <- conf_mat_simple(eval_df$.label, eval_df$knn_is_outlier)
iso_cm <- conf_mat_simple(eval_df$.label, eval_df$iso_is_outlier)

# Print each summary with the model name as the leading column.
mutate(knn_cm, Model = "Unsupervised kNN", .before = 1)
## # A tibble: 1 × 9
## Model TP FP FN TN Accuracy Precision Recall F1
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Unsupervised kNN 18 88 88 71007 0.998 0.170 0.170 0.170
mutate(iso_cm, Model = "Isolation Forest", .before = 1)
## # A tibble: 1 × 9
## Model TP FP FN TN Accuracy Precision Recall F1
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Isolation Forest 31 75 75 71020 0.998 0.292 0.292 0.292
# Draw a confusion matrix as a ggplot tile chart.
#
# truth: true labels coded 0/1, displayed as "Non-Fraud" / "Fraud".
# pred:  predicted labels coded 0/1, displayed as "Inlier" / "Outlier".
# title: plot title.
#
# Returns a ggplot object with one tile per (prediction, truth) pair,
# labelled and shaded by its count.
make_cm_long <- function(truth, pred, title) {
  truth_f <- factor(truth, levels = c(0, 1), labels = c("Non-Fraud", "Fraud"))
  pred_f <- factor(pred, levels = c(0, 1), labels = c("Inlier", "Outlier"))

  # Cross-tabulate, then reshape to long form: columns Truth, Pred and n.
  counts <- as_tibble(table(Truth = truth_f, Pred = pred_f))

  # Light-to-dark blue fill; the legend is hidden because each tile already
  # prints its count.
  fill_scale <- scale_fill_gradient(
    low = "#dbe9f6", # light blue
    high = "#08306b", # dark blue
    guide = "none"
  )
  plain_theme <- theme(
    panel.grid = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

  ggplot(counts, aes(Pred, Truth, fill = n, label = n)) +
    geom_tile(color = "white", linewidth = 1.2) +
    geom_text(size = 5, color = "black", fontface = "bold") +
    fill_scale +
    labs(title = title, x = "Prediction", y = "True class") +
    theme_minimal(base_size = 13) +
    plain_theme
}
# Build one confusion-matrix plot per model from the true labels and the
# outlier flags stored in eval_df.
p_cm_knn <- make_cm_long(eval_df$.label, eval_df$knn_is_outlier, "Confusion Matrix – kNN")
p_cm_if <- make_cm_long(eval_df$.label, eval_df$iso_is_outlier, "Confusion Matrix – Isolation Forest")
# `+` on two ggplot objects composes them into a single figure; this relies on
# the patchwork package attached earlier in the file.
p_cm_knn + p_cm_if
| Model | TP | FN | FP | TN | Precision | Recall |
|---|---|---|---|---|---|---|
| kNN | 18 | 88 | 88 | 71,007 | 17% | 17% |
| Isolation Forest | 31 | 75 | 75 | 71,020 | 29% | 29% |
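Interpretation:
- Both methods identify only a small portion of the true frauds (low recall) but produce relatively few false positives (high specificity).
- Precision equals recall for each model because the threshold flags exactly as many transactions (106) as there are true frauds, so FP = FN.
- Isolation Forest performs slightly better, catching more frauds (31 vs. 18 true positives) while keeping false positives low.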