Load the data

Load

# Packages this analysis depends on, with the role each one plays.
packages <- c(
  "tidyverse", # data wrangling + plots
  "readr", # fast CSV reader
  "janitor", # clean_names()
  "FNN", # kNN distances
  "isotree", # Isolation Forest
  "scales" # percent formatting
)

# Probe each package with requireNamespace() instead of scanning
# installed.packages(): the help page for installed.packages() advises against
# using it to find out whether a named package is installed or usable.
# `installed` stays an unnamed logical vector aligned with `packages`.
installed <- vapply(
  packages,
  requireNamespace,
  logical(1),
  quietly = TRUE,
  USE.NAMES = FALSE
)

# Install only what is missing.
if (!all(installed)) {
  install.packages(packages[!installed])
}

library(tidyverse) # data wrangling + plots

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr) # fast CSV reader
library(janitor) # clean_names()

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(FNN) # kNN distances
library(isotree) # Isolation Forest
library(scales) # percent formatting

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

# Read the transactions CSV, then normalise the column names with
# clean_names() (the later code relies on names such as v1..v28 and class).
raw <- clean_names(read_csv("cardanomaly.csv"))

## Rows: 71201 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (30): V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Additional packages attached after the data is loaded.
more_pkgs <- c("patchwork", "GGally", "pROC", "PRROC")

# requireNamespace() answers "is this package usable?" directly; the help page
# for installed.packages() advises against using it for that question.
have_pkg <- vapply(
  more_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE,
  USE.NAMES = FALSE
)
to_install <- more_pkgs[!have_pkg]
if (length(to_install) > 0) {
  install.packages(to_install)
}

# Attach every package; invisible() hides the list that lapply() returns.
invisible(lapply(more_pkgs, library, character.only = TRUE))

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

## Loading required package: rlang

## 
## Attaching package: 'rlang'

## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice

Descriptive Summary

# Detect the fraud label column (here it's 'class' after clean_names())

if (!"class" %in% names(raw)) {
  stop("Expected a 'Class' column (becomes 'class' after clean_names()).")
}

# Keep the label ONLY for later evaluation, recoded to integer 0/1:
# 1 when the original value is 1/TRUE, 0 for anything else.
df <- raw %>%
  rename(.label = class) %>%
  mutate(.label = as.integer(.label %in% c(1, TRUE)))

# ----- FEATURES FOR MODELING: V1..V28 + amount (NO label used) -----

# After clean_names(), these are v1..v28 and amount.

feature_cols <- c(
  grep("^v\\d+$", names(df), value = TRUE, perl = TRUE), # v1, v2, ..., v28
  "amount"
)

# Keep only features that truly exist (robust to slight schema diffs)

feature_cols <- feature_cols[feature_cols %in% names(df)]

num_features <- df %>%
  select(all_of(feature_cols))

# Basic stats

n_obs <- nrow(df)
fraud_rate <- mean(df$.label == 1)

# accuracy = 0.01 prints two decimals of a percent. Without it the rate, which
# is well below 1%, was rounded and displayed as "0%".
cat(
  "Rows:", n_obs,
  "\nFraud rate:", scales::percent(fraud_rate, accuracy = 0.01), "\n"
)

## Rows: 71201 
## Fraud rate: 0%

summary(num_features)

##        v1                   v2                  v3            
##  Min.   :-41.928738   Min.   :-40.80398   Min.   :-48.325589  
##  1st Qu.: -0.919575   1st Qu.: -0.59911   1st Qu.: -0.896589  
##  Median :  0.009672   Median :  0.07157   Median :  0.173557  
##  Mean   : -0.003489   Mean   :  0.00243   Mean   : -0.001004  
##  3rd Qu.:  1.315882   3rd Qu.:  0.80898   3rd Qu.:  1.024471  
##  Max.   :  2.422508   Max.   : 18.18363   Max.   :  4.040465  
##        v4                 v5                   v6            
##  Min.   :-5.60061   Min.   :-1.137e+02   Min.   :-21.929312  
##  1st Qu.:-0.85531   1st Qu.:-6.919e-01   1st Qu.: -0.772195  
##  Median :-0.02900   Median :-5.109e-02   Median : -0.281348  
##  Mean   :-0.01412   Mean   : 3.232e-03   Mean   : -0.000844  
##  3rd Qu.: 0.72739   3rd Qu.: 6.151e-01   3rd Qu.:  0.399624  
##  Max.   :15.30418   Max.   : 3.291e+01   Max.   : 73.301626  
##        v7                   v8                   v9           
##  Min.   :-24.377116   Min.   :-41.484823   Min.   :-8.739670  
##  1st Qu.: -0.554153   1st Qu.: -0.210382   1st Qu.:-0.641300  
##  Median :  0.045554   Median :  0.021421   Median :-0.048818  
##  Mean   :  0.006614   Mean   : -0.006191   Mean   : 0.004188  
##  3rd Qu.:  0.578940   3rd Qu.:  0.327346   3rd Qu.: 0.593613  
##  Max.   :120.589494   Max.   : 18.748872   Max.   :10.348407  
##       v10                  v11                 v12            
##  Min.   :-18.271168   Min.   :-4.682931   Min.   :-17.769143  
##  1st Qu.: -0.537971   1st Qu.:-0.767305   1st Qu.: -0.409474  
##  Median : -0.095508   Median :-0.040311   Median :  0.134437  
##  Mean   :  0.002199   Mean   :-0.005878   Mean   : -0.001198  
##  3rd Qu.:  0.445837   3rd Qu.: 0.730657   3rd Qu.:  0.614638  
##  Max.   : 15.331742   Max.   :12.018913   Max.   :  4.846452  
##       v13                  v14                  v15           
##  Min.   :-3.8489431   Min.   :-19.214326   Min.   :-4.025373  
##  1st Qu.:-0.6491476   1st Qu.: -0.429421   1st Qu.:-0.575685  
##  Median :-0.0162865   Median :  0.049514   Median : 0.051620  
##  Mean   : 0.0008819   Mean   : -0.005272   Mean   : 0.002312  
##  3rd Qu.: 0.6619683   3rd Qu.:  0.488973   3rd Qu.: 0.648783  
##  Max.   : 7.1268830   Max.   :  7.421944   Max.   : 8.877742  
##       v16                  v17                  v18            
##  Min.   :-1.239e+01   Min.   :-22.541652   Min.   :-7.9867207  
##  1st Qu.:-4.701e-01   1st Qu.: -0.484179   1st Qu.:-0.4990526  
##  Median : 6.392e-02   Median : -0.063811   Median :-0.0053602  
##  Mean   : 2.887e-04   Mean   :  0.003883   Mean   : 0.0005761  
##  3rd Qu.: 5.243e-01   3rd Qu.:  0.403678   3rd Qu.: 0.4983144  
##  Max.   : 1.732e+01   Max.   :  7.140627   Max.   : 3.8642284  
##       v19                 v20                  v21            
##  Min.   :-4.932733   Min.   :-54.497720   Min.   :-2.162e+01  
##  1st Qu.:-0.458371   1st Qu.: -0.210592   1st Qu.:-2.297e-01  
##  Median :-0.002111   Median : -0.061306   Median :-3.044e-02  
##  Mean   :-0.003221   Mean   :  0.002924   Mean   :-3.893e-04  
##  3rd Qu.: 0.454514   3rd Qu.:  0.134799   3rd Qu.: 1.854e-01  
##  Max.   : 5.591971   Max.   : 23.649095   Max.   : 2.720e+01  
##       v22                 v23                  v24            
##  Min.   :-8.887017   Min.   :-44.807735   Min.   :-2.8078970  
##  1st Qu.:-0.542334   1st Qu.: -0.160811   1st Qu.:-0.3554761  
##  Median : 0.005080   Median : -0.011084   Median : 0.0427464  
##  Mean   :-0.003415   Mean   :  0.003038   Mean   : 0.0005425  
##  3rd Qu.: 0.523246   3rd Qu.:  0.148437   3rd Qu.: 0.4392306  
##  Max.   : 8.272233   Max.   : 18.946734   Max.   : 4.5845491  
##       v25                  v26                 v27           
##  Min.   :-1.030e+01   Min.   :-2.241620   Min.   :-8.878665  
##  1st Qu.:-3.178e-01   1st Qu.:-0.327995   1st Qu.:-0.070851  
##  Median : 1.481e-02   Median :-0.052520   Median : 0.002269  
##  Mean   :-1.108e-04   Mean   :-0.001554   Mean   : 0.002377  
##  3rd Qu.: 3.523e-01   3rd Qu.: 0.236142   3rd Qu.: 0.092484  
##  Max.   : 5.521e+00   Max.   : 3.415636   Max.   :31.612198  
##       v28                 amount        
##  Min.   :-15.430084   Min.   :    0.00  
##  1st Qu.: -0.053209   1st Qu.:    5.49  
##  Median :  0.011437   Median :   22.00  
##  Mean   : -0.001304   Mean   :   88.24  
##  3rd Qu.:  0.079782   3rd Qu.:   77.06  
##  Max.   : 15.415925   Max.   :25691.16

EDA

# Scale numeric features (training remains UNSUPERVISED; label unused here).
# Every column is coerced to numeric, passed through scale() with its default
# arguments, and converted back to a data frame.

X <- num_features %>%
  mutate(across(everything(), as.numeric)) %>%
  scale() %>%
  as.data.frame()

# Use observed fraud rate only to pick a threshold for evaluation.
# The share is clamped to the range [1e-4, 0.5].

contamination <- max(1e-4, min(0.5, mean(df$.label == 1)))

# accuracy = 0.01 keeps two decimals of a percent; the default formatting
# displayed this sub-1% share as "0%".
cat(
  "Using contamination (threshold share) =",
  scales::percent(contamination, accuracy = 0.01), "\n"
)

## Using contamination (threshold share) = 0%

Unsupervised Learning Methods

Unsupervised KNN

# Outlier score = k-distance (distance to the k-th nearest neighbor).
# A larger k-distance means a more isolated, hence more anomalous, point.

k <- 10 # small, simple default; can try 5~20
knn_out <- get.knn(X, k = k)

# Per row, take the largest of the k neighbor distances.
knn_score <- apply(knn_out[["nn.dist"]], MARGIN = 1, FUN = max)

# The cutoff is the (1 - contamination) quantile of the scores, so the top
# 'contamination' fraction is flagged (1 = outlier, 0 = inlier).
knn_cut <- quantile(knn_score, probs = 1 - contamination, na.rm = TRUE)
knn_pred_outlier <- as.integer(knn_score >= knn_cut)

# Collect score and flag in one table, then show the score distribution.
res_knn <- tibble(knn_score = knn_score, knn_is_outlier = knn_pred_outlier)
summary(res_knn[["knn_score"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.190   1.872   2.121   2.671 193.474

# Histogram of the kNN scores; the dashed vertical line marks knn_cut.
ggplot(tibble(score = res_knn$knn_score), aes(x = score)) +
  geom_histogram(bins = 60) +
  geom_vline(xintercept = knn_cut, linetype = 2) +
  labs(title = "kNN Outlier Score (k-distance)", x = "Score", y = "Count")

Isolation Forest

# Train Isolation Forest on features only (unsupervised).
# X holds the scaled feature columns; the label is not part of it.
# ntrees = 200 sets the number of trees, and seed = 42 fixes the random seed.

iso <- isolation.forest(X, ntrees = 200, seed = 42)

## Warning in isolation.forest(X, ntrees = 200, seed = 42): Attempting to use more
## than 1 thread, but package was compiled without OpenMP support. See
## https://github.com/david-cortes/installing-optimized-libraries#4-macos-install-and-enable-openmp

# Score every row with the fitted forest (higher = more anomalous).
iso_score <- predict(iso, X, type = "score")

# Reuse the same contamination share: rows at or above the
# (1 - contamination) quantile are flagged (1 = outlier, 0 = inlier).
iso_cut <- quantile(iso_score, probs = 1 - contamination, na.rm = TRUE)
iso_pred_outlier <- as.integer(iso_score >= iso_cut)

# Collect score and flag in one table, then show the score distribution.
res_iso <- tibble(iso_score = iso_score, iso_is_outlier = iso_pred_outlier)
summary(res_iso[["iso_score"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3154  0.3273  0.3363  0.3438  0.3496  0.8532

# Histogram of the Isolation Forest scores; the dashed line marks iso_cut.
ggplot(tibble(score = res_iso$iso_score), aes(x = score)) +
  geom_histogram(bins = 60) +
  geom_vline(xintercept = iso_cut, linetype = 2) +
  labs(title = "Isolation Forest Outlier Score", x = "Score", y = "Count")

Interpretation:

- Both histograms are heavily right-skewed, with most transactions having very low outlier scores.

- The vertical dashed lines mark the chosen thresholds. Only a small portion to the right are flagged as “outliers,” which matches the very low fraud rate in this sample (about 0.15%, i.e. 106 of 71,201 transactions).

- In anomaly detection, most data points are normal, and the tail of the distribution corresponds to potential frauds. kNN scores (distance-based) vary more widely; a few transactions are very isolated. Isolation Forest scores are typically bounded between 0 and 1, showing a clear separation around 0.65.

Conclusion & Interpretation

Confusion Matrix

# Put the true label next to both detectors' scores and flags.
# bind_cols() pairs rows by position, so all three inputs must share row order.

eval_df <- bind_cols(select(df, .label), res_knn, res_iso)

# Build a one-row confusion-matrix summary for binary labels.
#
# Args:
#   truth: vector of true labels coded 0/1 (1 = positive class).
#   pred:  vector of predicted labels coded 0/1; must have the same length
#          as `truth`.
#
# Returns:
#   A one-row tibble with the counts TP, FP, FN, TN and the metrics Accuracy,
#   Precision, Recall and F1. A metric whose denominator is zero (or NA) is
#   returned as NA_real_, so the metric columns are always of type double.
#
# Raises:
#   An error if `truth` and `pred` differ in length.

conf_mat_simple <- function(truth, pred) {
  stopifnot(length(truth) == length(pred))

  tp <- sum(truth == 1 & pred == 1)
  tn <- sum(truth == 0 & pred == 0)
  fp <- sum(truth == 0 & pred == 1)
  fn <- sum(truth == 1 & pred == 0)

  # Scalar division that yields a typed NA instead of dividing by zero.
  safe_ratio <- function(num, den) {
    if (is.na(den) || den == 0) NA_real_ else num / den
  }

  acc <- safe_ratio(tp + tn, length(truth))
  prec <- safe_ratio(tp, tp + fp)
  rec <- safe_ratio(tp, tp + fn) # recall = TPR (sensitivity)

  # F1 is undefined when either component is missing or both are zero.
  f1 <- if (is.na(prec) || is.na(rec) || prec + rec == 0) {
    NA_real_
  } else {
    2 * prec * rec / (prec + rec)
  }

  tibble(
    TP = tp, FP = fp, FN = fn, TN = tn,
    Accuracy = acc, Precision = prec, Recall = rec, F1 = f1
  )
}

# Score each detector's flags against the true label.
knn_cm <- conf_mat_simple(
  truth = eval_df$.label,
  pred = eval_df$knn_is_outlier
)
iso_cm <- conf_mat_simple(
  truth = eval_df$.label,
  pred = eval_df$iso_is_outlier
)

# Print the kNN row with a leading Model column.
relocate(mutate(knn_cm, Model = "Unsupervised kNN"), Model)

## # A tibble: 1 × 9
##   Model               TP    FP    FN    TN Accuracy Precision Recall    F1
##   <chr>            <int> <int> <int> <int>    <dbl>     <dbl>  <dbl> <dbl>
## 1 Unsupervised kNN    18    88    88 71007    0.998     0.170  0.170 0.170

# Print the Isolation Forest row with a leading Model column.
relocate(mutate(iso_cm, Model = "Isolation Forest"), Model)

## # A tibble: 1 × 9
##   Model               TP    FP    FN    TN Accuracy Precision Recall    F1
##   <chr>            <int> <int> <int> <int>    <dbl>     <dbl>  <dbl> <dbl>
## 1 Isolation Forest    31    75    75 71020    0.998     0.292  0.292 0.292

# Draw a 2x2 confusion matrix as a labelled heat map.
#
# Args:
#   truth: true labels coded 0/1 (shown as "Non-Fraud" / "Fraud").
#   pred:  predicted flags coded 0/1 (shown as "Inlier" / "Outlier").
#   title: plot title.
#
# Returns:
#   A ggplot object with one tile per (prediction, truth) cell, filled and
#   labelled by the cell count.
make_cm_long <- function(truth, pred, title) {
  truth_f <- factor(truth, levels = c(0, 1), labels = c("Non-Fraud", "Fraud"))
  pred_f <- factor(pred, levels = c(0, 1), labels = c("Inlier", "Outlier"))

  # Cross-tabulate, then reshape to one row per cell (columns Truth, Pred, n).
  # NOTE(review): as_tibble() on a table may return Truth/Pred as character
  # columns, in which case axis order is alphabetical -- confirm if it matters.
  counts <- as_tibble(table(Truth = truth_f, Pred = pred_f))

  # Light-to-dark blue fill without a legend.
  blue_fill <- scale_fill_gradient(
    low = "#dbe9f6",
    high = "#08306b",
    guide = "none"
  )
  # No grid lines; bold, centred title.
  bare_theme <- theme(
    panel.grid = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

  ggplot(counts, aes(Pred, Truth, fill = n, label = n)) +
    geom_tile(color = "white", linewidth = 1.2) +
    geom_text(size = 5, color = "black", fontface = "bold") +
    blue_fill +
    labs(title = title, x = "Prediction", y = "True class") +
    theme_minimal(base_size = 13) +
    bare_theme
}

# Build one confusion-matrix heat map per detector, then combine them with `+`.
p_cm_knn <- make_cm_long(
  truth = eval_df$.label,
  pred = eval_df$knn_is_outlier,
  title = "Confusion Matrix – kNN"
)
p_cm_if <- make_cm_long(
  truth = eval_df$.label,
  pred = eval_df$iso_is_outlier,
  title = "Confusion Matrix – Isolation Forest"
)

p_cm_knn + p_cm_if

Model	TP	FN	FP	TN	Precision	Recall
kNN	18	88	88	71,007	17%	17%
Isolation Forest	31	75	75	71,020	29%	29%

Interpretation:

- Both methods identify only a small portion of true frauds (low recall) but produce relatively few false positives (high specificity).

- Isolation Forest performs slightly better, catching more frauds (higher TP) while keeping FP low.

Anomaly Exercise - Credit Card Fraud Detection

Yuhan Wen

2025-11-12