This document demonstrates comprehensive anomaly detection techniques across three types of data structures: time series, cross-sectional, and panel data.
For each data type, we apply multiple detection methods and combine them using a consensus approach.
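As a preview of the consensus step used in each section, here is a minimal sketch of the voting rule (the helper name is illustrative; the sections below compute the votes inline with rowSums()):

# Minimal sketch of the consensus rule used throughout: an observation is
# flagged only when at least `min_votes` of the individual detectors agree.
consensus_flag <- function(..., min_votes = 3) {
  flags <- cbind(...)                       # logical flag vectors, one per method
  rowSums(flags, na.rm = TRUE) >= min_votes
}
# Example with three hypothetical flag vectors:
# consensus_flag(stl_anomaly, arima_anomaly, ma_anomaly, min_votes = 3)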
# Install and load required packages
packages <- c(
"tidyverse", "gt", "plotly", "lubridate",
"tsoutliers", "forecast", "anomalize", "timetk",
"isotree", "dbscan",
"plm", "panelr", "lme4",
"caret", "randomForest", "e1071",
"robust", "robustbase", "outliers",
"zoo", "xts", "TTR", "gridExtra"
)
install_if_missing <- function(packages) {
for (pkg in packages) {
if (!require(pkg, character.only = TRUE, quietly = TRUE)) {
install.packages(pkg, dependencies = TRUE, quiet = TRUE)
library(pkg, character.only = TRUE)
}
}
}
suppressWarnings(suppressMessages(install_if_missing(packages)))
# Resolve conflicts
library(dplyr, warn.conflicts = FALSE)

Dataset Being Analyzed: Retail Sales Data (3 years, daily observations)
Time series anomaly detection identifies unusual patterns in sequential data. We apply five complementary methods:
## Loading time series data: ts_sales_data.csv
ts_data_raw <- read_csv("simulated_data/ts_sales_data.csv", show_col_types = FALSE)
# Data has standardized columns: date, value, true_anomaly
ts_data <- ts_data_raw %>%
dplyr::select(date, value, true_anomaly)
cat(" - Loaded", nrow(ts_data), "observations\n")
## - Loaded 1095 observations
## - Date range: 18628 to 19722 (numeric day counts, i.e. 2021-01-01 to 2023-12-31)
## - True anomalies: 8
ggplot(ts_data, aes(x = date, y = value)) +
geom_line(color = "steelblue", alpha = 0.7) +
geom_point(data = ts_data %>% filter(true_anomaly),
aes(x = date, y = value),
color = "red", size = 3, shape = 4) +
labs(title = "Raw Time Series: Retail Sales Data",
subtitle = "Red X marks indicate true anomalies",
x = "Date", y = "Sales Value") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14))

Explanation: Seasonal-Trend decomposition using Loess (STL) separates the time series into seasonal, trend, and remainder components. Anomalies are detected in the remainder using the Interquartile Range (IQR) rule.
How it works:
- Decompose the series: value = trend + seasonal + remainder
- Calculate the IQR of the remainder component
- Flag points where |remainder - median| > 1.5 × IQR
## Method 1: STL Decomposition + IQR...
ts_object <- ts(ts_data$value, frequency = 7)
stl_decomp <- stl(ts_object, s.window = "periodic")
remainder <- stl_decomp$time.series[, "remainder"]
iqr_threshold <- 1.5
iqr_val <- IQR(remainder)
median_val <- median(remainder)
ts_data$stl_anomaly <- abs(remainder - median_val) > iqr_threshold * iqr_val
cat(" Detected:", sum(ts_data$stl_anomaly), "anomalies\n")
## Detected: 59 anomalies
# Extract STL components for plotting
stl_df <- data.frame(
date = ts_data$date,
observed = as.numeric(stl_decomp$time.series[, "seasonal"] +
stl_decomp$time.series[, "trend"] +
stl_decomp$time.series[, "remainder"]),
trend = as.numeric(stl_decomp$time.series[, "trend"]),
seasonal = as.numeric(stl_decomp$time.series[, "seasonal"]),
remainder = as.numeric(stl_decomp$time.series[, "remainder"])
)
# Create custom colored STL plot
p1 <- ggplot(stl_df, aes(x = date, y = observed)) +
geom_line(color = "#2C3E50", size = 0.5) +
labs(title = "Observed", y = "Value") +
theme_minimal() +
theme(axis.title.x = element_blank())
p2 <- ggplot(stl_df, aes(x = date, y = trend)) +
geom_line(color = "#E74C3C", size = 0.8) +
labs(title = "Trend", y = "Value") +
theme_minimal() +
theme(axis.title.x = element_blank())
p3 <- ggplot(stl_df, aes(x = date, y = seasonal)) +
geom_line(color = "#3498DB", size = 0.5) +
labs(title = "Seasonal", y = "Value") +
theme_minimal() +
theme(axis.title.x = element_blank())
p4 <- ggplot(stl_df, aes(x = date, y = remainder)) +
geom_line(color = "#27AE60", size = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
labs(title = "Remainder (Anomalies detected here)", y = "Value", x = "Date") +
theme_minimal()
# Combine plots
library(gridExtra)
grid.arrange(p1, p2, p3, p4, ncol = 1,
top = grid::textGrob("STL Decomposition of Retail Sales",
gp = grid::gpar(fontsize = 16, fontface = "bold")))

Explanation: AutoRegressive Integrated Moving Average (ARIMA) models forecast future values. Large residuals (differences between actual and predicted values) indicate anomalies.
How it works:
- Fit the optimal ARIMA model using auto.arima()
- Extract residuals
- Flag points where |residual| > 3 × SD(residuals)
## Method 2: ARIMA Residuals...
tryCatch({
fit_arima <- auto.arima(ts_object, seasonal = TRUE)
residuals_arima <- residuals(fit_arima)
threshold_arima <- 3 * sd(residuals_arima, na.rm = TRUE)
ts_data$arima_anomaly <- c(abs(residuals_arima) > threshold_arima,
rep(FALSE, nrow(ts_data) - length(residuals_arima)))
cat(" Model:", as.character(fit_arima), "\n")
cat(" Detected:", sum(ts_data$arima_anomaly), "anomalies\n")
}, error = function(e) {
ts_data$arima_anomaly <<- FALSE
cat(" Warning: ARIMA method failed\n")
})
## Model: ARIMA(3,0,1)(2,1,0)[7]
## Detected: 14 anomalies
Explanation: Compares each point to its local moving average. The z-score measures how many standard deviations the point lies from its local mean.
How it works:
- Calculate a 7-day moving average
- Compute residuals: residual = value - moving_average
- Standardize the residuals (z-score)
- Flag points where |z-score| > 3
## Method 3: Moving Average + Z-score...
ma_window <- 7
ts_data <- ts_data %>%
mutate(
ma = zoo::rollmean(value, ma_window, fill = NA, align = "center"),
ma_residual = value - ma,
ma_zscore = as.numeric(scale(ma_residual)),
ma_anomaly = abs(ma_zscore) > 3
)
cat(" Window size:", ma_window, "days\n")
## Window size: 7 days
## Detected: 7 anomalies
Explanation: Generalized Extreme Studentized Deviate (GESD) test for multiple outliers, applied to seasonally decomposed data.
How it works:
- Decompose the time series (STL)
- Apply the GESD test to the remainder component
- Robust to multiple outliers and seasonal patterns
## Method 4: Seasonal Hybrid ESD...
tryCatch({
ts_anomalize <- ts_data %>%
dplyr::select(date, value) %>%
time_decompose(value, method = "stl", frequency = "auto", trend = "auto") %>%
anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.2) %>%
time_recompose()
ts_data$gesd_anomaly <- ts_anomalize$anomaly == "Yes"
cat(" Detected:", sum(ts_data$gesd_anomaly), "anomalies\n")
}, error = function(e) {
ts_data$gesd_anomaly <<- FALSE
cat(" Warning: GESD method failed\n")
})
## Warning: GESD method failed
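Since the anomalize pipeline errored out here, a hedged fallback is to apply a generalized ESD (Rosner) test directly to the STL remainder computed in Method 1. The helper below is our own sketch, not part of the anomalize package; the commented assignment shows how it could replace the failed step.

# Hedged fallback sketch: manual generalized ESD test on a numeric vector.
gesd_outliers <- function(x, max_anoms = 0.2, alpha = 0.05) {
  x <- as.numeric(x)
  n <- length(x)
  k <- floor(max_anoms * n)      # maximum number of outliers to test for
  idx <- seq_len(n)
  removed <- integer(0)
  n_out <- 0
  for (i in seq_len(k)) {
    xi <- x[idx]
    dev <- abs(xi - mean(xi))
    r_i <- max(dev) / sd(xi)                     # i-th test statistic
    worst <- idx[which.max(dev)]
    p <- 1 - alpha / (2 * (n - i + 1))
    t_crit <- qt(p, df = n - i - 1)
    lambda_i <- (n - i) * t_crit /               # i-th critical value
      sqrt((n - i - 1 + t_crit^2) * (n - i + 1))
    removed <- c(removed, worst)
    idx <- setdiff(idx, worst)
    if (r_i > lambda_i) n_out <- i               # largest i with R_i > lambda_i
  }
  flags <- rep(FALSE, n)
  if (n_out > 0) flags[removed[seq_len(n_out)]] <- TRUE
  flags
}
# Possible replacement for the failed step, using the remainder from Method 1:
# ts_data$gesd_anomaly <- gesd_outliers(remainder, max_anoms = 0.2, alpha = 0.05)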
Explanation: Machine learning ensemble method that isolates anomalies by randomly partitioning the feature space. Anomalies are easier to isolate (shorter path length in decision trees).
How it works:
- Engineer features: lags, differences, rolling statistics
- Build an ensemble of isolation trees
- Calculate anomaly scores (lower depth = more anomalous)
- Flag the bottom 5% as anomalies
## Method 5: Isolation Forest (isotree)...
# Engineer features
ts_features <- ts_data %>%
mutate(
lag1 = lag(value, 1),
lag7 = lag(value, 7),
diff1 = value - lag1,
roll_mean_7 = zoo::rollmean(value, 7, fill = NA, align = "right"),
roll_sd_7 = zoo::rollapply(value, 7, sd, fill = NA, align = "right")
) %>%
drop_na()
# Fit Isolation Forest
iso_model_ts <- isolation.forest(
ts_features %>% dplyr::select(value, lag1, lag7, diff1, roll_mean_7, roll_sd_7),
ndim = 6,
ntrees = 100,
sample_size = 256
)
# Get anomaly scores
iso_scores_ts <- predict(iso_model_ts,
ts_features %>% dplyr::select(value, lag1, lag7, diff1, roll_mean_7, roll_sd_7),
type = "avg_depth")
# Flag bottom 5% as anomalies
threshold_ts <- quantile(iso_scores_ts, 0.05)
ts_features$iso_anomaly <- iso_scores_ts < threshold_ts
# Add back to main dataset
ts_data$iso_anomaly <- FALSE
ts_data$iso_anomaly[ts_data$date %in% ts_features$date] <- ts_features$iso_anomaly
cat(" Features:", ncol(ts_features) - 1, "\n")
## Features: 15
## Trees: 100
## Detected: 55 anomalies
Explanation: Combines results from all methods using majority voting. An observation is flagged as anomalous only if 3 or more methods agree.
Rationale: Reduces false positives by requiring consensus among multiple independent approaches.
# Combine results
ts_results <- ts_data %>%
mutate(
total_votes = rowSums(cbind(stl_anomaly, arima_anomaly, ma_anomaly,
gesd_anomaly, iso_anomaly), na.rm = TRUE),
consensus_anomaly = total_votes >= 3
)
cat("Consensus approach: Require 3+ methods to agree\n")
## Consensus approach: Require 3+ methods to agree
## Consensus anomalies detected: 9
# Create summary table
ts_summary <- tibble(
Method = c("STL + IQR", "ARIMA Residuals", "Moving Avg + Z-score",
"Seasonal Hybrid ESD", "Isolation Forest (isotree)", "Consensus (3+ votes)"),
Anomalies_Detected = c(
sum(ts_results$stl_anomaly, na.rm = TRUE),
sum(ts_results$arima_anomaly, na.rm = TRUE),
sum(ts_results$ma_anomaly, na.rm = TRUE),
sum(ts_results$gesd_anomaly, na.rm = TRUE),
sum(ts_results$iso_anomaly, na.rm = TRUE),
sum(ts_results$consensus_anomaly, na.rm = TRUE)
),
True_Positives = c(
sum(ts_results$stl_anomaly & ts_results$true_anomaly, na.rm = TRUE),
sum(ts_results$arima_anomaly & ts_results$true_anomaly, na.rm = TRUE),
sum(ts_results$ma_anomaly & ts_results$true_anomaly, na.rm = TRUE),
sum(ts_results$gesd_anomaly & ts_results$true_anomaly, na.rm = TRUE),
sum(ts_results$iso_anomaly & ts_results$true_anomaly, na.rm = TRUE),
sum(ts_results$consensus_anomaly & ts_results$true_anomaly, na.rm = TRUE)
)
) %>%
mutate(
Precision = round(True_Positives / pmax(Anomalies_Detected, 1) * 100, 2),
Recall = round(True_Positives / sum(ts_results$true_anomaly) * 100, 2)
)
# Display table
ts_summary %>%
gt() %>%
tab_header(
title = "Time Series Anomaly Detection - Performance Summary",
subtitle = paste("Dataset: Retail Sales | Observations:", nrow(ts_data),
"| True anomalies:", sum(ts_data$true_anomaly))
) %>%
fmt_number(
columns = c(Precision, Recall),
decimals = 2
) %>%
tab_style(
style = cell_fill(color = "#E8F4F8"),
locations = cells_body(rows = Method == "Consensus (3+ votes)")
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = Method, rows = Method == "Consensus (3+ votes)")
)

**Time Series Anomaly Detection - Performance Summary**
*Dataset: Retail Sales | Observations: 1095 | True anomalies: 8*
| Method | Anomalies_Detected | True_Positives | Precision | Recall |
|---|---|---|---|---|
| STL + IQR | 59 | 8 | 13.56 | 100.00 |
| ARIMA Residuals | 14 | 8 | 57.14 | 100.00 |
| Moving Avg + Z-score | 7 | 7 | 100.00 | 87.50 |
| Seasonal Hybrid ESD | 0 | 0 | 0.00 | 0.00 |
| Isolation Forest (isotree) | 55 | 8 | 14.55 | 100.00 |
| Consensus (3+ votes) | 9 | 8 | 88.89 | 100.00 |
Metrics Explained:
- Precision = True Positives / (True Positives + False Positives)
- Recall = True Positives / (True Positives + False Negatives)
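For reference, the same metrics can be wrapped in a small helper; the function name is ours and is not part of the pipeline above, which computes them inline.

# Hypothetical helper: precision and recall (in %) from two logical vectors.
detection_metrics <- function(detected, truth) {
  tp <- sum(detected & truth, na.rm = TRUE)    # true positives
  fp <- sum(detected & !truth, na.rm = TRUE)   # false positives
  fn <- sum(!detected & truth, na.rm = TRUE)   # false negatives
  c(precision = 100 * tp / max(tp + fp, 1),
    recall    = 100 * tp / max(tp + fn, 1))
}
# Example: metrics for the consensus flags
# detection_metrics(ts_results$consensus_anomaly, ts_results$true_anomaly)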
# Create main plot with all data
ts_plot <- plot_ly() %>%
add_trace(
data = ts_results,
x = ~date, y = ~value,
type = 'scatter', mode = 'lines',
name = 'Time Series',
line = list(color = '#95A5A6', width = 1.5)
) %>%
add_trace(
data = ts_results %>% filter(true_anomaly),
x = ~date, y = ~value,
type = 'scatter', mode = 'markers',
name = 'True Anomalies',
marker = list(size = 12, color = '#E74C3C', symbol = 'x', line = list(width = 2))
) %>%
add_trace(
data = ts_results %>% filter(consensus_anomaly & !true_anomaly),
x = ~date, y = ~value,
type = 'scatter', mode = 'markers',
name = 'Detected (False Positive)',
marker = list(size = 10, color = '#F39C12', symbol = 'circle-open', line = list(width = 2))
) %>%
add_trace(
data = ts_results %>% filter(consensus_anomaly & true_anomaly),
x = ~date, y = ~value,
type = 'scatter', mode = 'markers',
name = 'Detected (True Positive)',
marker = list(size = 12, color = '#27AE60', symbol = 'circle', line = list(width = 2))
) %>%
layout(
title = list(
text = "Time Series Anomaly Detection Results - Retail Sales Data",
font = list(size = 16, color = '#2C3E50')
),
xaxis = list(
title = "Date",
showgrid = TRUE,
gridcolor = '#E0E0E0'
),
yaxis = list(
title = "Sales Value",
showgrid = TRUE,
gridcolor = '#E0E0E0'
),
hovermode = 'closest',
plot_bgcolor = '#F8F9FA',
legend = list(
orientation = 'h',
x = 0.5,
xanchor = 'center',
y = -0.2
)
)
ts_plot

## **Detection Summary:**
cat(" True Positives:", sum(ts_results$consensus_anomaly & ts_results$true_anomaly, na.rm = TRUE), "\n")
cat(" False Positives:", sum(ts_results$consensus_anomaly & !ts_results$true_anomaly, na.rm = TRUE), "\n")
cat(" False Negatives:", sum(!ts_results$consensus_anomaly & ts_results$true_anomaly, na.rm = TRUE), "\n")
## True Positives: 8
## False Positives: 1
## False Negatives: 0
Dataset Being Analyzed: Customer Transaction Data (2,000 customers)
Cross-sectional analysis identifies unusual observations at a single point in time. We apply six methods:
## Loading cross-sectional data: cs_customer_data.csv
cs_data_raw <- read_csv("simulated_data/cs_customer_data.csv", show_col_types = FALSE)
# Data has standardized columns: id, features, true_anomaly
cs_data <- cs_data_raw
cat(" - Loaded", nrow(cs_data), "observations\n")
## - Loaded 2000 observations
## - Features: 5
## - True anomalies: 30
# Extract features
features <- cs_data %>%
dplyr::select(-id, -true_anomaly)
cat(" Feature names:", paste(names(features), collapse = ", "), "\n\n")
## Feature names: age, num_purchases, avg_purchase_value, total_spent, purchase_frequency
## age num_purchases avg_purchase_value total_spent
## Min. :-10.00 Min. : 1.00 Min. : 70.04 Min. : 0
## 1st Qu.: 38.40 1st Qu.: 72.00 1st Qu.: 136.77 1st Qu.: 3965
## Median : 45.12 Median : 80.00 Median : 150.92 Median : 5037
## Mean : 45.35 Mean : 81.62 Mean : 472.57 Mean : 8697
## 3rd Qu.: 52.00 3rd Qu.: 89.00 3rd Qu.: 164.23 3rd Qu.: 6184
## Max. :200.00 Max. :500.00 Max. :50000.00 Max. :500000
## purchase_frequency
## Min. : 0.08333
## 1st Qu.: 6.00000
## Median : 6.66667
## Mean : 6.80179
## 3rd Qu.: 7.41667
## Max. :41.66667
##
## Feature Correlations:
## age num_purchases avg_purchase_value total_spent
## age 1.00 -0.06 0.27 0.24
## num_purchases -0.06 1.00 -0.12 0.05
## avg_purchase_value 0.27 -0.12 1.00 0.98
## total_spent 0.24 0.05 0.98 1.00
## purchase_frequency -0.06 1.00 -0.12 0.05
## purchase_frequency
## age -0.06
## num_purchases 1.00
## avg_purchase_value -0.12
## total_spent 0.05
## purchase_frequency 1.00
Explanation: Standardizes each feature to mean = 0 and SD = 1. An observation is flagged as anomalous if any feature has an extreme z-score.
How it works:
- Standardize all features: z = (x - mean) / SD
- For each observation, take the maximum absolute z-score across features
- Flag if max(|z-scores|) > 3
## Method 1: Multivariate Z-score...
z_scores <- scale(features)
cs_data$zscore_anomaly <- apply(abs(z_scores), 1, max) > 3
cat(" Detected:", sum(cs_data$zscore_anomaly), "anomalies\n")
## Detected: 20 anomalies
Explanation: Measures distance from the center of the distribution, accounting for correlations between features. More sophisticated than Euclidean distance.
How it works:
- Calculate the distance: D² = (x - μ)ᵀ Σ⁻¹ (x - μ)
- Where μ = mean vector, Σ = covariance matrix
- Compare to the χ² distribution with p degrees of freedom
- Flag if D² > χ²₀.₉₇₅,ₚ
## Method 2: Mahalanobis Distance...
tryCatch({
center <- colMeans(features)
cov_matrix <- cov(features)
mah_dist <- mahalanobis(features, center, cov_matrix)
threshold_mah <- qchisq(0.975, df = ncol(features))
cs_data$mahalanobis_anomaly <- mah_dist > threshold_mah
cat(" Threshold (χ² 97.5%):", round(threshold_mah, 2), "\n")
cat(" Detected:", sum(cs_data$mahalanobis_anomaly), "anomalies\n")
}, error = function(e) {
cs_data$mahalanobis_anomaly <<- FALSE
cat(" Warning: Mahalanobis method failed\n")
})
## Warning: Mahalanobis method failed
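The failure above is consistent with a (near-)singular covariance matrix: the correlation table shows purchase_frequency and num_purchases are perfectly correlated, so solve() inside mahalanobis() cannot invert Σ. One hedged workaround (ours, not part of the original pipeline) is a Moore-Penrose pseudoinverse; dropping one column of each collinear pair, or a robust estimate such as robustbase::covMcd(), would be alternatives.

# Hedged sketch: Mahalanobis distance with a pseudoinverse covariance matrix.
# Note: with collinear features the chi-square degrees of freedom should arguably
# be the rank of the covariance matrix rather than ncol(features).
center   <- colMeans(features)
cov_pinv <- MASS::ginv(cov(features))               # Moore-Penrose pseudoinverse
diffs    <- sweep(as.matrix(features), 2, center)   # center each column
mah_dist <- rowSums((diffs %*% cov_pinv) * diffs)   # D² for each observation
threshold_mah <- qchisq(0.975, df = ncol(features))
# cs_data$mahalanobis_anomaly <- mah_dist > threshold_mah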
Explanation: The same ML ensemble method as in the time series section, here applied to the multivariate features without a temporal component.
How it works:
- Build an ensemble of isolation trees on the feature space
- Anomalies have shorter average path lengths
- Flag the bottom 5% by path depth
## Method 3: Isolation Forest (isotree)...
iso_model_cs <- isolation.forest(
features,
ndim = ncol(features),
ntrees = 100,
sample_size = 256
)
iso_scores_cs <- predict(iso_model_cs, features, type = "avg_depth")
threshold_cs <- quantile(iso_scores_cs, 0.05)
cs_data$iso_anomaly <- iso_scores_cs < threshold_cs
cat(" Detected:", sum(cs_data$iso_anomaly), "anomalies\n")
## Detected: 100 anomalies
Explanation: Density-based method that compares local density of a point to its neighbors. Points in sparse regions are anomalous.
How it works:
- For each point, find the k nearest neighbors
- Calculate its local density relative to those neighbors
- LOF > 1.5 indicates lower density than the neighbors (outlier)
## Method 4: Local Outlier Factor (LOF)...
lof_scores <- dbscan::lof(as.matrix(features), k = 20)
cs_data$lof_anomaly <- lof_scores > 1.5
cat(" k-neighbors:", 20, "\n")
## k-neighbors: 20
## Detected: 44 anomalies
Explanation: Support Vector Machine trained only on “normal” data to learn a boundary. Points outside the boundary are anomalous.
How it works:
- Train the SVM on the majority of the data (assumed to be mostly normal)
- Learn a boundary that encloses the normal data
- The nu parameter controls the expected anomaly rate (5%)
## Method 5: One-Class SVM...
tryCatch({
train_data <- features[1:(nrow(features) - 100), ]
svm_model <- e1071::svm(train_data, y = NULL, type = 'one-classification',
nu = 0.05, kernel = "radial")
svm_pred <- predict(svm_model, features)
cs_data$svm_anomaly <- !svm_pred
cat(" Kernel: radial\n")
cat(" nu:", 0.05, "\n")
cat(" Detected:", sum(cs_data$svm_anomaly), "anomalies\n")
}, error = function(e) {
cs_data$svm_anomaly <<- FALSE
cat(" Warning: SVM failed\n")
})
## Kernel: radial
## nu: 0.05
## Detected: 129 anomalies
Explanation: Density-based clustering. Points not belonging to any cluster are classified as noise (anomalies).
How it works:
- Group points into dense regions (clusters)
- Points in sparse regions are labeled as cluster 0 (noise)
- Noise points = anomalies
## Method 6: DBSCAN (outliers as anomalies)...
features_scaled <- scale(features)
dbscan_result <- dbscan::dbscan(features_scaled, eps = 0.5, minPts = 10)
cs_data$dbscan_anomaly <- dbscan_result$cluster == 0
cat(" Clusters found:", max(dbscan_result$cluster), "\n")
## Clusters found: 3
## Noise points (anomalies): 12
# Combine results
cs_results <- cs_data %>%
mutate(
total_votes = rowSums(cbind(zscore_anomaly, mahalanobis_anomaly, iso_anomaly,
lof_anomaly, svm_anomaly, dbscan_anomaly), na.rm = TRUE),
consensus_anomaly = total_votes >= 3
)
cat("Consensus approach: Require 3+ methods to agree\n")
## Consensus approach: Require 3+ methods to agree
## Consensus anomalies detected: 36
# Create summary table
cs_summary <- tibble(
Method = c("Multivariate Z-score", "Mahalanobis Distance", "Isolation Forest (isotree)",
"Local Outlier Factor", "One-Class SVM", "DBSCAN", "Consensus (3+ votes)"),
Anomalies_Detected = c(
sum(cs_results$zscore_anomaly, na.rm = TRUE),
sum(cs_results$mahalanobis_anomaly, na.rm = TRUE),
sum(cs_results$iso_anomaly, na.rm = TRUE),
sum(cs_results$lof_anomaly, na.rm = TRUE),
sum(cs_results$svm_anomaly, na.rm = TRUE),
sum(cs_results$dbscan_anomaly, na.rm = TRUE),
sum(cs_results$consensus_anomaly, na.rm = TRUE)
),
True_Positives = c(
sum(cs_results$zscore_anomaly & cs_results$true_anomaly, na.rm = TRUE),
sum(cs_results$mahalanobis_anomaly & cs_results$true_anomaly, na.rm = TRUE),
sum(cs_results$iso_anomaly & cs_results$true_anomaly, na.rm = TRUE),
sum(cs_results$lof_anomaly & cs_results$true_anomaly, na.rm = TRUE),
sum(cs_results$svm_anomaly & cs_results$true_anomaly, na.rm = TRUE),
sum(cs_results$dbscan_anomaly & cs_results$true_anomaly, na.rm = TRUE),
sum(cs_results$consensus_anomaly & cs_results$true_anomaly, na.rm = TRUE)
)
) %>%
mutate(
Precision = round(True_Positives / pmax(Anomalies_Detected, 1) * 100, 2),
Recall = round(True_Positives / sum(cs_results$true_anomaly) * 100, 2)
)
# Display table
cs_summary %>%
gt() %>%
tab_header(
title = "Cross-Sectional Anomaly Detection - Performance Summary",
subtitle = paste("Dataset: Customer Transactions | Observations:", nrow(cs_data),
"| True anomalies:", sum(cs_data$true_anomaly))
) %>%
fmt_number(
columns = c(Precision, Recall),
decimals = 2
) %>%
tab_style(
style = cell_fill(color = "#E8F4F8"),
locations = cells_body(rows = Method == "Consensus (3+ votes)")
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = Method, rows = Method == "Consensus (3+ votes)")
)

**Cross-Sectional Anomaly Detection - Performance Summary**
*Dataset: Customer Transactions | Observations: 2000 | True anomalies: 30*
| Method | Anomalies_Detected | True_Positives | Precision | Recall |
|---|---|---|---|---|
| Multivariate Z-score | 20 | 20 | 100.00 | 66.67 |
| Mahalanobis Distance | 0 | 0 | 0.00 | 0.00 |
| Isolation Forest (isotree) | 100 | 30 | 30.00 | 100.00 |
| Local Outlier Factor | 44 | 30 | 68.18 | 100.00 |
| One-Class SVM | 129 | 30 | 23.26 | 100.00 |
| DBSCAN | 12 | 10 | 83.33 | 33.33 |
| Consensus (3+ votes) | 36 | 30 | 83.33 | 100.00 |
numeric_features <- features %>% dplyr::select(where(is.numeric))
feature_names <- names(numeric_features)[1:3]
cs_plot <- plot_ly(
data = cs_results,
x = ~get(feature_names[1]),
y = ~get(feature_names[2]),
z = ~get(feature_names[3]),
color = ~consensus_anomaly,
colors = c('blue', 'red'),
type = 'scatter3d',
mode = 'markers',
marker = list(size = 3),
text = ~paste('ID:', id, '<br>True Anomaly:', true_anomaly)
) %>%
layout(
title = "Cross-Sectional Anomaly Detection - Customer Data (3D View)",
scene = list(
xaxis = list(title = feature_names[1]),
yaxis = list(title = feature_names[2]),
zaxis = list(title = feature_names[3])
)
)
cs_plot

Dataset Being Analyzed: Company Financial Performance (100 companies, 40 quarters)
Panel data combines cross-sectional and time series characteristics. We apply six methods:
## Loading panel data: panel_company_financials.csv
panel_data_raw <- read_csv("simulated_data/panel_company_financials.csv", show_col_types = FALSE)
# Data has standardized columns: entity, period, value, true_anomaly
panel_data <- panel_data_raw %>%
dplyr::select(entity, period, value, true_anomaly)
cat(" - Loaded", nrow(panel_data), "observations\n")
## - Loaded 4000 observations
## - Entities (companies): 100
## - Periods (quarters): 40
## - True anomalies: 30
# Sample of data structure
head(panel_data, 12) %>%
gt() %>%
tab_header(title = "Panel Data Structure - First 12 Observations")

**Panel Data Structure - First 12 Observations**
| entity | period | value | true_anomaly |
|---|---|---|---|
| 1 | 1 | 5591.413 | FALSE |
| 1 | 2 | 6064.588 | FALSE |
| 1 | 3 | 5611.574 | FALSE |
| 1 | 4 | 5283.551 | FALSE |
| 1 | 5 | 5903.044 | FALSE |
| 1 | 6 | 6551.096 | FALSE |
| 1 | 7 | 5628.063 | FALSE |
| 1 | 8 | 6299.814 | FALSE |
| 1 | 9 | 5936.614 | FALSE |
| 1 | 10 | 6823.067 | FALSE |
| 1 | 11 | 5907.760 | FALSE |
| 1 | 12 | 5735.255 | FALSE |
Explanation: Panel regression that controls for unobserved entity-specific effects. Large residuals after accounting for entity effects indicate anomalies.
How it works:
- Model: value_it = α_i + β × period + ε_it
- α_i captures entity-specific fixed effects
- Flag where |ε_it| > 3 × SD(ε)
## Method 1: Fixed Effects Model Residuals...
# Create pdata.frame for plm modeling
panel_data_plm <- pdata.frame(panel_data, index = c("entity", "period"))
fe_model <- plm(value ~ period, data = panel_data_plm, model = "within")
# Extract residuals
panel_data$fe_residuals <- as.numeric(residuals(fe_model))
threshold_fe <- 3 * sd(panel_data$fe_residuals, na.rm = TRUE)
panel_data$fe_anomaly <- abs(panel_data$fe_residuals) > threshold_fe
cat(" Model: Fixed Effects (within estimator)\n")
## Model: Fixed Effects (within estimator)
## Threshold: 3560.09
## Detected: 103 anomalies
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = value ~ period, data = panel_data_plm, model = "within")
##
## Balanced Panel: n = 100, T = 40, N = 4000
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -6054.288 -282.300 -16.065 256.524 13655.301
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## period2 307.38 170.80 1.7997 0.0719932 .
## period3 198.15 170.80 1.1602 0.2460563
## period4 237.31 170.80 1.3894 0.1647778
## period5 107.91 170.80 0.6318 0.5275692
## period6 449.58 170.80 2.6323 0.0085155 **
## period7 195.83 170.80 1.1466 0.2516205
## period8 703.92 170.80 4.1214 3.845e-05 ***
## period9 645.63 170.80 3.7801 0.0001591 ***
## period10 974.67 170.80 5.7066 1.239e-08 ***
## period11 621.24 170.80 3.6373 0.0002791 ***
## period12 301.83 170.80 1.7672 0.0772715 .
## period13 1423.54 170.80 8.3347 < 2.2e-16 ***
## period14 777.76 170.80 4.5537 5.433e-06 ***
## period15 970.71 170.80 5.6834 1.418e-08 ***
## period16 974.87 170.80 5.7077 1.231e-08 ***
## period17 662.97 170.80 3.8816 0.0001055 ***
## period18 978.69 170.80 5.7301 1.080e-08 ***
## period19 932.38 170.80 5.4590 5.089e-08 ***
## period20 962.98 170.80 5.6381 1.842e-08 ***
## period21 1081.64 170.80 6.3329 2.682e-10 ***
## period22 1692.52 170.80 9.9095 < 2.2e-16 ***
## period23 1469.58 170.80 8.6042 < 2.2e-16 ***
## period24 1276.57 170.80 7.4742 9.556e-14 ***
## period25 1469.70 170.80 8.6049 < 2.2e-16 ***
## period26 1479.47 170.80 8.6621 < 2.2e-16 ***
## period27 1592.54 170.80 9.3241 < 2.2e-16 ***
## period28 1563.06 170.80 9.1516 < 2.2e-16 ***
## period29 1790.07 170.80 10.4807 < 2.2e-16 ***
## period30 1917.01 170.80 11.2239 < 2.2e-16 ***
## period31 2179.20 170.80 12.7590 < 2.2e-16 ***
## period32 1756.98 170.80 10.2869 < 2.2e-16 ***
## period33 2065.21 170.80 12.0916 < 2.2e-16 ***
## period34 2361.27 170.80 13.8250 < 2.2e-16 ***
## period35 2214.50 170.80 12.9657 < 2.2e-16 ***
## period36 2042.06 170.80 11.9561 < 2.2e-16 ***
## period37 1956.86 170.80 11.4572 < 2.2e-16 ***
## period38 2214.62 170.80 12.9664 < 2.2e-16 ***
## period39 2255.26 170.80 13.2043 < 2.2e-16 ***
## period40 2070.94 170.80 12.1252 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 7598400000
## Residual Sum of Squares: 5631600000
## R-Squared: 0.25884
## Adj. R-Squared: 0.23235
## F-statistic: 34.5747 on 39 and 3861 DF, p-value: < 2.22e-16
Explanation: Each entity has its own mean and standard deviation. Points are compared to their entity-specific distribution.
How it works:
- For each entity, calculate the mean and SD
- Standardize: z_it = (value_it - mean_i) / SD_i
- Flag if |z_it| > 3
## Method 2: Within-Entity Z-score...
panel_data <- panel_data %>%
group_by(entity) %>%
mutate(
entity_mean = mean(value, na.rm = TRUE),
entity_sd = sd(value, na.rm = TRUE),
entity_zscore = (value - entity_mean) / entity_sd,
within_entity_anomaly = abs(entity_zscore) > 3
) %>%
ungroup()
cat(" Detected:", sum(panel_data$within_entity_anomaly), "anomalies\n")
## Detected: 14 anomalies
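One caveat: the entity-level SD used above is itself inflated by any extreme quarter, which can mask anomalies. A hedged variant (our sketch; the robust/robustbase packages loaded in the setup are not otherwise used here) replaces the mean and SD with the median and MAD:

# Hedged sketch: robust within-entity score using median and MAD, so a single
# extreme quarter does not inflate the entity-level scale estimate.
panel_data_robust <- panel_data %>%
  group_by(entity) %>%
  mutate(
    entity_median = median(value, na.rm = TRUE),
    entity_mad    = mad(value, na.rm = TRUE),   # scaled MAD, consistent with SD under normality
    robust_zscore = (value - entity_median) / entity_mad,
    robust_within_entity_anomaly = abs(robust_zscore) > 3
  ) %>%
  ungroup()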
Explanation: Each time period has its own mean and SD. Points are compared to other entities in the same time period.
How it works:
- For each period, calculate the mean and SD across entities
- Standardize: z_it = (value_it - mean_t) / SD_t
- Flag if |z_it| > 3
## Method 3: Within-Period Z-score...
panel_data <- panel_data %>%
group_by(period) %>%
mutate(
period_mean = mean(value, na.rm = TRUE),
period_sd = sd(value, na.rm = TRUE),
period_zscore = (value - period_mean) / period_sd,
within_period_anomaly = abs(period_zscore) > 3
) %>%
ungroup()
cat(" Detected:", sum(panel_data$within_period_anomaly), "anomalies\n")
## Detected: 42 anomalies
Explanation: Each entity may have its own growth trend. Detects deviations from entity-specific linear trends.
How it works:
- For each entity, fit a linear trend: value ~ period
- Calculate residuals
- Standardize and flag large deviations
## Method 4: Deviation from Entity-Specific Trend...
panel_data <- panel_data %>%
group_by(entity) %>%
mutate(
entity_trend = predict(lm(value ~ period)),
trend_deviation = value - entity_trend,
trend_deviation_zscore = as.numeric(scale(trend_deviation)),
trend_anomaly = abs(trend_deviation_zscore) > 3
) %>%
ungroup()
cat(" Detected:", sum(panel_data$trend_anomaly), "anomalies\n")
## Detected: 28 anomalies
Explanation: Applies Isolation Forest to panel-specific features including entity ID and time period (both scaled).
How it works:
- Features: value, scaled entity ID, scaled period
- Captures cross-sectional and temporal patterns
- Flag the bottom 5% by path depth
## Method 5: Isolation Forest with Panel Features (isotree)...
panel_features <- panel_data %>%
dplyr::select(entity, period, value) %>%
mutate(
entity_scaled = scale(entity)[,1],
period_scaled = scale(period)[,1]
)
iso_model_panel <- isolation.forest(
panel_features %>% dplyr::select(value, entity_scaled, period_scaled),
ndim = 3,
ntrees = 100,
sample_size = 256
)
iso_scores_panel <- predict(iso_model_panel,
panel_features %>% dplyr::select(value, entity_scaled, period_scaled),
type = "avg_depth")
threshold_panel <- quantile(iso_scores_panel, 0.05)
panel_data$iso_anomaly <- iso_scores_panel < threshold_panel
cat(" Detected:", sum(panel_data$iso_anomaly), "anomalies\n")
## Detected: 200 anomalies
Explanation: Alternative to fixed effects that treats entity effects as random draws from a distribution.
How it works:
- Model: value_it = α + β × period + u_i + ε_it
- u_i ~ N(0, σ²_u) are random entity effects
- Flag large residuals
## Method 6: Random Effects Model Residuals...
tryCatch({
re_model <- plm(value ~ period, data = panel_data_plm, model = "random")
panel_data$re_residuals <- as.numeric(residuals(re_model))
threshold_re <- 3 * sd(panel_data$re_residuals, na.rm = TRUE)
panel_data$re_anomaly <- abs(panel_data$re_residuals) > threshold_re
cat(" Model: Random Effects\n")
cat(" Detected:", sum(panel_data$re_anomaly), "anomalies\n")
}, error = function(e) {
panel_data$re_anomaly <<- FALSE
cat(" Warning: Random effects model failed\n")
})
## Model: Random Effects
## Detected: 89 anomalies
# Combine results
panel_results <- panel_data %>%
mutate(
total_votes = rowSums(cbind(fe_anomaly, within_entity_anomaly, within_period_anomaly,
trend_anomaly, iso_anomaly, re_anomaly), na.rm = TRUE),
consensus_anomaly = total_votes >= 3
)
cat("Consensus approach: Require 3+ methods to agree\n")
## Consensus approach: Require 3+ methods to agree
## Consensus anomalies detected: 72
# Create summary table
panel_summary <- tibble(
Method = c("Fixed Effects Residuals", "Within-Entity Z-score", "Within-Period Z-score",
"Entity Trend Deviation", "Isolation Forest (isotree)", "Random Effects Residuals",
"Consensus (3+ votes)"),
Anomalies_Detected = c(
sum(panel_results$fe_anomaly, na.rm = TRUE),
sum(panel_results$within_entity_anomaly, na.rm = TRUE),
sum(panel_results$within_period_anomaly, na.rm = TRUE),
sum(panel_results$trend_anomaly, na.rm = TRUE),
sum(panel_results$iso_anomaly, na.rm = TRUE),
sum(panel_results$re_anomaly, na.rm = TRUE),
sum(panel_results$consensus_anomaly, na.rm = TRUE)
),
True_Positives = c(
sum(panel_results$fe_anomaly & panel_results$true_anomaly, na.rm = TRUE),
sum(panel_results$within_entity_anomaly & panel_results$true_anomaly, na.rm = TRUE),
sum(panel_results$within_period_anomaly & panel_results$true_anomaly, na.rm = TRUE),
sum(panel_results$trend_anomaly & panel_results$true_anomaly, na.rm = TRUE),
sum(panel_results$iso_anomaly & panel_results$true_anomaly, na.rm = TRUE),
sum(panel_results$re_anomaly & panel_results$true_anomaly, na.rm = TRUE),
sum(panel_results$consensus_anomaly & panel_results$true_anomaly, na.rm = TRUE)
)
) %>%
mutate(
Precision = round(True_Positives / pmax(Anomalies_Detected, 1) * 100, 2),
Recall = round(True_Positives / sum(panel_results$true_anomaly) * 100, 2)
)
# Display table
panel_summary %>%
gt() %>%
tab_header(
title = "Panel Data Anomaly Detection - Performance Summary",
subtitle = paste("Dataset: Company Financials | Observations:", nrow(panel_data),
"| True anomalies:", sum(panel_data$true_anomaly))
) %>%
fmt_number(
columns = c(Precision, Recall),
decimals = 2
) %>%
tab_style(
style = cell_fill(color = "#E8F4F8"),
locations = cells_body(rows = Method == "Consensus (3+ votes)")
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = Method, rows = Method == "Consensus (3+ votes)")
)

**Panel Data Anomaly Detection - Performance Summary**
*Dataset: Company Financials | Observations: 4000 | True anomalies: 30*
| Method | Anomalies_Detected | True_Positives | Precision | Recall |
|---|---|---|---|---|
| Fixed Effects Residuals | 103 | 14 | 13.59 | 46.67 |
| Within-Entity Z-score | 14 | 14 | 100.00 | 46.67 |
| Within-Period Z-score | 42 | 8 | 19.05 | 26.67 |
| Entity Trend Deviation | 28 | 17 | 60.71 | 56.67 |
| Isolation Forest (isotree) | 200 | 13 | 6.50 | 43.33 |
| Random Effects Residuals | 89 | 14 | 15.73 | 46.67 |
| Consensus (3+ votes) | 72 | 13 | 18.06 | 43.33 |
# Create heatmap
panel_matrix <- panel_results %>%
dplyr::select(entity, period, consensus_anomaly) %>%
pivot_wider(names_from = period, values_from = consensus_anomaly, values_fill = FALSE) %>%
dplyr::select(-entity) %>%
as.matrix()
panel_heatmap <- plot_ly(
z = panel_matrix * 1,
type = "heatmap",
colorscale = list(c(0, "lightblue"), c(1, "red")),
showscale = TRUE
) %>%
layout(
title = "Panel Data Anomaly Heatmap - Company Financials",
xaxis = list(title = "Quarter"),
yaxis = list(title = "Company ID")
)
panel_heatmap

# Summary of companies with anomalies
company_anomalies <- panel_results %>%
filter(consensus_anomaly) %>%
group_by(entity) %>%
summarise(
n_anomalies = n(),
quarters_affected = paste(sort(unique(period)), collapse = ", "),
avg_value = mean(value, na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(desc(n_anomalies)) %>%
slice_head(n = 10)
if (nrow(company_anomalies) > 0) {
company_anomalies %>%
gt() %>%
tab_header(
title = "Top Companies with Detected Anomalies",
subtitle = "Ranked by number of anomalous quarters"
) %>%
cols_label(
entity = "Company ID",
n_anomalies = "# Anomalies",
quarters_affected = "Affected Quarters",
avg_value = "Avg Revenue"
) %>%
fmt_number(
columns = avg_value,
decimals = 0
) %>%
tab_style(
style = cell_fill(color = "#FFEBEE"),
locations = cells_body(rows = n_anomalies >= 2)
)
} else {
cat("No consensus anomalies detected in panel data.\n")
}

**Top Companies with Detected Anomalies**
*Ranked by number of anomalous quarters*
| Company ID | # Anomalies | Affected Quarters | Avg Revenue |
|---|---|---|---|
| 2 | 33 | 1, 2, 3, 4, 6, 7, 8, 10, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40 | 16,395 |
| 86 | 19 | 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40 | 17,082 |
| 21 | 6 | 21, 25, 29, 36, 38, 39 | 12,760 |
| 81 | 4 | 37, 38, 39, 40 | 2,621 |
| 7 | 1 | 26 | 21,052 |
| 9 | 1 | 35 | 10,999 |
| 20 | 1 | 23 | 18,516 |
| 41 | 1 | 22 | 16,974 |
| 53 | 1 | 18 | 12,946 |
| 62 | 1 | 29 | 3,434 |
# Find entities with most anomalies for better visualization
entities_with_anomalies <- panel_results %>%
filter(consensus_anomaly) %>%
group_by(entity) %>%
summarise(n_anomalies = n()) %>%
arrange(desc(n_anomalies)) %>%
slice_head(n = 3)
if (nrow(entities_with_anomalies) > 0) {
selected_entities <- entities_with_anomalies$entity
cat("Plotting companies with most anomalies:\n")
cat(" Company", selected_entities[1], ":", entities_with_anomalies$n_anomalies[1], "anomalies\n")
if (length(selected_entities) > 1) {
cat(" Company", selected_entities[2], ":", entities_with_anomalies$n_anomalies[2], "anomalies\n")
}
if (length(selected_entities) > 2) {
cat(" Company", selected_entities[3], ":", entities_with_anomalies$n_anomalies[3], "anomalies\n")
}
} else {
# Fallback to entities with highest variance
selected_entities <- panel_results %>%
group_by(entity) %>%
summarise(variance = var(value, na.rm = TRUE)) %>%
arrange(desc(variance)) %>%
slice_head(n = 3) %>%
pull(entity)
cat("No consensus anomalies found. Plotting companies with highest variance.\n")
}
## Plotting companies with most anomalies:
## Company 2 : 33 anomalies
## Company 86 : 19 anomalies
## Company 21 : 6 anomalies
panel_ts_plot <- plot_ly()
colors <- c('#E74C3C', '#3498DB', '#27AE60') # Red, Blue, Green
for (i in seq_along(selected_entities)) {
ent <- selected_entities[i]
entity_data <- panel_results %>% filter(entity == ent)
panel_ts_plot <- panel_ts_plot %>%
add_trace(
data = entity_data,
x = ~period, y = ~value,
type = 'scatter', mode = 'lines+markers',
name = paste('Company', ent),
line = list(width = 2, color = colors[i]),
marker = list(size = 4, color = colors[i])
) %>%
add_trace(
data = entity_data %>% filter(consensus_anomaly),
x = ~period, y = ~value,
type = 'scatter', mode = 'markers',
name = paste('Anomalies - Company', ent),
marker = list(size = 12, symbol = 'x', color = 'red', line = list(width = 2)),
showlegend = TRUE
)
}
panel_ts_plot <- panel_ts_plot %>%
layout(
title = list(
text = "Panel Data: Companies with Detected Anomalies",
font = list(size = 16, color = '#2C3E50', family = 'Arial, sans-serif')
),
xaxis = list(
title = "Quarter",
showgrid = TRUE,
gridcolor = '#E0E0E0'
),
yaxis = list(
title = "Revenue",
showgrid = TRUE,
gridcolor = '#E0E0E0'
),
hovermode = 'closest',
plot_bgcolor = '#F8F9FA',
paper_bgcolor = 'white'
)
panel_ts_plot

# Combine all summaries
combined_summary <- bind_rows(
ts_summary %>% mutate(Data_Type = "Time Series", Dataset = "Retail Sales", .before = 1),
cs_summary %>% mutate(Data_Type = "Cross-Sectional", Dataset = "Customer Transactions", .before = 1),
panel_summary %>% mutate(Data_Type = "Panel Data", Dataset = "Company Financials", .before = 1)
)
# Display comprehensive table
combined_summary %>%
gt(groupname_col = "Data_Type") %>%
tab_header(
title = "Comprehensive Anomaly Detection Comparison",
subtitle = "Performance Across All Data Types and Methods"
) %>%
fmt_number(
columns = c(Precision, Recall),
decimals = 2
) %>%
tab_style(
style = cell_fill(color = "#E8F4F8"),
locations = cells_body(rows = grepl("Consensus", Method))
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(columns = Method, rows = grepl("Consensus", Method))
)

**Comprehensive Anomaly Detection Comparison**
*Performance Across All Data Types and Methods*
| Dataset | Method | Anomalies_Detected | True_Positives | Precision | Recall |
|---|---|---|---|---|---|
| Time Series | |||||
| Retail Sales | STL + IQR | 59 | 8 | 13.56 | 100.00 |
| Retail Sales | ARIMA Residuals | 14 | 8 | 57.14 | 100.00 |
| Retail Sales | Moving Avg + Z-score | 7 | 7 | 100.00 | 87.50 |
| Retail Sales | Seasonal Hybrid ESD | 0 | 0 | 0.00 | 0.00 |
| Retail Sales | Isolation Forest (isotree) | 55 | 8 | 14.55 | 100.00 |
| Retail Sales | Consensus (3+ votes) | 9 | 8 | 88.89 | 100.00 |
| Cross-Sectional | |||||
| Customer Transactions | Multivariate Z-score | 20 | 20 | 100.00 | 66.67 |
| Customer Transactions | Mahalanobis Distance | 0 | 0 | 0.00 | 0.00 |
| Customer Transactions | Isolation Forest (isotree) | 100 | 30 | 30.00 | 100.00 |
| Customer Transactions | Local Outlier Factor | 44 | 30 | 68.18 | 100.00 |
| Customer Transactions | One-Class SVM | 129 | 30 | 23.26 | 100.00 |
| Customer Transactions | DBSCAN | 12 | 10 | 83.33 | 33.33 |
| Customer Transactions | Consensus (3+ votes) | 36 | 30 | 83.33 | 100.00 |
| Panel Data | |||||
| Company Financials | Fixed Effects Residuals | 103 | 14 | 13.59 | 46.67 |
| Company Financials | Within-Entity Z-score | 14 | 14 | 100.00 | 46.67 |
| Company Financials | Within-Period Z-score | 42 | 8 | 19.05 | 26.67 |
| Company Financials | Entity Trend Deviation | 28 | 17 | 60.71 | 56.67 |
| Company Financials | Isolation Forest (isotree) | 200 | 13 | 6.50 | 43.33 |
| Company Financials | Random Effects Residuals | 89 | 14 | 15.73 | 46.67 |
| Company Financials | Consensus (3+ votes) | 72 | 13 | 18.06 | 43.33 |
precision_plot <- plot_ly(
data = combined_summary,
x = ~Method,
y = ~Precision,
color = ~Data_Type,
type = 'bar',
text = ~paste0(round(Precision, 1), "%"),
textposition = 'outside'
) %>%
layout(
title = "Precision Comparison: All Methods and Data Types",
xaxis = list(title = ""),
yaxis = list(title = "Precision (%)"),
barmode = 'group',
showlegend = TRUE
)
precision_plot

recall_plot <- plot_ly(
data = combined_summary,
x = ~Method,
y = ~Recall,
color = ~Data_Type,
type = 'bar',
text = ~paste0(round(Recall, 1), "%"),
textposition = 'outside'
) %>%
layout(
title = "Recall Comparison: All Methods and Data Types",
xaxis = list(title = ""),
yaxis = list(title = "Recall (%)"),
barmode = 'group',
showlegend = TRUE
)
recall_plot

# Find best methods by data type
best_precision <- combined_summary %>%
group_by(Data_Type) %>%
filter(Method != "Consensus (3+ votes)") %>%
slice_max(Precision, n = 1) %>%
dplyr::select(Data_Type, Dataset, Method, Precision)
best_recall <- combined_summary %>%
group_by(Data_Type) %>%
filter(Method != "Consensus (3+ votes)") %>%
slice_max(Recall, n = 1) %>%
dplyr::select(Data_Type, Dataset, Method, Recall)
cat("Best Methods by Precision:\n")
## Best Methods by Precision:
## # A tibble: 3 × 4
## # Groups: Data_Type [3]
## Data_Type Dataset Method Precision
## <chr> <chr> <chr> <dbl>
## 1 Cross-Sectional Customer Transactions Multivariate Z-score 100
## 2 Panel Data Company Financials Within-Entity Z-score 100
## 3 Time Series Retail Sales Moving Avg + Z-score 100
##
## Best Methods by Recall:
## # A tibble: 7 × 4
## # Groups: Data_Type [3]
## Data_Type Dataset Method Recall
## <chr> <chr> <chr> <dbl>
## 1 Cross-Sectional Customer Transactions Isolation Forest (isotree) 100
## 2 Cross-Sectional Customer Transactions Local Outlier Factor 100
## 3 Cross-Sectional Customer Transactions One-Class SVM 100
## 4 Panel Data Company Financials Entity Trend Deviation 56.7
## 5 Time Series Retail Sales STL + IQR 100
## 6 Time Series Retail Sales ARIMA Residuals 100
## 7 Time Series Retail Sales Isolation Forest (isotree) 100
consensus_results <- combined_summary %>%
filter(Method == "Consensus (3+ votes)") %>%
dplyr::select(Data_Type, Dataset, Precision, Recall)
cat("\nConsensus Method Performance:\n")
##
## Consensus Method Performance:
## # A tibble: 3 × 4
## Data_Type Dataset Precision Recall
## <chr> <chr> <dbl> <dbl>
## 1 Time Series Retail Sales 88.9 100
## 2 Cross-Sectional Customer Transactions 83.3 100
## 3 Panel Data Company Financials 18.1 43.3
Based on the analysis, the key tuning parameters are:
- ntrees: more trees = more stable but slower (default: 100)
- sample_size: smaller = faster but less accurate (default: 256)
- quantile: lower = fewer anomalies (default: 0.05 = bottom 5%)
- min_votes: higher = fewer false positives but lower recall (default: 3)
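As an illustration of how these knobs could be exposed, here is a hedged sketch of a single wrapper around isotree::isolation.forest (the function and argument names are ours); min_votes corresponds to the consensus threshold shown in each section.

# Hedged sketch: parameterized Isolation Forest detector.
detect_iso_anomalies <- function(X, ntrees = 100, sample_size = 256,
                                 quantile_cutoff = 0.05) {
  model  <- isolation.forest(X, ndim = ncol(X), ntrees = ntrees,
                             sample_size = sample_size)
  depths <- predict(model, X, type = "avg_depth")
  depths < quantile(depths, quantile_cutoff)   # shorter average depth = more anomalous
}
# Example: rerun the cross-sectional detector with more trees and a stricter cutoff
# cs_data$iso_anomaly <- detect_iso_anomalies(features, ntrees = 300, quantile_cutoff = 0.02)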
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gridExtra_2.3 TTR_0.24.4 xts_0.14.1
## [4] zoo_1.8-14 outliers_0.15 robustbase_0.99-6
## [7] robust_0.7-5 fit.models_0.64 e1071_1.7-16
## [10] randomForest_4.7-1.2 caret_7.0-1 lattice_0.22-5
## [13] panelr_0.7.8 lme4_1.1-38 Matrix_1.7-4
## [16] plm_2.6-7 dbscan_1.2.3 isotree_0.6.1-4
## [19] timetk_2.9.1 anomalize_0.3.0 forecast_8.24.0
## [22] tsoutliers_0.6-10 plotly_4.11.0 gt_1.1.0
## [25] lubridate_1.9.4 forcats_1.0.1 stringr_1.5.2
## [28] dplyr_1.1.4 purrr_1.1.0 readr_2.1.6
## [31] tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.0
## [34] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1 jsonlite_2.0.0
## [4] magrittr_2.0.4 farver_2.1.2 nloptr_2.2.1
## [7] rmarkdown_2.30 fs_1.6.6 vctrs_0.6.5
## [10] minqa_1.2.8 htmltools_0.5.8.1 curl_7.0.0
## [13] broom_1.0.10 Formula_1.2-5 pROC_1.19.0.1
## [16] sass_0.4.10 parallelly_1.45.1 bslib_0.9.0
## [19] htmlwidgets_1.6.4 plyr_1.8.9 sandwich_3.1-1
## [22] cachem_1.1.0 lifecycle_1.0.4 iterators_1.0.14
## [25] pkgconfig_2.0.3 R6_2.6.1 fastmap_1.2.0
## [28] rbibutils_2.4 future_1.68.0 collapse_2.1.5
## [31] digest_0.6.37 colorspace_2.1-2 miscTools_0.6-28
## [34] furrr_0.3.1 crosstalk_1.2.2 labeling_0.4.3
## [37] timechange_0.3.0 httr_1.4.7 compiler_4.5.1
## [40] proxy_0.4-28 bit64_4.6.0-1 withr_3.0.2
## [43] pander_0.6.6 S7_0.2.0 backports_1.5.0
## [46] tseries_0.10-58 broom.mixed_0.2.9.6 MASS_7.3-65
## [49] lava_1.8.2 ModelMetrics_1.2.2.2 tools_4.5.1
## [52] rrcov_1.7-7 lmtest_0.9-40 quantmod_0.4.28
## [55] future.apply_1.20.1 nnet_7.3-20 glue_1.8.0
## [58] quadprog_1.5-8 nlme_3.1-168 grid_4.5.1
## [61] reshape2_1.4.5 generics_0.1.4 recipes_1.3.1
## [64] gtable_0.3.6 tzdb_0.5.0 class_7.3-23
## [67] data.table_1.17.8 hms_1.1.4 rsample_1.3.1
## [70] utf8_1.2.6 xml2_1.4.0 foreach_1.5.2
## [73] pillar_1.11.1 vroom_1.6.7 splines_4.5.1
## [76] bit_4.6.0 survival_3.8-3 tidyselect_1.2.1
## [79] knitr_1.50 reformulas_0.4.2 urca_1.3-4
## [82] RhpcBLASctl_0.23-42 stats4_4.5.1 xfun_0.53
## [85] hardhat_1.4.2 timeDate_4051.111 DEoptimR_1.1-4
## [88] stringi_1.8.7 lazyeval_0.2.2 yaml_2.3.10
## [91] boot_1.3-31 evaluate_1.0.5 codetools_0.2-19
## [94] cli_3.6.5 rpart_4.1.24 Rdpack_2.6.4
## [97] jquerylib_0.1.4 Rcpp_1.1.0 globals_0.18.0
## [100] bdsmatrix_1.3-7 parallel_4.5.1 fracdiff_1.5-3
## [103] gower_1.0.2 listenv_0.10.0 mvtnorm_1.3-3
## [106] viridisLite_0.4.2 ipred_0.9-15 scales_1.4.0
## [109] prodlim_2025.04.28 pcaPP_2.0-5 crayon_1.5.3
## [112] maxLik_1.5-2.1 rlang_1.1.6 jtools_2.3.0
This analysis demonstrates a comprehensive approach to anomaly detection across three data structures: time series, cross-sectional, and panel data.
The modular approach allows methods to be mixed and matched based on:
- Data characteristics
- Computational resources
- The required precision/recall trade-off
- Domain-specific requirements
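A minimal sketch of this modular pattern, assuming detectors are written as interchangeable functions from a numeric vector to a logical flag (all names below are illustrative):

# Hedged sketch: a small registry of detector functions combined by consensus.
detectors <- list(
  zscore = function(x) abs(as.numeric(scale(x))) > 3,
  iqr    = function(x) abs(x - median(x, na.rm = TRUE)) > 1.5 * IQR(x, na.rm = TRUE)
)

run_detectors <- function(x, detectors, min_votes = 2) {
  votes <- vapply(detectors, function(f) f(x), logical(length(x)))
  rowSums(votes, na.rm = TRUE) >= min_votes
}
# Example on the retail sales series:
# flags <- run_detectors(ts_data$value, detectors, min_votes = 2)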
Next Steps:
1. Apply to your own data
2. Tune thresholds based on requirements
3. Combine with domain knowledge for validation
4. Deploy the best-performing methods in production
Analysis completed using R and the isotree package for Isolation Forest implementation.