This R Markdown mirrors the structure of the original Python notebook, but translates the workflow into R code chunks. The goal is still the same: build a fraud detection model from raw transaction data, evaluate it properly, tune the threshold, inspect feature importance, and save the model for later deployment.
We have a dataset of credit card transactions where each row is labelled:
is_fraud = 0 for legitimateis_fraud = 1 for fraudOur goal is to train a model that scores a brand-new transaction and estimates the probability that it is fraudulent.
The challenge is class imbalance. Fraud is rare, so a model that always predicts “not fraud” can still look accurate. That is why we focus on ROC-AUC, PR-AUC, precision, recall, F1, and threshold tuning instead of relying on accuracy alone.
XGBoost is a tree-based gradient boosting algorithm. In plain English:
For tabular fraud data, XGBoost is often a very strong baseline because it handles nonlinear feature interactions well and performs strongly even before heavy tuning.
# If needed, install packages once:
# install.packages(c(
# "tidyverse", "lubridate", "xgboost", "caret",
# "pROC", "PRROC", "Matrix", "knitr"
# ))
library(tidyverse)
library(lubridate)
library(xgboost)
library(caret)
library(pROC)
library(PRROC)
library(Matrix)
library(knitr)
theme_set(theme_minimal(base_size = 12))
options(scipen = 999)
legit_col <- "#4682B4"
fraud_col <- "#DC143C"
navy_col <- "#000080"
orange_col <- "#FFA500"
cat("All libraries loaded successfully!\n")
## All libraries loaded successfully!
cat("xgboost version:", as.character(packageVersion("xgboost")), "\n")
## xgboost version: 1.7.11.1
We will also define a few helper functions for evaluation.
precision_fn <- function(actual, pred) {
tp <- sum(actual == 1 & pred == 1)
fp <- sum(actual == 0 & pred == 1)
if ((tp + fp) == 0) return(0)
tp / (tp + fp)
}
recall_fn <- function(actual, pred) {
tp <- sum(actual == 1 & pred == 1)
fn <- sum(actual == 1 & pred == 0)
if ((tp + fn) == 0) return(0)
tp / (tp + fn)
}
f1_fn <- function(actual, pred) {
p <- precision_fn(actual, pred)
r <- recall_fn(actual, pred)
if ((p + r) == 0) return(0)
2 * p * r / (p + r)
}
DATA_PATH <- "/Users/daniyalzafar/python journey/Data Science Road Map/XGboost/archive (1)/fraudTrain.csv"
df <- read_csv(DATA_PATH, n_max = 200000, show_col_types = FALSE) |>
select(-any_of("Unnamed: 0"))
## New names:
## • `` -> `...1`
cat("Dataset shape:", dim(df)[1], "rows x", dim(df)[2], "columns\n")
## Dataset shape: 200000 rows x 23 columns
head(df)
## # A tibble: 6 × 23
## ...1 trans_date_trans_time cc_num merchant category amt first last
## <dbl> <dttm> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 0 2019-01-01 00:00:18 2.70e15 fraud_Rippin,… misc_net 4.97 Jenn… Banks
## 2 1 2019-01-01 00:00:44 6.30e11 fraud_Heller,… grocery… 107. Step… Gill
## 3 2 2019-01-01 00:00:51 3.89e13 fraud_Lind-Bu… enterta… 220. Edwa… Sanc…
## 4 3 2019-01-01 00:01:16 3.53e15 fraud_Kutch, … gas_tra… 45 Jere… White
## 5 4 2019-01-01 00:03:06 3.76e14 fraud_Keeling… misc_pos 42.0 Tyler Garc…
## 6 5 2019-01-01 00:04:08 4.77e15 fraud_Stroman… gas_tra… 94.6 Jenn… Conn…
## # ℹ 15 more variables: gender <chr>, street <chr>, city <chr>, state <chr>,
## # zip <dbl>, lat <dbl>, long <dbl>, city_pop <dbl>, job <chr>, dob <date>,
## # trans_num <chr>, unix_time <dbl>, merch_lat <dbl>, merch_long <dbl>,
## # is_fraud <dbl>
glimpse(df)
## Rows: 200,000
## Columns: 23
## $ ...1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ trans_date_trans_time <dttm> 2019-01-01 00:00:18, 2019-01-01 00:00:44, 2019-…
## $ cc_num <dbl> 2703186189652095, 630423337322, 38859492057661, …
## $ merchant <chr> "fraud_Rippin, Kub and Mann", "fraud_Heller, Gut…
## $ category <chr> "misc_net", "grocery_pos", "entertainment", "gas…
## $ amt <dbl> 4.97, 107.23, 220.11, 45.00, 41.96, 94.63, 44.54…
## $ first <chr> "Jennifer", "Stephanie", "Edward", "Jeremy", "Ty…
## $ last <chr> "Banks", "Gill", "Sanchez", "White", "Garcia", "…
## $ gender <chr> "F", "F", "M", "M", "M", "F", "F", "M", "F", "F"…
## $ street <chr> "561 Perry Cove", "43039 Riley Greens Suite 393"…
## $ city <chr> "Moravian Falls", "Orient", "Malad City", "Bould…
## $ state <chr> "NC", "WA", "ID", "MT", "VA", "PA", "KS", "VA", …
## $ zip <dbl> 28654, 99160, 83252, 59632, 24433, 18917, 67851,…
## $ lat <dbl> 36.0788, 48.8878, 42.1808, 46.2306, 38.4207, 40.…
## $ long <dbl> -81.1781, -118.2105, -112.2620, -112.1138, -79.4…
## $ city_pop <dbl> 3495, 149, 4154, 1939, 99, 2158, 2691, 6018, 147…
## $ job <chr> "Psychologist, counselling", "Special educationa…
## $ dob <date> 1988-03-09, 1978-06-21, 1962-01-19, 1967-01-12,…
## $ trans_num <chr> "0b242abb623afc578575680df30655b9", "1f76529f857…
## $ unix_time <dbl> 1325376018, 1325376044, 1325376051, 1325376076, …
## $ merch_lat <dbl> 36.01129, 49.15905, 43.15070, 47.03433, 38.67500…
## $ merch_long <dbl> -82.04832, -118.18646, -112.15448, -112.56107, -…
## $ is_fraud <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
missing_summary <- colSums(is.na(df))
if (sum(missing_summary) == 0) {
cat("No missing values — clean dataset!\n")
} else {
missing_summary[missing_summary > 0]
}
## No missing values — clean dataset!
fraud_counts <- df |>
count(is_fraud) |>
mutate(label = if_else(is_fraud == 1, "Fraud", "Legitimate"))
fraud_rate <- mean(df$is_fraud) * 100
cat("Legitimate transactions:", fraud_counts$n[fraud_counts$is_fraud == 0], "\n")
## Legitimate transactions: 198355
cat("Fraudulent transactions:", fraud_counts$n[fraud_counts$is_fraud == 1], "\n")
## Fraudulent transactions: 1645
cat("Fraud rate:", round(fraud_rate, 2), "%\n")
## Fraud rate: 0.82 %
cat("Always predicting 'not fraud' would look", round(100 - fraud_rate, 2), "% accurate.\n")
## Always predicting 'not fraud' would look 99.18 % accurate.
par(mfrow = c(1, 2))
barplot(
height = fraud_counts$n,
names.arg = fraud_counts$label,
col = c(legit_col, fraud_col),
main = "Transaction Count by Class",
ylab = "Count"
)
pie(
fraud_counts$n,
labels = paste0(fraud_counts$label, " (", round(100 * fraud_counts$n / sum(fraud_counts$n), 2), "%)"),
col = c(legit_col, fraud_col),
main = "Proportion of Fraud"
)
par(mfrow = c(1, 1))
par(mfrow = c(1, 2))
hist(
log1p(df$amt[df$is_fraud == 0]),
breaks = 60,
col = grDevices::adjustcolor(legit_col, alpha.f = 0.7),
main = "Log(Amount) Distribution by Class",
xlab = "log(1 + Amount)",
ylab = "Frequency"
)
hist(
log1p(df$amt[df$is_fraud == 1]),
breaks = 60,
col = grDevices::adjustcolor(fraud_col, alpha.f = 0.7),
add = TRUE
)
legend("topright", legend = c("Legitimate", "Fraud"), fill = c(legit_col, fraud_col))
boxplot(
amt ~ is_fraud,
data = df,
col = c(legit_col, fraud_col),
main = "Amount by Fraud Label",
xlab = "is_fraud (0 = Legit, 1 = Fraud)",
ylab = "Transaction Amount ($)"
)
par(mfrow = c(1, 1))
cat(
"Average amount -- Legit: $",
round(mean(df$amt[df$is_fraud == 0]), 2),
" | Fraud: $",
round(mean(df$amt[df$is_fraud == 1]), 2),
"\n",
sep = ""
)
## Average amount -- Legit: $67.54 | Fraud: $509.35
fraud_by_cat <- df |>
group_by(category) |>
summarise(
fraud_count = sum(is_fraud),
total = n(),
fraud_rate = 100 * fraud_count / total,
.groups = "drop"
) |>
arrange(desc(fraud_rate))
kable(fraud_by_cat)
## Warning: 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning: 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
| category | fraud_count | total | fraud_rate |
|---|---|---|---|
| shopping_net | 354 | 15237 | 2.3232920 |
| grocery_pos | 390 | 19139 | 2.0377240 |
| misc_net | 189 | 9799 | 1.9287682 |
| shopping_pos | 175 | 18055 | 0.9692606 |
| gas_transport | 150 | 20398 | 0.7353662 |
| travel | 31 | 6243 | 0.4965561 |
| misc_pos | 57 | 12176 | 0.4681340 |
| grocery_net | 30 | 6981 | 0.4297379 |
| kids_pets | 64 | 17234 | 0.3713589 |
| personal_care | 51 | 13892 | 0.3671178 |
| entertainment | 48 | 14415 | 0.3329865 |
| health_fitness | 36 | 13397 | 0.2687169 |
| food_dining | 33 | 14104 | 0.2339762 |
| home | 37 | 18930 | 0.1954569 |
ggplot(fraud_by_cat, aes(x = reorder(category, fraud_rate), y = fraud_rate)) +
geom_col(fill = fraud_col, color = "black", alpha = 0.85) +
coord_flip() +
labs(
title = "Fraud Rate by Merchant Category",
x = NULL,
y = "Fraud Rate (%)"
)
Feature engineering is what turns raw transaction data into model-ready signals.
df <- df |>
mutate(
trans_date_trans_time = ymd_hms(trans_date_trans_time),
hour = hour(trans_date_trans_time),
day_of_week = wday(trans_date_trans_time, week_start = 1) - 1,
month = month(trans_date_trans_time),
is_weekend = if_else(day_of_week >= 5, 1L, 0L),
is_night = if_else(hour >= 22 | hour <= 5, 1L, 0L)
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `trans_date_trans_time = ymd_hms(trans_date_trans_time)`.
## Caused by warning:
## ! 4 failed to parse.
df |>
select(trans_date_trans_time, hour, day_of_week, month, is_weekend, is_night) |>
head()
## # A tibble: 6 × 6
## trans_date_trans_time hour day_of_week month is_weekend is_night
## <dttm> <int> <dbl> <dbl> <int> <int>
## 1 2019-01-01 00:00:18 0 1 1 0 1
## 2 2019-01-01 00:00:44 0 1 1 0 1
## 3 2019-01-01 00:00:51 0 1 1 0 1
## 4 2019-01-01 00:01:16 0 1 1 0 1
## 5 2019-01-01 00:03:06 0 1 1 0 1
## 6 2019-01-01 00:04:08 0 1 1 0 1
fraud_by_hour <- df |>
group_by(hour) |>
summarise(fraud_rate = 100 * mean(is_fraud), .groups = "drop")
ggplot(fraud_by_hour, aes(x = hour, y = fraud_rate)) +
geom_line(color = fraud_col, linewidth = 1.2) +
geom_point(color = fraud_col, size = 2) +
geom_area(fill = fraud_col, alpha = 0.2) +
scale_x_continuous(breaks = 0:23) +
labs(
title = "Fraud Rate by Hour of Day",
x = "Hour of Day (0 = midnight)",
y = "Fraud Rate (%)"
)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
df <- df |>
mutate(
dob = ymd(dob),
age = floor(as.numeric(difftime(trans_date_trans_time, dob, units = "days")) / 365)
)
summary(df$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.00 32.00 43.00 44.97 56.00 94.00 4
df <- df |>
mutate(
distance_from_home = sqrt((lat - merch_lat)^2 + (long - merch_long)^2)
)
df |>
group_by(is_fraud) |>
summarise(
n = n(),
mean_distance = mean(distance_from_home),
median_distance = median(distance_from_home),
sd_distance = sd(distance_from_home),
.groups = "drop"
)
## # A tibble: 2 × 5
## is_fraud n mean_distance median_distance sd_distance
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 0 198355 0.767 0.799 0.284
## 2 1 1645 0.762 0.797 0.286
gender_levels <- sort(unique(df$gender))
category_levels <- sort(unique(df$category))
df <- df |>
mutate(
gender_enc = as.integer(factor(gender, levels = gender_levels)) - 1L,
category_enc = as.integer(factor(category, levels = category_levels)) - 1L
)
gender_map <- tibble(gender = gender_levels, gender_enc = seq_along(gender_levels) - 1L)
category_map <- tibble(category = category_levels, category_enc = seq_along(category_levels) - 1L)
gender_map
## # A tibble: 2 × 2
## gender gender_enc
## <chr> <int>
## 1 F 0
## 2 M 1
category_map
## # A tibble: 14 × 2
## category category_enc
## <chr> <int>
## 1 entertainment 0
## 2 food_dining 1
## 3 gas_transport 2
## 4 grocery_net 3
## 5 grocery_pos 4
## 6 health_fitness 5
## 7 home 6
## 8 kids_pets 7
## 9 misc_net 8
## 10 misc_pos 9
## 11 personal_care 10
## 12 shopping_net 11
## 13 shopping_pos 12
## 14 travel 13
FEATURES <- c(
"amt", "category_enc", "gender_enc", "city_pop", "age",
"hour", "day_of_week", "month", "is_weekend", "is_night",
"distance_from_home", "lat", "long", "merch_lat", "merch_long", "zip"
)
TARGET <- "is_fraud"
X <- df |>
select(all_of(FEATURES))
y <- df[[TARGET]]
cat("Feature matrix X shape:", dim(X)[1], "rows x", dim(X)[2], "columns\n")
## Feature matrix X shape: 200000 rows x 16 columns
cat("Target length:", length(y), "\n")
## Target length: 200000
We want a fair evaluation, so we keep a holdout test set. Because fraud is rare, the split should preserve the fraud rate.
set.seed(42)
train_idx <- createDataPartition(y, p = 0.80, list = FALSE)
X_train <- X[train_idx, ]
X_test <- X[-train_idx, ]
y_train <- y[train_idx]
y_test <- y[-train_idx]
cat("Training set:", nrow(X_train), "rows\n")
## Training set: 160000 rows
cat("Test set:", nrow(X_test), "rows\n")
## Test set: 40000 rows
cat("Fraud rate -- Train:", round(mean(y_train) * 100, 3), "%\n")
## Fraud rate -- Train: 0.828 %
cat("Fraud rate -- Test:", round(mean(y_test) * 100, 3), "%\n")
## Fraud rate -- Test: 0.803 %
The Python notebook uses scale_pos_weight. We will
compute the same idea in R.
neg_count <- sum(y_train == 0)
pos_count <- sum(y_train == 1)
scale_pos_weight <- neg_count / pos_count
cat("Legitimate (0):", neg_count, "\n")
## Legitimate (0): 158676
cat("Fraudulent (1):", pos_count, "\n")
## Fraudulent (1): 1324
cat("scale_pos_weight =", round(scale_pos_weight, 1), "\n")
## scale_pos_weight = 119.8
train_matrix <- data.matrix(X_train)
test_matrix <- data.matrix(X_test)
dtrain <- xgb.DMatrix(data = train_matrix, label = y_train)
dtest <- xgb.DMatrix(data = test_matrix, label = y_test)
params <- list(
objective = "binary:logistic",
eval_metric = "aucpr",
max_depth = 6,
eta = 0.1,
subsample = 0.8,
colsample_bytree = 0.8,
scale_pos_weight = scale_pos_weight,
tree_method = "hist"
)
watchlist <- list(train = dtrain, eval = dtest)
model <- xgb.train(
params = params,
data = dtrain,
nrounds = 300,
watchlist = watchlist,
early_stopping_rounds = 20,
print_every_n = 50,
verbose = 1
)
## [1] train-aucpr:0.606590 eval-aucpr:0.598611
## Multiple eval metrics are present. Will use eval_aucpr for early stopping.
## Will train until eval_aucpr hasn't improved in 20 rounds.
##
## [51] train-aucpr:0.928626 eval-aucpr:0.872476
## [101] train-aucpr:0.985370 eval-aucpr:0.928143
## [151] train-aucpr:0.996952 eval-aucpr:0.946495
## [201] train-aucpr:0.998987 eval-aucpr:0.955790
## [251] train-aucpr:0.999741 eval-aucpr:0.960118
## [300] train-aucpr:0.999821 eval-aucpr:0.963620
eval_log <- model$evaluation_log
ggplot(eval_log, aes(x = iter, y = eval_aucpr)) +
geom_line(color = legit_col, linewidth = 1.2) +
geom_vline(xintercept = model$best_iteration, color = fraud_col, linetype = "dashed") +
labs(
title = "Learning Curve — Model Improves with More Trees",
x = "Number of Trees (Rounds)",
y = "AUCPR"
)
y_proba <- predict(model, dtest)
y_pred <- if_else(y_proba >= 0.5, 1L, 0L)
sample_predictions <- tibble(
Actual = y_test[1:10],
Predicted_Label = y_pred[1:10],
Fraud_Probability = round(y_proba[1:10], 4)
)
sample_predictions
## # A tibble: 10 × 3
## Actual Predicted_Label Fraud_Probability
## <dbl> <int> <dbl>
## 1 0 0 0.0002
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## 7 0 0 0.0002
## 8 0 0 0.0002
## 9 0 0 0.0003
## 10 0 0 0
cm <- table(
Actual = factor(y_test, levels = c(0, 1)),
Predicted = factor(y_pred, levels = c(0, 1))
)
cm
## Predicted
## Actual 0 1
## 0 39631 48
## 1 26 295
TN <- cm[1, 1]
FP <- cm[1, 2]
FN <- cm[2, 1]
TP <- cm[2, 2]
cat("True Negatives :", TN, "\n")
## True Negatives : 39631
cat("False Positives:", FP, "\n")
## False Positives: 48
cat("False Negatives:", FN, "\n")
## False Negatives: 26
cat("True Positives :", TP, "\n")
## True Positives : 295
cm_df <- as.data.frame(cm)
ggplot(cm_df, aes(x = Predicted, y = Actual, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = scales::comma(Freq)), color = "white", size = 5) +
scale_fill_gradient(low = legit_col, high = navy_col) +
labs(title = "Confusion Matrix", x = "Predicted", y = "Actual")
classification_report <- tibble(
Metric = c("Precision", "Recall", "F1"),
Fraud = c(
precision_fn(y_test, y_pred),
recall_fn(y_test, y_pred),
f1_fn(y_test, y_pred)
)
)
classification_report
## # A tibble: 3 × 2
## Metric Fraud
## <chr> <dbl>
## 1 Precision 0.860
## 2 Recall 0.919
## 3 F1 0.889
roc_obj <- roc(y_test, y_proba, quiet = TRUE)
roc_auc <- as.numeric(auc(roc_obj))
pr_obj <- pr.curve(
scores.class0 = y_proba[y_test == 1],
scores.class1 = y_proba[y_test == 0],
curve = TRUE
)
pr_auc <- pr_obj$auc.integral
baseline <- mean(y_test)
cat("ROC-AUC:", round(roc_auc, 4), "\n")
## ROC-AUC: 0.9993
cat("PR-AUC :", round(pr_auc, 4), "\n")
## PR-AUC : 0.9636
par(mfrow = c(1, 2))
plot(roc_obj, col = legit_col, main = "ROC Curve", lwd = 2)
abline(a = 0, b = 1, lty = 2)
plot(
pr_obj$curve[, 1],
pr_obj$curve[, 2],
type = "l",
lwd = 2,
col = fraud_col,
xlab = "Recall",
ylab = "Precision",
main = "Precision-Recall Curve"
)
abline(h = baseline, lty = 2)
par(mfrow = c(1, 1))
XGBoost returns a probability. We still have to decide where the cutoff should be.
thresholds <- seq(0.01, 0.99, by = 0.01)
threshold_metrics <- map_dfr(thresholds, function(thresh) {
pred_t <- if_else(y_proba >= thresh, 1L, 0L)
tibble(
threshold = thresh,
precision = precision_fn(y_test, pred_t),
recall = recall_fn(y_test, pred_t),
f1 = f1_fn(y_test, pred_t),
fp = sum(y_test == 0 & pred_t == 1),
fn = sum(y_test == 1 & pred_t == 0)
)
})
best_threshold <- threshold_metrics$threshold[which.max(threshold_metrics$f1)]
best_row <- threshold_metrics |> slice_max(f1, n = 1)
best_row
## # A tibble: 1 × 6
## threshold precision recall f1 fp fn
## <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 0.89 0.972 0.875 0.921 8 40
threshold_metrics |>
select(threshold, precision, recall, f1) |>
pivot_longer(-threshold, names_to = "metric", values_to = "value") |>
ggplot(aes(x = threshold, y = value, color = metric)) +
geom_line(linewidth = 1.1) +
geom_vline(xintercept = best_threshold, linetype = "dashed", color = orange_col) +
labs(
title = "Precision, Recall, and F1 vs Threshold",
x = "Decision Threshold",
y = "Score"
)
threshold_comparison <- tibble(
threshold = c(0.50, best_threshold)
) |>
mutate(
label = c("Default (0.50)", paste0("Optimised (", round(best_threshold, 2), ")")),
precision = map_dbl(threshold, ~ precision_fn(y_test, if_else(y_proba >= .x, 1L, 0L))),
recall = map_dbl(threshold, ~ recall_fn(y_test, if_else(y_proba >= .x, 1L, 0L))),
f1 = map_dbl(threshold, ~ f1_fn(y_test, if_else(y_proba >= .x, 1L, 0L))),
missed_fraud = map_dbl(threshold, ~ sum(y_test == 1 & if_else(y_proba >= .x, 1L, 0L) == 0))
)
threshold_comparison
## # A tibble: 2 × 6
## threshold label precision recall f1 missed_fraud
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 0.5 Default (0.50) 0.860 0.919 0.889 26
## 2 0.89 Optimised (0.89) 0.972 0.875 0.921 40
XGBoost can show which features contributed most to the model globally.
importance_df <- xgb.importance(
feature_names = FEATURES,
model = model
)
importance_df
## Feature Gain Cover Frequency
## 1: amt 0.6055229204 0.3238502718 0.148816421
## 2: category_enc 0.1061410812 0.1622812070 0.103790763
## 3: is_night 0.0795619723 0.0703708278 0.015394802
## 4: hour 0.0671585790 0.0804807796 0.072421784
## 5: age 0.0373252134 0.0657677820 0.107101473
## 6: city_pop 0.0261661784 0.0632559251 0.101307731
## 7: lat 0.0144105819 0.0422596475 0.058434034
## 8: zip 0.0118843574 0.0438375992 0.055785466
## 9: long 0.0115559102 0.0293428939 0.056944215
## 10: merch_lat 0.0110401418 0.0355245822 0.061579209
## 11: distance_from_home 0.0080094210 0.0237799872 0.076808475
## 12: merch_long 0.0064281968 0.0167487123 0.048419136
## 13: day_of_week 0.0061256377 0.0232942294 0.042956464
## 14: month 0.0055559068 0.0149150571 0.034431386
## 15: gender_enc 0.0029010928 0.0040273935 0.013408376
## 16: is_weekend 0.0002128089 0.0002631043 0.002400265
xgb.plot.importance(
importance_matrix = importance_df,
top_n = 16,
rel_to_first = TRUE,
xlab = "Relative Importance"
)
We can also rank by gain share explicitly.
importance_ranked <- importance_df |>
as_tibble() |>
arrange(desc(Gain)) |>
mutate(
Gain_pct = 100 * Gain / sum(Gain),
Cumulative_pct = cumsum(Gain_pct)
)
importance_ranked
## # A tibble: 16 × 7
## Feature Gain Cover Frequency Importance Gain_pct Cumulative_pct
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 amt 6.06e-1 3.24e-1 0.149 0.606 60.6 60.6
## 2 category_enc 1.06e-1 1.62e-1 0.104 0.106 10.6 71.2
## 3 is_night 7.96e-2 7.04e-2 0.0154 0.0796 7.96 79.1
## 4 hour 6.72e-2 8.05e-2 0.0724 0.0672 6.72 85.8
## 5 age 3.73e-2 6.58e-2 0.107 0.0373 3.73 89.6
## 6 city_pop 2.62e-2 6.33e-2 0.101 0.0262 2.62 92.2
## 7 lat 1.44e-2 4.23e-2 0.0584 0.0144 1.44 93.6
## 8 zip 1.19e-2 4.38e-2 0.0558 0.0119 1.19 94.8
## 9 long 1.16e-2 2.93e-2 0.0569 0.0116 1.16 96.0
## 10 merch_lat 1.10e-2 3.55e-2 0.0616 0.0110 1.10 97.1
## 11 distance_from_h… 8.01e-3 2.38e-2 0.0768 0.00801 0.801 97.9
## 12 merch_long 6.43e-3 1.67e-2 0.0484 0.00643 0.643 98.5
## 13 day_of_week 6.13e-3 2.33e-2 0.0430 0.00613 0.613 99.1
## 14 month 5.56e-3 1.49e-2 0.0344 0.00556 0.556 99.7
## 15 gender_enc 2.90e-3 4.03e-3 0.0134 0.00290 0.290 100.
## 16 is_weekend 2.13e-4 2.63e-4 0.00240 0.000213 0.0213 100
The original notebook tested several max_depth values.
We can do the same in R with cross-validation.
depths <- 3:8
cv_scores <- map_dfr(depths, function(depth) {
cv <- xgb.cv(
params = list(
objective = "binary:logistic",
eval_metric = "auc",
max_depth = depth,
eta = 0.1,
subsample = 0.8,
colsample_bytree = 0.8,
scale_pos_weight = scale_pos_weight,
tree_method = "hist"
),
data = dtrain,
nrounds = 100,
nfold = 3,
stratified = TRUE,
early_stopping_rounds = 10,
verbose = 0
)
best_idx <- which.max(cv$evaluation_log$test_auc_mean)
tibble(
depth = depth,
mean_auc = cv$evaluation_log$test_auc_mean[best_idx],
std_auc = cv$evaluation_log$test_auc_std[best_idx]
)
})
cv_scores
## # A tibble: 6 × 3
## depth mean_auc std_auc
## <int> <dbl> <dbl>
## 1 3 0.992 0.000900
## 2 4 0.995 0.000978
## 3 5 0.996 0.000956
## 4 6 0.996 0.000644
## 5 7 0.997 0.00119
## 6 8 0.997 0.000539
best_depth <- cv_scores$depth[which.max(cv_scores$mean_auc)]
cat("Best max_depth:", best_depth, "\n")
## Best max_depth: 8
final_params <- list(
objective = "binary:logistic",
eval_metric = "aucpr",
max_depth = best_depth,
eta = 0.05,
subsample = 0.8,
colsample_bytree = 0.8,
scale_pos_weight = scale_pos_weight,
tree_method = "hist"
)
final_model <- xgb.train(
params = final_params,
data = dtrain,
nrounds = 500,
watchlist = watchlist,
early_stopping_rounds = 30,
print_every_n = 100,
verbose = 1
)
## [1] train-aucpr:0.740409 eval-aucpr:0.717157
## Multiple eval metrics are present. Will use eval_aucpr for early stopping.
## Will train until eval_aucpr hasn't improved in 30 rounds.
##
## [101] train-aucpr:0.984153 eval-aucpr:0.909707
## [201] train-aucpr:0.999241 eval-aucpr:0.947727
## [301] train-aucpr:0.999808 eval-aucpr:0.958989
## [401] train-aucpr:0.999994 eval-aucpr:0.962826
## Stopping. Best iteration:
## [436] train-aucpr:1.000000 eval-aucpr:0.963454
y_proba_final <- predict(final_model, dtest)
final_roc_auc <- as.numeric(auc(roc(y_test, y_proba_final, quiet = TRUE)))
final_pr_obj <- pr.curve(
scores.class0 = y_proba_final[y_test == 1],
scores.class1 = y_proba_final[y_test == 0],
curve = TRUE
)
final_pr_auc <- final_pr_obj$auc.integral
cat("FINAL TUNED MODEL RESULTS\n")
## FINAL TUNED MODEL RESULTS
cat("ROC-AUC :", round(final_roc_auc, 4), "\n")
## ROC-AUC : 0.9993
cat("PR-AUC :", round(final_pr_auc, 4), "\n")
## PR-AUC : 0.9635
cat("Best iteration:", final_model$best_iteration, "\n")
## Best iteration: 436
MODEL_PATH <- "/Users/daniyalzafar/python journey/Data Science Road Map/XGboost/fraud_xgboost_model_r.model"
RDS_PATH <- "/Users/daniyalzafar/python journey/Data Science Road Map/XGboost/fraud_xgboost_model_r.rds"
xgb.save(final_model, MODEL_PATH)
## [1] TRUE
saveRDS(final_model, RDS_PATH)
cat("Model saved as JSON :", MODEL_PATH, "\n")
## Model saved as JSON : /Users/daniyalzafar/python journey/Data Science Road Map/XGboost/fraud_xgboost_model_r.model
cat("Model saved as RDS :", RDS_PATH, "\n")
## Model saved as RDS : /Users/daniyalzafar/python journey/Data Science Road Map/XGboost/fraud_xgboost_model_r.rds
loaded_model <- xgb.load(MODEL_PATH)
loaded_proba <- predict(loaded_model, dtest)
loaded_auc <- as.numeric(auc(roc(y_test, loaded_proba, quiet = TRUE)))
cat("Loaded model AUC:", round(loaded_auc, 4), "\n")
## Loaded model AUC: 0.9993
The Python notebook used the tuned threshold found earlier and then applied it to the final model. We do the same here.
y_pred_final_opt <- if_else(y_proba_final >= best_threshold, 1L, 0L)
cm_final <- table(
Actual = factor(y_test, levels = c(0, 1)),
Predicted = factor(y_pred_final_opt, levels = c(0, 1))
)
TN_f <- cm_final[1, 1]
FP_f <- cm_final[1, 2]
FN_f <- cm_final[2, 1]
TP_f <- cm_final[2, 2]
p_f <- precision_fn(y_test, y_pred_final_opt)
r_f <- recall_fn(y_test, y_pred_final_opt)
f_f <- f1_fn(y_test, y_pred_final_opt)
scorecard <- tibble(
Metric = c(
"ROC-AUC",
"PR-AUC",
"Decision Threshold",
"Precision",
"Recall",
"F1 Score",
"Fraud caught (TP)",
"Missed fraud (FN)",
"False alarms (FP)"
),
Value = c(
round(final_roc_auc, 4),
round(final_pr_auc, 4),
round(best_threshold, 2),
round(p_f, 4),
round(r_f, 4),
round(f_f, 4),
TP_f,
FN_f,
FP_f
)
)
scorecard
## # A tibble: 9 × 2
## Metric Value
## <chr> <dbl>
## 1 ROC-AUC 0.999
## 2 PR-AUC 0.964
## 3 Decision Threshold 0.89
## 4 Precision 0.986
## 5 Recall 0.847
## 6 F1 Score 0.911
## 7 Fraud caught (TP) 272
## 8 Missed fraud (FN) 49
## 9 False alarms (FP) 4
This workflow shows the full lifecycle in R:
That is the same end-to-end story as the original notebook, now translated into R Markdown so it can be rendered and published directly.