## spc_tbl_ [10,000 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ transaction_id : num [1:10000] 1 2 3 4 5 6 7 8 9 10 ...
## $ customer_id : num [1:10000] 1082 1015 1004 1095 1036 ...
## $ merchant_id : num [1:10000] 2027 2053 2035 2037 2083 ...
## $ amount : num [1:10000] 5759 1902 1249 7619 1890 ...
## $ transaction_time : POSIXct[1:10000], format: "2023-01-01 00:00:00" "2023-01-01 00:00:01" ...
## $ is_fraudulent : num [1:10000] 0 1 1 1 1 1 1 0 0 0 ...
## $ card_type : chr [1:10000] "MasterCard" "Visa" "MasterCard" "Discover" ...
## $ location : chr [1:10000] "City-30" "City-47" "City-6" "City-6" ...
## $ purchase_category : chr [1:10000] "Gas Station" "Online Shopping" "Gas Station" "Travel" ...
## $ customer_age : num [1:10000] 43 61 57 59 36 43 19 40 43 55 ...
## $ transaction_description: chr [1:10000] "Purchase at Merchant-2027" "Purchase at Merchant-2053" "Purchase at Merchant-2035" "Purchase at Merchant-2037" ...
## - attr(*, "spec")=
## .. cols(
## .. transaction_id = col_double(),
## .. customer_id = col_double(),
## .. merchant_id = col_double(),
## .. amount = col_double(),
## .. transaction_time = col_datetime(format = ""),
## .. is_fraudulent = col_double(),
## .. card_type = col_character(),
## .. location = col_character(),
## .. purchase_category = col_character(),
## .. customer_age = col_double(),
## .. transaction_description = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
library(summarytools)
data %>%
select(c(amount, customer_age)) %>%
descr(order = "preserve",
stats = c("min", "q1", "med", "mean", "q3", "max")
)## Descriptive Statistics
## data
## N: 10000
##
## amount customer_age
## ------------ --------- --------------
## Min 10.61 18.00
## Q1 2438.07 31.00
## Median 4943.94 44.00
## Mean 4958.38 44.05
## Q3 7499.49 57.00
## Max 9999.75 70.00
library(ggplot2)
factor_vars <- c("is_fraudulent", "purchase_category")
plot_data <- data %>%
select(all_of(factor_vars)) %>%
pivot_longer(cols = everything(),
names_to = "Variable",
values_to = "Category") %>%
count(Variable, Category)
ggplot(plot_data, aes(x = Category,
y = n, fill = Category))+
geom_bar(stat = "identity")+
geom_text(aes(label = n), vjust = -0.3, size = 3)+
facet_wrap(~ Variable, scales = "free_x")+
labs(title = paste("Distribution of Factor Variables"),
x = "Category",
y = "Count")+
theme_minimal()+
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
legend.position = "none")ggplot(data, aes(x = card_type, fill = card_type))+
geom_bar()+
geom_text(
stat = "count", aes(label = after_stat(count)),
vjust = -0.3)+
labs(title = paste("Distribution of card_type"),
x = "card_type",
y = "Count")+
theme_minimal()+
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
legend.position = "none")##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 13.292, df = 5, p-value = 0.02079
##
## Pearson's Chi-squared test
##
## data: table(data$is_fraudulent, data$card_type)
## X-squared = 3.7812, df = 3, p-value = 0.2861
##
## 0 1
## 4932 5068
The class is balanced in predicted variable
==================================================================================================================== MODEL 1 : LOGISTIC REGRESSION
====================================================================================================================
log_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
log_workflow <- workflow() %>%
add_recipe(fraud_recipe) %>%
add_model(log_model)
log_fit <- fit(log_workflow, data = train_data)
log_resamples <- fit_resamples(
log_workflow,
resamples = folds,
metrics = metric_set(roc_auc, accuracy)
)
collect_metrics(log_resamples)==================================================================================================================== MODEL 2 : DECISION TREE
====================================================================================================================
tree_model <- decision_tree(
cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()
) %>%
set_engine("rpart") %>%
set_mode("classification")
tree_worflow <- workflow() %>%
add_recipe(fraud_recipe) %>%
add_model(tree_model)
tree_grid <- grid_regular(
cost_complexity(),
tree_depth(),
min_n(),
levels = 3
)
tree_tune <- tune_grid(
tree_worflow,
resamples = folds,
grid = tree_grid,
metrics = metric_set(roc_auc, accuracy)
)
collect_metrics(tree_tune)best_tree <- select_best(
tree_tune,
metric = "roc_auc")
final_tree_workflow <- finalize_workflow(
tree_worflow,
best_tree
)
tree_fit <- fit(final_tree_workflow, data = train_data)==============================================================================================================================
MODEL 3 : RANDOM FOREST
=============================================================================================================================
rf_model <- rand_forest(
trees = 300,
mtry = tune(),
min_n = tune()
) %>%
set_engine("ranger",
importance = "impurity") %>%
set_mode("classification")
rf_workflow <- workflow() %>%
add_recipe(fraud_recipe) %>%
add_model(rf_model)
rf_grid <- grid_regular(
mtry(range = c(1, 2)),
min_n(),
levels = 3
)
rf_tune <- tune_grid(
rf_workflow,
resamples = folds,
grid = rf_grid,
metrics = metric_set(roc_auc, accuracy)
)
collect_metrics(rf_tune) best_rf <- select_best(rf_tune, metric = "roc_auc")
final_rf_workflow <- finalize_workflow(
rf_workflow,
best_rf
)
rf_fit <- fit(final_rf_workflow,
data = train_data)==============================================================================================================================
MODEL 4 : XGBOOST
=============================================================================================================================
xgb_model <- boost_tree(
trees = 300,
tree_depth = tune(),
learn_rate = tune(),
min_n = tune()
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_workflow <- workflow() %>%
add_recipe(fraud_recipe) %>%
add_model(xgb_model)
xgb_grid <- grid_regular(
tree_depth(),
learn_rate(),
min_n(),
levels = 3
)
xgb_tune <- tune_grid(
xgb_workflow,
resamples = folds,
grid = xgb_grid,
metric = metric_set(roc_auc, accuracy)
)
collect_metrics(xgb_tune)best_xgb <- select_best(
xgb_tune,
metric = "roc_auc"
)
final_xgb_workflow <- finalize_workflow(
xgb_workflow,
best_xgb
)
xgb_fit <- fit(final_xgb_workflow,
data = train_data)========================================================================================================================
COMPARE MODEL PERFORMANCE
========================================================================================================================
log_metrics <- collect_metrics(log_resamples) %>%
filter(.metric == "roc_auc") %>%
mutate(model = "Logistic Regression")
tree_metrics <- collect_metrics(tree_tune) %>%
filter(.metric == "roc_auc") %>%
slice_max(mean, n = 1) %>%
mutate(model = "Decision Tree")
rf_metrics <- collect_metrics(rf_tune) %>%
filter(.metric == "roc_auc") %>%
slice_max(mean, n = 1) %>%
mutate(model = "Random Forest")
xgb_metrics <- collect_metrics(xgb_tune) %>%
filter(.metric == "roc_auc") %>%
slice_max(mean, n = 1) %>%
mutate(model = "XGBoost")
all_results <- bind_rows(
log_metrics,
tree_metrics,
rf_metrics,
xgb_metrics
)
all_results %>%
select(model, mean, std_err)=========================================================================================================================
CHOOSE THE BEST MODEL
=========================================================================================================================
========================================================================================================================
MAKE PREDICTIONS
========================================================================================================================
logistic_predictions <- predict(
log_fit, new_data = test_data, type = "prob"
)
logistic_predictionsfinal_predictions <- bind_cols(
test_data,
logistic_predictions,
log_class
)
head(final_predictions)==========================================================================================================================
SAVE BEST MODEL
==========================================================================================================================
==========================================================================================================================
LOAD SAVED MODEL LATER
==========================================================================================================================
==========================================================================================================================
USE SAVED MODEL FOR NEW PREDICTIONS
=========================================================================================================================
new_customer <- data.frame(
purchase_category = factor(
c("Online Shopping", "Restaurant"),
levels =
levels(sample_data$purchase_category)
)
)
predict(loaded_model,
new_data = new_customer,
type = "prob")