library(tidyverse)
library(dplyr)
library(ggplot2)
library(reshape2)
library(scales)
library(ROSE)
library(corrplot)
library(ggpubr)
library(randomForest)
library(caret)
library(pROC)
application_record <- read.csv("bnpl_dataset.csv")
credit_record <- read.csv("bnpl_dataset_v2.csv")
print(head(application_record))
## Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359 56 Male 32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2 46 Male 72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8 32 Male 82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf 60 Male 92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec 25 Male 32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17 38 Female 94833
## Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Repayment_Status
## 1 353 Beauty Sezzle 249 Defaulted
## 2 354 Groceries Affirm 188 Paid On Time
## 3 630 Travel Sezzle 1610 Paid On Time
## 4 470 Fashion Sezzle 120 Paid On Time
## 5 502 Travel Klarna 1849 Paid On Time
## 6 779 Travel Klarna 1112 Paid On Time
print(head(credit_record))
## Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359 56 Male 32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2 46 Male 72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8 32 Male 82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf 60 Male 92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec 25 Male 32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17 38 Female 94833
## Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Device_Type
## 1 353 Beauty Sezzle 249 Tablet
## 2 354 Groceries Affirm 188 Mobile
## 3 630 Travel Sezzle 1610 Desktop
## 4 470 Fashion Sezzle 120 Mobile
## 5 502 Travel Klarna 1849 Mobile
## 6 779 Travel Klarna 1112 Mobile
## Connection_Type Checkout_Time_Seconds Browser Repayment_Status
## 1 WiFi 82 Firefox Paid On Time
## 2 WiFi 60 Firefox Paid On Time
## 3 WiFi 86 Chrome Paid On Time
## 4 4G/5G 169 Chrome Paid On Time
## 5 4G/5G 38 Chrome Defaulted
## 6 4G/5G 82 Safari Paid On Time
cat("--- Application Record Structure ---\n")
## --- Application Record Structure ---
colnames(application_record)
## [1] "Transaction_ID" "Customer_Age" "Gender"
## [4] "Annual_Income" "Credit_Score" "Purchase_Category"
## [7] "BNPL_Provider" "Purchase_Amount" "Repayment_Status"
cat("---credit_record---")
## ---credit_record---
colnames(credit_record)
## [1] "Transaction_ID" "Customer_Age" "Gender"
## [4] "Annual_Income" "Credit_Score" "Purchase_Category"
## [7] "BNPL_Provider" "Purchase_Amount" "Device_Type"
## [10] "Connection_Type" "Checkout_Time_Seconds" "Browser"
## [13] "Repayment_Status"
dim(application_record)
## [1] 50000 9
dim(credit_record)
## [1] 50000 13
str(application_record)
## 'data.frame': 50000 obs. of 9 variables:
## $ Transaction_ID : chr "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
## $ Customer_Age : int 56 46 32 60 25 38 56 36 40 28 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Annual_Income : int 32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
## $ Credit_Score : int 353 354 630 470 502 779 403 731 669 359 ...
## $ Purchase_Category: chr "Beauty" "Groceries" "Travel" "Fashion" ...
## $ BNPL_Provider : chr "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
## $ Purchase_Amount : int 249 188 1610 120 1849 1112 418 2117 286 244 ...
## $ Repayment_Status : chr "Defaulted" "Paid On Time" "Paid On Time" "Paid On Time" ...
str(credit_record)
## 'data.frame': 50000 obs. of 13 variables:
## $ Transaction_ID : chr "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
## $ Customer_Age : int 56 46 32 60 25 38 56 36 40 28 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Annual_Income : int 32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
## $ Credit_Score : int 353 354 630 470 502 779 403 731 669 359 ...
## $ Purchase_Category : chr "Beauty" "Groceries" "Travel" "Fashion" ...
## $ BNPL_Provider : chr "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
## $ Purchase_Amount : int 249 188 1610 120 1849 1112 418 2117 286 244 ...
## $ Device_Type : chr "Tablet" "Mobile" "Desktop" "Mobile" ...
## $ Connection_Type : chr "WiFi" "WiFi" "WiFi" "4G/5G" ...
## $ Checkout_Time_Seconds: int 82 60 86 169 38 82 169 14 164 5 ...
## $ Browser : chr "Firefox" "Firefox" "Chrome" "Chrome" ...
## $ Repayment_Status : chr "Paid On Time" "Paid On Time" "Paid On Time" "Paid On Time" ...
50,000 records with
9 variables, while the credit dataset contains
50,000 records with 13 variables.colSums(is.na(application_record))
## Transaction_ID Customer_Age Gender Annual_Income
## 0 0 0 0
## Credit_Score Purchase_Category BNPL_Provider Purchase_Amount
## 0 0 0 0
## Repayment_Status
## 0
colSums(is.na(credit_record))
## Transaction_ID Customer_Age Gender
## 0 0 0
## Annual_Income Credit_Score Purchase_Category
## 0 0 0
## BNPL_Provider Purchase_Amount Device_Type
## 0 0 0
## Connection_Type Checkout_Time_Seconds Browser
## 0 0 0
## Repayment_Status
## 0
0 missing
values across all columns, indicating complete and clean data.unique_app_ids <- length(unique(application_record$Transaction_ID))
cat("Unique application ids:",unique_app_ids,"\n")
## Unique application ids: 50000
unique_cred_ids <- length(unique(credit_record$Transaction_ID))
cat("Unique credit ids:",unique_cred_ids)
## Unique credit ids: 50000
# Find the overlap
overlap_ids <- intersect(unique(application_record$Transaction_ID),
unique(credit_record$Transaction_ID))
head(overlap_ids)
## [1] "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359"
## [2] "863e8aa6-847e-4ae0-b96b-65241f3450a2"
## [3] "a24efee2-16f2-42dc-a0e7-6df4960df0b8"
## [4] "bbad847a-a92f-4766-ba3f-98b9b199b4cf"
## [5] "3f1b1928-09ca-4d06-8ec3-4efd3468d0ec"
## [6] "f99db9dc-9e5c-40d1-918a-ecd3469e8c17"
num_overlap <- length(overlap_ids)
cat("No. Of Overlap ids:",num_overlap)
## No. Of Overlap ids: 50000
status_distribution <- table(credit_record$Repayment_Status)
cat("Frequency Table for the Repayment_Status column :\n",status_distribution,"\n")
## Frequency Table for the Repayment_Status column :
## 4237 7364 38399
print(status_distribution)
##
## Defaulted Late Payment Paid On Time
## 4237 7364 38399
most_frequent_status <- names(status_distribution[which.max(status_distribution)])
cat("Most Frequent Status: ",most_frequent_status)
## Most Frequent Status: Paid On Time
Repayment_Status column has three categories:
Defaulted, Late Payment, and Paid On Time.app_duplicates <- sum(duplicated(application_record$Transaction_ID))
cat("Duplicate IDs in Application Record:", app_duplicates, "\n")
## Duplicate IDs in Application Record: 0
cred_duplicates <- sum(duplicated(credit_record$Transaction_ID))
cat("Duplicate IDs in Credit Record:", cred_duplicates, "\n")
## Duplicate IDs in Credit Record: 0
application_record <- application_record[!duplicated(application_record$Transaction_ID), ]
credit_record <- credit_record[!duplicated(credit_record$Transaction_ID), ]
cat("Cleaned Application Rows:", nrow(application_record), "\n")
## Cleaned Application Rows: 50000
cat("Cleaned Credit Rows:", nrow(credit_record))
## Cleaned Credit Rows: 50000
top_10_wealth <- application_record %>%
arrange(desc(Annual_Income)) %>%
head(10)
print(top_10_wealth)
## Transaction_ID Customer_Age Gender Annual_Income
## 1 cda8bc71-033d-492f-9ab3-1d7b135e5778 42 Male 119998
## 2 6556a8a6-7bd4-42a7-8d3d-6454aff7622f 36 Male 119995
## 3 264f3c9e-773a-4ad0-9c1c-fc26f24f6dbf 38 Female 119994
## 4 b1c4bdc9-1a54-4bea-96f3-65686a502a85 39 Male 119992
## 5 8f5a9adc-b378-4966-89f5-223277a8aeaa 58 Female 119990
## 6 7b3838fd-bcc9-400e-a064-a16139c4c9a5 32 Female 119989
## 7 e0440639-8486-449d-88d2-8873832e8d84 46 Male 119989
## 8 8604c4b5-d984-4919-b741-6962cfbdb3b9 31 Female 119987
## 9 69ef7ce5-1317-4a17-8d07-045024078440 56 Female 119983
## 10 aa230197-83fe-414b-bc6d-3c0cee853853 48 Male 119982
## Credit_Score Purchase_Category BNPL_Provider Purchase_Amount
## 1 395 Beauty Afterpay 203
## 2 666 Fashion Sezzle 171
## 3 423 Travel Klarna 1243
## 4 463 Travel Affirm 799
## 5 744 Electronics Afterpay 940
## 6 847 Groceries Afterpay 44
## 7 462 Home & Furniture Affirm 203
## 8 552 Home & Furniture Affirm 193
## 9 415 Travel Klarna 631
## 10 400 Travel Sezzle 2957
## Repayment_Status
## 1 Paid On Time
## 2 Paid On Time
## 3 Paid On Time
## 4 Late Payment
## 5 Paid On Time
## 6 Paid On Time
## 7 Late Payment
## 8 Paid On Time
## 9 Paid On Time
## 10 Late Payment
high_value_creditors <- application_record %>%
filter(BNPL_Provider %in% c("Klarna", "Afterpay") & Credit_Score > 750)
cat("High Value applicants who used 'Klarna' or 'Afterpay'\n")
## High Value applicants who used 'Klarna' or 'Afterpay'
nrow(high_value_creditors)
## [1] 4521
high_risk_defaults <- application_record %>%
filter(Repayment_Status == "Defaulted")
cat("No. Of rows high rish users for 'Defaulted': \n" )
## No. Of rows high rish users for 'Defaulted':
nrow(high_risk_defaults)
## [1] 4379
rapid_transactions <- credit_record %>%
filter(Checkout_Time_Seconds < 30)
cat("Count of high speed transaction 'rapid checkouts':\n")
## Count of high speed transaction 'rapid checkouts':
nrow(rapid_transactions)
## [1] 7261
spending_profile <- application_record %>%
group_by(Purchase_Category, Gender) %>%
summarize(
Mean_Spending = mean(Purchase_Amount, na.rm = TRUE),
Median_Spending = median(Purchase_Amount, na.rm = TRUE),
SD_Spending = sd(Purchase_Amount, na.rm = TRUE),
Total_Count = n(),
.groups = "drop"
) %>%
arrange(desc(Mean_Spending))
cat("Summary Table for 'Purchase category and Gender':\n")
## Summary Table for 'Purchase category and Gender':
print(spending_profile)
## # A tibble: 18 × 6
## Purchase_Category Gender Mean_Spending Median_Spending SD_Spending
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Travel Non-Binary 1661. 1654 773.
## 2 Travel Male 1638. 1634. 774.
## 3 Travel Female 1621. 1616. 776.
## 4 Electronics Male 1109. 1107 524.
## 5 Electronics Female 1099. 1086 517.
## 6 Electronics Non-Binary 1074. 1036 511.
## 7 Fashion Non-Binary 162. 158 79.6
## 8 Home & Furniture Non-Binary 161. 158 78.7
## 9 Fashion Male 161. 163 81.6
## 10 Beauty Non-Binary 161. 160 83.5
## 11 Groceries Male 160. 160 80.7
## 12 Home & Furniture Male 160. 160 80.9
## 13 Beauty Male 160. 160 81.7
## 14 Home & Furniture Female 159. 160 80.9
## 15 Groceries Female 159. 159 80.3
## 16 Fashion Female 159. 156 80.3
## 17 Beauty Female 158. 156 81.2
## 18 Groceries Non-Binary 157. 160 81.7
## # ℹ 1 more variable: Total_Count <int>
application_record <- application_record %>%
mutate(Income_Bracket = ifelse(Annual_Income < 30000, "Entry-Level",
ifelse(Annual_Income <= 70000, "Mid-Tier", "Wealth-Tier")))
cat("No. of users are in each category:")
## No. of users are in each category:
table(application_record$Income_Bracket)
##
## Entry-Level Mid-Tier Wealth-Tier
## 5053 19907 25040
head(application_record[, c("Transaction_ID", "Annual_Income", "Income_Bracket")])
## Transaction_ID Annual_Income Income_Bracket
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359 32293 Mid-Tier
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2 72774 Wealth-Tier
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8 82207 Wealth-Tier
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf 92498 Wealth-Tier
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec 32060 Mid-Tier
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17 94833 Wealth-Tier
income_credit_summary <- application_record %>%
group_by(Income_Bracket) %>%
summarise(
Avg_Credit_Score = mean(Credit_Score, na.rm = TRUE),
Total_Applicants = n()
)
cat('Summary Table for Credit Scores by Income Segment:')
## Summary Table for Credit Scores by Income Segment:
print(income_credit_summary)
## # A tibble: 3 × 3
## Income_Bracket Avg_Credit_Score Total_Applicants
## <chr> <dbl> <int>
## 1 Entry-Level 574. 5053
## 2 Mid-Tier 573. 19907
## 3 Wealth-Tier 574. 25040
provider_summary <- application_record %>%
group_by(BNPL_Provider, Gender) %>%
summarize(Avg_Purchase = mean(Purchase_Amount, na.rm = TRUE), .groups = "drop")
provider_matrix <- provider_summary %>%
pivot_wider(names_from = Gender, values_from = Avg_Purchase)
cat('View the comparison matrix')
## View the comparison matrix
print(provider_matrix)
## # A tibble: 4 × 4
## BNPL_Provider Female Male `Non-Binary`
## <chr> <dbl> <dbl> <dbl>
## 1 Affirm 551. 566. 532.
## 2 Afterpay 565. 569. 524.
## 3 Klarna 568. 559. 651.
## 4 Sezzle 567. 565. 513.
master_dataset <- application_record %>%
left_join(credit_record, by = "Transaction_ID")
dim(master_dataset)
## [1] 50000 22
colnames(master_dataset)
## [1] "Transaction_ID" "Customer_Age.x" "Gender.x"
## [4] "Annual_Income.x" "Credit_Score.x" "Purchase_Category.x"
## [7] "BNPL_Provider.x" "Purchase_Amount.x" "Repayment_Status.x"
## [10] "Income_Bracket" "Customer_Age.y" "Gender.y"
## [13] "Annual_Income.y" "Credit_Score.y" "Purchase_Category.y"
## [16] "BNPL_Provider.y" "Purchase_Amount.y" "Device_Type"
## [19] "Connection_Type" "Checkout_Time_Seconds" "Browser"
## [22] "Repayment_Status.y"
sum(is.na(master_dataset$Purchase_Amount))
## [1] 0
Transaction_ID..x and .y show overlapping
variables from both datasets.master_dataset <- master_dataset %>%
mutate(Is_Bad = ifelse(Repayment_Status.x %in% c("Defaulted", "Late"), 1, 0))
ggplot(master_dataset, aes(x = factor(Is_Bad), fill = factor(Is_Bad))) +
geom_bar() +
scale_fill_manual(values = c("skyblue", "pink")) +
labs(title = "Count of Good (0) vs Bad (1) Customers",
x = "Risk Status (Is_Bad)",
y = "Frequency") +
theme_minimal()
ggplot(master_dataset, aes(x = Credit_Score.x)) +
geom_histogram(binwidth = 20, fill = "orange", color = "red") +
labs(title = "Distribution of Credit Scores",
x = "Credit Score",
y = "Frequency") +
theme_minimal()
Objective: To check the correlation between Income and Spending, similar to your mtcars weight vs. mpg example.
ggplot(master_dataset, aes(x = Annual_Income.x, y = Purchase_Amount.x)) +
geom_point(alpha = 0.2, color = "blue", size = 1) +
geom_smooth(method = "lm", formula = y ~ x, color = "black", linewidth = 1.5, se = TRUE) +
labs(title = "Wealth vs. Consumption Scatter Analysis",
subtitle = "Testing the relationship between Income and Spending",
x = "Annual Income (USD)",
y = "Purchase Amount (USD)") +
theme_minimal()
category_summary <- master_dataset %>%
group_by(Income_Bracket, Purchase_Category.x) %>%
summarise(Total_Spend = sum(Purchase_Amount.x, na.rm = TRUE), .groups = "drop")
ggplot(category_summary, aes(x = Purchase_Category.x, y = Total_Spend, fill = Purchase_Category.x)) +
geom_bar(stat = "identity") +
facet_wrap(~Income_Bracket) +
labs(title = "Total Spending by Category and Income Segment",
x = "Category",
y = "Total Sales Amount") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(master_dataset, aes(x = BNPL_Provider.x, fill = Gender.x)) +
geom_bar(position = "dodge") +
labs(title = "Provider Transaction Volume by Gender",
x = "BNPL Provider",
y = "Count of Transactions",
fill = "Gender") +
theme_minimal()
ggplot(master_dataset, aes(x = Repayment_Status.x, y = Annual_Income.x, fill = Repayment_Status.x)) +
geom_boxplot(notch = TRUE, alpha = 0.8) +
theme_minimal() +
labs(title = "Income Distribution by Repayment Status",
subtitle = "Diagnostic: Analyzing if income distinguishes risk tiers",
x = "Repayment Status",
y = "Annual Income (USD)") +
theme(legend.position = "none")
Objective: To detect “multicollinearity”—a scenario where two independent variables are highly correlated (e.g., > 0.85). If features are too similar, the model cannot distinguish their individual impact, which can destabilize predictive performance.
numerical_data <- master_dataset[, sapply(master_dataset, is.numeric)]
cor_matrix <- cor(numerical_data, use = "complete.obs")
melted_cormat <- melt(cor_matrix)
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value, 2)), color = "black", size = 3.5) +
scale_fill_gradient2(low = "#512c3f", mid = "white", high = "#d87b3e",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
labs(title = "Multivariate Feature Interaction Heatmap",
subtitle = "Checking for multicollinearity among numerical features",
x = "",
y = "")
Objective: To identify and eliminate redundant features (duplicate columns) generated during the dataset merge process. By removing these artifacts, we prevent the model from receiving duplicate signals, ensuring feature parsimony and cleaner input for the predictive algorithm.
master_dataset_clean <- master_dataset[, !grepl("\\.y$", names(master_dataset))]
head(master_dataset_clean)
## Transaction_ID Customer_Age.x Gender.x Annual_Income.x
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359 56 Male 32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2 46 Male 72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8 32 Male 82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf 60 Male 92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec 25 Male 32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17 38 Female 94833
## Credit_Score.x Purchase_Category.x BNPL_Provider.x Purchase_Amount.x
## 1 353 Beauty Sezzle 249
## 2 354 Groceries Affirm 188
## 3 630 Travel Sezzle 1610
## 4 470 Fashion Sezzle 120
## 5 502 Travel Klarna 1849
## 6 779 Travel Klarna 1112
## Repayment_Status.x Income_Bracket Device_Type Connection_Type
## 1 Defaulted Mid-Tier Tablet WiFi
## 2 Paid On Time Wealth-Tier Mobile WiFi
## 3 Paid On Time Wealth-Tier Desktop WiFi
## 4 Paid On Time Wealth-Tier Mobile 4G/5G
## 5 Paid On Time Mid-Tier Mobile 4G/5G
## 6 Paid On Time Wealth-Tier Mobile 4G/5G
## Checkout_Time_Seconds Browser Is_Bad
## 1 82 Firefox 1
## 2 60 Firefox 0
## 3 86 Chrome 0
## 4 169 Chrome 0
## 5 38 Chrome 0
## 6 82 Safari 0
Objective: To group applicants into segments using K-Means clustering and visualize where the model makes the most errors. This identifies which customer types are hardest to predict.
set.seed(123)
clusters <- kmeans(master_dataset_clean[, c("Annual_Income.x", "Credit_Score.x")], centers = 3)
master_dataset_clean$Cluster <- as.factor(clusters$cluster)
ggplot(master_dataset_clean, aes(x = Annual_Income.x, y = Credit_Score.x, color = Is_Bad)) +
geom_point(alpha = 0.6) +
facet_wrap(~Cluster) +
theme_minimal() +
labs(title = "Diagnostic: Model Errors by Customer Segment",
subtitle = "Comparing performance across K-Means Clusters",
x = "Annual Income", y = "Credit Score")
Objective: To visually compare the distribution of key numerical features (like Annual_Income or Credit_Score) for “Good” vs. “Bad” customers. This helps us see if these variables actually “separate” the risk classes.
ggplot(master_dataset_clean, aes(x = Annual_Income.x, fill = as.factor(Is_Bad))) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = c("0" = "forestgreen", "1" = "firebrick"),
labels = c("Good (0)", "Bad (1)")) +
scale_y_continuous(labels = label_number(accuracy = 0.00001)) +
theme_minimal() +
theme(legend.position = "top") +
labs(title = "Feature-Risk Distribution: Annual Income",
subtitle = "Comparing income density between Good and Bad customers",
x = "Annual Income",
y = "Density",
fill = "Repayment Status")
cols_to_drop <- c("Transaction_ID")
master_dataset_clean <- master_dataset_clean[, !(names(master_dataset_clean) %in% cols_to_drop)]
char_cols <- sapply(master_dataset_clean, is.character)
master_dataset_clean[char_cols] <- lapply(master_dataset_clean[char_cols], as.factor)
master_dataset_clean$Is_Bad <- as.factor(master_dataset_clean$Is_Bad)
str(master_dataset_clean)
## 'data.frame': 50000 obs. of 15 variables:
## $ Customer_Age.x : int 56 46 32 60 25 38 56 36 40 28 ...
## $ Gender.x : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 2 1 1 2 1 1 ...
## $ Annual_Income.x : int 32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
## $ Credit_Score.x : int 353 354 630 470 502 779 403 731 669 359 ...
## $ Purchase_Category.x : Factor w/ 6 levels "Beauty","Electronics",..: 1 4 6 3 6 6 6 6 5 1 ...
## $ BNPL_Provider.x : Factor w/ 4 levels "Affirm","Afterpay",..: 4 1 4 4 3 3 3 3 4 3 ...
## $ Purchase_Amount.x : int 249 188 1610 120 1849 1112 418 2117 286 244 ...
## $ Repayment_Status.x : Factor w/ 3 levels "Defaulted","Late Payment",..: 1 3 3 3 3 3 3 3 3 1 ...
## $ Income_Bracket : Factor w/ 3 levels "Entry-Level",..: 2 3 3 3 2 3 3 3 3 3 ...
## $ Device_Type : Factor w/ 3 levels "Desktop","Mobile",..: 3 2 1 2 2 2 1 1 1 3 ...
## $ Connection_Type : Factor w/ 3 levels "4G/5G","VPN",..: 3 3 3 1 1 1 3 3 1 1 ...
## $ Checkout_Time_Seconds: int 82 60 86 169 38 82 169 14 164 5 ...
## $ Browser : Factor w/ 4 levels "Chrome","Edge",..: 3 3 1 1 1 4 4 3 1 4 ...
## $ Is_Bad : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
## $ Cluster : Factor w/ 3 levels "1","2","3": 1 2 2 3 1 3 3 2 2 3 ...
summary(master_dataset_clean)
## Customer_Age.x Gender.x Annual_Income.x Credit_Score.x
## Min. :18.00 Female :24139 Min. : 20000 Min. :300.0
## 1st Qu.:29.00 Male :23952 1st Qu.: 44916 1st Qu.:435.0
## Median :41.00 Non-Binary: 1909 Median : 70072 Median :572.0
## Mean :40.98 Mean : 69981 Mean :573.6
## 3rd Qu.:53.00 3rd Qu.: 95262 3rd Qu.:711.0
## Max. :64.00 Max. :119998 Max. :849.0
## Purchase_Category.x BNPL_Provider.x Purchase_Amount.x
## Beauty :8381 Affirm :12418 Min. : 20.0
## Electronics :8303 Afterpay:12536 1st Qu.: 124.0
## Fashion :8307 Klarna :12545 Median : 229.0
## Groceries :8240 Sezzle :12501 Mean : 563.2
## Home & Furniture:8368 3rd Qu.: 785.0
## Travel :8401 Max. :2999.0
## Repayment_Status.x Income_Bracket Device_Type Connection_Type
## Defaulted : 4379 Entry-Level: 5053 Desktop:14975 4G/5G:19976
## Late Payment: 8009 Mid-Tier :19907 Mobile :30023 VPN : 4960
## Paid On Time:37612 Wealth-Tier:25040 Tablet : 5002 WiFi :25064
##
##
##
## Checkout_Time_Seconds Browser Is_Bad Cluster
## Min. : 5.0 Chrome :12479 0:45621 1:16792
## 1st Qu.: 48.0 Edge :12652 1: 4379 2:16538
## Median : 92.0 Firefox:12481 3:16670
## Mean : 92.1 Safari :12388
## 3rd Qu.:136.0
## Max. :179.0
Objective: To evaluate the linear dependence between Customer_Age and Annual_Income. This diagnostic tests whether these demographic features provide redundant information (multicollinearity) or if they contribute unique, non-linear signals to the predictive model.
ggplot(master_dataset_clean, aes(x = Customer_Age.x, y = Annual_Income.x)) +
geom_point(alpha = 0.5, color = "steelblue") + # Points
geom_smooth(method = "lm", color = "darkred", se = TRUE) +
theme_minimal() +
labs(title = "Income vs. Age: Correlation Analysis",
subtitle = "Visualizing the linear relationship between Age and Annual Income",
x = "Customer Age",
y = "Annual Income")
## `geom_smooth()` using formula = 'y ~ x'
Objective: To perform a diagnostic visualization of the dataset’s class separability. By plotting the feature distribution against the risk target, we assess whether a linear model (e.g., Logistic Regression) can successfully partition the data, or if the “Cloud” distribution necessitates non-linear ensemble methods.
set.seed(123)
eda_sample <- master_dataset_clean[sample(1:nrow(master_dataset_clean), 2000), ]
ggplot(eda_sample, aes(x = Annual_Income.x, y = Customer_Age.x, color = as.factor(Is_Bad))) +
geom_point(alpha = 0.5) +
theme_minimal() +
labs(title = "Data Topology Analysis: Income vs Age",
subtitle = "Visualizing the 'Cloud' distribution before modeling",
x = "Annual Income",
y = "Customer Age",
color = "Risk (0=Good, 1=Bad)") +
geom_smooth(method = "lm", color = "black", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
Objective: To validate the overall health of the dataset by checking for class balance, structure, and identifying potential data leakage. This stage ensures that the data is statistically sound and free from features that would artificially inflate model performance.
write.csv(master_dataset_clean, "cleaned_master_dataset.csv", row.names = FALSE)
cat("Data successfully saved as 'cleaned_master_dataset.csv'")
## Data successfully saved as 'cleaned_master_dataset.csv'
my_data <- read.csv("cleaned_master_dataset.csv")
balanced_data <- ovun.sample(Is_Bad ~ ., data = my_data,
method = "both",
p = 0.5,
N = 35000)$data
print(table(balanced_data$Is_Bad))
##
## 0 1
## 17759 17241
The class distribution shows that the dataset is almost perfectly balanced after preprocessing and sampling.
This balanced distribution helps the Random Forest model learn both classes effectively and prevents bias toward the majority class. As a result, the model can identify risky borrowers more accurately and improve overall prediction performance.
Objective: To implement a robust data partitioning strategy. By splitting the dataset into “Training” and “Testing” , we ensure that the model is evaluated on “unseen” data.
set.seed(123)
split_index <- sample(1:nrow(balanced_data), 0.7 * nrow(balanced_data))
train_data <- balanced_data[split_index, ]
test_data <- balanced_data[-split_index, ]
cat("Training set dimensions:", dim(train_data), "\n")
## Training set dimensions: 24500 15
cat("Testing set dimensions:", dim(test_data), "\n")
## Testing set dimensions: 10500 15
The dataset was divided into separate training and testing subsets to evaluate the model on unseen data.
This data partitioning strategy helps ensure that the model generalizes well and reduces the risk of overfitting.
train_data <- train_data[, !(names(train_data) %in% c("Repayment_Status.x", "Cluster"))]
test_data <- test_data[, !(names(test_data) %in% c("Repayment_Status.x", "Cluster"))]
cols_to_factor <- c("Gender.x", "Purchase_Category.x", "BNPL_Provider.x",
"Income_Bracket", "Device_Type", "Connection_Type", "Browser")
train_data[cols_to_factor] <- lapply(train_data[cols_to_factor], as.factor)
test_data[cols_to_factor] <- lapply(test_data[cols_to_factor], as.factor)
train_data$Is_Bad <- as.factor(train_data$Is_Bad)
test_data$Is_Bad <- as.factor(test_data$Is_Bad)
# 4. Final check
str(train_data)
## 'data.frame': 24500 obs. of 13 variables:
## $ Customer_Age.x : int 62 42 45 56 58 59 59 60 47 24 ...
## $ Gender.x : Factor w/ 3 levels "Female","Male",..: 2 1 1 2 1 2 3 2 2 2 ...
## $ Annual_Income.x : int 64845 32071 47133 119774 92598 102377 108637 59371 84838 39981 ...
## $ Credit_Score.x : int 406 316 504 810 507 316 544 431 567 463 ...
## $ Purchase_Category.x : Factor w/ 6 levels "Beauty","Electronics",..: 1 3 3 3 4 2 4 4 3 3 ...
## $ BNPL_Provider.x : Factor w/ 4 levels "Affirm","Afterpay",..: 3 3 2 2 1 4 1 4 4 4 ...
## $ Purchase_Amount.x : int 99 30 286 108 136 244 115 145 255 198 ...
## $ Income_Bracket : Factor w/ 3 levels "Entry-Level",..: 2 2 2 3 3 3 3 2 3 2 ...
## $ Device_Type : Factor w/ 3 levels "Desktop","Mobile",..: 2 1 3 2 2 3 2 2 3 3 ...
## $ Connection_Type : Factor w/ 3 levels "4G/5G","VPN",..: 3 3 1 3 1 3 1 1 3 3 ...
## $ Checkout_Time_Seconds: int 82 91 29 24 40 100 84 101 146 66 ...
## $ Browser : Factor w/ 4 levels "Chrome","Edge",..: 2 2 3 3 1 2 3 2 2 2 ...
## $ Is_Bad : Factor w/ 2 levels "0","1": 1 2 2 1 1 2 1 1 2 2 ...
Objective: To train the Random Forest model on the train_data subset. This algorithm utilizes an ensemble approach to handle the non-linear relationships between customer behaviors and credit risk, ensuring high predictive accuracy without overfitting.
set.seed(123)
model_final <- randomForest(
Is_Bad ~ .,
data = train_data,
ntree = 100,
importance = TRUE
)
print(model_final)
##
## Call:
## randomForest(formula = Is_Bad ~ ., data = train_data, ntree = 100, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 1.76%
## Confusion matrix:
## 0 1 class.error
## 0 11928 399 0.032367973
## 1 32 12141 0.002628769
A Random Forest classification model was developed using the training dataset to predict whether a BNPL customer belongs to the low-risk or high-risk category.
ntree = 100).mtry = 3)
were randomly selected to improve model diversity and reduce
overfitting.The results indicate that the Random Forest model performs extremely well in identifying both low-risk and high-risk borrowers, with particularly strong performance in detecting risky customers.
Objective: To evaluate the predictive performance of the trained Random Forest model on the “hold-out” Testing dataset (10,500 records). This validates the model’s ability to generalize to new, unseen customer profiles and ensures that the model is not overfitting to the training set.
predictions <- predict(model_final, test_data)
conf_matrix <- confusionMatrix(predictions, test_data$Is_Bad)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5267 3
## 1 165 5065
##
## Accuracy : 0.984
## 95% CI : (0.9814, 0.9863)
## No Information Rate : 0.5173
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.968
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9696
## Specificity : 0.9994
## Pos Pred Value : 0.9994
## Neg Pred Value : 0.9685
## Prevalence : 0.5173
## Detection Rate : 0.5016
## Detection Prevalence : 0.5019
## Balanced Accuracy : 0.9845
##
## 'Positive' Class : 0
##
The confusion matrix results indicate that the Random Forest model achieved excellent classification performance on the testing dataset.
Overall, the model demonstrates strong generalization capability and is highly effective for BNPL credit risk prediction.
Objective: To determine which customer features (e.g., Credit_Score, Annual_Income, Purchase_Amount) have the highest predictive power in determining credit risk. This provides transparency into the model’s decision-making process.
varImpPlot(model_final,
n.var = 10,
main = "Top 10 Determinants of Credit Risk",
col = "darkblue",
pch = 19)
Checkout_Time_SecondsBrowserBNPL_Provider.xPurchase_Category.xConnection_TypeThe primary objective of this project was to develop an intelligent and reliable machine learning system for assessing customer credit risk in the Buy Now Pay Later (BNPL) sector. To achieve this objective, a complete machine learning pipeline was implemented, including data preprocessing, train-test partitioning, class balancing using ROSE sampling, feature selection, Random Forest model training, and performance evaluation.
The final Random Forest classification model demonstrated excellent predictive capability in distinguishing between low-risk and high-risk borrowers. The model achieved:
These results indicate that the developed system is highly effective in identifying risky borrowers while maintaining strong generalization performance on unseen test data.
The confusion matrix analysis further confirmed that the model correctly classified the majority of customers with very low misclassification rates. The high sensitivity and specificity values demonstrate that the system performs efficiently for both default detection and safe customer approval.
Feature importance analysis revealed that: - Device_Type
- Credit_Score.x - Annual_Income.x
were among the most influential variables affecting customer credit behavior. This finding highlights the growing importance of digital behavioral indicators alongside traditional financial metrics in modern Fintech risk assessment systems.
To improve accessibility and real-world usability, a professional Shiny web application was also developed. The application provides: - Real-time credit risk prediction - Interactive borrower assessment - AI-based approval recommendations - User-friendly dashboard interface
Overall, the project successfully demonstrates how machine learning and behavioral analytics can be integrated to build a scalable, accurate, and business-oriented BNPL credit risk assessment framework for modern financial platforms.
Although the current system achieved strong predictive performance, several improvements can be explored in future work to further enhance scalability, automation, and analytical capability.
The current model operates within a Shiny-based interface using static prediction workflows. In future, the model can be deployed using FastAPI or Flask APIs for real-time credit scoring during live BNPL transactions.
Advanced deep learning models such as Artificial Neural Networks (ANNs), Deep Neural Networks (DNNs), or LSTM architectures can be explored to capture more complex customer behavior and transactional patterns.
The project can be extended into a complete MLOps pipeline by integrating: - Automated retraining - Model monitoring - Drift detection - Continuous deployment
This would ensure that the model remains accurate as customer behavior changes over time.
Future systems may include additional behavioral and transactional variables such as: - Shopping frequency - Session duration - Geolocation consistency - Mobile activity patterns - Real-time transaction streams
These features may further improve predictive performance and fraud detection capability.
The application can be deployed on cloud platforms such as: - AWS - Azure - Google Cloud
to support large-scale enterprise-level Fintech applications.
This project successfully developed a high-performance BNPL Credit Risk Assessment System using Random Forest machine learning. By combining traditional financial indicators with digital behavioral analytics, the model achieved highly accurate and reliable risk prediction performance.
The integration of machine learning with a professional web-based prediction interface demonstrates the practical applicability of AI in modern Fintech ecosystems for automated lending, fraud prevention, and intelligent credit decision-making.