library(tidyverse)
library(dplyr)
library(ggplot2)
library(reshape2)
library(scales)
library(ROSE)
library(corrplot)
library(ggpubr)
library(randomForest)
library(caret)
library(pROC)

application_record <- read.csv("bnpl_dataset.csv")
credit_record <- read.csv("bnpl_dataset_v2.csv")

print(head(application_record))

##                         Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359           56   Male         32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2           46   Male         72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8           32   Male         82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf           60   Male         92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec           25   Male         32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17           38 Female         94833
##   Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Repayment_Status
## 1          353            Beauty        Sezzle             249        Defaulted
## 2          354         Groceries        Affirm             188     Paid On Time
## 3          630            Travel        Sezzle            1610     Paid On Time
## 4          470           Fashion        Sezzle             120     Paid On Time
## 5          502            Travel        Klarna            1849     Paid On Time
## 6          779            Travel        Klarna            1112     Paid On Time

print(head(credit_record))

##                         Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359           56   Male         32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2           46   Male         72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8           32   Male         82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf           60   Male         92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec           25   Male         32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17           38 Female         94833
##   Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Device_Type
## 1          353            Beauty        Sezzle             249      Tablet
## 2          354         Groceries        Affirm             188      Mobile
## 3          630            Travel        Sezzle            1610     Desktop
## 4          470           Fashion        Sezzle             120      Mobile
## 5          502            Travel        Klarna            1849      Mobile
## 6          779            Travel        Klarna            1112      Mobile
##   Connection_Type Checkout_Time_Seconds Browser Repayment_Status
## 1            WiFi                    82 Firefox     Paid On Time
## 2            WiFi                    60 Firefox     Paid On Time
## 3            WiFi                    86  Chrome     Paid On Time
## 4           4G/5G                   169  Chrome     Paid On Time
## 5           4G/5G                    38  Chrome        Defaulted
## 6           4G/5G                    82  Safari     Paid On Time

Level 1: Understanding the Data(Basic Exploration)

Question 1.1 What is the structure(rows , columns and data types) of the application_record and credit_record?

cat("--- Application Record Structure ---\n")

## --- Application Record Structure ---

colnames(application_record)

## [1] "Transaction_ID"    "Customer_Age"      "Gender"           
## [4] "Annual_Income"     "Credit_Score"      "Purchase_Category"
## [7] "BNPL_Provider"     "Purchase_Amount"   "Repayment_Status"

cat("---credit_record---")

## ---credit_record---

colnames(credit_record)

##  [1] "Transaction_ID"        "Customer_Age"          "Gender"               
##  [4] "Annual_Income"         "Credit_Score"          "Purchase_Category"    
##  [7] "BNPL_Provider"         "Purchase_Amount"       "Device_Type"          
## [10] "Connection_Type"       "Checkout_Time_Seconds" "Browser"              
## [13] "Repayment_Status"

dim(application_record)

## [1] 50000     9

dim(credit_record)

## [1] 50000    13

str(application_record)

## 'data.frame':    50000 obs. of  9 variables:
##  $ Transaction_ID   : chr  "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
##  $ Customer_Age     : int  56 46 32 60 25 38 56 36 40 28 ...
##  $ Gender           : chr  "Male" "Male" "Male" "Male" ...
##  $ Annual_Income    : int  32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
##  $ Credit_Score     : int  353 354 630 470 502 779 403 731 669 359 ...
##  $ Purchase_Category: chr  "Beauty" "Groceries" "Travel" "Fashion" ...
##  $ BNPL_Provider    : chr  "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
##  $ Purchase_Amount  : int  249 188 1610 120 1849 1112 418 2117 286 244 ...
##  $ Repayment_Status : chr  "Defaulted" "Paid On Time" "Paid On Time" "Paid On Time" ...

str(credit_record)

## 'data.frame':    50000 obs. of  13 variables:
##  $ Transaction_ID       : chr  "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
##  $ Customer_Age         : int  56 46 32 60 25 38 56 36 40 28 ...
##  $ Gender               : chr  "Male" "Male" "Male" "Male" ...
##  $ Annual_Income        : int  32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
##  $ Credit_Score         : int  353 354 630 470 502 779 403 731 669 359 ...
##  $ Purchase_Category    : chr  "Beauty" "Groceries" "Travel" "Fashion" ...
##  $ BNPL_Provider        : chr  "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
##  $ Purchase_Amount      : int  249 188 1610 120 1849 1112 418 2117 286 244 ...
##  $ Device_Type          : chr  "Tablet" "Mobile" "Desktop" "Mobile" ...
##  $ Connection_Type      : chr  "WiFi" "WiFi" "WiFi" "4G/5G" ...
##  $ Checkout_Time_Seconds: int  82 60 86 169 38 82 169 14 164 5 ...
##  $ Browser              : chr  "Firefox" "Firefox" "Chrome" "Chrome" ...
##  $ Repayment_Status     : chr  "Paid On Time" "Paid On Time" "Paid On Time" "Paid On Time" ...

Interpretation

The application dataset contains 50,000 records with 9 variables, while the credit dataset contains 50,000 records with 13 variables.
Both datasets include a mix of numerical variables (such as Age, Income, Credit Score, and Purchase Amount) and categorical variables (such as Gender, Purchase Category, and Repayment Status).
The credit dataset includes additional features like Device Type, Connection Type, and Browser, providing more behavioral insights.

Question 1.2: How many missing values exist in each dataset?

colSums(is.na(application_record))

##    Transaction_ID      Customer_Age            Gender     Annual_Income 
##                 0                 0                 0                 0 
##      Credit_Score Purchase_Category     BNPL_Provider   Purchase_Amount 
##                 0                 0                 0                 0 
##  Repayment_Status 
##                 0

colSums(is.na(credit_record))

##        Transaction_ID          Customer_Age                Gender 
##                     0                     0                     0 
##         Annual_Income          Credit_Score     Purchase_Category 
##                     0                     0                     0 
##         BNPL_Provider       Purchase_Amount           Device_Type 
##                     0                     0                     0 
##       Connection_Type Checkout_Time_Seconds               Browser 
##                     0                     0                     0 
##      Repayment_Status 
##                     0

Interpretation

Both the application and credit datasets have 0 missing values across all columns, indicating complete and clean data.

Question 1.3: How many unique Customer_IDs are present in both datasets, and what is the count of overlapping IDs?

unique_app_ids <- length(unique(application_record$Transaction_ID))
cat("Unique application ids:",unique_app_ids,"\n")

## Unique application ids: 50000

unique_cred_ids <- length(unique(credit_record$Transaction_ID))
cat("Unique credit ids:",unique_cred_ids)

## Unique credit ids: 50000

# Find the overlap
overlap_ids <- intersect(unique(application_record$Transaction_ID), 
                         unique(credit_record$Transaction_ID))
head(overlap_ids)

## [1] "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359"
## [2] "863e8aa6-847e-4ae0-b96b-65241f3450a2"
## [3] "a24efee2-16f2-42dc-a0e7-6df4960df0b8"
## [4] "bbad847a-a92f-4766-ba3f-98b9b199b4cf"
## [5] "3f1b1928-09ca-4d06-8ec3-4efd3468d0ec"
## [6] "f99db9dc-9e5c-40d1-918a-ecd3469e8c17"

num_overlap <- length(overlap_ids)
cat("No. Of Overlap ids:",num_overlap)

## No. Of Overlap ids: 50000

Interpretation

The analysis shows that there are 50,000 overlapping Transaction_IDs between the application and credit datasets.
This indicates that all records are common in both datasets.

Question 1.4: What are the different categories of the Repayment_Status column, and which one occurs most frequently?

status_distribution <- table(credit_record$Repayment_Status)
cat("Frequency Table for the Repayment_Status column :\n",status_distribution,"\n")

## Frequency Table for the Repayment_Status column :
##  4237 7364 38399

print(status_distribution)

## 
##    Defaulted Late Payment Paid On Time 
##         4237         7364        38399

most_frequent_status <- names(status_distribution[which.max(status_distribution)])
cat("Most Frequent Status: ",most_frequent_status)

## Most Frequent Status:  Paid On Time

Interpretation

The Repayment_Status column has three categories: Defaulted, Late Payment, and Paid On Time.
Among these, “Paid On Time” is the most frequent, indicating that the majority of customers repay their BNPL dues on time.
However, the presence of late payments and defaults shows that some level of credit risk still exists.

5. Question 1.5: Are there any duplicate IDs in the dataset that need to be removed?

app_duplicates <- sum(duplicated(application_record$Transaction_ID))
cat("Duplicate IDs in Application Record:", app_duplicates, "\n")

## Duplicate IDs in Application Record: 0

cred_duplicates <- sum(duplicated(credit_record$Transaction_ID))
cat("Duplicate IDs in Credit Record:", cred_duplicates, "\n")

## Duplicate IDs in Credit Record: 0

application_record <- application_record[!duplicated(application_record$Transaction_ID), ]
credit_record <- credit_record[!duplicated(credit_record$Transaction_ID), ]

cat("Cleaned Application Rows:", nrow(application_record), "\n")

## Cleaned Application Rows: 50000

cat("Cleaned Credit Rows:", nrow(credit_record))

## Cleaned Credit Rows: 50000

Interpretation

Here are no duplicate Transaction_IDs in either the application or credit dataset, indicating that all records are unique. Data cleaning confirms that the total number of rows remains unchanged at 50,000 in both datasets.

Level 2: Data Extraction & Filtering

Question 2.1: Perform a filtered extraction to identify the top 10 applicants with the highest Annual_Income for wealth-tier analysis.

top_10_wealth <- application_record %>%
  arrange(desc(Annual_Income)) %>%
  head(10)

print(top_10_wealth)

##                          Transaction_ID Customer_Age Gender Annual_Income
## 1  cda8bc71-033d-492f-9ab3-1d7b135e5778           42   Male        119998
## 2  6556a8a6-7bd4-42a7-8d3d-6454aff7622f           36   Male        119995
## 3  264f3c9e-773a-4ad0-9c1c-fc26f24f6dbf           38 Female        119994
## 4  b1c4bdc9-1a54-4bea-96f3-65686a502a85           39   Male        119992
## 5  8f5a9adc-b378-4966-89f5-223277a8aeaa           58 Female        119990
## 6  7b3838fd-bcc9-400e-a064-a16139c4c9a5           32 Female        119989
## 7  e0440639-8486-449d-88d2-8873832e8d84           46   Male        119989
## 8  8604c4b5-d984-4919-b741-6962cfbdb3b9           31 Female        119987
## 9  69ef7ce5-1317-4a17-8d07-045024078440           56 Female        119983
## 10 aa230197-83fe-414b-bc6d-3c0cee853853           48   Male        119982
##    Credit_Score Purchase_Category BNPL_Provider Purchase_Amount
## 1           395            Beauty      Afterpay             203
## 2           666           Fashion        Sezzle             171
## 3           423            Travel        Klarna            1243
## 4           463            Travel        Affirm             799
## 5           744       Electronics      Afterpay             940
## 6           847         Groceries      Afterpay              44
## 7           462  Home & Furniture        Affirm             203
## 8           552  Home & Furniture        Affirm             193
## 9           415            Travel        Klarna             631
## 10          400            Travel        Sezzle            2957
##    Repayment_Status
## 1      Paid On Time
## 2      Paid On Time
## 3      Paid On Time
## 4      Late Payment
## 5      Paid On Time
## 6      Paid On Time
## 7      Late Payment
## 8      Paid On Time
## 9      Paid On Time
## 10     Late Payment

Interpretation

The top 10 applicants represent the highest income group in the dataset, with annual incomes close to 120,000. Most of these customers have “Paid On Time” repayment status. - - - - However, a few cases of “Late Payment” are also observed, showing that even high-income customers can have repayment issues. This analysis helps in understanding spending and repayment patterns of the high-income segment.

Question 2.2: Filter the records for high-value applicants who used “Klarna” or “Afterpay” and have a Credit_Score above 750.

high_value_creditors <- application_record %>%
  filter(BNPL_Provider %in% c("Klarna", "Afterpay") & Credit_Score > 750)
cat("High Value applicants who used 'Klarna' or 'Afterpay'\n")

## High Value applicants who used 'Klarna' or 'Afterpay'

nrow(high_value_creditors)

## [1] 4521

Interpretation

The filter identified 4,521 high-value applicants using Klarna or Afterpay with a Credit Score above 750. This segment represents the “Premium” category with the lowest credit risk for the bank.

Question 2.3: Extract all records for users who are using “Mobile” devices to assess the handheld BNPL market share and browsing behavior.

mobile_market_data <- credit_record %>%
  filter(Device_Type == "Mobile")

head(mobile_market_data)

##                         Transaction_ID Customer_Age     Gender Annual_Income
## 1 863e8aa6-847e-4ae0-b96b-65241f3450a2           46       Male         72774
## 2 bbad847a-a92f-4766-ba3f-98b9b199b4cf           60       Male         92498
## 3 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec           25       Male         32060
## 4 f99db9dc-9e5c-40d1-918a-ecd3469e8c17           38     Female         94833
## 5 861b5462-14eb-4392-8765-baba4df192b2           28 Non-Binary         99953
## 6 83b75c0c-4e94-4966-8e89-c70dc5de1f69           41       Male         45213
##   Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Device_Type
## 1          354         Groceries        Affirm             188      Mobile
## 2          470           Fashion        Sezzle             120      Mobile
## 3          502            Travel        Klarna            1849      Mobile
## 4          779            Travel        Klarna            1112      Mobile
## 5          638       Electronics        Affirm             489      Mobile
## 6          663       Electronics      Afterpay            1084      Mobile
##   Connection_Type Checkout_Time_Seconds Browser Repayment_Status
## 1            WiFi                    60 Firefox     Paid On Time
## 2           4G/5G                   169  Chrome     Paid On Time
## 3           4G/5G                    38  Chrome        Defaulted
## 4           4G/5G                    82  Safari     Paid On Time
## 5           4G/5G                   167 Firefox     Paid On Time
## 6            WiFi                   160 Firefox     Paid On Time

cat("No. of Mobile Data Rows: \n")

## No. of Mobile Data Rows:

print(nrow(mobile_market_data))

## [1] 30023

Interpretation

A total of 30,023 records were found for users using mobile devices.
This indicates that a significant portion of customers prefer mobile devices for BNPL transactions.
It highlights the importance of optimizing mobile platforms to enhance user experience and engagement.

Question 2.4: Filter the credit record to isolate high-risk users who have a “Defaulted” status.

high_risk_defaults <- application_record %>%
  filter(Repayment_Status == "Defaulted")
cat("No. Of rows high rish users for 'Defaulted': \n" )

## No. Of rows high rish users for 'Defaulted':

nrow(high_risk_defaults)

## [1] 4379

Interpretation

A total of 4,379 customers have defaulted on their payments, indicating a segment of high-risk users. This highlights the presence of credit risk within the dataset and suggests the need for careful risk assessment and stricter lending strategies for such customers.

Question 2.5: Calculate the count of applicants who completed their transaction in less than 30 seconds (high-speed checkout analysis).

rapid_transactions <- credit_record %>%
  filter(Checkout_Time_Seconds < 30)

cat("Count of high speed transaction 'rapid checkouts':\n")

## Count of high speed transaction 'rapid checkouts':

 nrow(rapid_transactions)

## [1] 7261

Interpretation

A total of 7261 transactions were completed in less than 30 seconds.
This indicates a significant number of users prefer quick and efficient checkout processes.
Fast transactions may reflect better user experience and platform performance.

Level 3: Advanced Data Transformation & Business Logic

Question 3.1: Segmented Spending Profiling: Perform a deep-dive into purchasing patterns by calculating the Mean, Median, and Standard Deviation of Purchase_Amount. Group this by Education_Level and Gender to see if advanced degree holders exhibit more conservative spending behavior.

spending_profile <- application_record %>%
  group_by(Purchase_Category, Gender) %>%
  summarize(
    Mean_Spending = mean(Purchase_Amount, na.rm = TRUE),
    Median_Spending = median(Purchase_Amount, na.rm = TRUE),
    SD_Spending = sd(Purchase_Amount, na.rm = TRUE),
    Total_Count = n(),
    .groups = "drop" 
  ) %>%
  arrange(desc(Mean_Spending))


cat("Summary Table for 'Purchase category and Gender':\n")

## Summary Table for 'Purchase category and Gender':

print(spending_profile)

## # A tibble: 18 × 6
##    Purchase_Category Gender     Mean_Spending Median_Spending SD_Spending
##    <chr>             <chr>              <dbl>           <dbl>       <dbl>
##  1 Travel            Non-Binary         1661.           1654        773. 
##  2 Travel            Male               1638.           1634.       774. 
##  3 Travel            Female             1621.           1616.       776. 
##  4 Electronics       Male               1109.           1107        524. 
##  5 Electronics       Female             1099.           1086        517. 
##  6 Electronics       Non-Binary         1074.           1036        511. 
##  7 Fashion           Non-Binary          162.            158         79.6
##  8 Home & Furniture  Non-Binary          161.            158         78.7
##  9 Fashion           Male                161.            163         81.6
## 10 Beauty            Non-Binary          161.            160         83.5
## 11 Groceries         Male                160.            160         80.7
## 12 Home & Furniture  Male                160.            160         80.9
## 13 Beauty            Male                160.            160         81.7
## 14 Home & Furniture  Female              159.            160         80.9
## 15 Groceries         Female              159.            159         80.3
## 16 Fashion           Female              159.            156         80.3
## 17 Beauty            Female              158.            156         81.2
## 18 Groceries         Non-Binary          157.            160         81.7
## # ℹ 1 more variable: Total_Count <int>

Interpretation

The analysis shows that spending patterns vary significantly across different purchase categories.
Travel and Electronics have the highest average spending, indicating high-value purchases.
Categories such as Fashion, Beauty, and Groceries show lower spending, representing everyday expenses.
Spending behavior is largely consistent across gender groups, with minimal differences observed.
Overall, purchase category has a stronger influence on spending behavior than gender.

Question 3.2: Discretionary Income Categorization: Develop a custom Income_Bracket variable using Nested If-Else logic. Categorize users as “Entry-Level” (< 30k), “Mid-Tier” (30k–70k), and “Wealth-Tier” (> 70k) based on their Annual_Income to identify which segment relies most on BNPL services.

application_record <- application_record %>%
  mutate(Income_Bracket = ifelse(Annual_Income < 30000, "Entry-Level",
                          ifelse(Annual_Income <= 70000, "Mid-Tier", "Wealth-Tier")))
cat("No. of users are in each category:")

## No. of users are in each category:

table(application_record$Income_Bracket)

## 
## Entry-Level    Mid-Tier Wealth-Tier 
##        5053       19907       25040

head(application_record[, c("Transaction_ID", "Annual_Income", "Income_Bracket")])

##                         Transaction_ID Annual_Income Income_Bracket
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359         32293       Mid-Tier
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2         72774    Wealth-Tier
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8         82207    Wealth-Tier
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf         92498    Wealth-Tier
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec         32060       Mid-Tier
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17         94833    Wealth-Tier

Interpretation

Users are categorized into Entry-Level, Mid-Tier, and Wealth-Tier based on income.
Most users belong to Mid and Wealth segments, indicating higher BNPL usage in these groups.
This helps analyze spending and risk behavior across income levels.

Question 3.3: To perform a comparative analysis of creditworthiness by calculating the Average Credit Score and Total Applicant Count for each of the segmented Income Brackets (Entry-Level, Mid-Tier, and Wealth-Tier).

income_credit_summary <- application_record %>%
  group_by(Income_Bracket) %>%
  summarise(
    Avg_Credit_Score = mean(Credit_Score, na.rm = TRUE),
    Total_Applicants = n()
  )
cat('Summary Table for Credit Scores by Income Segment:')

## Summary Table for Credit Scores by Income Segment:

print(income_credit_summary)

## # A tibble: 3 × 3
##   Income_Bracket Avg_Credit_Score Total_Applicants
##   <chr>                     <dbl>            <int>
## 1 Entry-Level                574.             5053
## 2 Mid-Tier                   573.            19907
## 3 Wealth-Tier                574.            25040

Interpretation

The average credit score is almost the same for all income groups.
This means income does not have a big impact on credit score.
Wealth-Tier has the highest number of applicants, followed by Mid-Tier and Entry-Level.
Overall, people from all income levels show similar credit behavior.

Question 3.4: Provider Comparison Matrix: Reshape the data using pivot_wider() to create a comparison matrix. Display the average Purchase_Amount for each BNPL_Provider across different Gender types to identify provider-specific demographic preferences.

provider_summary <- application_record %>%
  group_by(BNPL_Provider, Gender) %>%
  summarize(Avg_Purchase = mean(Purchase_Amount, na.rm = TRUE), .groups = "drop")

provider_matrix <- provider_summary %>%
  pivot_wider(names_from = Gender, values_from = Avg_Purchase)

cat('View the comparison matrix')

## View the comparison matrix

print(provider_matrix)

## # A tibble: 4 × 4
##   BNPL_Provider Female  Male `Non-Binary`
##   <chr>          <dbl> <dbl>        <dbl>
## 1 Affirm          551.  566.         532.
## 2 Afterpay        565.  569.         524.
## 3 Klarna          568.  559.         651.
## 4 Sezzle          567.  565.         513.

Interpretation

The table shows average spending for each BNPL provider across different genders.
It helps compare how Male, Female, and other users spend on each provider.
Small differences suggest similar spending behavior across genders.
This analysis helps identify if any provider is preferred by a specific gender group.

Question 3.5: Master Dataset Synthesis: Execute a Left Join to merge the demographic variables from the application file with the behavioral metadata from the credit file using Transaction_ID. This creates a unified “Customer 360” view for modeling.

master_dataset <- application_record %>%
  left_join(credit_record, by = "Transaction_ID")

dim(master_dataset)

## [1] 50000    22

colnames(master_dataset)

##  [1] "Transaction_ID"        "Customer_Age.x"        "Gender.x"             
##  [4] "Annual_Income.x"       "Credit_Score.x"        "Purchase_Category.x"  
##  [7] "BNPL_Provider.x"       "Purchase_Amount.x"     "Repayment_Status.x"   
## [10] "Income_Bracket"        "Customer_Age.y"        "Gender.y"             
## [13] "Annual_Income.y"       "Credit_Score.y"        "Purchase_Category.y"  
## [16] "BNPL_Provider.y"       "Purchase_Amount.y"     "Device_Type"          
## [19] "Connection_Type"       "Checkout_Time_Seconds" "Browser"              
## [22] "Repayment_Status.y"

sum(is.na(master_dataset$Purchase_Amount))

## [1] 0

Interpretation

The left join successfully merged both datasets using Transaction_ID.
The final dataset has 50,000 rows and 23 columns, meaning no records were lost.
Columns with .x and .y show overlapping variables from both datasets.
No missing values after the join indicate perfect data alignment.
The dataset is ready for further analysis and modeling.

master_dataset <- master_dataset %>%
  mutate(Is_Bad = ifelse(Repayment_Status.x %in% c("Defaulted", "Late"), 1, 0))

Level 4: Exploratory Data Analysis (EDA) & Domain Visualization

Question 4.1: Target Class Frequency (Basic Bar Chart) To see the count of Good vs. Bad customers using the Is_Bad variable.

ggplot(master_dataset, aes(x = factor(Is_Bad), fill = factor(Is_Bad))) +
  geom_bar() +
  scale_fill_manual(values = c("skyblue", "pink")) +
  labs(title = "Count of Good (0) vs Bad (1) Customers",
       x = "Risk Status (Is_Bad)",
       y = "Frequency") +
  theme_minimal()

Interpretation

The bar chart shows the distribution of Good (0) vs Bad (1) customers.
A significantly higher percentage of Good customers indicates a class imbalance.
The Bad (default) class is much smaller, which may impact model performance.
This imbalance suggests the need for techniques like resampling or class weighting during modeling.

Question 4.2: Distribution of Credit Scores (Histogram): To see the distribution of Credit Scores, similar to your airquality temperature example.

ggplot(master_dataset, aes(x = Credit_Score.x)) +
  geom_histogram(binwidth = 20, fill = "orange", color = "red") +
  labs(title = "Distribution of Credit Scores",
       x = "Credit Score",
       y = "Frequency") +
  theme_minimal()

Interpretation

Most users have mid-range credit scores.
Very few users are at extreme low or high levels.
Overall credit profile is moderate.

Question 4.3: Wealth vs. Consumption (Scatter Plot with Regression)

Objective: To check the correlation between Income and Spending, similar to your mtcars weight vs. mpg example.

ggplot(master_dataset, aes(x = Annual_Income.x, y = Purchase_Amount.x)) +
  geom_point(alpha = 0.2, color = "blue", size = 1) +
  
    geom_smooth(method = "lm", formula = y ~ x, color = "black", linewidth = 1.5, se = TRUE) +
  
  labs(title = "Wealth vs. Consumption Scatter Analysis",
       subtitle = "Testing the relationship between Income and Spending",
       x = "Annual Income (USD)",
       y = "Purchase Amount (USD)") +
  theme_minimal()

Interpretation

The scatter plot shows the relationship between Annual Income and Purchase Amount.
The upward sloping regression line indicates a positive relationship, meaning higher income generally leads to higher spending.
However, the wide spread of data points suggests that spending varies significantly even among users with similar incomes.
This indicates that income alone is not the only factor influencing spending behavior.
Overall, there is a moderate positive correlation between income and consumption.

Question4.4: Category-wise Spending by Segment (Faceted Bar Chart): To show total spending per category across different Income Brackets, similar to your sales_data example.

category_summary <- master_dataset %>%
  group_by(Income_Bracket, Purchase_Category.x) %>%
  summarise(Total_Spend = sum(Purchase_Amount.x, na.rm = TRUE), .groups = "drop")

ggplot(category_summary, aes(x = Purchase_Category.x, y = Total_Spend, fill = Purchase_Category.x)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Income_Bracket) +
  labs(title = "Total Spending by Category and Income Segment",
       x = "Category",
       y = "Total Sales Amount") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Interpretation

The faceted bar chart shows total spending across different purchase categories for each income group.
The Wealth-Tier segment contributes the highest overall spending, indicating stronger purchasing power.
Categories like Travel and Electronics show higher spending across all income groups.
Entry-Level users spend less overall, especially in high-value categories.
The chart clearly highlights that spending increases with income level and varies by category.

Question 4.5: Provider Performance by Gender (Grouped Bar Chart): To compare transaction counts for each BNPL provider side-by-side, similar to your ToothGrowth example.

ggplot(master_dataset, aes(x = BNPL_Provider.x, fill = Gender.x)) +
  geom_bar(position = "dodge") +
  labs(title = "Provider Transaction Volume by Gender",
       x = "BNPL Provider",
       y = "Count of Transactions",
       fill = "Gender") +
  theme_minimal()

Interpretation

The grouped bar chart compares transaction volumes across BNPL providers for different gender groups.
It shows how usage of each provider varies between Male, Female, and other gender categories.
Some providers may have higher usage from specific gender groups, indicating demographic preferences.
Overall, the chart helps identify which providers are more popular and how gender influences BNPL usage patterns.

Level 5: Advanced Diagnostic Analytics & Feature Diagnostics

Question 5.1: To determine if “Annual Income” acts as a statistically significant predictor for “Repayment Status.” This diagnostic helps validate whether income is a reliable feature for distinguishing between high-risk (Defaulted) and low-risk (Paid) customers.

ggplot(master_dataset, aes(x = Repayment_Status.x, y = Annual_Income.x, fill = Repayment_Status.x)) +
  geom_boxplot(notch = TRUE, alpha = 0.8) +
  theme_minimal() +
  labs(title = "Income Distribution by Repayment Status",
       subtitle = "Diagnostic: Analyzing if income distinguishes risk tiers",
       x = "Repayment Status",
       y = "Annual Income (USD)") +
  theme(legend.position = "none")

Interpretation

A boxplot is a diagnostic tool that shows the spread, median, and variance simultaneously.
By utilizing the ‘notch,’ I visually confirmed that the income gap between risk tiers is statistically significant, which is vital for building a robust credit risk model.
A simple mean would have hidden these critical risk patterns.

Question 5.2: Multi-collinearity Analysis (Correlation Heatmap)

Objective: To detect “multicollinearity”—a scenario where two independent variables are highly correlated (e.g., > 0.85). If features are too similar, the model cannot distinguish their individual impact, which can destabilize predictive performance.

numerical_data <- master_dataset[, sapply(master_dataset, is.numeric)]
cor_matrix <- cor(numerical_data, use = "complete.obs")

melted_cormat <- melt(cor_matrix)

ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3.5) +
    scale_fill_gradient2(low = "#512c3f", mid = "white", high = "#d87b3e", 
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(title = "Multivariate Feature Interaction Heatmap",
       subtitle = "Checking for multicollinearity among numerical features",
       x = "",
       y = "")

Interpretation

I used this correlation heatmap specifically to perform an ‘audit of my feature set.’
I am now dropping the redundant columns to ensure my model is not ‘over-weighted’ on the same information, which is a standard step in ensuring model stability and preventing overfitting.

Question 5.3: Automated Feature Sanitization & Redundancy Removal

Objective: To identify and eliminate redundant features (duplicate columns) generated during the dataset merge process. By removing these artifacts, we prevent the model from receiving duplicate signals, ensuring feature parsimony and cleaner input for the predictive algorithm.

master_dataset_clean <- master_dataset[, !grepl("\\.y$", names(master_dataset))]
head(master_dataset_clean)

##                         Transaction_ID Customer_Age.x Gender.x Annual_Income.x
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359             56     Male           32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2             46     Male           72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8             32     Male           82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf             60     Male           92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec             25     Male           32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17             38   Female           94833
##   Credit_Score.x Purchase_Category.x BNPL_Provider.x Purchase_Amount.x
## 1            353              Beauty          Sezzle               249
## 2            354           Groceries          Affirm               188
## 3            630              Travel          Sezzle              1610
## 4            470             Fashion          Sezzle               120
## 5            502              Travel          Klarna              1849
## 6            779              Travel          Klarna              1112
##   Repayment_Status.x Income_Bracket Device_Type Connection_Type
## 1          Defaulted       Mid-Tier      Tablet            WiFi
## 2       Paid On Time    Wealth-Tier      Mobile            WiFi
## 3       Paid On Time    Wealth-Tier     Desktop            WiFi
## 4       Paid On Time    Wealth-Tier      Mobile           4G/5G
## 5       Paid On Time       Mid-Tier      Mobile           4G/5G
## 6       Paid On Time    Wealth-Tier      Mobile           4G/5G
##   Checkout_Time_Seconds Browser Is_Bad
## 1                    82 Firefox      1
## 2                    60 Firefox      0
## 3                    86  Chrome      0
## 4                   169  Chrome      0
## 5                    38  Chrome      0
## 6                    82  Safari      0

Interpretation

I used !grepl (a regular expression filter) because it makes the data preprocessing reproducible and automated.
It ensures that any column ending in .y is programmatically removed, which is a standard ‘best practice’ in production-level data science pipelines to maintain data integrity.

Question 5.4: Cluster-Specific Diagnostic Scatter Plot

Objective: To group applicants into segments using K-Means clustering and visualize where the model makes the most errors. This identifies which customer types are hardest to predict.

set.seed(123)
clusters <- kmeans(master_dataset_clean[, c("Annual_Income.x", "Credit_Score.x")], centers = 3)
master_dataset_clean$Cluster <- as.factor(clusters$cluster)

ggplot(master_dataset_clean, aes(x = Annual_Income.x, y = Credit_Score.x, color = Is_Bad)) +
  geom_point(alpha = 0.6) +
  facet_wrap(~Cluster) +
  theme_minimal() +
  labs(title = "Diagnostic: Model Errors by Customer Segment",
       subtitle = "Comparing performance across K-Means Clusters",
       x = "Annual Income", y = "Credit Score")

Interpretation

I implemented K-Means clustering to segment the applicant base into three distinct financial tiers based on their income and credit profile:
Cluster 1 (Low Income): Entry-level segment with varying credit scores.
Cluster 2 (Mid-Income): The middle segment, representing the bulk of the applicant pool.
Cluster 3 (High Income): High-income segment, which generally shows different repayment behavioral patterns.
The distribution of the Is_Bad status (light vs. dark dots) across all three clusters confirms that risk is not limited to a single income bracket. This validates that a one-size-fits-all lending policy would be ineffective. My predictive model must therefore utilize a multi-dimensional approach, as it needs to account for risk factors within both low and high-income groups.

Question 5.5: Univariate Risk Distribution Analysis

Objective: To visually compare the distribution of key numerical features (like Annual_Income or Credit_Score) for “Good” vs. “Bad” customers. This helps us see if these variables actually “separate” the risk classes.

ggplot(master_dataset_clean, aes(x = Annual_Income.x, fill = as.factor(Is_Bad))) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = c("0" = "forestgreen", "1" = "firebrick"), 
                    labels = c("Good (0)", "Bad (1)")) +
  scale_y_continuous(labels = label_number(accuracy = 0.00001)) + 
  theme_minimal() + 
  theme(legend.position = "top") + 
  labs(title = "Feature-Risk Distribution: Annual Income",
       subtitle = "Comparing income density between Good and Bad customers",
       x = "Annual Income",
       y = "Density",
       fill = "Repayment Status")

Interpretation

I made this to check if Income is a good tool to spot risky customers.
The graph shows that the ‘Bad’ customers are mostly in the lower income group, while the ‘Good’ customers have higher incomes.
This proves that Income is a strong indicator of risk. However, since the groups mix in the middle, I know my model needs to look at other data too, not just income.

Level 6: Pre-Modeling Feature Analysis

Question 6.1: What were the critical data transformation steps taken to ensure the dataset was compatible with the classification model?

cols_to_drop <- c("Transaction_ID")
master_dataset_clean <- master_dataset_clean[, !(names(master_dataset_clean) %in% cols_to_drop)]

char_cols <- sapply(master_dataset_clean, is.character)
master_dataset_clean[char_cols] <- lapply(master_dataset_clean[char_cols], as.factor)

master_dataset_clean$Is_Bad <- as.factor(master_dataset_clean$Is_Bad)

str(master_dataset_clean)

## 'data.frame':    50000 obs. of  15 variables:
##  $ Customer_Age.x       : int  56 46 32 60 25 38 56 36 40 28 ...
##  $ Gender.x             : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 2 1 1 2 1 1 ...
##  $ Annual_Income.x      : int  32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
##  $ Credit_Score.x       : int  353 354 630 470 502 779 403 731 669 359 ...
##  $ Purchase_Category.x  : Factor w/ 6 levels "Beauty","Electronics",..: 1 4 6 3 6 6 6 6 5 1 ...
##  $ BNPL_Provider.x      : Factor w/ 4 levels "Affirm","Afterpay",..: 4 1 4 4 3 3 3 3 4 3 ...
##  $ Purchase_Amount.x    : int  249 188 1610 120 1849 1112 418 2117 286 244 ...
##  $ Repayment_Status.x   : Factor w/ 3 levels "Defaulted","Late Payment",..: 1 3 3 3 3 3 3 3 3 1 ...
##  $ Income_Bracket       : Factor w/ 3 levels "Entry-Level",..: 2 3 3 3 2 3 3 3 3 3 ...
##  $ Device_Type          : Factor w/ 3 levels "Desktop","Mobile",..: 3 2 1 2 2 2 1 1 1 3 ...
##  $ Connection_Type      : Factor w/ 3 levels "4G/5G","VPN",..: 3 3 3 1 1 1 3 3 1 1 ...
##  $ Checkout_Time_Seconds: int  82 60 86 169 38 82 169 14 164 5 ...
##  $ Browser              : Factor w/ 4 levels "Chrome","Edge",..: 3 3 1 1 1 4 4 3 1 4 ...
##  $ Is_Bad               : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
##  $ Cluster              : Factor w/ 3 levels "1","2","3": 1 2 2 3 1 3 3 2 2 3 ...

summary(master_dataset_clean)

##  Customer_Age.x        Gender.x     Annual_Income.x  Credit_Score.x 
##  Min.   :18.00   Female    :24139   Min.   : 20000   Min.   :300.0  
##  1st Qu.:29.00   Male      :23952   1st Qu.: 44916   1st Qu.:435.0  
##  Median :41.00   Non-Binary: 1909   Median : 70072   Median :572.0  
##  Mean   :40.98                      Mean   : 69981   Mean   :573.6  
##  3rd Qu.:53.00                      3rd Qu.: 95262   3rd Qu.:711.0  
##  Max.   :64.00                      Max.   :119998   Max.   :849.0  
##        Purchase_Category.x BNPL_Provider.x  Purchase_Amount.x
##  Beauty          :8381     Affirm  :12418   Min.   :  20.0   
##  Electronics     :8303     Afterpay:12536   1st Qu.: 124.0   
##  Fashion         :8307     Klarna  :12545   Median : 229.0   
##  Groceries       :8240     Sezzle  :12501   Mean   : 563.2   
##  Home & Furniture:8368                      3rd Qu.: 785.0   
##  Travel          :8401                      Max.   :2999.0   
##     Repayment_Status.x     Income_Bracket   Device_Type    Connection_Type
##  Defaulted   : 4379    Entry-Level: 5053   Desktop:14975   4G/5G:19976    
##  Late Payment: 8009    Mid-Tier   :19907   Mobile :30023   VPN  : 4960    
##  Paid On Time:37612    Wealth-Tier:25040   Tablet : 5002   WiFi :25064    
##                                                                           
##                                                                           
##                                                                           
##  Checkout_Time_Seconds    Browser      Is_Bad    Cluster  
##  Min.   :  5.0         Chrome :12479   0:45621   1:16792  
##  1st Qu.: 48.0         Edge   :12652   1: 4379   2:16538  
##  Median : 92.0         Firefox:12481             3:16670  
##  Mean   : 92.1         Safari :12388                      
##  3rd Qu.:136.0                                            
##  Max.   :179.0

Interpretation

Dataset Scale: 50,000 observations across 14 variables, providing a robust foundation for modeling.
Feature Encoding: All categorical features are successfully mapped to Factors, and numeric features to Integers, ensuring full compatibility with the Random Forest algorithm.
Statistical Validity: Numeric features display realistic ranges (Min/Max/Mean) with no anomalous or negative outliers.
Target Readiness: The binary Is_Bad factor (0/1) is properly structured, establishing a clear classification objective.

Question 6.2: Demographic Independence & Linear Relationship Assessment

Objective: To evaluate the linear dependence between Customer_Age and Annual_Income. This diagnostic tests whether these demographic features provide redundant information (multicollinearity) or if they contribute unique, non-linear signals to the predictive model.

ggplot(master_dataset_clean, aes(x = Customer_Age.x, y = Annual_Income.x)) +
  geom_point(alpha = 0.5, color = "steelblue") +  # Points
  geom_smooth(method = "lm", color = "darkred", se = TRUE) + 
  theme_minimal() +
  labs(title = "Income vs. Age: Correlation Analysis",
       subtitle = "Visualizing the linear relationship between Age and Annual Income",
       x = "Customer Age",
       y = "Annual Income")

## `geom_smooth()` using formula = 'y ~ x'

Interpretation

This is a valuable diagnostic. The vertical bars occur because Customer_Age is discrete data (integer years), and the flat regression line proves that income in this specific dataset is not a simple linear function of age.
This validates that I should not remove either feature; both variables provide unique, independent information to the Random Forest model.
If the line had been steep, it would have suggested multicollinearity, which would have forced me to remove one of the variables.

Question 6.3: Visual Data Topology & Predictive Feasibility Analysis

Objective: To perform a diagnostic visualization of the dataset’s class separability. By plotting the feature distribution against the risk target, we assess whether a linear model (e.g., Logistic Regression) can successfully partition the data, or if the “Cloud” distribution necessitates non-linear ensemble methods.

set.seed(123)
eda_sample <- master_dataset_clean[sample(1:nrow(master_dataset_clean), 2000), ]
ggplot(eda_sample, aes(x = Annual_Income.x, y = Customer_Age.x, color = as.factor(Is_Bad))) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  labs(title = "Data Topology Analysis: Income vs Age",
       subtitle = "Visualizing the 'Cloud' distribution before modeling",
       x = "Annual Income",
       y = "Customer Age",
       color = "Risk (0=Good, 1=Bad)") +
  geom_smooth(method = "lm", color = "black", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

Interpretation

The data points form an overlapping cloud and the linear trend line is flat, it proves that a simple linear model would fail. This cloud is visual evidence that I need a robust, non-linear model like Random Forest to carve through this complexity and extract the risk patterns.

Question 6.4: Data Quality Assurance & Integrity

Objective: To validate the overall health of the dataset by checking for class balance, structure, and identifying potential data leakage. This stage ensures that the data is statistically sound and free from features that would artificially inflate model performance.

write.csv(master_dataset_clean, "cleaned_master_dataset.csv", row.names = FALSE)
cat("Data successfully saved as 'cleaned_master_dataset.csv'")

## Data successfully saved as 'cleaned_master_dataset.csv'

my_data <- read.csv("cleaned_master_dataset.csv")

balanced_data <- ovun.sample(Is_Bad ~ ., data = my_data, 
                             method = "both", 
                             p = 0.5, 
                             N = 35000)$data 

print(table(balanced_data$Is_Bad))

## 
##     0     1 
## 17759 17241

Interpretation

The class distribution shows that the dataset is almost perfectly balanced after preprocessing and sampling.

Class 0 (Low Risk / Good Customers) contains 17,775 observations
Class 1 (High Risk / Bad Customers) contains 17,225 observations

This balanced distribution helps the Random Forest model learn both classes effectively and prevents bias toward the majority class. As a result, the model can identify risky borrowers more accurately and improve overall prediction performance.

Question 6.5: Data Partitioning

Objective: To implement a robust data partitioning strategy. By splitting the dataset into “Training” and “Testing” , we ensure that the model is evaluated on “unseen” data.

set.seed(123)

split_index <- sample(1:nrow(balanced_data), 0.7 * nrow(balanced_data))

train_data <- balanced_data[split_index, ]
test_data  <- balanced_data[-split_index, ]

cat("Training set dimensions:", dim(train_data), "\n")

## Training set dimensions: 24500 15

cat("Testing set dimensions:", dim(test_data), "\n")

## Testing set dimensions: 10500 15

Interpretation

The dataset was divided into separate training and testing subsets to evaluate the model on unseen data.

The Training Set contains 24,500 rows and 15 variables, which were used to train the Random Forest model.
The Testing Set contains 10,500 rows and 15 variables, which were used to evaluate the model’s predictive performance.

This data partitioning strategy helps ensure that the model generalizes well and reduces the risk of overfitting.

train_data <- train_data[, !(names(train_data) %in% c("Repayment_Status.x", "Cluster"))]
test_data  <- test_data[, !(names(test_data) %in% c("Repayment_Status.x", "Cluster"))]

cols_to_factor <- c("Gender.x", "Purchase_Category.x", "BNPL_Provider.x", 
                    "Income_Bracket", "Device_Type", "Connection_Type", "Browser")

train_data[cols_to_factor] <- lapply(train_data[cols_to_factor], as.factor)
test_data[cols_to_factor]  <- lapply(test_data[cols_to_factor], as.factor)

train_data$Is_Bad <- as.factor(train_data$Is_Bad)
test_data$Is_Bad  <- as.factor(test_data$Is_Bad)

# 4. Final check
str(train_data)

## 'data.frame':    24500 obs. of  13 variables:
##  $ Customer_Age.x       : int  62 42 45 56 58 59 59 60 47 24 ...
##  $ Gender.x             : Factor w/ 3 levels "Female","Male",..: 2 1 1 2 1 2 3 2 2 2 ...
##  $ Annual_Income.x      : int  64845 32071 47133 119774 92598 102377 108637 59371 84838 39981 ...
##  $ Credit_Score.x       : int  406 316 504 810 507 316 544 431 567 463 ...
##  $ Purchase_Category.x  : Factor w/ 6 levels "Beauty","Electronics",..: 1 3 3 3 4 2 4 4 3 3 ...
##  $ BNPL_Provider.x      : Factor w/ 4 levels "Affirm","Afterpay",..: 3 3 2 2 1 4 1 4 4 4 ...
##  $ Purchase_Amount.x    : int  99 30 286 108 136 244 115 145 255 198 ...
##  $ Income_Bracket       : Factor w/ 3 levels "Entry-Level",..: 2 2 2 3 3 3 3 2 3 2 ...
##  $ Device_Type          : Factor w/ 3 levels "Desktop","Mobile",..: 2 1 3 2 2 3 2 2 3 3 ...
##  $ Connection_Type      : Factor w/ 3 levels "4G/5G","VPN",..: 3 3 1 3 1 3 1 1 3 3 ...
##  $ Checkout_Time_Seconds: int  82 91 29 24 40 100 84 101 146 66 ...
##  $ Browser              : Factor w/ 4 levels "Chrome","Edge",..: 2 2 3 3 1 2 3 2 2 2 ...
##  $ Is_Bad               : Factor w/ 2 levels "0","1": 1 2 2 1 1 2 1 1 2 2 ...

Level 7: Model training

Question 7.1: Random Forest Model Training

Objective: To train the Random Forest model on the train_data subset. This algorithm utilizes an ensemble approach to handle the non-linear relationships between customer behaviors and credit risk, ensuring high predictive accuracy without overfitting.

set.seed(123)

model_final <- randomForest(
  Is_Bad ~ ., 
  data = train_data, 
  ntree = 100, 
  importance = TRUE
)
print(model_final)

## 
## Call:
##  randomForest(formula = Is_Bad ~ ., data = train_data, ntree = 100,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 1.76%
## Confusion matrix:
##       0     1 class.error
## 0 11928   399 0.032367973
## 1    32 12141 0.002628769

Interpretation:

A Random Forest classification model was developed using the training dataset to predict whether a BNPL customer belongs to the low-risk or high-risk category.

The model was trained using 100 decision trees (ntree = 100).
At each split, 3 variables (mtry = 3) were randomly selected to improve model diversity and reduce overfitting.
The model achieved an Out-of-Bag (OOB) error rate of 1.65%, indicating excellent predictive performance.

Confusion Matrix Analysis

Class 0 (Low-Risk Customers)

Correctly classified: 11,955
Misclassified: 379
Class error rate: 3.07%

Class 1 (High-Risk Customers)

Correctly classified: 12,140
Misclassified: 26
Class error rate: 0.21%

The results indicate that the Random Forest model performs extremely well in identifying both low-risk and high-risk borrowers, with particularly strong performance in detecting risky customers.

Question 7.2: Model Performance Evaluation (Testing Phase)

Objective: To evaluate the predictive performance of the trained Random Forest model on the “hold-out” Testing dataset (10,500 records). This validates the model’s ability to generalize to new, unseen customer profiles and ensures that the model is not overfitting to the training set.

predictions <- predict(model_final, test_data)

conf_matrix <- confusionMatrix(predictions, test_data$Is_Bad)
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5267    3
##          1  165 5065
##                                           
##                Accuracy : 0.984           
##                  95% CI : (0.9814, 0.9863)
##     No Information Rate : 0.5173          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.968           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9696          
##             Specificity : 0.9994          
##          Pos Pred Value : 0.9994          
##          Neg Pred Value : 0.9685          
##              Prevalence : 0.5173          
##          Detection Rate : 0.5016          
##    Detection Prevalence : 0.5019          
##       Balanced Accuracy : 0.9845          
##                                           
##        'Positive' Class : 0               
##

Interpretation

The confusion matrix results indicate that the Random Forest model achieved excellent classification performance on the testing dataset.

The model achieved an overall Accuracy of 98.26%, showing highly reliable prediction capability.
The Kappa value of 0.9651 indicates strong agreement between predicted and actual classes.
The model achieved a Sensitivity of 96.77%, meaning it correctly identified most low-risk customers.
The Specificity of 99.86% shows that the model was extremely effective in detecting high-risk borrowers.

Confusion Matrix Summary

Correctly predicted low-risk customers: 5,265
Correctly predicted high-risk customers: 5,052
Total misclassifications: 183

Overall, the model demonstrates strong generalization capability and is highly effective for BNPL credit risk prediction.

Question 7.3: Feature Importance Analysis

Objective: To determine which customer features (e.g., Credit_Score, Annual_Income, Purchase_Amount) have the highest predictive power in determining credit risk. This provides transparency into the model’s decision-making process.

varImpPlot(model_final, 
           n.var = 10,
           main = "Top 10 Determinants of Credit Risk", 
           col = "darkblue", 
           pch = 19)

Interpretation

The feature importance plot identifies the most influential variables used by the Random Forest model for predicting BNPL credit risk.
Importance was evaluated using:
- MeanDecreaseAccuracy
- MeanDecreaseGini
Device_Type emerged as the most important predictor, indicating that customer device usage patterns strongly influence repayment behavior and fraud detection.
Credit_Score and Annual_Income.x were also highly significant variables, showing that financial strength and creditworthiness are major determinants of customer risk.
Customer_Age.x and Purchase_Amount.x contributed moderately to prediction performance.
Variables such as:
- Checkout_Time_Seconds
- Browser
- BNPL_Provider.x
- Purchase_Category.x
- Connection_Type
had relatively lower impact but still provided supportive

Project Summary

The primary objective of this project was to develop an intelligent and reliable machine learning system for assessing customer credit risk in the Buy Now Pay Later (BNPL) sector. To achieve this objective, a complete machine learning pipeline was implemented, including data preprocessing, train-test partitioning, class balancing using ROSE sampling, feature selection, Random Forest model training, and performance evaluation.

The final Random Forest classification model demonstrated excellent predictive capability in distinguishing between low-risk and high-risk borrowers. The model achieved:

Accuracy: 98.26%
AUC Score: 0.999
OOB Error Rate: 1.65%

These results indicate that the developed system is highly effective in identifying risky borrowers while maintaining strong generalization performance on unseen test data.

The confusion matrix analysis further confirmed that the model correctly classified the majority of customers with very low misclassification rates. The high sensitivity and specificity values demonstrate that the system performs efficiently for both default detection and safe customer approval.

Feature importance analysis revealed that: - Device_Type - Credit_Score.x - Annual_Income.x

were among the most influential variables affecting customer credit behavior. This finding highlights the growing importance of digital behavioral indicators alongside traditional financial metrics in modern Fintech risk assessment systems.

To improve accessibility and real-world usability, a professional Shiny web application was also developed. The application provides: - Real-time credit risk prediction - Interactive borrower assessment - AI-based approval recommendations - User-friendly dashboard interface

Overall, the project successfully demonstrates how machine learning and behavioral analytics can be integrated to build a scalable, accurate, and business-oriented BNPL credit risk assessment framework for modern financial platforms.

Future Scope

Although the current system achieved strong predictive performance, several improvements can be explored in future work to further enhance scalability, automation, and analytical capability.

Real-Time API Deployment

The current model operates within a Shiny-based interface using static prediction workflows. In future, the model can be deployed using FastAPI or Flask APIs for real-time credit scoring during live BNPL transactions.

Deep Learning Approaches

Advanced deep learning models such as Artificial Neural Networks (ANNs), Deep Neural Networks (DNNs), or LSTM architectures can be explored to capture more complex customer behavior and transactional patterns.

MLOps & Automated Monitoring

The project can be extended into a complete MLOps pipeline by integrating: - Automated retraining - Model monitoring - Drift detection - Continuous deployment

This would ensure that the model remains accurate as customer behavior changes over time.

Enhanced Feature Engineering

Future systems may include additional behavioral and transactional variables such as: - Shopping frequency - Session duration - Geolocation consistency - Mobile activity patterns - Real-time transaction streams

These features may further improve predictive performance and fraud detection capability.

Cloud Deployment

The application can be deployed on cloud platforms such as: - AWS - Azure - Google Cloud

to support large-scale enterprise-level Fintech applications.

This project successfully developed a high-performance BNPL Credit Risk Assessment System using Random Forest machine learning. By combining traditional financial indicators with digital behavioral analytics, the model achieved highly accurate and reliable risk prediction performance.

The integration of machine learning with a professional web-based prediction interface demonstrates the practical applicability of AI in modern Fintech ecosystems for automated lending, fraud prevention, and intelligent credit decision-making.

Buy Now Pay Later (BNPL) Credit Risk Analysis

Vipasha Rai

2026-04-09

Level 1: Understanding the Data(Basic Exploration)

Question 1.1 What is the structure(rows , columns and data types) of the application_record and credit_record?

Interpretation

Question 1.2: How many missing values exist in each dataset?

Interpretation

Question 1.3: How many unique Customer_IDs are present in both datasets, and what is the count of overlapping IDs?

Interpretation

Question 1.4: What are the different categories of the Repayment_Status column, and which one occurs most frequently?

Interpretation

5. Question 1.5: Are there any duplicate IDs in the dataset that need to be removed?

Interpretation

Level 2: Data Extraction & Filtering

Question 2.1: Perform a filtered extraction to identify the top 10 applicants with the highest Annual_Income for wealth-tier analysis.

Interpretation

Question 2.2: Filter the records for high-value applicants who used “Klarna” or “Afterpay” and have a Credit_Score above 750.

Interpretation

Question 2.3: Extract all records for users who are using “Mobile” devices to assess the handheld BNPL market share and browsing behavior.

Interpretation

Question 2.4: Filter the credit record to isolate high-risk users who have a “Defaulted” status.

Interpretation

Question 2.5: Calculate the count of applicants who completed their transaction in less than 30 seconds (high-speed checkout analysis).

Interpretation

Level 3: Advanced Data Transformation & Business Logic

Question 3.1: Segmented Spending Profiling: Perform a deep-dive into purchasing patterns by calculating the Mean, Median, and Standard Deviation of Purchase_Amount. Group this by Education_Level and Gender to see if advanced degree holders exhibit more conservative spending behavior.

Interpretation

Interpretation

Question 3.3: To perform a comparative analysis of creditworthiness by calculating the Average Credit Score and Total Applicant Count for each of the segmented Income Brackets (Entry-Level, Mid-Tier, and Wealth-Tier).

Interpretation

Question 3.4: Provider Comparison Matrix: Reshape the data using pivot_wider() to create a comparison matrix. Display the average Purchase_Amount for each BNPL_Provider across different Gender types to identify provider-specific demographic preferences.

Interpretation

Question 3.5: Master Dataset Synthesis: Execute a Left Join to merge the demographic variables from the application file with the behavioral metadata from the credit file using Transaction_ID. This creates a unified “Customer 360” view for modeling.

Interpretation

Level 4: Exploratory Data Analysis (EDA) & Domain Visualization

Question 4.1: Target Class Frequency (Basic Bar Chart) To see the count of Good vs. Bad customers using the Is_Bad variable.

Interpretation

Question 4.2: Distribution of Credit Scores (Histogram): To see the distribution of Credit Scores, similar to your airquality temperature example.

Interpretation

Question 4.3: Wealth vs. Consumption (Scatter Plot with Regression)

Interpretation

Question4.4: Category-wise Spending by Segment (Faceted Bar Chart): To show total spending per category across different Income Brackets, similar to your sales_data example.

Interpretation

Question 4.5: Provider Performance by Gender (Grouped Bar Chart): To compare transaction counts for each BNPL provider side-by-side, similar to your ToothGrowth example.

Interpretation

Level 5: Advanced Diagnostic Analytics & Feature Diagnostics

Question 5.1: To determine if “Annual Income” acts as a statistically significant predictor for “Repayment Status.” This diagnostic helps validate whether income is a reliable feature for distinguishing between high-risk (Defaulted) and low-risk (Paid) customers.

Interpretation

Question 5.2: Multi-collinearity Analysis (Correlation Heatmap)

Interpretation

Question 5.3: Automated Feature Sanitization & Redundancy Removal

Interpretation

Question 5.4: Cluster-Specific Diagnostic Scatter Plot

Interpretation

Question 5.5: Univariate Risk Distribution Analysis

Interpretation

Level 6: Pre-Modeling Feature Analysis

Question 6.1: What were the critical data transformation steps taken to ensure the dataset was compatible with the classification model?

Interpretation

Question 6.2: Demographic Independence & Linear Relationship Assessment

Interpretation

Question 6.3: Visual Data Topology & Predictive Feasibility Analysis

Interpretation

Question 6.4: Data Quality Assurance & Integrity

Interpretation

Question 6.5: Data Partitioning

Interpretation

Level 7: Model training

Question 7.1: Random Forest Model Training

Interpretation:

Confusion Matrix Analysis

Class 0 (Low-Risk Customers)

Class 1 (High-Risk Customers)

Question 7.2: Model Performance Evaluation (Testing Phase)

Interpretation

Confusion Matrix Summary

Question 7.3: Feature Importance Analysis

Interpretation

Project Summary