library(tidyverse)
library(dplyr)
library(ggplot2)
library(reshape2)
library(scales)
library(ROSE)
library(corrplot)
library(ggpubr)
library(randomForest)
library(caret)
library(pROC)
application_record <- read.csv("bnpl_dataset.csv")
credit_record <- read.csv("bnpl_dataset_v2.csv")
print(head(application_record))
##                         Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359           56   Male         32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2           46   Male         72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8           32   Male         82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf           60   Male         92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec           25   Male         32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17           38 Female         94833
##   Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Repayment_Status
## 1          353            Beauty        Sezzle             249        Defaulted
## 2          354         Groceries        Affirm             188     Paid On Time
## 3          630            Travel        Sezzle            1610     Paid On Time
## 4          470           Fashion        Sezzle             120     Paid On Time
## 5          502            Travel        Klarna            1849     Paid On Time
## 6          779            Travel        Klarna            1112     Paid On Time
print(head(credit_record))
##                         Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359           56   Male         32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2           46   Male         72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8           32   Male         82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf           60   Male         92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec           25   Male         32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17           38 Female         94833
##   Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Device_Type
## 1          353            Beauty        Sezzle             249      Tablet
## 2          354         Groceries        Affirm             188      Mobile
## 3          630            Travel        Sezzle            1610     Desktop
## 4          470           Fashion        Sezzle             120      Mobile
## 5          502            Travel        Klarna            1849      Mobile
## 6          779            Travel        Klarna            1112      Mobile
##   Connection_Type Checkout_Time_Seconds Browser Repayment_Status
## 1            WiFi                    82 Firefox     Paid On Time
## 2            WiFi                    60 Firefox     Paid On Time
## 3            WiFi                    86  Chrome     Paid On Time
## 4           4G/5G                   169  Chrome     Paid On Time
## 5           4G/5G                    38  Chrome        Defaulted
## 6           4G/5G                    82  Safari     Paid On Time

1 Level 1: Understanding the Data(Basic Exploration)

1.0.1 Question 1.1 What is the structure(rows , columns and data types) of the application_record and credit_record?

cat("--- Application Record Structure ---\n")
## --- Application Record Structure ---
colnames(application_record)
## [1] "Transaction_ID"    "Customer_Age"      "Gender"           
## [4] "Annual_Income"     "Credit_Score"      "Purchase_Category"
## [7] "BNPL_Provider"     "Purchase_Amount"   "Repayment_Status"
cat("---credit_record---")
## ---credit_record---
colnames(credit_record)
##  [1] "Transaction_ID"        "Customer_Age"          "Gender"               
##  [4] "Annual_Income"         "Credit_Score"          "Purchase_Category"    
##  [7] "BNPL_Provider"         "Purchase_Amount"       "Device_Type"          
## [10] "Connection_Type"       "Checkout_Time_Seconds" "Browser"              
## [13] "Repayment_Status"
dim(application_record)
## [1] 50000     9
dim(credit_record)
## [1] 50000    13
str(application_record)
## 'data.frame':    50000 obs. of  9 variables:
##  $ Transaction_ID   : chr  "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
##  $ Customer_Age     : int  56 46 32 60 25 38 56 36 40 28 ...
##  $ Gender           : chr  "Male" "Male" "Male" "Male" ...
##  $ Annual_Income    : int  32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
##  $ Credit_Score     : int  353 354 630 470 502 779 403 731 669 359 ...
##  $ Purchase_Category: chr  "Beauty" "Groceries" "Travel" "Fashion" ...
##  $ BNPL_Provider    : chr  "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
##  $ Purchase_Amount  : int  249 188 1610 120 1849 1112 418 2117 286 244 ...
##  $ Repayment_Status : chr  "Defaulted" "Paid On Time" "Paid On Time" "Paid On Time" ...
str(credit_record)
## 'data.frame':    50000 obs. of  13 variables:
##  $ Transaction_ID       : chr  "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
##  $ Customer_Age         : int  56 46 32 60 25 38 56 36 40 28 ...
##  $ Gender               : chr  "Male" "Male" "Male" "Male" ...
##  $ Annual_Income        : int  32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
##  $ Credit_Score         : int  353 354 630 470 502 779 403 731 669 359 ...
##  $ Purchase_Category    : chr  "Beauty" "Groceries" "Travel" "Fashion" ...
##  $ BNPL_Provider        : chr  "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
##  $ Purchase_Amount      : int  249 188 1610 120 1849 1112 418 2117 286 244 ...
##  $ Device_Type          : chr  "Tablet" "Mobile" "Desktop" "Mobile" ...
##  $ Connection_Type      : chr  "WiFi" "WiFi" "WiFi" "4G/5G" ...
##  $ Checkout_Time_Seconds: int  82 60 86 169 38 82 169 14 164 5 ...
##  $ Browser              : chr  "Firefox" "Firefox" "Chrome" "Chrome" ...
##  $ Repayment_Status     : chr  "Paid On Time" "Paid On Time" "Paid On Time" "Paid On Time" ...

1.0.2 Interpretation

  • The application dataset contains 50,000 records with 9 variables, while the credit dataset contains 50,000 records with 13 variables.
  • Both datasets include a mix of numerical variables (such as Age, Income, Credit Score, and Purchase Amount) and categorical variables (such as Gender, Purchase Category, and Repayment Status).
  • The credit dataset includes additional features like Device Type, Connection Type, and Browser, providing more behavioral insights.

1.0.3 Question 1.2: How many missing values exist in each dataset?

colSums(is.na(application_record))
##    Transaction_ID      Customer_Age            Gender     Annual_Income 
##                 0                 0                 0                 0 
##      Credit_Score Purchase_Category     BNPL_Provider   Purchase_Amount 
##                 0                 0                 0                 0 
##  Repayment_Status 
##                 0
colSums(is.na(credit_record))
##        Transaction_ID          Customer_Age                Gender 
##                     0                     0                     0 
##         Annual_Income          Credit_Score     Purchase_Category 
##                     0                     0                     0 
##         BNPL_Provider       Purchase_Amount           Device_Type 
##                     0                     0                     0 
##       Connection_Type Checkout_Time_Seconds               Browser 
##                     0                     0                     0 
##      Repayment_Status 
##                     0

1.0.4 Interpretation

  • Both the application and credit datasets have 0 missing values across all columns, indicating complete and clean data.

1.0.5 Question 1.3: How many unique Customer_IDs are present in both datasets, and what is the count of overlapping IDs?

unique_app_ids <- length(unique(application_record$Transaction_ID))
cat("Unique application ids:",unique_app_ids,"\n")
## Unique application ids: 50000
unique_cred_ids <- length(unique(credit_record$Transaction_ID))
cat("Unique credit ids:",unique_cred_ids)
## Unique credit ids: 50000
# Find the overlap
overlap_ids <- intersect(unique(application_record$Transaction_ID), 
                         unique(credit_record$Transaction_ID))
head(overlap_ids)
## [1] "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359"
## [2] "863e8aa6-847e-4ae0-b96b-65241f3450a2"
## [3] "a24efee2-16f2-42dc-a0e7-6df4960df0b8"
## [4] "bbad847a-a92f-4766-ba3f-98b9b199b4cf"
## [5] "3f1b1928-09ca-4d06-8ec3-4efd3468d0ec"
## [6] "f99db9dc-9e5c-40d1-918a-ecd3469e8c17"
num_overlap <- length(overlap_ids)
cat("No. Of Overlap ids:",num_overlap)
## No. Of Overlap ids: 50000

1.0.6 Interpretation

  • The analysis shows that there are 50,000 overlapping Transaction_IDs between the application and credit datasets.
  • This indicates that all records are common in both datasets.

1.0.7 Question 1.4: What are the different categories of the Repayment_Status column, and which one occurs most frequently?

status_distribution <- table(credit_record$Repayment_Status)
cat("Frequency Table for the Repayment_Status column :\n",status_distribution,"\n")
## Frequency Table for the Repayment_Status column :
##  4237 7364 38399
print(status_distribution)
## 
##    Defaulted Late Payment Paid On Time 
##         4237         7364        38399
most_frequent_status <- names(status_distribution[which.max(status_distribution)])
cat("Most Frequent Status: ",most_frequent_status)
## Most Frequent Status:  Paid On Time

1.0.8 Interpretation

  • The Repayment_Status column has three categories: Defaulted, Late Payment, and Paid On Time.
  • Among these, “Paid On Time” is the most frequent, indicating that the majority of customers repay their BNPL dues on time.
  • However, the presence of late payments and defaults shows that some level of credit risk still exists.

1.0.9 5. Question 1.5: Are there any duplicate IDs in the dataset that need to be removed?

app_duplicates <- sum(duplicated(application_record$Transaction_ID))
cat("Duplicate IDs in Application Record:", app_duplicates, "\n")
## Duplicate IDs in Application Record: 0
cred_duplicates <- sum(duplicated(credit_record$Transaction_ID))
cat("Duplicate IDs in Credit Record:", cred_duplicates, "\n")
## Duplicate IDs in Credit Record: 0
application_record <- application_record[!duplicated(application_record$Transaction_ID), ]
credit_record <- credit_record[!duplicated(credit_record$Transaction_ID), ]

cat("Cleaned Application Rows:", nrow(application_record), "\n")
## Cleaned Application Rows: 50000
cat("Cleaned Credit Rows:", nrow(credit_record))
## Cleaned Credit Rows: 50000

1.0.10 Interpretation

  • Here are no duplicate Transaction_IDs in either the application or credit dataset, indicating that all records are unique. Data cleaning confirms that the total number of rows remains unchanged at 50,000 in both datasets.

2 Level 2: Data Extraction & Filtering

2.0.1 Question 2.1: Perform a filtered extraction to identify the top 10 applicants with the highest Annual_Income for wealth-tier analysis.

top_10_wealth <- application_record %>%
  arrange(desc(Annual_Income)) %>%
  head(10)

print(top_10_wealth)
##                          Transaction_ID Customer_Age Gender Annual_Income
## 1  cda8bc71-033d-492f-9ab3-1d7b135e5778           42   Male        119998
## 2  6556a8a6-7bd4-42a7-8d3d-6454aff7622f           36   Male        119995
## 3  264f3c9e-773a-4ad0-9c1c-fc26f24f6dbf           38 Female        119994
## 4  b1c4bdc9-1a54-4bea-96f3-65686a502a85           39   Male        119992
## 5  8f5a9adc-b378-4966-89f5-223277a8aeaa           58 Female        119990
## 6  7b3838fd-bcc9-400e-a064-a16139c4c9a5           32 Female        119989
## 7  e0440639-8486-449d-88d2-8873832e8d84           46   Male        119989
## 8  8604c4b5-d984-4919-b741-6962cfbdb3b9           31 Female        119987
## 9  69ef7ce5-1317-4a17-8d07-045024078440           56 Female        119983
## 10 aa230197-83fe-414b-bc6d-3c0cee853853           48   Male        119982
##    Credit_Score Purchase_Category BNPL_Provider Purchase_Amount
## 1           395            Beauty      Afterpay             203
## 2           666           Fashion        Sezzle             171
## 3           423            Travel        Klarna            1243
## 4           463            Travel        Affirm             799
## 5           744       Electronics      Afterpay             940
## 6           847         Groceries      Afterpay              44
## 7           462  Home & Furniture        Affirm             203
## 8           552  Home & Furniture        Affirm             193
## 9           415            Travel        Klarna             631
## 10          400            Travel        Sezzle            2957
##    Repayment_Status
## 1      Paid On Time
## 2      Paid On Time
## 3      Paid On Time
## 4      Late Payment
## 5      Paid On Time
## 6      Paid On Time
## 7      Late Payment
## 8      Paid On Time
## 9      Paid On Time
## 10     Late Payment

2.0.2 Interpretation

  • The top 10 applicants represent the highest income group in the dataset, with annual incomes close to 120,000. Most of these customers have “Paid On Time” repayment status. - - - - However, a few cases of “Late Payment” are also observed, showing that even high-income customers can have repayment issues. This analysis helps in understanding spending and repayment patterns of the high-income segment.

2.0.3 Question 2.2: Filter the records for high-value applicants who used “Klarna” or “Afterpay” and have a Credit_Score above 750.

high_value_creditors <- application_record %>%
  filter(BNPL_Provider %in% c("Klarna", "Afterpay") & Credit_Score > 750)
cat("High Value applicants who used 'Klarna' or 'Afterpay'\n")
## High Value applicants who used 'Klarna' or 'Afterpay'
nrow(high_value_creditors)
## [1] 4521

2.0.4 Interpretation

  • The filter identified 4,521 high-value applicants using Klarna or Afterpay with a Credit Score above 750. This segment represents the “Premium” category with the lowest credit risk for the bank.

2.0.5 Question 2.3: Extract all records for users who are using “Mobile” devices to assess the handheld BNPL market share and browsing behavior.

mobile_market_data <- credit_record %>%
  filter(Device_Type == "Mobile")

head(mobile_market_data)
##                         Transaction_ID Customer_Age     Gender Annual_Income
## 1 863e8aa6-847e-4ae0-b96b-65241f3450a2           46       Male         72774
## 2 bbad847a-a92f-4766-ba3f-98b9b199b4cf           60       Male         92498
## 3 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec           25       Male         32060
## 4 f99db9dc-9e5c-40d1-918a-ecd3469e8c17           38     Female         94833
## 5 861b5462-14eb-4392-8765-baba4df192b2           28 Non-Binary         99953
## 6 83b75c0c-4e94-4966-8e89-c70dc5de1f69           41       Male         45213
##   Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Device_Type
## 1          354         Groceries        Affirm             188      Mobile
## 2          470           Fashion        Sezzle             120      Mobile
## 3          502            Travel        Klarna            1849      Mobile
## 4          779            Travel        Klarna            1112      Mobile
## 5          638       Electronics        Affirm             489      Mobile
## 6          663       Electronics      Afterpay            1084      Mobile
##   Connection_Type Checkout_Time_Seconds Browser Repayment_Status
## 1            WiFi                    60 Firefox     Paid On Time
## 2           4G/5G                   169  Chrome     Paid On Time
## 3           4G/5G                    38  Chrome        Defaulted
## 4           4G/5G                    82  Safari     Paid On Time
## 5           4G/5G                   167 Firefox     Paid On Time
## 6            WiFi                   160 Firefox     Paid On Time
cat("No. of Mobile Data Rows: \n")
## No. of Mobile Data Rows:
print(nrow(mobile_market_data))
## [1] 30023

2.0.6 Interpretation

  • A total of 30,023 records were found for users using mobile devices.
  • This indicates that a significant portion of customers prefer mobile devices for BNPL transactions.
  • It highlights the importance of optimizing mobile platforms to enhance user experience and engagement.

2.0.7 Question 2.4: Filter the credit record to isolate high-risk users who have a “Defaulted” status.

high_risk_defaults <- application_record %>%
  filter(Repayment_Status == "Defaulted")
cat("No. Of rows high rish users for 'Defaulted': \n" )
## No. Of rows high rish users for 'Defaulted':
nrow(high_risk_defaults)
## [1] 4379

2.0.8 Interpretation

  • A total of 4,379 customers have defaulted on their payments, indicating a segment of high-risk users. This highlights the presence of credit risk within the dataset and suggests the need for careful risk assessment and stricter lending strategies for such customers.

2.0.9 Question 2.5: Calculate the count of applicants who completed their transaction in less than 30 seconds (high-speed checkout analysis).

rapid_transactions <- credit_record %>%
  filter(Checkout_Time_Seconds < 30)

cat("Count of high speed transaction 'rapid checkouts':\n")
## Count of high speed transaction 'rapid checkouts':
 nrow(rapid_transactions)
## [1] 7261

2.0.10 Interpretation

  • A total of 7261 transactions were completed in less than 30 seconds.
  • This indicates a significant number of users prefer quick and efficient checkout processes.
  • Fast transactions may reflect better user experience and platform performance.

3 Level 3: Advanced Data Transformation & Business Logic

3.0.1 Question 3.1: Segmented Spending Profiling: Perform a deep-dive into purchasing patterns by calculating the Mean, Median, and Standard Deviation of Purchase_Amount. Group this by Education_Level and Gender to see if advanced degree holders exhibit more conservative spending behavior.

spending_profile <- application_record %>%
  group_by(Purchase_Category, Gender) %>%
  summarize(
    Mean_Spending = mean(Purchase_Amount, na.rm = TRUE),
    Median_Spending = median(Purchase_Amount, na.rm = TRUE),
    SD_Spending = sd(Purchase_Amount, na.rm = TRUE),
    Total_Count = n(),
    .groups = "drop" 
  ) %>%
  arrange(desc(Mean_Spending))


cat("Summary Table for 'Purchase category and Gender':\n")
## Summary Table for 'Purchase category and Gender':
print(spending_profile)
## # A tibble: 18 × 6
##    Purchase_Category Gender     Mean_Spending Median_Spending SD_Spending
##    <chr>             <chr>              <dbl>           <dbl>       <dbl>
##  1 Travel            Non-Binary         1661.           1654        773. 
##  2 Travel            Male               1638.           1634.       774. 
##  3 Travel            Female             1621.           1616.       776. 
##  4 Electronics       Male               1109.           1107        524. 
##  5 Electronics       Female             1099.           1086        517. 
##  6 Electronics       Non-Binary         1074.           1036        511. 
##  7 Fashion           Non-Binary          162.            158         79.6
##  8 Home & Furniture  Non-Binary          161.            158         78.7
##  9 Fashion           Male                161.            163         81.6
## 10 Beauty            Non-Binary          161.            160         83.5
## 11 Groceries         Male                160.            160         80.7
## 12 Home & Furniture  Male                160.            160         80.9
## 13 Beauty            Male                160.            160         81.7
## 14 Home & Furniture  Female              159.            160         80.9
## 15 Groceries         Female              159.            159         80.3
## 16 Fashion           Female              159.            156         80.3
## 17 Beauty            Female              158.            156         81.2
## 18 Groceries         Non-Binary          157.            160         81.7
## # ℹ 1 more variable: Total_Count <int>

3.0.2 Interpretation

  • The analysis shows that spending patterns vary significantly across different purchase categories.
  • Travel and Electronics have the highest average spending, indicating high-value purchases.
  • Categories such as Fashion, Beauty, and Groceries show lower spending, representing everyday expenses.
  • Spending behavior is largely consistent across gender groups, with minimal differences observed.
  • Overall, purchase category has a stronger influence on spending behavior than gender.

3.0.3 Question 3.2: Discretionary Income Categorization: Develop a custom Income_Bracket variable using Nested If-Else logic. Categorize users as “Entry-Level” (< 30k), “Mid-Tier” (30k–70k), and “Wealth-Tier” (> 70k) based on their Annual_Income to identify which segment relies most on BNPL services.

application_record <- application_record %>%
  mutate(Income_Bracket = ifelse(Annual_Income < 30000, "Entry-Level",
                          ifelse(Annual_Income <= 70000, "Mid-Tier", "Wealth-Tier")))
cat("No. of users are in each category:")
## No. of users are in each category:
table(application_record$Income_Bracket)
## 
## Entry-Level    Mid-Tier Wealth-Tier 
##        5053       19907       25040
head(application_record[, c("Transaction_ID", "Annual_Income", "Income_Bracket")])
##                         Transaction_ID Annual_Income Income_Bracket
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359         32293       Mid-Tier
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2         72774    Wealth-Tier
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8         82207    Wealth-Tier
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf         92498    Wealth-Tier
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec         32060       Mid-Tier
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17         94833    Wealth-Tier

3.0.4 Interpretation

  • Users are categorized into Entry-Level, Mid-Tier, and Wealth-Tier based on income.
  • Most users belong to Mid and Wealth segments, indicating higher BNPL usage in these groups.
  • This helps analyze spending and risk behavior across income levels.

3.0.5 Question 3.3: To perform a comparative analysis of creditworthiness by calculating the Average Credit Score and Total Applicant Count for each of the segmented Income Brackets (Entry-Level, Mid-Tier, and Wealth-Tier).

income_credit_summary <- application_record %>%
  group_by(Income_Bracket) %>%
  summarise(
    Avg_Credit_Score = mean(Credit_Score, na.rm = TRUE),
    Total_Applicants = n()
  )
cat('Summary Table for Credit Scores by Income Segment:')
## Summary Table for Credit Scores by Income Segment:
print(income_credit_summary)
## # A tibble: 3 × 3
##   Income_Bracket Avg_Credit_Score Total_Applicants
##   <chr>                     <dbl>            <int>
## 1 Entry-Level                574.             5053
## 2 Mid-Tier                   573.            19907
## 3 Wealth-Tier                574.            25040

3.0.6 Interpretation

  • The average credit score is almost the same for all income groups.
  • This means income does not have a big impact on credit score.
  • Wealth-Tier has the highest number of applicants, followed by Mid-Tier and Entry-Level.
  • Overall, people from all income levels show similar credit behavior.

3.0.7 Question 3.4: Provider Comparison Matrix: Reshape the data using pivot_wider() to create a comparison matrix. Display the average Purchase_Amount for each BNPL_Provider across different Gender types to identify provider-specific demographic preferences.

provider_summary <- application_record %>%
  group_by(BNPL_Provider, Gender) %>%
  summarize(Avg_Purchase = mean(Purchase_Amount, na.rm = TRUE), .groups = "drop")

provider_matrix <- provider_summary %>%
  pivot_wider(names_from = Gender, values_from = Avg_Purchase)

cat('View the comparison matrix')
## View the comparison matrix
print(provider_matrix)
## # A tibble: 4 × 4
##   BNPL_Provider Female  Male `Non-Binary`
##   <chr>          <dbl> <dbl>        <dbl>
## 1 Affirm          551.  566.         532.
## 2 Afterpay        565.  569.         524.
## 3 Klarna          568.  559.         651.
## 4 Sezzle          567.  565.         513.

3.0.8 Interpretation

  • The table shows average spending for each BNPL provider across different genders.
  • It helps compare how Male, Female, and other users spend on each provider.
  • Small differences suggest similar spending behavior across genders.
  • This analysis helps identify if any provider is preferred by a specific gender group.

3.0.9 Question 3.5: Master Dataset Synthesis: Execute a Left Join to merge the demographic variables from the application file with the behavioral metadata from the credit file using Transaction_ID. This creates a unified “Customer 360” view for modeling.

master_dataset <- application_record %>%
  left_join(credit_record, by = "Transaction_ID")

dim(master_dataset)
## [1] 50000    22
colnames(master_dataset)
##  [1] "Transaction_ID"        "Customer_Age.x"        "Gender.x"             
##  [4] "Annual_Income.x"       "Credit_Score.x"        "Purchase_Category.x"  
##  [7] "BNPL_Provider.x"       "Purchase_Amount.x"     "Repayment_Status.x"   
## [10] "Income_Bracket"        "Customer_Age.y"        "Gender.y"             
## [13] "Annual_Income.y"       "Credit_Score.y"        "Purchase_Category.y"  
## [16] "BNPL_Provider.y"       "Purchase_Amount.y"     "Device_Type"          
## [19] "Connection_Type"       "Checkout_Time_Seconds" "Browser"              
## [22] "Repayment_Status.y"
sum(is.na(master_dataset$Purchase_Amount))
## [1] 0

3.0.10 Interpretation

  • The left join successfully merged both datasets using Transaction_ID.
  • The final dataset has 50,000 rows and 23 columns, meaning no records were lost.
  • Columns with .x and .y show overlapping variables from both datasets.
  • No missing values after the join indicate perfect data alignment.
  • The dataset is ready for further analysis and modeling.
master_dataset <- master_dataset %>%
  mutate(Is_Bad = ifelse(Repayment_Status.x %in% c("Defaulted", "Late"), 1, 0))

4 Level 4: Exploratory Data Analysis (EDA) & Domain Visualization

4.0.1 Question 4.1: Target Class Frequency (Basic Bar Chart) To see the count of Good vs. Bad customers using the Is_Bad variable.

ggplot(master_dataset, aes(x = factor(Is_Bad), fill = factor(Is_Bad))) +
  geom_bar() +
  scale_fill_manual(values = c("skyblue", "pink")) +
  labs(title = "Count of Good (0) vs Bad (1) Customers",
       x = "Risk Status (Is_Bad)",
       y = "Frequency") +
  theme_minimal()

4.0.2 Interpretation

  • The bar chart shows the distribution of Good (0) vs Bad (1) customers.
  • A significantly higher percentage of Good customers indicates a class imbalance.
  • The Bad (default) class is much smaller, which may impact model performance.
  • This imbalance suggests the need for techniques like resampling or class weighting during modeling.

4.0.3 Question 4.2: Distribution of Credit Scores (Histogram): To see the distribution of Credit Scores, similar to your airquality temperature example.

ggplot(master_dataset, aes(x = Credit_Score.x)) +
  geom_histogram(binwidth = 20, fill = "orange", color = "red") +
  labs(title = "Distribution of Credit Scores",
       x = "Credit Score",
       y = "Frequency") +
  theme_minimal()

4.0.4 Interpretation

  • Most users have mid-range credit scores.
  • Very few users are at extreme low or high levels.
  • Overall credit profile is moderate.

4.0.5 Question 4.3: Wealth vs. Consumption (Scatter Plot with Regression)

Objective: To check the correlation between Income and Spending, similar to your mtcars weight vs. mpg example.

ggplot(master_dataset, aes(x = Annual_Income.x, y = Purchase_Amount.x)) +
  geom_point(alpha = 0.2, color = "blue", size = 1) +
  
    geom_smooth(method = "lm", formula = y ~ x, color = "black", linewidth = 1.5, se = TRUE) +
  
  labs(title = "Wealth vs. Consumption Scatter Analysis",
       subtitle = "Testing the relationship between Income and Spending",
       x = "Annual Income (USD)",
       y = "Purchase Amount (USD)") +
  theme_minimal()

4.0.6 Interpretation

  • The scatter plot shows the relationship between Annual Income and Purchase Amount.
  • The upward sloping regression line indicates a positive relationship, meaning higher income generally leads to higher spending.
  • However, the wide spread of data points suggests that spending varies significantly even among users with similar incomes.
  • This indicates that income alone is not the only factor influencing spending behavior.
  • Overall, there is a moderate positive correlation between income and consumption.

4.0.7 Question4.4: Category-wise Spending by Segment (Faceted Bar Chart): To show total spending per category across different Income Brackets, similar to your sales_data example.

category_summary <- master_dataset %>%
  group_by(Income_Bracket, Purchase_Category.x) %>%
  summarise(Total_Spend = sum(Purchase_Amount.x, na.rm = TRUE), .groups = "drop")
ggplot(category_summary, aes(x = Purchase_Category.x, y = Total_Spend, fill = Purchase_Category.x)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Income_Bracket) +
  labs(title = "Total Spending by Category and Income Segment",
       x = "Category",
       y = "Total Sales Amount") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

4.0.8 Interpretation

  • The faceted bar chart shows total spending across different purchase categories for each income group.
  • The Wealth-Tier segment contributes the highest overall spending, indicating stronger purchasing power.
  • Categories like Travel and Electronics show higher spending across all income groups.
  • Entry-Level users spend less overall, especially in high-value categories.
  • The chart clearly highlights that spending increases with income level and varies by category.

4.0.9 Question 4.5: Provider Performance by Gender (Grouped Bar Chart): To compare transaction counts for each BNPL provider side-by-side, similar to your ToothGrowth example.

ggplot(master_dataset, aes(x = BNPL_Provider.x, fill = Gender.x)) +
  geom_bar(position = "dodge") +
  labs(title = "Provider Transaction Volume by Gender",
       x = "BNPL Provider",
       y = "Count of Transactions",
       fill = "Gender") +
  theme_minimal()

4.0.10 Interpretation

  • The grouped bar chart compares transaction volumes across BNPL providers for different gender groups.
  • It shows how usage of each provider varies between Male, Female, and other gender categories.
  • Some providers may have higher usage from specific gender groups, indicating demographic preferences.
  • Overall, the chart helps identify which providers are more popular and how gender influences BNPL usage patterns.

5 Level 5: Advanced Diagnostic Analytics & Feature Diagnostics

5.0.1 Question 5.1: To determine if “Annual Income” acts as a statistically significant predictor for “Repayment Status.” This diagnostic helps validate whether income is a reliable feature for distinguishing between high-risk (Defaulted) and low-risk (Paid) customers.

ggplot(master_dataset, aes(x = Repayment_Status.x, y = Annual_Income.x, fill = Repayment_Status.x)) +
  geom_boxplot(notch = TRUE, alpha = 0.8) +
  theme_minimal() +
  labs(title = "Income Distribution by Repayment Status",
       subtitle = "Diagnostic: Analyzing if income distinguishes risk tiers",
       x = "Repayment Status",
       y = "Annual Income (USD)") +
  theme(legend.position = "none")

5.0.2 Interpretation

  • This is an Income Distribution Diagnostic. I used a notched boxplot to evaluate the discriminatory power of the Annual Income variable against the repayment status.
  • The key feature here is the notch. It represents the 95% confidence interval around the median. Because these notches do not overlap, I have visual evidence that there is a statistically significant difference in income levels between those who default and those who pay on time.

5.0.3 Question 5.2: Multi-collinearity Analysis (Correlation Heatmap)

Objective: To detect “multicollinearity”—a scenario where two independent variables are highly correlated (e.g., > 0.85). If features are too similar, the model cannot distinguish their individual impact, which can destabilize predictive performance.

numerical_data <- master_dataset[, sapply(master_dataset, is.numeric)]
cor_matrix <- cor(numerical_data, use = "complete.obs")

melted_cormat <- melt(cor_matrix)

ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3.5) +
    scale_fill_gradient2(low = "#512c3f", mid = "white", high = "#d87b3e", 
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(title = "Multivariate Feature Interaction Heatmap",
       subtitle = "Checking for multicollinearity among numerical features",
       x = "",
       y = "")

5.0.4 Interpretation

  • I used a heatmap with a Pearson Correlation gradient to instantly visualize these relationships.
  • Positive Correlation (Orange): Shows variables move in the same direction.
  • Diagonal Line: Represents the correlation of a variable with itself (always 1.0).
  • The Takeaway: By removing the redundant variables identified by this heatmap, I improved the stability and interpretability of my Random Forest model.

5.0.5 Question 5.3: Automated Feature Sanitization & Redundancy Removal

Objective: To identify and eliminate redundant features (duplicate columns) generated during the dataset merge process. By removing these artifacts, we prevent the model from receiving duplicate signals, ensuring feature parsimony and cleaner input for the predictive algorithm.

master_dataset_clean <- master_dataset[, !grepl("\\.y$", names(master_dataset))]
head(master_dataset_clean)
##                         Transaction_ID Customer_Age.x Gender.x Annual_Income.x
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359             56     Male           32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2             46     Male           72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8             32     Male           82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf             60     Male           92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec             25     Male           32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17             38   Female           94833
##   Credit_Score.x Purchase_Category.x BNPL_Provider.x Purchase_Amount.x
## 1            353              Beauty          Sezzle               249
## 2            354           Groceries          Affirm               188
## 3            630              Travel          Sezzle              1610
## 4            470             Fashion          Sezzle               120
## 5            502              Travel          Klarna              1849
## 6            779              Travel          Klarna              1112
##   Repayment_Status.x Income_Bracket Device_Type Connection_Type
## 1          Defaulted       Mid-Tier      Tablet            WiFi
## 2       Paid On Time    Wealth-Tier      Mobile            WiFi
## 3       Paid On Time    Wealth-Tier     Desktop            WiFi
## 4       Paid On Time    Wealth-Tier      Mobile           4G/5G
## 5       Paid On Time       Mid-Tier      Mobile           4G/5G
## 6       Paid On Time    Wealth-Tier      Mobile           4G/5G
##   Checkout_Time_Seconds Browser Is_Bad
## 1                    82 Firefox      1
## 2                    60 Firefox      0
## 3                    86  Chrome      0
## 4                   169  Chrome      0
## 5                    38  Chrome      0
## 6                    82  Safari      0

5.0.6 Interpretation

  • I used !grepl (a regular expression filter) because it makes the data preprocessing reproducible and automated.
  • It ensures that any column ending in .y is programmatically removed, which is a standard ‘best practice’ in production-level data science pipelines to maintain data integrity.

5.0.7 Question 5.4: Cluster-Specific Diagnostic Scatter Plot

Objective: To group applicants into segments using K-Means clustering and visualize where the model makes the most errors. This identifies which customer types are hardest to predict.

set.seed(123)
clusters <- kmeans(master_dataset_clean[, c("Annual_Income.x", "Credit_Score.x")], centers = 3)
master_dataset_clean$Cluster <- as.factor(clusters$cluster)

ggplot(master_dataset_clean, aes(x = Annual_Income.x, y = Credit_Score.x, color = Is_Bad)) +
  geom_point(alpha = 0.6) +
  facet_wrap(~Cluster) +
  theme_minimal() +
  labs(title = "Diagnostic: Model Errors by Customer Segment",
       subtitle = "Comparing performance across K-Means Clusters",
       x = "Annual Income", y = "Credit Score")

5.0.8 Interpretation

  • I implemented K-Means clustering to segment the applicant base into three distinct financial tiers based on their income and credit profile:
  • Cluster 1 (Low Income): Entry-level segment with varying credit scores.
  • Cluster 2 (Mid-Income): The middle segment, representing the bulk of the applicant pool.
  • Cluster 3 (High Income): High-income segment, which generally shows different repayment behavioral patterns.
  • dark blue = Safe (0) and light blue = Risky (1)

5.0.9 Question 5.5: Univariate Risk Distribution Analysis

Objective: To visually compare the distribution of key numerical features (like Annual_Income or Credit_Score) for “Good” vs. “Bad” customers. This helps us see if these variables actually “separate” the risk classes.

ggplot(master_dataset_clean, aes(x = Annual_Income.x, fill = as.factor(Is_Bad))) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = c("0" = "forestgreen", "1" = "firebrick"), 
                    labels = c("Good (0)", "Bad (1)")) +
  scale_y_continuous(labels = label_number(accuracy = 0.00001)) + 
  theme_minimal() + 
  theme(legend.position = "top") + 
  labs(title = "Feature-Risk Distribution: Annual Income",
       subtitle = "Comparing income density between Good and Bad customers",
       x = "Annual Income",
       y = "Density",
       fill = "Repayment Status")

5.0.10 Interpretation

  • I made this to check if Income is a good tool to spot risky customers.
  • The graph shows that the ‘Bad’ customers are mostly in the lower income group, while the ‘Good’ customers have higher incomes.
  • This proves that Income is a strong indicator of risk. However, since the groups mix in the middle, I know my model needs to look at other data too, not just income.

6 Level 6: Pre-Modeling Feature Analysis

6.0.1 Question 6.1: What were the critical data transformation steps taken to ensure the dataset was compatible with the classification model?

cols_to_drop <- c("Transaction_ID")
master_dataset_clean <- master_dataset_clean[, !(names(master_dataset_clean) %in% cols_to_drop)]

char_cols <- sapply(master_dataset_clean, is.character)
master_dataset_clean[char_cols] <- lapply(master_dataset_clean[char_cols], as.factor)

master_dataset_clean$Is_Bad <- as.factor(master_dataset_clean$Is_Bad)

str(master_dataset_clean)
## 'data.frame':    50000 obs. of  15 variables:
##  $ Customer_Age.x       : int  56 46 32 60 25 38 56 36 40 28 ...
##  $ Gender.x             : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 2 1 1 2 1 1 ...
##  $ Annual_Income.x      : int  32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
##  $ Credit_Score.x       : int  353 354 630 470 502 779 403 731 669 359 ...
##  $ Purchase_Category.x  : Factor w/ 6 levels "Beauty","Electronics",..: 1 4 6 3 6 6 6 6 5 1 ...
##  $ BNPL_Provider.x      : Factor w/ 4 levels "Affirm","Afterpay",..: 4 1 4 4 3 3 3 3 4 3 ...
##  $ Purchase_Amount.x    : int  249 188 1610 120 1849 1112 418 2117 286 244 ...
##  $ Repayment_Status.x   : Factor w/ 3 levels "Defaulted","Late Payment",..: 1 3 3 3 3 3 3 3 3 1 ...
##  $ Income_Bracket       : Factor w/ 3 levels "Entry-Level",..: 2 3 3 3 2 3 3 3 3 3 ...
##  $ Device_Type          : Factor w/ 3 levels "Desktop","Mobile",..: 3 2 1 2 2 2 1 1 1 3 ...
##  $ Connection_Type      : Factor w/ 3 levels "4G/5G","VPN",..: 3 3 3 1 1 1 3 3 1 1 ...
##  $ Checkout_Time_Seconds: int  82 60 86 169 38 82 169 14 164 5 ...
##  $ Browser              : Factor w/ 4 levels "Chrome","Edge",..: 3 3 1 1 1 4 4 3 1 4 ...
##  $ Is_Bad               : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
##  $ Cluster              : Factor w/ 3 levels "1","2","3": 1 2 2 3 1 3 3 2 2 3 ...
summary(master_dataset_clean)
##  Customer_Age.x        Gender.x     Annual_Income.x  Credit_Score.x 
##  Min.   :18.00   Female    :24139   Min.   : 20000   Min.   :300.0  
##  1st Qu.:29.00   Male      :23952   1st Qu.: 44916   1st Qu.:435.0  
##  Median :41.00   Non-Binary: 1909   Median : 70072   Median :572.0  
##  Mean   :40.98                      Mean   : 69981   Mean   :573.6  
##  3rd Qu.:53.00                      3rd Qu.: 95262   3rd Qu.:711.0  
##  Max.   :64.00                      Max.   :119998   Max.   :849.0  
##        Purchase_Category.x BNPL_Provider.x  Purchase_Amount.x
##  Beauty          :8381     Affirm  :12418   Min.   :  20.0   
##  Electronics     :8303     Afterpay:12536   1st Qu.: 124.0   
##  Fashion         :8307     Klarna  :12545   Median : 229.0   
##  Groceries       :8240     Sezzle  :12501   Mean   : 563.2   
##  Home & Furniture:8368                      3rd Qu.: 785.0   
##  Travel          :8401                      Max.   :2999.0   
##     Repayment_Status.x     Income_Bracket   Device_Type    Connection_Type
##  Defaulted   : 4379    Entry-Level: 5053   Desktop:14975   4G/5G:19976    
##  Late Payment: 8009    Mid-Tier   :19907   Mobile :30023   VPN  : 4960    
##  Paid On Time:37612    Wealth-Tier:25040   Tablet : 5002   WiFi :25064    
##                                                                           
##                                                                           
##                                                                           
##  Checkout_Time_Seconds    Browser      Is_Bad    Cluster  
##  Min.   :  5.0         Chrome :12479   0:45621   1:16792  
##  1st Qu.: 48.0         Edge   :12652   1: 4379   2:16538  
##  Median : 92.0         Firefox:12481             3:16670  
##  Mean   : 92.1         Safari :12388                      
##  3rd Qu.:136.0                                            
##  Max.   :179.0

6.0.2 Interpretation

  • Dataset Scale: 50,000 observations across 14 variables, providing a robust foundation for modeling.
  • Feature Encoding: All categorical features are successfully mapped to Factors, and numeric features to Integers, ensuring full compatibility with the Random Forest algorithm.
  • Statistical Validity: Numeric features display realistic ranges (Min/Max/Mean) with no anomalous or negative outliers.
  • Target Readiness: The binary Is_Bad factor (0/1) is properly structured, establishing a clear classification objective.

6.0.3 Question 6.2: Demographic Independence & Linear Relationship Assessment

Objective: To evaluate the linear dependence between Customer_Age and Annual_Income. This diagnostic tests whether these demographic features provide redundant information (multicollinearity) or if they contribute unique, non-linear signals to the predictive model.

ggplot(master_dataset_clean, aes(x = Customer_Age.x, y = Annual_Income.x)) +
  geom_point(alpha = 0.5, color = "steelblue") +  # Points
  geom_smooth(method = "lm", color = "darkred", se = TRUE) + 
  theme_minimal() +
  labs(title = "Income vs. Age: Correlation Analysis",
       subtitle = "Visualizing the linear relationship between Age and Annual Income",
       x = "Customer Age",
       y = "Annual Income")
## `geom_smooth()` using formula = 'y ~ x'

6.0.4 Interpretation

  • This is a valuable diagnostic. The vertical bars occur because Customer_Age is discrete data (integer years), and the flat regression line proves that income in this specific dataset is not a simple linear function of age.
  • This validates that I should not remove either feature; both variables provide unique, independent information to the Random Forest model.
  • If the line had been steep, it would have suggested multicollinearity, which would have forced me to remove one of the variables.

6.0.5 Question 6.3: Visual Data Topology & Predictive Feasibility Analysis

Objective: To perform a diagnostic visualization of the dataset’s class separability. By plotting the feature distribution against the risk target, we assess whether a linear model (e.g., Logistic Regression) can successfully partition the data, or if the “Cloud” distribution necessitates non-linear ensemble methods.

set.seed(123)
eda_sample <- master_dataset_clean[sample(1:nrow(master_dataset_clean), 2000), ]
ggplot(eda_sample, aes(x = Annual_Income.x, y = Customer_Age.x, color = as.factor(Is_Bad))) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  labs(title = "Data Topology Analysis: Income vs Age",
       subtitle = "Visualizing the 'Cloud' distribution before modeling",
       x = "Annual Income",
       y = "Customer Age",
       color = "Risk (0=Good, 1=Bad)") +
  geom_smooth(method = "lm", color = "black", se = FALSE) 
## `geom_smooth()` using formula = 'y ~ x'

6.0.6 Interpretation

  • The data points form an overlapping cloud and the linear trend line is flat, it proves that a simple linear model would fail. This cloud is visual evidence that I need a robust, non-linear model like Random Forest to carve through this complexity and extract the risk patterns.

6.0.7 Question 6.4: Data Quality Assurance & Integrity

Objective: To validate the overall health of the dataset by checking for class balance, structure, and identifying potential data leakage. This stage ensures that the data is statistically sound and free from features that would artificially inflate model performance.

write.csv(master_dataset_clean, "cleaned_master_dataset.csv", row.names = FALSE)
cat("Data successfully saved as 'cleaned_master_dataset.csv'")
## Data successfully saved as 'cleaned_master_dataset.csv'
my_data <- read.csv("cleaned_master_dataset.csv")

balanced_data <- ovun.sample(Is_Bad ~ ., data = my_data, 
                             method = "both", 
                             p = 0.5, 
                             N = 35000)$data 

print(table(balanced_data$Is_Bad))
## 
##     0     1 
## 17759 17241

6.0.8 Interpretation

The class distribution shows that the dataset is almost perfectly balanced after preprocessing and sampling.

  • Class 0 (Low Risk / Good Customers) contains 17,759 observations
  • Class 1 (High Risk / Bad Customers) contains 17,241 observations

This balanced distribution helps the Random Forest model learn both classes effectively and prevents bias toward the majority class. As a result, the model can identify risky borrowers more accurately and improve overall prediction performance.

6.0.9 Question 6.5: Data Partitioning

Objective: To implement a robust data partitioning strategy. By splitting the dataset into “Training” and “Testing” , we ensure that the model is evaluated on “unseen” data.

set.seed(123)

split_index <- sample(1:nrow(balanced_data), 0.7 * nrow(balanced_data))

train_data <- balanced_data[split_index, ]
test_data  <- balanced_data[-split_index, ]

cat("Training set dimensions:", dim(train_data), "\n")
## Training set dimensions: 24500 15
cat("Testing set dimensions:", dim(test_data), "\n")
## Testing set dimensions: 10500 15

6.0.10 Interpretation

The dataset was divided into separate training and testing subsets to evaluate the model on unseen data.

  • The Training Set contains 24,500 rows and 15 variables, which were used to train the Random Forest model.
  • The Testing Set contains 10,500 rows and 15 variables, which were used to evaluate the model’s predictive performance.

This data partitioning strategy helps ensure that the model generalizes well and reduces the risk of overfitting.

train_data <- train_data[, !(names(train_data) %in% c("Repayment_Status.x", "Cluster"))]
test_data  <- test_data[, !(names(test_data) %in% c("Repayment_Status.x", "Cluster"))]

cols_to_factor <- c("Gender.x", "Purchase_Category.x", "BNPL_Provider.x", 
                    "Income_Bracket", "Device_Type", "Connection_Type", "Browser")

train_data[cols_to_factor] <- lapply(train_data[cols_to_factor], as.factor)
test_data[cols_to_factor]  <- lapply(test_data[cols_to_factor], as.factor)

train_data$Is_Bad <- as.factor(train_data$Is_Bad)
test_data$Is_Bad  <- as.factor(test_data$Is_Bad)

# 4. Final check
str(train_data)
## 'data.frame':    24500 obs. of  13 variables:
##  $ Customer_Age.x       : int  62 42 45 56 58 59 59 60 47 24 ...
##  $ Gender.x             : Factor w/ 3 levels "Female","Male",..: 2 1 1 2 1 2 3 2 2 2 ...
##  $ Annual_Income.x      : int  64845 32071 47133 119774 92598 102377 108637 59371 84838 39981 ...
##  $ Credit_Score.x       : int  406 316 504 810 507 316 544 431 567 463 ...
##  $ Purchase_Category.x  : Factor w/ 6 levels "Beauty","Electronics",..: 1 3 3 3 4 2 4 4 3 3 ...
##  $ BNPL_Provider.x      : Factor w/ 4 levels "Affirm","Afterpay",..: 3 3 2 2 1 4 1 4 4 4 ...
##  $ Purchase_Amount.x    : int  99 30 286 108 136 244 115 145 255 198 ...
##  $ Income_Bracket       : Factor w/ 3 levels "Entry-Level",..: 2 2 2 3 3 3 3 2 3 2 ...
##  $ Device_Type          : Factor w/ 3 levels "Desktop","Mobile",..: 2 1 3 2 2 3 2 2 3 3 ...
##  $ Connection_Type      : Factor w/ 3 levels "4G/5G","VPN",..: 3 3 1 3 1 3 1 1 3 3 ...
##  $ Checkout_Time_Seconds: int  82 91 29 24 40 100 84 101 146 66 ...
##  $ Browser              : Factor w/ 4 levels "Chrome","Edge",..: 2 2 3 3 1 2 3 2 2 2 ...
##  $ Is_Bad               : Factor w/ 2 levels "0","1": 1 2 2 1 1 2 1 1 2 2 ...

7 Level 7: Model training

7.0.1 Question 7.1: Random Forest Model Training

Objective: To train the Random Forest model on the train_data subset. This algorithm utilizes an ensemble approach to handle the non-linear relationships between customer behaviors and credit risk, ensuring high predictive accuracy without overfitting.

set.seed(123)

model_final <- randomForest(
  Is_Bad ~ ., 
  data = train_data, 
  ntree = 100, 
  importance = TRUE
)
print(model_final)
## 
## Call:
##  randomForest(formula = Is_Bad ~ ., data = train_data, ntree = 100,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 1.76%
## Confusion matrix:
##       0     1 class.error
## 0 11928   399 0.032367973
## 1    32 12141 0.002628769

7.0.2 Interpretation:

Random Forest model has achieved high predictive accuracy and strong generalization capabilities.

OOB (Out-of-Bag) Error Rate (1.76%): This is a key metric for a Random Forest. An error rate of 1.76% indicates that the model is highly accurate on unseen data. It confirms that the model is robust and not overfitting to your training dataset.

Confusion Matrix Insights:

Class 1 (Bad/High-Risk): The model correctly identified 12,141 risky customers and only missed 32. A class error of 0.26% for high-risk customers is outstanding, as this is the most critical metric for a lender.

Class 0 (Good/Low-Risk): The model shows a very low error rate of ~3.2% for the good class, indicating that it rarely misclassifies reliable customers as risky.

  • The model demonstrates a strong ability to distinguish between high-risk and low-risk applicants. It maintains a healthy balance, showing that it can effectively minimize the risk of lending to defaulters while correctly approving legitimate applicants.

7.0.3 Question 7.2: Model Performance Evaluation (Testing Phase)

Objective: To evaluate the predictive performance of the trained Random Forest model on the “hold-out” Testing dataset (10,500 records). This validates the model’s ability to generalize to new, unseen customer profiles and ensures that the model is not overfitting to the training set.

predictions <- predict(model_final, test_data)

conf_matrix <- confusionMatrix(predictions, test_data$Is_Bad)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5267    3
##          1  165 5065
##                                           
##                Accuracy : 0.984           
##                  95% CI : (0.9814, 0.9863)
##     No Information Rate : 0.5173          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.968           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9696          
##             Specificity : 0.9994          
##          Pos Pred Value : 0.9994          
##          Neg Pred Value : 0.9685          
##              Prevalence : 0.5173          
##          Detection Rate : 0.5016          
##    Detection Prevalence : 0.5019          
##       Balanced Accuracy : 0.9845          
##                                           
##        'Positive' Class : 0               
## 

7.0.4 Interpretation

  • The model demonstrates a high degree of discriminatory power with an accuracy of 98.4% and a Kappa of 0.968.
  • The balanced performance across both sensitivity and specificity indicates that the model is robust and minimizes both types of classification errors, making it highly suitable for production deployment in a fintech environment.

7.0.5 Question 7.3: Feature Importance Analysis

Objective: To determine which customer features (e.g., Credit_Score, Annual_Income, Purchase_Amount) have the highest predictive power in determining credit risk. This provides transparency into the model’s decision-making process.

varImpPlot(model_final, 
           n.var = 10,
           main = "Top 10 Determinants of Credit Risk", 
           col = "darkblue", 
           pch = 19)

7.0.6 Interpretation

  • While financial metrics like Credit Score and Income are important, the model has identified Device_Type as the most discriminative feature. This suggests that the model is capturing behavioral patterns—such as the difference between legitimate high-end users and potentially fraudulent or high-risk users utilizing specific device types or emulators. This gives the bank/lender an actionable insight for fraud prevention strategies.
# After training your model:
# model <- randomForest(...) 

# Save it as an object
saveRDS(model_final, "model.rds")
print("Model saved successfully!")
## [1] "Model saved successfully!"

9 Project Summary

The primary objective of this project was to develop an intelligent and reliable machine learning system for assessing customer credit risk in the Buy Now Pay Later (BNPL) sector. To achieve this objective, a complete machine learning pipeline was implemented, including data preprocessing, train-test partitioning, class balancing using ROSE sampling, feature selection, Random Forest model training, and performance evaluation.

The final Random Forest classification model demonstrated excellent predictive capability in distinguishing between low-risk and high-risk borrowers. The model achieved:

  • Accuracy: 98.4%
  • OOB Error Rate: 1.76%

These results indicate that the developed system is highly effective in identifying risky borrowers while maintaining strong generalization performance on unseen test data.

The confusion matrix analysis further confirmed that the model correctly classified the majority of customers with very low misclassification rates. The high sensitivity and specificity values demonstrate that the system performs efficiently for both default detection and safe customer approval.

Feature importance analysis revealed that: - Device_Type - Credit_Score.x - Annual_Income.x

were among the most influential variables affecting customer credit behavior. This finding highlights the growing importance of digital behavioral indicators alongside traditional financial metrics in modern Fintech risk assessment systems.

To improve accessibility and real-world usability, a professional Shiny web application was also developed. The application provides: - Real-time credit risk prediction - Interactive borrower assessment - AI-based approval recommendations - User-friendly dashboard interface

Overall, the project successfully demonstrates how machine learning and behavioral analytics can be integrated to build a scalable, accurate, and business-oriented BNPL credit risk assessment framework for modern financial platforms.