library(tidyverse)
library(dplyr)
library(ggplot2)
application_record <- read.csv("bnpl_dataset.csv")
credit_record <- read.csv("bnpl_dataset_v2.csv")
# Print Dataset
print(head(application_record))
##                         Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359           56   Male         32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2           46   Male         72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8           32   Male         82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf           60   Male         92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec           25   Male         32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17           38 Female         94833
##   Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Repayment_Status
## 1          353            Beauty        Sezzle             249        Defaulted
## 2          354         Groceries        Affirm             188     Paid On Time
## 3          630            Travel        Sezzle            1610     Paid On Time
## 4          470           Fashion        Sezzle             120     Paid On Time
## 5          502            Travel        Klarna            1849     Paid On Time
## 6          779            Travel        Klarna            1112     Paid On Time
print(head(credit_record))
##                         Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359           56   Male         32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2           46   Male         72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8           32   Male         82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf           60   Male         92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec           25   Male         32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17           38 Female         94833
##   Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Device_Type
## 1          353            Beauty        Sezzle             249      Tablet
## 2          354         Groceries        Affirm             188      Mobile
## 3          630            Travel        Sezzle            1610     Desktop
## 4          470           Fashion        Sezzle             120      Mobile
## 5          502            Travel        Klarna            1849      Mobile
## 6          779            Travel        Klarna            1112      Mobile
##   Connection_Type Checkout_Time_Seconds Browser Repayment_Status
## 1            WiFi                    82 Firefox     Paid On Time
## 2            WiFi                    60 Firefox     Paid On Time
## 3            WiFi                    86  Chrome     Paid On Time
## 4           4G/5G                   169  Chrome     Paid On Time
## 5           4G/5G                    38  Chrome        Defaulted
## 6           4G/5G                    82  Safari     Paid On Time

Level 1: Understanding the Data(Basic Exploration)

Question 1.1 What is the structure(rows , columns and data types) of the application_record and credit_record?

cat("--- Application Record Structure ---\n")
## --- Application Record Structure ---
colnames(application_record)
## [1] "Transaction_ID"    "Customer_Age"      "Gender"           
## [4] "Annual_Income"     "Credit_Score"      "Purchase_Category"
## [7] "BNPL_Provider"     "Purchase_Amount"   "Repayment_Status"
cat("---credit_record---")
## ---credit_record---
colnames(credit_record)
##  [1] "Transaction_ID"        "Customer_Age"          "Gender"               
##  [4] "Annual_Income"         "Credit_Score"          "Purchase_Category"    
##  [7] "BNPL_Provider"         "Purchase_Amount"       "Device_Type"          
## [10] "Connection_Type"       "Checkout_Time_Seconds" "Browser"              
## [13] "Repayment_Status"
dim(application_record)
## [1] 50000     9
dim(credit_record)
## [1] 50000    13
str(application_record)
## 'data.frame':    50000 obs. of  9 variables:
##  $ Transaction_ID   : chr  "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
##  $ Customer_Age     : int  56 46 32 60 25 38 56 36 40 28 ...
##  $ Gender           : chr  "Male" "Male" "Male" "Male" ...
##  $ Annual_Income    : int  32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
##  $ Credit_Score     : int  353 354 630 470 502 779 403 731 669 359 ...
##  $ Purchase_Category: chr  "Beauty" "Groceries" "Travel" "Fashion" ...
##  $ BNPL_Provider    : chr  "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
##  $ Purchase_Amount  : int  249 188 1610 120 1849 1112 418 2117 286 244 ...
##  $ Repayment_Status : chr  "Defaulted" "Paid On Time" "Paid On Time" "Paid On Time" ...
str(credit_record)
## 'data.frame':    50000 obs. of  13 variables:
##  $ Transaction_ID       : chr  "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
##  $ Customer_Age         : int  56 46 32 60 25 38 56 36 40 28 ...
##  $ Gender               : chr  "Male" "Male" "Male" "Male" ...
##  $ Annual_Income        : int  32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
##  $ Credit_Score         : int  353 354 630 470 502 779 403 731 669 359 ...
##  $ Purchase_Category    : chr  "Beauty" "Groceries" "Travel" "Fashion" ...
##  $ BNPL_Provider        : chr  "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
##  $ Purchase_Amount      : int  249 188 1610 120 1849 1112 418 2117 286 244 ...
##  $ Device_Type          : chr  "Tablet" "Mobile" "Desktop" "Mobile" ...
##  $ Connection_Type      : chr  "WiFi" "WiFi" "WiFi" "4G/5G" ...
##  $ Checkout_Time_Seconds: int  82 60 86 169 38 82 169 14 164 5 ...
##  $ Browser              : chr  "Firefox" "Firefox" "Chrome" "Chrome" ...
##  $ Repayment_Status     : chr  "Paid On Time" "Paid On Time" "Paid On Time" "Paid On Time" ...

Interpretation: - The application dataset contains 50,000 records with 9 variables, while the credit dataset contains 50,000 records with 13 variables. Both datasets include a mix of numerical variables (such as Age, Income, Credit Score, and Purchase Amount) and categorical variables (such as Gender, Purchase Category, and Repayment Status). The credit dataset includes additional features like Device Type, Connection Type, and Browser, providing more behavioral insights. Overall, the structured and consistent data makes it suitable for analysis and predictive modeling.

Question 1.2: How many missing values exist in each dataset?

colSums(is.na(application_record))
##    Transaction_ID      Customer_Age            Gender     Annual_Income 
##                 0                 0                 0                 0 
##      Credit_Score Purchase_Category     BNPL_Provider   Purchase_Amount 
##                 0                 0                 0                 0 
##  Repayment_Status 
##                 0
colSums(is.na(credit_record))
##        Transaction_ID          Customer_Age                Gender 
##                     0                     0                     0 
##         Annual_Income          Credit_Score     Purchase_Category 
##                     0                     0                     0 
##         BNPL_Provider       Purchase_Amount           Device_Type 
##                     0                     0                     0 
##       Connection_Type Checkout_Time_Seconds               Browser 
##                     0                     0                     0 
##      Repayment_Status 
##                     0

Interpretation: - Both the application and credit datasets have 0 missing values across all columns, indicating complete and clean data.

Question 1.3: How many unique Customer_IDs are present in both datasets, and what is the count of overlapping IDs?

unique_app_ids <- length(unique(application_record$Transaction_ID))
cat("Unique application ids:",unique_app_ids,"\n")
## Unique application ids: 50000
unique_cred_ids <- length(unique(credit_record$Transaction_ID))
cat("Unique credit ids:",unique_cred_ids)
## Unique credit ids: 50000
# Find the overlap
overlap_ids <- intersect(unique(application_record$Transaction_ID), 
                         unique(credit_record$Transaction_ID))
head(overlap_ids)
## [1] "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359"
## [2] "863e8aa6-847e-4ae0-b96b-65241f3450a2"
## [3] "a24efee2-16f2-42dc-a0e7-6df4960df0b8"
## [4] "bbad847a-a92f-4766-ba3f-98b9b199b4cf"
## [5] "3f1b1928-09ca-4d06-8ec3-4efd3468d0ec"
## [6] "f99db9dc-9e5c-40d1-918a-ecd3469e8c17"
num_overlap <- length(overlap_ids)
cat("No. Of Overlap ids:",num_overlap)
## No. Of Overlap ids: 50000

Interpretation:

  • The analysis shows that there are 50,000 overlapping Transaction_IDs between the application and credit datasets.
  • This indicates that all records are common in both datasets.

Question 1.4: What are the different categories of the Repayment_Status column, and which one occurs most frequently?

status_distribution <- table(credit_record$Repayment_Status)
cat("Frequency Table for the Repayment_Status column :\n",status_distribution,"\n")
## Frequency Table for the Repayment_Status column :
##  4237 7364 38399
most_frequent_status <- names(status_distribution[which.max(status_distribution)])
cat("Most Frequent Status: ",most_frequent_status)
## Most Frequent Status:  Paid On Time

Interpretation: - The Repayment_Status column has three categories: Defaulted, Late Payment, and Paid On Time. Among these, “Paid On Time” is the most frequent, indicating that the majority of customers repay their BNPL dues on time. However, the presence of late payments and defaults shows that some level of credit risk still exists.

5. Question 1.5: Are there any duplicate IDs in the dataset that need to be removed?

app_duplicates <- sum(duplicated(application_record$Transaction_ID))
cat("Duplicate IDs in Application Record:", app_duplicates, "\n")
## Duplicate IDs in Application Record: 0
cred_duplicates <- sum(duplicated(credit_record$Transaction_ID))
cat("Duplicate IDs in Credit Record:", cred_duplicates, "\n")
## Duplicate IDs in Credit Record: 0
application_record <- application_record[!duplicated(application_record$Transaction_ID), ]
credit_record <- credit_record[!duplicated(credit_record$Transaction_ID), ]

cat("Cleaned Application Rows:", nrow(application_record), "\n")
## Cleaned Application Rows: 50000
cat("Cleaned Credit Rows:", nrow(credit_record))
## Cleaned Credit Rows: 50000

Interpretation: - Here are no duplicate Transaction_IDs in either the application or credit dataset, indicating that all records are unique. Data cleaning confirms that the total number of rows remains unchanged at 50,000 in both datasets.

Level 2: Data Extraction & Filtering

Question 2.1: Perform a filtered extraction to identify the top 10 applicants with the highest Annual_Income for wealth-tier analysis.

top_10_wealth <- application_record %>%
  arrange(desc(Annual_Income)) %>%
  head(10)

print(top_10_wealth)
##                          Transaction_ID Customer_Age Gender Annual_Income
## 1  cda8bc71-033d-492f-9ab3-1d7b135e5778           42   Male        119998
## 2  6556a8a6-7bd4-42a7-8d3d-6454aff7622f           36   Male        119995
## 3  264f3c9e-773a-4ad0-9c1c-fc26f24f6dbf           38 Female        119994
## 4  b1c4bdc9-1a54-4bea-96f3-65686a502a85           39   Male        119992
## 5  8f5a9adc-b378-4966-89f5-223277a8aeaa           58 Female        119990
## 6  7b3838fd-bcc9-400e-a064-a16139c4c9a5           32 Female        119989
## 7  e0440639-8486-449d-88d2-8873832e8d84           46   Male        119989
## 8  8604c4b5-d984-4919-b741-6962cfbdb3b9           31 Female        119987
## 9  69ef7ce5-1317-4a17-8d07-045024078440           56 Female        119983
## 10 aa230197-83fe-414b-bc6d-3c0cee853853           48   Male        119982
##    Credit_Score Purchase_Category BNPL_Provider Purchase_Amount
## 1           395            Beauty      Afterpay             203
## 2           666           Fashion        Sezzle             171
## 3           423            Travel        Klarna            1243
## 4           463            Travel        Affirm             799
## 5           744       Electronics      Afterpay             940
## 6           847         Groceries      Afterpay              44
## 7           462  Home & Furniture        Affirm             203
## 8           552  Home & Furniture        Affirm             193
## 9           415            Travel        Klarna             631
## 10          400            Travel        Sezzle            2957
##    Repayment_Status
## 1      Paid On Time
## 2      Paid On Time
## 3      Paid On Time
## 4      Late Payment
## 5      Paid On Time
## 6      Paid On Time
## 7      Late Payment
## 8      Paid On Time
## 9      Paid On Time
## 10     Late Payment

Interpretation: - The top 10 applicants represent the highest income group in the dataset, with annual incomes close to 120,000. Most of these customers have “Paid On Time” repayment status. However, a few cases of “Late Payment” are also observed, showing that even high-income customers can have repayment issues. This analysis helps in understanding spending and repayment patterns of the high-income segment.

Question 2.2: Filter the records for high-value applicants who used “Klarna” or “Afterpay” and have a Credit_Score above 750.

high_value_creditors <- application_record %>%
  filter(BNPL_Provider %in% c("Klarna", "Afterpay") & Credit_Score > 750)

nrow(high_value_creditors)
## [1] 4521
  • Interpretation:
  • The filter identified 4,521 high-value applicants using Klarna or Afterpay with a Credit Score above 750. This segment represents the “Premium” category with the lowest credit risk for the bank.

####Question 2.3: Extract all records for users who are using “Mobile” devices to assess the handheld BNPL market share and browsing behavior.

mobile_market_data <- credit_record %>%
  filter(Device_Type == "Mobile")

head(mobile_market_data)
##                         Transaction_ID Customer_Age     Gender Annual_Income
## 1 863e8aa6-847e-4ae0-b96b-65241f3450a2           46       Male         72774
## 2 bbad847a-a92f-4766-ba3f-98b9b199b4cf           60       Male         92498
## 3 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec           25       Male         32060
## 4 f99db9dc-9e5c-40d1-918a-ecd3469e8c17           38     Female         94833
## 5 861b5462-14eb-4392-8765-baba4df192b2           28 Non-Binary         99953
## 6 83b75c0c-4e94-4966-8e89-c70dc5de1f69           41       Male         45213
##   Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Device_Type
## 1          354         Groceries        Affirm             188      Mobile
## 2          470           Fashion        Sezzle             120      Mobile
## 3          502            Travel        Klarna            1849      Mobile
## 4          779            Travel        Klarna            1112      Mobile
## 5          638       Electronics        Affirm             489      Mobile
## 6          663       Electronics      Afterpay            1084      Mobile
##   Connection_Type Checkout_Time_Seconds Browser Repayment_Status
## 1            WiFi                    60 Firefox     Paid On Time
## 2           4G/5G                   169  Chrome     Paid On Time
## 3           4G/5G                    38  Chrome        Defaulted
## 4           4G/5G                    82  Safari     Paid On Time
## 5           4G/5G                   167 Firefox     Paid On Time
## 6            WiFi                   160 Firefox     Paid On Time
cat("No. of Mobile Data Rows: \n")
## No. of Mobile Data Rows:
print(nrow(mobile_market_data))
## [1] 30023

Interpretation: - A total of 30,023 records were found for users using mobile devices. This indicates that a significant portion of customers prefer mobile devices for BNPL transactions. It highlights the importance of optimizing mobile platforms to enhance user experience and engagement.

Question 2.4: Filter the credit record to isolate high-risk users who have a “Defaulted” status.

high_risk_defaults <- application_record %>%
  filter(Repayment_Status == "Defaulted")
cat("No. Of rows high rish users for 'Defaulted': \n" )
## No. Of rows high rish users for 'Defaulted':
nrow(high_risk_defaults)
## [1] 4379

Interpretation: - A total of 4,379 customers have defaulted on their payments, indicating a segment of high-risk users. This highlights the presence of credit risk within the dataset and suggests the need for careful risk assessment and stricter lending strategies for such customers.

Question 2.5: Calculate the count of applicants who completed their transaction in less than 30 seconds (high-speed checkout analysis).

# Filtering for rapid transactions under 30 seconds
rapid_transactions <- credit_record %>%
  filter(Checkout_Time_Seconds < 30)

# Counting the high-speed transactions
cat("Count of high speed transaction 'rapid checkouts':\n")
## Count of high speed transaction 'rapid checkouts':
 nrow(rapid_transactions)
## [1] 7261

Interpretation:

  • A total of 7261 transactions were completed in less than 30 seconds.
  • This indicates a significant number of users prefer quick and efficient checkout processes.
  • Fast transactions may reflect better user experience and platform performance.

Level 3: Advanced Data Transformation & Business Logic

Question 3.1: Segmented Spending Profiling: Perform a deep-dive into purchasing patterns by calculating the Mean, Median, and Standard Deviation of Purchase_Amount. Group this by Education_Level and Gender to see if advanced degree holders exhibit more conservative spending behavior.

# Grouping by existing columns: Purchase_Category and Gender
spending_profile <- application_record %>%
  group_by(Purchase_Category, Gender) %>%
  summarize(
    Mean_Spending = mean(Purchase_Amount, na.rm = TRUE),
    Median_Spending = median(Purchase_Amount, na.rm = TRUE),
    SD_Spending = sd(Purchase_Amount, na.rm = TRUE),
    Total_Count = n(),
    .groups = "drop" 
  ) %>%
  arrange(desc(Mean_Spending))

# Display the summary table
cat("Summary Table for 'Purchase category and Gender':\n")
## Summary Table for 'Purchase category and Gender':
print(spending_profile)
## # A tibble: 18 Ă— 6
##    Purchase_Category Gender     Mean_Spending Median_Spending SD_Spending
##    <chr>             <chr>              <dbl>           <dbl>       <dbl>
##  1 Travel            Non-Binary         1661.           1654        773. 
##  2 Travel            Male               1638.           1634.       774. 
##  3 Travel            Female             1621.           1616.       776. 
##  4 Electronics       Male               1109.           1107        524. 
##  5 Electronics       Female             1099.           1086        517. 
##  6 Electronics       Non-Binary         1074.           1036        511. 
##  7 Fashion           Non-Binary          162.            158         79.6
##  8 Home & Furniture  Non-Binary          161.            158         78.7
##  9 Fashion           Male                161.            163         81.6
## 10 Beauty            Non-Binary          161.            160         83.5
## 11 Groceries         Male                160.            160         80.7
## 12 Home & Furniture  Male                160.            160         80.9
## 13 Beauty            Male                160.            160         81.7
## 14 Home & Furniture  Female              159.            160         80.9
## 15 Groceries         Female              159.            159         80.3
## 16 Fashion           Female              159.            156         80.3
## 17 Beauty            Female              158.            156         81.2
## 18 Groceries         Non-Binary          157.            160         81.7
## # ℹ 1 more variable: Total_Count <int>

Interpretation:

  • The analysis shows that spending patterns vary significantly across different purchase categories.
  • Travel and Electronics have the highest average spending, indicating high-value purchases.
  • Categories such as Fashion, Beauty, and Groceries show lower spending, representing everyday expenses.
  • Spending behavior is largely consistent across gender groups, with minimal differences observed.
  • Overall, purchase category has a stronger influence on spending behavior than gender.

Question 3.2: Discretionary Income Categorization: Develop a custom Income_Bracket variable using Nested If-Else logic. Categorize users as “Entry-Level” (< 30k), “Mid-Tier” (30k–70k), and “Wealth-Tier” (> 70k) based on their Annual_Income to identify which segment relies most on BNPL services.

application_record <- application_record %>%
  mutate(Income_Bracket = ifelse(Annual_Income < 30000, "Entry-Level",
                          ifelse(Annual_Income <= 70000, "Mid-Tier", "Wealth-Tier")))
cat("No. of users are in each category:")
## No. of users are in each category:
table(application_record$Income_Bracket)
## 
## Entry-Level    Mid-Tier Wealth-Tier 
##        5053       19907       25040
head(application_record[, c("Transaction_ID", "Annual_Income", "Income_Bracket")])
##                         Transaction_ID Annual_Income Income_Bracket
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359         32293       Mid-Tier
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2         72774    Wealth-Tier
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8         82207    Wealth-Tier
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf         92498    Wealth-Tier
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec         32060       Mid-Tier
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17         94833    Wealth-Tier

Interpretation:

  • Users are categorized into Entry-Level, Mid-Tier, and Wealth-Tier based on income.
  • Most users belong to Mid and Wealth segments, indicating higher BNPL usage in these groups.
  • This helps analyze spending and risk behavior across income levels.

Question 3.3: To perform a comparative analysis of creditworthiness by calculating the Average Credit Score and Total Applicant Count for each of the segmented Income Brackets (Entry-Level, Mid-Tier, and Wealth-Tier).

income_credit_summary <- application_record %>%
  group_by(Income_Bracket) %>%
  summarise(
    Avg_Credit_Score = mean(Credit_Score, na.rm = TRUE),
    Total_Applicants = n()
  )
cat('Summary Table for Credit Scores by Income Segment:')
## Summary Table for Credit Scores by Income Segment:
print(income_credit_summary)
## # A tibble: 3 Ă— 3
##   Income_Bracket Avg_Credit_Score Total_Applicants
##   <chr>                     <dbl>            <int>
## 1 Entry-Level                574.             5053
## 2 Mid-Tier                   573.            19907
## 3 Wealth-Tier                574.            25040

Interpretation:

  • The average credit score is almost the same for all income groups.
  • This means income does not have a big impact on credit score.
  • Wealth-Tier has the highest number of applicants, followed by Mid-Tier and Entry-Level.
  • Overall, people from all income levels show similar credit behavior.

Question 3.4: Provider Comparison Matrix: Reshape the data using pivot_wider() to create a comparison matrix. Display the average Purchase_Amount for each BNPL_Provider across different Gender types to identify provider-specific demographic preferences.

provider_summary <- application_record %>%
  group_by(BNPL_Provider, Gender) %>%
  summarize(Avg_Purchase = mean(Purchase_Amount, na.rm = TRUE), .groups = "drop")

provider_matrix <- provider_summary %>%
  pivot_wider(names_from = Gender, values_from = Avg_Purchase)

cat('View the comparison matrix')
## View the comparison matrix
print(provider_matrix)
## # A tibble: 4 Ă— 4
##   BNPL_Provider Female  Male `Non-Binary`
##   <chr>          <dbl> <dbl>        <dbl>
## 1 Affirm          551.  566.         532.
## 2 Afterpay        565.  569.         524.
## 3 Klarna          568.  559.         651.
## 4 Sezzle          567.  565.         513.

Interpretation:

  • The table shows average spending for each BNPL provider across different genders.
  • It helps compare how Male, Female, and other users spend on each provider.
  • Small differences suggest similar spending behavior across genders.
  • This analysis helps identify if any provider is preferred by a specific gender group.

Question 3.5: Master Dataset Synthesis: Execute a Left Join to merge the demographic variables from the application file with the behavioral metadata from the credit file using Transaction_ID. This creates a unified “Customer 360” view for modeling.

master_dataset <- application_record %>%
  left_join(credit_record, by = "Transaction_ID")

dim(master_dataset)
## [1] 50000    22
colnames(master_dataset)
##  [1] "Transaction_ID"        "Customer_Age.x"        "Gender.x"             
##  [4] "Annual_Income.x"       "Credit_Score.x"        "Purchase_Category.x"  
##  [7] "BNPL_Provider.x"       "Purchase_Amount.x"     "Repayment_Status.x"   
## [10] "Income_Bracket"        "Customer_Age.y"        "Gender.y"             
## [13] "Annual_Income.y"       "Credit_Score.y"        "Purchase_Category.y"  
## [16] "BNPL_Provider.y"       "Purchase_Amount.y"     "Device_Type"          
## [19] "Connection_Type"       "Checkout_Time_Seconds" "Browser"              
## [22] "Repayment_Status.y"
sum(is.na(master_dataset$Purchase_Amount))
## [1] 0

Interpretation:

  • The left join successfully merged both datasets using Transaction_ID.
  • The final dataset has 50,000 rows and 23 columns, meaning no records were lost.
  • Columns with .x and .y show overlapping variables from both datasets.
  • No missing values after the join indicate perfect data alignment.
  • The dataset is ready for further analysis and modeling.
master_dataset <- master_dataset %>%
  mutate(Is_Bad = ifelse(Repayment_Status.x %in% c("Defaulted", "Late"), 1, 0))

Level 4: Exploratory Data Analysis (EDA) & Domain Visualization

Question 4.1: Target Class Frequency (Basic Bar Chart) To see the count of Good vs. Bad customers using the Is_Bad variable.

ggplot(master_dataset, aes(x = factor(Is_Bad), fill = factor(Is_Bad))) +
  geom_bar() +
  scale_fill_manual(values = c("skyblue", "pink")) +
  labs(title = "Count of Good (0) vs Bad (1) Customers",
       x = "Risk Status (Is_Bad)",
       y = "Frequency") +
  theme_minimal()

Interpretation:

  • The bar chart shows the distribution of Good (0) vs Bad (1) customers.
  • A significantly higher percentage of Good customers indicates a class imbalance.
  • The Bad (default) class is much smaller, which may impact model performance.
  • This imbalance suggests the need for techniques like resampling or class weighting during modeling.

Question 4.2: Distribution of Credit Scores (Histogram): To see the distribution of Credit Scores, similar to your airquality temperature example.

ggplot(master_dataset, aes(x = Credit_Score.x)) +
  geom_histogram(binwidth = 20, fill = "orange", color = "red") +
  labs(title = "Distribution of Credit Scores",
       x = "Credit Score",
       y = "Frequency") +
  theme_minimal()

Interpretation:

  • Most users have mid-range credit scores.
  • Very few users are at extreme low or high levels.
  • Overall credit profile is moderate.

####Question 4.3:Wealth vs. Consumption (Scatter Plot with Regression): To check the correlation between Income and Spending, similar to your mtcars weight vs. mpg example.

ggplot(master_dataset, aes(x = Annual_Income.x, y = Purchase_Amount.x)) +
  geom_point(alpha = 0.2, color = "blue", size = 1) +
  
    geom_smooth(method = "lm", formula = y ~ x, color = "black", linewidth = 1.5, se = TRUE) +
  
  labs(title = "Wealth vs. Consumption Scatter Analysis",
       subtitle = "Testing the relationship between Income and Spending",
       x = "Annual Income (USD)",
       y = "Purchase Amount (USD)") +
  theme_minimal()

Interpretation:

  • The scatter plot shows the relationship between Annual Income and Purchase Amount.
  • The upward sloping regression line indicates a positive relationship, meaning higher income generally leads to higher spending.
  • However, the wide spread of data points suggests that spending varies significantly even among users with similar incomes.
  • This indicates that income alone is not the only factor influencing spending behavior.
  • Overall, there is a moderate positive correlation between income and consumption.

Question4.4: Category-wise Spending by Segment (Faceted Bar Chart): To show total spending per category across different Income Brackets, similar to your sales_data example.

category_summary <- master_dataset %>%
  group_by(Income_Bracket, Purchase_Category.x) %>%
  summarise(Total_Spend = sum(Purchase_Amount.x, na.rm = TRUE), .groups = "drop")
ggplot(category_summary, aes(x = Purchase_Category.x, y = Total_Spend, fill = Purchase_Category.x)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Income_Bracket) +
  labs(title = "Total Spending by Category and Income Segment",
       x = "Category",
       y = "Total Sales Amount") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Interpretation:

  • The faceted bar chart shows total spending across different purchase categories for each income group.
  • The Wealth-Tier segment contributes the highest overall spending, indicating stronger purchasing power.
  • Categories like Travel and Electronics show higher spending across all income groups.
  • Entry-Level users spend less overall, especially in high-value categories.
  • The chart clearly highlights that spending increases with income level and varies by category.

Question 4.5: Provider Performance by Gender (Grouped Bar Chart): To compare transaction counts for each BNPL provider side-by-side, similar to your ToothGrowth example.

ggplot(master_dataset, aes(x = BNPL_Provider.x, fill = Gender.x)) +
  geom_bar(position = "dodge") +
  labs(title = "Provider Transaction Volume by Gender",
       x = "BNPL Provider",
       y = "Count of Transactions",
       fill = "Gender") +
  theme_minimal()

Interpretation:

  • The grouped bar chart compares transaction volumes across BNPL providers for different gender groups.
  • It shows how usage of each provider varies between Male, Female, and other gender categories.
  • Some providers may have higher usage from specific gender groups, indicating demographic preferences.
  • Overall, the chart helps identify which providers are more popular and how gender influences BNPL usage patterns.