library(tidyverse)
library(dplyr)
library(ggplot2)
application_record <- read.csv("bnpl_dataset.csv")
credit_record <- read.csv("bnpl_dataset_v2.csv")
# Print Dataset
print(head(application_record))
## Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359 56 Male 32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2 46 Male 72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8 32 Male 82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf 60 Male 92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec 25 Male 32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17 38 Female 94833
## Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Repayment_Status
## 1 353 Beauty Sezzle 249 Defaulted
## 2 354 Groceries Affirm 188 Paid On Time
## 3 630 Travel Sezzle 1610 Paid On Time
## 4 470 Fashion Sezzle 120 Paid On Time
## 5 502 Travel Klarna 1849 Paid On Time
## 6 779 Travel Klarna 1112 Paid On Time
print(head(credit_record))
## Transaction_ID Customer_Age Gender Annual_Income
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359 56 Male 32293
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2 46 Male 72774
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8 32 Male 82207
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf 60 Male 92498
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec 25 Male 32060
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17 38 Female 94833
## Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Device_Type
## 1 353 Beauty Sezzle 249 Tablet
## 2 354 Groceries Affirm 188 Mobile
## 3 630 Travel Sezzle 1610 Desktop
## 4 470 Fashion Sezzle 120 Mobile
## 5 502 Travel Klarna 1849 Mobile
## 6 779 Travel Klarna 1112 Mobile
## Connection_Type Checkout_Time_Seconds Browser Repayment_Status
## 1 WiFi 82 Firefox Paid On Time
## 2 WiFi 60 Firefox Paid On Time
## 3 WiFi 86 Chrome Paid On Time
## 4 4G/5G 169 Chrome Paid On Time
## 5 4G/5G 38 Chrome Defaulted
## 6 4G/5G 82 Safari Paid On Time
cat("--- Application Record Structure ---\n")
## --- Application Record Structure ---
colnames(application_record)
## [1] "Transaction_ID" "Customer_Age" "Gender"
## [4] "Annual_Income" "Credit_Score" "Purchase_Category"
## [7] "BNPL_Provider" "Purchase_Amount" "Repayment_Status"
cat("---credit_record---")
## ---credit_record---
colnames(credit_record)
## [1] "Transaction_ID" "Customer_Age" "Gender"
## [4] "Annual_Income" "Credit_Score" "Purchase_Category"
## [7] "BNPL_Provider" "Purchase_Amount" "Device_Type"
## [10] "Connection_Type" "Checkout_Time_Seconds" "Browser"
## [13] "Repayment_Status"
dim(application_record)
## [1] 50000 9
dim(credit_record)
## [1] 50000 13
str(application_record)
## 'data.frame': 50000 obs. of 9 variables:
## $ Transaction_ID : chr "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
## $ Customer_Age : int 56 46 32 60 25 38 56 36 40 28 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Annual_Income : int 32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
## $ Credit_Score : int 353 354 630 470 502 779 403 731 669 359 ...
## $ Purchase_Category: chr "Beauty" "Groceries" "Travel" "Fashion" ...
## $ BNPL_Provider : chr "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
## $ Purchase_Amount : int 249 188 1610 120 1849 1112 418 2117 286 244 ...
## $ Repayment_Status : chr "Defaulted" "Paid On Time" "Paid On Time" "Paid On Time" ...
str(credit_record)
## 'data.frame': 50000 obs. of 13 variables:
## $ Transaction_ID : chr "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359" "863e8aa6-847e-4ae0-b96b-65241f3450a2" "a24efee2-16f2-42dc-a0e7-6df4960df0b8" "bbad847a-a92f-4766-ba3f-98b9b199b4cf" ...
## $ Customer_Age : int 56 46 32 60 25 38 56 36 40 28 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Annual_Income : int 32293 72774 82207 92498 32060 94833 89772 82341 80517 108929 ...
## $ Credit_Score : int 353 354 630 470 502 779 403 731 669 359 ...
## $ Purchase_Category : chr "Beauty" "Groceries" "Travel" "Fashion" ...
## $ BNPL_Provider : chr "Sezzle" "Affirm" "Sezzle" "Sezzle" ...
## $ Purchase_Amount : int 249 188 1610 120 1849 1112 418 2117 286 244 ...
## $ Device_Type : chr "Tablet" "Mobile" "Desktop" "Mobile" ...
## $ Connection_Type : chr "WiFi" "WiFi" "WiFi" "4G/5G" ...
## $ Checkout_Time_Seconds: int 82 60 86 169 38 82 169 14 164 5 ...
## $ Browser : chr "Firefox" "Firefox" "Chrome" "Chrome" ...
## $ Repayment_Status : chr "Paid On Time" "Paid On Time" "Paid On Time" "Paid On Time" ...
Interpretation: - The application dataset contains 50,000 records with 9 variables, while the credit dataset contains 50,000 records with 13 variables. Both datasets include a mix of numerical variables (such as Age, Income, Credit Score, and Purchase Amount) and categorical variables (such as Gender, Purchase Category, and Repayment Status). The credit dataset includes additional features like Device Type, Connection Type, and Browser, providing more behavioral insights. Overall, the structured and consistent data makes it suitable for analysis and predictive modeling.
colSums(is.na(application_record))
## Transaction_ID Customer_Age Gender Annual_Income
## 0 0 0 0
## Credit_Score Purchase_Category BNPL_Provider Purchase_Amount
## 0 0 0 0
## Repayment_Status
## 0
colSums(is.na(credit_record))
## Transaction_ID Customer_Age Gender
## 0 0 0
## Annual_Income Credit_Score Purchase_Category
## 0 0 0
## BNPL_Provider Purchase_Amount Device_Type
## 0 0 0
## Connection_Type Checkout_Time_Seconds Browser
## 0 0 0
## Repayment_Status
## 0
Interpretation: - Both the application and credit datasets have 0 missing values across all columns, indicating complete and clean data.
unique_app_ids <- length(unique(application_record$Transaction_ID))
cat("Unique application ids:",unique_app_ids,"\n")
## Unique application ids: 50000
unique_cred_ids <- length(unique(credit_record$Transaction_ID))
cat("Unique credit ids:",unique_cred_ids)
## Unique credit ids: 50000
# Find the overlap
overlap_ids <- intersect(unique(application_record$Transaction_ID),
unique(credit_record$Transaction_ID))
head(overlap_ids)
## [1] "6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359"
## [2] "863e8aa6-847e-4ae0-b96b-65241f3450a2"
## [3] "a24efee2-16f2-42dc-a0e7-6df4960df0b8"
## [4] "bbad847a-a92f-4766-ba3f-98b9b199b4cf"
## [5] "3f1b1928-09ca-4d06-8ec3-4efd3468d0ec"
## [6] "f99db9dc-9e5c-40d1-918a-ecd3469e8c17"
num_overlap <- length(overlap_ids)
cat("No. Of Overlap ids:",num_overlap)
## No. Of Overlap ids: 50000
Interpretation:
status_distribution <- table(credit_record$Repayment_Status)
cat("Frequency Table for the Repayment_Status column :\n",status_distribution,"\n")
## Frequency Table for the Repayment_Status column :
## 4237 7364 38399
most_frequent_status <- names(status_distribution[which.max(status_distribution)])
cat("Most Frequent Status: ",most_frequent_status)
## Most Frequent Status: Paid On Time
Interpretation: - The Repayment_Status
column has three categories: Defaulted, Late Payment, and Paid On Time.
Among these, “Paid On Time” is the most frequent, indicating that the
majority of customers repay their BNPL dues on time. However, the
presence of late payments and defaults shows that some level of credit
risk still exists.
app_duplicates <- sum(duplicated(application_record$Transaction_ID))
cat("Duplicate IDs in Application Record:", app_duplicates, "\n")
## Duplicate IDs in Application Record: 0
cred_duplicates <- sum(duplicated(credit_record$Transaction_ID))
cat("Duplicate IDs in Credit Record:", cred_duplicates, "\n")
## Duplicate IDs in Credit Record: 0
application_record <- application_record[!duplicated(application_record$Transaction_ID), ]
credit_record <- credit_record[!duplicated(credit_record$Transaction_ID), ]
cat("Cleaned Application Rows:", nrow(application_record), "\n")
## Cleaned Application Rows: 50000
cat("Cleaned Credit Rows:", nrow(credit_record))
## Cleaned Credit Rows: 50000
Interpretation: - Here are no duplicate Transaction_IDs in either the application or credit dataset, indicating that all records are unique. Data cleaning confirms that the total number of rows remains unchanged at 50,000 in both datasets.
top_10_wealth <- application_record %>%
arrange(desc(Annual_Income)) %>%
head(10)
print(top_10_wealth)
## Transaction_ID Customer_Age Gender Annual_Income
## 1 cda8bc71-033d-492f-9ab3-1d7b135e5778 42 Male 119998
## 2 6556a8a6-7bd4-42a7-8d3d-6454aff7622f 36 Male 119995
## 3 264f3c9e-773a-4ad0-9c1c-fc26f24f6dbf 38 Female 119994
## 4 b1c4bdc9-1a54-4bea-96f3-65686a502a85 39 Male 119992
## 5 8f5a9adc-b378-4966-89f5-223277a8aeaa 58 Female 119990
## 6 7b3838fd-bcc9-400e-a064-a16139c4c9a5 32 Female 119989
## 7 e0440639-8486-449d-88d2-8873832e8d84 46 Male 119989
## 8 8604c4b5-d984-4919-b741-6962cfbdb3b9 31 Female 119987
## 9 69ef7ce5-1317-4a17-8d07-045024078440 56 Female 119983
## 10 aa230197-83fe-414b-bc6d-3c0cee853853 48 Male 119982
## Credit_Score Purchase_Category BNPL_Provider Purchase_Amount
## 1 395 Beauty Afterpay 203
## 2 666 Fashion Sezzle 171
## 3 423 Travel Klarna 1243
## 4 463 Travel Affirm 799
## 5 744 Electronics Afterpay 940
## 6 847 Groceries Afterpay 44
## 7 462 Home & Furniture Affirm 203
## 8 552 Home & Furniture Affirm 193
## 9 415 Travel Klarna 631
## 10 400 Travel Sezzle 2957
## Repayment_Status
## 1 Paid On Time
## 2 Paid On Time
## 3 Paid On Time
## 4 Late Payment
## 5 Paid On Time
## 6 Paid On Time
## 7 Late Payment
## 8 Paid On Time
## 9 Paid On Time
## 10 Late Payment
Interpretation: - The top 10 applicants represent the highest income group in the dataset, with annual incomes close to 120,000. Most of these customers have “Paid On Time” repayment status. However, a few cases of “Late Payment” are also observed, showing that even high-income customers can have repayment issues. This analysis helps in understanding spending and repayment patterns of the high-income segment.
high_value_creditors <- application_record %>%
filter(BNPL_Provider %in% c("Klarna", "Afterpay") & Credit_Score > 750)
nrow(high_value_creditors)
## [1] 4521
####Question 2.3: Extract all records for users who are using “Mobile” devices to assess the handheld BNPL market share and browsing behavior.
mobile_market_data <- credit_record %>%
filter(Device_Type == "Mobile")
head(mobile_market_data)
## Transaction_ID Customer_Age Gender Annual_Income
## 1 863e8aa6-847e-4ae0-b96b-65241f3450a2 46 Male 72774
## 2 bbad847a-a92f-4766-ba3f-98b9b199b4cf 60 Male 92498
## 3 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec 25 Male 32060
## 4 f99db9dc-9e5c-40d1-918a-ecd3469e8c17 38 Female 94833
## 5 861b5462-14eb-4392-8765-baba4df192b2 28 Non-Binary 99953
## 6 83b75c0c-4e94-4966-8e89-c70dc5de1f69 41 Male 45213
## Credit_Score Purchase_Category BNPL_Provider Purchase_Amount Device_Type
## 1 354 Groceries Affirm 188 Mobile
## 2 470 Fashion Sezzle 120 Mobile
## 3 502 Travel Klarna 1849 Mobile
## 4 779 Travel Klarna 1112 Mobile
## 5 638 Electronics Affirm 489 Mobile
## 6 663 Electronics Afterpay 1084 Mobile
## Connection_Type Checkout_Time_Seconds Browser Repayment_Status
## 1 WiFi 60 Firefox Paid On Time
## 2 4G/5G 169 Chrome Paid On Time
## 3 4G/5G 38 Chrome Defaulted
## 4 4G/5G 82 Safari Paid On Time
## 5 4G/5G 167 Firefox Paid On Time
## 6 WiFi 160 Firefox Paid On Time
cat("No. of Mobile Data Rows: \n")
## No. of Mobile Data Rows:
print(nrow(mobile_market_data))
## [1] 30023
Interpretation: - A total of 30,023 records were found for users using mobile devices. This indicates that a significant portion of customers prefer mobile devices for BNPL transactions. It highlights the importance of optimizing mobile platforms to enhance user experience and engagement.
high_risk_defaults <- application_record %>%
filter(Repayment_Status == "Defaulted")
cat("No. Of rows high rish users for 'Defaulted': \n" )
## No. Of rows high rish users for 'Defaulted':
nrow(high_risk_defaults)
## [1] 4379
Interpretation: - A total of 4,379 customers have defaulted on their payments, indicating a segment of high-risk users. This highlights the presence of credit risk within the dataset and suggests the need for careful risk assessment and stricter lending strategies for such customers.
# Filtering for rapid transactions under 30 seconds
rapid_transactions <- credit_record %>%
filter(Checkout_Time_Seconds < 30)
# Counting the high-speed transactions
cat("Count of high speed transaction 'rapid checkouts':\n")
## Count of high speed transaction 'rapid checkouts':
nrow(rapid_transactions)
## [1] 7261
Interpretation:
# Grouping by existing columns: Purchase_Category and Gender
spending_profile <- application_record %>%
group_by(Purchase_Category, Gender) %>%
summarize(
Mean_Spending = mean(Purchase_Amount, na.rm = TRUE),
Median_Spending = median(Purchase_Amount, na.rm = TRUE),
SD_Spending = sd(Purchase_Amount, na.rm = TRUE),
Total_Count = n(),
.groups = "drop"
) %>%
arrange(desc(Mean_Spending))
# Display the summary table
cat("Summary Table for 'Purchase category and Gender':\n")
## Summary Table for 'Purchase category and Gender':
print(spending_profile)
## # A tibble: 18 Ă— 6
## Purchase_Category Gender Mean_Spending Median_Spending SD_Spending
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Travel Non-Binary 1661. 1654 773.
## 2 Travel Male 1638. 1634. 774.
## 3 Travel Female 1621. 1616. 776.
## 4 Electronics Male 1109. 1107 524.
## 5 Electronics Female 1099. 1086 517.
## 6 Electronics Non-Binary 1074. 1036 511.
## 7 Fashion Non-Binary 162. 158 79.6
## 8 Home & Furniture Non-Binary 161. 158 78.7
## 9 Fashion Male 161. 163 81.6
## 10 Beauty Non-Binary 161. 160 83.5
## 11 Groceries Male 160. 160 80.7
## 12 Home & Furniture Male 160. 160 80.9
## 13 Beauty Male 160. 160 81.7
## 14 Home & Furniture Female 159. 160 80.9
## 15 Groceries Female 159. 159 80.3
## 16 Fashion Female 159. 156 80.3
## 17 Beauty Female 158. 156 81.2
## 18 Groceries Non-Binary 157. 160 81.7
## # ℹ 1 more variable: Total_Count <int>
Interpretation:
application_record <- application_record %>%
mutate(Income_Bracket = ifelse(Annual_Income < 30000, "Entry-Level",
ifelse(Annual_Income <= 70000, "Mid-Tier", "Wealth-Tier")))
cat("No. of users are in each category:")
## No. of users are in each category:
table(application_record$Income_Bracket)
##
## Entry-Level Mid-Tier Wealth-Tier
## 5053 19907 25040
head(application_record[, c("Transaction_ID", "Annual_Income", "Income_Bracket")])
## Transaction_ID Annual_Income Income_Bracket
## 1 6cbfd4e5-8e91-4a7b-8a14-e3dfa86a3359 32293 Mid-Tier
## 2 863e8aa6-847e-4ae0-b96b-65241f3450a2 72774 Wealth-Tier
## 3 a24efee2-16f2-42dc-a0e7-6df4960df0b8 82207 Wealth-Tier
## 4 bbad847a-a92f-4766-ba3f-98b9b199b4cf 92498 Wealth-Tier
## 5 3f1b1928-09ca-4d06-8ec3-4efd3468d0ec 32060 Mid-Tier
## 6 f99db9dc-9e5c-40d1-918a-ecd3469e8c17 94833 Wealth-Tier
Interpretation:
income_credit_summary <- application_record %>%
group_by(Income_Bracket) %>%
summarise(
Avg_Credit_Score = mean(Credit_Score, na.rm = TRUE),
Total_Applicants = n()
)
cat('Summary Table for Credit Scores by Income Segment:')
## Summary Table for Credit Scores by Income Segment:
print(income_credit_summary)
## # A tibble: 3 Ă— 3
## Income_Bracket Avg_Credit_Score Total_Applicants
## <chr> <dbl> <int>
## 1 Entry-Level 574. 5053
## 2 Mid-Tier 573. 19907
## 3 Wealth-Tier 574. 25040
Interpretation:
provider_summary <- application_record %>%
group_by(BNPL_Provider, Gender) %>%
summarize(Avg_Purchase = mean(Purchase_Amount, na.rm = TRUE), .groups = "drop")
provider_matrix <- provider_summary %>%
pivot_wider(names_from = Gender, values_from = Avg_Purchase)
cat('View the comparison matrix')
## View the comparison matrix
print(provider_matrix)
## # A tibble: 4 Ă— 4
## BNPL_Provider Female Male `Non-Binary`
## <chr> <dbl> <dbl> <dbl>
## 1 Affirm 551. 566. 532.
## 2 Afterpay 565. 569. 524.
## 3 Klarna 568. 559. 651.
## 4 Sezzle 567. 565. 513.
Interpretation:
master_dataset <- application_record %>%
left_join(credit_record, by = "Transaction_ID")
dim(master_dataset)
## [1] 50000 22
colnames(master_dataset)
## [1] "Transaction_ID" "Customer_Age.x" "Gender.x"
## [4] "Annual_Income.x" "Credit_Score.x" "Purchase_Category.x"
## [7] "BNPL_Provider.x" "Purchase_Amount.x" "Repayment_Status.x"
## [10] "Income_Bracket" "Customer_Age.y" "Gender.y"
## [13] "Annual_Income.y" "Credit_Score.y" "Purchase_Category.y"
## [16] "BNPL_Provider.y" "Purchase_Amount.y" "Device_Type"
## [19] "Connection_Type" "Checkout_Time_Seconds" "Browser"
## [22] "Repayment_Status.y"
sum(is.na(master_dataset$Purchase_Amount))
## [1] 0
Interpretation:
Transaction_ID..x and .y show overlapping
variables from both datasets.master_dataset <- master_dataset %>%
mutate(Is_Bad = ifelse(Repayment_Status.x %in% c("Defaulted", "Late"), 1, 0))
ggplot(master_dataset, aes(x = factor(Is_Bad), fill = factor(Is_Bad))) +
geom_bar() +
scale_fill_manual(values = c("skyblue", "pink")) +
labs(title = "Count of Good (0) vs Bad (1) Customers",
x = "Risk Status (Is_Bad)",
y = "Frequency") +
theme_minimal()
Interpretation:
ggplot(master_dataset, aes(x = Credit_Score.x)) +
geom_histogram(binwidth = 20, fill = "orange", color = "red") +
labs(title = "Distribution of Credit Scores",
x = "Credit Score",
y = "Frequency") +
theme_minimal()
Interpretation:
####Question 4.3:Wealth vs. Consumption (Scatter Plot with Regression): To check the correlation between Income and Spending, similar to your mtcars weight vs. mpg example.
ggplot(master_dataset, aes(x = Annual_Income.x, y = Purchase_Amount.x)) +
geom_point(alpha = 0.2, color = "blue", size = 1) +
geom_smooth(method = "lm", formula = y ~ x, color = "black", linewidth = 1.5, se = TRUE) +
labs(title = "Wealth vs. Consumption Scatter Analysis",
subtitle = "Testing the relationship between Income and Spending",
x = "Annual Income (USD)",
y = "Purchase Amount (USD)") +
theme_minimal()
Interpretation:
category_summary <- master_dataset %>%
group_by(Income_Bracket, Purchase_Category.x) %>%
summarise(Total_Spend = sum(Purchase_Amount.x, na.rm = TRUE), .groups = "drop")
ggplot(category_summary, aes(x = Purchase_Category.x, y = Total_Spend, fill = Purchase_Category.x)) +
geom_bar(stat = "identity") +
facet_wrap(~Income_Bracket) +
labs(title = "Total Spending by Category and Income Segment",
x = "Category",
y = "Total Sales Amount") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Interpretation:
ggplot(master_dataset, aes(x = BNPL_Provider.x, fill = Gender.x)) +
geom_bar(position = "dodge") +
labs(title = "Provider Transaction Volume by Gender",
x = "BNPL Provider",
y = "Count of Transactions",
fill = "Gender") +
theme_minimal()
Interpretation: