## Variable Names:
## [1] "ID" "LIMIT_BAL"
## [3] "SEX" "EDUCATION"
## [5] "MARRIAGE" "AGE"
## [7] "PAY_0" "PAY_2"
## [9] "PAY_3" "PAY_4"
## [11] "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2"
## [15] "BILL_AMT3" "BILL_AMT4"
## [17] "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1" "PAY_AMT2"
## [21] "PAY_AMT3" "PAY_AMT4"
## [23] "PAY_AMT5" "PAY_AMT6"
## [25] "default payment next month"
## # A tibble: 5 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0
## 3 3 90000 2 2 2 34 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
## # A tibble: 10 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29991 140000 1 2 1 41 0 0 0 0 0
## 2 29992 210000 1 2 1 34 3 2 2 2 2
## 3 29993 10000 1 3 1 43 0 0 0 -2 -2
## 4 29994 100000 1 1 2 38 0 -1 -1 0 0
## 5 29995 80000 1 2 2 34 2 2 2 2 2
## 6 29996 220000 1 3 1 39 0 0 0 0 0
## 7 29997 150000 1 3 2 43 -1 -1 -1 -1 0
## 8 29998 30000 1 2 2 37 4 3 2 -1 0
## 9 29999 80000 1 3 1 41 1 -1 0 0 0
## 10 30000 50000 1 2 1 46 0 0 0 0 0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
## Data Types and Shape:
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
## $ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
## $ EDUCATION : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
## $ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
## $ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
## $ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
## $ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
## $ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
## $ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
## $ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
## $ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
## $ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
## $ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
## $ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
## $ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
## $ default payment next month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
## Shape (Rows, Columns):
## [1] 30000 25
# Count rows that are exact copies of an earlier row (all columns compared).
# NOTE(review): ID appears to be a per-row key (1..30000 in the previews
# above); if so, no two rows can ever match and this check cannot detect
# repeated clients -- confirm whether ID should be left out of the comparison.
credit_card_info_default_duplication <-
  credit_card_info_default %>%
  duplicated() %>%
  sum()
cat("Number of duplicate rows:", credit_card_info_default_duplication, "\n")
## Number of duplicate rows: 0

# Keep only the first occurrence of every row.
credit_card_info_default <- distinct(credit_card_info_default)
cat("Rows after removing duplicates:", nrow(credit_card_info_default), "\n")
## Rows after removing duplicates: 30000
Result: No duplicate rows were found. Dataset remains at 30,000 rows.
# Tabulate the number of NA values found in each column.
missing_summary <- data.frame(
  Variable = names(credit_card_info_default),
  Missing = colSums(is.na(credit_card_info_default))
)
# print.data.frame() has no `caption` argument: the one supplied previously
# was absorbed by `...` and never displayed, so it has been removed.
# row.names = FALSE hides the row names in the printout.
print(missing_summary, row.names = FALSE)
## Variable Missing
## ID 0
## LIMIT_BAL 0
## SEX 0
## EDUCATION 0
## MARRIAGE 0
## AGE 0
## PAY_0 0
## PAY_2 0
## PAY_3 0
## PAY_4 0
## PAY_5 0
## PAY_6 0
## BILL_AMT1 0
## BILL_AMT2 0
## BILL_AMT3 0
## BILL_AMT4 0
## BILL_AMT5 0
## BILL_AMT6 0
## PAY_AMT1 0
## PAY_AMT2 0
## PAY_AMT3 0
## PAY_AMT4 0
## PAY_AMT5 0
## PAY_AMT6 0
## default payment next month 0
Result: There are no missing values in any column. No imputation is required.
# 2. Select the quantitative columns for the outlier check.
# Columns are chosen by name rather than by position: the former index
# c(2:10, 13:23) pulled in the coded categorical fields (SEX, EDUCATION,
# MARRIAGE) and four of the six PAY_* status codes, yet left out PAY_AMT6.
numeric_cols <- credit_card_info_default[, c(
  "LIMIT_BAL", "AGE",
  paste0("BILL_AMT", 1:6),
  paste0("PAY_AMT", 1:6)
)]
# Enlarge the bottom margin so the vertical (las = 2) axis labels fit.
par(mar = c(10, 4, 4, 2))
# scale() standardises every column so all variables share one axis.
boxplot(
  scale(numeric_cols),
  main = "Boxplots of Quantitative Variables",
  las = 2,
  col = rainbow(ncol(numeric_cols)),
  border = "darkblue",
  cex.axis = 0.7
)
# Use Winsorising (capping) at 1st and 99th percentile for financial variables
# This preserves data volume while limiting extreme influence
# Cap a vector at two of its own quantiles.
#   x    : values to cap; NAs are ignored when the quantiles are computed
#          and stay NA in the result.
#   low  : lower probability; values below that quantile are raised to it.
#   high : upper probability; values above that quantile are lowered to it.
#          Must not be smaller than `low`.
# Returns a vector of the same length as `x` with both tails capped.
winsorise <- function(x, low = 0.01, high = 0.99) {
  # Reversed bounds would silently squash every value onto a single
  # quantile (pmin to the smaller cap, then pmax to the larger one).
  stopifnot(length(low) == 1L, length(high) == 1L, low <= high)
  caps <- quantile(x, probs = c(low, high), na.rm = TRUE, names = FALSE)
  pmax(pmin(x, caps[2]), caps[1])
}
# Columns to cap: the credit limit plus the six BILL_AMT and six PAY_AMT
# columns.
financial_cols <- c(
  "LIMIT_BAL",
  paste0("BILL_AMT", 1:6),
  paste0("PAY_AMT", 1:6)
)
# Overwrite each listed column with its winsorised version (default bounds).
credit_card_info_default <- mutate(
  credit_card_info_default,
  across(all_of(financial_cols), winsorise)
)
cat("Outlier treatment complete: Winsorised financial variables at 1st–99th percentile.\n")
## Outlier treatment complete: Winsorised financial variables at 1st–99th percentile.
# Re-draw the boxplots from the data as it stands after the capping step.
# Columns are chosen by name rather than by position: the former index
# c(2:10, 13:23) pulled in the coded categorical fields (SEX, EDUCATION,
# MARRIAGE) and four of the six PAY_* status codes, yet left out PAY_AMT6.
numeric_cols <- credit_card_info_default[, c(
  "LIMIT_BAL", "AGE",
  paste0("BILL_AMT", 1:6),
  paste0("PAY_AMT", 1:6)
)]
# Enlarge the bottom margin so the vertical (las = 2) axis labels fit.
par(mar = c(10, 4, 4, 2))
# scale() standardises every column so all variables share one axis.
boxplot(
  scale(numeric_cols),
  main = "Boxplots of Quantitative Variables",
  las = 2,
  col = rainbow(ncol(numeric_cols)),
  border = "darkblue",
  cex.axis = 0.7
)
# Rename target variable for clarity
credit_card_info_default <- credit_card_info_default %>%
  rename(DEFAULT = `default payment next month`)

# Add labelled factor versions of the coded fields, plus 5-year age bands,
# for use in the plots and summaries that follow.
credit_card_info_default <- credit_card_info_default %>%
  mutate(
    GENDER = factor(SEX, levels = c(1, 2), labels = c("Male", "Female")),
    EDU_LABEL = factor(
      EDUCATION,
      levels = c(0, 1, 2, 3, 4, 5, 6),
      labels = c("Unknown", "Graduate School", "University",
                 "High School", "Others", "Others2", "Others3")
    ),
    MARITAL = factor(
      MARRIAGE,
      levels = c(0, 1, 2, 3),
      labels = c("Unknown", "Married", "Single", "Others")
    ),
    DEFAULT_LBL = factor(
      DEFAULT,
      levels = c(0, 1),
      labels = c("No Default", "Default")
    ),
    # Left-closed bands [20,25), [25,30), ...; include.lowest closes the
    # final band on the right so that AGE == 80 still falls in a band.
    AGE_GROUP = cut(
      AGE,
      breaks = seq(20, 80, 5),
      right = FALSE,
      include.lowest = TRUE
    )
  )

# Bar chart: clients per gender, labelled with count and percentage share.
credit_card_info_default %>%
  count(GENDER) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ggplot(aes(x = GENDER, y = n, fill = GENDER)) +
  geom_col(width = 0.5) +
  geom_text(
    aes(label = paste0(scales::comma(n), "\n(", round(pct, 1), "%)")),
    vjust = -0.3, size = 4
  ) +
  scale_fill_manual(values = c("Male" = "#2196F3", "Female" = "#E91E63")) +
  labs(title = "Credit Cards Issued by Gender",
       x = "Gender", y = "Number of Clients") +
  theme_minimal() +
  theme(legend.position = "none") +
  # clip = "off" lets the labels above the tallest bar draw past the panel.
  coord_cartesian(clip = "off")

# Bar chart: clients per education level, bars ordered by descending count.
credit_card_info_default %>%
  count(EDU_LABEL) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ggplot(aes(x = reorder(EDU_LABEL, -n), y = n, fill = EDU_LABEL)) +
  geom_col(width = 0.6) +
  geom_text(
    aes(label = paste0(scales::comma(n), "\n(", round(pct, 1), "%)")),
    vjust = -0.3, size = 3.5
  ) +
  # Extra headroom at the top of the y axis for the two-line labels.
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(title = "Distribution by Education Level",
       x = "Education Level", y = "Number of Clients") +
  theme_minimal() +
  theme(legend.position = "none")

# Histogram of age in 2-year bins, stacked by default status.
ggplot(credit_card_info_default, aes(x = AGE, fill = DEFAULT_LBL)) +
  geom_histogram(binwidth = 2, position = "stack", colour = "white") +
  scale_fill_manual(
    values = c("No Default" = "#4CAF50", "Default" = "#F44336")
  ) +
  labs(title = "Age Distribution of Clients",
       x = "Age (years)", y = "Count", fill = "Default Status") +
  theme_minimal()

# Pie chart of marital status: a single stacked bar drawn in polar
# coordinates, with each slice labelled by category and share.
credit_card_info_default %>%
  count(MARITAL) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ggplot(aes(x = "", y = n, fill = MARITAL)) +
  geom_col(width = 1) +
  coord_polar("y") +
  geom_text(
    aes(label = paste0(MARITAL, "\n", round(pct, 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3.5,
    check_overlap = TRUE
  ) +
  labs(title = "Distribution by Marital Status") +
  theme_void() +
  theme(legend.position = "none")

# Bar chart: clients per default status, labelled with count and share.
credit_card_info_default %>%
  count(DEFAULT_LBL) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ggplot(aes(x = DEFAULT_LBL, y = n, fill = DEFAULT_LBL)) +
  geom_col(width = 0.5) +
  geom_text(
    aes(label = paste0(scales::comma(n), "\n(", round(pct, 1), "%)")),
    vjust = -0.3, size = 4.5
  ) +
  scale_fill_manual(
    values = c("No Default" = "#4CAF50", "Default" = "#F44336")
  ) +
  labs(title = "Default Payment Status",
       x = "Default Status", y = "Number of Clients") +
  theme_minimal() +
  theme(legend.position = "none") +
  coord_cartesian(clip = "off")

# Headline figures quoted in the summary sentence below.
pct_under40 <- round(mean(credit_card_info_default$AGE < 40) * 100, 1)
uni_count <- sum(credit_card_info_default$EDUCATION == 2)
female_count <- sum(credit_card_info_default$SEX == 2)
male_count <- sum(credit_card_info_default$SEX == 1)
female_ratio <- round(female_count / male_count, 2)
default_rate <- round(mean(credit_card_info_default$DEFAULT) * 100, 1)
cat(paste0(
  "The EDA shows that ", pct_under40, "% of the clients are less than 40 years.\n",
  # The total comes from the data rather than a hard-coded "30,000", so the
  # sentence stays correct if rows are ever dropped upstream.
  "Moreover, ", uni_count, " out of ",
  format(nrow(credit_card_info_default), big.mark = ","),
  " have university-level education.\n",
  "In addition, the number of credit cards issued to female is ", female_ratio,
  " times the number issued to Male.\n",
  "Generally, ", default_rate, "% of the loans were defaulted.\n"
))
## The EDA shows that 69.5% of the clients are less than 40 years.
## Moreover, 14030 out of 30,000 have university-level education.
## In addition, the number of credit cards issued to female is 1.52 times the number issued to Male.
## Generally, 22.1% of the loans were defaulted.
# Default rate (%) within each 5-year age band, drawn as a bar chart whose
# fill colour follows the rate.
credit_card_info_default %>%
  group_by(AGE_GROUP) %>%
  summarise(
    Clients = n(),
    Defaults = sum(DEFAULT),
    Default_Rate = round(mean(DEFAULT) * 100, 1)
  ) %>%
  ggplot(aes(x = AGE_GROUP, y = Default_Rate, fill = Default_Rate)) +
  geom_col() +
  geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.3, size = 3.5) +
  scale_fill_gradient(low = "#81C784", high = "#D32F2F") +
  labs(title = "Default Rate by Age Group (5-Year Bins)",
       x = "Age Group", y = "Default Rate (%)", fill = "Rate (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Insight: Default risk is elevated at both ends of the age range: the youngest group (20–24) defaults at 27.2%, and clients aged 60+ default at even higher rates (≥29%). Middle-aged clients (30–39) have relatively lower default rates around 19–21%.
# Default rate (%) per education level, bars ordered from highest to lowest.
credit_card_info_default %>%
  group_by(EDU_LABEL) %>%
  summarise(Default_Rate = round(mean(DEFAULT) * 100, 1), Clients = n()) %>%
  #filter(!EDU_LABEL %in% c("Unknown","Others2","Others3")) %>%
  ggplot(aes(x = reorder(EDU_LABEL, -Default_Rate), y = Default_Rate,
             fill = EDU_LABEL)) +
  geom_col(width = 0.5) +
  geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.3) +
  labs(title = "Default Rate by Education Level",
       x = "Education", y = "Default Rate (%)") +
  theme_minimal() +
  theme(legend.position = "none")
Insight: High school graduates have the highest default rate (25.2%), followed by university graduates (23.7%). Graduate school clients default least (19.2%), suggesting higher education correlates with lower default risk.
# Default rate (%) per gender.
credit_card_info_default %>%
  group_by(GENDER) %>%
  summarise(Default_Rate = round(mean(DEFAULT) * 100, 1)) %>%
  ggplot(aes(x = GENDER, y = Default_Rate, fill = GENDER)) +
  geom_col(width = 0.4) +
  geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.3, size = 5) +
  scale_fill_manual(values = c("Male" = "#2196F3", "Female" = "#E91E63")) +
  labs(title = "Default Rate by Gender", x = NULL, y = "Default Rate (%)") +
  theme_minimal() +
  theme(legend.position = "none")
Insight: Males default at a higher rate (24.2%) than females (20.8%), despite females receiving more credit cards overall.
# Default rate (%) per marital status; the "Unknown" category is left out
# of the chart.
credit_card_info_default %>%
  group_by(MARITAL) %>%
  summarise(Default_Rate = round(mean(DEFAULT) * 100, 1)) %>%
  filter(MARITAL != "Unknown") %>%
  ggplot(aes(x = MARITAL, y = Default_Rate, fill = MARITAL)) +
  geom_col(width = 0.4) +
  geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.3) +
  labs(title = "Default Rate by Marital Status", x = NULL,
       y = "Default Rate (%)") +
  theme_minimal() +
  theme(legend.position = "none")
Insight: Married clients have a slightly higher default rate (23.5%) compared to single clients (20.9%).
# Segment definition:
# Customers aged ≤ 30 with PAY_0 >= 2 (payment delayed ≥ 2 months in Sept 2005)
# whose education level is "High School", "Others", "Others2" or "Others3".
# This indicates both youth risk AND recent repayment failure; the education
# condition is part of the filter and narrows the segment further.
segment <- credit_card_info_default %>%
  filter(
    AGE <= 30,
    PAY_0 >= 2,
    EDU_LABEL %in% c("High School", "Others", "Others2", "Others3")
  )
# Default rates (%) for the whole portfolio and for the segment, 1 decimal.
overall_rate <- round(mean(credit_card_info_default$DEFAULT) * 100, 1)
segment_rate <- round(mean(segment$DEFAULT) * 100, 1)
segment_size <- nrow(segment)
cat("=== HIGH-RISK SEGMENT DEFINITION ===\n")
## === HIGH-RISK SEGMENT DEFINITION ===
## Segment: Clients aged ≤ 30 with a repayment delay ≥ 2 months (PAY_0 ≥ 2)
## Segment size: 178 clients
## Segment default rate: 62.4 %
## Overall default rate: 22.1 %
## Relative risk: 2.8 x higher than average
# Two-bar comparison of the overall default rate with the segment's rate.
# NOTE(review): the segment label lists only the age and PAY_0 conditions,
# while the segment filter also restricts on education -- confirm whether
# the label should mention it.
data.frame(
  Group = c("Overall", "High-Risk Segment\n(Age ≤ 30, PAY_0 ≥ 2)"),
  Rate = c(overall_rate, segment_rate)
) %>%
  ggplot(aes(x = Group, y = Rate, fill = Group)) +
  geom_col(width = 0.45) +
  geom_text(aes(label = paste0(Rate, "%")), vjust = -0.4, size = 5) +
  # The names below must match the Group values above exactly.
  scale_fill_manual(values = c(
    "Overall" = "#64B5F6",
    "High-Risk Segment\n(Age ≤ 30, PAY_0 ≥ 2)" = "#D32F2F"
  )) +
  labs(title = "Default Rate: Overall vs. High-Risk Segment",
       x = NULL, y = "Default Rate (%)") +
  theme_minimal() +
  theme(legend.position = "none")
Finding: Young clients (≤ 30 years old) with high-school or "other" education who were already experiencing repayment delays of ≥ 2 months in September 2005 represent a critical high-risk segment. Their default rate of 62.4% is 2.8× the overall rate of 22.1%. This segment of 178 clients represents a concentrated, identifiable risk group.
Based on the EDA findings, the following five recommendations are proposed:
Young clients, especially the 20–24 age group, exhibit some of the highest default rates (27.2%). The bank should implement proactive financial counselling for first-time card holders under 30, including automated alerts when spending approaches 80% of the credit limit, and tailored repayment reminder systems.
Clients with even a 1-month delay (PAY_0 ≥ 1) are at significantly elevated risk. The bank should trigger an automated risk escalation workflow as soon as a delay is detected — including outbound calls, SMS reminders, and temporary credit limit reduction — to prevent the delay from compounding into default.
High school educated clients default at 25.2% vs. 19.2% for graduate school clients. The bank should offer education-adjusted credit products: lower initial limits with graduated increases tied to demonstrated repayment history for less-educated segments, while offering premium products to lower-risk, higher-education segments.
Male clients default at 24.2% vs. female clients at 20.8%. The bank can design targeted retention and monitoring programs for male clients, such as enhanced credit health dashboards or peer-comparison spending summaries, which research shows can reduce male overspending and late payments.
The repayment history variables (PAY_0 through PAY_6) are the strongest observable indicators of imminent default, especially when delays accumulate across consecutive months. The bank should develop a real-time credit risk score using these variables updated monthly, flagging any client whose score crosses a defined threshold for manual review and proactive outreach.
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8
## [2] LC_CTYPE=English_United Kingdom.utf8
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.utf8
##
## time zone: Africa/Kigali
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.51 scales_1.4.0 tidyr_1.3.2 ggplot2_4.0.3 dplyr_1.2.1
## [6] readxl_1.5.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.1 tidyselect_1.2.1
## [5] jquerylib_0.1.4 yaml_2.3.12 fastmap_1.2.0 R6_2.6.1
## [9] labeling_0.4.3 generics_0.1.4 tibble_3.3.1 bslib_0.10.0
## [13] pillar_1.11.1 RColorBrewer_1.1-3 rlang_1.2.0 cachem_1.1.0
## [17] xfun_0.57 sass_0.4.10 S7_0.2.2 otel_0.2.0
## [21] cli_3.6.6 withr_3.0.2 magrittr_2.0.5 digest_0.6.39
## [25] grid_4.5.1 rstudioapi_0.18.0 lifecycle_1.0.5 vctrs_0.7.3
## [29] evaluate_1.0.5 glue_1.8.1 farver_2.1.2 cellranger_1.1.0
## [33] purrr_1.2.2 rmarkdown_2.31 tools_4.5.1 pkgconfig_2.0.3
## [37] htmltools_0.5.9