library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(knitr)
defaultofc <- read_excel("default of credit card clients.xls")
## New names:
## • `` -> `...1`
##view our dataset
View(defaultofc)
##After viewing our dataset, in order to work with meaningful columns we decided to remove row 1 (X1, X2,…etc) since real headers are on row 2, here we used “skip” :
defcredit <- read_excel("default of credit card clients.xls", skip = 1)
names(defcredit)
## [1] "ID" "LIMIT_BAL"
## [3] "SEX" "EDUCATION"
## [5] "MARRIAGE" "AGE"
## [7] "PAY_0" "PAY_2"
## [9] "PAY_3" "PAY_4"
## [11] "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2"
## [15] "BILL_AMT3" "BILL_AMT4"
## [17] "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1" "PAY_AMT2"
## [21] "PAY_AMT3" "PAY_AMT4"
## [23] "PAY_AMT5" "PAY_AMT6"
## [25] "default payment next month"
head(defcredit, 5)
## # A tibble: 5 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0
## 3 3 90000 2 2 2 34 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
We need to fix sex values first:
defcredit <- defcredit %>%
mutate(SEX = case_when(
SEX == 1 ~ "Male",
SEX == 2 ~ "Female"
))
head(defcredit,5)
## # A tibble: 5 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 20000 Female 2 1 24 2 2 -1 -1 -2
## 2 2 120000 Female 2 2 26 -1 2 0 0 0
## 3 3 90000 Female 2 2 34 0 0 0 0 0
## 4 4 50000 Female 2 1 37 0 0 0 0 0
## 5 5 50000 Male 2 1 57 -1 0 -1 0 0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
tail(defcredit, 10)
## # A tibble: 10 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29991 140000 Male 2 1 41 0 0 0 0 0
## 2 29992 210000 Male 2 1 34 3 2 2 2 2
## 3 29993 10000 Male 3 1 43 0 0 0 -2 -2
## 4 29994 100000 Male 1 2 38 0 -1 -1 0 0
## 5 29995 80000 Male 2 2 34 2 2 2 2 2
## 6 29996 220000 Male 3 1 39 0 0 0 0 0
## 7 29997 150000 Male 3 2 43 -1 -1 -1 -1 0
## 8 29998 30000 Male 2 2 37 4 3 2 -1 0
## 9 29999 80000 Male 3 1 41 1 -1 0 0 0
## 10 30000 50000 Male 2 1 46 0 0 0 0 0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
str(defcredit)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
## $ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : chr [1:30000] "Female" "Female" "Female" "Female" ...
## $ EDUCATION : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
## $ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
## $ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
## $ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
## $ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
## $ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
## $ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
## $ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
## $ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
## $ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
## $ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
## $ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
## $ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
## $ default payment next month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
dim(defcredit)
## [1] 30000 25
sum(duplicated(defcredit))
## [1] 0
# "distinct(defcredit) "
#there is no need to use distinct(), since our dataset has no duplicates.
colSums(is.na(defcredit))
## ID LIMIT_BAL
## 0 0
## SEX EDUCATION
## 0 0
## MARRIAGE AGE
## 0 0
## PAY_0 PAY_2
## 0 0
## PAY_3 PAY_4
## 0 0
## PAY_5 PAY_6
## 0 0
## BILL_AMT1 BILL_AMT2
## 0 0
## BILL_AMT3 BILL_AMT4
## 0 0
## BILL_AMT5 BILL_AMT6
## 0 0
## PAY_AMT1 PAY_AMT2
## 0 0
## PAY_AMT3 PAY_AMT4
## 0 0
## PAY_AMT5 PAY_AMT6
## 0 0
## default payment next month
## 0
No missing values found.
numeric_cols <- defcredit %>% select(LIMIT_BAL, AGE, BILL_AMT1:BILL_AMT6, PAY_AMT1:PAY_AMT6)
boxplot(scale(numeric_cols),
main = "Boxplots of Quantitative Variables",
col = rainbow(ncol(numeric_cols)),
border = "darkgreen",
pch = 20,
outcol = "black",
)
# Use Winsorising at 1st and 99th percentile for financial variables
winsorise <- function(x, low = 0.01, high = 0.99) {
q <- quantile(x, probs = c(low, high), na.rm = TRUE)
x <- pmax(pmin(x, q[2]), q[1])
return(x)
}
financial_cols <- c("LIMIT_BAL", "BILL_AMT1","BILL_AMT2","BILL_AMT3",
"BILL_AMT4","BILL_AMT5","BILL_AMT6",
"PAY_AMT1","PAY_AMT2","PAY_AMT3",
"PAY_AMT4","PAY_AMT5","PAY_AMT6")
defcredit <- defcredit %>%
mutate(across(all_of(financial_cols), winsorise))
numeric_cols <- defcredit %>% select(LIMIT_BAL, AGE, BILL_AMT1:BILL_AMT6, PAY_AMT1:PAY_AMT6)
par(mar = c(10, 4, 4, 2))
boxplot(scale(numeric_cols),
main = "Boxplots after handling outliers",
las = 2,
col = rainbow(ncol(numeric_cols)),
border = "darkgreen",
cex.axis = 0.7,
pch = 20,
outcol = "black")
defcredit <- defcredit %>%
mutate(
DEFAULT_LBL = factor(`default payment next month`, levels = c(0, 1),
labels = c("No Default", "Default")),
AGE_GROUP = cut(AGE, breaks = seq(20, 80, 5),
include.lowest = TRUE)
)
#the graphs need readable labels and grouped ages
# DEFAULT_LBL: converts 0/1 to "No Default"/"Default" for readable graph labels
# AGE_GROUP: groups raw ages into 5-year interval for age graph
ggplot(defcredit, aes(x = SEX, fill = SEX)) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count))) +
labs(title = "Clients by Gender", x = "Gender", y = "Count")
#modification of dataset is needed to have meaning full data
defcredit <- defcredit %>%
mutate(EDUCATION = case_when(
EDUCATION == 0 ~ "Unknown",
EDUCATION == 1 ~ "Graduate School",
EDUCATION == 2 ~ "University",
EDUCATION == 3 ~ "High School",
TRUE ~ "Others"
))
ggplot(defcredit, aes(x = EDUCATION, fill = EDUCATION)) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
labs(title = "Clients by Education Level", x = "Education", y = "Count")
ggplot(defcredit, aes(x = AGE_GROUP)) +
geom_bar(fill = "skyblue") +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
labs(title = "Clients by Age Group", x = "Age Group", y = "Count")
#we have to mutate first in order to have meaningful data:
defcredit <- defcredit %>%
mutate(MARRIAGE = case_when(
MARRIAGE == 0 ~ "Unknown",
MARRIAGE == 1 ~ "Married",
MARRIAGE == 2 ~ "Single",
MARRIAGE == 3 ~ "Others"
))
ggplot(defcredit, aes(x = MARRIAGE, fill = MARRIAGE)) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
labs(title = "Clients by Marital Status", x = "Marital Status", y = "Count")
ggplot(defcredit, aes(x = DEFAULT_LBL, fill = DEFAULT_LBL)) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
scale_fill_manual(values = c("No Default" = "skyblue", "Default" = "darkred")) +
labs(title = "Default Payment Status", x = "Status", y = "Count")
percent_clients_under_40 <- round(sum(defcredit$AGE < 40) / nrow(defcredit) * 100, 1)
university_clients <- sum(defcredit$EDUCATION == "University")
female_clients <- sum(defcredit$SEX == "Female")
male_clients <- sum(defcredit$SEX == "Male")
female_to_male_ratio<- round(female_clients / male_clients, 2)
default_rate<- round(sum(defcredit$DEFAULT_LBL == "Default") / nrow(defcredit) * 100, 1)
cat(
"The EDA shows that ", percent_clients_under_40, "% of the clients are less than 40 years. ",
"Moreover, ", university_clients, " out of 30,000 have university-level education. ",
"In addition, the number of credit cards issued to female is ", female_to_male_ratio, " times the number issued to Male. ",
"Generally, ", default_rate, "% of the loans were defaulted."
)
## The EDA shows that 69.5 % of the clients are less than 40 years. Moreover, 14030 out of 30,000 have university-level education. In addition, the number of credit cards issued to female is 1.52 times the number issued to Male. Generally, 22.1 % of the loans were defaulted.
defcredit %>%
group_by(AGE_GROUP) %>%
summarise(Default_Rate = round(mean(`default payment next month`) * 100, 1)) %>%
ggplot(aes(x = AGE_GROUP, y = Default_Rate)) +
geom_col(fill = "skyblue") +
geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.5) +
labs(title = "Default Rate by Age Group", x = "Age Group", y = "Default Rate (%)")
defcredit %>%
group_by(EDUCATION) %>%
summarise(Default_Rate = round(mean(`default payment next month`) * 100, 1)) %>%
ggplot(aes(x = EDUCATION, y = Default_Rate)) +
geom_col(fill = "skyblue") +
geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.5) +
labs(title = "Default Rate by Education Level", x = "Education", y = "Default Rate (%)")
defcredit %>%
group_by(SEX) %>%
summarise(Default_Rate = round(mean(`default payment next month`) * 100, 1)) %>%
ggplot(aes(x = SEX, y = Default_Rate)) +
geom_col(fill = "skyblue") +
geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.5) +
labs(title = "Default Rate by Gender", x = "Gender", y = "Default Rate (%)")
defcredit %>%
group_by(MARRIAGE) %>%
summarise(Default_Rate = round(mean(`default payment next month`) * 100, 1)) %>%
ggplot(aes(x = MARRIAGE, y = Default_Rate)) +
geom_col(fill = "skyblue") +
geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.5) +
labs(title = "Default Rate by Marital Status", x = "Marital Status", y = "Default Rate (%)")
defcredit <- defcredit %>%
mutate(Late_Payments = as.integer((PAY_0 >= 1) + (PAY_2 >= 1) + (PAY_3 >= 1) + (PAY_4 >= 1)))
segment <- defcredit %>% filter(AGE <= 30, EDUCATION == "High School", Late_Payments >= 2)
seg_rate<- round(mean(segment$`default payment next month`) * 100, 1)
overall_rate <- round(mean(defcredit$`default payment next month`) * 100, 1)
cat("Segment size:", nrow(segment), "\n")
## Segment size: 249
cat("Segment default rate:",seg_rate, "%\n")
## Segment default rate: 53 %
cat("Overall default rate:",overall_rate, "%\n")
## Overall default rate: 22.1 %
data.frame(Group = c("Overall", "High Risk"), Rate = c(overall_rate, seg_rate)) %>%
ggplot(aes(x = Group, y = Rate, fill = Group)) +
geom_col(width = 0.4) +
geom_text(aes(label = paste0(Rate, "%")), vjust = -0.5) +
scale_fill_manual(values = c("Overall" = "skyblue", "High Risk" = "darkred")) +
labs(title = "Overall vs High-Risk Segment Default Rate", x = NULL, y = "Default Rate (%)")
Based on the EDA findings, the following recommendations are proposed:
Help Younger Customers Early: People between 20 and 30 years old are the most likely to miss payments. The bank should give smaller credit limits to first-time cardholders under 30 and offer them simple financial tips.
Act Fast on Late Payments: Missing even one payment is a big warning sign. The bank should immediately send a text reminder and lower the customer’s credit limit as soon as they are one month late.
Set Credit Limits by Education Level: Clients who only finished high school have a higher risk of defaulting than university graduates. The bank should start high school graduates with lower credit limits and only raise them after they prove they can pay on time.
Keep a Closer Eye on Male Clients: The data shows that men default more often than women. Because of this, the bank should monitor male accounts a bit more closely for signs of risk.
Use Month by Month Tracking: Looking at past monthly tracking data is the best way to spot future money trouble. The bank should use this data to create a simple monthly risk score to catch and fix problem accounts early.