Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(knitr)

4.1 Summarize the customer information from the dataset

Load Dataset

defaultofc <- read_excel("default of credit card clients.xls")
## New names:
## • `` -> `...1`

##view our dataset

View(defaultofc)

##After viewing our dataset, in order to work with meaningful columns we decided to remove row 1 (X1, X2,…etc) since real headers are on row 2, here we used “skip” :

defcredit <- read_excel("default of credit card clients.xls", skip = 1)

Variable Names

names(defcredit)
##  [1] "ID"                         "LIMIT_BAL"                 
##  [3] "SEX"                        "EDUCATION"                 
##  [5] "MARRIAGE"                   "AGE"                       
##  [7] "PAY_0"                      "PAY_2"                     
##  [9] "PAY_3"                      "PAY_4"                     
## [11] "PAY_5"                      "PAY_6"                     
## [13] "BILL_AMT1"                  "BILL_AMT2"                 
## [15] "BILL_AMT3"                  "BILL_AMT4"                 
## [17] "BILL_AMT5"                  "BILL_AMT6"                 
## [19] "PAY_AMT1"                   "PAY_AMT2"                  
## [21] "PAY_AMT3"                   "PAY_AMT4"                  
## [23] "PAY_AMT5"                   "PAY_AMT6"                  
## [25] "default payment next month"

Top 5 Rows

head(defcredit, 5)
## # A tibble: 5 × 25
##      ID LIMIT_BAL   SEX EDUCATION MARRIAGE   AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##   <dbl>     <dbl> <dbl>     <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     20000     2         2        1    24     2     2    -1    -1    -2
## 2     2    120000     2         2        2    26    -1     2     0     0     0
## 3     3     90000     2         2        2    34     0     0     0     0     0
## 4     4     50000     2         2        1    37     0     0     0     0     0
## 5     5     50000     1         2        1    57    -1     0    -1     0     0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## #   BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## #   PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## #   PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>

We need to fix sex values first:

defcredit <- defcredit %>%
  mutate(SEX = case_when(
    SEX == 1 ~ "Male",
    SEX == 2 ~ "Female"
  ))
head(defcredit,5)
## # A tibble: 5 × 25
##      ID LIMIT_BAL SEX    EDUCATION MARRIAGE   AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##   <dbl>     <dbl> <chr>      <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     20000 Female         2        1    24     2     2    -1    -1    -2
## 2     2    120000 Female         2        2    26    -1     2     0     0     0
## 3     3     90000 Female         2        2    34     0     0     0     0     0
## 4     4     50000 Female         2        1    37     0     0     0     0     0
## 5     5     50000 Male           2        1    57    -1     0    -1     0     0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## #   BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## #   PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## #   PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>

Bottom 10 Rows

tail(defcredit, 10)
## # A tibble: 10 × 25
##       ID LIMIT_BAL SEX   EDUCATION MARRIAGE   AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##    <dbl>     <dbl> <chr>     <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 29991    140000 Male          2        1    41     0     0     0     0     0
##  2 29992    210000 Male          2        1    34     3     2     2     2     2
##  3 29993     10000 Male          3        1    43     0     0     0    -2    -2
##  4 29994    100000 Male          1        2    38     0    -1    -1     0     0
##  5 29995     80000 Male          2        2    34     2     2     2     2     2
##  6 29996    220000 Male          3        1    39     0     0     0     0     0
##  7 29997    150000 Male          3        2    43    -1    -1    -1    -1     0
##  8 29998     30000 Male          2        2    37     4     3     2    -1     0
##  9 29999     80000 Male          3        1    41     1    -1     0     0     0
## 10 30000     50000 Male          2        1    46     0     0     0     0     0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## #   BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## #   PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## #   PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>

Data Types

str(defcredit)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
##  $ ID                        : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ LIMIT_BAL                 : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX                       : chr [1:30000] "Female" "Female" "Female" "Female" ...
##  $ EDUCATION                 : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
##  $ MARRIAGE                  : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
##  $ AGE                       : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0                     : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2                     : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3                     : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4                     : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5                     : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6                     : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1                 : num [1:30000] 3913 2682 29239 46990 8617 ...
##  $ BILL_AMT2                 : num [1:30000] 3102 1725 14027 48233 5670 ...
##  $ BILL_AMT3                 : num [1:30000] 689 2682 13559 49291 35835 ...
##  $ BILL_AMT4                 : num [1:30000] 0 3272 14331 28314 20940 ...
##  $ BILL_AMT5                 : num [1:30000] 0 3455 14948 28959 19146 ...
##  $ BILL_AMT6                 : num [1:30000] 0 3261 15549 29547 19131 ...
##  $ PAY_AMT1                  : num [1:30000] 0 0 1518 2000 2000 ...
##  $ PAY_AMT2                  : num [1:30000] 689 1000 1500 2019 36681 ...
##  $ PAY_AMT3                  : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4                  : num [1:30000] 0 1000 1000 1100 9000 ...
##  $ PAY_AMT5                  : num [1:30000] 0 0 1000 1069 689 ...
##  $ PAY_AMT6                  : num [1:30000] 0 2000 5000 1000 679 ...
##  $ default payment next month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...

Shape

dim(defcredit)
## [1] 30000    25

our dataset structure have 30000 rows and 25 columns

Duplicate Rows

sum(duplicated(defcredit))
## [1] 0

our dataset has no duplicates

# "distinct(defcredit) " 

#there is no need to use distinct(), since our dataset has no duplicates.

Missing Values

colSums(is.na(defcredit))
##                         ID                  LIMIT_BAL 
##                          0                          0 
##                        SEX                  EDUCATION 
##                          0                          0 
##                   MARRIAGE                        AGE 
##                          0                          0 
##                      PAY_0                      PAY_2 
##                          0                          0 
##                      PAY_3                      PAY_4 
##                          0                          0 
##                      PAY_5                      PAY_6 
##                          0                          0 
##                  BILL_AMT1                  BILL_AMT2 
##                          0                          0 
##                  BILL_AMT3                  BILL_AMT4 
##                          0                          0 
##                  BILL_AMT5                  BILL_AMT6 
##                          0                          0 
##                   PAY_AMT1                   PAY_AMT2 
##                          0                          0 
##                   PAY_AMT3                   PAY_AMT4 
##                          0                          0 
##                   PAY_AMT5                   PAY_AMT6 
##                          0                          0 
## default payment next month 
##                          0

No missing values found.

Box Plots to deal with outliers by quantitative Variables

numeric_cols <- defcredit %>% select(LIMIT_BAL, AGE, BILL_AMT1:BILL_AMT6, PAY_AMT1:PAY_AMT6)

boxplot(scale(numeric_cols),
        main = "Boxplots of Quantitative Variables",
   
        col = rainbow(ncol(numeric_cols)), 
        border = "darkgreen",
    
        pch = 20,
        outcol = "black",
     
        )

Handling Outliers

# Use Winsorising  at 1st and 99th percentile for financial variables


winsorise <- function(x, low = 0.01, high = 0.99) {
  q <- quantile(x, probs = c(low, high), na.rm = TRUE)
  x <- pmax(pmin(x, q[2]), q[1])
  return(x)
}

financial_cols <- c("LIMIT_BAL", "BILL_AMT1","BILL_AMT2","BILL_AMT3",
                    "BILL_AMT4","BILL_AMT5","BILL_AMT6",
                    "PAY_AMT1","PAY_AMT2","PAY_AMT3",
                    "PAY_AMT4","PAY_AMT5","PAY_AMT6")

defcredit <- defcredit %>%
  mutate(across(all_of(financial_cols), winsorise))

after handling outliers

numeric_cols <- defcredit %>% select(LIMIT_BAL, AGE, BILL_AMT1:BILL_AMT6, PAY_AMT1:PAY_AMT6)

par(mar = c(10, 4, 4, 2))
boxplot(scale(numeric_cols),
        main = "Boxplots after handling outliers",
        las = 2,
        col = rainbow(ncol(numeric_cols)), 
        border = "darkgreen",
        cex.axis = 0.7,
        pch = 20,
        outcol = "black")


4.2 Customer Information Summary

Categorical Variables

defcredit <- defcredit %>%
  mutate(
    DEFAULT_LBL = factor(`default payment next month`, levels = c(0, 1),
                         labels = c("No Default", "Default")),
    AGE_GROUP   = cut(AGE, breaks = seq(20, 80, 5),
                      include.lowest = TRUE)
  )

#the graphs need readable labels and grouped ages
# DEFAULT_LBL: converts 0/1 to "No Default"/"Default" for readable graph labels
# AGE_GROUP: groups raw ages into 5-year interval for age graph

Gender Distribution

ggplot(defcredit, aes(x = SEX, fill = SEX)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count))) +
  labs(title = "Clients by Gender", x = "Gender", y = "Count")

Education Level Distribution

#modification of dataset is needed to have meaning full data
defcredit <- defcredit %>%
  mutate(EDUCATION = case_when(
    EDUCATION == 0 ~ "Unknown",
    EDUCATION == 1 ~ "Graduate School",
    EDUCATION == 2 ~ "University",
    EDUCATION == 3 ~ "High School",
    TRUE           ~ "Others"
  ))

ggplot(defcredit, aes(x = EDUCATION, fill = EDUCATION)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Clients by Education Level", x = "Education", y = "Count")

Age Distribution

ggplot(defcredit, aes(x = AGE_GROUP)) +
  geom_bar(fill = "skyblue") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Clients by Age Group", x = "Age Group", y = "Count")

Marital Status Distribution

#we have to mutate first in order to have meaningful data: 
defcredit <- defcredit %>%
  mutate(MARRIAGE = case_when(
    MARRIAGE == 0 ~ "Unknown",
    MARRIAGE == 1 ~ "Married",
    MARRIAGE == 2 ~ "Single",
    MARRIAGE == 3 ~ "Others"
  ))

ggplot(defcredit, aes(x = MARRIAGE, fill = MARRIAGE)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Clients by Marital Status", x = "Marital Status", y = "Count")

Default Payment Status

ggplot(defcredit, aes(x = DEFAULT_LBL, fill = DEFAULT_LBL)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  scale_fill_manual(values = c("No Default" = "skyblue", "Default" = "darkred")) +
  labs(title = "Default Payment Status", x = "Status", y = "Count")

Summary Statement

percent_clients_under_40  <- round(sum(defcredit$AGE < 40) / nrow(defcredit) * 100, 1)
university_clients <- sum(defcredit$EDUCATION == "University")
female_clients <- sum(defcredit$SEX == "Female")
male_clients  <- sum(defcredit$SEX == "Male")
female_to_male_ratio<- round(female_clients / male_clients, 2)
default_rate<- round(sum(defcredit$DEFAULT_LBL == "Default") / nrow(defcredit) * 100, 1)

cat(
  "The EDA shows that ", percent_clients_under_40, "% of the clients are less than 40 years. ",
  "Moreover, ", university_clients, " out of 30,000 have university-level education. ",
  "In addition, the number of credit cards issued to female is ", female_to_male_ratio, " times the number issued to Male. ",
  "Generally, ", default_rate, "% of the loans were defaulted."
)
## The EDA shows that  69.5 % of the clients are less than 40 years.  Moreover,  14030  out of 30,000 have university-level education.  In addition, the number of credit cards issued to female is  1.52  times the number issued to Male.  Generally,  22.1 % of the loans were defaulted.

4.3 Default Rates

By Age Group

defcredit %>%
  group_by(AGE_GROUP) %>%
  summarise(Default_Rate = round(mean(`default payment next month`) * 100, 1)) %>%
  ggplot(aes(x = AGE_GROUP, y = Default_Rate)) +
  geom_col(fill = "skyblue") +
  geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.5) +
  labs(title = "Default Rate by Age Group", x = "Age Group", y = "Default Rate (%)")

Default rate By Education Level

defcredit %>%
  group_by(EDUCATION) %>%
  summarise(Default_Rate = round(mean(`default payment next month`) * 100, 1)) %>%
  ggplot(aes(x = EDUCATION, y = Default_Rate)) +
  geom_col(fill = "skyblue") +
  geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.5) +
  labs(title = "Default Rate by Education Level", x = "Education", y = "Default Rate (%)")

By Gender

defcredit %>%
  group_by(SEX) %>%
  summarise(Default_Rate = round(mean(`default payment next month`) * 100, 1)) %>%
  ggplot(aes(x = SEX, y = Default_Rate)) +
  geom_col(fill = "skyblue") +
  geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.5) +
  labs(title = "Default Rate by Gender", x = "Gender", y = "Default Rate (%)")

By Marital Status

defcredit %>%
  group_by(MARRIAGE) %>%
  summarise(Default_Rate = round(mean(`default payment next month`) * 100, 1)) %>%
  ggplot(aes(x = MARRIAGE, y = Default_Rate)) +
  geom_col(fill = "skyblue") +
  geom_text(aes(label = paste0(Default_Rate, "%")), vjust = -0.5) +
  labs(title = "Default Rate by Marital Status", x = "Marital Status", y = "Default Rate (%)")


4.4 High Risk Customer Segment

defcredit <- defcredit %>%
  mutate(Late_Payments = as.integer((PAY_0 >= 1) + (PAY_2 >= 1) + (PAY_3 >= 1) + (PAY_4 >= 1)))

segment <- defcredit %>% filter(AGE <= 30, EDUCATION == "High School", Late_Payments >= 2)
seg_rate<- round(mean(segment$`default payment next month`) * 100, 1)
overall_rate <- round(mean(defcredit$`default payment next month`) * 100, 1)

cat("Segment size:", nrow(segment), "\n")
## Segment size: 249
cat("Segment default rate:",seg_rate, "%\n")
## Segment default rate: 53 %
cat("Overall default rate:",overall_rate, "%\n")
## Overall default rate: 22.1 %
data.frame(Group = c("Overall", "High Risk"), Rate = c(overall_rate, seg_rate)) %>%
  ggplot(aes(x = Group, y = Rate, fill = Group)) +
  geom_col(width = 0.4) +
  geom_text(aes(label = paste0(Rate, "%")), vjust = -0.5) +
  scale_fill_manual(values = c("Overall" = "skyblue", "High Risk" = "darkred")) +
  labs(title = "Overall vs High-Risk Segment Default Rate", x = NULL, y = "Default Rate (%)")


4.5 Recommendations for Managing Default Risk

Based on the EDA findings, the following recommendations are proposed:

Help Younger Customers Early: People between 20 and 30 years old are the most likely to miss payments. The bank should give smaller credit limits to first-time cardholders under 30 and offer them simple financial tips.

Act Fast on Late Payments: Missing even one payment is a big warning sign. The bank should immediately send a text reminder and lower the customer’s credit limit as soon as they are one month late.

Set Credit Limits by Education Level: Clients who only finished high school have a higher risk of defaulting than university graduates. The bank should start high school graduates with lower credit limits and only raise them after they prove they can pay on time.

Keep a Closer Eye on Male Clients: The data shows that men default more often than women. Because of this, the bank should monitor male accounts a bit more closely for signs of risk.

Use Month by Month Tracking: Looking at past monthly tracking data is the best way to spot future money trouble. The bank should use this data to create a simple monthly risk score to catch and fix problem accounts early.