Libraries and Loading Data

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.2     ✔ tidyr     1.3.2
## ✔ readr     2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
setwd("C:/Users/user/OneDrive/Desktop/20251MBI051_R_For_Data_Science_Exam")
cc_data <- read_excel("default of credit card clients.xls", skip = 1)
cc_data
## # A tibble: 30,000 × 25
##       ID LIMIT_BAL   SEX EDUCATION MARRIAGE   AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##    <dbl>     <dbl> <dbl>     <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     1     20000     2         2        1    24     2     2    -1    -1    -2
##  2     2    120000     2         2        2    26    -1     2     0     0     0
##  3     3     90000     2         2        2    34     0     0     0     0     0
##  4     4     50000     2         2        1    37     0     0     0     0     0
##  5     5     50000     1         2        1    57    -1     0    -1     0     0
##  6     6     50000     1         1        2    37     0     0     0     0     0
##  7     7    500000     1         1        2    29     0     0     0     0     0
##  8     8    100000     2         2        2    23     0    -1    -1     0     0
##  9     9    140000     2         3        1    28     0     0     2     0     0
## 10    10     20000     1         3        2    35    -2    -2    -2    -2    -1
## # ℹ 29,990 more rows
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## #   BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## #   PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## #   PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>

4.1. Viewing the data

# Viewing the variable names
names(cc_data)
##  [1] "ID"                         "LIMIT_BAL"                 
##  [3] "SEX"                        "EDUCATION"                 
##  [5] "MARRIAGE"                   "AGE"                       
##  [7] "PAY_0"                      "PAY_2"                     
##  [9] "PAY_3"                      "PAY_4"                     
## [11] "PAY_5"                      "PAY_6"                     
## [13] "BILL_AMT1"                  "BILL_AMT2"                 
## [15] "BILL_AMT3"                  "BILL_AMT4"                 
## [17] "BILL_AMT5"                  "BILL_AMT6"                 
## [19] "PAY_AMT1"                   "PAY_AMT2"                  
## [21] "PAY_AMT3"                   "PAY_AMT4"                  
## [23] "PAY_AMT5"                   "PAY_AMT6"                  
## [25] "default payment next month"
# Top 5 rows
head(cc_data, 5)
## # A tibble: 5 × 25
##      ID LIMIT_BAL   SEX EDUCATION MARRIAGE   AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##   <dbl>     <dbl> <dbl>     <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     20000     2         2        1    24     2     2    -1    -1    -2
## 2     2    120000     2         2        2    26    -1     2     0     0     0
## 3     3     90000     2         2        2    34     0     0     0     0     0
## 4     4     50000     2         2        1    37     0     0     0     0     0
## 5     5     50000     1         2        1    57    -1     0    -1     0     0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## #   BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## #   PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## #   PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
# Bottom 10 rows
tail(cc_data, 10)
## # A tibble: 10 × 25
##       ID LIMIT_BAL   SEX EDUCATION MARRIAGE   AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##    <dbl>     <dbl> <dbl>     <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 29991    140000     1         2        1    41     0     0     0     0     0
##  2 29992    210000     1         2        1    34     3     2     2     2     2
##  3 29993     10000     1         3        1    43     0     0     0    -2    -2
##  4 29994    100000     1         1        2    38     0    -1    -1     0     0
##  5 29995     80000     1         2        2    34     2     2     2     2     2
##  6 29996    220000     1         3        1    39     0     0     0     0     0
##  7 29997    150000     1         3        2    43    -1    -1    -1    -1     0
##  8 29998     30000     1         2        2    37     4     3     2    -1     0
##  9 29999     80000     1         3        1    41     1    -1     0     0     0
## 10 30000     50000     1         2        1    46     0     0     0     0     0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## #   BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## #   PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## #   PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
# Data type
str(cc_data)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
##  $ ID                        : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ LIMIT_BAL                 : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX                       : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
##  $ EDUCATION                 : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
##  $ MARRIAGE                  : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
##  $ AGE                       : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0                     : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2                     : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3                     : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4                     : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5                     : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6                     : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1                 : num [1:30000] 3913 2682 29239 46990 8617 ...
##  $ BILL_AMT2                 : num [1:30000] 3102 1725 14027 48233 5670 ...
##  $ BILL_AMT3                 : num [1:30000] 689 2682 13559 49291 35835 ...
##  $ BILL_AMT4                 : num [1:30000] 0 3272 14331 28314 20940 ...
##  $ BILL_AMT5                 : num [1:30000] 0 3455 14948 28959 19146 ...
##  $ BILL_AMT6                 : num [1:30000] 0 3261 15549 29547 19131 ...
##  $ PAY_AMT1                  : num [1:30000] 0 0 1518 2000 2000 ...
##  $ PAY_AMT2                  : num [1:30000] 689 1000 1500 2019 36681 ...
##  $ PAY_AMT3                  : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4                  : num [1:30000] 0 1000 1000 1100 9000 ...
##  $ PAY_AMT5                  : num [1:30000] 0 0 1000 1069 689 ...
##  $ PAY_AMT6                  : num [1:30000] 0 2000 5000 1000 679 ...
##  $ default payment next month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
# Shape of the data set
nrow(cc_data)
## [1] 30000
ncol(cc_data)
## [1] 25
sample(cc_data)
## # A tibble: 30,000 × 25
##    PAY_AMT4 EDUCATION default payment next mont…¹ PAY_AMT2 PAY_2 BILL_AMT2 PAY_5
##       <dbl>     <dbl>                       <dbl>    <dbl> <dbl>     <dbl> <dbl>
##  1        0         2                           1      689     2      3102    -2
##  2     1000         2                           1     1000     2      1725     0
##  3     1000         2                           0     1500     0     14027     0
##  4     1100         2                           0     2019     0     48233     0
##  5     9000         2                           0    36681     0      5670     0
##  6     1000         1                           0     1815     0     57069     0
##  7    20239         1                           0    40000     0    412023     0
##  8      581         2                           0      601    -1       380     0
##  9     1000         3                           0        0     0     14096     0
## 10    13007         3                           0        0    -2         0    -1
## # ℹ 29,990 more rows
## # ℹ abbreviated name: ¹​`default payment next month`
## # ℹ 18 more variables: BILL_AMT5 <dbl>, BILL_AMT1 <dbl>, PAY_0 <dbl>,
## #   BILL_AMT6 <dbl>, PAY_AMT5 <dbl>, BILL_AMT3 <dbl>, PAY_AMT3 <dbl>,
## #   LIMIT_BAL <dbl>, ID <dbl>, BILL_AMT4 <dbl>, SEX <dbl>, AGE <dbl>,
## #   PAY_4 <dbl>, PAY_AMT1 <dbl>, PAY_6 <dbl>, PAY_3 <dbl>, PAY_AMT6 <dbl>,
## #   MARRIAGE <dbl>

4.1. Data cleaning

# Checking for duplications
sum(duplicated(cc_data))
## [1] 0
# Checking for missing values
colSums(is.na(cc_data))
##                         ID                  LIMIT_BAL 
##                          0                          0 
##                        SEX                  EDUCATION 
##                          0                          0 
##                   MARRIAGE                        AGE 
##                          0                          0 
##                      PAY_0                      PAY_2 
##                          0                          0 
##                      PAY_3                      PAY_4 
##                          0                          0 
##                      PAY_5                      PAY_6 
##                          0                          0 
##                  BILL_AMT1                  BILL_AMT2 
##                          0                          0 
##                  BILL_AMT3                  BILL_AMT4 
##                          0                          0 
##                  BILL_AMT5                  BILL_AMT6 
##                          0                          0 
##                   PAY_AMT1                   PAY_AMT2 
##                          0                          0 
##                   PAY_AMT3                   PAY_AMT4 
##                          0                          0 
##                   PAY_AMT5                   PAY_AMT6 
##                          0                          0 
## default payment next month 
##                          0
# Checking for outliers
# First use the summary () to know how the data looks like, and which data is considered the categorical
summary(cc_data)
##        ID          LIMIT_BAL            SEX          EDUCATION    
##  Min.   :    1   Min.   :  10000   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 7501   1st Qu.:  50000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :15000   Median : 140000   Median :2.000   Median :2.000  
##  Mean   :15000   Mean   : 167484   Mean   :1.604   Mean   :1.853  
##  3rd Qu.:22500   3rd Qu.: 240000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :30000   Max.   :1000000   Max.   :2.000   Max.   :6.000  
##     MARRIAGE          AGE            PAY_0             PAY_2        
##  Min.   :0.000   Min.   :21.00   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:1.000   1st Qu.:28.00   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median :2.000   Median :34.00   Median : 0.0000   Median : 0.0000  
##  Mean   :1.552   Mean   :35.49   Mean   :-0.0167   Mean   :-0.1338  
##  3rd Qu.:2.000   3rd Qu.:41.00   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :3.000   Max.   :79.00   Max.   : 8.0000   Max.   : 8.0000  
##      PAY_3             PAY_4             PAY_5             PAY_6        
##  Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000   Min.   :-2.0000  
##  1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.:-1.0000  
##  Median : 0.0000   Median : 0.0000   Median : 0.0000   Median : 0.0000  
##  Mean   :-0.1662   Mean   :-0.2207   Mean   :-0.2662   Mean   :-0.2911  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000   Max.   : 8.0000  
##    BILL_AMT1         BILL_AMT2        BILL_AMT3         BILL_AMT4      
##  Min.   :-165580   Min.   :-69777   Min.   :-157264   Min.   :-170000  
##  1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666   1st Qu.:   2327  
##  Median :  22382   Median : 21200   Median :  20089   Median :  19052  
##  Mean   :  51223   Mean   : 49179   Mean   :  47013   Mean   :  43263  
##  3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165   3rd Qu.:  54506  
##  Max.   : 964511   Max.   :983931   Max.   :1664089   Max.   : 891586  
##    BILL_AMT5        BILL_AMT6          PAY_AMT1         PAY_AMT2      
##  Min.   :-81334   Min.   :-339603   Min.   :     0   Min.   :      0  
##  1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000   1st Qu.:    833  
##  Median : 18105   Median :  17071   Median :  2100   Median :   2009  
##  Mean   : 40311   Mean   :  38872   Mean   :  5664   Mean   :   5921  
##  3rd Qu.: 50191   3rd Qu.:  49198   3rd Qu.:  5006   3rd Qu.:   5000  
##  Max.   :927171   Max.   : 961664   Max.   :873552   Max.   :1684259  
##     PAY_AMT3         PAY_AMT4         PAY_AMT5           PAY_AMT6       
##  Min.   :     0   Min.   :     0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5   1st Qu.:   117.8  
##  Median :  1800   Median :  1500   Median :  1500.0   Median :  1500.0  
##  Mean   :  5226   Mean   :  4826   Mean   :  4799.4   Mean   :  5215.5  
##  3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5   3rd Qu.:  4000.0  
##  Max.   :896040   Max.   :621000   Max.   :426529.0   Max.   :528666.0  
##  default payment next month
##  Min.   :0.0000            
##  1st Qu.:0.0000            
##  Median :0.0000            
##  Mean   :0.2212            
##  3rd Qu.:0.0000            
##  Max.   :1.0000

Boxplots for outlier detection

# Plot each variable SEPARATELY in full size
par(mfrow = c(1, 1))  # One plot at a time

quant_vars <- c("LIMIT_BAL", "AGE", 
                "BILL_AMT1", "BILL_AMT2", "BILL_AMT3",
                "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
                "PAY_AMT1", "PAY_AMT2", "PAY_AMT3",
                "PAY_AMT4", "PAY_AMT5", "PAY_AMT6")

for (col in quant_vars) {
  boxplot(cc_data[[col]],
          main  = paste("Boxplot of", col),
          ylab  = col,
          col   = "steelblue",
          border = "darkblue",
          outcol = "red",     
          outpch = 10,         
          cex    = 1,        
          outline = TRUE)
  
  grid()                       
}

## Since we confirmed there are no missing values, Now we proceed with dealing with outliers. # Since our quantitative variables are heavily skewed, we use transformation to deal with outliers # We used transformation for this dataset to deal with the outlier, because it preserves all values, reduces the the skewness, keeps relationships, and it is better.

cc_data_clean <- cc_data
cc_data_clean[quant_vars] <- lapply(cc_data[quant_vars], function(x) {
  log1p(pmax(x, 0))
})

Lets verify with boxplots

par(mfrow = c(1, 1))

for (col in quant_vars) {
  boxplot(cc_data_clean[[col]],
          main   = paste("After log:", col),
          col    = "steelblue",
          outcol = "red",
          outpch = 16,
          outline = TRUE)
  grid()
}

4.2. Summarising the customer information

Gender Distribution

gender_counts <- table(cc_data_clean$SEX)

barplot(gender_counts,
        main   = "Customer Distribution by Gender",
        xlab   = "Gender",
        ylab   = "Number of Customers",
        col    = c("steelblue", "tomato"),
        names.arg = c("Male", "Female"))

# Education Level Distribution

edu_def <- cc_data_clean[cc_data_clean$EDUCATION %in% c(1,2,3,4), ]
edu_counts <- table(edu_def$EDUCATION)

barplot(edu_counts,
        main  = "Customer Distribution by Education Level",
        xlab  = "Education Level",
        ylab  = "Number of Customers",
        col   = c("steelblue","tomato","seagreen","orange"),
        names.arg = c("1","2","3","4"),
        ylim  = c(0, max(edu_counts) + 2000))

text(x = c(0.7, 1.9, 3.1, 4.3),
     y = edu_counts + 1500,
     labels = paste(edu_counts,
                    paste0("(", round(100 * edu_counts/sum(edu_counts), 1), "%)")))
legend("topright",
       legend = c("1 = Graduate",
                  "2 = University",
                  "3 = High School",
                  "4 = Others"),
       fill  = c("steelblue","tomato","seagreen","orange"),
       title = "Education Level",
       bty   = "n")

table(cc_data_clean$EDUCATION)
## 
##     0     1     2     3     4     5     6 
##    14 10585 14030  4917   123   280    51

Age distribution

# Histogram (best for continuous variable like age)
# Use cc_data since the clean is distorted
hist(cc_data$AGE,
     main   = "Customer Distribution by Age",
     xlab   = "Age",
     ylab   = "Number of Customers",
     col    = "steelblue",
     border = "white",
     breaks = 20)

# Marital status distribution

mar_def <- cc_data_clean[cc_data_clean$MARRIAGE %in% c(1,2,3), ]
mar_counts <- table(mar_def$MARRIAGE)

barplot(mar_counts,
        main      = "Customer Distribution by Marital Status",
        xlab      = "Marital Status",
        ylab      = "Number of Customers",
        col       = c("steelblue", "tomato", "seagreen"),
        names.arg = c("Married", "Single", "Others"),
        ylim      = c(0, max(mar_counts) + 2000))

text(x = c(0.7, 1.9, 3.1),
     y = mar_counts + 1500,
     labels = paste(mar_counts,
                    paste0("(", round(100 * mar_counts/sum(mar_counts), 1), "%)")))

# Default payment status

default_counts <- table(cc_data_clean$`default payment next month`)

barplot(default_counts,
        main      = "Default Payment Status",
        xlab      = "Default Payment",
        ylab      = "Number of Customers",
        col       = c("steelblue", "tomato"),
        names.arg = c("No Default", "Default"),
        ylim      = c(0, max(default_counts) + 2000))

text(x = c(0.7, 1.9),
     y = default_counts + 1300,
     labels = paste(default_counts,
                    paste0("(", round(100 * default_counts/sum(default_counts), 1), "%)")))

# The EDA shows that ….% of the clients are less that 40 years. Moreover,…..out # of 30,000 have university-level education. In addition, the number of credit # cards issued to female is ….. times the number issued to Male. Generally, ….% # of the loan were defaulted

# Customers under 40 years of age
under_40 <- sum(cc_data$AGE < 40)
individuals <- nrow(cc_data)
# percentage below 40 years of age 
perc_40 <- round(under_40/individuals * 100, 2)
cat("Percentage under 40 years of age is: ", perc_40, "%.\n")
## Percentage under 40 years of age is:  69.52 %.
# Defaulters out-of 30,000 have university level education
# We chose digit 1 to represent Graduate, 2 university level, 3 secondary school, 4 primary school, 5 Kindergarten, 6 no education

university_count <- sum(cc_data_clean$EDUCATION == 2)
individuals_clean <- nrow(cc_data_clean)
individuals_clean
## [1] 30000
university_count
## [1] 14030
cat("Out of 30,000 defaulters,", university_count,"have university-level education. \n")
## Out of 30,000 defaulters, 14030 have university-level education.
# Female cards compared to male cards
gender_counts
## 
##     1     2 
## 11888 18112
# Considering 2 is Female and 1 is Male
female <- gender_counts[2]
female
##     2 
## 18112
male <- gender_counts[1]
male
##     1 
## 11888
rate_card <- round(female/male, 2)
cat("Cards issued to Female are ", rate_card," times the number of cards issued to Male.\n" )
## Cards issued to Female are  1.52  times the number of cards issued to Male.
# Defaulted Loan
default_counts
## 
##     0     1 
## 23364  6636
defaulted <- default_counts[2]
defaulted
##    1 
## 6636
not_defaulted <- default_counts[1]
not_defaulted
##     0 
## 23364
perc_defaulted <- round(defaulted/individuals_clean * 100, 2)
cat("Percentage loan default is ", perc_defaulted,"%.\n")
## Percentage loan default is  22.12 %.

4.3. How do default rates vary by age groups(take age difference of 5 years), education level, gender and Marital Status.

We use Tapply() function to split a variable into groups and apply a function to each group

#syntax: tapply(WHAT you want to calculate, GROUP by what, FUNCTION to apply) # Default rate by Age Group

# creating the 5 year age group
cc_data$AGE_GROUP <- cut(cc_data$AGE,
                         breaks = seq(20, 80, by = 5),
                         right = FALSE,
                         labels = c("20-24","25-29","30-34","35-39",
                                    "40-44","45-49","50-54","55-59",
                                    "60-64","65-69","70-74","75-79"))

# calculate default rate per age group
age_default <- tapply(cc_data$`default payment next month`,
                      cc_data$AGE_GROUP, mean) * 100

barplot(age_default,
        main = "Default rate by Age Group",
        xlab = "Age Group",
        ylab = "Default Rate (%)",
        col  = "steelblue",
        las  = 2,
        ylim = c(0, max(age_default, na.rm = TRUE) +5))

# add percentage labels
text(x = seq(0.7, by = 1.2, length.out = length(age_default)),
     y = age_default + 1.5,
     labels = paste0(round(age_default, 1), "%"),
     cex = 0.6)

Default rate by Education Level

# Filter valid education categories
edu_def <- cc_data_clean[cc_data_clean$EDUCATION %in% c(1,2,3,4), ]

edu_default <- tapply(edu_def$`default payment next month`,
                      edu_def$EDUCATION, mean) * 100

barplot(edu_default,
        main      = "Default Rate by Education Level",
        xlab      = "Education",
        ylab      = "Default Rate (%)",
        col       = c("steelblue","tomato","seagreen","orange"),
        names.arg = c("1","2","3","4"),
        ylim      = c(0, max(edu_default) + 5))

text(x = c(0.7, 1.9, 3.1, 4.3),
     y = edu_default + 1.5,
     labels = paste0(round(edu_default, 1), "%"))
legend("topright",
       legend = c("1 = Graduate",
                  "2 = University",
                  "3 = High School",
                  "4 = Others"),
       fill   = c("steelblue","tomato","seagreen","orange"),
       title  = "Education Level",
       bty    = "n")

# Default rate by Gender

gender_default <- tapply(cc_data_clean$`default payment next month`,
                         cc_data_clean$SEX, mean) * 100

barplot(gender_default,
        main      = "Default Rate by Gender",
        xlab      = "Gender",                  
        ylab      = "Default Rate (%)",
        col       = c("steelblue","tomato"),
        names.arg = c("Male","Female"),
        ylim      = c(0, max(gender_default) + 8))

text(x = c(0.7, 1.9),
     y = gender_default + 1.5,
     labels = paste0(round(gender_default, 1), "%"))

# Default rate by Marital Status

# Filter valid marital status categories
mar_def <- cc_data_clean[cc_data_clean$MARRIAGE %in% c(1,2,3), ]

mar_default <- tapply(mar_def$`default payment next month`,
                      mar_def$MARRIAGE, mean) * 100

barplot(mar_default,
        main      = "Default Rate by Marital Status",
        xlab      = "Marital Status", 
        ylab      = "Default Rate (%)",
        col       = c("steelblue","tomato","seagreen"),
        names.arg = c("Married","Single","Others"),
        ylim      = c(0, max(mar_default) + 8))

text(x = c(0.7, 1.9, 3.1),
     y = mar_default + 1.5,
     labels = paste0(round(mar_default, 1), "%"))

## 4.4. Identifying a customer segment with a notably high default risk, # The customer segment at study: # A customer within the age group of 30 years old , with a highschool education level, and atleast one default.

segment <- cc_data[cc_data$AGE <= 30 &
                   cc_data$EDUCATION == 3 &
                     cc_data$PAY_0 >= 1, ]
segment
## # A tibble: 325 × 26
##       ID LIMIT_BAL   SEX EDUCATION MARRIAGE   AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##    <dbl>     <dbl> <dbl>     <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    16     50000     2         3        3    23     1     2     0     0     0
##  2    69    130000     2         3        2    29     1    -2    -2    -1     2
##  3   200     30000     2         3        2    22     1     2     2     0     0
##  4   230     20000     2         3        2    24     1     2     0     0     0
##  5   832     20000     2         3        1    24     8     7     6     5     4
##  6  1048    140000     2         3        2    24     1     2     2     2     0
##  7  1093    210000     1         3        1    29     1     2     0     0     0
##  8  1256     70000     1         3        2    28     1     2     0     0     0
##  9  1380     80000     1         3        2    26     1     2     2     2     2
## 10  1424    200000     2         3        2    27     1    -2    -2    -2    -2
## # ℹ 315 more rows
## # ℹ 15 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## #   BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## #   PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## #   PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>,
## #   AGE_GROUP <fct>
cat("Default customers at study:", round(nrow(segment)/nrow(cc_data_clean)*100, 1), "%\n")
## Default customers at study: 1.1 %
default_r <- mean(segment$`default payment next month`) * 100
default_r
## [1] 52
overall_r <- mean(cc_data$`default payment next month`) * 100
overall_r
## [1] 22.12
rate_data <- round(default_r/overall_r, 1)
rate_data
## [1] 2.4

#Customers aged 30 and below with High School education who had at least one late payment #represent 1.1% of the total customer base (325 customers). This segment has a default rate #of approximately 52%, which is 2.4 times higher than the overall default rate of 22.12%. #This confirms that young, less educated customers with poor recent payment behavior #represent a significantly high risk group in this portfolio