library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.2 ✔ tidyr 1.3.2
## ✔ readr 2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
setwd("C:/Users/user/OneDrive/Desktop/20251MBI051_R_For_Data_Science_Exam")
cc_data <- read_excel("default of credit card clients.xls", skip = 1)
cc_data
## # A tibble: 30,000 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0
## 3 3 90000 2 2 2 34 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0
## 6 6 50000 1 1 2 37 0 0 0 0 0
## 7 7 500000 1 1 2 29 0 0 0 0 0
## 8 8 100000 2 2 2 23 0 -1 -1 0 0
## 9 9 140000 2 3 1 28 0 0 2 0 0
## 10 10 20000 1 3 2 35 -2 -2 -2 -2 -1
## # ℹ 29,990 more rows
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
# Viewing the variable names
names(cc_data)
## [1] "ID" "LIMIT_BAL"
## [3] "SEX" "EDUCATION"
## [5] "MARRIAGE" "AGE"
## [7] "PAY_0" "PAY_2"
## [9] "PAY_3" "PAY_4"
## [11] "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2"
## [15] "BILL_AMT3" "BILL_AMT4"
## [17] "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1" "PAY_AMT2"
## [21] "PAY_AMT3" "PAY_AMT4"
## [23] "PAY_AMT5" "PAY_AMT6"
## [25] "default payment next month"
# Top 5 rows
head(cc_data, 5)
## # A tibble: 5 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0
## 3 3 90000 2 2 2 34 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
# Bottom 10 rows
tail(cc_data, 10)
## # A tibble: 10 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29991 140000 1 2 1 41 0 0 0 0 0
## 2 29992 210000 1 2 1 34 3 2 2 2 2
## 3 29993 10000 1 3 1 43 0 0 0 -2 -2
## 4 29994 100000 1 1 2 38 0 -1 -1 0 0
## 5 29995 80000 1 2 2 34 2 2 2 2 2
## 6 29996 220000 1 3 1 39 0 0 0 0 0
## 7 29997 150000 1 3 2 43 -1 -1 -1 -1 0
## 8 29998 30000 1 2 2 37 4 3 2 -1 0
## 9 29999 80000 1 3 1 41 1 -1 0 0 0
## 10 30000 50000 1 2 1 46 0 0 0 0 0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>
# Data type
str(cc_data)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
## $ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
## $ EDUCATION : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
## $ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
## $ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
## $ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
## $ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
## $ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
## $ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
## $ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
## $ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
## $ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
## $ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
## $ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
## $ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
## $ default payment next month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...
# Shape of the data set
nrow(cc_data)
## [1] 30000
ncol(cc_data)
## [1] 25
sample(cc_data)
## # A tibble: 30,000 × 25
## PAY_AMT4 EDUCATION default payment next mont…¹ PAY_AMT2 PAY_2 BILL_AMT2 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 2 1 689 2 3102 -2
## 2 1000 2 1 1000 2 1725 0
## 3 1000 2 0 1500 0 14027 0
## 4 1100 2 0 2019 0 48233 0
## 5 9000 2 0 36681 0 5670 0
## 6 1000 1 0 1815 0 57069 0
## 7 20239 1 0 40000 0 412023 0
## 8 581 2 0 601 -1 380 0
## 9 1000 3 0 0 0 14096 0
## 10 13007 3 0 0 -2 0 -1
## # ℹ 29,990 more rows
## # ℹ abbreviated name: ¹`default payment next month`
## # ℹ 18 more variables: BILL_AMT5 <dbl>, BILL_AMT1 <dbl>, PAY_0 <dbl>,
## # BILL_AMT6 <dbl>, PAY_AMT5 <dbl>, BILL_AMT3 <dbl>, PAY_AMT3 <dbl>,
## # LIMIT_BAL <dbl>, ID <dbl>, BILL_AMT4 <dbl>, SEX <dbl>, AGE <dbl>,
## # PAY_4 <dbl>, PAY_AMT1 <dbl>, PAY_6 <dbl>, PAY_3 <dbl>, PAY_AMT6 <dbl>,
## # MARRIAGE <dbl>
# Checking for duplications
sum(duplicated(cc_data))
## [1] 0
# Checking for missing values
colSums(is.na(cc_data))
## ID LIMIT_BAL
## 0 0
## SEX EDUCATION
## 0 0
## MARRIAGE AGE
## 0 0
## PAY_0 PAY_2
## 0 0
## PAY_3 PAY_4
## 0 0
## PAY_5 PAY_6
## 0 0
## BILL_AMT1 BILL_AMT2
## 0 0
## BILL_AMT3 BILL_AMT4
## 0 0
## BILL_AMT5 BILL_AMT6
## 0 0
## PAY_AMT1 PAY_AMT2
## 0 0
## PAY_AMT3 PAY_AMT4
## 0 0
## PAY_AMT5 PAY_AMT6
## 0 0
## default payment next month
## 0
# Checking for outliers
# First use the summary () to know how the data looks like, and which data is considered the categorical
summary(cc_data)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000
## 1st Qu.: 7501 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000
## Median :15000 Median : 140000 Median :2.000 Median :2.000
## Mean :15000 Mean : 167484 Mean :1.604 Mean :1.853
## 3rd Qu.:22500 3rd Qu.: 240000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :30000 Max. :1000000 Max. :2.000 Max. :6.000
## MARRIAGE AGE PAY_0 PAY_2
## Min. :0.000 Min. :21.00 Min. :-2.0000 Min. :-2.0000
## 1st Qu.:1.000 1st Qu.:28.00 1st Qu.:-1.0000 1st Qu.:-1.0000
## Median :2.000 Median :34.00 Median : 0.0000 Median : 0.0000
## Mean :1.552 Mean :35.49 Mean :-0.0167 Mean :-0.1338
## 3rd Qu.:2.000 3rd Qu.:41.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :3.000 Max. :79.00 Max. : 8.0000 Max. : 8.0000
## PAY_3 PAY_4 PAY_5 PAY_6
## Min. :-2.0000 Min. :-2.0000 Min. :-2.0000 Min. :-2.0000
## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0000
## Mean :-0.1662 Mean :-0.2207 Mean :-0.2662 Mean :-0.2911
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. : 8.0000 Max. : 8.0000 Max. : 8.0000 Max. : 8.0000
## BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4
## Min. :-165580 Min. :-69777 Min. :-157264 Min. :-170000
## 1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666 1st Qu.: 2327
## Median : 22382 Median : 21200 Median : 20089 Median : 19052
## Mean : 51223 Mean : 49179 Mean : 47013 Mean : 43263
## 3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165 3rd Qu.: 54506
## Max. : 964511 Max. :983931 Max. :1664089 Max. : 891586
## BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## Min. :-81334 Min. :-339603 Min. : 0 Min. : 0
## 1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000 1st Qu.: 833
## Median : 18105 Median : 17071 Median : 2100 Median : 2009
## Mean : 40311 Mean : 38872 Mean : 5664 Mean : 5921
## 3rd Qu.: 50191 3rd Qu.: 49198 3rd Qu.: 5006 3rd Qu.: 5000
## Max. :927171 Max. : 961664 Max. :873552 Max. :1684259
## PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5 1st Qu.: 117.8
## Median : 1800 Median : 1500 Median : 1500.0 Median : 1500.0
## Mean : 5226 Mean : 4826 Mean : 4799.4 Mean : 5215.5
## 3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5 3rd Qu.: 4000.0
## Max. :896040 Max. :621000 Max. :426529.0 Max. :528666.0
## default payment next month
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2212
## 3rd Qu.:0.0000
## Max. :1.0000
# Plot each variable SEPARATELY in full size
par(mfrow = c(1, 1)) # One plot at a time
quant_vars <- c("LIMIT_BAL", "AGE",
"BILL_AMT1", "BILL_AMT2", "BILL_AMT3",
"BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
"PAY_AMT1", "PAY_AMT2", "PAY_AMT3",
"PAY_AMT4", "PAY_AMT5", "PAY_AMT6")
for (col in quant_vars) {
boxplot(cc_data[[col]],
main = paste("Boxplot of", col),
ylab = col,
col = "steelblue",
border = "darkblue",
outcol = "red",
outpch = 10,
cex = 1,
outline = TRUE)
grid()
}
## Since we confirmed there are no missing values, Now we proceed with
dealing with outliers. # Since our quantitative variables are heavily
skewed, we use transformation to deal with outliers # We used
transformation for this dataset to deal with the outlier, because it
preserves all values, reduces the the skewness, keeps relationships, and
it is better.
cc_data_clean <- cc_data
cc_data_clean[quant_vars] <- lapply(cc_data[quant_vars], function(x) {
log1p(pmax(x, 0))
})
par(mfrow = c(1, 1))
for (col in quant_vars) {
boxplot(cc_data_clean[[col]],
main = paste("After log:", col),
col = "steelblue",
outcol = "red",
outpch = 16,
outline = TRUE)
grid()
}
gender_counts <- table(cc_data_clean$SEX)
barplot(gender_counts,
main = "Customer Distribution by Gender",
xlab = "Gender",
ylab = "Number of Customers",
col = c("steelblue", "tomato"),
names.arg = c("Male", "Female"))
# Education Level Distribution
edu_def <- cc_data_clean[cc_data_clean$EDUCATION %in% c(1,2,3,4), ]
edu_counts <- table(edu_def$EDUCATION)
barplot(edu_counts,
main = "Customer Distribution by Education Level",
xlab = "Education Level",
ylab = "Number of Customers",
col = c("steelblue","tomato","seagreen","orange"),
names.arg = c("1","2","3","4"),
ylim = c(0, max(edu_counts) + 2000))
text(x = c(0.7, 1.9, 3.1, 4.3),
y = edu_counts + 1500,
labels = paste(edu_counts,
paste0("(", round(100 * edu_counts/sum(edu_counts), 1), "%)")))
legend("topright",
legend = c("1 = Graduate",
"2 = University",
"3 = High School",
"4 = Others"),
fill = c("steelblue","tomato","seagreen","orange"),
title = "Education Level",
bty = "n")
table(cc_data_clean$EDUCATION)
##
## 0 1 2 3 4 5 6
## 14 10585 14030 4917 123 280 51
# Histogram (best for continuous variable like age)
# Use cc_data since the clean is distorted
hist(cc_data$AGE,
main = "Customer Distribution by Age",
xlab = "Age",
ylab = "Number of Customers",
col = "steelblue",
border = "white",
breaks = 20)
# Marital status distribution
mar_def <- cc_data_clean[cc_data_clean$MARRIAGE %in% c(1,2,3), ]
mar_counts <- table(mar_def$MARRIAGE)
barplot(mar_counts,
main = "Customer Distribution by Marital Status",
xlab = "Marital Status",
ylab = "Number of Customers",
col = c("steelblue", "tomato", "seagreen"),
names.arg = c("Married", "Single", "Others"),
ylim = c(0, max(mar_counts) + 2000))
text(x = c(0.7, 1.9, 3.1),
y = mar_counts + 1500,
labels = paste(mar_counts,
paste0("(", round(100 * mar_counts/sum(mar_counts), 1), "%)")))
# Default payment status
default_counts <- table(cc_data_clean$`default payment next month`)
barplot(default_counts,
main = "Default Payment Status",
xlab = "Default Payment",
ylab = "Number of Customers",
col = c("steelblue", "tomato"),
names.arg = c("No Default", "Default"),
ylim = c(0, max(default_counts) + 2000))
text(x = c(0.7, 1.9),
y = default_counts + 1300,
labels = paste(default_counts,
paste0("(", round(100 * default_counts/sum(default_counts), 1), "%)")))
# The EDA shows that ….% of the clients are less that 40 years.
Moreover,…..out # of 30,000 have university-level education. In
addition, the number of credit # cards issued to female is ….. times the
number issued to Male. Generally, ….% # of the loan were defaulted
# Customers under 40 years of age
under_40 <- sum(cc_data$AGE < 40)
individuals <- nrow(cc_data)
# percentage below 40 years of age
perc_40 <- round(under_40/individuals * 100, 2)
cat("Percentage under 40 years of age is: ", perc_40, "%.\n")
## Percentage under 40 years of age is: 69.52 %.
# Defaulters out-of 30,000 have university level education
# We chose digit 1 to represent Graduate, 2 university level, 3 secondary school, 4 primary school, 5 Kindergarten, 6 no education
university_count <- sum(cc_data_clean$EDUCATION == 2)
individuals_clean <- nrow(cc_data_clean)
individuals_clean
## [1] 30000
university_count
## [1] 14030
cat("Out of 30,000 defaulters,", university_count,"have university-level education. \n")
## Out of 30,000 defaulters, 14030 have university-level education.
# Female cards compared to male cards
gender_counts
##
## 1 2
## 11888 18112
# Considering 2 is Female and 1 is Male
female <- gender_counts[2]
female
## 2
## 18112
male <- gender_counts[1]
male
## 1
## 11888
rate_card <- round(female/male, 2)
cat("Cards issued to Female are ", rate_card," times the number of cards issued to Male.\n" )
## Cards issued to Female are 1.52 times the number of cards issued to Male.
# Defaulted Loan
default_counts
##
## 0 1
## 23364 6636
defaulted <- default_counts[2]
defaulted
## 1
## 6636
not_defaulted <- default_counts[1]
not_defaulted
## 0
## 23364
perc_defaulted <- round(defaulted/individuals_clean * 100, 2)
cat("Percentage loan default is ", perc_defaulted,"%.\n")
## Percentage loan default is 22.12 %.
#syntax: tapply(WHAT you want to calculate, GROUP by what, FUNCTION to apply) # Default rate by Age Group
# creating the 5 year age group
cc_data$AGE_GROUP <- cut(cc_data$AGE,
breaks = seq(20, 80, by = 5),
right = FALSE,
labels = c("20-24","25-29","30-34","35-39",
"40-44","45-49","50-54","55-59",
"60-64","65-69","70-74","75-79"))
# calculate default rate per age group
age_default <- tapply(cc_data$`default payment next month`,
cc_data$AGE_GROUP, mean) * 100
barplot(age_default,
main = "Default rate by Age Group",
xlab = "Age Group",
ylab = "Default Rate (%)",
col = "steelblue",
las = 2,
ylim = c(0, max(age_default, na.rm = TRUE) +5))
# add percentage labels
text(x = seq(0.7, by = 1.2, length.out = length(age_default)),
y = age_default + 1.5,
labels = paste0(round(age_default, 1), "%"),
cex = 0.6)
# Filter valid education categories
edu_def <- cc_data_clean[cc_data_clean$EDUCATION %in% c(1,2,3,4), ]
edu_default <- tapply(edu_def$`default payment next month`,
edu_def$EDUCATION, mean) * 100
barplot(edu_default,
main = "Default Rate by Education Level",
xlab = "Education",
ylab = "Default Rate (%)",
col = c("steelblue","tomato","seagreen","orange"),
names.arg = c("1","2","3","4"),
ylim = c(0, max(edu_default) + 5))
text(x = c(0.7, 1.9, 3.1, 4.3),
y = edu_default + 1.5,
labels = paste0(round(edu_default, 1), "%"))
legend("topright",
legend = c("1 = Graduate",
"2 = University",
"3 = High School",
"4 = Others"),
fill = c("steelblue","tomato","seagreen","orange"),
title = "Education Level",
bty = "n")
# Default rate by Gender
gender_default <- tapply(cc_data_clean$`default payment next month`,
cc_data_clean$SEX, mean) * 100
barplot(gender_default,
main = "Default Rate by Gender",
xlab = "Gender",
ylab = "Default Rate (%)",
col = c("steelblue","tomato"),
names.arg = c("Male","Female"),
ylim = c(0, max(gender_default) + 8))
text(x = c(0.7, 1.9),
y = gender_default + 1.5,
labels = paste0(round(gender_default, 1), "%"))
# Default rate by Marital Status
# Filter valid marital status categories
mar_def <- cc_data_clean[cc_data_clean$MARRIAGE %in% c(1,2,3), ]
mar_default <- tapply(mar_def$`default payment next month`,
mar_def$MARRIAGE, mean) * 100
barplot(mar_default,
main = "Default Rate by Marital Status",
xlab = "Marital Status",
ylab = "Default Rate (%)",
col = c("steelblue","tomato","seagreen"),
names.arg = c("Married","Single","Others"),
ylim = c(0, max(mar_default) + 8))
text(x = c(0.7, 1.9, 3.1),
y = mar_default + 1.5,
labels = paste0(round(mar_default, 1), "%"))
## 4.4. Identifying a customer segment with a notably high default risk,
# The customer segment at study: # A customer within the age group of 30
years old , with a highschool education level, and atleast one
default.
segment <- cc_data[cc_data$AGE <= 30 &
cc_data$EDUCATION == 3 &
cc_data$PAY_0 >= 1, ]
segment
## # A tibble: 325 × 26
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 16 50000 2 3 3 23 1 2 0 0 0
## 2 69 130000 2 3 2 29 1 -2 -2 -1 2
## 3 200 30000 2 3 2 22 1 2 2 0 0
## 4 230 20000 2 3 2 24 1 2 0 0 0
## 5 832 20000 2 3 1 24 8 7 6 5 4
## 6 1048 140000 2 3 2 24 1 2 2 2 0
## 7 1093 210000 1 3 1 29 1 2 0 0 0
## 8 1256 70000 1 3 2 28 1 2 0 0 0
## 9 1380 80000 1 3 2 26 1 2 2 2 2
## 10 1424 200000 2 3 2 27 1 -2 -2 -2 -2
## # ℹ 315 more rows
## # ℹ 15 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## # BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## # PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## # PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>,
## # AGE_GROUP <fct>
cat("Default customers at study:", round(nrow(segment)/nrow(cc_data_clean)*100, 1), "%\n")
## Default customers at study: 1.1 %
default_r <- mean(segment$`default payment next month`) * 100
default_r
## [1] 52
overall_r <- mean(cc_data$`default payment next month`) * 100
overall_r
## [1] 22.12
rate_data <- round(default_r/overall_r, 1)
rate_data
## [1] 2.4
#Customers aged 30 and below with High School education who had at least one late payment #represent 1.1% of the total customer base (325 customers). This segment has a default rate #of approximately 52%, which is 2.4 times higher than the overall default rate of 22.12%. #This confirms that young, less educated customers with poor recent payment behavior #represent a significantly high risk group in this portfolio