# extracting all data in Credit Card Clients datase
cl_df <- read_excel("Book1.xlsx")

View(cl_df)

4.1 Data exploration and cleaning

Use given Dataset to display variable names, the top 5 rows, the bottom 10 rows, the data type, and the shape of the dataset. Drop duplicate rows if any, find the number of missing values in each column, and use a box plot to check if there are any outliers in the quantitative variables. Use appropriate methods to deal with missing values How to rename the columns in dataframe using the first columns of the dataframe in R and outliers, if any. ### display variable names

#Getting all columns names and store them in var_names variable
var_names <- names(cl_df)
print(var_names)
##  [1] "ID"                         "LIMIT_BAL"                 
##  [3] "SEX"                        "EDUCATION"                 
##  [5] "MARRIAGE"                   "AGE"                       
##  [7] "PAY_0"                      "PAY_2"                     
##  [9] "PAY_3"                      "PAY_4"                     
## [11] "PAY_5"                      "PAY_6"                     
## [13] "BILL_AMT1"                  "BILL_AMT2"                 
## [15] "BILL_AMT3"                  "BILL_AMT4"                 
## [17] "BILL_AMT5"                  "BILL_AMT6"                 
## [19] "PAY_AMT1"                   "PAY_AMT2"                  
## [21] "PAY_AMT3"                   "PAY_AMT4"                  
## [23] "PAY_AMT5"                   "PAY_AMT6"                  
## [25] "default payment next month"

The top 5 rows

head(cl_df,5)
## # A tibble: 5 × 25
##      ID LIMIT_BAL   SEX EDUCATION MARRIAGE   AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##   <dbl>     <dbl> <dbl>     <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     20000     2         2        1    24     2     2    -1    -1    -2
## 2     2    120000     2         2        2    26    -1     2     0     0     0
## 3     3     90000     2         2        2    34     0     0     0     0     0
## 4     4     50000     2         2        1    37     0     0     0     0     0
## 5     5     50000     1         2        1    57    -1     0    -1     0     0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## #   BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## #   PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## #   PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>

The bottom 10 rows

tail(cl_df,10)
## # A tibble: 10 × 25
##       ID LIMIT_BAL   SEX EDUCATION MARRIAGE   AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##    <dbl>     <dbl> <dbl>     <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 29991    140000     1         2        1    41     0     0     0     0     0
##  2 29992    210000     1         2        1    34     3     2     2     2     2
##  3 29993     10000     1         3        1    43     0     0     0    -2    -2
##  4 29994    100000     1         1        2    38     0    -1    -1     0     0
##  5 29995     80000     1         2        2    34     2     2     2     2     2
##  6 29996    220000     1         3        1    39     0     0     0     0     0
##  7 29997    150000     1         3        2    43    -1    -1    -1    -1     0
##  8 29998     30000     1         2        2    37     4     3     2    -1     0
##  9 29999     80000     1         3        1    41     1    -1     0     0     0
## 10 30000     50000     1         2        1    46     0     0     0     0     0
## # ℹ 14 more variables: PAY_6 <dbl>, BILL_AMT1 <dbl>, BILL_AMT2 <dbl>,
## #   BILL_AMT3 <dbl>, BILL_AMT4 <dbl>, BILL_AMT5 <dbl>, BILL_AMT6 <dbl>,
## #   PAY_AMT1 <dbl>, PAY_AMT2 <dbl>, PAY_AMT3 <dbl>, PAY_AMT4 <dbl>,
## #   PAY_AMT5 <dbl>, PAY_AMT6 <dbl>, `default payment next month` <dbl>

The data type

str(cl_df)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
##  $ ID                        : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ LIMIT_BAL                 : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX                       : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
##  $ EDUCATION                 : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
##  $ MARRIAGE                  : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
##  $ AGE                       : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0                     : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2                     : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3                     : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4                     : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5                     : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6                     : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1                 : num [1:30000] 3913 2682 29239 46990 8617 ...
##  $ BILL_AMT2                 : num [1:30000] 3102 1725 14027 48233 5670 ...
##  $ BILL_AMT3                 : num [1:30000] 689 2682 13559 49291 35835 ...
##  $ BILL_AMT4                 : num [1:30000] 0 3272 14331 28314 20940 ...
##  $ BILL_AMT5                 : num [1:30000] 0 3455 14948 28959 19146 ...
##  $ BILL_AMT6                 : num [1:30000] 0 3261 15549 29547 19131 ...
##  $ PAY_AMT1                  : num [1:30000] 0 0 1518 2000 2000 ...
##  $ PAY_AMT2                  : num [1:30000] 689 1000 1500 2019 36681 ...
##  $ PAY_AMT3                  : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4                  : num [1:30000] 0 1000 1000 1100 9000 ...
##  $ PAY_AMT5                  : num [1:30000] 0 0 1000 1069 689 ...
##  $ PAY_AMT6                  : num [1:30000] 0 2000 5000 1000 679 ...
##  $ default payment next month: num [1:30000] 1 1 0 0 0 0 0 0 0 0 ...

The shape of the datase

dim(cl_df)
## [1] 30000    25

Drop duplicate

cl_df <- unique(cl_df)
dim(cl_df)
## [1] 30000    25

Find the number of missing values in each column

sum(is.na(cl_df))
## [1] 0

use a box plot to check if there are any outliers in the quantitative variables

boxplot(cl_df)

Use appropriate methods to deal with missing values and outliers, if an

  • There is no missing values but if there is any I could have used the mean of the available variables and imputate it or remove the whole row For outliers I ca
#Ther is no missing values found

4.2 Summarize data

By using labeled and appropriate graphs: Summarize the customer information from the dataset by Gender, Education Level, Age, Marital Status and Default payment status. by using your findings complete the missing information in the following statement: The EDA shows that ….% of the clients are less that 40 years. Moreover,…..out of 30,000 have university-level education. In addition, the number of credit cards issued to female is ….. times the number issued to Male. Generally, ….% of the loan were defaulted

# Gender bar graph
cl_df$SEX <- factor(cl_df$SEX, levels = c(1, 2), labels = c("Male", "Female"))
ggplot(cl_df,aes(x=SEX))+
  geom_bar()+
  labs(
    title = "Gender",
    x = "Gender",
    y = "Counts"
  )

# Education bar graph
ggplot(cl_df,aes(x=EDUCATION))+
  geom_bar()

# Age bar graph
ggplot(cl_df,aes(x=AGE))+
  geom_histogram(binwidth = 5, fill="blue", col="white") + labs(title="Distribution by Age")

# Marital status bar graph
ggplot(cl_df,aes(x=MARRIAGE))+
  geom_bar(fill="blue", col="white")

# default payment status bar graph
ggplot(cl_df,aes(x=PAY_AMT2))+
  geom_bar(fill="blue", col="white")

df <- cl_df
colnames(df)[ncol(df)] <- "default_status"
total_rows <- nrow(df)

# % less than 40
age_less_40 <- mean(df$AGE < 40) * 100

# University level education
# 1 = graduate school, 2 = university, 3 = high school, 4 = others, 5=unknown, 6=unknown
univ_count <- sum(df$EDUCATION == 2)

# Sex: 1 = male, 2 = female
female_count <- sum(df$SEX == 2)
male_count <- sum(df$SEX == 1)
female_to_male_ratio <- female_count / male_count

# Default rate
default_rate <- mean(df$default_status) * 100

cat(sprintf("Age < 40: %.2f%%\n", age_less_40))
## Age < 40: 69.52%
cat(sprintf("University count: %d\n", univ_count))
## University count: 14030
cat(sprintf("Female to Male ratio: %.2f\n", female_to_male_ratio))
## Female to Male ratio: NaN
cat(sprintf("Default rate: %.2f%%\n", default_rate))
## Default rate: 22.12%
## 4.3 How do default rates vary by age groups(take age difference of 5 years), education level, gender and Marital Status

4.3 How do default rates vary by age groups(take age difference of 5 years), education

level, gender and Marital Status?

df$age_group <- cut(df$AGE, breaks = seq(min(df$AGE), max(df$AGE) + 5, by = 5), right = FALSE)
age_default <- df %>% group_by(age_group) %>% summarize(rate = mean(default_status))
print(age_default)
## # A tibble: 12 × 2
##    age_group  rate
##    <fct>     <dbl>
##  1 [21,26)   0.267
##  2 [26,31)   0.201
##  3 [31,36)   0.194
##  4 [36,41)   0.216
##  5 [41,46)   0.221
##  6 [46,51)   0.250
##  7 [51,56)   0.248
##  8 [56,61)   0.264
##  9 [61,66)   0.269
## 10 [66,71)   0.254
## 11 [71,76)   0.357
## 12 [76,81)   0
# Education
edu_default <- df %>% group_by(EDUCATION) %>% summarize(rate = mean(default_status))
print(edu_default)
## # A tibble: 7 × 2
##   EDUCATION   rate
##       <dbl>  <dbl>
## 1         0 0     
## 2         1 0.192 
## 3         2 0.237 
## 4         3 0.252 
## 5         4 0.0569
## 6         5 0.0643
## 7         6 0.157
# Sex
sex_default <- df %>% group_by(SEX) %>% summarize(rate = mean(default_status))
print(sex_default)
## # A tibble: 2 × 2
##   SEX     rate
##   <fct>  <dbl>
## 1 Male   0.242
## 2 Female 0.208
# Marriage
marriage_default <- df %>% group_by(MARRIAGE) %>% summarize(rate = mean(default_status))
print(marriage_default)
## # A tibble: 4 × 2
##   MARRIAGE   rate
##      <dbl>  <dbl>
## 1        0 0.0926
## 2        1 0.235 
## 3        2 0.209 
## 4        3 0.260

4.4 Identify one customer segment with a notably higher default risk. A segment can be

defined by a combination of attributes or behaviors – for example, “young customers age ≤ 30 with low education level” or “customers who maxed out their credit limit and had multiple late payments.” Clearly define the segment, quantify its default rate.

df<-cl_df
colnames(df)[ncol(df)] <- "default_status"

segment <- df %>% filter(AGE <= 30, SEX == 1, EDUCATION == 3)
cat(sprintf("Segment Default Rate: %.2f%%\n", mean(segment$default_status)*100))
## Segment Default Rate: NaN%
cat(sprintf("Segment Size: %d\n", nrow(segment)))
## Segment Size: 0
segment2 <- df %>% filter(AGE <= 30, EDUCATION == 3)
cat(sprintf("Segment2 Default Rate: %.2f%%\n", mean(segment2$default_status)*100))
## Segment2 Default Rate: 27.10%

Provide five practical recommendation for managing default risk based on your analysis

  1. Segmented Credit Limits: Implement stricter (lower) credit limits for the “Young/High School” demographic.
  2. Automated Early Warnings: Trigger proactive notifications for high-risk segments as soon as a payment is overdue.
  3. Educational Outreach: Target university-level clients with financial literacy programs to improve long-term repayment habits.
  4. Risk-Based Interest Rates: Adjust interest rates based on the demographic risk profiles identified.
  5. Enhanced Monitoring: Prioritize collection efforts for clients with high bill-to-limit ratios combined with lower education levels.