Project Overview

Author: Dridi Slim
Date: February 11, 2024
Institution: ESSAI
Supervisor: Professor Samer Azzabi


Project Goal

The primary objective of this project is to develop a predictive model for determining the likelihood of credit card clients defaulting on their payments.


Checking the data

Load Libraries

library(readr) # Package for loading the csv file. 
library(dplyr) # Package for manipulate DATA .
library(ggplot2) # Package for visualize DATA .
library(plotly) # Package for visualizing DATA in a interactive way .
library(corrplot)  # Package for visualizing correlation matrices .
library(DescTools) # Package for descriptive statistics and visualization tools .
library(ltm)
library(vcd)
library(kableExtra)

Load Data

path_data<-"D:/project PFA (Default of Credit Card Clients Dataset) SAMER/Initial_Dataset/UCI_Credit_Card.csv"
UCI_Credit_Card<-read.csv(path_data)

Overview of the Data

head(UCI_Credit_Card)
##   ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1  1     20000   2         2        1  24     2     2    -1    -1    -2    -2
## 2  2    120000   2         2        2  26    -1     2     0     0     0     2
## 3  3     90000   2         2        2  34     0     0     0     0     0     0
## 4  4     50000   2         2        1  37     0     0     0     0     0     0
## 5  5     50000   1         2        1  57    -1     0    -1     0     0     0
## 6  6     50000   1         1        2  37     0     0     0     0     0     0
##   BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## 1      3913      3102       689         0         0         0        0      689
## 2      2682      1725      2682      3272      3455      3261        0     1000
## 3     29239     14027     13559     14331     14948     15549     1518     1500
## 4     46990     48233     49291     28314     28959     29547     2000     2019
## 5      8617      5670     35835     20940     19146     19131     2000    36681
## 6     64400     57069     57608     19394     19619     20024     2500     1815
##   PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default.payment.next.month
## 1        0        0        0        0                          1
## 2     1000     1000        0     2000                          1
## 3     1000     1000     1000     5000                          0
## 4     1200     1100     1069     1000                          0
## 5    10000     9000      689      679                          0
## 6      657     1000     1000      800                          0

Check the info of our DATA

str(UCI_Credit_Card)
## 'data.frame':    30000 obs. of  25 variables:
##  $ ID                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ LIMIT_BAL                 : num  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX                       : int  2 2 2 2 1 1 1 2 2 1 ...
##  $ EDUCATION                 : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MARRIAGE                  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ AGE                       : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0                     : int  2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2                     : int  2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3                     : int  -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4                     : int  -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5                     : int  -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6                     : int  -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1                 : num  3913 2682 29239 46990 8617 ...
##  $ BILL_AMT2                 : num  3102 1725 14027 48233 5670 ...
##  $ BILL_AMT3                 : num  689 2682 13559 49291 35835 ...
##  $ BILL_AMT4                 : num  0 3272 14331 28314 20940 ...
##  $ BILL_AMT5                 : num  0 3455 14948 28959 19146 ...
##  $ BILL_AMT6                 : num  0 3261 15549 29547 19131 ...
##  $ PAY_AMT1                  : num  0 0 1518 2000 2000 ...
##  $ PAY_AMT2                  : num  689 1000 1500 2019 36681 ...
##  $ PAY_AMT3                  : num  0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4                  : num  0 1000 1000 1100 9000 ...
##  $ PAY_AMT5                  : num  0 0 1000 1069 689 ...
##  $ PAY_AMT6                  : num  0 2000 5000 1000 679 ...
##  $ default.payment.next.month: int  1 1 0 0 0 0 0 0 0 0 ...

We have :
- Categorical DATA set to int type we should pay attention to that.
- A gap between PAY_0 and PAY_2 we should investigate why.
- ID column is useless we should do something about it.
- Our target variable is default.payment.next.month.

Check is their any NA

colSums(is.na(UCI_Credit_Card)) #calculating in every col the NA value
##                         ID                  LIMIT_BAL 
##                          0                          0 
##                        SEX                  EDUCATION 
##                          0                          0 
##                   MARRIAGE                        AGE 
##                          0                          0 
##                      PAY_0                      PAY_2 
##                          0                          0 
##                      PAY_3                      PAY_4 
##                          0                          0 
##                      PAY_5                      PAY_6 
##                          0                          0 
##                  BILL_AMT1                  BILL_AMT2 
##                          0                          0 
##                  BILL_AMT3                  BILL_AMT4 
##                          0                          0 
##                  BILL_AMT5                  BILL_AMT6 
##                          0                          0 
##                   PAY_AMT1                   PAY_AMT2 
##                          0                          0 
##                   PAY_AMT3                   PAY_AMT4 
##                          0                          0 
##                   PAY_AMT5                   PAY_AMT6 
##                          0                          0 
## default.payment.next.month 
##                          0

We have no Missing DATA


Cleaning and Manipulating our DATA

Removing ID colomn

UCI_Credit_Card<-subset(UCI_Credit_Card,select = -ID) 

Changing the label of PAY_0 to PAY_1 AND default.payment.next.month to PaymentStatus

colnames(UCI_Credit_Card)[colnames(UCI_Credit_Card) == "PAY_0"] <- "PAY_1"
colnames(UCI_Credit_Card)[colnames(UCI_Credit_Card) == "default.payment.next.month"] <- "Payment_Status"

Creating a list that have column names of Quantitative variables and another one for Catogorical variables

col_names_Quanti<-c("LIMIT_BAL","AGE",
                      "BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6",
                      "PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6")
col_names_Cato<-c("SEX","EDUCATION","MARRIAGE",
                     "PAY_1","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6",
                     "Payment_Status")
all_col_names<-c(col_names_Quanti,col_names_Cato)

Changing Catogorical variables type to Factors

for (nc in col_names_Cato) {
  UCI_Credit_Card[[nc]]<-as.factor(UCI_Credit_Card[[nc]])      
}

Univariate Analysis:

Qantitative DATA

summary(UCI_Credit_Card[col_names_Quanti])
##    LIMIT_BAL            AGE          BILL_AMT1         BILL_AMT2     
##  Min.   :  10000   Min.   :21.00   Min.   :-165580   Min.   :-69777  
##  1st Qu.:  50000   1st Qu.:28.00   1st Qu.:   3559   1st Qu.:  2985  
##  Median : 140000   Median :34.00   Median :  22382   Median : 21200  
##  Mean   : 167484   Mean   :35.49   Mean   :  51223   Mean   : 49179  
##  3rd Qu.: 240000   3rd Qu.:41.00   3rd Qu.:  67091   3rd Qu.: 64006  
##  Max.   :1000000   Max.   :79.00   Max.   : 964511   Max.   :983931  
##    BILL_AMT3         BILL_AMT4         BILL_AMT5        BILL_AMT6      
##  Min.   :-157264   Min.   :-170000   Min.   :-81334   Min.   :-339603  
##  1st Qu.:   2666   1st Qu.:   2327   1st Qu.:  1763   1st Qu.:   1256  
##  Median :  20089   Median :  19052   Median : 18105   Median :  17071  
##  Mean   :  47013   Mean   :  43263   Mean   : 40311   Mean   :  38872  
##  3rd Qu.:  60165   3rd Qu.:  54506   3rd Qu.: 50191   3rd Qu.:  49198  
##  Max.   :1664089   Max.   : 891586   Max.   :927171   Max.   : 961664  
##     PAY_AMT1         PAY_AMT2          PAY_AMT3         PAY_AMT4     
##  Min.   :     0   Min.   :      0   Min.   :     0   Min.   :     0  
##  1st Qu.:  1000   1st Qu.:    833   1st Qu.:   390   1st Qu.:   296  
##  Median :  2100   Median :   2009   Median :  1800   Median :  1500  
##  Mean   :  5664   Mean   :   5921   Mean   :  5226   Mean   :  4826  
##  3rd Qu.:  5006   3rd Qu.:   5000   3rd Qu.:  4505   3rd Qu.:  4013  
##  Max.   :873552   Max.   :1684259   Max.   :896040   Max.   :621000  
##     PAY_AMT5           PAY_AMT6       
##  Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:   252.5   1st Qu.:   117.8  
##  Median :  1500.0   Median :  1500.0  
##  Mean   :  4799.4   Mean   :  5215.5  
##  3rd Qu.:  4031.5   3rd Qu.:  4000.0  
##  Max.   :426529.0   Max.   :528666.0

We have:
- Negative BILL_AMT ? We should investigate that .

Analysing LIMIT_BAL using histogram

plot_ly(x = UCI_Credit_Card$LIMIT_BAL, type = "histogram")%>%layout(title = "Histogram of LIMIT_BAL",
                                                                    xaxis=list(title ="LIMIT_BAL"),
                                                                    yaxis=list(title ="Frequency"))

50k is the most frequent LIMIT_BAL

Checking LIMIT_BAL Distribution unsing Q-Q plot

ggplot(data = UCI_Credit_Card, aes(sample = LIMIT_BAL)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for LIMIT_BAL")

Asymmetrical Distribution (positively skewed)

Analysing AGE using histogram

plot_ly(data = UCI_Credit_Card, x = ~AGE, type = "histogram") %>%
  layout(title = "Kernel Density Plot for Age",
         xaxis = list(title = "Age"),
         yaxis = list(title = "Frequence"))

The most frequent AGE is 29.

Checking AGEL Distribution unsing Q-Q plot

ggplot(data = UCI_Credit_Card, aes(sample = AGE)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for AGE")

Asymmetrical Distribution (positively skewed).

Analysing BILL_AMT(i) using histogram

BILL_AMT_LIST<-list("BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6")
histogram_list <- lapply(BILL_AMT_LIST, function(col_name) {
    plot_ly(UCI_Credit_Card, x = ~get(col_name), type = "histogram", histnorm = "probability density",nbinsx = 80,name = col_name) %>%
      layout(title = "Histograms of Bill Amounts")
  })
  combined_plot <- subplot(histogram_list, nrows = 3)
  combined_plot

We can’t see much from those plots

Using Q_Q plot to check the distrubution

1.BILL_AMT1

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT1)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT1")

2.BILL_AMT2

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT2)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT2")

3.BILL_AMT3

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT3)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT3")

4.BILL_AMT4

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT4)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT4")

5.BILL_AMT5

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT5)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT5")

6.BILL_AMT6

ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT6)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  labs(title = "Q-Q Plot for BILL_AMT6")

Ditrubition WEird ??

BILL_AMT(i) :Amount of bill statement in month (i), 2005 (NT dollar) .So ,what does negative value mean

negative_BILL_AMT<-UCI_Credit_Card %>%
  filter(BILL_AMT1 < 0 | BILL_AMT2 < 0 | BILL_AMT3 < 0 | BILL_AMT4 < 0 | BILL_AMT5 < 0 | BILL_AMT6 < 0)
head(negative_BILL_AMT)
##   LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1    100000   2         2        2  23     0    -1    -1     0     0    -1
## 2     60000   1         1        2  27     1    -2    -1    -1    -1    -1
## 3    160000   1         1        2  30    -1    -1    -2    -2    -2    -1
## 4     60000   2         2        2  22     0     0     0     0     0    -1
## 5    180000   2         3        1  34     0     0     0    -1    -1    -1
## 6    130000   2         3        2  29     1    -2    -2    -1     2    -1
##   BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## 1     11876       380       601       221      -159       567      380      601
## 2      -109      -425       259       -57       127      -189        0     1000
## 3     30265      -131      -527      -923     -1488     -1884      131      396
## 4     15054      9806     11068      6026    -28335     18660     1500     1518
## 5     16386     15793      8441      7142      -679      8321     8500     1500
## 6      -190     -9850     -9850     10311     10161      7319        0        0
##   PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 Payment_Status
## 1        0      581     1687     1542              0
## 2        0      500        0     1000              1
## 3      396      565      792        0              0
## 4     2043        0    47671      617              0
## 5     7500      679     9000     2000              0
## 6    20161        0     7319    13899              0

It may mean the bank own them many by paying more than they should be .

Using histogram and Log Transformation(for better visualisation) for “PAY_AMT(i)”

PAY_AMT_list<-c("PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6")

histogram_list <- lapply(PAY_AMT_list, function(col_name) {
    plot_ly(UCI_Credit_Card, x = ~log(get(col_name)), type = "histogram",nbinsx = 70 ,name = col_name) %>%
      layout(title ="Histograms of Pay Amounts")
  })
  combined_plot <- subplot(histogram_list, nrows = 3)
  combined_plot

Catagorical DATA

summary(UCI_Credit_Card[col_names_Cato])
##  SEX       EDUCATION MARRIAGE      PAY_1           PAY_2           PAY_3      
##  1:11888   0:   14   0:   54   0      :14737   0      :15730   0      :15764  
##  2:18112   1:10585   1:13659   -1     : 5686   -1     : 6050   -1     : 5938  
##            2:14030   2:15964   1      : 3688   2      : 3927   -2     : 4085  
##            3: 4917   3:  323   -2     : 2759   -2     : 3782   2      : 3819  
##            4:  123             2      : 2667   3      :  326   3      :  240  
##            5:  280             3      :  322   4      :   99   4      :   76  
##            6:   51             (Other):  141   (Other):   86   (Other):   78  
##      PAY_4           PAY_5           PAY_6       Payment_Status
##  0      :16455   0      :16947   0      :16286   0:23364       
##  -1     : 5687   -1     : 5539   -1     : 5740   1: 6636       
##  -2     : 4348   -2     : 4546   -2     : 4895                 
##  2      : 3159   2      : 2626   2      : 2766                 
##  3      :  180   3      :  178   3      :  184                 
##  4      :   69   4      :   84   4      :   49                 
##  (Other):  102   (Other):   80   (Other):   80

Using Pie Chart for SEX

SEX_counts <- table(UCI_Credit_Card$SEX)

plot_ly(labels = c("Male", "Female"), values = SEX_counts, type = 'pie') %>%
  layout(annotations = list(
      x = 0,
      y = 1,
      xref = "paper",
      yref = "paper",
      text = "Pie Chart for SEX",
      font = list(color = "black", size = 20),
      showarrow = FALSE),piecolorway = c( "pink","blue"))

We have more than half female

Using Pie Chart for EDUCATION

EDUCATION_counts <- table(UCI_Credit_Card$EDUCATION)
EDUCATION_counts
## 
##     0     1     2     3     4     5     6 
##    14 10585 14030  4917   123   280    51
plot_ly(labels = list("unlabel","graduate school","university", "high school", "others", "unknown", "unknown"), values = EDUCATION_counts, type = 'pie') %>%
  layout(annotations = list(
      x = 0,
      y = 1,
      xref = "paper",
      yref = "paper",
      text = "Pie Chart for EDUCATION",
      font = list(color = "black", size = 15),
      showarrow = FALSE))

Using Pie Chart for MARRIAGE

MARRIAGE_counts <- table(UCI_Credit_Card$MARRIAGE)
MARRIAGE_counts
## 
##     0     1     2     3 
##    54 13659 15964   323
plot_ly(labels = c("unlabel","married", "single", "others"), values = MARRIAGE_counts, type = 'pie') %>%
  layout(annotations = list(
      x = 0,
      y = 1,
      xref = "paper",
      yref = "paper",
      text = "Pie Chart for MARRIAGE",
      font = list(color = "black", size = 15),
      showarrow = FALSE))

Using barplot for “PAY_0”

labels_PAY <- c("-2" = "unlabel(-2)", "-1" = "pay duly", "0" = "unlabel(0)",
                  "1" = "pay_del_1_mth", "2" = "pay_del_2_mths",
                  "3" = "pay_del_3_mths", "4" = "pay_del_4_mths",
                  "5" = "pay_del_5_mths", "6" = "pay_del_6_mths",
                  "7" = "pay_del_7_mths", "8" = "pay_del_8_mths",
                  "9" = "pay_del_9_mths_above")
ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_1, fill = PAY_1)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_1", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_2”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_2, fill = PAY_2)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_2", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_3”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_3, fill = PAY_3)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_3", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_4”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_4, fill = PAY_4)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_4", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_5”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_5, fill = PAY_5)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_5", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

Using barplot for “PAY_6”

ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_6, fill = PAY_6)) +
  geom_bar() +
  labs(title = "Bar Plot by PAY_6", y = "Frequency") +
  theme_classic() +
  scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

unlabel(0) is the most frequent category

we should investigate unlabel(0)

Using Charpie for Payment_Status :

PayStatue_counts <- table(UCI_Credit_Card$Payment_Status)

plot_ly(labels = c("Non_default", "Default"), values = PayStatue_counts, type = 'pie') %>%
  layout(annotations = list(
      x = 0,
      y = 1,
      xref = "paper",
      yref = "paper",
      text = "Pie Chart for Payment_Status",
      font = list(color = "black", size = 15),
      showarrow = FALSE),piecolorway = c( "green","red"))

Nearly 80 % did not default

Correlation Test :

Quantitave data and Payment Status Using Point-Biserial Correlation Coefficient :

correlation_results <- lapply(col_names_Quanti, function(var) {
  biserial.cor(UCI_Credit_Card[[var]], UCI_Credit_Card$Payment_Status)
})

correlation_df <- data.frame(
  Variable = col_names_Quanti,
  Correlation = unlist(correlation_results)
)


knitr::kable(correlation_df, caption = "<h2 style='color: black;'>Correlation between Payment Status and Quantitative Variables</h2>") %>%kable_styling(bootstrap_options = c("striped", "hover"))

Correlation between Payment Status and Quantitative Variables

Variable Correlation
LIMIT_BAL 0.1535199
AGE -0.0138898
BILL_AMT1 0.0196442
BILL_AMT2 0.0141932
BILL_AMT3 0.0140755
BILL_AMT4 0.0101565
BILL_AMT5 0.0067605
BILL_AMT6 0.0053723
PAY_AMT1 0.0729295
PAY_AMT2 0.0585787
PAY_AMT3 0.0562504
PAY_AMT4 0.0568274
PAY_AMT5 0.0551235
PAY_AMT6 0.0531833

Catagorical data and Payment Status Using CramérV :

cramer_v_results <- list()

for (var in col_names_Cato) {
  
  contingency_table <- table(UCI_Credit_Card[[var]], UCI_Credit_Card$Payment_Status)
  
  cramers_v <- assocstats(contingency_table)$cramer
  
  
  cramer_v_results[[var]] <- cramers_v
}

cramer_df <- data.frame(
  Cramer_V = unlist(cramer_v_results)
)

knitr::kable(cramer_df, caption = "<h2 style='color: black;'>Cramer's V for Categorical Variables and Payment Status</h2>") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Cramer’s V for Categorical Variables and Payment Status

Cramer_V
SEX 0.0399606
EDUCATION 0.0737601
MARRIAGE 0.0344782
PAY_1 0.4229249
PAY_2 0.3403169
PAY_3 0.2956610
PAY_4 0.2793725
PAY_5 0.2706594
PAY_6 0.2507878
Payment_Status 1.0000000