Author: Dridi Slim
Date: February 11, 2024
Institution: ESSAI
Supervisor: Professor
Samer Azzabi
The primary objective of this project is to develop a predictive model for determining the likelihood of credit card clients defaulting on their payments.
library(readr) # Package for loading the csv file.
library(dplyr) # Package for manipulate DATA .
library(ggplot2) # Package for visualize DATA .
library(plotly) # Package for visualizing DATA in a interactive way .
library(corrplot) # Package for visualizing correlation matrices .
library(DescTools) # Package for descriptive statistics and visualization tools .
library(ltm)
library(vcd)
library(kableExtra)
path_data<-"D:/project PFA (Default of Credit Card Clients Dataset) SAMER/Initial_Dataset/UCI_Credit_Card.csv"
UCI_Credit_Card<-read.csv(path_data)
head(UCI_Credit_Card)
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0 2
## 3 3 90000 2 2 2 34 0 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0 0
## 6 6 50000 1 1 2 37 0 0 0 0 0 0
## BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## 1 3913 3102 689 0 0 0 0 689
## 2 2682 1725 2682 3272 3455 3261 0 1000
## 3 29239 14027 13559 14331 14948 15549 1518 1500
## 4 46990 48233 49291 28314 28959 29547 2000 2019
## 5 8617 5670 35835 20940 19146 19131 2000 36681
## 6 64400 57069 57608 19394 19619 20024 2500 1815
## PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default.payment.next.month
## 1 0 0 0 0 1
## 2 1000 1000 0 2000 1
## 3 1000 1000 1000 5000 0
## 4 1200 1100 1069 1000 0
## 5 10000 9000 689 679 0
## 6 657 1000 1000 800 0
str(UCI_Credit_Card)
## 'data.frame': 30000 obs. of 25 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ LIMIT_BAL : num 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : int 2 2 2 2 1 1 1 2 2 1 ...
## $ EDUCATION : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MARRIAGE : int 1 2 2 1 1 2 2 2 1 2 ...
## $ AGE : int 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : int 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : int 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : int -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : int -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : int -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : int -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1 : num 3913 2682 29239 46990 8617 ...
## $ BILL_AMT2 : num 3102 1725 14027 48233 5670 ...
## $ BILL_AMT3 : num 689 2682 13559 49291 35835 ...
## $ BILL_AMT4 : num 0 3272 14331 28314 20940 ...
## $ BILL_AMT5 : num 0 3455 14948 28959 19146 ...
## $ BILL_AMT6 : num 0 3261 15549 29547 19131 ...
## $ PAY_AMT1 : num 0 0 1518 2000 2000 ...
## $ PAY_AMT2 : num 689 1000 1500 2019 36681 ...
## $ PAY_AMT3 : num 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : num 0 1000 1000 1100 9000 ...
## $ PAY_AMT5 : num 0 0 1000 1069 689 ...
## $ PAY_AMT6 : num 0 2000 5000 1000 679 ...
## $ default.payment.next.month: int 1 1 0 0 0 0 0 0 0 0 ...
We have :
- Categorical DATA set to int type we should pay attention to that.
- A gap between PAY_0 and PAY_2 we should investigate why.
- ID column is useless we should do something about it.
- Our target variable is default.payment.next.month.
colSums(is.na(UCI_Credit_Card)) #calculating in every col the NA value
## ID LIMIT_BAL
## 0 0
## SEX EDUCATION
## 0 0
## MARRIAGE AGE
## 0 0
## PAY_0 PAY_2
## 0 0
## PAY_3 PAY_4
## 0 0
## PAY_5 PAY_6
## 0 0
## BILL_AMT1 BILL_AMT2
## 0 0
## BILL_AMT3 BILL_AMT4
## 0 0
## BILL_AMT5 BILL_AMT6
## 0 0
## PAY_AMT1 PAY_AMT2
## 0 0
## PAY_AMT3 PAY_AMT4
## 0 0
## PAY_AMT5 PAY_AMT6
## 0 0
## default.payment.next.month
## 0
We have no Missing DATA
UCI_Credit_Card<-subset(UCI_Credit_Card,select = -ID)
colnames(UCI_Credit_Card)[colnames(UCI_Credit_Card) == "PAY_0"] <- "PAY_1"
colnames(UCI_Credit_Card)[colnames(UCI_Credit_Card) == "default.payment.next.month"] <- "Payment_Status"
col_names_Quanti<-c("LIMIT_BAL","AGE",
"BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6",
"PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6")
col_names_Cato<-c("SEX","EDUCATION","MARRIAGE",
"PAY_1","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6",
"Payment_Status")
all_col_names<-c(col_names_Quanti,col_names_Cato)
for (nc in col_names_Cato) {
UCI_Credit_Card[[nc]]<-as.factor(UCI_Credit_Card[[nc]])
}
summary(UCI_Credit_Card[col_names_Quanti])
## LIMIT_BAL AGE BILL_AMT1 BILL_AMT2
## Min. : 10000 Min. :21.00 Min. :-165580 Min. :-69777
## 1st Qu.: 50000 1st Qu.:28.00 1st Qu.: 3559 1st Qu.: 2985
## Median : 140000 Median :34.00 Median : 22382 Median : 21200
## Mean : 167484 Mean :35.49 Mean : 51223 Mean : 49179
## 3rd Qu.: 240000 3rd Qu.:41.00 3rd Qu.: 67091 3rd Qu.: 64006
## Max. :1000000 Max. :79.00 Max. : 964511 Max. :983931
## BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6
## Min. :-157264 Min. :-170000 Min. :-81334 Min. :-339603
## 1st Qu.: 2666 1st Qu.: 2327 1st Qu.: 1763 1st Qu.: 1256
## Median : 20089 Median : 19052 Median : 18105 Median : 17071
## Mean : 47013 Mean : 43263 Mean : 40311 Mean : 38872
## 3rd Qu.: 60165 3rd Qu.: 54506 3rd Qu.: 50191 3rd Qu.: 49198
## Max. :1664089 Max. : 891586 Max. :927171 Max. : 961664
## PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 1000 1st Qu.: 833 1st Qu.: 390 1st Qu.: 296
## Median : 2100 Median : 2009 Median : 1800 Median : 1500
## Mean : 5664 Mean : 5921 Mean : 5226 Mean : 4826
## 3rd Qu.: 5006 3rd Qu.: 5000 3rd Qu.: 4505 3rd Qu.: 4013
## Max. :873552 Max. :1684259 Max. :896040 Max. :621000
## PAY_AMT5 PAY_AMT6
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 252.5 1st Qu.: 117.8
## Median : 1500.0 Median : 1500.0
## Mean : 4799.4 Mean : 5215.5
## 3rd Qu.: 4031.5 3rd Qu.: 4000.0
## Max. :426529.0 Max. :528666.0
We have:
- Negative BILL_AMT ? We should investigate that .
plot_ly(x = UCI_Credit_Card$LIMIT_BAL, type = "histogram")%>%layout(title = "Histogram of LIMIT_BAL",
xaxis=list(title ="LIMIT_BAL"),
yaxis=list(title ="Frequency"))
50k is the most frequent LIMIT_BAL
ggplot(data = UCI_Credit_Card, aes(sample = LIMIT_BAL)) +
stat_qq() +
stat_qq_line(col = "red") +
labs(title = "Q-Q Plot for LIMIT_BAL")
Asymmetrical Distribution (positively skewed)
plot_ly(data = UCI_Credit_Card, x = ~AGE, type = "histogram") %>%
layout(title = "Kernel Density Plot for Age",
xaxis = list(title = "Age"),
yaxis = list(title = "Frequence"))
The most frequent AGE is 29.
ggplot(data = UCI_Credit_Card, aes(sample = AGE)) +
stat_qq() +
stat_qq_line(col = "red") +
labs(title = "Q-Q Plot for AGE")
Asymmetrical Distribution (positively skewed).
BILL_AMT_LIST<-list("BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6")
histogram_list <- lapply(BILL_AMT_LIST, function(col_name) {
plot_ly(UCI_Credit_Card, x = ~get(col_name), type = "histogram", histnorm = "probability density",nbinsx = 80,name = col_name) %>%
layout(title = "Histograms of Bill Amounts")
})
combined_plot <- subplot(histogram_list, nrows = 3)
combined_plot
We can’t see much from those plots
ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT1)) +
stat_qq() +
stat_qq_line(col = "red") +
labs(title = "Q-Q Plot for BILL_AMT1")
ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT2)) +
stat_qq() +
stat_qq_line(col = "red") +
labs(title = "Q-Q Plot for BILL_AMT2")
ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT3)) +
stat_qq() +
stat_qq_line(col = "red") +
labs(title = "Q-Q Plot for BILL_AMT3")
ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT4)) +
stat_qq() +
stat_qq_line(col = "red") +
labs(title = "Q-Q Plot for BILL_AMT4")
ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT5)) +
stat_qq() +
stat_qq_line(col = "red") +
labs(title = "Q-Q Plot for BILL_AMT5")
ggplot(data = UCI_Credit_Card, aes(sample = BILL_AMT6)) +
stat_qq() +
stat_qq_line(col = "red") +
labs(title = "Q-Q Plot for BILL_AMT6")
Ditrubition WEird ??
BILL_AMT(i) :Amount of bill statement in month (i), 2005 (NT dollar) .So ,what does negative value mean
negative_BILL_AMT<-UCI_Credit_Card %>%
filter(BILL_AMT1 < 0 | BILL_AMT2 < 0 | BILL_AMT3 < 0 | BILL_AMT4 < 0 | BILL_AMT5 < 0 | BILL_AMT6 < 0)
head(negative_BILL_AMT)
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## 1 100000 2 2 2 23 0 -1 -1 0 0 -1
## 2 60000 1 1 2 27 1 -2 -1 -1 -1 -1
## 3 160000 1 1 2 30 -1 -1 -2 -2 -2 -1
## 4 60000 2 2 2 22 0 0 0 0 0 -1
## 5 180000 2 3 1 34 0 0 0 -1 -1 -1
## 6 130000 2 3 2 29 1 -2 -2 -1 2 -1
## BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2
## 1 11876 380 601 221 -159 567 380 601
## 2 -109 -425 259 -57 127 -189 0 1000
## 3 30265 -131 -527 -923 -1488 -1884 131 396
## 4 15054 9806 11068 6026 -28335 18660 1500 1518
## 5 16386 15793 8441 7142 -679 8321 8500 1500
## 6 -190 -9850 -9850 10311 10161 7319 0 0
## PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 Payment_Status
## 1 0 581 1687 1542 0
## 2 0 500 0 1000 1
## 3 396 565 792 0 0
## 4 2043 0 47671 617 0
## 5 7500 679 9000 2000 0
## 6 20161 0 7319 13899 0
It may mean the bank own them many by paying more than they should be .
PAY_AMT_list<-c("PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6")
histogram_list <- lapply(PAY_AMT_list, function(col_name) {
plot_ly(UCI_Credit_Card, x = ~log(get(col_name)), type = "histogram",nbinsx = 70 ,name = col_name) %>%
layout(title ="Histograms of Pay Amounts")
})
combined_plot <- subplot(histogram_list, nrows = 3)
combined_plot
summary(UCI_Credit_Card[col_names_Cato])
## SEX EDUCATION MARRIAGE PAY_1 PAY_2 PAY_3
## 1:11888 0: 14 0: 54 0 :14737 0 :15730 0 :15764
## 2:18112 1:10585 1:13659 -1 : 5686 -1 : 6050 -1 : 5938
## 2:14030 2:15964 1 : 3688 2 : 3927 -2 : 4085
## 3: 4917 3: 323 -2 : 2759 -2 : 3782 2 : 3819
## 4: 123 2 : 2667 3 : 326 3 : 240
## 5: 280 3 : 322 4 : 99 4 : 76
## 6: 51 (Other): 141 (Other): 86 (Other): 78
## PAY_4 PAY_5 PAY_6 Payment_Status
## 0 :16455 0 :16947 0 :16286 0:23364
## -1 : 5687 -1 : 5539 -1 : 5740 1: 6636
## -2 : 4348 -2 : 4546 -2 : 4895
## 2 : 3159 2 : 2626 2 : 2766
## 3 : 180 3 : 178 3 : 184
## 4 : 69 4 : 84 4 : 49
## (Other): 102 (Other): 80 (Other): 80
SEX_counts <- table(UCI_Credit_Card$SEX)
plot_ly(labels = c("Male", "Female"), values = SEX_counts, type = 'pie') %>%
layout(annotations = list(
x = 0,
y = 1,
xref = "paper",
yref = "paper",
text = "Pie Chart for SEX",
font = list(color = "black", size = 20),
showarrow = FALSE),piecolorway = c( "pink","blue"))
We have more than half female
EDUCATION_counts <- table(UCI_Credit_Card$EDUCATION)
EDUCATION_counts
##
## 0 1 2 3 4 5 6
## 14 10585 14030 4917 123 280 51
plot_ly(labels = list("unlabel","graduate school","university", "high school", "others", "unknown", "unknown"), values = EDUCATION_counts, type = 'pie') %>%
layout(annotations = list(
x = 0,
y = 1,
xref = "paper",
yref = "paper",
text = "Pie Chart for EDUCATION",
font = list(color = "black", size = 15),
showarrow = FALSE))
MARRIAGE_counts <- table(UCI_Credit_Card$MARRIAGE)
MARRIAGE_counts
##
## 0 1 2 3
## 54 13659 15964 323
plot_ly(labels = c("unlabel","married", "single", "others"), values = MARRIAGE_counts, type = 'pie') %>%
layout(annotations = list(
x = 0,
y = 1,
xref = "paper",
yref = "paper",
text = "Pie Chart for MARRIAGE",
font = list(color = "black", size = 15),
showarrow = FALSE))
labels_PAY <- c("-2" = "unlabel(-2)", "-1" = "pay duly", "0" = "unlabel(0)",
"1" = "pay_del_1_mth", "2" = "pay_del_2_mths",
"3" = "pay_del_3_mths", "4" = "pay_del_4_mths",
"5" = "pay_del_5_mths", "6" = "pay_del_6_mths",
"7" = "pay_del_7_mths", "8" = "pay_del_8_mths",
"9" = "pay_del_9_mths_above")
ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_1, fill = PAY_1)) +
geom_bar() +
labs(title = "Bar Plot by PAY_1", y = "Frequency") +
theme_classic() +
scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
unlabel(0) is the most frequent category
ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_2, fill = PAY_2)) +
geom_bar() +
labs(title = "Bar Plot by PAY_2", y = "Frequency") +
theme_classic() +
scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
unlabel(0) is the most frequent category
ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_3, fill = PAY_3)) +
geom_bar() +
labs(title = "Bar Plot by PAY_3", y = "Frequency") +
theme_classic() +
scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
unlabel(0) is the most frequent category
ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_4, fill = PAY_4)) +
geom_bar() +
labs(title = "Bar Plot by PAY_4", y = "Frequency") +
theme_classic() +
scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
unlabel(0) is the most frequent category
ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_5, fill = PAY_5)) +
geom_bar() +
labs(title = "Bar Plot by PAY_5", y = "Frequency") +
theme_classic() +
scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
unlabel(0) is the most frequent category
ggplotly(
ggplot(UCI_Credit_Card, aes(x = PAY_6, fill = PAY_6)) +
geom_bar() +
labs(title = "Bar Plot by PAY_6", y = "Frequency") +
theme_classic() +
scale_x_discrete(labels = labels_PAY)+theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
unlabel(0) is the most frequent category
we should investigate unlabel(0)
PayStatue_counts <- table(UCI_Credit_Card$Payment_Status)
plot_ly(labels = c("Non_default", "Default"), values = PayStatue_counts, type = 'pie') %>%
layout(annotations = list(
x = 0,
y = 1,
xref = "paper",
yref = "paper",
text = "Pie Chart for Payment_Status",
font = list(color = "black", size = 15),
showarrow = FALSE),piecolorway = c( "green","red"))
Nearly 80 % did not default
correlation_results <- lapply(col_names_Quanti, function(var) {
biserial.cor(UCI_Credit_Card[[var]], UCI_Credit_Card$Payment_Status)
})
correlation_df <- data.frame(
Variable = col_names_Quanti,
Correlation = unlist(correlation_results)
)
knitr::kable(correlation_df, caption = "<h2 style='color: black;'>Correlation between Payment Status and Quantitative Variables</h2>") %>%kable_styling(bootstrap_options = c("striped", "hover"))
| Variable | Correlation |
|---|---|
| LIMIT_BAL | 0.1535199 |
| AGE | -0.0138898 |
| BILL_AMT1 | 0.0196442 |
| BILL_AMT2 | 0.0141932 |
| BILL_AMT3 | 0.0140755 |
| BILL_AMT4 | 0.0101565 |
| BILL_AMT5 | 0.0067605 |
| BILL_AMT6 | 0.0053723 |
| PAY_AMT1 | 0.0729295 |
| PAY_AMT2 | 0.0585787 |
| PAY_AMT3 | 0.0562504 |
| PAY_AMT4 | 0.0568274 |
| PAY_AMT5 | 0.0551235 |
| PAY_AMT6 | 0.0531833 |
cramer_v_results <- list()
for (var in col_names_Cato) {
contingency_table <- table(UCI_Credit_Card[[var]], UCI_Credit_Card$Payment_Status)
cramers_v <- assocstats(contingency_table)$cramer
cramer_v_results[[var]] <- cramers_v
}
cramer_df <- data.frame(
Cramer_V = unlist(cramer_v_results)
)
knitr::kable(cramer_df, caption = "<h2 style='color: black;'>Cramer's V for Categorical Variables and Payment Status</h2>") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| Cramer_V | |
|---|---|
| SEX | 0.0399606 |
| EDUCATION | 0.0737601 |
| MARRIAGE | 0.0344782 |
| PAY_1 | 0.4229249 |
| PAY_2 | 0.3403169 |
| PAY_3 | 0.2956610 |
| PAY_4 | 0.2793725 |
| PAY_5 | 0.2706594 |
| PAY_6 | 0.2507878 |
| Payment_Status | 1.0000000 |