Read in dataset.
bank <- read.csv("BankLoanDefaultDataset.csv")
bank2 <- bank
bank$Car_loan <- as.factor(bank$Car_loan)
bank$Personal_loan <- as.factor(bank$Personal_loan)
bank$Home_loan <- as.factor(bank$Home_loan)
bank$Education_loan <- as.factor(bank$Education_loan)
bank$Gender <- as.factor(bank$Gender)
bank$Marital_status <- as.factor(bank$Marital_status)
bank$Emp_status <- as.factor(bank$Emp_status)
bank$Default <- as.factor(bank$Default)
The original dataset does not have any missing values so I needed to manually create them.
gender.missing.id <- sample(1:1000, 20 , replace = FALSE)
Marital.missing.id <- sample(1:1000, 20, replace = FALSE)
emp.status.missing.id <- sample(1:1000, 20, replace = FALSE)
credit.missing.id <- sample(1:1000, 20, replace = FALSE)
amount.missing.id <- sample(1:1000, 20, replace = FALSE)
emp.duration.missing.id <- sample(1:1000, 20, replace = FALSE)
check.amt.missing.id <- sample(1:1000, 20, replace = FALSE)
age.missing.id <- sample(1:1000, 20, replace = FALSE)
bank$Gender[gender.missing.id] <- NA
bank$Marital_status[Marital.missing.id] <- NA
bank$Emp_status[emp.status.missing.id] <- NA
bank$Credit_score[credit.missing.id] <- NA
bank$Amount[amount.missing.id] <- NA
bank$Emp_duration[emp.duration.missing.id] <- NA
bank$Checking_amount[check.amt.missing.id] <- NA
bank$Age[age.missing.id] <- NA
Missing values are a normal part of data analysis. To resolve this, we can use imputation. For numerical features, we can impute using the mean or median. For categorical variables, we can impute the mode. We can also use regression methods.
After imputation, we do feature engineering. We can do feature transformation, feature selection, and feature creation.
Gender, marital status, and employment status have missing values. These values can be replaced with the mode.
mode_value_gender <- which.max(table(bank$Gender)) %>% names()
mode_value_marital_status <- which.max(table(bank$Marital_status)) %>% names()
mode_value_emp_status <- which.max(table(bank$Emp_status)) %>% names()
bank$Gender[is.na(bank$Gender)] <- mode_value_gender
bank$Marital_status[is.na(bank$Marital_status)] <- mode_value_marital_status
bank$Emp_status[is.na(bank$Emp_status)] <- mode_value_emp_status
Random regression imputation is used in this section for all numerical variables. Regression models are created using features with missing values as the dependent variable and any variable without missing values as a predictor.
bank1 <- bank[ , c('Checking_amount', 'Term','Credit_score', 'Amount','Saving_amount', 'Emp_duration','Age', 'No_of_credit_acc')]
ggpairs(bank1)
pred.mdl1 = lm(Checking_amount ~ Saving_amount, data = bank)
newdata1 = bank[is.na(bank$Checking_amount),]
pred.y1 = predict(pred.mdl1, newdata1 = newdata1)
m1 = sum(is.na(bank$Checking_amount)) # total number of missing values
pred.resid1 = resid(pred.mdl1) # residual
pred.yrand1 = pred.y1 + sample(pred.resid1, m1, replace = TRUE)
bank$Checking_amount[is.na(bank$Checking_amount)] = pred.yrand1
pred.mdl2 = lm(Credit_score ~ Saving_amount, data = bank)
newdata2 = bank[is.na(bank$Credit_score),]
pred.y2 = predict(pred.mdl2, newdata2 = newdata2)
m2 = sum(is.na(bank$Credit_score)) # total number of missing values
pred.resid2 = resid(pred.mdl2) # residual
pred.yrand2 = pred.y2 + sample(pred.resid2, m2, replace = TRUE)
bank$Credit_score[is.na(bank$Credit_score)] = pred.yrand2
pred.mdl3 = lm(Amount ~ Checking_amount, data = bank)
newdata3 = bank[is.na(bank$Amount),]
pred.y3 = predict(pred.mdl3, newdata3 = newdata3)
m3 = sum(is.na(bank$Amount)) # total number of missing values
pred.resid3 = resid(pred.mdl3) # residual
pred.yrand3 = pred.y3 + sample(pred.resid3, m3, replace = TRUE)
bank$Amount[is.na(bank$Amount)] = pred.yrand3
pred.mdl4 = lm(Emp_duration ~ No_of_credit_acc, data = bank)
newdata4 = bank[is.na(bank$Emp_duration),]
pred.y4 = predict(pred.mdl4, newdata4 = newdata4)
m4 = sum(is.na(bank$Emp_duration)) # total number of missing values
pred.resid4 = resid(pred.mdl4) # residual
pred.yrand4 = pred.y4 + sample(pred.resid4, m4, replace = TRUE)
bank$Emp_duration[is.na(bank$Emp_duration)] = pred.yrand4
pred.mdl5 = lm(Age ~ Saving_amount, data = bank)
newdata5 = bank[is.na(bank1$Age),]
pred.y5 = predict(pred.mdl5, newdata5 = newdata5)
m5 = sum(is.na(bank$Age)) # total number of missing values
pred.resid5 = resid(pred.mdl5) # residual
pred.yrand5 = pred.y5 + sample(pred.resid5, m5, replace = TRUE)
bank$Age[is.na(bank$Age)] = pred.yrand5
We use the MICE imputation method in this section. We impute multiple data sets and combine all the results. The pooled standard error is 19.87721
#bank2 <- bank
init <- mice(bank2, maxit = 0)
init$method
Default Checking_amount Term Credit_score
"" "" "" ""
Gender Marital_status Car_loan Personal_loan
"" "" "" ""
Home_loan Education_loan Emp_status Amount
"" "" "" ""
Saving_amount Emp_duration Age No_of_credit_acc
"" "" "" ""
imp <- mice(bank2, method = c("","pmm", "", "pmm", "logreg", "logreg", "", "", "", "", "logreg", "pmm", "", "pmm", "pmm", ""),
maxit = 10,
m = 1,
print=F)
complete(imp, action = 1L)[1:10,]
Default Checking_amount Term Credit_score Gender Marital_status Car_loan
1 0 988 15 796 Female Single 1
2 0 458 15 813 Female Single 1
3 0 158 14 756 Female Single 0
4 1 300 25 737 Female Single 0
5 1 63 24 662 Female Single 0
6 0 1071 20 828 Male Married 1
7 0 -192 13 856 Male Single 1
8 0 172 16 763 Female Single 1
9 0 585 20 778 Female Single 1
10 1 189 19 649 Male Married 1
Personal_loan Home_loan Education_loan Emp_status Amount Saving_amount
1 0 0 0 employed 1536 3455
2 0 0 0 employed 947 3600
3 1 0 0 employed 1678 3093
4 0 0 1 employed 1804 2449
5 0 0 1 unemployed 1184 2867
6 0 0 0 employed 475 3282
7 0 0 0 employed 626 3398
8 0 0 0 employed 1224 3022
9 0 0 0 unemployed 1162 3475
10 0 0 0 employed 786 2711
Emp_duration Age No_of_credit_acc
1 12 38 1
2 25 36 1
3 43 34 1
4 0 29 1
5 4 30 1
6 12 32 2
7 11 38 1
8 12 36 1
9 12 36 1
10 0 29 1
imp <- mice(bank2, method = "pmm",
maxit = 10,
m = 2,
seed = 123,
print=F)
complete(imp, action = "broad")[1:10,]
Default.1 Checking_amount.1 Term.1 Credit_score.1 Gender.1 Marital_status.1
1 0 988 15 796 Female Single
2 0 458 15 813 Female Single
3 0 158 14 756 Female Single
4 1 300 25 737 Female Single
5 1 63 24 662 Female Single
6 0 1071 20 828 Male Married
7 0 -192 13 856 Male Single
8 0 172 16 763 Female Single
9 0 585 20 778 Female Single
10 1 189 19 649 Male Married
Car_loan.1 Personal_loan.1 Home_loan.1 Education_loan.1 Emp_status.1
1 1 0 0 0 employed
2 1 0 0 0 employed
3 0 1 0 0 employed
4 0 0 0 1 employed
5 0 0 0 1 unemployed
6 1 0 0 0 employed
7 1 0 0 0 employed
8 1 0 0 0 employed
9 1 0 0 0 unemployed
10 1 0 0 0 employed
Amount.1 Saving_amount.1 Emp_duration.1 Age.1 No_of_credit_acc.1 Default.2
1 1536 3455 12 38 1 0
2 947 3600 25 36 1 0
3 1678 3093 43 34 1 0
4 1804 2449 0 29 1 1
5 1184 2867 4 30 1 1
6 475 3282 12 32 2 0
7 626 3398 11 38 1 0
8 1224 3022 12 36 1 0
9 1162 3475 12 36 1 0
10 786 2711 0 29 1 1
Checking_amount.2 Term.2 Credit_score.2 Gender.2 Marital_status.2 Car_loan.2
1 988 15 796 Female Single 1
2 458 15 813 Female Single 1
3 158 14 756 Female Single 0
4 300 25 737 Female Single 0
5 63 24 662 Female Single 0
6 1071 20 828 Male Married 1
7 -192 13 856 Male Single 1
8 172 16 763 Female Single 1
9 585 20 778 Female Single 1
10 189 19 649 Male Married 1
Personal_loan.2 Home_loan.2 Education_loan.2 Emp_status.2 Amount.2
1 0 0 0 employed 1536
2 0 0 0 employed 947
3 1 0 0 employed 1678
4 0 0 1 employed 1804
5 0 0 1 unemployed 1184
6 0 0 0 employed 475
7 0 0 0 employed 626
8 0 0 0 employed 1224
9 0 0 0 unemployed 1162
10 0 0 0 employed 786
Saving_amount.2 Emp_duration.2 Age.2 No_of_credit_acc.2
1 3455 12 38 1
2 3600 25 36 1
3 3093 43 34 1
4 2449 0 29 1
5 2867 4 30 1
6 3282 12 32 2
7 3398 11 38 1
8 3022 12 36 1
9 3475 12 36 1
10 2711 0 29 1
imp5 <- mice(bank2, method = "pmm", m = 5, maxit = 10, seed = 123, print=F)
plot(imp5)
model5 <- with(imp5, glm(Default ~ Checking_amount + Term + Credit_score + Gender + Marital_status + Car_loan + Personal_loan + Home_loan + Education_loan + Emp_status + Amount + Saving_amount + Emp_duration+ Age + No_of_credit_acc,family='binomial'))
summary.stats = summary(model5)
summary.stats
# A tibble: 80 × 7
term estimate std.error statistic p.value nobs df.residual
<chr> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
1 (Intercept) 39.6 4.73 8.38 5.13e-17 1000 984
2 Checking_amount -0.00509 0.000676 -7.53 5.14e-14 1000 984
3 Term 0.170 0.0521 3.27 1.07e- 3 1000 984
4 Credit_score -0.0110 0.00207 -5.29 1.21e- 7 1000 984
5 GenderMale 0.195 0.510 0.383 7.02e- 1 1000 984
6 Marital_statusSingle 0.335 0.492 0.681 4.96e- 1 1000 984
7 Car_loan -0.600 2.76 -0.218 8.28e- 1 1000 984
8 Personal_loan -1.55 2.76 -0.563 5.73e- 1 1000 984
9 Home_loan -3.57 2.85 -1.25 2.10e- 1 1000 984
10 Education_loan 0.650 2.79 0.233 8.16e- 1 1000 984
# ℹ 70 more rows
summary(pool(model5))
term estimate std.error statistic df
1 (Intercept) 39.6415228699 4.7284136162 8.3836834 981.9055
2 Checking_amount -0.0050880131 0.0006758504 -7.5283125 981.9055
3 Term 0.1703676065 0.0520728136 3.2717189 981.9055
4 Credit_score -0.0109792626 0.0020746004 -5.2922299 981.9055
5 GenderMale 0.1950805846 0.5095698488 0.3828338 981.9055
6 Marital_statusSingle 0.3351480374 0.4920119886 0.6811786 981.9055
7 Car_loan -0.6004642922 2.7585197474 -0.2176763 981.9055
8 Personal_loan -1.5540876079 2.7585124281 -0.5633789 981.9055
9 Home_loan -3.5684378136 2.8457131490 -1.2539696 981.9055
10 Education_loan 0.6498872918 2.7894964677 0.2329766 981.9055
11 Emp_statusunemployed 0.5872531813 0.3474375980 1.6902407 981.9055
12 Amount 0.0008025898 0.0005113685 1.5694940 981.9055
13 Saving_amount -0.0048212246 0.0006085494 -7.9224872 981.9055
14 Emp_duration 0.0029178043 0.0044391386 0.6572906 981.9055
15 Age -0.6475369030 0.0646615942 -10.0142428 981.9055
16 No_of_credit_acc -0.0968613571 0.1006466582 -0.9623902 981.9055
p.value
1 1.764053e-16
2 1.161795e-13
3 1.106195e-03
4 1.490504e-07
5 7.019258e-01
6 4.959191e-01
7 8.277266e-01
8 5.733056e-01
9 2.101515e-01
10 8.158282e-01
11 9.129931e-02
12 1.168551e-01
13 6.284160e-15
14 5.111481e-01
15 1.524709e-22
16 3.360906e-01
beta = summary.stats$estimate[seq(1,15,by=3)]
beta.var = (summary.stats$std.error[seq(1,15,by=3)])^2
Q = mean(beta)
U = mean(beta.var)
B = var(beta)
T = U + (6/5)*B
pool.se = sqrt(T)
cbind(pool.se.intercept = pool.se)
pool.se.intercept
[1,] 19.61562
The only variable that is skewed is No_of_credit_acc so we use a log transformation.
boxcox_result <- boxcox(lm(bank$No_of_credit_acc ~ 1), lambda = seq(0, 1, by = 0.1))
title("Box-Cox Transformation")
optimal_lambda <- boxcox_result$x[which.max(boxcox_result$y)]
transformed_data <- if (optimal_lambda == 0) {
log(bank$No_of_credit_acc)
} else {
(bank$No_of_credit_acc^optimal_lambda - 1) / optimal_lambda
}
pooledData <- data.frame(OriginalData=bank$No_of_credit_acc, TransformedData=transformed_data)
We use standardization for all the numerical variables since these variables have different units. This will help with our modeling later on.
standardize <- function(x) {
return((x - mean(x)) / sd(x))
}
bank$Checking_amount_standardized <- standardize(bank$Checking_amount)
bank$term_standardized <- standardize(bank$Term)
bank$Credit_score_standardized <- standardize(bank$Credit_score)
bank$amount_standardized <- standardize(bank$Amount)
bank$Saving_amount_standardized <- standardize(bank$Saving_amount)
bank$Emp_duration_standardized <- standardize(bank$Emp_duration)
bank$Age_standardized <- standardize(bank$Age)
bank$No_of_credit_acc_standardized <- standardize(bank$No_of_credit_acc)
To select the most relevant features for our model, we use lasso regularization. Using all the standardized numerical variables, we see that checking amount, term, credit score, amount, saving amount, age, and no of credit accounts are important in determining the Default (target variable).
x <- as.matrix(bank[, c('Checking_amount_standardized', 'term_standardized', 'Credit_score_standardized', 'amount_standardized', 'Saving_amount_standardized', 'Emp_duration_standardized','Age_standardized','No_of_credit_acc_standardized')])
y <- bank$Default
# data$Default <- as.numeric(data$Default)
# data$Gender <- as.numeric(data$Gender)
# data$Marital_status <- as.numeric(data$Marital_status)
# data$Car_loan <- as.numeric(data$Car_loan)
# data$Personal_loan <- as.numeric(data$Personal_loan)
# data$Home_loan <- as.numeric(data$Home_loan)
# data$Education_loan <- as.numeric(data$Education_loan)
# data$Emp_status <- as.numeric(data$Emp_status)
lasso_model <- cv.glmnet(x, y, alpha = 1, family='binomial')
best_lambda <- lasso_model$lambda.min
selected_features <- rownames(coef(lasso_model, s = best_lambda))[coef(lasso_model, s = best_lambda)[,1] != 0]
print(selected_features)
[1] "(Intercept)" "Checking_amount_standardized"
[3] "term_standardized" "Credit_score_standardized"
[5] "amount_standardized" "Saving_amount_standardized"
[7] "Emp_duration_standardized" "Age_standardized"
[9] "No_of_credit_acc_standardized"
best_model <- glmnet(x, y, alpha = 1, lambda = best_lambda, family='binomial')
coef(best_model)
9 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) -2.18604138
Checking_amount_standardized -1.33308370
term_standardized 0.55291905
Credit_score_standardized -0.85852506
amount_standardized 0.16092370
Saving_amount_standardized -1.39262931
Emp_duration_standardized -0.01762975
Age_standardized -2.13856873
No_of_credit_acc_standardized -0.06042559
Feature creation is not necessary for this dataset. For some variables like Age, it only ranges from 18-42 yrs old so it would be difficult to group all into ‘young’ or ‘old’. For other variables, it would be difficult to group them into different bins because they are measured in units such as months or money.