1 Data Preprocession

1.1 Load Dataset

df <- read.csv("crx.data", header = F)
glimpse(df)
## Rows: 690
## Columns: 16
## $ V1  <chr> "b", "a", "a", "b", "b", "b", "b", "a", "b", "b", "b", "b", "a", "…
## $ V2  <chr> "30.83", "58.67", "24.50", "27.83", "20.17", "32.08", "33.17", "22…
## $ V3  <dbl> 0.000, 4.460, 0.500, 1.540, 5.625, 4.000, 1.040, 11.585, 0.500, 4.…
## $ V4  <chr> "u", "u", "u", "u", "u", "u", "u", "u", "y", "y", "u", "u", "u", "…
## $ V5  <chr> "g", "g", "g", "g", "g", "g", "g", "g", "p", "p", "g", "g", "g", "…
## $ V6  <chr> "w", "q", "q", "w", "w", "m", "r", "cc", "k", "w", "c", "c", "k", …
## $ V7  <chr> "v", "h", "h", "v", "v", "v", "h", "v", "h", "v", "h", "h", "v", "…
## $ V8  <dbl> 1.250, 3.040, 1.500, 3.750, 1.710, 2.500, 6.500, 0.040, 3.960, 3.1…
## $ V9  <chr> "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "f", "t", "t", "…
## $ V10 <chr> "t", "t", "f", "t", "f", "f", "f", "f", "f", "f", "f", "f", "f", "…
## $ V11 <int> 1, 6, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 3, 10, 0, 7, 17, …
## $ V12 <chr> "f", "f", "f", "t", "f", "t", "t", "f", "f", "t", "t", "f", "t", "…
## $ V13 <chr> "g", "g", "g", "g", "s", "g", "g", "g", "g", "g", "g", "g", "g", "…
## $ V14 <chr> "00202", "00043", "00280", "00100", "00120", "00360", "00164", "00…
## $ V15 <int> 0, 560, 824, 3, 0, 0, 31285, 1349, 314, 1442, 0, 200, 0, 2690, 0, …
## $ V16 <chr> "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "…

1.2 Data Integration

Add meaningful column names to variables in the data frame.

c("Gender","Age","Debt","MaritalStatus","Bank Customer",
  "Education Level","Ethnicity","YearsEmployed","PriorDefault",
  "Employed","CreditScore","DriversLicense","Citizen",
  "Zip Code","Income","Approved") -> colnames(df)
df %>% head() %>% 
  knitr::kable()
Gender Age Debt MaritalStatus Bank Customer Education Level Ethnicity YearsEmployed PriorDefault Employed CreditScore DriversLicense Citizen Zip Code Income Approved
b 30.83 0.000 u g w v 1.25 t t 1 f g 00202 0 +
a 58.67 4.460 u g q h 3.04 t t 6 f g 00043 560 +
a 24.50 0.500 u g q h 1.50 t f 0 f g 00280 824 +
b 27.83 1.540 u g w v 3.75 t t 5 t g 00100 3 +
b 20.17 5.625 u g w v 1.71 t f 0 f s 00120 0 +
b 32.08 4.000 u g m v 2.50 t f 0 t g 00360 0 +

1.3 Data Cleaning

Before imputing missing values, it is important to ensure the data type of each column is appropriate.

1.3.1 Column: Age

Age is an inherent numerical variable, however, it is encoded as categorical variable in the data frame. There is a need to convert variable “Age” to numerical variable. During the data type conversion, some missing values can be generated as some strings such as “?” can not be converted to a valid numerical value.

df %>% mutate(Age = as.numeric(Age)) -> df

1.3.2 Column: Approved

This column is the response variable, for simplicity, there is a need to re-encode this variable. A symbol of “+” means “Approved”, and a symbol of “-” means “Rejected”.

df %>% 
  group_by(Approved) %>% 
  summarise(N = n())
## # A tibble: 2 × 2
##   Approved     N
##   <chr>    <int>
## 1 +          307
## 2 -          383

It can be observed that, there are 307 positive samples (Approved) and 383 negative samples (Rejected) in this dataset.

df %>% 
  mutate(Approved = case_when(Approved == "+" ~ "Approved", 
                              Approved == "-" ~ "Rejected")) -> df

1.3.3 Remove Useless Variables and Modify the levels of Some Categories

df %>% 
  lapply(function(x){
    class(x)
  }) %>% unlist() -> cols_type

cols_categorical <- names(cols_type[cols_type %in% c("character")])
cols_numerical <- names(cols_type[!(cols_type %in% c("character"))])
df %>% 
  select(all_of(cols_categorical)) %>% 
  gather(key = "Variable", value = "Category") %>% 
  group_by(Variable, Category) %>% 
  summarise(N = n(), .groups = "drop") %>% 
  knitr::kable()
Variable Category N
Approved Approved 307
Approved Rejected 383
Bank Customer ? 6
Bank Customer g 519
Bank Customer gg 2
Bank Customer p 163
Citizen g 625
Citizen p 8
Citizen s 57
DriversLicense f 374
DriversLicense t 316
Education Level ? 9
Education Level aa 54
Education Level c 137
Education Level cc 41
Education Level d 30
Education Level e 25
Education Level ff 53
Education Level i 59
Education Level j 10
Education Level k 51
Education Level m 38
Education Level q 78
Education Level r 3
Education Level w 64
Education Level x 38
Employed f 395
Employed t 295
Ethnicity ? 9
Ethnicity bb 59
Ethnicity dd 6
Ethnicity ff 57
Ethnicity h 138
Ethnicity j 8
Ethnicity n 4
Ethnicity o 2
Ethnicity v 399
Ethnicity z 8
Gender ? 12
Gender a 210
Gender b 468
MaritalStatus ? 6
MaritalStatus l 2
MaritalStatus u 519
MaritalStatus y 163
PriorDefault f 329
PriorDefault t 361
Zip Code 00000 132
Zip Code 00017 1
Zip Code 00020 2
Zip Code 00021 1
Zip Code 00022 1
Zip Code 00024 1
Zip Code 00028 1
Zip Code 00029 1
Zip Code 00030 1
Zip Code 00032 1
Zip Code 00040 4
Zip Code 00043 1
Zip Code 00045 1
Zip Code 00049 1
Zip Code 00050 2
Zip Code 00052 1
Zip Code 00056 1
Zip Code 00060 9
Zip Code 00062 1
Zip Code 00070 4
Zip Code 00073 2
Zip Code 00075 1
Zip Code 00076 1
Zip Code 00080 30
Zip Code 00086 1
Zip Code 00088 2
Zip Code 00092 2
Zip Code 00093 1
Zip Code 00094 1
Zip Code 00096 3
Zip Code 00099 1
Zip Code 00100 30
Zip Code 00102 1
Zip Code 00108 4
Zip Code 00110 2
Zip Code 00112 2
Zip Code 00117 1
Zip Code 00120 35
Zip Code 00121 1
Zip Code 00128 3
Zip Code 00129 2
Zip Code 00130 2
Zip Code 00132 4
Zip Code 00136 2
Zip Code 00140 16
Zip Code 00141 1
Zip Code 00144 4
Zip Code 00145 2
Zip Code 00150 3
Zip Code 00152 1
Zip Code 00154 2
Zip Code 00156 1
Zip Code 00160 34
Zip Code 00163 1
Zip Code 00164 3
Zip Code 00167 1
Zip Code 00168 2
Zip Code 00170 1
Zip Code 00171 1
Zip Code 00174 1
Zip Code 00176 3
Zip Code 00178 1
Zip Code 00180 18
Zip Code 00181 3
Zip Code 00186 1
Zip Code 00188 1
Zip Code 00195 1
Zip Code 00200 35
Zip Code 00202 1
Zip Code 00204 1
Zip Code 00208 1
Zip Code 00210 2
Zip Code 00211 1
Zip Code 00212 1
Zip Code 00216 3
Zip Code 00220 9
Zip Code 00221 1
Zip Code 00224 1
Zip Code 00225 2
Zip Code 00228 2
Zip Code 00230 1
Zip Code 00231 1
Zip Code 00232 4
Zip Code 00239 1
Zip Code 00240 14
Zip Code 00250 1
Zip Code 00252 2
Zip Code 00253 1
Zip Code 00254 1
Zip Code 00256 1
Zip Code 00260 11
Zip Code 00263 1
Zip Code 00268 1
Zip Code 00272 3
Zip Code 00274 1
Zip Code 00276 1
Zip Code 00280 22
Zip Code 00288 1
Zip Code 00290 3
Zip Code 00292 1
Zip Code 00300 13
Zip Code 00303 1
Zip Code 00309 1
Zip Code 00311 1
Zip Code 00312 2
Zip Code 00320 14
Zip Code 00329 1
Zip Code 00330 2
Zip Code 00333 1
Zip Code 00340 7
Zip Code 00348 1
Zip Code 00349 1
Zip Code 00350 2
Zip Code 00352 2
Zip Code 00356 1
Zip Code 00360 7
Zip Code 00368 1
Zip Code 00369 1
Zip Code 00370 2
Zip Code 00371 1
Zip Code 00372 1
Zip Code 00375 1
Zip Code 00380 5
Zip Code 00381 1
Zip Code 00383 1
Zip Code 00393 1
Zip Code 00395 1
Zip Code 00396 2
Zip Code 00399 2
Zip Code 00400 9
Zip Code 00408 1
Zip Code 00410 1
Zip Code 00411 1
Zip Code 00416 1
Zip Code 00420 4
Zip Code 00422 1
Zip Code 00431 1
Zip Code 00432 1
Zip Code 00434 1
Zip Code 00440 4
Zip Code 00443 1
Zip Code 00450 1
Zip Code 00454 1
Zip Code 00455 1
Zip Code 00460 3
Zip Code 00465 1
Zip Code 00470 1
Zip Code 00480 3
Zip Code 00487 1
Zip Code 00491 1
Zip Code 00500 2
Zip Code 00510 1
Zip Code 00515 1
Zip Code 00519 1
Zip Code 00520 4
Zip Code 00523 1
Zip Code 00550 1
Zip Code 00560 2
Zip Code 00583 1
Zip Code 00600 1
Zip Code 00640 1
Zip Code 00680 1
Zip Code 00711 1
Zip Code 00720 2
Zip Code 00760 1
Zip Code 00840 1
Zip Code 00928 1
Zip Code 00980 1
Zip Code 01160 1
Zip Code 02000 1
Zip Code ? 13

Based on the table displaying data summary information, it can be observed that categorical variable “Zip Code” has too many categories (171), and many categories only contain 1 sample. It will be a problem when including this variable to a model, as some new categories may appear in test dataset and makes the model fail to predict on test dataset. The high number of categories will also increase the model complexity, which may decrease the model performance in the test dataset. As a result, there are enough reasons to remove this variables from the data frame.

df %>% select(-`Zip Code`) -> df

In addition to variable “Zip Code”, the percent of certain categories is also very small (<1%) in some variables. Similarly, including those low frequency categories can be a problem when making prediction on test dataset. In this section, all categories with samples lower than 1% percent are encoded into a unified category named “Other Category” for all categorical variables.

df %>% 
  lapply(function(x){
    class(x)
  }) %>% unlist() -> cols_type

cols_categorical <- names(cols_type[cols_type %in% c("character")])
cols_numerical <- names(cols_type[!(cols_type %in% c("character"))])

df %>% 
  lapply(function(x){
    if (class(x) == "character"){
      x_table <- table(x)
      x_table_percent <- x_table / sum(x_table)
      minority_class <- names(x_table_percent[x_table_percent < 0.01])
      x[x %in% minority_class] <- "Other Category"
    }
    x
  }) %>% 
  data.frame() -> df
colnames(df) <- str_replace_all(colnames(df), "\\.", " ")

df %>% 
  select(all_of(cols_categorical)) %>% 
  gather(key = "Variable", value = "Category") %>% 
  group_by(Variable, Category) %>% 
  summarise(N = n(), .groups = "drop") %>% 
  group_by(Variable) %>% 
  mutate(Percent = N / sum(N)) %>% 
  mutate(Percent = sprintf("%.2f%%", Percent* 100)) -> df.sum
  
df.sum %>% 
  knitr::kable()
Variable Category N Percent
Approved Approved 307 44.49%
Approved Rejected 383 55.51%
Bank Customer Other Category 8 1.16%
Bank Customer g 519 75.22%
Bank Customer p 163 23.62%
Citizen g 625 90.58%
Citizen p 8 1.16%
Citizen s 57 8.26%
DriversLicense f 374 54.20%
DriversLicense t 316 45.80%
Education Level ? 9 1.30%
Education Level Other Category 3 0.43%
Education Level aa 54 7.83%
Education Level c 137 19.86%
Education Level cc 41 5.94%
Education Level d 30 4.35%
Education Level e 25 3.62%
Education Level ff 53 7.68%
Education Level i 59 8.55%
Education Level j 10 1.45%
Education Level k 51 7.39%
Education Level m 38 5.51%
Education Level q 78 11.30%
Education Level w 64 9.28%
Education Level x 38 5.51%
Employed f 395 57.25%
Employed t 295 42.75%
Ethnicity ? 9 1.30%
Ethnicity Other Category 12 1.74%
Ethnicity bb 59 8.55%
Ethnicity ff 57 8.26%
Ethnicity h 138 20.00%
Ethnicity j 8 1.16%
Ethnicity v 399 57.83%
Ethnicity z 8 1.16%
Gender ? 12 1.74%
Gender a 210 30.43%
Gender b 468 67.83%
MaritalStatus Other Category 8 1.16%
MaritalStatus u 519 75.22%
MaritalStatus y 163 23.62%
PriorDefault f 329 47.68%
PriorDefault t 361 52.32%
df.sum %>% 
  write.csv("descriptive_statistic.csv", row.names = F)

1.4 Impute Missing Values

df %>% 
  is.na() %>% 
  colSums() %>% 
  data.frame() %>% 
  rownames_to_column(var = "Variable") %>% 
  rename("Number of Missing Values" = ".")
##           Variable Number of Missing Values
## 1           Gender                        0
## 2              Age                       12
## 3             Debt                        0
## 4    MaritalStatus                        0
## 5    Bank Customer                        0
## 6  Education Level                        0
## 7        Ethnicity                        0
## 8    YearsEmployed                        0
## 9     PriorDefault                        0
## 10        Employed                        0
## 11     CreditScore                        0
## 12  DriversLicense                        0
## 13         Citizen                        0
## 14          Income                        0
## 15        Approved                        0
df %>% 
  mutate(Age = case_when(is.na(Age) ~ mean(Age, na.rm = T), 
                         T ~ Age)) -> df

1.5 Exploratory Analysis

df %>% 
  select(c(all_of(cols_numerical), "Approved")) %>% 
  gather(key = "Variable", value = "Value", -Approved) %>% 
  group_by(Variable, Approved) %>% 
  summarise(Mean = mean(Value) %>% round(4), 
            SD = sd(Value) %>% round(4), 
            Min = min(Value) %>% round(4), 
            Max = max(Value) %>% round(4), 
            Median = median(Value) %>% round(4), 
            .groups = "drop") -> df.numerical.sum

df.numerical.sum %>% 
  knitr::kable()
Variable Approved Mean SD Min Max Median
Age Approved 33.7065 12.7689 13.75 76.750 30.670
Age Rejected 29.8542 10.7791 15.17 80.250 27.670
CreditScore Approved 4.6059 6.3202 0.00 67.000 3.000
CreditScore Rejected 0.6319 1.9000 0.00 20.000 0.000
Debt Approved 5.9050 5.4715 0.00 28.000 4.460
Debt Rejected 3.8399 4.3377 0.00 26.335 2.210
Income Approved 2038.8599 7659.7639 0.00 100000.000 221.000
Income Rejected 198.6057 671.6088 0.00 5552.000 1.000
YearsEmployed Approved 3.4279 4.1208 0.00 28.500 2.000
YearsEmployed Rejected 1.2579 2.1205 0.00 13.875 0.415
df.numerical.sum %>% 
  write.csv("numerical_descriptive_statistics.csv", row.names = F)
df %>% 
  select(c(all_of(cols_numerical), "Approved")) %>% 
  gather(key = "Variable", value = "Value", -Approved) %>% 
  ggplot() + 
  geom_boxplot(aes(x = Approved, y = Value, color = Approved)) + 
  scale_color_manual(breaks = c("Rejected", "Approved"), 
                     values = c("red", "darkgreen")) + 
  facet_wrap(~Variable, scales = "free") + 
  theme_minimal() + 
  xlab("") + 
  ylab("")

ggsave("Numerical_variables_vs_Approved.png", dpi = 300, width = 8, height = 5, bg = "white")

2 Split Dataset

set.seed(1)
idx <- sample(1:nrow(df))
n.tr <- floor(nrow(df) * 0.7)
idx.tr <- idx[1:n.tr]
idx.te <- idx[-(1:n.tr)]
df.tr <- df[idx.tr,]
df.te <- df[-idx.tr,]

3 Modelling

3.1 Linear Probability Model

model.lm <- lm(formula = Approved ~ ., data = df.tr %>% 
                 mutate(Approved = case_when(Approved == "Rejected" ~ 0, 
                                             T ~ 1)))
pred.lm.te.prob <- predict(model.lm, df.te)
pred.lm.te.label <- ifelse(pred.lm.te.prob > 0.5, "Approved", "Rejected")

3.2 Logistic Regression Model

glm(formula = Approved ~ ., data = df.tr %>% 
      mutate(Approved = case_when(Approved == "Rejected" ~ 0, 
                                  T ~ 1)), family = "binomial") -> model.lr
pred.lr.te.prob <- predict(model.lr, df.te, type = "response")
pred.lr.te.label <- ifelse(pred.lr.te.prob > 0.5, "Approved", "Rejected")

3.3 Linear Discriminant Analysis Model

MASS::lda(formula = Approved ~ ., data = df.tr %>% 
            mutate(Approved = case_when(Approved == "Rejected" ~ 0, 
                                        T ~ 1))) -> model.lda
pred.lda.te.prob <- predict(model.lda, df.te)$posterior[,2]
pred.lda.te.label <- ifelse(predict(model.lda, df.te)$class == 1, "Approved", "Rejected")

3.4 Quadratic Discriminant Analysis Model

df.tr.dummy <- dummy_cols(df.tr, select_columns = setdiff(cols_categorical, "Approved"), 
                          remove_selected_columns = T, remove_first_dummy = T)
df.te.dummy <- dummy_cols(df.te, select_columns = setdiff(cols_categorical, "Approved"), 
                          remove_selected_columns = T, remove_first_dummy = T)
cols_intersect <- intersect(colnames(df.tr.dummy), colnames(df.te.dummy))
df.tr.dummy <- df.tr.dummy[,cols_intersect]
df.te.dummy <- df.te.dummy[,cols_intersect]

nzv <- caret::nearZeroVar(df.tr.dummy, saveMetrics = TRUE)
df.tr.dummy.sub <- df.tr.dummy[, !nzv$nzv]
correlation_matrix <- cor(df.tr.dummy.sub %>% select(-Approved))
high_cor <- caret::findCorrelation(correlation_matrix, cutoff = 0.9) 
df.tr.dummy.sub <- df.tr.dummy.sub %>% select(-all_of(colnames(correlation_matrix)[high_cor]))

MASS::qda(formula = Approved ~ ., data = df.tr.dummy.sub %>% 
            mutate(Approved = case_when(Approved == "Rejected" ~ 0, 
                                        T ~ 1))) -> model.qda
pred.qda.te.prob <- predict(model.qda, df.te.dummy)$posterior[,2]
pred.qda.te.label <- ifelse(predict(model.qda, df.te.dummy)$class == 1, "Approved", "Rejected")

3.5 Lasso Regression Model

df.tr.mat <- df.tr %>% select(-Approved) %>% as.matrix()

cv.glmnet(df.tr.mat, ifelse(df.tr$Approved == "Approved", 1, 0), 
          family = "binomial", alpha = 1) -> model.lasso.cv

glmnet(df.tr.mat, ifelse(df.tr$Approved == "Approved", 1, 0), 
       family = "binomial", alpha = 1, 
       lambda = model.lasso.cv$lambda.1se) -> model.lasso

predict(model.lasso, df.te %>% select(-Approved) %>% as.matrix(), 
        type = "response") %>% as.numeric() -> pred.lasso.te.prob
pred.lasso.te.label <- ifelse(pred.lasso.te.prob > 0.5, "Approved", "Rejected")

3.6 Ridge Regression Model

df.tr.mat <- df.tr %>% select(-Approved) %>% as.matrix()
cv.glmnet(df.tr.mat, ifelse(df.tr$Approved == "Approved", 1, 0), 
          family = "binomial", alpha = 0) -> model.ridge.cv
glmnet(df.tr.mat, ifelse(df.tr$Approved == "Approved", 1, 0), 
       family = "binomial", alpha = 0, 
       lambda = model.ridge.cv$lambda.1se) -> model.ridge
predict(model.ridge, df.te %>% select(-Approved) %>% as.matrix(), 
        type = "response") %>% as.numeric() -> pred.ridge.te.prob
pred.ridge.te.label <- ifelse(pred.ridge.te.prob > 0.5, "Approved", "Rejected")

3.7 Principal Component Regression

model.pcr <- pcr(Approved ~ ., data = df.tr.dummy %>% 
                   mutate(Approved = ifelse(Approved == "Approved", 1, 0)), 
                 scale = T, validation = "CV")
predict(model.pcr, df.te.dummy, 
        ncomp = selectNcomp(model.pcr, method = "onesigma")) %>% 
  as.numeric() -> pred.pcr.te.prob
pred.pcr.te.label <- ifelse(pred.pcr.te.prob > 0.5, "Approved", "Rejected")

3.8 Partial Least Squares

plsr(Approved ~ ., data = df.tr.dummy %>% 
       mutate(Approved = ifelse(Approved == "Approved", 1, 0)), 
     scale = TRUE, validation = "CV") -> model.plsr
predict(model.plsr, df.te.dummy, 
        ncomp = selectNcomp(model.plsr, method = "onesigma")) %>% 
  as.numeric() -> pred.plsr.te.prob
pred.plsr.te.label <- ifelse(pred.plsr.te.prob > 0.5, "Approved", "Rejected")

3.9 Spline Model

lm(formula = Approved ~ bs(Age) + bs(Debt) + bs(YearsEmployed) + bs(CreditScore) + 
     bs(Income) + Gender + MaritalStatus + `Bank Customer` + `Education Level` + 
     Ethnicity + PriorDefault + Employed, 
   data = df.tr %>% mutate(Approved = ifelse(Approved == "Approved", 1, 0))) -> model.spline
predict(model.spline, df.te) -> pred.spline.te.prob
pred.spline.te.label <- ifelse(pred.spline.te.prob > 0.5, "Approved", "Rejected")

3.10 Polynomial Regression Model

lm(formula = Approved ~ poly(Age, 3) + poly(Debt, 3) + poly(YearsEmployed, 3) + 
     poly(CreditScore, 3) + poly(Income, 3) + Gender + MaritalStatus + 
     `Bank Customer` + `Education Level` + Ethnicity + PriorDefault + Employed, 
   data = df.tr %>% mutate(Approved = ifelse(Approved == "Approved", 1, 0))) -> model.ploy
predict(model.ploy, df.te) -> pred.poly.te.prob
pred.poly.te.label <- ifelse(pred.poly.te.prob > 0.5, "Approved", "Rejected")

3.11 Linear Model Backward Selection

lm(formula = Approved ~ Age + Debt + YearsEmployed + CreditScore + Income + 
     Gender + MaritalStatus + `Bank Customer` + `Education Level` + 
     Ethnicity + PriorDefault + Employed, 
   data = df.tr %>% mutate(Approved = ifelse(Approved == "Approved", 1, 0))) -> model.lm.full
model.lm.backward <- step(model.lm.full, direction = "backward")
## Start:  AIC=-1089.23
## Approved ~ Age + Debt + YearsEmployed + CreditScore + Income + 
##     Gender + MaritalStatus + `Bank Customer` + `Education Level` + 
##     Ethnicity + PriorDefault + Employed
## 
## 
## Step:  AIC=-1089.23
## Approved ~ Age + Debt + YearsEmployed + CreditScore + Income + 
##     Gender + MaritalStatus + `Education Level` + Ethnicity + 
##     PriorDefault + Employed
## 
##                     Df Sum of Sq    RSS      AIC
## - `Education Level` 13    2.0126 46.063 -1093.70
## - Gender             2    0.0025 44.053 -1093.21
## - Ethnicity          6    0.8487 44.900 -1092.03
## - Debt               1    0.0125 44.063 -1091.10
## - Age                1    0.0755 44.126 -1090.41
## <none>                           44.051 -1089.23
## - YearsEmployed      1    0.2537 44.305 -1088.46
## - CreditScore        1    0.3478 44.399 -1087.44
## - Income             1    0.6650 44.716 -1084.01
## - MaritalStatus      2    0.9870 45.038 -1082.55
## - Employed           1    0.9579 45.009 -1080.86
## - PriorDefault       1   27.6466 71.698  -856.45
## 
## Step:  AIC=-1093.7
## Approved ~ Age + Debt + YearsEmployed + CreditScore + Income + 
##     Gender + MaritalStatus + Ethnicity + PriorDefault + Employed
## 
##                 Df Sum of Sq    RSS      AIC
## - Gender         2    0.0096 46.073 -1097.60
## - Debt           1    0.0018 46.065 -1095.68
## - Age            1    0.0404 46.104 -1095.28
## - Ethnicity      7    1.2221 47.286 -1095.08
## <none>                       46.063 -1093.70
## - Income         1    0.3532 46.417 -1092.02
## - YearsEmployed  1    0.4032 46.467 -1091.50
## - CreditScore    1    0.4075 46.471 -1091.45
## - Employed       1    1.1531 47.217 -1083.78
## - MaritalStatus  2    1.3571 47.421 -1083.70
## - PriorDefault   1   30.6477 76.711  -849.87
## 
## Step:  AIC=-1097.6
## Approved ~ Age + Debt + YearsEmployed + CreditScore + Income + 
##     MaritalStatus + Ethnicity + PriorDefault + Employed
## 
##                 Df Sum of Sq    RSS      AIC
## - Debt           1    0.0015 46.075 -1099.58
## - Age            1    0.0389 46.112 -1099.19
## - Ethnicity      7    1.2209 47.294 -1098.99
## <none>                       46.073 -1097.60
## - Income         1    0.3647 46.438 -1095.80
## - YearsEmployed  1    0.3979 46.471 -1095.45
## - CreditScore    1    0.4073 46.480 -1095.36
## - Employed       1    1.1595 47.233 -1087.62
## - MaritalStatus  2    1.3857 47.459 -1087.32
## - PriorDefault   1   30.7524 76.825  -853.15
## 
## Step:  AIC=-1099.58
## Approved ~ Age + YearsEmployed + CreditScore + Income + MaritalStatus + 
##     Ethnicity + PriorDefault + Employed
## 
##                 Df Sum of Sq    RSS      AIC
## - Age            1    0.0387 46.113 -1101.18
## - Ethnicity      7    1.2275 47.302 -1100.91
## <none>                       46.075 -1099.58
## - Income         1    0.3646 46.439 -1097.78
## - YearsEmployed  1    0.4026 46.477 -1097.39
## - CreditScore    1    0.4114 46.486 -1097.30
## - Employed       1    1.1669 47.241 -1089.53
## - MaritalStatus  2    1.3870 47.462 -1089.29
## - PriorDefault   1   31.0969 77.171  -852.98
## 
## Step:  AIC=-1101.18
## Approved ~ YearsEmployed + CreditScore + Income + MaritalStatus + 
##     Ethnicity + PriorDefault + Employed
## 
##                 Df Sum of Sq    RSS      AIC
## - Ethnicity      7     1.211 47.324 -1102.69
## <none>                       46.113 -1101.18
## - Income         1     0.372 46.485 -1099.31
## - CreditScore    1     0.425 46.538 -1098.76
## - YearsEmployed  1     0.561 46.674 -1097.35
## - Employed       1     1.141 47.254 -1091.40
## - MaritalStatus  2     1.381 47.494 -1090.96
## - PriorDefault   1    31.898 78.011  -849.77
## 
## Step:  AIC=-1102.69
## Approved ~ YearsEmployed + CreditScore + Income + MaritalStatus + 
##     PriorDefault + Employed
## 
##                 Df Sum of Sq    RSS      AIC
## <none>                       47.324 -1102.69
## - CreditScore    1     0.396 47.720 -1100.67
## - Income         1     0.447 47.771 -1100.15
## - YearsEmployed  1     0.501 47.825 -1099.61
## - Employed       1     1.214 48.538 -1092.48
## - MaritalStatus  2     3.151 50.475 -1075.62
## - PriorDefault   1    34.295 81.619  -841.97
predict(model.lm.backward, df.te) -> pred.lm_backward.te.prob
pred.lm_backward.te.label <- ifelse(pred.lm_backward.te.prob > 0.5, "Approved", "Rejected")

3.12 Linear Model Forward Selection

lm(formula = Approved ~ 1, data = df.tr %>% 
     mutate(Approved = ifelse(Approved == "Approved", 1, 0))) -> model.lm.empty
model.lm.forward <- step(model.lm.empty, direction = "forward", 
                         scope = formula(Approved ~ Age + Debt + YearsEmployed + 
                                           CreditScore + Income + Gender + MaritalStatus + 
                                           `Bank Customer` + `Education Level` + 
                                           Ethnicity + PriorDefault + Employed))
## Start:  AIC=-671.41
## Approved ~ 1
## 
##                     Df Sum of Sq     RSS      AIC
## + PriorDefault       1    63.069  56.134 -1032.40
## + Employed           1    23.382  95.821  -774.65
## + CreditScore        1    18.943 100.260  -752.82
## + YearsEmployed      1    14.453 104.750  -731.71
## + `Education Level` 14    15.769 103.435  -711.80
## + Debt               1     5.979 113.224  -694.21
## + Age                1     4.550 114.653  -688.17
## + MaritalStatus      2     4.997 114.207  -688.05
## + `Bank Customer`    2     4.997 114.207  -688.05
## + Ethnicity          7     6.202 113.002  -683.16
## + Income             1     3.264 115.940  -682.79
## <none>                           119.203  -671.41
## + Gender             2     0.731 118.473  -670.37
## 
## Step:  AIC=-1032.4
## Approved ~ PriorDefault
## 
##                     Df Sum of Sq    RSS     AIC
## + MaritalStatus      2    4.4544 51.680 -1068.2
## + `Bank Customer`    2    4.4544 51.680 -1068.2
## + Employed           1    2.7181 53.416 -1054.3
## + CreditScore        1    2.2524 53.882 -1050.1
## + Income             1    1.7824 54.352 -1046.0
## + `Education Level` 14    4.6340 51.500 -1045.9
## + Ethnicity          7    2.9392 53.195 -1044.3
## + YearsEmployed      1    1.1261 55.008 -1040.2
## <none>                           56.134 -1032.4
## + Debt               1    0.2144 55.920 -1032.2
## + Age                1    0.0491 56.085 -1030.8
## + Gender             2    0.0027 56.131 -1028.4
## 
## Step:  AIC=-1068.25
## Approved ~ PriorDefault + MaritalStatus
## 
##                     Df Sum of Sq    RSS     AIC
## + Employed           1   2.74840 48.931 -1092.6
## + CreditScore        1   2.19979 49.480 -1087.2
## + YearsEmployed      1   0.99155 50.688 -1075.6
## + Income             1   0.54798 51.132 -1071.4
## + `Education Level` 14   2.99477 48.685 -1069.0
## <none>                           51.680 -1068.2
## + Debt               1   0.19608 51.484 -1068.1
## + Age                1   0.03650 51.643 -1066.6
## + Ethnicity          7   1.26168 50.418 -1066.2
## + Gender             2   0.00085 51.679 -1064.3
## 
## Step:  AIC=-1092.59
## Approved ~ PriorDefault + MaritalStatus + Employed
## 
##                     Df Sum of Sq    RSS     AIC
## + YearsEmployed      1   0.74810 48.183 -1098.0
## + CreditScore        1   0.65308 48.278 -1097.1
## + Income             1   0.47532 48.456 -1095.3
## <none>                           48.931 -1092.6
## + Debt               1   0.10700 48.824 -1091.6
## + Age                1   0.04722 48.884 -1091.0
## + Ethnicity          7   1.22594 47.705 -1090.8
## + `Education Level` 14   2.57661 46.355 -1090.7
## + Gender             2   0.01972 48.911 -1088.8
## 
## Step:  AIC=-1098.01
## Approved ~ PriorDefault + MaritalStatus + Employed + YearsEmployed
## 
##                     Df Sum of Sq    RSS     AIC
## + Income             1   0.46336 47.720 -1100.7
## + CreditScore        1   0.41198 47.771 -1100.2
## <none>                           48.183 -1098.0
## + Ethnicity          7   1.25542 46.928 -1096.7
## + Age                1   0.01391 48.169 -1096.2
## + Debt               1   0.01271 48.170 -1096.1
## + Gender             2   0.00702 48.176 -1094.1
## + `Education Level` 14   2.34518 45.838 -1094.1
## 
## Step:  AIC=-1100.67
## Approved ~ PriorDefault + MaritalStatus + Employed + YearsEmployed + 
##     Income
## 
##                     Df Sum of Sq    RSS     AIC
## + CreditScore        1   0.39562 47.324 -1102.7
## <none>                           47.720 -1100.7
## + Age                1   0.01296 47.707 -1098.8
## + Ethnicity          7   1.18146 46.538 -1098.8
## + Debt               1   0.00007 47.720 -1098.7
## + `Education Level` 14   2.42417 45.296 -1097.8
## + Gender             2   0.00731 47.712 -1096.8
## 
## Step:  AIC=-1102.69
## Approved ~ PriorDefault + MaritalStatus + Employed + YearsEmployed + 
##     Income + CreditScore
## 
##                     Df Sum of Sq    RSS     AIC
## <none>                           47.324 -1102.7
## + Ethnicity          7   1.21090 46.113 -1101.2
## + Age                1   0.02207 47.302 -1100.9
## + Debt               1   0.01016 47.314 -1100.8
## + `Education Level` 14   2.40167 44.922 -1099.8
## + Gender             2   0.00771 47.316 -1098.8
predict(model.lm.forward, df.te) -> pred.lm_forward.te.prob
pred.lm_forward.te.label <- ifelse(pred.lm_forward.te.prob > 0.5, "Approved", "Rejected")

3.13 Logistic Model Backward Selection

glm(formula = Approved ~ ., data = df.tr %>% 
      mutate(Approved = case_when(Approved == "Rejected" ~ 0, 
                                  T ~ 1)), family = "binomial") -> model.lr.full
model.lr.backward <- step(model.lr.full, direction = "backward")
## Start:  AIC=329.47
## Approved ~ Gender + Age + Debt + MaritalStatus + `Bank Customer` + 
##     `Education Level` + Ethnicity + YearsEmployed + PriorDefault + 
##     Employed + CreditScore + DriversLicense + Citizen + Income
## 
## 
## Step:  AIC=329.47
## Approved ~ Gender + Age + Debt + MaritalStatus + `Education Level` + 
##     Ethnicity + YearsEmployed + PriorDefault + Employed + CreditScore + 
##     DriversLicense + Citizen + Income
## 
##                     Df Deviance    AIC
## - `Education Level` 13   271.26 315.26
## - Ethnicity          6   267.22 325.22
## - Citizen            2   259.47 325.47
## - Gender             2   259.62 325.62
## - Debt               1   259.51 327.51
## - Age                1   260.07 328.07
## - DriversLicense     1   260.41 328.41
## - Employed           1   261.33 329.33
## <none>                   259.47 329.47
## - YearsEmployed      1   262.17 330.17
## - CreditScore        1   262.57 330.57
## - MaritalStatus      2   264.99 330.99
## - Income             1   274.17 342.17
## - PriorDefault       1   391.63 459.63
## 
## Step:  AIC=315.26
## Approved ~ Gender + Age + Debt + MaritalStatus + Ethnicity + 
##     YearsEmployed + PriorDefault + Employed + CreditScore + DriversLicense + 
##     Citizen + Income
## 
##                  Df Deviance    AIC
## - Citizen         2   271.30 311.30
## - Gender          2   271.31 311.31
## - Debt            1   271.29 313.29
## - Age             1   271.80 313.80
## - Ethnicity       7   283.95 313.95
## - DriversLicense  1   272.49 314.49
## <none>                271.26 315.26
## - CreditScore     1   273.88 315.88
## - Employed        1   274.20 316.20
## - YearsEmployed   1   275.13 317.13
## - MaritalStatus   2   284.03 324.03
## - Income          1   288.62 330.62
## - PriorDefault    1   410.43 452.43
## 
## Step:  AIC=311.3
## Approved ~ Gender + Age + Debt + MaritalStatus + Ethnicity + 
##     YearsEmployed + PriorDefault + Employed + CreditScore + DriversLicense + 
##     Income
## 
##                  Df Deviance    AIC
## - Gender          2   271.34 307.34
## - Debt            1   271.33 309.33
## - Age             1   271.85 309.85
## - Ethnicity       7   284.34 310.34
## - DriversLicense  1   272.50 310.50
## <none>                271.30 311.30
## - CreditScore     1   273.89 311.89
## - Employed        1   274.68 312.68
## - YearsEmployed   1   275.16 313.16
## - MaritalStatus   2   285.88 321.88
## - Income          1   289.05 327.05
## - PriorDefault    1   410.54 448.54
## 
## Step:  AIC=307.34
## Approved ~ Age + Debt + MaritalStatus + Ethnicity + YearsEmployed + 
##     PriorDefault + Employed + CreditScore + DriversLicense + 
##     Income
## 
##                  Df Deviance    AIC
## - Debt            1   271.37 305.37
## - Age             1   271.95 305.95
## - DriversLicense  1   272.53 306.53
## - Ethnicity       7   284.65 306.65
## <none>                271.34 307.34
## - CreditScore     1   273.90 307.90
## - Employed        1   274.73 308.73
## - YearsEmployed   1   275.25 309.25
## - MaritalStatus   2   286.46 318.46
## - Income          1   289.06 323.06
## - PriorDefault    1   410.87 444.87
## 
## Step:  AIC=305.37
## Approved ~ Age + MaritalStatus + Ethnicity + YearsEmployed + 
##     PriorDefault + Employed + CreditScore + DriversLicense + 
##     Income
## 
##                  Df Deviance    AIC
## - Age             1   271.99 303.99
## - DriversLicense  1   272.58 304.58
## - Ethnicity       7   284.66 304.66
## <none>                271.37 305.37
## - CreditScore     1   274.08 306.08
## - Employed        1   274.73 306.73
## - YearsEmployed   1   275.44 307.44
## - MaritalStatus   2   286.49 316.49
## - Income          1   289.19 321.19
## - PriorDefault    1   414.14 446.14
## 
## Step:  AIC=303.99
## Approved ~ MaritalStatus + Ethnicity + YearsEmployed + PriorDefault + 
##     Employed + CreditScore + DriversLicense + Income
## 
##                  Df Deviance    AIC
## - Ethnicity       7   285.10 303.10
## - DriversLicense  1   273.16 303.16
## <none>                271.99 303.99
## - CreditScore     1   274.85 304.85
## - Employed        1   275.20 305.20
## - YearsEmployed   1   277.38 307.38
## - MaritalStatus   2   286.96 314.96
## - Income          1   289.56 319.56
## - PriorDefault    1   420.07 450.07
## 
## Step:  AIC=303.1
## Approved ~ MaritalStatus + YearsEmployed + PriorDefault + Employed + 
##     CreditScore + DriversLicense + Income
## 
##                  Df Deviance    AIC
## - DriversLicense  1   286.27 302.27
## <none>                285.10 303.10
## - CreditScore     1   287.53 303.53
## - Employed        1   288.91 304.91
## - YearsEmployed   1   291.16 307.16
## - Income          1   302.88 318.88
## - MaritalStatus   2   311.70 325.70
## - PriorDefault    1   440.32 456.32
## 
## Step:  AIC=302.27
## Approved ~ MaritalStatus + YearsEmployed + PriorDefault + Employed + 
##     CreditScore + Income
## 
##                 Df Deviance    AIC
## <none>               286.27 302.27
## - CreditScore    1   288.46 302.46
## - Employed       1   290.43 304.43
## - YearsEmployed  1   291.58 305.58
## - Income         1   304.42 318.42
## - MaritalStatus  2   313.79 325.79
## - PriorDefault   1   441.39 455.39
pred.lr_backward.te.prob <- predict(model.lr.backward, df.te, type = "response")
pred.lr_backward.te.label <- ifelse(pred.lr_backward.te.prob > 0.5, "Approved", "Rejected")

3.14 Logistic Model Forward Selection

glm(formula = Approved ~ 1, data = df.tr %>% 
      mutate(Approved = case_when(Approved == "Rejected" ~ 0, 
                                  T ~ 1)), family = "binomial") -> model.lr.empty
model.lr.forward <- step(model.lr.empty, direction = "forward", 
                         scope = formula(Approved ~ Age + Debt + YearsEmployed + 
                                           CreditScore + Income + Gender + MaritalStatus + 
                                           `Bank Customer` + `Education Level` + 
                                           Ethnicity + PriorDefault + Employed))
## Start:  AIC=665
## Approved ~ 1
## 
##                     Df Deviance    AIC
## + PriorDefault       1   369.56 373.56
## + CreditScore        1   535.20 539.20
## + Employed           1   565.47 569.47
## + YearsEmployed      1   590.77 594.77
## + Income             1   597.06 601.06
## + `Education Level` 14   596.48 626.48
## + Debt               1   638.31 642.31
## + MaritalStatus      2   641.92 647.92
## + `Bank Customer`    2   641.92 647.92
## + Age                1   644.40 648.40
## + Ethnicity          7   636.75 652.75
## <none>                   663.00 665.00
## + Gender             2   659.68 665.68
## 
## Step:  AIC=373.56
## Approved ~ PriorDefault
## 
##                     Df Deviance    AIC
## + MaritalStatus      2   336.92 344.92
## + `Bank Customer`    2   336.92 344.92
## + Income             1   340.10 346.10
## + CreditScore        1   345.34 351.34
## + Employed           1   348.88 354.88
## + Ethnicity          7   344.52 362.52
## + `Education Level` 14   331.31 363.31
## + YearsEmployed      1   360.55 366.55
## <none>                   369.56 373.56
## + Debt               1   367.92 373.92
## + Age                1   369.18 375.18
## + Gender             2   369.52 377.52
## 
## Step:  AIC=344.92
## Approved ~ PriorDefault + MaritalStatus
## 
##                     Df Deviance    AIC
## + Income             1   311.65 321.65
## + CreditScore        1   313.29 323.29
## + Employed           1   313.68 323.68
## + YearsEmployed      1   329.61 339.61
## + Ethnicity          7   322.91 344.91
## <none>                   336.92 344.92
## + Debt               1   335.44 345.44
## + Age                1   336.75 346.75
## + `Education Level` 14   310.82 346.82
## + Gender             2   336.90 348.90
## 
## Step:  AIC=321.65
## Approved ~ PriorDefault + MaritalStatus + Income
## 
##                     Df Deviance    AIC
## + Employed           1   294.89 306.89
## + CreditScore        1   295.45 307.45
## + YearsEmployed      1   303.62 315.62
## + Ethnicity          7   296.21 320.21
## <none>                   311.65 321.65
## + Debt               1   310.94 322.94
## + Age                1   311.52 323.52
## + `Education Level` 14   287.22 325.22
## + Gender             2   311.41 325.41
## 
## Step:  AIC=306.89
## Approved ~ PriorDefault + MaritalStatus + Income + Employed
## 
##                     Df Deviance    AIC
## + YearsEmployed      1   288.46 302.46
## + CreditScore        1   291.58 305.58
## <none>                   294.89 306.89
## + Ethnicity          7   281.51 307.51
## + Debt               1   294.45 308.45
## + Age                1   294.81 308.81
## + Gender             2   294.44 310.44
## + `Education Level` 14   274.82 314.82
## 
## Step:  AIC=302.46
## Approved ~ PriorDefault + MaritalStatus + Income + Employed + 
##     YearsEmployed
## 
##                     Df Deviance    AIC
## + CreditScore        1   286.27 302.27
## <none>                   288.46 302.46
## + Ethnicity          7   275.80 303.80
## + Age                1   288.14 304.14
## + Debt               1   288.45 304.45
## + Gender             2   288.33 306.33
## + `Education Level` 14   271.20 313.20
## 
## Step:  AIC=302.27
## Approved ~ PriorDefault + MaritalStatus + Income + Employed + 
##     YearsEmployed + CreditScore
## 
##                     Df Deviance    AIC
## <none>                   286.27 302.27
## + Ethnicity          7   273.16 303.16
## + Age                1   285.82 303.82
## + Debt               1   286.27 304.27
## + Gender             2   286.10 306.10
## + `Education Level` 14   268.88 312.88
pred.lr_forward.te.prob <- predict(model.lr.forward, df.te, type = "response")
pred.lr_forward.te.label <- ifelse(pred.lr_forward.te.prob > 0.5, "Approved", "Rejected")

3.15 Random Forest

randomForest(df.tr %>% select(-Approved),
             df.tr$Approved %>% factor(levels = c("Rejected", "Approved")), 
             ntree = 500) -> model.rf
pred.rf.te.prob <- predict(model.rf, df.te, type = "prob")
pred.rf.te.label <- predict(model.rf, df.te, type = "class")

3.16 Bagging

library(baguette)
## Warning: 程序包'baguette'是用R版本4.4.2 来建造的
## 载入需要的程序包:parsnip
## Warning: 程序包'parsnip'是用R版本4.4.2 来建造的
bagger(Approved ~ ., 
       data = df.tr %>% 
          mutate(Approved = factor(Approved, 
                                   levels = c("Rejected", 
                                              "Approved")))) -> model.bagging
pred.bagging.te.label <- predict(model.bagging, df.te)$`.pred_class`

3.17 Boosting

set.seed(1)
data.tr.matrix <- data.matrix(df.tr.dummy %>% 
                                select(-Approved) %>% scale())
data.tr.xgb <- xgb.DMatrix(data = data.tr.matrix, 
                           label = ifelse(df.tr$Approved == "Approved", 1, 0))
data.te.matrix <- data.matrix(df.te.dummy %>% 
                                select(-Approved) %>% scale())
data.te.xgb <- xgb.DMatrix(data = data.te.matrix, 
                           label = ifelse(df.te$Approved == "Approved", 1, 0))

param <- list(booster = "gblinear", objective = "multi:softprob", 
              num_class = 2, lambda = 0.0001, alpha = 0.0001, nthread = 1)
xgb.train(param, data.tr.xgb, list(tr=data.tr.xgb), nrounds = 50, 
          eta = 0.5,  verbose = F, 
          callbacks = list(cb.gblinear.history())) -> model.xgb
predict(model.xgb, data.te.xgb, type = "prob") %>% 
    matrix(ncol = 2, byrow = T) -> res.df
res.df[,2] -> pred.xgb.te.prob
ifelse(pred.xgb.te.prob > 0.5, "Approved", "Rejected") -> pred.xgb.te.label

4 Model Evaluation

data.frame(`Linear Probability Model` = pred.lm.te.label, 
           `Logistic Regression Model` = pred.lr.te.label, 
           `Linear Discriminant Analysis Model` = pred.lda.te.label, 
           `Quadratic Discriminant Analysis Model` = pred.qda.te.label, 
           `Lasso Regression Model` = pred.lasso.te.label, 
           `Ridge Regression Model` = pred.ridge.te.label, 
           `Principal Component Regression` = pred.pcr.te.label, 
           `Partial Least Squares` = pred.plsr.te.label, 
           `Spline Model` = pred.spline.te.label, 
           `Polynomial Model` = pred.poly.te.label, 
           `Linear Model Backward Selection` = pred.lm_backward.te.label, 
           `Linear Model Forward Selection` = pred.lm_forward.te.label, 
           `Logistic Model Backward Selection` = pred.lr_backward.te.label, 
           `Logistic Model Forward Selection` = pred.lr_forward.te.label, 
           `Random Forest` = pred.rf.te.label, 
           `Bagging` = pred.bagging.te.label, 
           `Boosting` = pred.xgb.te.label) %>% 
  mutate(Index = idx.te) %>% 
  mutate(Real = df.te$Approved) %>% 
  gather(key = "Model", value = "Predicted", -Index, -Real) %>% 
  mutate(Model = str_replace_all(Model, "\\.", " ")) %>% 
  select(Index, Model, Real, Predicted) %>% 
  group_by(Model) %>% 
  summarise(Accuracy = sum(Real == Predicted) / n()) %>% 
  mutate(Accuracy = sprintf("%.4f%%", Accuracy * 100)) -> res.df
## Warning: attributes are not identical across measure variables; they will be
## dropped
res.df %>% 
  knitr::kable()
Model Accuracy
Bagging 84.1346%
Boosting 83.6538%
Lasso Regression Model 72.1154%
Linear Discriminant Analysis Model 85.5769%
Linear Model Backward Selection 85.0962%
Linear Model Forward Selection 85.0962%
Linear Probability Model 85.5769%
Logistic Model Backward Selection 87.0192%
Logistic Model Forward Selection 87.0192%
Logistic Regression Model 83.1731%
Partial Least Squares 86.5385%
Polynomial Model 84.1346%
Principal Component Regression 85.5769%
Quadratic Discriminant Analysis Model 79.8077%
Random Forest 87.0192%
Ridge Regression Model 71.1538%
Spline Model 84.1346%
res.df %>% write.csv("results.csv", row.names = F)

5 Model Interpretation

model.lm <- lm(formula = Approved ~ ., data = df.tr %>% 
                 mutate(Approved = case_when(Approved == "Rejected" ~ 0, 
                                             T ~ 1)))
model.lm.sum <- summary(model.lm)
model.lm.sum$coefficients %>% 
  data.frame() %>% 
  rownames_to_column(var = "Variable") %>% 
  rename("p_value" = "Pr...t..", 
         "t_value" = "t.value", 
         "std_error" = "Std..Error") %>% 
  filter(p_value < 0.1) %>% 
  mutate(Estimate = round(Estimate, 4), 
         std_error = round(std_error, 4), 
         t_value = round(t_value, 4), 
         p_value = round(p_value, 4)) %>% 
  write.csv("significant_variables_in_linear_model.csv", row.names = F)

\[ \begin{aligned} \text{Linear Probability(Approved)} = &\beta_0 + \beta_1 \times \text{Gender} + \beta_2 \times \text{Age} + \beta_3 \times \text{Debt} + \\ &\beta_4 \times \text{Marital Status} + \beta_5 \times \text{Bank Customer} + \beta_6 \times \text{Education Level} + \\ &\beta_7 \times \text{Ethnicity} + \beta_8 \times \text{Years Employed} + \beta_9 \times \text{Prior Default} + \\ &\beta_{10} \times \text{Employed} + \beta_{11} \times \text{CreditScore} + \beta_{12} \times \text{Drivers License} + \\ &\beta_{13} \times \text{Citizen} + \beta_{14} \times \text{Income} \end{aligned} \]