df <- read.csv("crx.data", header = F)
glimpse(df)
## Rows: 690
## Columns: 16
## $ V1 <chr> "b", "a", "a", "b", "b", "b", "b", "a", "b", "b", "b", "b", "a", "…
## $ V2 <chr> "30.83", "58.67", "24.50", "27.83", "20.17", "32.08", "33.17", "22…
## $ V3 <dbl> 0.000, 4.460, 0.500, 1.540, 5.625, 4.000, 1.040, 11.585, 0.500, 4.…
## $ V4 <chr> "u", "u", "u", "u", "u", "u", "u", "u", "y", "y", "u", "u", "u", "…
## $ V5 <chr> "g", "g", "g", "g", "g", "g", "g", "g", "p", "p", "g", "g", "g", "…
## $ V6 <chr> "w", "q", "q", "w", "w", "m", "r", "cc", "k", "w", "c", "c", "k", …
## $ V7 <chr> "v", "h", "h", "v", "v", "v", "h", "v", "h", "v", "h", "h", "v", "…
## $ V8 <dbl> 1.250, 3.040, 1.500, 3.750, 1.710, 2.500, 6.500, 0.040, 3.960, 3.1…
## $ V9 <chr> "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "f", "t", "t", "…
## $ V10 <chr> "t", "t", "f", "t", "f", "f", "f", "f", "f", "f", "f", "f", "f", "…
## $ V11 <int> 1, 6, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 3, 10, 0, 7, 17, …
## $ V12 <chr> "f", "f", "f", "t", "f", "t", "t", "f", "f", "t", "t", "f", "t", "…
## $ V13 <chr> "g", "g", "g", "g", "s", "g", "g", "g", "g", "g", "g", "g", "g", "…
## $ V14 <chr> "00202", "00043", "00280", "00100", "00120", "00360", "00164", "00…
## $ V15 <int> 0, 560, 824, 3, 0, 0, 31285, 1349, 314, 1442, 0, 200, 0, 2690, 0, …
## $ V16 <chr> "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "…
Add meaningful column names to variables in the data frame.
c("Gender","Age","Debt","MaritalStatus","Bank Customer",
"Education Level","Ethnicity","YearsEmployed","PriorDefault",
"Employed","CreditScore","DriversLicense","Citizen",
"Zip Code","Income","Approved") -> colnames(df)
df %>% head() %>%
knitr::kable()
| Gender | Age | Debt | MaritalStatus | Bank Customer | Education Level | Ethnicity | YearsEmployed | PriorDefault | Employed | CreditScore | DriversLicense | Citizen | Zip Code | Income | Approved |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| b | 30.83 | 0.000 | u | g | w | v | 1.25 | t | t | 1 | f | g | 00202 | 0 | + |
| a | 58.67 | 4.460 | u | g | q | h | 3.04 | t | t | 6 | f | g | 00043 | 560 | + |
| a | 24.50 | 0.500 | u | g | q | h | 1.50 | t | f | 0 | f | g | 00280 | 824 | + |
| b | 27.83 | 1.540 | u | g | w | v | 3.75 | t | t | 5 | t | g | 00100 | 3 | + |
| b | 20.17 | 5.625 | u | g | w | v | 1.71 | t | f | 0 | f | s | 00120 | 0 | + |
| b | 32.08 | 4.000 | u | g | m | v | 2.50 | t | f | 0 | t | g | 00360 | 0 | + |
Before imputing missing values, it is important to ensure the data type of each column is appropriate.
Age is an inherent numerical variable, however, it is encoded as categorical variable in the data frame. There is a need to convert variable “Age” to numerical variable. During the data type conversion, some missing values can be generated as some strings such as “?” can not be converted to a valid numerical value.
df %>% mutate(Age = as.numeric(Age)) -> df
This column is the response variable, for simplicity, there is a need to re-encode this variable. A symbol of “+” means “Approved”, and a symbol of “-” means “Rejected”.
df %>%
group_by(Approved) %>%
summarise(N = n())
## # A tibble: 2 × 2
## Approved N
## <chr> <int>
## 1 + 307
## 2 - 383
It can be observed that, there are 307 positive samples (Approved) and 383 negative samples (Rejected) in this dataset.
df %>%
mutate(Approved = case_when(Approved == "+" ~ "Approved",
Approved == "-" ~ "Rejected")) -> df
df %>%
lapply(function(x){
class(x)
}) %>% unlist() -> cols_type
cols_categorical <- names(cols_type[cols_type %in% c("character")])
cols_numerical <- names(cols_type[!(cols_type %in% c("character"))])
df %>%
select(all_of(cols_categorical)) %>%
gather(key = "Variable", value = "Category") %>%
group_by(Variable, Category) %>%
summarise(N = n(), .groups = "drop") %>%
knitr::kable()
| Variable | Category | N |
|---|---|---|
| Approved | Approved | 307 |
| Approved | Rejected | 383 |
| Bank Customer | ? | 6 |
| Bank Customer | g | 519 |
| Bank Customer | gg | 2 |
| Bank Customer | p | 163 |
| Citizen | g | 625 |
| Citizen | p | 8 |
| Citizen | s | 57 |
| DriversLicense | f | 374 |
| DriversLicense | t | 316 |
| Education Level | ? | 9 |
| Education Level | aa | 54 |
| Education Level | c | 137 |
| Education Level | cc | 41 |
| Education Level | d | 30 |
| Education Level | e | 25 |
| Education Level | ff | 53 |
| Education Level | i | 59 |
| Education Level | j | 10 |
| Education Level | k | 51 |
| Education Level | m | 38 |
| Education Level | q | 78 |
| Education Level | r | 3 |
| Education Level | w | 64 |
| Education Level | x | 38 |
| Employed | f | 395 |
| Employed | t | 295 |
| Ethnicity | ? | 9 |
| Ethnicity | bb | 59 |
| Ethnicity | dd | 6 |
| Ethnicity | ff | 57 |
| Ethnicity | h | 138 |
| Ethnicity | j | 8 |
| Ethnicity | n | 4 |
| Ethnicity | o | 2 |
| Ethnicity | v | 399 |
| Ethnicity | z | 8 |
| Gender | ? | 12 |
| Gender | a | 210 |
| Gender | b | 468 |
| MaritalStatus | ? | 6 |
| MaritalStatus | l | 2 |
| MaritalStatus | u | 519 |
| MaritalStatus | y | 163 |
| PriorDefault | f | 329 |
| PriorDefault | t | 361 |
| Zip Code | 00000 | 132 |
| Zip Code | 00017 | 1 |
| Zip Code | 00020 | 2 |
| Zip Code | 00021 | 1 |
| Zip Code | 00022 | 1 |
| Zip Code | 00024 | 1 |
| Zip Code | 00028 | 1 |
| Zip Code | 00029 | 1 |
| Zip Code | 00030 | 1 |
| Zip Code | 00032 | 1 |
| Zip Code | 00040 | 4 |
| Zip Code | 00043 | 1 |
| Zip Code | 00045 | 1 |
| Zip Code | 00049 | 1 |
| Zip Code | 00050 | 2 |
| Zip Code | 00052 | 1 |
| Zip Code | 00056 | 1 |
| Zip Code | 00060 | 9 |
| Zip Code | 00062 | 1 |
| Zip Code | 00070 | 4 |
| Zip Code | 00073 | 2 |
| Zip Code | 00075 | 1 |
| Zip Code | 00076 | 1 |
| Zip Code | 00080 | 30 |
| Zip Code | 00086 | 1 |
| Zip Code | 00088 | 2 |
| Zip Code | 00092 | 2 |
| Zip Code | 00093 | 1 |
| Zip Code | 00094 | 1 |
| Zip Code | 00096 | 3 |
| Zip Code | 00099 | 1 |
| Zip Code | 00100 | 30 |
| Zip Code | 00102 | 1 |
| Zip Code | 00108 | 4 |
| Zip Code | 00110 | 2 |
| Zip Code | 00112 | 2 |
| Zip Code | 00117 | 1 |
| Zip Code | 00120 | 35 |
| Zip Code | 00121 | 1 |
| Zip Code | 00128 | 3 |
| Zip Code | 00129 | 2 |
| Zip Code | 00130 | 2 |
| Zip Code | 00132 | 4 |
| Zip Code | 00136 | 2 |
| Zip Code | 00140 | 16 |
| Zip Code | 00141 | 1 |
| Zip Code | 00144 | 4 |
| Zip Code | 00145 | 2 |
| Zip Code | 00150 | 3 |
| Zip Code | 00152 | 1 |
| Zip Code | 00154 | 2 |
| Zip Code | 00156 | 1 |
| Zip Code | 00160 | 34 |
| Zip Code | 00163 | 1 |
| Zip Code | 00164 | 3 |
| Zip Code | 00167 | 1 |
| Zip Code | 00168 | 2 |
| Zip Code | 00170 | 1 |
| Zip Code | 00171 | 1 |
| Zip Code | 00174 | 1 |
| Zip Code | 00176 | 3 |
| Zip Code | 00178 | 1 |
| Zip Code | 00180 | 18 |
| Zip Code | 00181 | 3 |
| Zip Code | 00186 | 1 |
| Zip Code | 00188 | 1 |
| Zip Code | 00195 | 1 |
| Zip Code | 00200 | 35 |
| Zip Code | 00202 | 1 |
| Zip Code | 00204 | 1 |
| Zip Code | 00208 | 1 |
| Zip Code | 00210 | 2 |
| Zip Code | 00211 | 1 |
| Zip Code | 00212 | 1 |
| Zip Code | 00216 | 3 |
| Zip Code | 00220 | 9 |
| Zip Code | 00221 | 1 |
| Zip Code | 00224 | 1 |
| Zip Code | 00225 | 2 |
| Zip Code | 00228 | 2 |
| Zip Code | 00230 | 1 |
| Zip Code | 00231 | 1 |
| Zip Code | 00232 | 4 |
| Zip Code | 00239 | 1 |
| Zip Code | 00240 | 14 |
| Zip Code | 00250 | 1 |
| Zip Code | 00252 | 2 |
| Zip Code | 00253 | 1 |
| Zip Code | 00254 | 1 |
| Zip Code | 00256 | 1 |
| Zip Code | 00260 | 11 |
| Zip Code | 00263 | 1 |
| Zip Code | 00268 | 1 |
| Zip Code | 00272 | 3 |
| Zip Code | 00274 | 1 |
| Zip Code | 00276 | 1 |
| Zip Code | 00280 | 22 |
| Zip Code | 00288 | 1 |
| Zip Code | 00290 | 3 |
| Zip Code | 00292 | 1 |
| Zip Code | 00300 | 13 |
| Zip Code | 00303 | 1 |
| Zip Code | 00309 | 1 |
| Zip Code | 00311 | 1 |
| Zip Code | 00312 | 2 |
| Zip Code | 00320 | 14 |
| Zip Code | 00329 | 1 |
| Zip Code | 00330 | 2 |
| Zip Code | 00333 | 1 |
| Zip Code | 00340 | 7 |
| Zip Code | 00348 | 1 |
| Zip Code | 00349 | 1 |
| Zip Code | 00350 | 2 |
| Zip Code | 00352 | 2 |
| Zip Code | 00356 | 1 |
| Zip Code | 00360 | 7 |
| Zip Code | 00368 | 1 |
| Zip Code | 00369 | 1 |
| Zip Code | 00370 | 2 |
| Zip Code | 00371 | 1 |
| Zip Code | 00372 | 1 |
| Zip Code | 00375 | 1 |
| Zip Code | 00380 | 5 |
| Zip Code | 00381 | 1 |
| Zip Code | 00383 | 1 |
| Zip Code | 00393 | 1 |
| Zip Code | 00395 | 1 |
| Zip Code | 00396 | 2 |
| Zip Code | 00399 | 2 |
| Zip Code | 00400 | 9 |
| Zip Code | 00408 | 1 |
| Zip Code | 00410 | 1 |
| Zip Code | 00411 | 1 |
| Zip Code | 00416 | 1 |
| Zip Code | 00420 | 4 |
| Zip Code | 00422 | 1 |
| Zip Code | 00431 | 1 |
| Zip Code | 00432 | 1 |
| Zip Code | 00434 | 1 |
| Zip Code | 00440 | 4 |
| Zip Code | 00443 | 1 |
| Zip Code | 00450 | 1 |
| Zip Code | 00454 | 1 |
| Zip Code | 00455 | 1 |
| Zip Code | 00460 | 3 |
| Zip Code | 00465 | 1 |
| Zip Code | 00470 | 1 |
| Zip Code | 00480 | 3 |
| Zip Code | 00487 | 1 |
| Zip Code | 00491 | 1 |
| Zip Code | 00500 | 2 |
| Zip Code | 00510 | 1 |
| Zip Code | 00515 | 1 |
| Zip Code | 00519 | 1 |
| Zip Code | 00520 | 4 |
| Zip Code | 00523 | 1 |
| Zip Code | 00550 | 1 |
| Zip Code | 00560 | 2 |
| Zip Code | 00583 | 1 |
| Zip Code | 00600 | 1 |
| Zip Code | 00640 | 1 |
| Zip Code | 00680 | 1 |
| Zip Code | 00711 | 1 |
| Zip Code | 00720 | 2 |
| Zip Code | 00760 | 1 |
| Zip Code | 00840 | 1 |
| Zip Code | 00928 | 1 |
| Zip Code | 00980 | 1 |
| Zip Code | 01160 | 1 |
| Zip Code | 02000 | 1 |
| Zip Code | ? | 13 |
Based on the table displaying data summary information, it can be observed that categorical variable “Zip Code” has too many categories (171), and many categories only contain 1 sample. It will be a problem when including this variable to a model, as some new categories may appear in test dataset and makes the model fail to predict on test dataset. The high number of categories will also increase the model complexity, which may decrease the model performance in the test dataset. As a result, there are enough reasons to remove this variables from the data frame.
df %>% select(-`Zip Code`) -> df
In addition to variable “Zip Code”, the percent of certain categories is also very small (<1%) in some variables. Similarly, including those low frequency categories can be a problem when making prediction on test dataset. In this section, all categories with samples lower than 1% percent are encoded into a unified category named “Other Category” for all categorical variables.
df %>%
lapply(function(x){
class(x)
}) %>% unlist() -> cols_type
cols_categorical <- names(cols_type[cols_type %in% c("character")])
cols_numerical <- names(cols_type[!(cols_type %in% c("character"))])
df %>%
lapply(function(x){
if (class(x) == "character"){
x_table <- table(x)
x_table_percent <- x_table / sum(x_table)
minority_class <- names(x_table_percent[x_table_percent < 0.01])
x[x %in% minority_class] <- "Other Category"
}
x
}) %>%
data.frame() -> df
colnames(df) <- str_replace_all(colnames(df), "\\.", " ")
df %>%
select(all_of(cols_categorical)) %>%
gather(key = "Variable", value = "Category") %>%
group_by(Variable, Category) %>%
summarise(N = n(), .groups = "drop") %>%
group_by(Variable) %>%
mutate(Percent = N / sum(N)) %>%
mutate(Percent = sprintf("%.2f%%", Percent* 100)) -> df.sum
df.sum %>%
knitr::kable()
| Variable | Category | N | Percent |
|---|---|---|---|
| Approved | Approved | 307 | 44.49% |
| Approved | Rejected | 383 | 55.51% |
| Bank Customer | Other Category | 8 | 1.16% |
| Bank Customer | g | 519 | 75.22% |
| Bank Customer | p | 163 | 23.62% |
| Citizen | g | 625 | 90.58% |
| Citizen | p | 8 | 1.16% |
| Citizen | s | 57 | 8.26% |
| DriversLicense | f | 374 | 54.20% |
| DriversLicense | t | 316 | 45.80% |
| Education Level | ? | 9 | 1.30% |
| Education Level | Other Category | 3 | 0.43% |
| Education Level | aa | 54 | 7.83% |
| Education Level | c | 137 | 19.86% |
| Education Level | cc | 41 | 5.94% |
| Education Level | d | 30 | 4.35% |
| Education Level | e | 25 | 3.62% |
| Education Level | ff | 53 | 7.68% |
| Education Level | i | 59 | 8.55% |
| Education Level | j | 10 | 1.45% |
| Education Level | k | 51 | 7.39% |
| Education Level | m | 38 | 5.51% |
| Education Level | q | 78 | 11.30% |
| Education Level | w | 64 | 9.28% |
| Education Level | x | 38 | 5.51% |
| Employed | f | 395 | 57.25% |
| Employed | t | 295 | 42.75% |
| Ethnicity | ? | 9 | 1.30% |
| Ethnicity | Other Category | 12 | 1.74% |
| Ethnicity | bb | 59 | 8.55% |
| Ethnicity | ff | 57 | 8.26% |
| Ethnicity | h | 138 | 20.00% |
| Ethnicity | j | 8 | 1.16% |
| Ethnicity | v | 399 | 57.83% |
| Ethnicity | z | 8 | 1.16% |
| Gender | ? | 12 | 1.74% |
| Gender | a | 210 | 30.43% |
| Gender | b | 468 | 67.83% |
| MaritalStatus | Other Category | 8 | 1.16% |
| MaritalStatus | u | 519 | 75.22% |
| MaritalStatus | y | 163 | 23.62% |
| PriorDefault | f | 329 | 47.68% |
| PriorDefault | t | 361 | 52.32% |
df.sum %>%
write.csv("descriptive_statistic.csv", row.names = F)
df %>%
is.na() %>%
colSums() %>%
data.frame() %>%
rownames_to_column(var = "Variable") %>%
rename("Number of Missing Values" = ".")
## Variable Number of Missing Values
## 1 Gender 0
## 2 Age 12
## 3 Debt 0
## 4 MaritalStatus 0
## 5 Bank Customer 0
## 6 Education Level 0
## 7 Ethnicity 0
## 8 YearsEmployed 0
## 9 PriorDefault 0
## 10 Employed 0
## 11 CreditScore 0
## 12 DriversLicense 0
## 13 Citizen 0
## 14 Income 0
## 15 Approved 0
df %>%
mutate(Age = case_when(is.na(Age) ~ mean(Age, na.rm = T),
T ~ Age)) -> df
df %>%
select(c(all_of(cols_numerical), "Approved")) %>%
gather(key = "Variable", value = "Value", -Approved) %>%
group_by(Variable, Approved) %>%
summarise(Mean = mean(Value) %>% round(4),
SD = sd(Value) %>% round(4),
Min = min(Value) %>% round(4),
Max = max(Value) %>% round(4),
Median = median(Value) %>% round(4),
.groups = "drop") -> df.numerical.sum
df.numerical.sum %>%
knitr::kable()
| Variable | Approved | Mean | SD | Min | Max | Median |
|---|---|---|---|---|---|---|
| Age | Approved | 33.7065 | 12.7689 | 13.75 | 76.750 | 30.670 |
| Age | Rejected | 29.8542 | 10.7791 | 15.17 | 80.250 | 27.670 |
| CreditScore | Approved | 4.6059 | 6.3202 | 0.00 | 67.000 | 3.000 |
| CreditScore | Rejected | 0.6319 | 1.9000 | 0.00 | 20.000 | 0.000 |
| Debt | Approved | 5.9050 | 5.4715 | 0.00 | 28.000 | 4.460 |
| Debt | Rejected | 3.8399 | 4.3377 | 0.00 | 26.335 | 2.210 |
| Income | Approved | 2038.8599 | 7659.7639 | 0.00 | 100000.000 | 221.000 |
| Income | Rejected | 198.6057 | 671.6088 | 0.00 | 5552.000 | 1.000 |
| YearsEmployed | Approved | 3.4279 | 4.1208 | 0.00 | 28.500 | 2.000 |
| YearsEmployed | Rejected | 1.2579 | 2.1205 | 0.00 | 13.875 | 0.415 |
df.numerical.sum %>%
write.csv("numerical_descriptive_statistics.csv", row.names = F)
df %>%
select(c(all_of(cols_numerical), "Approved")) %>%
gather(key = "Variable", value = "Value", -Approved) %>%
ggplot() +
geom_boxplot(aes(x = Approved, y = Value, color = Approved)) +
scale_color_manual(breaks = c("Rejected", "Approved"),
values = c("red", "darkgreen")) +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
xlab("") +
ylab("")
ggsave("Numerical_variables_vs_Approved.png", dpi = 300, width = 8, height = 5, bg = "white")
set.seed(1)
idx <- sample(1:nrow(df))
n.tr <- floor(nrow(df) * 0.7)
idx.tr <- idx[1:n.tr]
idx.te <- idx[-(1:n.tr)]
df.tr <- df[idx.tr,]
df.te <- df[-idx.tr,]
model.lm <- lm(formula = Approved ~ ., data = df.tr %>%
mutate(Approved = case_when(Approved == "Rejected" ~ 0,
T ~ 1)))
pred.lm.te.prob <- predict(model.lm, df.te)
pred.lm.te.label <- ifelse(pred.lm.te.prob > 0.5, "Approved", "Rejected")
glm(formula = Approved ~ ., data = df.tr %>%
mutate(Approved = case_when(Approved == "Rejected" ~ 0,
T ~ 1)), family = "binomial") -> model.lr
pred.lr.te.prob <- predict(model.lr, df.te, type = "response")
pred.lr.te.label <- ifelse(pred.lr.te.prob > 0.5, "Approved", "Rejected")
MASS::lda(formula = Approved ~ ., data = df.tr %>%
mutate(Approved = case_when(Approved == "Rejected" ~ 0,
T ~ 1))) -> model.lda
pred.lda.te.prob <- predict(model.lda, df.te)$posterior[,2]
pred.lda.te.label <- ifelse(predict(model.lda, df.te)$class == 1, "Approved", "Rejected")
df.tr.dummy <- dummy_cols(df.tr, select_columns = setdiff(cols_categorical, "Approved"),
remove_selected_columns = T, remove_first_dummy = T)
df.te.dummy <- dummy_cols(df.te, select_columns = setdiff(cols_categorical, "Approved"),
remove_selected_columns = T, remove_first_dummy = T)
cols_intersect <- intersect(colnames(df.tr.dummy), colnames(df.te.dummy))
df.tr.dummy <- df.tr.dummy[,cols_intersect]
df.te.dummy <- df.te.dummy[,cols_intersect]
nzv <- caret::nearZeroVar(df.tr.dummy, saveMetrics = TRUE)
df.tr.dummy.sub <- df.tr.dummy[, !nzv$nzv]
correlation_matrix <- cor(df.tr.dummy.sub %>% select(-Approved))
high_cor <- caret::findCorrelation(correlation_matrix, cutoff = 0.9)
df.tr.dummy.sub <- df.tr.dummy.sub %>% select(-all_of(colnames(correlation_matrix)[high_cor]))
MASS::qda(formula = Approved ~ ., data = df.tr.dummy.sub %>%
mutate(Approved = case_when(Approved == "Rejected" ~ 0,
T ~ 1))) -> model.qda
pred.qda.te.prob <- predict(model.qda, df.te.dummy)$posterior[,2]
pred.qda.te.label <- ifelse(predict(model.qda, df.te.dummy)$class == 1, "Approved", "Rejected")
df.tr.mat <- df.tr %>% select(-Approved) %>% as.matrix()
cv.glmnet(df.tr.mat, ifelse(df.tr$Approved == "Approved", 1, 0),
family = "binomial", alpha = 1) -> model.lasso.cv
glmnet(df.tr.mat, ifelse(df.tr$Approved == "Approved", 1, 0),
family = "binomial", alpha = 1,
lambda = model.lasso.cv$lambda.1se) -> model.lasso
predict(model.lasso, df.te %>% select(-Approved) %>% as.matrix(),
type = "response") %>% as.numeric() -> pred.lasso.te.prob
pred.lasso.te.label <- ifelse(pred.lasso.te.prob > 0.5, "Approved", "Rejected")
df.tr.mat <- df.tr %>% select(-Approved) %>% as.matrix()
cv.glmnet(df.tr.mat, ifelse(df.tr$Approved == "Approved", 1, 0),
family = "binomial", alpha = 0) -> model.ridge.cv
glmnet(df.tr.mat, ifelse(df.tr$Approved == "Approved", 1, 0),
family = "binomial", alpha = 0,
lambda = model.ridge.cv$lambda.1se) -> model.ridge
predict(model.ridge, df.te %>% select(-Approved) %>% as.matrix(),
type = "response") %>% as.numeric() -> pred.ridge.te.prob
pred.ridge.te.label <- ifelse(pred.ridge.te.prob > 0.5, "Approved", "Rejected")
model.pcr <- pcr(Approved ~ ., data = df.tr.dummy %>%
mutate(Approved = ifelse(Approved == "Approved", 1, 0)),
scale = T, validation = "CV")
predict(model.pcr, df.te.dummy,
ncomp = selectNcomp(model.pcr, method = "onesigma")) %>%
as.numeric() -> pred.pcr.te.prob
pred.pcr.te.label <- ifelse(pred.pcr.te.prob > 0.5, "Approved", "Rejected")
plsr(Approved ~ ., data = df.tr.dummy %>%
mutate(Approved = ifelse(Approved == "Approved", 1, 0)),
scale = TRUE, validation = "CV") -> model.plsr
predict(model.plsr, df.te.dummy,
ncomp = selectNcomp(model.plsr, method = "onesigma")) %>%
as.numeric() -> pred.plsr.te.prob
pred.plsr.te.label <- ifelse(pred.plsr.te.prob > 0.5, "Approved", "Rejected")
lm(formula = Approved ~ bs(Age) + bs(Debt) + bs(YearsEmployed) + bs(CreditScore) +
bs(Income) + Gender + MaritalStatus + `Bank Customer` + `Education Level` +
Ethnicity + PriorDefault + Employed,
data = df.tr %>% mutate(Approved = ifelse(Approved == "Approved", 1, 0))) -> model.spline
predict(model.spline, df.te) -> pred.spline.te.prob
pred.spline.te.label <- ifelse(pred.spline.te.prob > 0.5, "Approved", "Rejected")
lm(formula = Approved ~ poly(Age, 3) + poly(Debt, 3) + poly(YearsEmployed, 3) +
poly(CreditScore, 3) + poly(Income, 3) + Gender + MaritalStatus +
`Bank Customer` + `Education Level` + Ethnicity + PriorDefault + Employed,
data = df.tr %>% mutate(Approved = ifelse(Approved == "Approved", 1, 0))) -> model.ploy
predict(model.ploy, df.te) -> pred.poly.te.prob
pred.poly.te.label <- ifelse(pred.poly.te.prob > 0.5, "Approved", "Rejected")
lm(formula = Approved ~ Age + Debt + YearsEmployed + CreditScore + Income +
Gender + MaritalStatus + `Bank Customer` + `Education Level` +
Ethnicity + PriorDefault + Employed,
data = df.tr %>% mutate(Approved = ifelse(Approved == "Approved", 1, 0))) -> model.lm.full
model.lm.backward <- step(model.lm.full, direction = "backward")
## Start: AIC=-1089.23
## Approved ~ Age + Debt + YearsEmployed + CreditScore + Income +
## Gender + MaritalStatus + `Bank Customer` + `Education Level` +
## Ethnicity + PriorDefault + Employed
##
##
## Step: AIC=-1089.23
## Approved ~ Age + Debt + YearsEmployed + CreditScore + Income +
## Gender + MaritalStatus + `Education Level` + Ethnicity +
## PriorDefault + Employed
##
## Df Sum of Sq RSS AIC
## - `Education Level` 13 2.0126 46.063 -1093.70
## - Gender 2 0.0025 44.053 -1093.21
## - Ethnicity 6 0.8487 44.900 -1092.03
## - Debt 1 0.0125 44.063 -1091.10
## - Age 1 0.0755 44.126 -1090.41
## <none> 44.051 -1089.23
## - YearsEmployed 1 0.2537 44.305 -1088.46
## - CreditScore 1 0.3478 44.399 -1087.44
## - Income 1 0.6650 44.716 -1084.01
## - MaritalStatus 2 0.9870 45.038 -1082.55
## - Employed 1 0.9579 45.009 -1080.86
## - PriorDefault 1 27.6466 71.698 -856.45
##
## Step: AIC=-1093.7
## Approved ~ Age + Debt + YearsEmployed + CreditScore + Income +
## Gender + MaritalStatus + Ethnicity + PriorDefault + Employed
##
## Df Sum of Sq RSS AIC
## - Gender 2 0.0096 46.073 -1097.60
## - Debt 1 0.0018 46.065 -1095.68
## - Age 1 0.0404 46.104 -1095.28
## - Ethnicity 7 1.2221 47.286 -1095.08
## <none> 46.063 -1093.70
## - Income 1 0.3532 46.417 -1092.02
## - YearsEmployed 1 0.4032 46.467 -1091.50
## - CreditScore 1 0.4075 46.471 -1091.45
## - Employed 1 1.1531 47.217 -1083.78
## - MaritalStatus 2 1.3571 47.421 -1083.70
## - PriorDefault 1 30.6477 76.711 -849.87
##
## Step: AIC=-1097.6
## Approved ~ Age + Debt + YearsEmployed + CreditScore + Income +
## MaritalStatus + Ethnicity + PriorDefault + Employed
##
## Df Sum of Sq RSS AIC
## - Debt 1 0.0015 46.075 -1099.58
## - Age 1 0.0389 46.112 -1099.19
## - Ethnicity 7 1.2209 47.294 -1098.99
## <none> 46.073 -1097.60
## - Income 1 0.3647 46.438 -1095.80
## - YearsEmployed 1 0.3979 46.471 -1095.45
## - CreditScore 1 0.4073 46.480 -1095.36
## - Employed 1 1.1595 47.233 -1087.62
## - MaritalStatus 2 1.3857 47.459 -1087.32
## - PriorDefault 1 30.7524 76.825 -853.15
##
## Step: AIC=-1099.58
## Approved ~ Age + YearsEmployed + CreditScore + Income + MaritalStatus +
## Ethnicity + PriorDefault + Employed
##
## Df Sum of Sq RSS AIC
## - Age 1 0.0387 46.113 -1101.18
## - Ethnicity 7 1.2275 47.302 -1100.91
## <none> 46.075 -1099.58
## - Income 1 0.3646 46.439 -1097.78
## - YearsEmployed 1 0.4026 46.477 -1097.39
## - CreditScore 1 0.4114 46.486 -1097.30
## - Employed 1 1.1669 47.241 -1089.53
## - MaritalStatus 2 1.3870 47.462 -1089.29
## - PriorDefault 1 31.0969 77.171 -852.98
##
## Step: AIC=-1101.18
## Approved ~ YearsEmployed + CreditScore + Income + MaritalStatus +
## Ethnicity + PriorDefault + Employed
##
## Df Sum of Sq RSS AIC
## - Ethnicity 7 1.211 47.324 -1102.69
## <none> 46.113 -1101.18
## - Income 1 0.372 46.485 -1099.31
## - CreditScore 1 0.425 46.538 -1098.76
## - YearsEmployed 1 0.561 46.674 -1097.35
## - Employed 1 1.141 47.254 -1091.40
## - MaritalStatus 2 1.381 47.494 -1090.96
## - PriorDefault 1 31.898 78.011 -849.77
##
## Step: AIC=-1102.69
## Approved ~ YearsEmployed + CreditScore + Income + MaritalStatus +
## PriorDefault + Employed
##
## Df Sum of Sq RSS AIC
## <none> 47.324 -1102.69
## - CreditScore 1 0.396 47.720 -1100.67
## - Income 1 0.447 47.771 -1100.15
## - YearsEmployed 1 0.501 47.825 -1099.61
## - Employed 1 1.214 48.538 -1092.48
## - MaritalStatus 2 3.151 50.475 -1075.62
## - PriorDefault 1 34.295 81.619 -841.97
predict(model.lm.backward, df.te) -> pred.lm_backward.te.prob
pred.lm_backward.te.label <- ifelse(pred.lm_backward.te.prob > 0.5, "Approved", "Rejected")
lm(formula = Approved ~ 1, data = df.tr %>%
mutate(Approved = ifelse(Approved == "Approved", 1, 0))) -> model.lm.empty
model.lm.forward <- step(model.lm.empty, direction = "forward",
scope = formula(Approved ~ Age + Debt + YearsEmployed +
CreditScore + Income + Gender + MaritalStatus +
`Bank Customer` + `Education Level` +
Ethnicity + PriorDefault + Employed))
## Start: AIC=-671.41
## Approved ~ 1
##
## Df Sum of Sq RSS AIC
## + PriorDefault 1 63.069 56.134 -1032.40
## + Employed 1 23.382 95.821 -774.65
## + CreditScore 1 18.943 100.260 -752.82
## + YearsEmployed 1 14.453 104.750 -731.71
## + `Education Level` 14 15.769 103.435 -711.80
## + Debt 1 5.979 113.224 -694.21
## + Age 1 4.550 114.653 -688.17
## + MaritalStatus 2 4.997 114.207 -688.05
## + `Bank Customer` 2 4.997 114.207 -688.05
## + Ethnicity 7 6.202 113.002 -683.16
## + Income 1 3.264 115.940 -682.79
## <none> 119.203 -671.41
## + Gender 2 0.731 118.473 -670.37
##
## Step: AIC=-1032.4
## Approved ~ PriorDefault
##
## Df Sum of Sq RSS AIC
## + MaritalStatus 2 4.4544 51.680 -1068.2
## + `Bank Customer` 2 4.4544 51.680 -1068.2
## + Employed 1 2.7181 53.416 -1054.3
## + CreditScore 1 2.2524 53.882 -1050.1
## + Income 1 1.7824 54.352 -1046.0
## + `Education Level` 14 4.6340 51.500 -1045.9
## + Ethnicity 7 2.9392 53.195 -1044.3
## + YearsEmployed 1 1.1261 55.008 -1040.2
## <none> 56.134 -1032.4
## + Debt 1 0.2144 55.920 -1032.2
## + Age 1 0.0491 56.085 -1030.8
## + Gender 2 0.0027 56.131 -1028.4
##
## Step: AIC=-1068.25
## Approved ~ PriorDefault + MaritalStatus
##
## Df Sum of Sq RSS AIC
## + Employed 1 2.74840 48.931 -1092.6
## + CreditScore 1 2.19979 49.480 -1087.2
## + YearsEmployed 1 0.99155 50.688 -1075.6
## + Income 1 0.54798 51.132 -1071.4
## + `Education Level` 14 2.99477 48.685 -1069.0
## <none> 51.680 -1068.2
## + Debt 1 0.19608 51.484 -1068.1
## + Age 1 0.03650 51.643 -1066.6
## + Ethnicity 7 1.26168 50.418 -1066.2
## + Gender 2 0.00085 51.679 -1064.3
##
## Step: AIC=-1092.59
## Approved ~ PriorDefault + MaritalStatus + Employed
##
## Df Sum of Sq RSS AIC
## + YearsEmployed 1 0.74810 48.183 -1098.0
## + CreditScore 1 0.65308 48.278 -1097.1
## + Income 1 0.47532 48.456 -1095.3
## <none> 48.931 -1092.6
## + Debt 1 0.10700 48.824 -1091.6
## + Age 1 0.04722 48.884 -1091.0
## + Ethnicity 7 1.22594 47.705 -1090.8
## + `Education Level` 14 2.57661 46.355 -1090.7
## + Gender 2 0.01972 48.911 -1088.8
##
## Step: AIC=-1098.01
## Approved ~ PriorDefault + MaritalStatus + Employed + YearsEmployed
##
## Df Sum of Sq RSS AIC
## + Income 1 0.46336 47.720 -1100.7
## + CreditScore 1 0.41198 47.771 -1100.2
## <none> 48.183 -1098.0
## + Ethnicity 7 1.25542 46.928 -1096.7
## + Age 1 0.01391 48.169 -1096.2
## + Debt 1 0.01271 48.170 -1096.1
## + Gender 2 0.00702 48.176 -1094.1
## + `Education Level` 14 2.34518 45.838 -1094.1
##
## Step: AIC=-1100.67
## Approved ~ PriorDefault + MaritalStatus + Employed + YearsEmployed +
## Income
##
## Df Sum of Sq RSS AIC
## + CreditScore 1 0.39562 47.324 -1102.7
## <none> 47.720 -1100.7
## + Age 1 0.01296 47.707 -1098.8
## + Ethnicity 7 1.18146 46.538 -1098.8
## + Debt 1 0.00007 47.720 -1098.7
## + `Education Level` 14 2.42417 45.296 -1097.8
## + Gender 2 0.00731 47.712 -1096.8
##
## Step: AIC=-1102.69
## Approved ~ PriorDefault + MaritalStatus + Employed + YearsEmployed +
## Income + CreditScore
##
## Df Sum of Sq RSS AIC
## <none> 47.324 -1102.7
## + Ethnicity 7 1.21090 46.113 -1101.2
## + Age 1 0.02207 47.302 -1100.9
## + Debt 1 0.01016 47.314 -1100.8
## + `Education Level` 14 2.40167 44.922 -1099.8
## + Gender 2 0.00771 47.316 -1098.8
predict(model.lm.forward, df.te) -> pred.lm_forward.te.prob
pred.lm_forward.te.label <- ifelse(pred.lm_forward.te.prob > 0.5, "Approved", "Rejected")
glm(formula = Approved ~ ., data = df.tr %>%
mutate(Approved = case_when(Approved == "Rejected" ~ 0,
T ~ 1)), family = "binomial") -> model.lr.full
model.lr.backward <- step(model.lr.full, direction = "backward")
## Start: AIC=329.47
## Approved ~ Gender + Age + Debt + MaritalStatus + `Bank Customer` +
## `Education Level` + Ethnicity + YearsEmployed + PriorDefault +
## Employed + CreditScore + DriversLicense + Citizen + Income
##
##
## Step: AIC=329.47
## Approved ~ Gender + Age + Debt + MaritalStatus + `Education Level` +
## Ethnicity + YearsEmployed + PriorDefault + Employed + CreditScore +
## DriversLicense + Citizen + Income
##
## Df Deviance AIC
## - `Education Level` 13 271.26 315.26
## - Ethnicity 6 267.22 325.22
## - Citizen 2 259.47 325.47
## - Gender 2 259.62 325.62
## - Debt 1 259.51 327.51
## - Age 1 260.07 328.07
## - DriversLicense 1 260.41 328.41
## - Employed 1 261.33 329.33
## <none> 259.47 329.47
## - YearsEmployed 1 262.17 330.17
## - CreditScore 1 262.57 330.57
## - MaritalStatus 2 264.99 330.99
## - Income 1 274.17 342.17
## - PriorDefault 1 391.63 459.63
##
## Step: AIC=315.26
## Approved ~ Gender + Age + Debt + MaritalStatus + Ethnicity +
## YearsEmployed + PriorDefault + Employed + CreditScore + DriversLicense +
## Citizen + Income
##
## Df Deviance AIC
## - Citizen 2 271.30 311.30
## - Gender 2 271.31 311.31
## - Debt 1 271.29 313.29
## - Age 1 271.80 313.80
## - Ethnicity 7 283.95 313.95
## - DriversLicense 1 272.49 314.49
## <none> 271.26 315.26
## - CreditScore 1 273.88 315.88
## - Employed 1 274.20 316.20
## - YearsEmployed 1 275.13 317.13
## - MaritalStatus 2 284.03 324.03
## - Income 1 288.62 330.62
## - PriorDefault 1 410.43 452.43
##
## Step: AIC=311.3
## Approved ~ Gender + Age + Debt + MaritalStatus + Ethnicity +
## YearsEmployed + PriorDefault + Employed + CreditScore + DriversLicense +
## Income
##
## Df Deviance AIC
## - Gender 2 271.34 307.34
## - Debt 1 271.33 309.33
## - Age 1 271.85 309.85
## - Ethnicity 7 284.34 310.34
## - DriversLicense 1 272.50 310.50
## <none> 271.30 311.30
## - CreditScore 1 273.89 311.89
## - Employed 1 274.68 312.68
## - YearsEmployed 1 275.16 313.16
## - MaritalStatus 2 285.88 321.88
## - Income 1 289.05 327.05
## - PriorDefault 1 410.54 448.54
##
## Step: AIC=307.34
## Approved ~ Age + Debt + MaritalStatus + Ethnicity + YearsEmployed +
## PriorDefault + Employed + CreditScore + DriversLicense +
## Income
##
## Df Deviance AIC
## - Debt 1 271.37 305.37
## - Age 1 271.95 305.95
## - DriversLicense 1 272.53 306.53
## - Ethnicity 7 284.65 306.65
## <none> 271.34 307.34
## - CreditScore 1 273.90 307.90
## - Employed 1 274.73 308.73
## - YearsEmployed 1 275.25 309.25
## - MaritalStatus 2 286.46 318.46
## - Income 1 289.06 323.06
## - PriorDefault 1 410.87 444.87
##
## Step: AIC=305.37
## Approved ~ Age + MaritalStatus + Ethnicity + YearsEmployed +
## PriorDefault + Employed + CreditScore + DriversLicense +
## Income
##
## Df Deviance AIC
## - Age 1 271.99 303.99
## - DriversLicense 1 272.58 304.58
## - Ethnicity 7 284.66 304.66
## <none> 271.37 305.37
## - CreditScore 1 274.08 306.08
## - Employed 1 274.73 306.73
## - YearsEmployed 1 275.44 307.44
## - MaritalStatus 2 286.49 316.49
## - Income 1 289.19 321.19
## - PriorDefault 1 414.14 446.14
##
## Step: AIC=303.99
## Approved ~ MaritalStatus + Ethnicity + YearsEmployed + PriorDefault +
## Employed + CreditScore + DriversLicense + Income
##
## Df Deviance AIC
## - Ethnicity 7 285.10 303.10
## - DriversLicense 1 273.16 303.16
## <none> 271.99 303.99
## - CreditScore 1 274.85 304.85
## - Employed 1 275.20 305.20
## - YearsEmployed 1 277.38 307.38
## - MaritalStatus 2 286.96 314.96
## - Income 1 289.56 319.56
## - PriorDefault 1 420.07 450.07
##
## Step: AIC=303.1
## Approved ~ MaritalStatus + YearsEmployed + PriorDefault + Employed +
## CreditScore + DriversLicense + Income
##
## Df Deviance AIC
## - DriversLicense 1 286.27 302.27
## <none> 285.10 303.10
## - CreditScore 1 287.53 303.53
## - Employed 1 288.91 304.91
## - YearsEmployed 1 291.16 307.16
## - Income 1 302.88 318.88
## - MaritalStatus 2 311.70 325.70
## - PriorDefault 1 440.32 456.32
##
## Step: AIC=302.27
## Approved ~ MaritalStatus + YearsEmployed + PriorDefault + Employed +
## CreditScore + Income
##
## Df Deviance AIC
## <none> 286.27 302.27
## - CreditScore 1 288.46 302.46
## - Employed 1 290.43 304.43
## - YearsEmployed 1 291.58 305.58
## - Income 1 304.42 318.42
## - MaritalStatus 2 313.79 325.79
## - PriorDefault 1 441.39 455.39
pred.lr_backward.te.prob <- predict(model.lr.backward, df.te, type = "response")
pred.lr_backward.te.label <- ifelse(pred.lr_backward.te.prob > 0.5, "Approved", "Rejected")
glm(formula = Approved ~ 1, data = df.tr %>%
mutate(Approved = case_when(Approved == "Rejected" ~ 0,
T ~ 1)), family = "binomial") -> model.lr.empty
model.lr.forward <- step(model.lr.empty, direction = "forward",
scope = formula(Approved ~ Age + Debt + YearsEmployed +
CreditScore + Income + Gender + MaritalStatus +
`Bank Customer` + `Education Level` +
Ethnicity + PriorDefault + Employed))
## Start: AIC=665
## Approved ~ 1
##
## Df Deviance AIC
## + PriorDefault 1 369.56 373.56
## + CreditScore 1 535.20 539.20
## + Employed 1 565.47 569.47
## + YearsEmployed 1 590.77 594.77
## + Income 1 597.06 601.06
## + `Education Level` 14 596.48 626.48
## + Debt 1 638.31 642.31
## + MaritalStatus 2 641.92 647.92
## + `Bank Customer` 2 641.92 647.92
## + Age 1 644.40 648.40
## + Ethnicity 7 636.75 652.75
## <none> 663.00 665.00
## + Gender 2 659.68 665.68
##
## Step: AIC=373.56
## Approved ~ PriorDefault
##
## Df Deviance AIC
## + MaritalStatus 2 336.92 344.92
## + `Bank Customer` 2 336.92 344.92
## + Income 1 340.10 346.10
## + CreditScore 1 345.34 351.34
## + Employed 1 348.88 354.88
## + Ethnicity 7 344.52 362.52
## + `Education Level` 14 331.31 363.31
## + YearsEmployed 1 360.55 366.55
## <none> 369.56 373.56
## + Debt 1 367.92 373.92
## + Age 1 369.18 375.18
## + Gender 2 369.52 377.52
##
## Step: AIC=344.92
## Approved ~ PriorDefault + MaritalStatus
##
## Df Deviance AIC
## + Income 1 311.65 321.65
## + CreditScore 1 313.29 323.29
## + Employed 1 313.68 323.68
## + YearsEmployed 1 329.61 339.61
## + Ethnicity 7 322.91 344.91
## <none> 336.92 344.92
## + Debt 1 335.44 345.44
## + Age 1 336.75 346.75
## + `Education Level` 14 310.82 346.82
## + Gender 2 336.90 348.90
##
## Step: AIC=321.65
## Approved ~ PriorDefault + MaritalStatus + Income
##
## Df Deviance AIC
## + Employed 1 294.89 306.89
## + CreditScore 1 295.45 307.45
## + YearsEmployed 1 303.62 315.62
## + Ethnicity 7 296.21 320.21
## <none> 311.65 321.65
## + Debt 1 310.94 322.94
## + Age 1 311.52 323.52
## + `Education Level` 14 287.22 325.22
## + Gender 2 311.41 325.41
##
## Step: AIC=306.89
## Approved ~ PriorDefault + MaritalStatus + Income + Employed
##
## Df Deviance AIC
## + YearsEmployed 1 288.46 302.46
## + CreditScore 1 291.58 305.58
## <none> 294.89 306.89
## + Ethnicity 7 281.51 307.51
## + Debt 1 294.45 308.45
## + Age 1 294.81 308.81
## + Gender 2 294.44 310.44
## + `Education Level` 14 274.82 314.82
##
## Step: AIC=302.46
## Approved ~ PriorDefault + MaritalStatus + Income + Employed +
## YearsEmployed
##
## Df Deviance AIC
## + CreditScore 1 286.27 302.27
## <none> 288.46 302.46
## + Ethnicity 7 275.80 303.80
## + Age 1 288.14 304.14
## + Debt 1 288.45 304.45
## + Gender 2 288.33 306.33
## + `Education Level` 14 271.20 313.20
##
## Step: AIC=302.27
## Approved ~ PriorDefault + MaritalStatus + Income + Employed +
## YearsEmployed + CreditScore
##
## Df Deviance AIC
## <none> 286.27 302.27
## + Ethnicity 7 273.16 303.16
## + Age 1 285.82 303.82
## + Debt 1 286.27 304.27
## + Gender 2 286.10 306.10
## + `Education Level` 14 268.88 312.88
pred.lr_forward.te.prob <- predict(model.lr.forward, df.te, type = "response")
pred.lr_forward.te.label <- ifelse(pred.lr_forward.te.prob > 0.5, "Approved", "Rejected")
randomForest(df.tr %>% select(-Approved),
df.tr$Approved %>% factor(levels = c("Rejected", "Approved")),
ntree = 500) -> model.rf
pred.rf.te.prob <- predict(model.rf, df.te, type = "prob")
pred.rf.te.label <- predict(model.rf, df.te, type = "class")
library(baguette)
## Warning: 程序包'baguette'是用R版本4.4.2 来建造的
## 载入需要的程序包:parsnip
## Warning: 程序包'parsnip'是用R版本4.4.2 来建造的
bagger(Approved ~ .,
data = df.tr %>%
mutate(Approved = factor(Approved,
levels = c("Rejected",
"Approved")))) -> model.bagging
pred.bagging.te.label <- predict(model.bagging, df.te)$`.pred_class`
set.seed(1)
data.tr.matrix <- data.matrix(df.tr.dummy %>%
select(-Approved) %>% scale())
data.tr.xgb <- xgb.DMatrix(data = data.tr.matrix,
label = ifelse(df.tr$Approved == "Approved", 1, 0))
data.te.matrix <- data.matrix(df.te.dummy %>%
select(-Approved) %>% scale())
data.te.xgb <- xgb.DMatrix(data = data.te.matrix,
label = ifelse(df.te$Approved == "Approved", 1, 0))
param <- list(booster = "gblinear", objective = "multi:softprob",
num_class = 2, lambda = 0.0001, alpha = 0.0001, nthread = 1)
xgb.train(param, data.tr.xgb, list(tr=data.tr.xgb), nrounds = 50,
eta = 0.5, verbose = F,
callbacks = list(cb.gblinear.history())) -> model.xgb
predict(model.xgb, data.te.xgb, type = "prob") %>%
matrix(ncol = 2, byrow = T) -> res.df
res.df[,2] -> pred.xgb.te.prob
ifelse(pred.xgb.te.prob > 0.5, "Approved", "Rejected") -> pred.xgb.te.label
data.frame(`Linear Probability Model` = pred.lm.te.label,
`Logistic Regression Model` = pred.lr.te.label,
`Linear Discriminant Analysis Model` = pred.lda.te.label,
`Quadratic Discriminant Analysis Model` = pred.qda.te.label,
`Lasso Regression Model` = pred.lasso.te.label,
`Ridge Regression Model` = pred.ridge.te.label,
`Principal Component Regression` = pred.pcr.te.label,
`Partial Least Squares` = pred.plsr.te.label,
`Spline Model` = pred.spline.te.label,
`Polynomial Model` = pred.poly.te.label,
`Linear Model Backward Selection` = pred.lm_backward.te.label,
`Linear Model Forward Selection` = pred.lm_forward.te.label,
`Logistic Model Backward Selection` = pred.lr_backward.te.label,
`Logistic Model Forward Selection` = pred.lr_forward.te.label,
`Random Forest` = pred.rf.te.label,
`Bagging` = pred.bagging.te.label,
`Boosting` = pred.xgb.te.label) %>%
mutate(Index = idx.te) %>%
mutate(Real = df.te$Approved) %>%
gather(key = "Model", value = "Predicted", -Index, -Real) %>%
mutate(Model = str_replace_all(Model, "\\.", " ")) %>%
select(Index, Model, Real, Predicted) %>%
group_by(Model) %>%
summarise(Accuracy = sum(Real == Predicted) / n()) %>%
mutate(Accuracy = sprintf("%.4f%%", Accuracy * 100)) -> res.df
## Warning: attributes are not identical across measure variables; they will be
## dropped
res.df %>%
knitr::kable()
| Model | Accuracy |
|---|---|
| Bagging | 84.1346% |
| Boosting | 83.6538% |
| Lasso Regression Model | 72.1154% |
| Linear Discriminant Analysis Model | 85.5769% |
| Linear Model Backward Selection | 85.0962% |
| Linear Model Forward Selection | 85.0962% |
| Linear Probability Model | 85.5769% |
| Logistic Model Backward Selection | 87.0192% |
| Logistic Model Forward Selection | 87.0192% |
| Logistic Regression Model | 83.1731% |
| Partial Least Squares | 86.5385% |
| Polynomial Model | 84.1346% |
| Principal Component Regression | 85.5769% |
| Quadratic Discriminant Analysis Model | 79.8077% |
| Random Forest | 87.0192% |
| Ridge Regression Model | 71.1538% |
| Spline Model | 84.1346% |
res.df %>% write.csv("results.csv", row.names = F)
model.lm <- lm(formula = Approved ~ ., data = df.tr %>%
mutate(Approved = case_when(Approved == "Rejected" ~ 0,
T ~ 1)))
model.lm.sum <- summary(model.lm)
model.lm.sum$coefficients %>%
data.frame() %>%
rownames_to_column(var = "Variable") %>%
rename("p_value" = "Pr...t..",
"t_value" = "t.value",
"std_error" = "Std..Error") %>%
filter(p_value < 0.1) %>%
mutate(Estimate = round(Estimate, 4),
std_error = round(std_error, 4),
t_value = round(t_value, 4),
p_value = round(p_value, 4)) %>%
write.csv("significant_variables_in_linear_model.csv", row.names = F)
\[ \begin{aligned} \text{Linear Probability(Approved)} = &\beta_0 + \beta_1 \times \text{Gender} + \beta_2 \times \text{Age} + \beta_3 \times \text{Debt} + \\ &\beta_4 \times \text{Marital Status} + \beta_5 \times \text{Bank Customer} + \beta_6 \times \text{Education Level} + \\ &\beta_7 \times \text{Ethnicity} + \beta_8 \times \text{Years Employed} + \beta_9 \times \text{Prior Default} + \\ &\beta_{10} \times \text{Employed} + \beta_{11} \times \text{CreditScore} + \beta_{12} \times \text{Drivers License} + \\ &\beta_{13} \times \text{Citizen} + \beta_{14} \times \text{Income} \end{aligned} \]