This report builds and evaluates three predictive models on the cleaned loan approval dataset. Each model is assessed using a consistent set of outputs — summary/coefficients, confusion matrix, feature importance, and ROC/AUC — making it easy to compare performance and select the best approach for the business question.
Models covered: logistic regression, decision tree, and random forest.
library(tidyverse)
library(plotly)
library(kableExtra)
library(scales)
library(caret) # confusion matrix, train/test split
library(rpart) # decision tree
library(rpart.plot) # tree visualization
library(randomForest) # random forest
library(pROC) # ROC curves & AUC

# Shared colour palette used by the charts and highlighted table rows below.
# COL_APPROVED previously sat on the same line as the comment above, so it was
# parsed as comment text and never defined.
COL_APPROVED <- "#1E5FAD"
COL_REJECTED <- "#F4A62A"
COL_NAVY <- "#0D1B3E"
COL_LIGHT <- "#C8DEFF"
# Shared ggplot2 theme for the charts in this report.
#
# Starts from theme_minimal() at base size 13 and overrides: bold navy plot
# title (size 14), axis titles and strip text; slate-grey subtitle (size 11)
# and axis text; legend on top; no minor grid lines.
#
# Returns a theme object to be added to a plot with `+`.
theme_datathon <- function() {
  slate <- "#64748B"
  bold_navy <- function(...) {
    element_text(face = "bold", color = COL_NAVY, ...)
  }

  overrides <- theme(
    plot.title = bold_navy(size = 14),
    plot.subtitle = element_text(color = slate, size = 11),
    axis.text = element_text(color = slate),
    axis.title = bold_navy(),
    legend.position = "top",
    panel.grid.minor = element_blank(),
    strip.text = bold_navy()
  )

  theme_minimal(base_size = 13) + overrides
}
## Dataset loaded: 4269 rows × 17 columns
# Build the modeling table: the target (loan_status) plus the predictors
# listed below. Engineered bucket columns (redundant with their numeric
# versions) and ID-like columns are left out, and any row with a missing
# value is dropped.
model_df <- df %>%
  select(all_of(c(
    "loan_status",
    "cibil_score",
    "income_annum",
    "loan_amount",
    "loan_term",
    "no_of_dependents",
    "education",
    "self_employed",
    "residential_assets_value",
    "commercial_assets_value",
    "luxury_assets_value",
    "bank_asset_value",
    "debt_to_income",
    "total_assets",
    "loan_to_asset"
  ))) %>%
  drop_na()

# Report the size of the table that goes into modeling.
cat(
  "Modeling dataset:", dim(model_df)[1],
  "rows ×", dim(model_df)[2], "columns\n"
)
## Modeling dataset: 4269 rows × 15 columns
## Target distribution:
##
## Rejected Approved
## 1613 2656
# Train/test split: createDataPartition() selects 75% of the rows (p = 0.75)
# for training; every row not selected becomes the test set.
# NOTE(review): caret documents this sampling as stratified on the outcome
# passed in (loan_status), so both sets should keep the class balance — confirm.
# The seed must be set immediately before the partition call for the split to
# be reproducible.
set.seed(2026) # reproducible split
# list = FALSE returns the selected row indices as a matrix instead of a list
train_idx <- createDataPartition(model_df$loan_status, p = 0.75, list = FALSE)
train_df <- model_df[train_idx, ]
# negative indexing keeps exactly the rows that were not drawn for training
test_df <- model_df[-train_idx, ]
cat("Training rows:", nrow(train_df), "\n")## Training rows: 3202
## Testing rows: 1067
A transparent, interpretable baseline. Coefficients directly explain the direction and magnitude of each feature’s influence on approval probability.
# Baseline model: binomial GLM with a logit link, regressing loan_status on
# every other column of train_df (`~ .`).
# NOTE(review): with a two-level factor response glm() models the probability
# of the second level — presumably "Approved", given the level order shown in
# the target distribution table; confirm.
# NOTE(review): the summary below reports "1 not defined because of
# singularities" and an NA row for total_assets — presumably total_assets is an
# exact linear combination of the individual asset columns; confirm, and
# consider dropping it from this model.
logit_model <- glm(
loan_status ~ .,
data = train_df,
family = binomial(link = "logit")
)
summary(logit_model)##
## Call:
## glm(formula = loan_status ~ ., family = binomial(link = "logit"),
## data = train_df)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.402e+01 9.454e-01 -14.828 < 2e-16 ***
## cibil_score 2.502e-02 9.784e-04 25.569 < 2e-16 ***
## income_annum 1.028e-07 1.725e-07 0.596 0.55133
## loan_amount 4.647e-09 4.173e-08 0.111 0.91133
## loan_term -1.597e-01 1.336e-02 -11.952 < 2e-16 ***
## no_of_dependents -3.231e-02 4.016e-02 -0.805 0.42102
## educationNot Graduate -2.027e-01 1.372e-01 -1.477 0.13974
## self_employedYes 8.000e-02 1.367e-01 0.585 0.55836
## residential_assets_value -5.214e-08 1.968e-08 -2.649 0.00807 **
## commercial_assets_value -5.547e-08 2.463e-08 -2.252 0.02429 *
## luxury_assets_value -1.210e-09 2.436e-08 -0.050 0.96037
## bank_asset_value -1.438e-08 4.135e-08 -0.348 0.72803
## debt_to_income 1.742e+00 3.088e-01 5.641 1.69e-08 ***
## total_assets NA NA NA NA
## loan_to_asset -4.783e+00 1.186e+00 -4.031 5.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4246.0 on 3201 degrees of freedom
## Residual deviance: 1406.2 on 3188 degrees of freedom
## AIC: 1434.2
##
## Number of Fisher Scoring iterations: 7
# Tidy coefficient table for the logistic model: one row per term with the
# log-odds estimate, standard error, z-statistic, p-value, odds ratio
# (exp(estimate)) and a 5% significance flag.
logit_coef <- broom::tidy(logit_model) %>%
  mutate(
    # Derive both new columns from the unrounded values. Flagging significance
    # after rounding p to 4 d.p. would misclassify borderline terms
    # (e.g. p = 0.04996 rounds to 0.05 and would lose its tick).
    # Terms without an estimate (NA p-value) keep an NA flag.
    odds_ratio = round(exp(estimate), 4),
    significant = if_else(p.value < 0.05, "✓", ""),
    # Round the existing columns for display only.
    estimate = round(estimate, 4),
    std.error = round(std.error, 4),
    statistic = round(statistic, 3),
    p.value = round(p.value, 4)
  ) %>%
  # Sort on the displayed (rounded) p-value, as before.
  arrange(p.value)

# Render the table and shade the rows flagged as significant; which() skips
# the NA flag of undefined terms.
logit_coef %>%
  kable(col.names = c(
    "Term", "Estimate (log-odds)", "Std Error",
    "z-statistic", "p-value", "Odds Ratio", "Significant"
  )) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(which(logit_coef$significant == "✓"), background = "#C8DEFF")

| Term | Estimate (log-odds) | Std Error | z-statistic | p-value | Odds Ratio | Significant |
|---|---|---|---|---|---|---|
| (Intercept) | -14.0179 | 0.9454 | -14.828 | 0.0000 | 0.0000 | ✓ |
| cibil_score | 0.0250 | 0.0010 | 25.569 | 0.0000 | 1.0253 | ✓ |
| loan_term | -0.1597 | 0.0134 | -11.952 | 0.0000 | 0.8524 | ✓ |
| debt_to_income | 1.7423 | 0.3088 | 5.641 | 0.0000 | 5.7102 | ✓ |
| loan_to_asset | -4.7828 | 1.1864 | -4.031 | 0.0001 | 0.0084 | ✓ |
| residential_assets_value | 0.0000 | 0.0000 | -2.649 | 0.0081 | 1.0000 | ✓ |
| commercial_assets_value | 0.0000 | 0.0000 | -2.252 | 0.0243 | 1.0000 | ✓ |
| educationNot Graduate | -0.2027 | 0.1372 | -1.477 | 0.1397 | 0.8165 | |
| no_of_dependents | -0.0323 | 0.0402 | -0.805 | 0.4210 | 0.9682 | |
| income_annum | 0.0000 | 0.0000 | 0.596 | 0.5513 | 1.0000 | |
| self_employedYes | 0.0800 | 0.1367 | 0.585 | 0.5584 | 1.0833 | |
| bank_asset_value | 0.0000 | 0.0000 | -0.348 | 0.7280 | 1.0000 | |
| loan_amount | 0.0000 | 0.0000 | 0.111 | 0.9113 | 1.0000 | |
| luxury_assets_value | 0.0000 | 0.0000 | -0.050 | 0.9604 | 1.0000 | |
| total_assets | NA | NA | NA | NA | NA | NA |
# Score the test set with the logistic model, label a row "Approved" when its
# predicted probability exceeds 0.5, and compare against the actual
# loan_status with "Approved" as the positive class.
logit_probs <- predict(logit_model, newdata = test_df, type = "response")
logit_preds <- if_else(logit_probs > 0.5, "Approved", "Rejected") %>%
  factor(levels = c("Rejected", "Approved"))
logit_cm <- confusionMatrix(
  data = logit_preds,
  reference = test_df$loan_status,
  positive = "Approved"
)
# Heat-map view of the logistic confusion matrix (predicted vs. actual
# counts), with overall accuracy in the subtitle.
cm_tbl <- as.data.frame(logit_cm$table)

ggplot(cm_tbl, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(
    aes(label = Freq),
    size = 7, fontface = "bold", color = "white"
  ) +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  labs(
    title = "Logistic Regression — Confusion Matrix",
    subtitle = paste0(
      "Accuracy: ",
      round(logit_cm$overall["Accuracy"] * 100, 1), "%"
    ),
    x = "Actual",
    y = "Predicted"
  ) +
  theme_datathon() +
  theme(legend.position = "none")

# Headline test-set metrics for the logistic model, rounded to 4 d.p.
# This statement used to be fused onto the end of the plot above
# (`theme(...)tibble(`), which does not parse.
tibble(
  Metric = c(
    "Accuracy", "Sensitivity (Recall)", "Specificity", "Precision", "F1 Score"
  ),
  Value = c(
    round(logit_cm$overall["Accuracy"], 4),
    round(logit_cm$byClass["Sensitivity"], 4),
    round(logit_cm$byClass["Specificity"], 4),
    round(logit_cm$byClass["Precision"], 4),
    round(logit_cm$byClass["F1"], 4)
  )
) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

| Metric | Value |
|---|---|
| Accuracy | 0.9128 |
| Sensitivity (Recall) | 0.9337 |
| Specificity | 0.8784 |
| Precision | 0.9268 |
| F1 Score | 0.9302 |
# ROC curve and AUC for the logistic model on the test set.
# The response is recoded to 0/1 (1 = "Approved"). levels and direction are
# stated explicitly so pROC cannot auto-select the comparison direction, which
# would silently flip the curve of a worse-than-random model and report
# 1 - AUC instead.
logit_roc <- roc(
  response = as.numeric(test_df$loan_status == "Approved"),
  predictor = logit_probs,
  levels = c(0, 1), # control = 0, case = 1 ("Approved")
  direction = "<" # controls are expected to score lower than cases
)
logit_auc <- round(auc(logit_roc), 4)

# One row per threshold, kept for the combined ROC chart later on.
roc_df <- data.frame(
  specificity = logit_roc$specificities,
  sensitivity = logit_roc$sensitivities
)

p_roc_logit <- ggplot(roc_df, aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(color = COL_APPROVED, linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  annotate(
    "text",
    x = 0.65, y = 0.15,
    label = paste0("AUC = ", logit_auc),
    size = 5, fontface = "bold", color = COL_NAVY
  ) +
  labs(
    title = "Logistic Regression — ROC Curve",
    subtitle = "Diagonal = random classifier baseline",
    x = "1 − Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)"
  ) +
  theme_datathon()

ggplotly(p_roc_logit)

AUC = 0.9742 — values above 0.90 indicate excellent discrimination.
Rule-based and highly explainable — ideal for presenting to a non-technical audience. The tree diagram tells the story visually without needing statistical interpretation.
# Draw the fitted decision tree.
# NOTE(review): tree_model is not defined in the visible part of this file —
# presumably an rpart fit on train_df; confirm.
rpart.plot(
  tree_model,
  type = 4,
  extra = 104, # show probability + % of obs
  fallen.leaves = TRUE,
  main = "Decision Tree — Loan Approval",
  box.palette = list(COL_REJECTED, COL_APPROVED),
  shadow.col = "gray80",
  cex = 0.75
)

# Variable-importance table for the tree: one row per feature with its
# importance score and its share of the total, largest first.
# This assignment used to be fused onto the closing `)` of rpart.plot() above.
tree_imp <- tree_model$variable.importance %>%
  enframe(name = "feature", value = "importance") %>%
  mutate(
    # Compute the share from the unrounded scores, then round for display;
    # the previous order derived percentages from already-rounded values.
    pct = round(importance / sum(importance) * 100, 1),
    importance = round(importance, 2)
  ) %>%
  arrange(desc(importance))
# Horizontal bar chart of decision-tree feature importance, labelled with each
# feature's percentage share.
p_tree_imp <- ggplot(
  tree_imp,
  aes(x = reorder(feature, importance), y = importance, fill = importance)
) +
  geom_col(show.legend = FALSE, alpha = 0.9) +
  geom_text(
    aes(label = paste0(pct, "%")),
    hjust = -0.15, size = 3.5, color = COL_NAVY
  ) +
  coord_flip() +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  # extra room on the value axis so the percentage labels are not clipped
  scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +
  labs(
    title = "Decision Tree — Feature Importance",
    subtitle = "Relative contribution to splitting decisions",
    x = NULL,
    y = "Importance Score"
  ) +
  theme_datathon()

ggplotly(p_tree_imp, tooltip = c("x", "y"))

# Score the test set with the tree: probability of "Approved", a 0.5 cut-off
# for the class label, and the confusion matrix with "Approved" as positive.
# These statements used to be fused onto the ggplotly() call above.
tree_probs <- predict(tree_model, test_df, type = "prob")[, "Approved"]
tree_preds <- factor(
  if_else(tree_probs > 0.5, "Approved", "Rejected"),
  levels = c("Rejected", "Approved")
)
tree_cm <- confusionMatrix(tree_preds, test_df$loan_status, positive = "Approved")
# Heat-map view of the decision-tree confusion matrix, with overall accuracy
# in the subtitle.
cm_tbl_tree <- as.data.frame(tree_cm$table)

ggplot(cm_tbl_tree, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = Freq), size = 7, fontface = "bold", color = "white") +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  labs(
    title = "Decision Tree — Confusion Matrix",
    subtitle = paste0(
      "Accuracy: ",
      round(tree_cm$overall["Accuracy"] * 100, 1), "%"
    ),
    x = "Actual", y = "Predicted"
  ) +
  theme_datathon() +
  theme(legend.position = "none")

# ROC curve and AUC for the tree on the test set (1 = "Approved").
# This used to be fused onto the plot above (`theme(...)tree_roc <- roc(`).
# levels and direction are explicit so pROC cannot auto-flip the comparison
# direction and overstate the AUC.
tree_roc <- roc(
  response = as.numeric(test_df$loan_status == "Approved"),
  predictor = tree_probs,
  levels = c(0, 1), # control = 0, case = 1 ("Approved")
  direction = "<" # controls are expected to score lower than cases
)
tree_auc <- round(auc(tree_roc), 4)

# One row per threshold, kept for the combined ROC chart later on.
roc_df_tree <- data.frame(
  specificity = tree_roc$specificities,
  sensitivity = tree_roc$sensitivities
)
# ROC chart for the decision tree, with the AUC annotated and the diagonal of
# a random classifier for reference.
p_roc_tree <- ggplot(roc_df_tree, aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(color = COL_REJECTED, linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  annotate(
    "text",
    x = 0.65, y = 0.15,
    label = paste0("AUC = ", tree_auc),
    size = 5, fontface = "bold", color = COL_NAVY
  ) +
  labs(
    title = "Decision Tree — ROC Curve",
    subtitle = "Diagonal = random classifier baseline",
    x = "1 − Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)"
  ) +
  theme_datathon()

# The call below used to be fused with the rendered AUC text, which does not
# parse as R.
ggplotly(p_roc_tree)

AUC = 0.9999
Ensemble of trees — highest predictive accuracy. Feature importance is aggregated across all trees, giving a more reliable signal than a single tree.
# Random forest on every predictor in train_df: 500 trees, with mtry set to
# floor(sqrt(p)) where p is the number of predictor columns (ncol minus the
# target). importance = TRUE is requested so that MeanDecreaseAccuracy is
# available next to MeanDecreaseGini in the importance table built below.
# The seed is reset right before fitting so the forest is reproducible.
set.seed(2026)
rf_model <- randomForest(
loan_status ~ .,
data = train_df,
ntree = 500,
mtry = floor(sqrt(ncol(train_df) - 1)), # default: sqrt(p) for classification
importance = TRUE
)
print(rf_model)##
## Call:
## randomForest(formula = loan_status ~ ., data = train_df, ntree = 500, mtry = floor(sqrt(ncol(train_df) - 1)), importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 0.09%
## Confusion matrix:
## Rejected Approved class.error
## Rejected 1207 3 0.002479339
## Approved 0 1992 0.000000000
# Random-forest importance table: one row per feature with its mean decrease
# in accuracy and in Gini impurity, rounded to 2 d.p. and sorted by Gini.
rf_imp <- importance(rf_model) %>%
  as.data.frame() %>%
  rownames_to_column("feature") %>%
  select(feature, MeanDecreaseAccuracy, MeanDecreaseGini) %>%
  arrange(desc(MeanDecreaseGini)) %>%
  mutate(across(where(is.numeric), ~ round(., 2)))

# Render the table and shade the top three features. The row range is capped
# at the table length so a model with fewer than three predictors still works.
rf_imp %>%
  kable(col.names = c(
    "Feature", "Mean Decrease Accuracy", "Mean Decrease Gini"
  )) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(seq_len(min(3, nrow(rf_imp))), background = "#C8DEFF")

| Feature | Mean Decrease Accuracy | Mean Decrease Gini |
|---|---|---|
| cibil_score | 275.68 | 1185.60 |
| loan_term | 110.73 | 113.07 |
| debt_to_income | 47.53 | 52.83 |
| loan_to_asset | 19.21 | 35.18 |
| loan_amount | 14.56 | 18.54 |
| total_assets | 12.32 | 16.64 |
| luxury_assets_value | 11.04 | 16.35 |
| residential_assets_value | 6.62 | 14.86 |
| commercial_assets_value | 7.97 | 14.60 |
| income_annum | 11.81 | 13.42 |
| bank_asset_value | 9.30 | 13.35 |
| no_of_dependents | 0.48 | 6.61 |
| education | -0.77 | 1.95 |
| self_employed | -1.51 | 1.93 |
# Horizontal bar chart of random-forest feature importance (mean decrease in
# Gini impurity).
p_rf_imp <- rf_imp %>%
  ggplot(aes(
    x = reorder(feature, MeanDecreaseGini),
    y = MeanDecreaseGini,
    fill = MeanDecreaseGini
  )) +
  geom_col(show.legend = FALSE, alpha = 0.9) +
  coord_flip() +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "Random Forest — Feature Importance (Gini)",
    subtitle = "Higher = more important to splitting decisions across all 500 trees",
    x = NULL,
    y = "Mean Decrease in Gini Impurity"
  ) +
  theme_datathon()

ggplotly(p_rf_imp, tooltip = c("x", "y"))

# Score the test set with the forest: probability of "Approved" for the ROC
# curve, predicted class for the confusion matrix ("Approved" = positive).
# These statements used to be fused onto the ggplotly() call above.
rf_probs <- predict(rf_model, test_df, type = "prob")[, "Approved"]
rf_preds <- predict(rf_model, test_df, type = "class")
rf_cm <- confusionMatrix(rf_preds, test_df$loan_status, positive = "Approved")
cm_tbl_rf <- as.data.frame(rf_cm$table)
# Heat-map view of the random-forest confusion matrix, with overall accuracy
# in the subtitle.
ggplot(cm_tbl_rf, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = Freq), size = 7, fontface = "bold", color = "white") +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  labs(
    title = "Random Forest — Confusion Matrix",
    subtitle = paste0(
      "Accuracy: ",
      round(rf_cm$overall["Accuracy"] * 100, 1), "%"
    ),
    x = "Actual", y = "Predicted"
  ) +
  theme_datathon() +
  theme(legend.position = "none")

# ROC curve and AUC for the forest on the test set (1 = "Approved").
# This used to be fused onto the plot above (`theme(...)rf_roc <- roc(`).
# levels and direction are explicit so pROC cannot auto-flip the comparison
# direction and overstate the AUC.
rf_roc <- roc(
  response = as.numeric(test_df$loan_status == "Approved"),
  predictor = rf_probs,
  levels = c(0, 1), # control = 0, case = 1 ("Approved")
  direction = "<" # controls are expected to score lower than cases
)
rf_auc <- round(auc(rf_roc), 4)

# One row per threshold, kept for the combined ROC chart later on.
roc_df_rf <- data.frame(
  specificity = rf_roc$specificities,
  sensitivity = rf_roc$sensitivities
)
# ROC chart for the random forest, with the AUC annotated and the diagonal of
# a random classifier for reference.
p_roc_rf <- ggplot(roc_df_rf, aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(color = COL_NAVY, linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  annotate(
    "text",
    x = 0.65, y = 0.15,
    label = paste0("AUC = ", rf_auc),
    size = 5, fontface = "bold", color = COL_NAVY
  ) +
  labs(
    title = "Random Forest — ROC Curve",
    subtitle = "Diagonal = random classifier baseline",
    x = "1 − Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)"
  ) +
  theme_datathon()

# The call below used to be fused with the rendered AUC text, which does not
# parse as R.
ggplotly(p_roc_rf)

AUC = 1
# Side-by-side comparison of the three models on the test set: accuracy,
# sensitivity and specificity as percentages (1 d.p.), AUC as computed above,
# and a qualitative interpretability rating.
comparison <- tibble(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest"),
  Accuracy = c(
    round(logit_cm$overall["Accuracy"] * 100, 1),
    round(tree_cm$overall["Accuracy"] * 100, 1),
    round(rf_cm$overall["Accuracy"] * 100, 1)
  ),
  AUC = c(logit_auc, tree_auc, rf_auc),
  Sensitivity = c(
    round(logit_cm$byClass["Sensitivity"] * 100, 1),
    round(tree_cm$byClass["Sensitivity"] * 100, 1),
    round(rf_cm$byClass["Sensitivity"] * 100, 1)
  ),
  Specificity = c(
    round(logit_cm$byClass["Specificity"] * 100, 1),
    round(tree_cm$byClass["Specificity"] * 100, 1),
    round(rf_cm$byClass["Specificity"] * 100, 1)
  ),
  Interpretability = c("High", "High", "Medium")
)

# Highlight the row with the highest AUC (the first one if several tie).
best_row <- which.max(comparison$AUC)

# The row_spec() call used to be fused with the rendered table header, which
# does not parse as R.
comparison %>%
  kable(col.names = c(
    "Model", "Accuracy (%)", "AUC",
    "Sensitivity (%)", "Specificity (%)", "Interpretability"
  )) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(best_row, bold = TRUE, background = "#C8DEFF")

| Model | Accuracy (%) | AUC | Sensitivity (%) | Specificity (%) | Interpretability |
|---|---|---|---|---|---|
| Logistic Regression | 91.3 | 0.9742 | 93.4 | 87.8 | High |
| Decision Tree | 99.9 | 0.9999 | 100.0 | 99.8 | High |
| Random Forest | 100.0 | 1.0000 | 100.0 | 100.0 | Medium |
# Overlay the three test-set ROC curves on one chart for easy comparison.
# Each legend entry carries the model's AUC; the same label strings key both
# the Model column and the manual colour scale.
roc_labels <- c(
  paste0("Logistic Regression (AUC = ", logit_auc, ")"),
  paste0("Decision Tree (AUC = ", tree_auc, ")"),
  paste0("Random Forest (AUC = ", rf_auc, ")")
)
roc_palette <- setNames(
  c(COL_APPROVED, COL_REJECTED, COL_NAVY),
  roc_labels
)

roc_all <- bind_rows(
  mutate(roc_df, Model = roc_labels[1]),
  mutate(roc_df_tree, Model = roc_labels[2]),
  mutate(roc_df_rf, Model = roc_labels[3])
)

p_roc_all <- ggplot(
  roc_all,
  aes(x = 1 - specificity, y = sensitivity, color = Model)
) +
  geom_line(linewidth = 1.1) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  scale_color_manual(values = roc_palette) +
  labs(
    title = "ROC Curve Comparison — All Three Models",
    subtitle = "Higher and further left = better discrimination",
    x = "1 − Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)",
    color = NULL
  ) +
  theme_datathon()
# Interactive version of the combined ROC chart.
ggplotly(p_roc_all)

# Key findings and the recommendation that follows from each, rendered as a
# table with rows 2 and 4 shaded. This statement used to be fused onto the
# ggplotly() call above (`ggplotly(p_roc_all)tibble(`), which does not parse.
tibble(
  `#` = 1:4,
  Finding = c(
    "CIBIL score is the dominant predictor across all three models",
    "Random Forest achieves the highest AUC and accuracy",
    "Logistic Regression is the most interpretable — best for explaining to judges",
    "Decision Tree provides the clearest visual narrative for a presentation"
  ),
  Recommendation = c(
    "Lead with CIBIL score in both the presentation and business recommendations",
    "Use Random Forest as the primary model for prediction performance claims",
    "Quote Logistic Regression coefficients when discussing individual feature effects",
    "Include the tree diagram in the slide deck — audiences understand it immediately"
  )
) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(c(2, 4), background = "#C8DEFF")

| # | Finding | Recommendation |
|---|---|---|
| 1 | CIBIL score is the dominant predictor across all three models | Lead with CIBIL score in both the presentation and business recommendations |
| 2 | Random Forest achieves the highest AUC and accuracy | Use Random Forest as the primary model for prediction performance claims |
| 3 | Logistic Regression is the most interpretable — best for explaining to judges | Quote Logistic Regression coefficients when discussing individual feature effects |
| 4 | Decision Tree provides the clearest visual narrative for a presentation | Include the tree diagram in the slide deck — audiences understand it immediately |
Modeling complete. Hand off results to `04_visuals.Rmd` for presentation-ready outputs.