Overview

This report builds and evaluates three predictive models on the cleaned loan approval dataset. Each model is assessed using a consistent set of outputs — summary/coefficients, confusion matrix, feature importance, and ROC/AUC — making it easy to compare performance and select the best approach for the business question.

Models covered:

  1. Logistic Regression — interpretable baseline, coefficient-driven
  2. Decision Tree — visual, rule-based, easy to explain to judges
  3. Random Forest — highest accuracy, ensemble feature importance

Setup

library(tidyverse)
library(plotly)
library(kableExtra)
library(scales)
library(caret)        # confusion matrix, train/test split
library(rpart)        # decision tree
library(rpart.plot)   # tree visualization
library(randomForest) # random forest
library(pROC)         # ROC curves & AUC
# Report palette: hex colours shared by the charts and highlighted table rows.
# Class colours, named after the two loan_status outcomes.
COL_REJECTED <- "#F4A62A"
COL_APPROVED <- "#1E5FAD"
# Gradient endpoints: light for low values, navy for high values and headings.
COL_LIGHT <- "#C8DEFF"
COL_NAVY  <- "#0D1B3E"

# Shared ggplot2 theme for every chart in the report.
#
# Starts from theme_minimal() at base size 13 and layers on the report's
# typography: navy bold titles/axis titles/facet strips, muted grey subtitle
# and axis text, legend on top, no minor grid lines.
#
# Returns a ggplot2 theme object that can be added to a plot with `+`.
theme_datathon <- function() {
  muted_text <- element_text(color = "#64748B")
  navy_bold  <- element_text(face = "bold", color = COL_NAVY)

  overrides <- theme(
    plot.title       = element_text(face = "bold", color = COL_NAVY, size = 14),
    plot.subtitle    = element_text(color = "#64748B", size = 11),
    axis.text        = muted_text,
    axis.title       = navy_bold,
    strip.text       = navy_bold,
    panel.grid.minor = element_blank(),
    legend.position  = "top"
  )

  theme_minimal(base_size = 13) + overrides
}
# Load the cleaned dataset produced by the earlier cleaning step.
# load() returns the names of the objects it restored, so verify that `df`
# really came from the file: without this check a missing object would resolve
# to the stats::df function and nrow()/ncol() would silently print nothing.
loaded_objects <- load("loan_clean.RData")
if (!("df" %in% loaded_objects && is.data.frame(df))) {
  stop("loan_clean.RData must provide a data frame named `df`", call. = FALSE)
}
cat("Dataset loaded:", nrow(df), "rows ×", ncol(df), "columns")
## Dataset loaded: 4269 rows × 17 columns

1. Preprocessing for Modeling

# Build the modeling table: the target (loan_status) plus the predictors listed
# below. Engineered bucket columns (redundant with their numeric versions) and
# ID-like columns are left out; rows with any missing value are then dropped.
model_df <- drop_na(
  select(
    df,
    all_of(c(
      "loan_status",
      "cibil_score",
      "income_annum",
      "loan_amount",
      "loan_term",
      "no_of_dependents",
      "education",
      "self_employed",
      "residential_assets_value",
      "commercial_assets_value",
      "luxury_assets_value",
      "bank_asset_value",
      "debt_to_income",
      "total_assets",
      "loan_to_asset"
    ))
  )
)

# Report the size of the table that goes into modeling.
cat("Modeling dataset:", nrow(model_df), "rows ×", ncol(model_df), "columns\n")
## Modeling dataset: 4269 rows × 15 columns
# Class balance of the target before splitting.
cat("Target distribution:\n")
## Target distribution:
print(table(model_df$loan_status))
## 
## Rejected Approved 
##     1613     2656
# Fix the RNG state immediately before sampling so the partition below is the
# same on every run; this call must stay directly ahead of createDataPartition().
set.seed(2026)  # reproducible split

# 75/25 hold-out split. caret::createDataPartition() samples within the levels
# of loan_status; list = FALSE returns the selected row numbers as a matrix
# rather than a list, which is used directly as a row index.
train_idx <- createDataPartition(model_df$loan_status, p = 0.75, list = FALSE)
train_df  <- model_df[train_idx, ]   # rows chosen for model fitting
test_df   <- model_df[-train_idx, ]  # all remaining rows, used only for evaluation

# Sanity check on the resulting partition sizes.
cat("Training rows:", nrow(train_df), "\n")
## Training rows: 3202
cat("Testing rows: ", nrow(test_df),  "\n")
## Testing rows:  1067

2. Logistic Regression

A transparent, interpretable baseline. Each coefficient gives the direction and magnitude of a feature’s effect on the log-odds of approval; exponentiating a coefficient yields the corresponding odds ratio.

2a. Fit Model

# Baseline classifier: binomial GLM with a logit link that uses every other
# column of train_df as a predictor of loan_status.
# NOTE(review): the summary of this fit reports total_assets with an NA
# coefficient ("1 not defined because of singularities"), i.e. it is an exact
# linear combination of other predictors — presumably the four asset value
# columns; confirm, and consider excluding it from the formula.
logit_model <- glm(
  formula = loan_status ~ .,
  family  = binomial(link = "logit"),
  data    = train_df
)

# Coefficient estimates, standard errors and deviance for the fitted model.
summary(logit_model)
## 
## Call:
## glm(formula = loan_status ~ ., family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.402e+01  9.454e-01 -14.828  < 2e-16 ***
## cibil_score               2.502e-02  9.784e-04  25.569  < 2e-16 ***
## income_annum              1.028e-07  1.725e-07   0.596  0.55133    
## loan_amount               4.647e-09  4.173e-08   0.111  0.91133    
## loan_term                -1.597e-01  1.336e-02 -11.952  < 2e-16 ***
## no_of_dependents         -3.231e-02  4.016e-02  -0.805  0.42102    
## educationNot Graduate    -2.027e-01  1.372e-01  -1.477  0.13974    
## self_employedYes          8.000e-02  1.367e-01   0.585  0.55836    
## residential_assets_value -5.214e-08  1.968e-08  -2.649  0.00807 ** 
## commercial_assets_value  -5.547e-08  2.463e-08  -2.252  0.02429 *  
## luxury_assets_value      -1.210e-09  2.436e-08  -0.050  0.96037    
## bank_asset_value         -1.438e-08  4.135e-08  -0.348  0.72803    
## debt_to_income            1.742e+00  3.088e-01   5.641 1.69e-08 ***
## total_assets                     NA         NA      NA       NA    
## loan_to_asset            -4.783e+00  1.186e+00  -4.031 5.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4246.0  on 3201  degrees of freedom
## Residual deviance: 1406.2  on 3188  degrees of freedom
## AIC: 1434.2
## 
## Number of Fisher Scoring iterations: 7

2b. Coefficients Table

# Tidy coefficient table for the report, sorted by (rounded) p-value.
#
# The significance flag is derived from the unrounded p-value: flagging after
# rounding to 4 decimals would miss values such as 0.04996, which round to 0.05
# and then fail the `< 0.05` test. mutate() evaluates its arguments in order,
# so `significant` must stay above the line that rounds `p.value`.
# Odds ratios are likewise computed from the unrounded estimate.
# Terms without an estimate (NA p-value) keep an NA flag and are never
# highlighted, because which() ignores NA.
logit_coef <- broom::tidy(logit_model) %>%
  mutate(
    odds_ratio  = round(exp(estimate), 4),
    significant = if_else(p.value < 0.05, "✓", ""),
    estimate    = round(estimate, 4),
    std.error   = round(std.error, 4),
    statistic   = round(statistic, 3),
    p.value     = round(p.value, 4)
  ) %>%
  arrange(p.value)

# Render the table and shade the rows flagged as significant.
logit_coef %>%
  kable(col.names = c("Term", "Estimate (log-odds)", "Std Error",
                      "z-statistic", "p-value", "Odds Ratio", "Significant")) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(which(logit_coef$significant == "✓"), background = "#C8DEFF")
Term Estimate (log-odds) Std Error z-statistic p-value Odds Ratio Significant
(Intercept) -14.0179 0.9454 -14.828 0.0000 0.0000
cibil_score 0.0250 0.0010 25.569 0.0000 1.0253
loan_term -0.1597 0.0134 -11.952 0.0000 0.8524
debt_to_income 1.7423 0.3088 5.641 0.0000 5.7102
loan_to_asset -4.7828 1.1864 -4.031 0.0001 0.0084
residential_assets_value 0.0000 0.0000 -2.649 0.0081 1.0000
commercial_assets_value 0.0000 0.0000 -2.252 0.0243 1.0000
educationNot Graduate -0.2027 0.1372 -1.477 0.1397 0.8165
no_of_dependents -0.0323 0.0402 -0.805 0.4210 0.9682
income_annum 0.0000 0.0000 0.596 0.5513 1.0000
self_employedYes 0.0800 0.1367 0.585 0.5584 1.0833
bank_asset_value 0.0000 0.0000 -0.348 0.7280 1.0000
loan_amount 0.0000 0.0000 0.111 0.9113 1.0000
luxury_assets_value 0.0000 0.0000 -0.050 0.9604 1.0000
total_assets NA NA NA NA NA NA

2c. Confusion Matrix

# Score the hold-out set: predicted approval probability, thresholded at 0.5.
logit_probs <- predict(logit_model, test_df, type = "response")
logit_preds <- factor(if_else(logit_probs > 0.5, "Approved", "Rejected"),
                      levels = c("Rejected", "Approved"))

# Confusion matrix with "Approved" treated as the positive class.
logit_cm <- confusionMatrix(logit_preds, test_df$loan_status, positive = "Approved")

# Confusion-matrix heatmap.
# The count labels switch colour with the tile shade: white on tiles above the
# midpoint of the fill range (dark), navy on tiles below it (light). A fixed
# white label is unreadable on the light tiles that hold the small counts.
cm_tbl <- as.data.frame(logit_cm$table)
ggplot(cm_tbl, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = Freq, color = Freq > mean(range(Freq))),
            size = 7, fontface = "bold") +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  scale_color_manual(values = c(`TRUE` = "white", `FALSE` = COL_NAVY),
                     guide = "none") +
  labs(
    title    = "Logistic Regression — Confusion Matrix",
    subtitle = paste0("Accuracy: ",
                      round(logit_cm$overall["Accuracy"] * 100, 1), "%"),
    x = "Actual",
    y = "Predicted"
  ) +
  theme_datathon() +
  theme(legend.position = "none")

# Headline classification metrics taken from the caret confusion matrix object.
tibble(
  Metric    = c("Accuracy", "Sensitivity (Recall)", "Specificity", "Precision", "F1 Score"),
  Value     = c(
    round(logit_cm$overall["Accuracy"], 4),
    round(logit_cm$byClass["Sensitivity"], 4),
    round(logit_cm$byClass["Specificity"], 4),
    round(logit_cm$byClass["Precision"], 4),
    round(logit_cm$byClass["F1"], 4)
  )
) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Metric Value
Accuracy 0.9128
Sensitivity (Recall) 0.9337
Specificity 0.8784
Precision 0.9268
F1 Score 0.9302

2d. ROC Curve & AUC

# ROC curve for the logistic model on the hold-out set.
# The response is coded 1 = Approved, 0 = Rejected. Levels and direction are
# stated explicitly (controls = 0, cases = 1, higher score = case) instead of
# letting pROC auto-detect them: auto-detection can silently flip the
# comparison for a worse-than-chance model and report an AUC above 0.5.
# quiet = TRUE suppresses the "Setting levels/direction" messages.
logit_roc <- roc(
  response  = as.numeric(test_df$loan_status == "Approved"),
  predictor = logit_probs,
  levels    = c(0, 1),
  direction = "<",
  quiet     = TRUE
)

# Plain numeric AUC (drops the pROC "auc" class and its attributes).
logit_auc <- round(as.numeric(auc(logit_roc)), 4)

# Curve coordinates, reused later for the combined ROC chart.
roc_df <- data.frame(
  specificity = logit_roc$specificities,
  sensitivity = logit_roc$sensitivities
)

p_roc_logit <- ggplot(roc_df, aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(color = COL_APPROVED, linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  annotate("text", x = 0.65, y = 0.15,
           label = paste0("AUC = ", logit_auc),
           size = 5, fontface = "bold", color = COL_NAVY) +
  labs(
    title    = "Logistic Regression — ROC Curve",
    subtitle = "Diagonal = random classifier baseline",
    x        = "1 − Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)"
  ) +
  theme_datathon()

ggplotly(p_roc_logit)

AUC = 0.9742 — values above 0.90 indicate excellent discrimination.


3. Decision Tree

Rule-based and highly explainable — ideal for presenting to a non-technical audience. The tree diagram tells the story visually without needing statistical interpretation.

3a. Fit Model

# Single classification tree on all predictors, kept shallow so the diagram
# stays readable.
tree_model <- rpart(
  formula = loan_status ~ .,
  data    = train_df,
  method  = "class",
  control = rpart.control(
    minsplit = 20,
    maxdepth = 5,      # cap depth for readability
    cp       = 0.005   # complexity parameter — lower = deeper tree
  )
)

3b. Tree Visualization

# Draw the fitted tree: leaves aligned at the bottom, nodes coloured by class
# (Rejected / Approved palette colours).
rpart.plot(
  tree_model,
  main          = "Decision Tree — Loan Approval",
  type          = 4,
  extra         = 104,   # show probability + % of obs
  fallen.leaves = TRUE,
  box.palette   = list(COL_REJECTED, COL_APPROVED),
  shadow.col    = "gray80",
  cex           = 0.75
)

3c. Feature Importance

# Variable importance reported by rpart, as a tibble with each feature's
# rounded score and its percentage share of the total, largest first.
tree_imp <- arrange(
  mutate(
    enframe(tree_model$variable.importance, name = "feature", value = "importance"),
    importance = round(importance, 2),
    pct        = round(importance / sum(importance) * 100, 1)
  ),
  desc(importance)
)

# Horizontal bar chart; bars are labelled with the percentage share and the
# value axis is widened on the right so the labels are not clipped.
p_tree_imp <- ggplot(
  tree_imp,
  aes(x = reorder(feature, importance), y = importance, fill = importance)
) +
  geom_col(show.legend = FALSE, alpha = 0.9) +
  geom_text(
    aes(label = paste0(pct, "%")),
    hjust = -0.15,
    size  = 3.5,
    color = COL_NAVY
  ) +
  coord_flip() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  labs(
    title    = "Decision Tree — Feature Importance",
    subtitle = "Relative contribution to splitting decisions",
    x        = NULL,
    y        = "Importance Score"
  ) +
  theme_datathon()

ggplotly(p_tree_imp, tooltip = c("x", "y"))

3d. Confusion Matrix & ROC

# Score the hold-out set with the tree: P(Approved), thresholded at 0.5.
tree_probs <- predict(tree_model, test_df, type = "prob")[, "Approved"]
tree_preds <- factor(if_else(tree_probs > 0.5, "Approved", "Rejected"),
                     levels = c("Rejected", "Approved"))

# Confusion matrix with "Approved" treated as the positive class.
tree_cm <- confusionMatrix(tree_preds, test_df$loan_status, positive = "Approved")

# Confusion-matrix heatmap.
# Count labels switch colour with the tile shade: white on tiles above the
# midpoint of the fill range (dark), navy on tiles below it (light). A fixed
# white label is unreadable on the light tiles that hold the small counts.
cm_tbl_tree <- as.data.frame(tree_cm$table)
ggplot(cm_tbl_tree, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = Freq, color = Freq > mean(range(Freq))),
            size = 7, fontface = "bold") +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  scale_color_manual(values = c(`TRUE` = "white", `FALSE` = COL_NAVY),
                     guide = "none") +
  labs(
    title    = "Decision Tree — Confusion Matrix",
    subtitle = paste0("Accuracy: ",
                      round(tree_cm$overall["Accuracy"] * 100, 1), "%"),
    x = "Actual", y = "Predicted"
  ) +
  theme_datathon() +
  theme(legend.position = "none")

# ROC curve for the tree on the hold-out set.
# The response is coded 1 = Approved, 0 = Rejected. Levels and direction are
# stated explicitly (controls = 0, cases = 1, higher score = case) instead of
# letting pROC auto-detect them, which can silently flip the comparison for a
# worse-than-chance model. quiet = TRUE suppresses the setup messages.
tree_roc <- roc(
  response  = as.numeric(test_df$loan_status == "Approved"),
  predictor = tree_probs,
  levels    = c(0, 1),
  direction = "<",
  quiet     = TRUE
)
# Plain numeric AUC (drops the pROC "auc" class and its attributes).
tree_auc <- round(as.numeric(auc(tree_roc)), 4)

# Curve coordinates, reused later for the combined ROC chart.
roc_df_tree <- data.frame(
  specificity = tree_roc$specificities,
  sensitivity = tree_roc$sensitivities
)

p_roc_tree <- ggplot(roc_df_tree, aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(color = COL_REJECTED, linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  annotate("text", x = 0.65, y = 0.15,
           label = paste0("AUC = ", tree_auc),
           size = 5, fontface = "bold", color = COL_NAVY) +
  labs(
    title    = "Decision Tree — ROC Curve",
    subtitle = "Diagonal = random classifier baseline",
    x        = "1 − Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)"
  ) +
  theme_datathon()

ggplotly(p_roc_tree)

AUC = 0.9999


4. Random Forest

Ensemble of trees — highest predictive accuracy. Feature importance is aggregated across all trees, giving a more reliable signal than a single tree.

4a. Fit Model

# Fix the RNG state immediately before fitting: the forest draws bootstrap
# samples and candidate variables at random, so the seed must stay directly
# ahead of randomForest() for the results to be reproducible.
set.seed(2026)

# Random forest classifier on all predictors in train_df.
# importance = TRUE asks randomForest to record the variable-importance
# measures that the feature-importance section reads via importance().
rf_model <- randomForest(
  loan_status ~ .,
  data       = train_df,
  ntree      = 500,
  # ncol(train_df) - 1 excludes the target column from the predictor count.
  mtry       = floor(sqrt(ncol(train_df) - 1)),  # default: sqrt(p) for classification
  importance = TRUE
)

# Prints the call, the OOB error estimate and the OOB confusion matrix.
print(rf_model)
## 
## Call:
##  randomForest(formula = loan_status ~ ., data = train_df, ntree = 500,      mtry = floor(sqrt(ncol(train_df) - 1)), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0.09%
## Confusion matrix:
##          Rejected Approved class.error
## Rejected     1207        3 0.002479339
## Approved        0     1992 0.000000000

4b. Feature Importance

# Per-feature importance from the forest, keeping the two summary measures,
# sorted by Gini decrease and rounded to two decimals for display.
rf_imp <- as.data.frame(importance(rf_model)) %>%
  rownames_to_column(var = "feature") %>%
  select(feature, MeanDecreaseAccuracy, MeanDecreaseGini) %>%
  arrange(desc(MeanDecreaseGini)) %>%
  mutate(across(where(is.numeric), function(score) round(score, 2)))

# Render the table with the top three rows shaded.
row_spec(
  kable_styling(
    kable(
      rf_imp,
      col.names = c("Feature", "Mean Decrease Accuracy", "Mean Decrease Gini")
    ),
    bootstrap_options = c("striped", "hover")
  ),
  1:3,
  background = "#C8DEFF"
)
Feature Mean Decrease Accuracy Mean Decrease Gini
cibil_score 275.68 1185.60
loan_term 110.73 113.07
debt_to_income 47.53 52.83
loan_to_asset 19.21 35.18
loan_amount 14.56 18.54
total_assets 12.32 16.64
luxury_assets_value 11.04 16.35
residential_assets_value 6.62 14.86
commercial_assets_value 7.97 14.60
income_annum 11.81 13.42
bank_asset_value 9.30 13.35
no_of_dependents 0.48 6.61
education -0.77 1.95
self_employed -1.51 1.93
# Horizontal bar chart of the Gini-based importance, most important on top.
p_rf_imp <- ggplot(
  rf_imp,
  aes(
    x    = reorder(feature, MeanDecreaseGini),
    y    = MeanDecreaseGini,
    fill = MeanDecreaseGini
  )
) +
  geom_col(show.legend = FALSE, alpha = 0.9) +
  coord_flip() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  labs(
    title    = "Random Forest — Feature Importance (Gini)",
    subtitle = "Higher = more important to splitting decisions across all 500 trees",
    x        = NULL,
    y        = "Mean Decrease in Gini Impurity"
  ) +
  theme_datathon()

ggplotly(p_rf_imp, tooltip = c("x", "y"))

4c. Confusion Matrix

# Score the hold-out set with the forest: class probabilities for the ROC
# curve and hard class predictions for the confusion matrix.
rf_probs <- predict(rf_model, test_df, type = "prob")[, "Approved"]
rf_preds <- predict(rf_model, test_df, type = "class")

# Confusion matrix with "Approved" treated as the positive class.
rf_cm <- confusionMatrix(rf_preds, test_df$loan_status, positive = "Approved")

# Confusion-matrix heatmap.
# Count labels switch colour with the tile shade: white on tiles above the
# midpoint of the fill range (dark), navy on tiles below it (light). A fixed
# white label is unreadable on the light tiles that hold the small counts.
cm_tbl_rf <- as.data.frame(rf_cm$table)
ggplot(cm_tbl_rf, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = Freq, color = Freq > mean(range(Freq))),
            size = 7, fontface = "bold") +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  scale_color_manual(values = c(`TRUE` = "white", `FALSE` = COL_NAVY),
                     guide = "none") +
  labs(
    title    = "Random Forest — Confusion Matrix",
    subtitle = paste0("Accuracy: ",
                      round(rf_cm$overall["Accuracy"] * 100, 1), "%"),
    x = "Actual", y = "Predicted"
  ) +
  theme_datathon() +
  theme(legend.position = "none")

4d. ROC Curve & AUC

# ROC curve for the random forest on the hold-out set.
# The response is coded 1 = Approved, 0 = Rejected. Levels and direction are
# stated explicitly (controls = 0, cases = 1, higher score = case) instead of
# letting pROC auto-detect them, which can silently flip the comparison for a
# worse-than-chance model. quiet = TRUE suppresses the setup messages.
rf_roc <- roc(
  response  = as.numeric(test_df$loan_status == "Approved"),
  predictor = rf_probs,
  levels    = c(0, 1),
  direction = "<",
  quiet     = TRUE
)
# Plain numeric AUC (drops the pROC "auc" class and its attributes).
rf_auc <- round(as.numeric(auc(rf_roc)), 4)

# Curve coordinates, reused later for the combined ROC chart.
roc_df_rf <- data.frame(
  specificity = rf_roc$specificities,
  sensitivity = rf_roc$sensitivities
)

p_roc_rf <- ggplot(roc_df_rf, aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(color = COL_NAVY, linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  annotate("text", x = 0.65, y = 0.15,
           label = paste0("AUC = ", rf_auc),
           size = 5, fontface = "bold", color = COL_NAVY) +
  labs(
    title    = "Random Forest — ROC Curve",
    subtitle = "Diagonal = random classifier baseline",
    x        = "1 − Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)"
  ) +
  theme_datathon()

ggplotly(p_roc_rf)

AUC = 1


5. Model Comparison

# Side-by-side summary of the three models on the hold-out set.
# Accuracy, sensitivity and specificity are shown as percentages with one
# decimal; AUC stays on its 0–1 scale. Rows follow the order of `Model`.
comparison <- tibble(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest"),
  Accuracy = round(
    c(
      logit_cm$overall["Accuracy"],
      tree_cm$overall["Accuracy"],
      rf_cm$overall["Accuracy"]
    ) * 100,
    1
  ),
  AUC = c(logit_auc, tree_auc, rf_auc),
  Sensitivity = round(
    c(
      logit_cm$byClass["Sensitivity"],
      tree_cm$byClass["Sensitivity"],
      rf_cm$byClass["Sensitivity"]
    ) * 100,
    1
  ),
  Specificity = round(
    c(
      logit_cm$byClass["Specificity"],
      tree_cm$byClass["Specificity"],
      rf_cm$byClass["Specificity"]
    ) * 100,
    1
  ),
  Interpretability = c("High", "High", "Medium")
)

# Row with the highest AUC (first one on ties) is emphasised in the table.
best_row <- which.max(comparison$AUC)

row_spec(
  kable_styling(
    kable(
      comparison,
      col.names = c("Model", "Accuracy (%)", "AUC",
                    "Sensitivity (%)", "Specificity (%)", "Interpretability")
    ),
    bootstrap_options = c("striped", "hover")
  ),
  best_row,
  bold = TRUE,
  background = "#C8DEFF"
)
Model Accuracy (%) AUC Sensitivity (%) Specificity (%) Interpretability
Logistic Regression 91.3 0.9742 93.4 87.8 High
Decision Tree 99.9 0.9999 100.0 99.8 High
Random Forest 100.0 1.0000 100.0 100.0 Medium
# Overlay of the three ROC curves. Each curve's coordinates are tagged with a
# legend label that embeds the model's AUC, then stacked into one data frame.
roc_all <- bind_rows(
  mutate(roc_df,      Model = paste0("Logistic Regression (AUC = ", logit_auc, ")")),
  mutate(roc_df_tree, Model = paste0("Decision Tree (AUC = ", tree_auc, ")")),
  mutate(roc_df_rf,   Model = paste0("Random Forest (AUC = ", rf_auc, ")"))
)

# The colour scale is keyed by the same labels so each model keeps the line
# colour used in its individual ROC chart.
p_roc_all <- ggplot(
  roc_all,
  aes(x = 1 - specificity, y = sensitivity, color = Model)
) +
  geom_line(linewidth = 1.1) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  scale_color_manual(
    values = structure(
      c(COL_APPROVED, COL_REJECTED, COL_NAVY),
      names = c(
        paste0("Logistic Regression (AUC = ", logit_auc, ")"),
        paste0("Decision Tree (AUC = ", tree_auc, ")"),
        paste0("Random Forest (AUC = ", rf_auc, ")")
      )
    )
  ) +
  labs(
    title    = "ROC Curve Comparison — All Three Models",
    subtitle = "Higher and further left = better discrimination",
    x        = "1 − Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)",
    color    = NULL
  ) +
  theme_datathon()

ggplotly(p_roc_all)

6. Modeling Takeaways

# Closing summary: one row per finding with its matching recommendation.
# Rows 2 and 4 are shaded in the rendered table.
tribble(
  ~`#`, ~Finding, ~Recommendation,
  1L,
  "CIBIL score is the dominant predictor across all three models",
  "Lead with CIBIL score in both the presentation and business recommendations",
  2L,
  "Random Forest achieves the highest AUC and accuracy",
  "Use Random Forest as the primary model for prediction performance claims",
  3L,
  "Logistic Regression is the most interpretable — best for explaining to judges",
  "Quote Logistic Regression coefficients when discussing individual feature effects",
  4L,
  "Decision Tree provides the clearest visual narrative for a presentation",
  "Include the tree diagram in the slide deck — audiences understand it immediately"
) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(c(2, 4), background = "#C8DEFF")
# Finding Recommendation
1 CIBIL score is the dominant predictor across all three models Lead with CIBIL score in both the presentation and business recommendations
2 Random Forest achieves the highest AUC and accuracy Use Random Forest as the primary model for prediction performance claims
3 Logistic Regression is the most interpretable — best for explaining to judges Quote Logistic Regression coefficients when discussing individual feature effects
4 Decision Tree provides the clearest visual narrative for a presentation Include the tree diagram in the slide deck — audiences understand it immediately

Modeling complete. Hand off results to 04_visuals.Rmd for presentation-ready outputs.