Overview

This report builds and evaluates three predictive models on the cleaned loan approval dataset. Each model is assessed using a consistent set of outputs — summary/coefficients, confusion matrix, feature importance, and ROC/AUC — making it easy to compare performance and select the best approach for the business question.

Models covered:

Logistic Regression — interpretable baseline, coefficient-driven
Decision Tree — visual, rule-based, easy to explain to judges
Random Forest — highest accuracy, ensemble feature importance

Setup

library(tidyverse)
library(plotly)
library(kableExtra)
library(scales)
library(caret)        # confusion matrix, train/test split
library(rpart)        # decision tree
library(rpart.plot)   # tree visualization
library(randomForest) # random forest
library(pROC)         # ROC curves & AUC

# Shared colour palette (hex strings) reused by every chart in this report.
COL_APPROVED <- "#1E5FAD"  # blue: logistic-regression ROC line, second tree box colour
COL_REJECTED <- "#F4A62A"  # amber: decision-tree ROC line, first tree box colour
COL_NAVY     <- "#0D1B3E"  # navy: titles, axis titles, dark end of fill gradients
COL_LIGHT    <- "#C8DEFF"  # pale blue: light end of fill gradients

# Report-wide ggplot2 theme: theme_minimal() at base size 13 with bold navy
# titles, slate-grey secondary text, the legend on top, and no minor grid
# lines. Takes no arguments; returns a theme object to add to a plot with `+`.
theme_datathon <- function() {
  slate     <- "#64748B"
  navy_bold <- element_text(face = "bold", color = COL_NAVY)

  overrides <- theme(
    plot.title       = element_text(face = "bold", color = COL_NAVY, size = 14),
    plot.subtitle    = element_text(color = slate, size = 11),
    axis.text        = element_text(color = slate),
    axis.title       = navy_bold,
    legend.position  = "top",
    panel.grid.minor = element_blank(),
    strip.text       = navy_bold
  )

  theme_minimal(base_size = 13) + overrides
}

# Load the cleaned dataset produced upstream. load() restores objects by name
# and returns the names it restored, so confirm it actually supplied a data
# frame called `df`. Without this check a missing object would silently fall
# back to the stats::df() function and the report below would print blanks.
loaded_objs <- load("loan_clean.RData")
stopifnot(
  "loan_clean.RData must contain a data frame named `df`" =
    "df" %in% loaded_objs && is.data.frame(df)
)

# Trailing newline keeps this consistent with the other cat() reports.
cat("Dataset loaded:", nrow(df), "rows ×", ncol(df), "columns\n")

## Dataset loaded: 4269 rows × 17 columns

1. Preprocessing for Modeling

# Assemble the modelling table: the outcome plus the predictors named below.
# Engineered bucket columns (redundant with their numeric versions) and
# ID-like columns are left out; rows with any missing value are discarded.
model_df <- df %>%
  select(all_of(c(
    "loan_status",
    "cibil_score",
    "income_annum",
    "loan_amount",
    "loan_term",
    "no_of_dependents",
    "education",
    "self_employed",
    "residential_assets_value",
    "commercial_assets_value",
    "luxury_assets_value",
    "bank_asset_value",
    "debt_to_income",
    "total_assets",
    "loan_to_asset"
  ))) %>%
  drop_na()

# Report the size of the modelling table.
cat(
  "Modeling dataset:",
  nrow(model_df), "rows ×",
  ncol(model_df), "columns\n"
)

## Modeling dataset: 4269 rows × 15 columns

# Show the class balance of the outcome.
cat("Target distribution:", "\n", sep = "")

## Target distribution:

print(table(model_df[["loan_status"]]))

## 
## Rejected Approved 
##     1613     2656

set.seed(2026)  # reproducible split

# Hold out 25% of the rows for testing. createDataPartition() returns the row
# indices of the 75% training share (as a matrix, because list = FALSE) and,
# per caret's documentation, samples within each level of loan_status so both
# sets keep a similar class mix.
train_idx <- createDataPartition(model_df$loan_status, p = 0.75, list = FALSE)
train_df  <- model_df[train_idx, ]   # rows selected for training
test_df   <- model_df[-train_idx, ]  # every remaining row

# Report the resulting set sizes.
cat("Training rows:", nrow(train_df), "\n")

## Training rows: 3202

cat("Testing rows: ", nrow(test_df),  "\n")

## Testing rows:  1067

2. Logistic Regression

A transparent, interpretable baseline. Coefficients directly explain the direction and magnitude of each feature’s influence on the log-odds of approval (exponentiate a coefficient to read it as an odds ratio).

2a. Fit Model

# Fit a binomial GLM with a logit link on the training rows, using every other
# column of train_df as a predictor (`~ .`).
# NOTE(review): the summary below reports total_assets as NA ("not defined
# because of singularities"), i.e. it is an exact linear combination of other
# predictors — presumably the individual asset columns; confirm, and consider
# removing it from the formula so predict() is not run on a rank-deficient fit.
logit_model <- glm(
  loan_status ~ .,
  data   = train_df,
  family = binomial(link = "logit")
)

# Print coefficient estimates, standard errors, and deviance statistics.
summary(logit_model)

## 
## Call:
## glm(formula = loan_status ~ ., family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.402e+01  9.454e-01 -14.828  < 2e-16 ***
## cibil_score               2.502e-02  9.784e-04  25.569  < 2e-16 ***
## income_annum              1.028e-07  1.725e-07   0.596  0.55133    
## loan_amount               4.647e-09  4.173e-08   0.111  0.91133    
## loan_term                -1.597e-01  1.336e-02 -11.952  < 2e-16 ***
## no_of_dependents         -3.231e-02  4.016e-02  -0.805  0.42102    
## educationNot Graduate    -2.027e-01  1.372e-01  -1.477  0.13974    
## self_employedYes          8.000e-02  1.367e-01   0.585  0.55836    
## residential_assets_value -5.214e-08  1.968e-08  -2.649  0.00807 ** 
## commercial_assets_value  -5.547e-08  2.463e-08  -2.252  0.02429 *  
## luxury_assets_value      -1.210e-09  2.436e-08  -0.050  0.96037    
## bank_asset_value         -1.438e-08  4.135e-08  -0.348  0.72803    
## debt_to_income            1.742e+00  3.088e-01   5.641 1.69e-08 ***
## total_assets                     NA         NA      NA       NA    
## loan_to_asset            -4.783e+00  1.186e+00  -4.031 5.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4246.0  on 3201  degrees of freedom
## Residual deviance: 1406.2  on 3188  degrees of freedom
## AIC: 1434.2
## 
## Number of Fisher Scoring iterations: 7

2b. Coefficients Table

# Tidy the fitted coefficients into a display table: add odds ratios, flag
# terms significant at the 5% level, round for presentation, and list the
# smallest (rounded) p-values first.
logit_coef <- broom::tidy(logit_model) %>%
  mutate(
    odds_ratio  = round(exp(estimate), 4),
    # Test significance on the exact p-value, before it is rounded for display:
    # a p-value just under 0.05 (e.g. 0.04996) rounds to 0.05 and would
    # otherwise lose its flag. Terms with an NA p-value keep an NA flag.
    significant = if_else(p.value < 0.05, "✓", ""),
    estimate    = round(estimate, 4),
    std.error   = round(std.error, 4),
    statistic   = round(statistic, 3),
    p.value     = round(p.value, 4)
  ) %>%
  arrange(p.value)

# Render the table and shade the rows flagged as significant, using the shared
# palette constant rather than a repeated hex literal.
logit_coef %>%
  kable(col.names = c("Term", "Estimate (log-odds)", "Std Error",
                      "z-statistic", "p-value", "Odds Ratio", "Significant")) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(which(logit_coef$significant == "✓"), background = COL_LIGHT)

Term	Estimate (log-odds)	Std Error	z-statistic	p-value	Odds Ratio	Significant
(Intercept)	-14.0179	0.9454	-14.828	0.0000	0.0000	✓
cibil_score	0.0250	0.0010	25.569	0.0000	1.0253	✓
loan_term	-0.1597	0.0134	-11.952	0.0000	0.8524	✓
debt_to_income	1.7423	0.3088	5.641	0.0000	5.7102	✓
loan_to_asset	-4.7828	1.1864	-4.031	0.0001	0.0084	✓
residential_assets_value	0.0000	0.0000	-2.649	0.0081	1.0000	✓
commercial_assets_value	0.0000	0.0000	-2.252	0.0243	1.0000	✓
educationNot Graduate	-0.2027	0.1372	-1.477	0.1397	0.8165
no_of_dependents	-0.0323	0.0402	-0.805	0.4210	0.9682
income_annum	0.0000	0.0000	0.596	0.5513	1.0000
self_employedYes	0.0800	0.1367	0.585	0.5584	1.0833
bank_asset_value	0.0000	0.0000	-0.348	0.7280	1.0000
loan_amount	0.0000	0.0000	0.111	0.9113	1.0000
luxury_assets_value	0.0000	0.0000	-0.050	0.9604	1.0000
total_assets	NA	NA	NA	NA	NA	NA

2c. Confusion Matrix

# Score the hold-out rows, then label a case "Approved" when its predicted
# probability exceeds 0.5 (otherwise "Rejected").
logit_probs <- predict(logit_model, newdata = test_df, type = "response")
logit_preds <- factor(
  if_else(logit_probs > 0.5, "Approved", "Rejected"),
  levels = c("Rejected", "Approved")
)

# Cross-tabulate predictions against the true labels; "Approved" is the
# positive class for sensitivity/specificity.
logit_cm <- confusionMatrix(
  data      = logit_preds,
  reference = test_df$loan_status,
  positive  = "Approved"
)

# Heat-map view of the confusion matrix, with cell counts printed on the tiles
# and overall accuracy in the subtitle.
cm_tbl <- as.data.frame(logit_cm$table)
ggplot(cm_tbl, aes(x = Reference, y = Prediction)) +
  geom_tile(aes(fill = Freq), color = "white", linewidth = 1) +
  geom_text(aes(label = Freq), color = "white", fontface = "bold", size = 7) +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  labs(
    title    = "Logistic Regression — Confusion Matrix",
    subtitle = paste0(
      "Accuracy: ", round(logit_cm$overall["Accuracy"] * 100, 1), "%"
    ),
    x = "Actual",
    y = "Predicted"
  ) +
  theme_datathon() +
  theme(legend.position = "none")

# Headline test-set metrics for the logistic model, taken from the caret
# confusion-matrix object and rounded to four decimals.
tibble(
  Metric = c("Accuracy", "Sensitivity (Recall)", "Specificity", "Precision", "F1 Score"),
  Value  = round(
    c(
      logit_cm$overall["Accuracy"],
      logit_cm$byClass[c("Sensitivity", "Specificity", "Precision", "F1")]
    ),
    4
  )
) %>%
  kable() %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))

Metric	Value
Accuracy	0.9128
Sensitivity (Recall)	0.9337
Specificity	0.8784
Precision	0.9268
F1 Score	0.9302

2d. ROC Curve & AUC

# ROC analysis for the logistic model on the hold-out set.
# `levels` and `direction` are stated explicitly. Left to pROC's
# auto-detection, the comparison is flipped whenever cases score lower than
# controls, which reports an AUC above 0.5 even for a worse-than-random model.
# Here 0 = Rejected (control), 1 = Approved (case), and a higher predicted
# probability should indicate a case.
logit_roc <- roc(
  response  = as.numeric(test_df$loan_status == "Approved"),
  predictor = logit_probs,
  levels    = c(0, 1),
  direction = "<"
)

logit_auc <- round(auc(logit_roc), 4)

# pROC stores the curve by increasing threshold (sensitivity falling from 1
# to 0). Reverse it so rows run from (FPR 0, TPR 0) to (FPR 1, TPR 1) with
# ties in step order.
roc_df <- data.frame(
  specificity = rev(logit_roc$specificities),
  sensitivity = rev(logit_roc$sensitivities)
)

# geom_path() connects points in row order, so the curve keeps its true
# staircase shape; geom_line() would re-sort by x and join tied
# false-positive rates in the wrong order.
p_roc_logit <- ggplot(roc_df, aes(x = 1 - specificity, y = sensitivity)) +
  geom_path(color = COL_APPROVED, linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  annotate("text", x = 0.65, y = 0.15,
           label = paste0("AUC = ", logit_auc),
           size = 5, fontface = "bold", color = COL_NAVY) +
  labs(
    title    = "Logistic Regression — ROC Curve",
    subtitle = "Diagonal = random classifier baseline",
    x        = "1 − Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)"
  ) +
  theme_datathon()

ggplotly(p_roc_logit)

AUC = 0.9742 — values above 0.90 indicate excellent discrimination.

3. Decision Tree

Rule-based and highly explainable — ideal for presenting to a non-technical audience. The tree diagram tells the story visually without needing statistical interpretation.

3a. Fit Model

# Grow a classification tree on the training rows using every other column as
# a predictor. Growth is limited by a complexity parameter of 0.005 (lower
# values allow a deeper tree), a depth cap of 5 to keep the diagram readable,
# and a minsplit of 20.
tree_model <- rpart(
  formula = loan_status ~ .,
  data    = train_df,
  method  = "class",
  control = rpart.control(minsplit = 20, cp = 0.005, maxdepth = 5)
)

3b. Tree Visualization

# Draw the fitted tree with leaves aligned at the bottom and node boxes
# coloured from the report palette.
rpart.plot(
  tree_model,
  main          = "Decision Tree — Loan Approval",
  type          = 4,
  extra         = 104,   # show probability + % of obs
  fallen.leaves = TRUE,
  cex           = 0.75,
  shadow.col    = "gray80",
  box.palette   = list(COL_REJECTED, COL_APPROVED)
)

3c. Feature Importance

# Turn rpart's named importance vector into a table: one row per feature, the
# score rounded to two decimals, and each feature's share of the (rounded)
# total as a percentage. Largest score first.
tree_imp <- enframe(
  tree_model$variable.importance,
  name  = "feature",
  value = "importance"
) %>%
  mutate(importance = round(importance, 2)) %>%
  mutate(pct = round(importance / sum(importance) * 100, 1)) %>%
  arrange(desc(importance))

# Horizontal bar chart of the scores, with the percentage share printed just
# past the end of each bar.
p_tree_imp <- ggplot(
  tree_imp,
  aes(x = reorder(feature, importance), y = importance, fill = importance)
) +
  geom_col(alpha = 0.9, show.legend = FALSE) +
  geom_text(aes(label = paste0(pct, "%")),
            color = COL_NAVY, size = 3.5, hjust = -0.15) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  coord_flip() +
  labs(
    title    = "Decision Tree — Feature Importance",
    subtitle = "Relative contribution to splitting decisions",
    x        = NULL,
    y        = "Importance Score"
  ) +
  theme_datathon()

ggplotly(p_tree_imp, tooltip = c("x", "y"))

3d. Confusion Matrix & ROC

# Predicted probability of "Approved" for each hold-out row, thresholded at
# 0.5 to produce class labels.
tree_probs <- predict(tree_model, newdata = test_df, type = "prob")[, "Approved"]
tree_preds <- factor(
  if_else(tree_probs > 0.5, "Approved", "Rejected"),
  levels = c("Rejected", "Approved")
)

# Cross-tabulate predictions against the true labels ("Approved" = positive).
tree_cm <- confusionMatrix(
  data      = tree_preds,
  reference = test_df$loan_status,
  positive  = "Approved"
)

# Heat-map view of the confusion matrix with overall accuracy in the subtitle.
cm_tbl_tree <- as.data.frame(tree_cm$table)
ggplot(cm_tbl_tree, aes(x = Reference, y = Prediction)) +
  geom_tile(aes(fill = Freq), color = "white", linewidth = 1) +
  geom_text(aes(label = Freq), color = "white", fontface = "bold", size = 7) +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  labs(
    title    = "Decision Tree — Confusion Matrix",
    subtitle = paste0(
      "Accuracy: ", round(tree_cm$overall["Accuracy"] * 100, 1), "%"
    ),
    x = "Actual",
    y = "Predicted"
  ) +
  theme_datathon() +
  theme(legend.position = "none")

# ROC analysis for the decision tree on the hold-out set.
# `levels` and `direction` are stated explicitly so pROC cannot auto-flip the
# comparison and report an inflated AUC: 0 = Rejected (control),
# 1 = Approved (case), higher predicted probability indicates a case.
tree_roc <- roc(
  response  = as.numeric(test_df$loan_status == "Approved"),
  predictor = tree_probs,
  levels    = c(0, 1),
  direction = "<"
)
tree_auc <- round(auc(tree_roc), 4)

# pROC stores the curve by increasing threshold (sensitivity falling from 1
# to 0). Reverse it so rows run from (FPR 0, TPR 0) to (FPR 1, TPR 1).
roc_df_tree <- data.frame(
  specificity = rev(tree_roc$specificities),
  sensitivity = rev(tree_roc$sensitivities)
)

# geom_path() connects points in row order and so keeps the staircase shape;
# geom_line() would re-sort by x and join tied false-positive rates wrongly.
p_roc_tree <- ggplot(roc_df_tree, aes(x = 1 - specificity, y = sensitivity)) +
  geom_path(color = COL_REJECTED, linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  annotate("text", x = 0.65, y = 0.15,
           label = paste0("AUC = ", tree_auc),
           size = 5, fontface = "bold", color = COL_NAVY) +
  labs(
    title    = "Decision Tree — ROC Curve",
    subtitle = "Diagonal = random classifier baseline",
    x        = "1 − Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)"
  ) +
  theme_datathon()

ggplotly(p_roc_tree)

AUC = 0.9999

4. Random Forest

Ensemble of trees — highest predictive accuracy. Feature importance is aggregated across all trees, giving a more reliable signal than a single tree.

4a. Fit Model

# Fix the RNG seed before fitting so the forest is reproducible.
set.seed(2026)

# Fit a 500-tree random forest classifier on the training rows, using every
# other column of train_df as a predictor.
rf_model <- randomForest(
  loan_status ~ .,
  data       = train_df,
  ntree      = 500,
  # ncol(train_df) - 1 excludes the outcome column from the predictor count.
  mtry       = floor(sqrt(ncol(train_df) - 1)),  # default: sqrt(p) for classification
  # Requested so the MeanDecreaseAccuracy column is available to the
  # importance table built later in this report.
  importance = TRUE
)

# Print the call, the out-of-bag error estimate, and the OOB confusion matrix.
print(rf_model)

## 
## Call:
##  randomForest(formula = loan_status ~ ., data = train_df, ntree = 500,      mtry = floor(sqrt(ncol(train_df) - 1)), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0.09%
## Confusion matrix:
##          Rejected Approved class.error
## Rejected     1207        3 0.002479339
## Approved        0     1992 0.000000000

4b. Feature Importance

# Collect the forest's importance measures into a table: one row per feature
# with its mean decrease in accuracy and in Gini impurity, ordered by the Gini
# measure (largest first) and rounded to two decimals.
rf_imp <- as.data.frame(importance(rf_model)) %>%
  rownames_to_column(var = "feature") %>%
  select(feature, MeanDecreaseAccuracy, MeanDecreaseGini) %>%
  arrange(desc(MeanDecreaseGini)) %>%
  mutate(across(where(is.numeric), function(col) round(col, digits = 2)))

# Render the table and shade its first three rows (the top three by Gini).
kable(
  rf_imp,
  col.names = c("Feature", "Mean Decrease Accuracy", "Mean Decrease Gini")
) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(1:3, background = "#C8DEFF")

Feature	Mean Decrease Accuracy	Mean Decrease Gini
cibil_score	275.68	1185.60
loan_term	110.73	113.07
debt_to_income	47.53	52.83
loan_to_asset	19.21	35.18
loan_amount	14.56	18.54
total_assets	12.32	16.64
luxury_assets_value	11.04	16.35
residential_assets_value	6.62	14.86
commercial_assets_value	7.97	14.60
income_annum	11.81	13.42
bank_asset_value	9.30	13.35
no_of_dependents	0.48	6.61
education	-0.77	1.95
self_employed	-1.51	1.93

# Horizontal bar chart of the Gini importance, bars ordered by score and
# shaded from light to navy as the score grows.
p_rf_imp <- ggplot(
  rf_imp,
  aes(x = reorder(feature, MeanDecreaseGini),
      y = MeanDecreaseGini, fill = MeanDecreaseGini)
) +
  geom_col(alpha = 0.9, show.legend = FALSE) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  coord_flip() +
  labs(
    title    = "Random Forest — Feature Importance (Gini)",
    subtitle = "Higher = more important to splitting decisions across all 500 trees",
    x        = NULL,
    y        = "Mean Decrease in Gini Impurity"
  ) +
  theme_datathon()

ggplotly(p_rf_imp, tooltip = c("x", "y"))

4c. Confusion Matrix

# Hold-out predictions from the forest: the probability of "Approved" (kept
# for the ROC analysis) and the predicted class label.
rf_probs <- predict(rf_model, newdata = test_df, type = "prob")[, "Approved"]
rf_preds <- predict(rf_model, newdata = test_df, type = "class")

# Cross-tabulate predictions against the true labels ("Approved" = positive).
rf_cm <- confusionMatrix(
  data      = rf_preds,
  reference = test_df$loan_status,
  positive  = "Approved"
)

# Heat-map view of the confusion matrix with overall accuracy in the subtitle.
cm_tbl_rf <- as.data.frame(rf_cm$table)
ggplot(cm_tbl_rf, aes(x = Reference, y = Prediction)) +
  geom_tile(aes(fill = Freq), color = "white", linewidth = 1) +
  geom_text(aes(label = Freq), color = "white", fontface = "bold", size = 7) +
  scale_fill_gradient(low = COL_LIGHT, high = COL_NAVY) +
  labs(
    title    = "Random Forest — Confusion Matrix",
    subtitle = paste0(
      "Accuracy: ", round(rf_cm$overall["Accuracy"] * 100, 1), "%"
    ),
    x = "Actual",
    y = "Predicted"
  ) +
  theme_datathon() +
  theme(legend.position = "none")

4d. ROC Curve & AUC

# ROC analysis for the random forest on the hold-out set.
# `levels` and `direction` are stated explicitly so pROC cannot auto-flip the
# comparison and report an inflated AUC: 0 = Rejected (control),
# 1 = Approved (case), higher predicted probability indicates a case.
rf_roc <- roc(
  response  = as.numeric(test_df$loan_status == "Approved"),
  predictor = rf_probs,
  levels    = c(0, 1),
  direction = "<"
)
rf_auc <- round(auc(rf_roc), 4)

# pROC stores the curve by increasing threshold (sensitivity falling from 1
# to 0). Reverse it so rows run from (FPR 0, TPR 0) to (FPR 1, TPR 1).
roc_df_rf <- data.frame(
  specificity = rev(rf_roc$specificities),
  sensitivity = rev(rf_roc$sensitivities)
)

# geom_path() connects points in row order and so keeps the staircase shape;
# geom_line() would re-sort by x and join tied false-positive rates wrongly.
p_roc_rf <- ggplot(roc_df_rf, aes(x = 1 - specificity, y = sensitivity)) +
  geom_path(color = COL_NAVY, linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  annotate("text", x = 0.65, y = 0.15,
           label = paste0("AUC = ", rf_auc),
           size = 5, fontface = "bold", color = COL_NAVY) +
  labs(
    title    = "Random Forest — ROC Curve",
    subtitle = "Diagonal = random classifier baseline",
    x        = "1 − Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)"
  ) +
  theme_datathon()

ggplotly(p_roc_rf)

AUC = 1

5. Model Comparison

# Side-by-side summary of the three models on the hold-out set. Accuracy,
# sensitivity and specificity are percentages rounded to one decimal; AUC is
# the four-decimal value computed in each model's ROC section.
comparison <- tibble(
  Model = c("Logistic Regression", "Decision Tree", "Random Forest"),
  Accuracy = round(
    c(
      logit_cm$overall["Accuracy"],
      tree_cm$overall["Accuracy"],
      rf_cm$overall["Accuracy"]
    ) * 100,
    1
  ),
  AUC = c(logit_auc, tree_auc, rf_auc),
  Sensitivity = round(
    c(
      logit_cm$byClass["Sensitivity"],
      tree_cm$byClass["Sensitivity"],
      rf_cm$byClass["Sensitivity"]
    ) * 100,
    1
  ),
  Specificity = round(
    c(
      logit_cm$byClass["Specificity"],
      tree_cm$byClass["Specificity"],
      rf_cm$byClass["Specificity"]
    ) * 100,
    1
  ),
  Interpretability = c("High", "High", "Medium")
)

# Row of the model with the highest AUC (the first such row if there is a tie).
best_row <- which.max(comparison[["AUC"]])

# Render the table with the best-AUC row in bold on a light background.
kable(
  comparison,
  col.names = c("Model", "Accuracy (%)", "AUC",
                "Sensitivity (%)", "Specificity (%)", "Interpretability")
) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(best_row, bold = TRUE, background = "#C8DEFF")

Model	Accuracy (%)	AUC	Sensitivity (%)	Specificity (%)	Interpretability
Logistic Regression	91.3	0.9742	93.4	87.8	High
Decision Tree	99.9	0.9999	100.0	99.8	High
Random Forest	100.0	1.0000	100.0	100.0	Medium

# All three ROC curves on one chart for easy comparison.
# Legend labels are built once and reused for both the data and the colour
# scale, so the two can never drift apart (a mismatch would silently drop a
# curve's colour).
roc_labels <- c(
  logit = paste0("Logistic Regression (AUC = ", logit_auc, ")"),
  tree  = paste0("Decision Tree (AUC = ", tree_auc, ")"),
  rf    = paste0("Random Forest (AUC = ", rf_auc, ")")
)

# Stack the per-model curves, then order the points from (FPR 0, TPR 0) to
# (FPR 1, TPR 1) so each curve can be drawn as a path.
roc_all <- bind_rows(
  roc_df      %>% mutate(Model = roc_labels[["logit"]]),
  roc_df_tree %>% mutate(Model = roc_labels[["tree"]]),
  roc_df_rf   %>% mutate(Model = roc_labels[["rf"]])
) %>%
  arrange(desc(specificity), sensitivity)

# geom_path() connects points in row order within each model, so the curves
# keep their staircase shape; geom_line() would re-sort by x and join tied
# false-positive rates in whatever order the rows happened to be in.
p_roc_all <- ggplot(roc_all, aes(x = 1 - specificity, y = sensitivity, color = Model)) +
  geom_path(linewidth = 1.1) +
  geom_abline(linetype = "dashed", color = "#94A3B8") +
  scale_color_manual(values = setNames(
    c(COL_APPROVED, COL_REJECTED, COL_NAVY),
    unname(roc_labels)
  )) +
  labs(
    title    = "ROC Curve Comparison — All Three Models",
    subtitle = "Higher and further left = better discrimination",
    x        = "1 − Specificity (False Positive Rate)",
    y        = "Sensitivity (True Positive Rate)",
    color    = NULL
  ) +
  theme_datathon()

ggplotly(p_roc_all)

6. Modeling Takeaways

# Summary table of the four modelling findings, each paired with its
# recommendation; rows 2 and 4 are shaded.
tribble(
  ~`#`, ~Finding, ~Recommendation,
  1L,
  "CIBIL score is the dominant predictor across all three models",
  "Lead with CIBIL score in both the presentation and business recommendations",
  2L,
  "Random Forest achieves the highest AUC and accuracy",
  "Use Random Forest as the primary model for prediction performance claims",
  3L,
  "Logistic Regression is the most interpretable — best for explaining to judges",
  "Quote Logistic Regression coefficients when discussing individual feature effects",
  4L,
  "Decision Tree provides the clearest visual narrative for a presentation",
  "Include the tree diagram in the slide deck — audiences understand it immediately"
) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(c(2, 4), background = "#C8DEFF")

#	Finding	Recommendation
1	CIBIL score is the dominant predictor across all three models	Lead with CIBIL score in both the presentation and business recommendations
2	Random Forest achieves the highest AUC and accuracy	Use Random Forest as the primary model for prediction performance claims
3	Logistic Regression is the most interpretable — best for explaining to judges	Quote Logistic Regression coefficients when discussing individual feature effects
4	Decision Tree provides the clearest visual narrative for a presentation	Include the tree diagram in the slide deck — audiences understand it immediately

Modeling complete. Hand off results to 04_visuals.Rmd for presentation-ready outputs.

Predictive Modeling Report

Datathon 2026 | Loan Approval Dataset

Madison G

2026-03-08