Overview

Presentation-ready visuals built in the competition color palette — navy, blue, and amber. Every chart is rendered interactively via plotly and exported as a high-resolution PNG for direct use in PowerPoint.

Charts included:

CIBIL Score vs Approval — headline chart
Approval Rate by CIBIL Band
Loan Amount vs Income colored by Approval
Approval Rate by Income Bracket & Education
Feature Importance Comparison across Models
ROC Curve Overlay — all three models

Setup

library(tidyverse)
library(plotly)
library(scales)
library(kableExtra)
library(randomForest)
library(rpart)
library(pROC)

# Competition palette — navy / blue / amber
# Outcome colours: one per loan decision
COL_REJECTED <- "#F4A62A"  # amber
COL_APPROVED <- "#1E5FAD"  # blue
# Supporting tones shared by the theme and the individual charts
COL_SLATE    <- "#64748B"  # slate grey
COL_LIGHT    <- "#C8DEFF"  # light blue
COL_BLUE_MID <- "#2E78D2"  # mid blue
COL_NAVY     <- "#0D1B3E"  # navy

# Shared ggplot2 theme applied to every chart in this report.
#
# base_size: base font size handed to theme_minimal(); the plot title is set
#            2 larger and the subtitle 1 smaller than this value.
# Returns a ggplot2 theme that can be added to a plot with `+`.
theme_datathon <- function(base_size = 13) {
  # Element specs that are reused for several theme slots
  bold_navy   <- element_text(face = "bold", color = COL_NAVY)
  slate_text  <- element_text(color = COL_SLATE)
  white_panel <- element_rect(fill = "white", color = NA)

  overrides <- theme(
    # Text
    plot.title    = element_text(face = "bold", color = COL_NAVY,
                                 size = base_size + 2),
    plot.subtitle = element_text(color = COL_SLATE, size = base_size - 1),
    plot.caption  = element_text(color = COL_SLATE, size = 9, hjust = 0),
    axis.text     = slate_text,
    axis.title    = bold_navy,
    legend.title  = bold_navy,
    strip.text    = bold_navy,
    # Layout
    legend.position  = "top",
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "#E2E8F0"),
    # White canvas so exported PNGs drop cleanly onto slides
    plot.background  = white_panel,
    panel.background = white_panel
  )

  theme_minimal(base_size = base_size) + overrides
}

# PNG export helper — saves a ggplot into the visuals/ folder, which is
# created here relative to the current working directory.
dir.create("visuals", showWarnings = FALSE)

# Write `plot` to visuals/<filename>.png on a white background and log it.
#
# plot:          ggplot object to export.
# filename:      output name WITHOUT the ".png" extension.
# width, height: image dimensions, forwarded to ggsave().
# dpi:           resolution, forwarded to ggsave().
# Returns the path of the written file, invisibly.
save_png <- function(plot, filename, width = 10, height = 6, dpi = 300) {
  out_path <- file.path("visuals", paste0(filename, ".png"))

  ggsave(
    filename = out_path,
    plot     = plot,
    width    = width,
    height   = height,
    dpi      = dpi,
    bg       = "white"
  )

  # sep = "" stops cat() from inserting its default space between pieces,
  # which previously printed "name .png" instead of "name.png".
  cat("✓ Saved: ", filename, ".png\n", sep = "")
  invisible(out_path)
}

# Restore the saved workspace objects; the pipeline below reads `df` from it.
load("loan_clean.RData")

# Rebuild train/test split (matches 03_modeling.Rmd)
set.seed(2026)

# Outcome first, followed by every predictor the models are fitted on
model_cols <- c(
  "loan_status", "cibil_score", "income_annum", "loan_amount", "loan_term",
  "no_of_dependents", "education", "self_employed",
  "residential_assets_value", "commercial_assets_value",
  "luxury_assets_value", "bank_asset_value",
  "debt_to_income", "total_assets", "loan_to_asset"
)

# Keep only the modelling columns and discard rows with any missing value
model_df <- df %>%
  select(all_of(model_cols)) %>%
  drop_na()

library(caret)
# createDataPartition() picks the training rows (75% of the data);
# everything it leaves out becomes the test set.
train_idx <- createDataPartition(y = model_df$loan_status,
                                 p = 0.75, list = FALSE)
train_df  <- model_df[train_idx, ]
test_df   <- model_df[-train_idx, ]

# Refit models
#
# Logistic regression: with a factor response, binomial glm() treats the FIRST
# factor level as "failure" and models the probability of the other level(s).
# Fitting on the raw factor therefore yields P(Approved) or P(Rejected)
# depending on level order. Recoding the outcome to 1 = Approved / 0 = not
# makes the fitted probabilities P(Approved) regardless of how the factor
# levels are ordered, matching the tree and forest scores used downstream.
logit_train <- train_df %>%
  mutate(loan_status = as.integer(loan_status == "Approved"))
logit_model <- glm(loan_status ~ ., data = logit_train, family = binomial())

# Classification tree, limited to depth 5 with complexity parameter 0.005
tree_model  <- rpart(loan_status ~ ., data = train_df, method = "class",
                     control = rpart.control(cp = 0.005, maxdepth = 5))

# Re-seed immediately before the forest so its fit is reproducible on its own
set.seed(2026)
rf_model    <- randomForest(loan_status ~ ., data = train_df,
                             ntree = 500, importance = TRUE)

# Predictions & ROC objects
# Test-set scores from each model; the tree and forest return one column per
# class, of which the "Approved" column is kept.
logit_probs <- predict(logit_model, newdata = test_df, type = "response")
tree_probs  <- predict(tree_model,  newdata = test_df, type = "prob")[, "Approved"]
rf_probs    <- predict(rf_model,    newdata = test_df, type = "prob")[, "Approved"]

# Observed outcome coded 1 = Approved, 0 = otherwise
actual_bin  <- as.numeric(test_df[["loan_status"]] == "Approved")

# One ROC object per model, all scored against the same observed outcome
logit_roc   <- roc(response = actual_bin, predictor = logit_probs, quiet = TRUE)
tree_roc    <- roc(response = actual_bin, predictor = tree_probs,  quiet = TRUE)
rf_roc      <- roc(response = actual_bin, predictor = rf_probs,    quiet = TRUE)

# AUCs rounded to three decimals for reporting and legend labels
logit_auc <- logit_roc %>% auc() %>% round(3)
tree_auc  <- tree_roc  %>% auc() %>% round(3)
rf_auc    <- rf_roc    %>% auc() %>% round(3)

cat("Models ready. AUC — Logistic:", logit_auc,
    "| Tree:", tree_auc, "| RF:", rf_auc)

## Models ready. AUC — Logistic: 0.974 | Tree: 1 | RF: 1

Chart 1 — CIBIL Score vs Approval (Headline)

The single strongest predictor of loan approval. This is your lead slide visual.

# Fill colour for each loan outcome
status_fill <- c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)

# Titles, axis labels and the caption explaining the three overlaid marks
p1_labels <- labs(
  title    = "CIBIL Score Distribution by Loan Outcome",
  subtitle = "Approved applicants consistently score higher — the gap is decisive",
  x        = NULL,
  y        = "CIBIL Score",
  fill     = NULL,
  caption  = "Diamond = group mean | Box = IQR | Width = density"
)

# Layers are stacked back to front: violin, then a slim box plot, then the
# group mean drawn as a diamond (shape 18).
p1 <- ggplot(data = df,
             mapping = aes(x = loan_status, y = cibil_score,
                           fill = loan_status)) +
  geom_violin(alpha = 0.55, color = NA, trim = FALSE) +
  geom_boxplot(width = 0.12, fill = "white", color = COL_NAVY,
               outlier.shape = NA, linewidth = 0.7) +
  stat_summary(fun = mean, geom = "point", shape = 18,
               size = 3.5, color = COL_NAVY) +
  scale_fill_manual(values = status_fill) +
  scale_y_continuous(breaks = seq(300, 900, 100)) +
  p1_labels +
  theme_datathon()

# Interactive version for the report
ggplotly(p1, tooltip = c("y", "fill"))

# Static export for the slide deck
save_png(p1, "01_cibil_vs_approval")

## ✓ Saved: 01_cibil_vs_approval .png

Chart 2 — Approval Rate by CIBIL Band

Shows the stepwise relationship between credit tier and approval — clean story for judges.

# Applicant count, approvals and approval rate (in %, one decimal) per band
cibil_band_data <- df %>%
  group_by(cibil_band) %>%
  summarise(
    total    = n(),
    approved = sum(loan_status == "Approved"),
    .groups  = "drop"
  ) %>%
  mutate(rate = round(approved / total * 100, 1))

# One fill per band, from amber for the lowest band to navy for the highest
band_fill <- c(
  "Poor (300-500)"      = COL_REJECTED,
  "Fair (500-600)"      = "#F4C06A",
  "Good (600-700)"      = COL_LIGHT,
  "Very Good (700-800)" = COL_BLUE_MID,
  "Excellent (800-900)" = COL_NAVY
)

p2_labels <- labs(
  title    = "Loan Approval Rate by CIBIL Score Band",
  subtitle = "Clear stepwise pattern — credit score is the decisive factor",
  x        = "CIBIL Score Band",
  y        = "Approval Rate",
  caption  = paste0("n = ", nrow(df), " applicants")
)

p2 <- ggplot(data = cibil_band_data,
             mapping = aes(x = cibil_band, y = rate, fill = cibil_band)) +
  geom_col(width = 0.65, show.legend = FALSE, alpha = 0.92) +
  # Percentage label sitting just above each bar
  geom_text(aes(label = paste0(rate, "%")),
            vjust = -0.6, fontface = "bold",
            size = 4.5, color = COL_NAVY) +
  # Dashed 50% reference line with its own text label
  geom_hline(yintercept = 50, linetype = "dashed",
             color = COL_SLATE, linewidth = 0.6) +
  annotate("text", x = 0.6, y = 52, label = "50% baseline",
           size = 3.2, color = COL_SLATE, hjust = 0) +
  scale_fill_manual(values = band_fill) +
  # Axis runs to 115 so labels above tall bars are not clipped
  scale_y_continuous(limits = c(0, 115),
                     expand  = expansion(mult = c(0, 0)),
                     labels  = label_percent(scale = 1)) +
  p2_labels +
  theme_datathon()

ggplotly(p2, tooltip = c("x", "y"))

save_png(p2, "02_approval_by_cibil_band")

## ✓ Saved: 02_approval_by_cibil_band .png

Chart 3 — Loan Amount vs Annual Income

Scatter plot colored by approval outcome — reveals whether the income-to-loan ratio drives decisions.

set.seed(42)
# Random subset so the scatter stays readable; slice_sample() returns every
# row when df has fewer than 1,500, so the caption reports the actual count.
df_sample <- df %>% slice_sample(n = 1500)

p3 <- ggplot(df_sample,
             aes(x = income_annum / 1e6,
                 y = loan_amount   / 1e6,
                 color = loan_status,
                 # Hover text picked up by ggplotly(tooltip = "text")
                 text  = paste0(
                   "Income: ₹", round(income_annum / 1e6, 2), "M\n",
                   "Loan: ₹",   round(loan_amount   / 1e6, 2), "M\n",
                   "Status: ",  loan_status
                 ))) +
  geom_point(alpha = 0.55, size = 1.8) +
  # `text` is a discrete aesthetic, so ggplot2's default grouping (the
  # interaction of all discrete variables) would put every point in its own
  # group and no trend line could be fitted. Grouping explicitly by outcome
  # restores one linear fit per approval status.
  geom_smooth(aes(group = loan_status),
              method = "lm", formula = y ~ x,
              se = FALSE, linewidth = 1.1) +
  scale_color_manual(values = c("Approved" = COL_APPROVED,
                                 "Rejected" = COL_REJECTED)) +
  scale_x_continuous(labels = label_dollar(prefix = "₹", suffix = "M")) +
  scale_y_continuous(labels = label_dollar(prefix = "₹", suffix = "M")) +
  labs(
    title    = "Loan Amount vs Annual Income by Approval Status",
    subtitle = "Do higher earners borrow more — and does it affect approval?",
    x        = "Annual Income (₹ Millions)",
    y        = "Loan Amount (₹ Millions)",
    color    = NULL,
    caption  = paste0("Sample of ", comma(nrow(df_sample)),
                      " applicants shown for readability")
  ) +
  theme_datathon()

ggplotly(p3, tooltip = "text")

save_png(p3, "03_loan_vs_income_scatter", width = 10, height = 6)

## ✓ Saved: 03_loan_vs_income_scatter .png

Chart 4 — Approval Rate by Income Bracket & Education

Grouped bar chart (one bar per education level within each income bracket) showing whether income level and education jointly influence approval.

# Approval rate (in %, one decimal) for every income bracket x education
# combination; rows without an income bracket are left out.
bracket_edu_data <- df %>%
  filter(!is.na(income_bracket)) %>%
  group_by(income_bracket, education) %>%
  summarise(
    total    = n(),
    approved = sum(loan_status == "Approved"),
    .groups  = "drop"
  ) %>%
  mutate(rate = round(approved / total * 100, 1))

# Bars and their labels share one dodge so each label sits over its own bar
edu_dodge <- position_dodge(width = 0.65)
edu_fill  <- c("Graduate"     = COL_APPROVED,
               "Not Graduate" = COL_REJECTED)

p4_labels <- labs(
  title    = "Approval Rate by Income Bracket and Education",
  subtitle = "Does being a graduate change your odds at each income level?",
  x        = "Income Bracket",
  y        = "Approval Rate",
  fill     = "Education",
  caption  = "Income brackets split by tercile (Low / Mid / High)"
)

p4 <- ggplot(data = bracket_edu_data,
             mapping = aes(x = income_bracket, y = rate, fill = education)) +
  geom_col(position = edu_dodge, width = 0.6, alpha = 0.92) +
  geom_text(aes(label = paste0(rate, "%")),
            position = edu_dodge,
            vjust = -0.5, size = 3.8,
            fontface = "bold", color = COL_NAVY) +
  # Dashed 50% reference line
  geom_hline(yintercept = 50, linetype = "dashed",
             color = COL_SLATE, linewidth = 0.5) +
  scale_fill_manual(values = edu_fill) +
  scale_y_continuous(limits = c(0, 100),
                     expand  = expansion(mult = c(0, 0.1)),
                     labels  = label_percent(scale = 1)) +
  p4_labels +
  theme_datathon()

ggplotly(p4, tooltip = c("x", "y", "fill"))

save_png(p4, "04_approval_by_bracket_education")

## ✓ Saved: 04_approval_by_bracket_education .png

Chart 5 — Feature Importance Comparison

Side-by-side importance scores from all three models — shows which features are consistently important vs model-dependent.

# Logistic regression — absolute z-statistic as importance proxy.
#
# glm() names the coefficients of categorical predictors after their dummy
# columns (variable name + level), which never equal the plain variable names
# reported by the tree and the forest, so those predictors would silently
# drop out of any cross-model comparison. The model matrix's "assign"
# attribute gives, for each coefficient column, the index of the formula term
# it came from (0 = intercept); that is used to map every coefficient back to
# its source variable.
logit_mm <- model.matrix(logit_model)
logit_term_to_feature <- setNames(
  c("(Intercept)",
    attr(terms(logit_model), "term.labels"))[attr(logit_mm, "assign") + 1],
  colnames(logit_mm)
)

logit_imp <- broom::tidy(logit_model) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    # Fall back to the raw term if it is somehow missing from the lookup
    feature    = coalesce(unname(logit_term_to_feature[term]), term),
    importance = abs(statistic)
  ) %>%
  # A multi-level factor contributes several coefficients: keep the strongest
  group_by(feature) %>%
  summarise(
    importance = if (all(is.na(importance))) {
      NA_real_
    } else {
      max(importance, na.rm = TRUE)
    },
    .groups = "drop"
  ) %>%
  mutate(model = "Logistic Regression") %>%
  select(feature, importance, model)

# Decision tree — importance vector stored on the fitted rpart object
tree_imp <- tree_model$variable.importance %>%
  enframe(name = "feature", value = "importance") %>%
  mutate(model = "Decision Tree")

# Random forest — Mean Decrease Gini
rf_imp <- importance(rf_model) %>%
  as.data.frame() %>%
  rownames_to_column("feature") %>%
  select(feature, importance = MeanDecreaseGini) %>%
  mutate(model = "Random Forest")
# Normalise each model's scores to 0-100 for fair comparison
#
# x: numeric vector of raw importance scores.
# Returns a vector of the same length, min-max rescaled so the smallest
# non-missing score maps to 0 and the largest to 100.
# Edge cases:
#   * NA scores are ignored when finding the range and stay NA in the result,
#     so one missing score no longer turns the whole vector into NA.
#   * If all non-missing scores are equal (including a single score) the range
#     is zero; each is returned as 100 (tied for top) instead of NaN from 0/0.
#   * Empty or all-NA input returns NA_real_ values of the same length.
normalise <- function(x) {
  present <- !is.na(x)
  if (!any(present)) {
    return(rep(NA_real_, length(x)))
  }

  lo <- min(x[present])
  hi <- max(x[present])
  if (hi == lo) {
    return(ifelse(present, 100, NA_real_))
  }

  (x - lo) / (hi - lo) * 100
}

# Stack the three importance tables and rescale within each model
imp_all <- bind_rows(logit_imp, tree_imp, rf_imp) %>%
  group_by(model) %>%
  mutate(importance_norm = round(normalise(importance), 1)) %>%
  ungroup()

# Keep only features that appear in all three models
feature_counts  <- count(imp_all, feature)
common_features <- feature_counts$feature[feature_counts$n == 3]

# mean_imp = average normalised score per feature; used only to order bars
imp_plot_data <- imp_all %>%
  filter(feature %in% common_features) %>%
  group_by(feature) %>%
  mutate(mean_imp = mean(importance_norm)) %>%
  ungroup()

model_fill <- c(
  "Logistic Regression" = COL_APPROVED,
  "Decision Tree"       = COL_REJECTED,
  "Random Forest"       = COL_NAVY
)

p5_labels <- labs(
  title    = "Feature Importance — All Three Models",
  subtitle = "Normalised to 0–100 for cross-model comparison",
  x        = NULL,
  y        = "Normalised Importance Score",
  fill     = "Model",
  caption  = "Logistic: |z-statistic| | Tree & RF: variable importance (Gini)"
)

# Horizontal grouped bars: features ordered by mean_imp, one bar per model
p5 <- ggplot(data = imp_plot_data,
             mapping = aes(x = reorder(feature, mean_imp),
                           y = importance_norm,
                           fill = model)) +
  geom_col(position = position_dodge(width = 0.75),
           width = 0.7, alpha = 0.92) +
  coord_flip() +
  scale_fill_manual(values = model_fill) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.08)),
                     labels = label_number(suffix = "%")) +
  p5_labels +
  theme_datathon() +
  theme(legend.position = "top")

ggplotly(p5, tooltip = c("x", "y", "fill"))

save_png(p5, "05_feature_importance_comparison", height = 7)

## ✓ Saved: 05_feature_importance_comparison .png

Chart 6 — ROC Curve Overlay

All three models on one chart — the clearest single-slide evidence of model performance.

# Legend labels carry each model's AUC. They are built once so the data and
# the colour scale cannot drift apart.
roc_labels <- c(
  logit = paste0("Logistic Regression (AUC = ", logit_auc, ")"),
  tree  = paste0("Decision Tree (AUC = ", tree_auc, ")"),
  rf    = paste0("Random Forest (AUC = ", rf_auc, ")")
)

# Long table of ROC coordinates, one row per threshold per model.
# Rows are sorted by FPR and then TPR so each curve is traced monotonically
# from (0, 0) to (1, 1). A ROC curve has vertical steps (several TPR values at
# the same FPR); geom_line() only re-sorts by x, which can traverse those
# steps backwards and join them with spurious diagonals. Sorting explicitly
# and drawing with geom_path() (which connects rows in data order) avoids
# that.
roc_all <- bind_rows(
  data.frame(fpr = 1 - logit_roc$specificities,
             tpr = logit_roc$sensitivities,
             model = roc_labels[["logit"]]),
  data.frame(fpr = 1 - tree_roc$specificities,
             tpr = tree_roc$sensitivities,
             model = roc_labels[["tree"]]),
  data.frame(fpr = 1 - rf_roc$specificities,
             tpr = rf_roc$sensitivities,
             model = roc_labels[["rf"]])
) %>%
  arrange(model, fpr, tpr)

p6 <- ggplot(roc_all, aes(x = fpr, y = tpr, color = model)) +
  geom_path(linewidth = 1.2) +
  # Diagonal = performance of a random classifier
  geom_abline(linetype = "dashed", color = COL_SLATE, linewidth = 0.6) +
  annotate("text", x = 0.72, y = 0.08,
           label = "Random classifier", size = 3.2,
           color = COL_SLATE, fontface = "italic") +
  scale_color_manual(values = setNames(
    c(COL_APPROVED, COL_REJECTED, COL_NAVY),
    unname(roc_labels)
  )) +
  scale_x_continuous(labels = label_percent()) +
  scale_y_continuous(labels = label_percent()) +
  labs(
    title    = "ROC Curve Comparison — All Three Models",
    subtitle = "Closer to the top-left corner = better discrimination",
    x        = "False Positive Rate (1 − Specificity)",
    y        = "True Positive Rate (Sensitivity)",
    color    = NULL,
    caption  = "AUC = Area Under the Curve | Higher is better | Max = 1.0"
  ) +
  theme_datathon()

ggplotly(p6, tooltip = c("x", "y", "color"))

save_png(p6, "06_roc_overlay")

## ✓ Saved: 06_roc_overlay .png

Export Summary

# Every PNG currently present in the visuals folder (file names only)
png_files <- list.files(path = "visuals", pattern = "\\.png$",
                        full.names = FALSE)

# One row per exported file, numbered in listing order
export_summary <- tibble(
  `#`    = seq_along(png_files),
  File   = png_files,
  Status = "✓ Ready for PowerPoint"
)

# Render as a compact striped table
kable_styling(
  kable(export_summary),
  bootstrap_options = c("striped", "hover"),
  full_width = FALSE
)

#	File	Status
1	01_cibil_vs_approval.png	✓ Ready for PowerPoint
2	02_approval_by_cibil_band.png	✓ Ready for PowerPoint
3	03_loan_vs_income_scatter.png	✓ Ready for PowerPoint
4	04_approval_by_bracket_education.png	✓ Ready for PowerPoint
5	05_feature_importance_comparison.png	✓ Ready for PowerPoint
6	06_roc_overlay.png	✓ Ready for PowerPoint

# Blank line, then the confirmation message
writeLines(c("", "All PNGs saved to /visuals folder."))

## 
## All PNGs saved to /visuals folder.

# Reminder of the PowerPoint menu path for inserting the exported images
writeLines("Insert into PowerPoint via: Insert → Pictures → This Device")

## Insert into PowerPoint via: Insert → Pictures → This Device

All six presentation visuals complete. Drop the PNGs from the /visuals folder directly into your PowerPoint template.

Presentation-Ready Visuals

Datathon Practice 2026 | Loan Approval Dataset

Madison G

2026-03-08