Presentation-ready visuals built in the competition color palette — navy, blue, and amber. Every chart is rendered interactively via plotly and exported as a high-resolution PNG for direct use in PowerPoint.
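Charts included:
1. CIBIL score distribution by loan outcome
2. Approval rate by CIBIL score band
3. Loan amount vs. annual income scatter
4. Approval rate by income bracket and education
5. Feature importance comparison across the three models
6. ROC curve overlay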
library(tidyverse)
library(plotly)
library(scales)
library(kableExtra)
library(randomForest)
library(rpart)
library(pROC)
# Competition palette — navy / blue / amber
# Shared colour constants referenced by theme_datathon() and every chart below.
COL_APPROVED <- "#1E5FAD"  # blue: "Approved", "Graduate", "Logistic Regression"
COL_REJECTED <- "#F4A62A"  # amber: "Rejected", "Not Graduate", "Decision Tree"
COL_NAVY <- "#0D1B3E"      # navy: titles, axis titles, bar labels, "Random Forest"
COL_BLUE_MID <- "#2E78D2"  # mid blue: "Very Good (700-800)" CIBIL band
COL_LIGHT <- "#C8DEFF"     # light blue: "Good (600-700)" CIBIL band
COL_SLATE <- "#64748B"     # slate: subtitles, captions, axis text, reference lines
#' Shared ggplot2 theme for the presentation charts.
#'
#' Starts from `theme_minimal()` and overrides text colours, legend placement,
#' grid lines and backgrounds using the palette constants.
#'
#' @param base_size Base font size handed to `theme_minimal()`. The title is
#'   set 2 larger and the subtitle 1 smaller than this value.
#' @return A ggplot2 theme object, to be added to a plot with `+`.
theme_datathon <- function(base_size = 13) {
  # Elements reused by several theme slots
  navy_bold <- element_text(face = "bold", color = COL_NAVY)
  slate_plain <- element_text(color = COL_SLATE)
  white_fill <- element_rect(fill = "white", color = NA)

  overrides <- theme(
    # Text
    plot.title = element_text(
      face = "bold", color = COL_NAVY, size = base_size + 2
    ),
    plot.subtitle = element_text(color = COL_SLATE, size = base_size - 1),
    plot.caption = element_text(color = COL_SLATE, size = 9, hjust = 0),
    axis.text = slate_plain,
    axis.title = navy_bold,
    # Legend
    legend.position = "top",
    legend.title = navy_bold,
    # Grid: major lines only, in a pale grey
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "#E2E8F0"),
    # Facet strips and backgrounds
    strip.text = navy_bold,
    plot.background = white_fill,
    panel.background = white_fill
  )

  theme_minimal(base_size = base_size) + overrides
}
# PNG export helper — saves a ggplot into the "visuals" folder, which is
# created relative to the current working directory if it does not exist yet.
dir.create("visuals", showWarnings = FALSE)

#' Save a ggplot as a PNG inside the "visuals" directory.
#'
#' @param plot A ggplot object to export.
#' @param filename File name WITHOUT extension; ".png" is appended.
#' @param width,height Image size, passed straight to `ggsave()` (which
#'   interprets them in its default unit).
#' @param dpi Output resolution passed to `ggsave()`.
#' @return The path of the written file, invisibly.
save_png <- function(plot, filename, width = 10, height = 6, dpi = 300) {
  out_path <- file.path("visuals", paste0(filename, ".png"))

  ggsave(
    filename = out_path,
    plot = plot,
    width = width,
    height = height,
    dpi = dpi,
    bg = "white"
  )

  # sep = "" is required: cat() separates its arguments with a space by
  # default, which previously printed "name .png" instead of "name.png".
  cat("✓ Saved: ", filename, ".png\n", sep = "")
  invisible(out_path)
}

load("loan_clean.RData")
# Rebuild train/test split (matches 03_modeling.Rmd)
# The seed is set immediately before the only random step of the split
# (createDataPartition); nothing in between consumes random numbers.
set.seed(2026)
model_df <- df %>%
  select(loan_status, cibil_score, income_annum, loan_amount, loan_term,
         no_of_dependents, education, self_employed,
         residential_assets_value, commercial_assets_value,
         luxury_assets_value, bank_asset_value,
         debt_to_income, total_assets, loan_to_asset) %>%
  drop_na()

library(caret)
train_idx <- createDataPartition(model_df$loan_status, p = 0.75, list = FALSE)
train_df <- model_df[train_idx, ]
test_df <- model_df[-train_idx, ]

# Refit models
logit_model <- glm(loan_status ~ ., data = train_df, family = binomial())
tree_model <- rpart(loan_status ~ ., data = train_df, method = "class",
                    control = rpart.control(cp = 0.005, maxdepth = 5))
set.seed(2026)  # re-seed so the forest does not depend on the steps above
rf_model <- randomForest(loan_status ~ ., data = train_df,
                         ntree = 500, importance = TRUE)

# Predictions & ROC objects — every score below is P(loan_status == "Approved")
logit_probs <- predict(logit_model, test_df, type = "response")
# For a factor response, glm(binomial) treats the FIRST level as "failure" and
# models the probability of the remaining level(s). If "Approved" is the first
# level, the raw prediction is P(not Approved) and has to be flipped.
if (identical(levels(train_df$loan_status)[1], "Approved")) {
  logit_probs <- 1 - logit_probs
}
tree_probs <- predict(tree_model, test_df, type = "prob")[, "Approved"]
rf_probs <- predict(rf_model, test_df, type = "prob")[, "Approved"]
actual_bin <- as.numeric(test_df$loan_status == "Approved")

# State controls (0) / cases (1) and the direction explicitly. With the
# default direction = "auto", pROC silently flips the comparison whenever that
# yields a higher AUC, which can hide an inverted score.
logit_roc <- roc(actual_bin, logit_probs,
                 levels = c(0, 1), direction = "<", quiet = TRUE)
tree_roc <- roc(actual_bin, tree_probs,
                levels = c(0, 1), direction = "<", quiet = TRUE)
rf_roc <- roc(actual_bin, rf_probs,
              levels = c(0, 1), direction = "<", quiet = TRUE)

logit_auc <- round(as.numeric(auc(logit_roc)), 3)
tree_auc <- round(as.numeric(auc(tree_roc)), 3)
rf_auc <- round(as.numeric(auc(rf_roc)), 3)

# NOTE(review): the recorded run below reports AUC = 1 for the tree and the
# forest. A perfect score on held-out data is unusual — confirm that none of
# the predictors encodes the outcome before presenting these numbers.
cat("Models ready. AUC — Logistic:", logit_auc,
    "| Tree:", tree_auc, "| RF:", rf_auc, "\n")
## Models ready. AUC — Logistic: 0.974 | Tree: 1 | RF: 1
CIBIL score is the single strongest predictor of loan approval. This is your lead slide visual.
# Chart 1 — CIBIL score distribution per loan outcome.
# Built up in stages: canvas, geometry, scales, then labels and theme.
p1 <- ggplot(df, aes(x = loan_status, y = cibil_score, fill = loan_status))

# Geometry: density violin, a slim white box for the IQR, and a diamond
# (shape 18) marking each group's mean.
p1 <- p1 +
  geom_violin(trim = FALSE, color = NA, alpha = 0.55) +
  geom_boxplot(
    width = 0.12,
    linewidth = 0.7,
    outlier.shape = NA,
    fill = "white",
    color = COL_NAVY
  ) +
  stat_summary(
    geom = "point",
    fun = mean,
    shape = 18,
    size = 3.5,
    color = COL_NAVY
  )

# Scales: outcome colours and a y axis ticked every 100 points from 300 to 900
p1 <- p1 +
  scale_fill_manual(
    values = c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)
  ) +
  scale_y_continuous(breaks = seq(300, 900, by = 100))

# Labels and shared theme
p1 <- p1 +
  labs(
    title = "CIBIL Score Distribution by Loan Outcome",
    subtitle = "Approved applicants consistently score higher — the gap is decisive",
    x = NULL,
    y = "CIBIL Score",
    fill = NULL,
    caption = "Diamond = group mean | Box = IQR | Width = density"
  ) +
  theme_datathon()

# Interactive version; the hover shows the y value and the fill group
ggplotly(p1, tooltip = c("y", "fill"))
## ✓ Saved: 01_cibil_vs_approval .png
Shows the stepwise relationship between credit tier and approval — clean story for judges.
# Chart 2 — approval rate per CIBIL band.
# Count applicants and approvals per band first, then derive the rate (%).
cibil_band_data <- df %>%
  group_by(cibil_band) %>%
  summarise(
    total = n(),
    approved = sum(loan_status == "Approved"),
    .groups = "drop"
  ) %>%
  mutate(rate = round(approved / total * 100, 1))

p2 <- ggplot(cibil_band_data, aes(x = cibil_band, y = rate, fill = cibil_band))

# Bars with their percentage printed on top
p2 <- p2 +
  geom_col(width = 0.65, alpha = 0.92, show.legend = FALSE) +
  geom_text(
    aes(label = paste0(rate, "%")),
    vjust = -0.6,
    size = 4.5,
    fontface = "bold",
    color = COL_NAVY
  )

# Dashed 50% reference line and its caption at the left edge
p2 <- p2 +
  geom_hline(
    yintercept = 50,
    linetype = "dashed",
    linewidth = 0.6,
    color = COL_SLATE
  ) +
  annotate(
    "text",
    x = 0.6, y = 52,
    label = "50% baseline",
    hjust = 0,
    size = 3.2,
    color = COL_SLATE
  )

# Band colours run from amber (lowest band) to navy (highest band). The y axis
# stops at 115 so the labels above the tallest bars are not clipped.
p2 <- p2 +
  scale_fill_manual(
    values = c(
      "Poor (300-500)" = COL_REJECTED,
      "Fair (500-600)" = "#F4C06A",
      "Good (600-700)" = COL_LIGHT,
      "Very Good (700-800)" = COL_BLUE_MID,
      "Excellent (800-900)" = COL_NAVY
    )
  ) +
  scale_y_continuous(
    limits = c(0, 115),
    expand = expansion(mult = c(0, 0)),
    labels = label_percent(scale = 1)
  )

p2 <- p2 +
  labs(
    title = "Loan Approval Rate by CIBIL Score Band",
    subtitle = "Clear stepwise pattern — credit score is the decisive factor",
    x = "CIBIL Score Band",
    y = "Approval Rate",
    caption = paste0("n = ", nrow(df), " applicants")
  ) +
  theme_datathon()

ggplotly(p2, tooltip = c("x", "y"))
## ✓ Saved: 02_approval_by_cibil_band .png
Scatter plot colored by approval outcome — reveals whether the income-to-loan ratio drives decisions.
# Chart 3 — loan amount vs annual income, coloured by approval outcome.
set.seed(42)
df_sample <- df %>% slice_sample(n = 1500) # sample for readable scatter

p3 <- ggplot(df_sample,
             aes(x = income_annum / 1e6,
                 y = loan_amount / 1e6,
                 color = loan_status)) +
  # The hover `text` aesthetic is mapped on the point layer ONLY. Mapped in
  # ggplot(), it is a discrete variable and therefore becomes part of the
  # implicit grouping, so geom_smooth() would be fitted per point instead of
  # per loan status and the trend lines would not be drawn. ggplot2 warns
  # that `text` is unknown to geom_point(); plotly still picks it up.
  geom_point(
    aes(text = paste0(
      "Income: ₹", round(income_annum / 1e6, 2), "M\n",
      "Loan: ₹", round(loan_amount / 1e6, 2), "M\n",
      "Status: ", loan_status
    )),
    alpha = 0.55, size = 1.8
  ) +
  # One linear trend per loan status (grouped by the colour mapping). The
  # formula is spelled out only to silence ggplot2's "using formula" message.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, linewidth = 1.1) +
  scale_color_manual(values = c("Approved" = COL_APPROVED,
                                "Rejected" = COL_REJECTED)) +
  scale_x_continuous(labels = label_dollar(prefix = "₹", suffix = "M")) +
  scale_y_continuous(labels = label_dollar(prefix = "₹", suffix = "M")) +
  labs(
    title = "Loan Amount vs Annual Income by Approval Status",
    subtitle = "Do higher earners borrow more — and does it affect approval?",
    x = "Annual Income (₹ Millions)",
    y = "Loan Amount (₹ Millions)",
    color = NULL,
    # Report the rows actually plotted: slice_sample() returns fewer than
    # 1,500 rows when df is smaller than that.
    caption = paste0(
      "Sample of ", format(nrow(df_sample), big.mark = ","),
      " applicants shown for readability"
    )
  ) +
  theme_datathon()

ggplotly(p3, tooltip = "text")
## ✓ Saved: 03_loan_vs_income_scatter .png
Side-by-side (grouped) bars showing whether income level and education jointly influence approval.
# Chart 4 — approval rate per income bracket, split by education.
# Rows without an income bracket are excluded before aggregating.
bracket_edu_data <- df %>%
  filter(!is.na(income_bracket)) %>%
  group_by(income_bracket, education) %>%
  summarise(
    total = n(),
    approved = sum(loan_status == "Approved"),
    .groups = "drop"
  ) %>%
  mutate(rate = round(approved / total * 100, 1))

p4 <- ggplot(
  bracket_edu_data,
  aes(x = income_bracket, y = rate, fill = education)
)

# Dodged bars per education group; the labels use the same dodge width so
# each percentage sits above its own bar.
p4 <- p4 +
  geom_col(
    position = position_dodge(width = 0.65),
    width = 0.6,
    alpha = 0.92
  ) +
  geom_text(
    aes(label = paste0(rate, "%")),
    position = position_dodge(width = 0.65),
    vjust = -0.5,
    size = 3.8,
    fontface = "bold",
    color = COL_NAVY
  )

# Dashed 50% reference line
p4 <- p4 +
  geom_hline(
    yintercept = 50,
    linetype = "dashed",
    linewidth = 0.5,
    color = COL_SLATE
  )

# Education colours and a 0–100 percentage axis with 10% headroom on top
p4 <- p4 +
  scale_fill_manual(
    values = c("Graduate" = COL_APPROVED, "Not Graduate" = COL_REJECTED)
  ) +
  scale_y_continuous(
    limits = c(0, 100),
    expand = expansion(mult = c(0, 0.1)),
    labels = label_percent(scale = 1)
  )

p4 <- p4 +
  labs(
    title = "Approval Rate by Income Bracket and Education",
    subtitle = "Does being a graduate change your odds at each income level?",
    x = "Income Bracket",
    y = "Approval Rate",
    fill = "Education",
    caption = "Income brackets split by tercile (Low / Mid / High)"
  ) +
  theme_datathon()

ggplotly(p4, tooltip = c("x", "y", "fill"))
## ✓ Saved: 04_approval_by_bracket_education .png
Side-by-side importance scores from all three models — shows which features are consistently important vs model-dependent.
# Chart 5 — feature importance from all three models on a common 0-100 scale.

# Logistic regression — absolute z-statistic as importance proxy.
# glm() names the coefficient of a categorical predictor "<variable><level>"
# (e.g. "educationNot Graduate"), which never matches the plain variable names
# reported by rpart / randomForest. Each coefficient is therefore mapped back
# to its source variable; a variable with several coefficients keeps its
# largest |z|.
model_terms <- attr(terms(logit_model), "term.labels")
logit_imp <- broom::tidy(logit_model) %>%
  # Skip the intercept and any coefficient without a test statistic
  filter(term != "(Intercept)", !is.na(statistic)) %>%
  mutate(
    feature = vapply(
      term,
      function(tm) {
        hits <- model_terms[startsWith(tm, model_terms)]
        # Longest matching variable name wins; unmatched terms stay as they are
        if (length(hits) == 0) tm else hits[which.max(nchar(hits))]
      },
      character(1),
      USE.NAMES = FALSE
    )
  ) %>%
  group_by(feature) %>%
  summarise(importance = max(abs(statistic)), .groups = "drop") %>%
  mutate(model = "Logistic Regression")

# Decision tree
tree_imp <- tree_model$variable.importance %>%
  enframe(name = "feature", value = "importance") %>%
  mutate(model = "Decision Tree")

# Random forest — Mean Decrease Gini
rf_imp <- importance(rf_model) %>%
  as.data.frame() %>%
  rownames_to_column("feature") %>%
  select(feature, importance = MeanDecreaseGini) %>%
  mutate(model = "Random Forest")

#' Min-max rescale a numeric vector to the range 0-100.
#'
#' @param x Numeric vector of raw importance scores.
#' @return A vector the same length as `x`: the smallest value maps to 0 and
#'   the largest to 100. `NA`s stay `NA`. When `x` is empty, all `NA`, or all
#'   values are equal, there is no spread to rescale and 0 is returned for
#'   every element instead of `NaN`.
normalise <- function(x) {
  if (length(x) == 0 || all(is.na(x))) {
    return(rep(0, length(x)))
  }
  rng <- range(x, na.rm = TRUE)
  spread <- rng[2] - rng[1]
  if (spread == 0) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / spread * 100
}

# Normalise each model's scores to 0-100 for fair comparison
imp_all <- bind_rows(logit_imp, tree_imp, rf_imp) %>%
  group_by(model) %>%
  mutate(importance_norm = round(normalise(importance), 1)) %>%
  ungroup()

# Keep only features that appear in all three models
common_features <- imp_all %>%
  count(feature) %>%
  filter(n == 3) %>%
  pull(feature)

# mean_imp (average normalised score across models) is used only to order
# the bars in the plot.
imp_plot_data <- imp_all %>%
  filter(feature %in% common_features) %>%
  group_by(feature) %>%
  mutate(mean_imp = mean(importance_norm)) %>%
  ungroup()

p5 <- ggplot(imp_plot_data,
             aes(x = reorder(feature, mean_imp),
                 y = importance_norm,
                 fill = model)) +
  geom_col(position = position_dodge(width = 0.75),
           width = 0.7, alpha = 0.92) +
  coord_flip() +
  scale_fill_manual(values = c(
    "Logistic Regression" = COL_APPROVED,
    "Decision Tree" = COL_REJECTED,
    "Random Forest" = COL_NAVY
  )) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.08)),
                     labels = label_number(suffix = "%")) +
  labs(
    title = "Feature Importance — All Three Models",
    subtitle = "Normalised to 0–100 for cross-model comparison",
    x = NULL,
    y = "Normalised Importance Score",
    fill = "Model",
    caption = "Logistic: |z-statistic| | Tree & RF: variable importance (Gini)"
  ) +
  theme_datathon() +
  theme(legend.position = "top")

ggplotly(p5, tooltip = c("x", "y", "fill"))
## ✓ Saved: 05_feature_importance_comparison .png
All three models on one chart — the clearest single-slide evidence of model performance.
# Chart 6 — ROC curves of all three models on one set of axes.

# Legend labels (model name + AUC), built once and reused for both the data
# and the colour scale so the two can never drift apart.
roc_labels <- c(
  paste0("Logistic Regression (AUC = ", logit_auc, ")"),
  paste0("Decision Tree (AUC = ", tree_auc, ")"),
  paste0("Random Forest (AUC = ", rf_auc, ")")
)

# One row per ROC point: false positive rate, true positive rate, model label
roc_all <- bind_rows(Map(
  function(roc_obj, label) {
    data.frame(fpr = 1 - roc_obj$specificities,
               tpr = roc_obj$sensitivities,
               model = label)
  },
  list(logit_roc, tree_roc, rf_roc),
  roc_labels
)) %>%
  # Order every curve from (0, 0) towards (1, 1). An ROC curve has vertical
  # segments (several TPR values at the same FPR); sorting on both columns
  # keeps those points in the order the curve is traversed.
  arrange(model, fpr, tpr)

p6 <- ggplot(roc_all, aes(x = fpr, y = tpr, color = model)) +
  # geom_path() connects points in data order. geom_line() would re-sort by x
  # only, leaving tied FPR values in arbitrary order and drawing spurious
  # diagonals across the vertical segments.
  geom_path(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = COL_SLATE, linewidth = 0.6) +
  annotate("text", x = 0.72, y = 0.08,
           label = "Random classifier", size = 3.2,
           color = COL_SLATE, fontface = "italic") +
  scale_color_manual(values = setNames(
    c(COL_APPROVED, COL_REJECTED, COL_NAVY),
    roc_labels
  )) +
  scale_x_continuous(labels = label_percent()) +
  scale_y_continuous(labels = label_percent()) +
  labs(
    title = "ROC Curve Comparison — All Three Models",
    subtitle = "Closer to the top-left corner = better discrimination",
    x = "False Positive Rate (1 − Specificity)",
    y = "True Positive Rate (Sensitivity)",
    color = NULL,
    caption = "AUC = Area Under the Curve | Higher is better | Max = 1.0"
  ) +
  theme_datathon()

ggplotly(p6, tooltip = c("x", "y", "color"))
## ✓ Saved: 06_roc_overlay .png
# Inventory of the exported charts: every file in "visuals" whose name ends in
# ".png" (file names only, without the directory part).
png_files <- list.files("visuals", pattern = "\\.png$", full.names = FALSE)

# One row per file, rendered as a compact striped / hover-highlighted table
kable_styling(
  kable(
    tibble(
      `#` = seq_along(png_files),
      File = png_files,
      Status = "✓ Ready for PowerPoint"
    )
  ),
  bootstrap_options = c("striped", "hover"),
  full_width = FALSE
)
| # | File | Status |
|---|---|---|
| 1 | 01_cibil_vs_approval.png | ✓ Ready for PowerPoint |
| 2 | 02_approval_by_cibil_band.png | ✓ Ready for PowerPoint |
| 3 | 03_loan_vs_income_scatter.png | ✓ Ready for PowerPoint |
| 4 | 04_approval_by_bracket_education.png | ✓ Ready for PowerPoint |
| 5 | 05_feature_importance_comparison.png | ✓ Ready for PowerPoint |
| 6 | 06_roc_overlay.png | ✓ Ready for PowerPoint |
##
## All PNGs saved to /visuals folder.
## Insert into PowerPoint via: Insert → Pictures → This Device
All six presentation visuals complete. Drop the PNGs from the
/visuals folder directly into your PowerPoint
template.