This report explores the Loan Approval Dataset
following the cleaning pipeline in 01_cleaning.Rmd. The
goal is to understand the shape of the data, identify the strongest
predictors of loan approval, and surface patterns to inform
modeling.
EDA workflow:
# Competition color palette — navy / blue / amber
# Hex codes shared by the plots and tables below.
# Blue: fill for the "Approved" loan_status level; also the default bar fill.
COL_APPROVED <- "#1E5FAD"
# Amber: fill for the "Rejected" loan_status level.
COL_REJECTED <- "#F4A62A"
# Navy: titles, axis titles, facet strip text and in-plot value labels.
COL_NAVY <- "#0D1B3E"
# Light blue: not referenced by name in the code shown here; the same hex
# appears literally as the highlight background of the kable tables.
COL_LIGHT <- "#C8DEFF"
# Shared ggplot2 theme for the report.
#
# Starts from theme_minimal() at base size 13 and layers on navy bold titles,
# slate-grey secondary text, a top-positioned legend and no minor grid lines.
# Takes no arguments; returns a theme object to add to a plot with `+`.
theme_datathon <- function() {
  slate_grey <- "#64748B"

  # Bold navy text is used for every heading-like element.
  navy_bold <- function(...) {
    element_text(face = "bold", color = COL_NAVY, ...)
  }

  overrides <- theme(
    plot.title = navy_bold(size = 14),
    plot.subtitle = element_text(color = slate_grey, size = 11),
    axis.text = element_text(color = slate_grey),
    axis.title = navy_bold(),
    legend.position = "top",
    panel.grid.minor = element_blank(),
    strip.text = navy_bold()
  )

  theme_minimal(base_size = 13) + overrides
}
# Load clean dataset from 01_cleaning.Rmd output
# load() restores objects under the names they were saved with and returns
# those names. If the file does not contain `df`, the name silently resolves
# to stats::df (the F-distribution density function): nrow()/ncol() then
# return NULL, the banner prints blank, and the failure only surfaces later
# with a confusing message. Fail fast here instead.
loaded_objects <- load("loan_clean.RData")
if (!"df" %in% loaded_objects || !is.data.frame(df)) {
  stop(
    "loan_clean.RData must contain a data frame named `df`; found: ",
    paste(loaded_objects, collapse = ", "),
    call. = FALSE
  )
}
cat("Dataset loaded:", nrow(df), "rows ×", ncol(df), "columns")
## Dataset loaded: 4269 rows × 17 columns
How balanced is the outcome variable? This sets the baseline for all downstream analysis.
# Class balance of the outcome: one row per loan_status with its row count
# (n), its share of all rows in percent to one decimal (pct), and a two-line
# display label combining the two.
target_summary <- mutate(
  count(df, loan_status),
  pct = round(n / sum(n) * 100, 1),
  label = paste0(loan_status, "\n", n, " (", pct, "%)")
)
# Render the class-balance table (status, count, percent) as a compact
# striped table; the display-only `label` column is left out.
kable_styling(
  kable(
    select(target_summary, loan_status, n, pct),
    col.names = c("Loan Status", "Count", "%")
  ),
  bootstrap_options = c("striped", "hover"),
  full_width = FALSE
)

| Loan Status | Count | % |
|---|---|---|
| Rejected | 1613 | 37.8 |
| Approved | 2656 | 62.2 |
# Bar chart of the outcome split: one bar per loan_status, annotated with
# its count and percentage. The y axis gets 15% headroom at the top so the
# labels drawn above the bars are not clipped.
p_target <- ggplot(
  target_summary,
  aes(x = loan_status, y = n, fill = loan_status)
) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(
    aes(label = paste0(n, "\n(", pct, "%)")),
    vjust = -0.4,
    fontface = "bold",
    size = 4.5,
    color = COL_NAVY
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  scale_fill_manual(
    values = c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)
  ) +
  labs(
    title = "Loan Approval Outcome Distribution",
    subtitle = "Slightly imbalanced — Approved majority",
    x = NULL,
    y = "Count"
  ) +
  theme_datathon()

# Interactive version; the legend is hidden on the plotly side as well.
layout(
  ggplotly(p_target, tooltip = c("x", "y")),
  showlegend = FALSE
)

Takeaway: The dataset is moderately imbalanced (~62% Approved, ~38% Rejected). Worth noting in modeling but unlikely to require aggressive resampling.
# Numeric features examined throughout the report.
numeric_cols <- c(
  "cibil_score", "income_annum", "loan_amount", "loan_term",
  "no_of_dependents", "debt_to_income", "total_assets", "loan_to_asset"
)

# Long format (one row per observation x variable) so the histograms can be
# faceted by variable name.
df_long_num <- pivot_longer(
  select(df, all_of(numeric_cols)),
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)
# Faceted histograms: one 30-bin panel per numeric variable, four panels per
# row, with free x and y scales in every panel.
ggplot(df_long_num, aes(x = value)) +
  geom_histogram(
    bins = 30,
    fill = COL_APPROVED,
    color = "white",
    alpha = 0.85
  ) +
  facet_wrap(~ variable, ncol = 4, scales = "free") +
  labs(
    title = "Distribution of Numeric Variables",
    subtitle = "Financial columns are right-skewed; CIBIL score is roughly uniform",
    x = NULL,
    y = "Count"
  ) +
  theme_datathon() +
  # Must come after theme_datathon() so the smaller tick labels take effect.
  theme(axis.text.x = element_text(size = 8))

# Categorical features summarised in the next chart.
cat_cols <- c("education", "self_employed", "cibil_band", "income_bracket")
# Level frequencies for every categorical feature: one row per
# (variable, value) with its count (n) and its within-variable share in
# percent (pct, one decimal).
df_long_cat <- df %>%
  select(all_of(cat_cols)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  count(variable, value) %>%
  group_by(variable) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  # Drop the grouping so later verbs on df_long_cat do not silently run
  # per variable.
  ungroup()
# Horizontal bar chart per categorical feature, bars ordered by count and
# labelled with the within-feature percentage. The count axis gets 20%
# headroom so the labels drawn past the bar ends stay inside the panel.
ggplot(
  df_long_cat,
  aes(x = reorder(value, n), y = n, fill = variable)
) +
  geom_col(show.legend = FALSE, alpha = 0.9) +
  geom_text(
    aes(label = paste0(pct, "%")),
    hjust = -0.15,
    size = 3.2,
    color = COL_NAVY
  ) +
  coord_flip() +
  facet_wrap(~ variable, ncol = 2, scales = "free") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +
  # One fixed colour per feature (fill is mapped to the feature name).
  scale_fill_manual(
    values = c(
      "education" = COL_APPROVED,
      "self_employed" = COL_REJECTED,
      "cibil_band" = "#2E78D2",
      "income_bracket" = COL_NAVY
    )
  ) +
  labs(
    title = "Distribution of Categorical Variables",
    subtitle = "Education and self-employment are near-evenly split",
    x = NULL,
    y = "Count"
  ) +
  theme_datathon()

Examining linear relationships between all numeric features — and importantly, which variables correlate most with each other (multicollinearity check for modeling).
# Correlation matrix of the numeric features, computed on rows with no
# missing values and rounded to two decimals for display.
cor_data <- round(
  cor(select(df, all_of(numeric_cols)), use = "complete.obs"),
  2
)
# Lower-triangle correlation heatmap with the coefficient printed in each
# cell.
#
# `colors` is ordered low / mid / high, so amber (COL_REJECTED) marks negative
# correlations and blue (COL_APPROVED) marks positive ones. The subtitle
# previously described the opposite mapping ("Warm = positive"); it now names
# the colours explicitly so it cannot drift from the palette.
ggcorrplot(
  cor_data,
  method = "square",
  type = "lower",
  lab = TRUE,
  lab_size = 3.5,
  colors = c(COL_REJECTED, "white", COL_APPROVED),
  title = "Correlation Matrix — Numeric Features",
  ggtheme = theme_datathon(),
  outline.color = "white",
  tl.cex = 10
) +
  labs(subtitle = "Blue = positive correlation | Amber = negative | White = near zero")
# Correlation of each numeric feature with a binary loan_status (1 = Approved)
# Correlation of every numeric feature with a 0/1 approval indicator,
# strongest absolute correlation first. `direction` is derived from the
# unrounded coefficient, which is then rounded to three decimals for display.
cor_with_target <- df %>%
  mutate(approved = as.numeric(loan_status == "Approved")) %>%
  select(all_of(numeric_cols), approved) %>%
  cor(use = "complete.obs") %>%
  as_tibble(rownames = "variable") %>%
  # Drop the indicator's correlation with itself.
  filter(variable != "approved") %>%
  select(variable, correlation_with_approval = approved) %>%
  arrange(desc(abs(correlation_with_approval))) %>%
  mutate(
    direction = if_else(correlation_with_approval > 0, "Positive", "Negative"),
    correlation_with_approval = round(correlation_with_approval, 3)
  )
# Render the ranked correlations as a striped table; the first row (the
# strongest correlation, since the data is sorted) is bold and highlighted.
row_spec(
  kable_styling(
    kable(
      cor_with_target,
      col.names = c("Feature", "Correlation with Approval", "Direction")
    ),
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  ),
  1,
  bold = TRUE,
  background = "#C8DEFF"
)

| Feature | Correlation with Approval | Direction |
|---|---|---|
| cibil_score | 0.771 | Positive |
| loan_term | -0.113 | Negative |
| debt_to_income | 0.088 | Positive |
| loan_to_asset | 0.030 | Positive |
| no_of_dependents | -0.018 | Negative |
| loan_amount | 0.016 | Positive |
| income_annum | -0.015 | Negative |
| total_assets | -0.011 | Negative |
Takeaway: `cibil_score` has by far the strongest correlation with approval (r = 0.771, highlighted above); no other numeric feature exceeds |r| ≈ 0.11. This will likely be the headline finding.
# CIBIL score by outcome: a violin per loan_status with a narrow white
# boxplot overlaid; boxplot outlier points are suppressed.
p_cibil <- ggplot(
  df,
  aes(x = loan_status, y = cibil_score, fill = loan_status)
) +
  geom_violin(alpha = 0.6, color = NA) +
  geom_boxplot(
    width = 0.15,
    fill = "white",
    color = COL_NAVY,
    outlier.shape = NA
  ) +
  scale_fill_manual(
    values = c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)
  ) +
  labs(
    title = "CIBIL Score Distribution by Loan Status",
    subtitle = "Higher CIBIL scores are strongly associated with approval",
    x = NULL,
    y = "CIBIL Score",
    fill = NULL
  ) +
  theme_datathon()
# Interactive version of the CIBIL violin plot.
ggplotly(p_cibil, tooltip = c("y", "fill"))

# Annual income by outcome: overlaid semi-transparent density curves, one per
# loan_status, with comma-formatted income on the x axis.
p_income <- ggplot(df, aes(x = income_annum, fill = loan_status)) +
  geom_density(alpha = 0.55, color = NA) +
  scale_x_continuous(labels = label_comma()) +
  scale_fill_manual(
    values = c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)
  ) +
  labs(
    title = "Annual Income Distribution by Loan Status",
    subtitle = "Density overlap between groups",
    x = "Annual Income",
    y = "Density",
    fill = NULL
  ) +
  theme_datathon()
# Interactive version of the income density plot.
ggplotly(p_income, tooltip = c("x", "fill"))

# Approval rate (percent, one decimal) within each CIBIL band, drawn as one
# labelled bar per band. The y axis is fixed at 0-110 with no extra padding,
# leaving room above a 100% bar for its label.
p_band <- df %>%
  group_by(cibil_band) %>%
  summarise(
    total = n(),
    approved = sum(loan_status == "Approved"),
    rate = round(approved / total * 100, 1)
  ) %>%
  ggplot(aes(x = cibil_band, y = rate, fill = cibil_band)) +
  geom_col(show.legend = FALSE, alpha = 0.9) +
  geom_text(
    aes(label = paste0(rate, "%")),
    vjust = -0.5,
    fontface = "bold",
    size = 4,
    color = COL_NAVY
  ) +
  scale_y_continuous(
    limits = c(0, 110),
    expand = expansion(mult = c(0, 0))
  ) +
  # Amber-to-navy ramp keyed by the band labels.
  scale_fill_manual(
    values = c(
      "Poor (300-500)" = "#F4A62A",
      "Fair (500-600)" = "#F4C06A",
      "Good (600-700)" = "#C8DEFF",
      "Very Good (700-800)" = "#2E78D2",
      "Excellent (800-900)" = "#0D1B3E"
    )
  ) +
  labs(
    title = "Approval Rate by CIBIL Score Band",
    subtitle = "Clear stepwise relationship between credit score and approval",
    x = "CIBIL Band",
    y = "Approval Rate (%)"
  ) +
  theme_datathon()
# Interactive version of the CIBIL-band approval-rate chart.
ggplotly(p_band, tooltip = c("x", "y"))

# Approval rate within each level of education and self_employed, one facet
# per feature. Percentages are computed within (variable, group), then only
# the "Approved" share is kept for plotting.
df %>%
  pivot_longer(
    cols = c(education, self_employed),
    names_to = "variable",
    values_to = "group"
  ) %>%
  group_by(variable, group, loan_status) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(variable, group) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  ungroup() %>%
  filter(loan_status == "Approved") %>%
  ggplot(aes(x = group, y = pct, fill = variable)) +
  geom_col(width = 0.5, show.legend = FALSE, alpha = 0.9) +
  geom_text(
    aes(label = paste0(pct, "%")),
    vjust = -0.5, fontface = "bold", size = 4.5, color = COL_NAVY
  ) +
  scale_fill_manual(
    values = c("education" = COL_APPROVED, "self_employed" = COL_REJECTED)
  ) +
  scale_y_continuous(limits = c(0, 90), expand = expansion(mult = c(0, 0))) +
  facet_wrap(~ variable, scales = "free_x") +
  labs(
    title = "Approval Rate by Education & Employment Status",
    subtitle = "Are graduates or self-employed applicants approved at different rates?",
    x = NULL,
    y = "Approval Rate (%)"
  ) +
  theme_datathon()

# Debt-to-income ratio by outcome (violin + boxplot), with the top 1% of
# ratios trimmed for readability only.
# na.rm = TRUE: quantile() stops with an error if the column holds any NA;
# rows with a missing ratio are dropped by filter() either way.
p_dti <- df %>%
  filter(
    debt_to_income < quantile(debt_to_income, 0.99, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = loan_status, y = debt_to_income, fill = loan_status)) +
  geom_violin(alpha = 0.6, color = NA) +
  geom_boxplot(
    width = 0.15, fill = "white", outlier.shape = NA, color = COL_NAVY
  ) +
  scale_fill_manual(
    values = c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)
  ) +
  labs(
    title = "Debt-to-Income Ratio by Loan Status",
    subtitle = "Higher DTI may signal elevated risk",
    x = NULL,
    y = "Debt-to-Income Ratio",
    fill = NULL
  ) +
  theme_datathon()
# Interactive version of the debt-to-income violin plot.
ggplotly(p_dti, tooltip = c("y", "fill"))

# Summary table of key findings and their modeling implications. The first
# two rows (the CIBIL findings) are highlighted with the palette's light blue.
#
# Finding 3 previously claimed a "moderate positive correlation" for income
# and assets; the correlation table in this report shows income_annum at
# -0.015 and total_assets at -0.011, so the wording now matches the data.
tibble(
  `#` = 1:5,
  Finding = c(
    "CIBIL score is the strongest individual predictor of loan approval",
    "Approval rate increases stepwise across CIBIL bands — near 0% at Poor, near 100% at Excellent",
    "Income and asset values show near-zero linear correlation with approval (|r| < 0.02)",
    "Education and self-employment status show minimal difference in approval rates",
    "Debt-to-income and loan-to-asset ratios are worth including as engineered features in modeling"
  ),
  `Implication` = c(
    "Use as primary feature in logistic regression & tree models",
    "Strong candidate for a presentation visual — clear, intuitive story",
    "Include in model but expect lower importance than CIBIL",
    "May not be significant predictors — let the model confirm",
    "Feature engineering added real signal — validate with feature importance"
  )
) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(1:2, background = COL_LIGHT)

| # | Finding | Implication |
|---|---|---|
| 1 | CIBIL score is the strongest individual predictor of loan approval | Use as primary feature in logistic regression & tree models |
| 2 | Approval rate increases stepwise across CIBIL bands — near 0% at Poor, near 100% at Excellent | Strong candidate for a presentation visual — clear, intuitive story |
| 3 | Income and asset values show near-zero linear correlation with approval (\|r\| < 0.02) | Include in model but expect lower importance than CIBIL |
| 4 | Education and self-employment status show minimal difference in approval rates | May not be significant predictors — let the model confirm |
| 5 | Debt-to-income and loan-to-asset ratios are worth including as engineered features in modeling | Feature engineering added real signal — validate with feature importance |
EDA complete. Hand off findings to
03_modeling.Rmd.