Overview

This report explores the Loan Approval Dataset following the cleaning pipeline in 01_cleaning.Rmd. The goal is to understand the shape of the data, identify the strongest predictors of loan approval, and surface patterns to inform modeling.

EDA workflow:

  1. Target variable breakdown
  2. Distributions — numeric & categorical
  3. Correlation matrix & heatmap
  4. Bivariate analysis — features vs loan status

Setup

library(tidyverse)
library(plotly)
library(ggcorrplot)
library(kableExtra)
library(scales)
# Competition color palette (navy / blue / amber) shared by the plots below
COL_NAVY <- "#0D1B3E"      # titles, axis titles and text annotations
COL_APPROVED <- "#1E5FAD"  # blue: fill for the "Approved" class
COL_REJECTED <- "#F4A62A"  # amber: fill for the "Rejected" class
COL_LIGHT <- "#C8DEFF"     # pale blue: same hex as the table row highlights

# Shared ggplot2 theme for this report.
#
# Starts from theme_minimal() at base size 13 and layers on bold navy titles,
# grey secondary text, a legend placed on top and no minor grid lines.
# Returns a theme object that can be added to any ggplot.
theme_datathon <- function() {
  muted_grey <- "#64748B"
  navy_bold <- element_text(face = "bold", color = COL_NAVY)

  report_overrides <- theme(
    plot.title = element_text(face = "bold", color = COL_NAVY, size = 14),
    plot.subtitle = element_text(color = muted_grey, size = 11),
    axis.text = element_text(color = muted_grey),
    axis.title = navy_bold,
    legend.position = "top",
    panel.grid.minor = element_blank(),
    strip.text = navy_bold
  )

  theme_minimal(base_size = 13) + report_overrides
}
# Load the clean dataset written by 01_cleaning.Rmd.
#
# load() restores objects under their saved names and returns those names.
# If the file does not contain `df`, the symbol silently resolves to the
# stats::df() function, nrow()/ncol() return NULL and the message below would
# print blank counts instead of failing, so check the object explicitly.
loaded_objects <- load("loan_clean.RData")

if (!("df" %in% loaded_objects) || !is.data.frame(df)) {
  stop(
    "loan_clean.RData must contain a data frame named `df`; found: ",
    paste(loaded_objects, collapse = ", "),
    call. = FALSE
  )
}

# Trailing newline keeps the message from running into later console output
cat("Dataset loaded:", nrow(df), "rows ×", ncol(df), "columns\n")
## Dataset loaded: 4269 rows × 17 columns

1. Target Variable Breakdown

How balanced is the outcome variable? This sets the baseline for all downstream analysis.

# Class balance of the target: one row per loan_status with its count, its
# percentage share (1 decimal) and a two-line text label.
target_summary <- mutate(
  count(df, loan_status),
  pct = round(n / sum(n) * 100, 1),
  label = paste0(loan_status, "\n", n, " (", pct, "%)")
)

# Show only the status, count and percentage columns as a compact table
kable_styling(
  kable(
    select(target_summary, loan_status, n, pct),
    col.names = c("Loan Status", "Count", "%")
  ),
  bootstrap_options = c("striped", "hover"),
  full_width = FALSE
)
Loan Status Count %
Rejected 1613 37.8
Approved 2656 62.2
# Bar chart of the target balance; every bar is annotated with its count and
# percentage share, then the plot is converted to an interactive plotly widget.
p_target <- ggplot(
  data = target_summary,
  mapping = aes(loan_status, n, fill = loan_status)
) +
  geom_col(show.legend = FALSE, width = 0.5) +
  geom_text(
    mapping = aes(label = paste0(n, "\n(", pct, "%)")),
    color = COL_NAVY,
    fontface = "bold",
    size = 4.5,
    vjust = -0.4
  ) +
  scale_fill_manual(
    values = c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)
  ) +
  # 15% headroom above the tallest bar for the text labels
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    x = NULL,
    y = "Count",
    title = "Loan Approval Outcome Distribution",
    subtitle = "Slightly imbalanced — Approved majority"
  ) +
  theme_datathon()

layout(
  ggplotly(p_target, tooltip = c("x", "y")),
  showlegend = FALSE
)

Takeaway: The dataset is slightly imbalanced (62.2% Approved vs 37.8% Rejected). Worth noting in modeling, but unlikely to require aggressive resampling.


2. Distributions

2a. Numeric Variables

# Numeric features examined in this report (the order is reused by the
# correlation matrix further down)
numeric_cols <- c(
  "cibil_score", "income_annum", "loan_amount", "loan_term",
  "no_of_dependents", "debt_to_income", "total_assets", "loan_to_asset"
)

# Stack the numeric columns into variable / value pairs for faceting
df_long_num <- pivot_longer(
  select(df, all_of(numeric_cols)),
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)

# One 30-bin histogram per variable, each panel with its own axes
ggplot(data = df_long_num, mapping = aes(value)) +
  geom_histogram(
    bins = 30,
    alpha = 0.85,
    color = "white",
    fill = COL_APPROVED
  ) +
  facet_wrap(vars(variable), ncol = 4, scales = "free") +
  labs(
    x = NULL,
    y = "Count",
    title = "Distribution of Numeric Variables",
    subtitle = "Financial columns are right-skewed; CIBIL score is roughly uniform"
  ) +
  theme_datathon() +
  # Applied after the shared theme so the smaller tick labels take effect
  theme(axis.text.x = element_text(size = 8))

2b. Categorical Variables

# Categorical features examined in this report
cat_cols <- c("education", "self_employed", "cibil_band", "income_bracket")

# Stack the categorical columns, count every level and compute each level's
# share within its own variable (the result stays grouped by `variable`).
df_long_cat <- df %>%
  select(all_of(cat_cols)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  count(variable, value) %>%
  group_by(variable) %>%
  mutate(pct = round(n / sum(n) * 100, 1))

# Horizontal bars, one panel per variable, levels ordered by count
ggplot(
  data = df_long_cat,
  mapping = aes(x = reorder(value, n), y = n, fill = variable)
) +
  geom_col(alpha = 0.9, show.legend = FALSE) +
  geom_text(
    mapping = aes(label = paste0(pct, "%")),
    color = COL_NAVY,
    size = 3.2,
    hjust = -0.15
  ) +
  coord_flip() +
  scale_fill_manual(
    values = c(
      "education" = COL_APPROVED,
      "self_employed" = COL_REJECTED,
      "cibil_band" = "#2E78D2",
      "income_bracket" = COL_NAVY
    )
  ) +
  # 20% extra space past the longest bar for the percentage labels
  scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +
  facet_wrap(vars(variable), ncol = 2, scales = "free") +
  labs(
    x = NULL,
    y = "Count",
    title = "Distribution of Categorical Variables",
    subtitle = "Education and self-employment are near-evenly split"
  ) +
  theme_datathon()


3. Correlation Matrix

Examining linear relationships between all numeric features — and importantly, which variables correlate most with each other (multicollinearity check for modeling).

# Pairwise correlations between the numeric features, computed on the rows
# that are complete across all of them and rounded to 2 decimals for the
# in-cell labels.
cor_data <- df %>%
  select(all_of(numeric_cols)) %>%
  cor(use = "complete.obs") %>%
  round(2)

# ggcorrplot's `colors` are ordered low / mid / high, so negative correlations
# are drawn in amber (COL_REJECTED), zero in white and positive correlations in
# blue (COL_APPROVED). The subtitle must describe that mapping; the previous
# "Warm = positive | Cool = negative" wording had it inverted.
ggcorrplot(
  cor_data,
  method    = "square",
  type      = "lower",
  lab       = TRUE,
  lab_size  = 3.5,
  colors    = c(COL_REJECTED, "white", COL_APPROVED),
  title     = "Correlation Matrix — Numeric Features",
  ggtheme   = theme_datathon(),
  outline.color = "white",
  tl.cex    = 10
) +
  labs(subtitle = "Blue = positive correlation | Amber = negative | White = near zero")

# Correlation of every numeric feature with a 0/1 approval indicator
# (1 = Approved), strongest absolute correlation first.
cor_with_target <- df %>%
  mutate(approved = as.numeric(loan_status == "Approved")) %>%
  select(all_of(numeric_cols), approved) %>%
  cor(use = "complete.obs") %>%
  as_tibble(rownames = "variable") %>%
  # Drop the indicator's correlation with itself, keep only its column
  filter(variable != "approved") %>%
  select(variable, correlation_with_approval = approved) %>%
  arrange(desc(abs(correlation_with_approval))) %>%
  mutate(
    # The sign is taken from the unrounded value, then the value is rounded
    direction = if_else(correlation_with_approval > 0, "Positive", "Negative"),
    correlation_with_approval = round(correlation_with_approval, 3)
  )

# Table of the ranking; the first (strongest) row is highlighted
kable_styling(
  kable(
    cor_with_target,
    col.names = c("Feature", "Correlation with Approval", "Direction")
  ),
  bootstrap_options = c("striped", "hover"),
  full_width = FALSE
) %>%
  row_spec(1, bold = TRUE, background = "#C8DEFF")
Feature Correlation with Approval Direction
cibil_score 0.771 Positive
loan_term -0.113 Negative
debt_to_income 0.088 Positive
loan_to_asset 0.030 Positive
no_of_dependents -0.018 Negative
loan_amount 0.016 Positive
income_annum -0.015 Negative
total_assets -0.011 Negative

Takeaway: CIBIL score (r = 0.771) is by far the strongest correlate of approval and is the headline finding. Every other numeric feature has an absolute correlation below 0.12; loan term is the next strongest at -0.113.


4. Bivariate Analysis — Features vs Loan Status

4a. CIBIL Score by Loan Status (key predictor)

# Violin plot of CIBIL score per loan status, with a narrow white box plot
# (outliers hidden) drawn on top, converted to an interactive plotly widget.
p_cibil <- ggplot(
  data = df,
  mapping = aes(loan_status, cibil_score, fill = loan_status)
) +
  geom_violin(color = NA, alpha = 0.6) +
  geom_boxplot(
    width = 0.15,
    color = COL_NAVY,
    fill = "white",
    outlier.shape = NA
  ) +
  scale_fill_manual(
    values = c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)
  ) +
  labs(
    x = NULL,
    y = "CIBIL Score",
    fill = NULL,
    title = "CIBIL Score Distribution by Loan Status",
    subtitle = "Higher CIBIL scores are strongly associated with approval"
  ) +
  theme_datathon()

ggplotly(p = p_cibil, tooltip = c("y", "fill"))

4b. Income by Loan Status

# Overlaid density curves of annual income, one per loan status, with
# comma-formatted axis labels, converted to an interactive plotly widget.
p_income <- ggplot(
  data = df,
  mapping = aes(income_annum, fill = loan_status)
) +
  geom_density(color = NA, alpha = 0.55) +
  scale_x_continuous(labels = label_comma()) +
  scale_fill_manual(
    values = c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)
  ) +
  labs(
    x = "Annual Income",
    y = "Density",
    fill = NULL,
    title = "Annual Income Distribution by Loan Status",
    subtitle = "Density overlap between groups"
  ) +
  theme_datathon()

ggplotly(p = p_income, tooltip = c("x", "fill"))

4c. Approval Rate by CIBIL Band

# Approval rate (percent, 1 decimal) within each CIBIL band, drawn as labelled
# bars and converted to an interactive plotly widget.
p_band <- ggplot(
  data = summarise(
    group_by(df, cibil_band),
    total = n(),
    approved = sum(loan_status == "Approved"),
    rate = round(approved / total * 100, 1)
  ),
  mapping = aes(cibil_band, rate, fill = cibil_band)
) +
  geom_col(alpha = 0.9, show.legend = FALSE) +
  geom_text(
    mapping = aes(label = paste0(rate, "%")),
    color = COL_NAVY,
    fontface = "bold",
    size = 4,
    vjust = -0.5
  ) +
  # Fill keyed by band label, amber for the lowest band through navy for the
  # highest
  scale_fill_manual(
    values = c(
      "Poor (300-500)" = "#F4A62A",
      "Fair (500-600)" = "#F4C06A",
      "Good (600-700)" = "#C8DEFF",
      "Very Good (700-800)" = "#2E78D2",
      "Excellent (800-900)" = "#0D1B3E"
    )
  ) +
  # Fixed 0-110 axis leaves room above a 100% bar for its label
  scale_y_continuous(limits = c(0, 110), expand = expansion(mult = c(0, 0))) +
  labs(
    x = "CIBIL Band",
    y = "Approval Rate (%)",
    title = "Approval Rate by CIBIL Score Band",
    subtitle = "Clear stepwise relationship between credit score and approval"
  ) +
  theme_datathon()

ggplotly(p = p_band, tooltip = c("x", "y"))

4d. Approval Rate by Education & Employment

# Approval rate within each level of education and self_employed.
# The two columns are stacked, outcomes are counted per level, turned into
# within-level percentages, and only the "Approved" share is plotted.
df %>%
  pivot_longer(
    cols = c(education, self_employed),
    names_to = "variable",
    values_to = "group"
  ) %>%
  count(variable, group, loan_status) %>%
  group_by(variable, group) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  filter(loan_status == "Approved") %>%
  ggplot(mapping = aes(group, pct, fill = variable)) +
  geom_col(alpha = 0.9, width = 0.5, show.legend = FALSE) +
  geom_text(
    mapping = aes(label = paste0(pct, "%")),
    color = COL_NAVY,
    fontface = "bold",
    size = 4.5,
    vjust = -0.5
  ) +
  scale_fill_manual(
    values = c("education" = COL_APPROVED, "self_employed" = COL_REJECTED)
  ) +
  scale_y_continuous(limits = c(0, 90), expand = expansion(mult = c(0, 0))) +
  facet_wrap(vars(variable), scales = "free_x") +
  labs(
    x = NULL,
    y = "Approval Rate (%)",
    title = "Approval Rate by Education & Employment Status",
    subtitle = "Are graduates or self-employed applicants approved at different rates?"
  ) +
  theme_datathon()

4e. Debt-to-Income Ratio by Loan Status

# Violin + box plot of the debt-to-income ratio per loan status, converted to
# an interactive plotly widget.
p_dti <- df %>%
  # Trim the extreme upper tail for display only. na.rm = TRUE keeps
  # quantile() from erroring if the engineered ratio contains NA/NaN; filter()
  # then drops those rows because the comparison evaluates to NA.
  filter(debt_to_income < quantile(debt_to_income, 0.99, na.rm = TRUE)) %>%
  ggplot(aes(x = loan_status, y = debt_to_income, fill = loan_status)) +
  geom_violin(alpha = 0.6, color = NA) +
  # Narrow white box plot on top of the violin; outliers are hidden
  geom_boxplot(width = 0.15, fill = "white", outlier.shape = NA, color = COL_NAVY) +
  scale_fill_manual(values = c("Approved" = COL_APPROVED, "Rejected" = COL_REJECTED)) +
  labs(
    title    = "Debt-to-Income Ratio by Loan Status",
    subtitle = "Higher DTI may signal elevated risk",
    x        = NULL,
    y        = "Debt-to-Income Ratio",
    fill     = NULL
  ) +
  theme_datathon()

ggplotly(p_dti, tooltip = c("y", "fill"))

5. Key EDA Takeaways

# Summary table of the EDA findings and what each implies for modeling.
# Finding 3 is worded to match the correlation table computed earlier in this
# report: income_annum (-0.015) and total_assets (-0.011) are essentially
# uncorrelated with approval, not "moderately positive".
tibble(
  `#` = 1:5,
  Finding = c(
    "CIBIL score is the strongest individual predictor of loan approval",
    "Approval rate increases stepwise across CIBIL bands — near 0% at Poor, near 100% at Excellent",
    "Income and asset values show near-zero linear correlation with approval (|r| < 0.02)",
    "Education and self-employment status show minimal difference in approval rates",
    "Debt-to-income and loan-to-asset ratios are worth including as engineered features in modeling"
  ),
  `Implication` = c(
    "Use as primary feature in logistic regression & tree models",
    "Strong candidate for a presentation visual — clear, intuitive story",
    "Include in model but expect lower importance than CIBIL",
    "May not be significant predictors — let the model confirm",
    "Feature engineering added real signal — validate with feature importance"
  )
) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  # Highlight the two CIBIL findings using the palette's light blue
  row_spec(1:2, background = COL_LIGHT)
# Finding Implication
1 CIBIL score is the strongest individual predictor of loan approval Use as primary feature in logistic regression & tree models
2 Approval rate increases stepwise across CIBIL bands — near 0% at Poor, near 100% at Excellent Strong candidate for a presentation visual — clear, intuitive story
3 Income and asset values show near-zero linear correlation with approval (|r| < 0.02) Include in model but expect lower importance than CIBIL
4 Education and self-employment status show minimal difference in approval rates May not be significant predictors — let the model confirm
5 Debt-to-income and loan-to-asset ratios are worth including as engineered features in modeling Feature engineering added real signal — validate with feature importance

EDA complete. Hand off findings to 03_modeling.Rmd.