Overview

This report documents the full cleaning pipeline applied to the Loan Approval Dataset prior to exploratory analysis and modeling. All steps are reproducible — simply re-knit with a fresh dataset.

Cleaning workflow:

Load & inspect raw data
Fix column names & drop unused columns
Enforce correct data types
Audit & handle missing values
Remove duplicate rows
Detect & cap outliers
Engineer new features
Save clean output

Setup

library(tidyverse)
library(janitor)
library(skimr)
library(kableExtra)

1. Load Data

# !! Update path to match where your dataset lives !!
raw <- read_csv("loan_approval_dataset.csv")

cat("Rows:", nrow(raw), "| Columns:", ncol(raw))

## Rows: 4269 | Columns: 13

glimpse(raw)

## Rows: 4,269
## Columns: 13
## $ loan_id                  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ no_of_dependents         <dbl> 2, 0, 3, 3, 5, 0, 5, 2, 0, 5, 4, 2, 3, 2, 1, …
## $ education                <chr> "Graduate", "Not Graduate", "Graduate", "Grad…
## $ self_employed            <chr> "No", "Yes", "No", "No", "Yes", "Yes", "No", …
## $ income_annum             <dbl> 9600000, 4100000, 9100000, 8200000, 9800000, …
## $ loan_amount              <dbl> 29900000, 12200000, 29700000, 30700000, 24200…
## $ loan_term                <dbl> 12, 8, 20, 8, 20, 10, 4, 20, 20, 10, 2, 18, 1…
## $ cibil_score              <dbl> 778, 417, 506, 467, 382, 319, 678, 382, 782, …
## $ residential_assets_value <dbl> 2400000, 2700000, 7100000, 18200000, 12400000…
## $ commercial_assets_value  <dbl> 17600000, 2200000, 4500000, 3300000, 8200000,…
## $ luxury_assets_value      <dbl> 22700000, 8800000, 33300000, 23300000, 294000…
## $ bank_asset_value         <dbl> 8000000, 3300000, 12800000, 7900000, 5000000,…
## $ loan_status              <chr> "Approved", "Rejected", "Rejected", "Rejected…

Note: Column names have leading whitespace in the raw file — fixed in Section 2.

2. Fix Column Names & Drop Unused Columns

clean_names() strips whitespace, lowercases everything, and enforces snake_case. loan_id is dropped — it carries no analytical value.

df <- raw %>%
  clean_names() %>%
  select(-loan_id)

names(df)

##  [1] "no_of_dependents"         "education"               
##  [3] "self_employed"            "income_annum"            
##  [5] "loan_amount"              "loan_term"               
##  [7] "cibil_score"              "residential_assets_value"
##  [9] "commercial_assets_value"  "luxury_assets_value"     
## [11] "bank_asset_value"         "loan_status"

3. Data Types

Categorical columns are converted to factors. loan_status is set with Rejected as the reference level, which is the correct orientation for logistic regression.

df <- df %>%
  mutate(
    # Categorical → Factor
    education     = factor(trimws(education)),
    self_employed = factor(trimws(self_employed)),
    loan_status   = factor(trimws(loan_status),
                           levels = c("Rejected", "Approved")),

    # Numeric safety checks
    no_of_dependents         = as.integer(no_of_dependents),
    income_annum             = as.numeric(income_annum),
    loan_amount              = as.numeric(loan_amount),
    loan_term                = as.integer(loan_term),
    cibil_score              = as.numeric(cibil_score),
    residential_assets_value = as.numeric(residential_assets_value),
    commercial_assets_value  = as.numeric(commercial_assets_value),
    luxury_assets_value      = as.numeric(luxury_assets_value),
    bank_asset_value         = as.numeric(bank_asset_value)
  )

sapply(df, class) %>%
  enframe(name = "column", value = "type") %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

column	type
no_of_dependents	integer
education	factor
self_employed	factor
income_annum	numeric
loan_amount	numeric
loan_term	integer
cibil_score	numeric
residential_assets_value	numeric
commercial_assets_value	numeric
luxury_assets_value	numeric
bank_asset_value	numeric
loan_status	factor

4. Missing Values

missing_summary <- df %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  mutate(pct_missing = round(n_missing / nrow(df) * 100, 2))

missing_summary %>%
  kable(col.names = c("Column", "Missing (n)", "Missing (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(missing_summary$n_missing > 0), background = "#fff3cd")

Column	Missing (n)	Missing (%)
no_of_dependents	0	0
education	0	0
self_employed	0	0
income_annum	0	0
loan_amount	0	0
loan_term	0	0
cibil_score	0	0
residential_assets_value	0	0
commercial_assets_value	0	0
luxury_assets_value	0	0
bank_asset_value	0	0
loan_status	0	0

has_missing <- missing_summary %>% filter(n_missing > 0)

if (nrow(has_missing) == 0) {
  cat("✓ No missing values found. No imputation required.")
} else {
  DROP_THRESHOLD <- 40

  cols_to_drop <- has_missing %>%
    filter(pct_missing > DROP_THRESHOLD) %>%
    pull(column)

  if (length(cols_to_drop) > 0) {
    cat("Dropping columns with >", DROP_THRESHOLD, "% missing:", cols_to_drop)
    df <- df %>% select(-all_of(cols_to_drop))
  }

  # Impute numeric with median
  df <- df %>%
    mutate(across(where(is.numeric),
                  ~ if_else(is.na(.), median(., na.rm = TRUE), .)))

  # Impute factor with mode
  impute_mode <- function(x) {
    mode_val <- names(sort(table(x), decreasing = TRUE))[1]
    fct_na_value_to_level(x, mode_val)
  }
  df <- df %>%
    mutate(across(where(is.factor),
                  ~ if (any(is.na(.))) impute_mode(.) else .))

  cat("✓ Imputation complete — numeric columns: median | factor columns: mode")
}

## ✓ No missing values found. No imputation required.

5. Duplicate Rows

n_before <- nrow(df)
df       <- df %>% distinct()
n_after  <- nrow(df)

tibble(
  `Rows Before` = n_before,
  `Rows After`  = n_after,
  `Removed`     = n_before - n_after
) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Rows Before	Rows After	Removed
4269	4269	0

6. Outlier Detection & Capping

Strategy: IQR-based Winsorization on all financial columns. Values beyond [Q1 − 1.5×IQR, Q3 + 1.5×IQR] are capped rather than removed — preserving sample size while limiting the influence of extreme values.

CIBIL score is hard-clipped to its defined range of 300–900.

cap_outliers_iqr <- function(x) {
  q1    <- quantile(x, 0.25, na.rm = TRUE)
  q3    <- quantile(x, 0.75, na.rm = TRUE)
  iqr   <- q3 - q1
  lower <- q1 - 1.5 * iqr
  upper <- q3 + 1.5 * iqr
  pmin(pmax(x, lower), upper)
}

financial_cols <- c(
  "income_annum", "loan_amount",
  "residential_assets_value", "commercial_assets_value",
  "luxury_assets_value", "bank_asset_value"
)

outlier_log <- map_dfr(financial_cols, function(col) {
  if (!col %in% names(df)) return(NULL)
  n_out <- sum(
    df[[col]] < quantile(df[[col]], 0.25) - 1.5 * IQR(df[[col]]) |
    df[[col]] > quantile(df[[col]], 0.75) + 1.5 * IQR(df[[col]]),
    na.rm = TRUE
  )
  df[[col]] <<- cap_outliers_iqr(df[[col]])
  tibble(column = col, outliers_capped = n_out)
})

# Hard-clip CIBIL score
df <- df %>%
  mutate(cibil_score = pmin(pmax(cibil_score, 300), 900))

outlier_log %>%
  kable(col.names = c("Column", "Outliers Capped")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Column	Outliers Capped
income_annum	0
loan_amount	0
residential_assets_value	52
commercial_assets_value	37
luxury_assets_value	0
bank_asset_value	5

7. Feature Engineering

New features derived from the raw columns — these will be key variables in both modeling and visualization.

df <- df %>%
  mutate(
    # Classic lending risk metric
    debt_to_income = round(loan_amount / income_annum, 4),

    # Sum of all asset columns
    total_assets = residential_assets_value + commercial_assets_value +
                   luxury_assets_value + bank_asset_value,

    # How much of total assets the loan represents
    loan_to_asset = round(loan_amount / (total_assets + 1), 4),

    # Bucketed CIBIL score — useful for bar charts & group comparisons
    cibil_band = cut(cibil_score,
                     breaks = c(300, 500, 600, 700, 800, 900),
                     labels = c("Poor (300-500)", "Fair (500-600)",
                                "Good (600-700)", "Very Good (700-800)",
                                "Excellent (800-900)"),
                     include.lowest = TRUE),

    # Income tercile bracket
    income_bracket = cut(income_annum,
                         breaks = quantile(income_annum,
                                           probs = c(0, 0.33, 0.66, 1),
                                           na.rm = TRUE),
                         labels = c("Low", "Mid", "High"),
                         include.lowest = TRUE)
  )

Feature	Description
`debt_to_income`	`loan_amount / income_annum` — core risk signal
`total_assets`	Sum of all 4 asset value columns
`loan_to_asset`	`loan_amount / total_assets` — leverage ratio
`cibil_band`	CIBIL score bucketed into 5 credit tiers
`income_bracket`	Income split into Low / Mid / High terciles

8. Final Validation

Dimensions

cat("Final dataset:", nrow(df), "rows ×", ncol(df), "columns")

## Final dataset: 4269 rows × 17 columns

Target Variable Distribution

df %>%
  count(loan_status) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  kable(col.names = c("Loan Status", "Count", "%")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Loan Status	Count	%
Rejected	1613	37.8
Approved	2656	62.2

Factor Levels

df %>%
  select(where(is.factor)) %>%
  sapply(levels) %>%
  enframe(name = "column", value = "levels") %>%
  mutate(levels = map_chr(levels, paste, collapse = " | ")) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

column	levels
education	Graduate \| Not Graduate
self_employed	No \| Yes
loan_status	Rejected \| Approved
cibil_band	Poor (300-500) \| Fair (500-600) \| Good (600-700) \| Very Good (700-800) \| Excellent (800-900)
income_bracket	Low \| Mid \| High

Numeric Summary

df %>%
  select(where(is.numeric)) %>%
  skim() %>%
  as_tibble() %>%
  select(skim_variable, numeric.mean, numeric.sd, numeric.p0, numeric.p50, numeric.p100) %>%
  rename(variable = skim_variable, mean = numeric.mean, sd = numeric.sd,
         min = numeric.p0, median = numeric.p50, max = numeric.p100) %>%
  mutate(across(where(is.numeric), ~ round(., 2))) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

variable	mean	sd	min	median	max
no_of_dependents	2.50	1.70	0.0e+00	3.00e+00	5.000e+00
income_annum	5059123.92	2806839.83	2.0e+05	5.10e+06	9.900e+06
loan_amount	15133450.46	9043362.98	3.0e+05	1.45e+07	3.950e+07
loan_term	10.90	5.71	2.0e+00	1.00e+01	2.000e+01
cibil_score	599.94	172.43	3.0e+02	6.00e+02	9.000e+02
residential_assets_value	7454860.62	6452441.57	-1.0e+05	5.60e+06	2.495e+07
commercial_assets_value	4964289.06	4363080.45	0.0e+00	3.70e+06	1.705e+07
luxury_assets_value	15126305.93	9103753.67	3.0e+05	1.46e+07	3.920e+07
bank_asset_value	4976341.06	3249158.58	0.0e+00	4.60e+06	1.430e+07
debt_to_income	2.98	0.60	1.5e+00	3.00e+00	4.000e+00
total_assets	32521796.67	19454084.10	4.0e+05	3.15e+07	8.655e+07
loan_to_asset	0.49	0.15	1.8e-01	4.70e-01	1.330e+00

9. Save Clean Data

save(df, file = "loan_clean.RData")
write_csv(df, "loan_clean.csv")

cat("✓ Saved: loan_clean.RData\n")

## ✓ Saved: loan_clean.RData

cat("✓ Saved: loan_clean.csv\n")

## ✓ Saved: loan_clean.csv

cat("\nLoad in next script with: load('loan_clean.RData')")

## 
## Load in next script with: load('loan_clean.RData')

Cleaning complete. Hand off df to 02_eda.Rmd or 03_modeling.Rmd.

Data Cleaning Report

Datathon Practice 2026 | Loan Approval Dataset

Madison G

2026-03-08