Overview

This report documents the full cleaning pipeline applied to the Loan Approval Dataset prior to exploratory analysis and modeling. All steps are reproducible — simply re-knit with a fresh dataset.

Cleaning workflow:

  1. Load & inspect raw data
  2. Fix column names & drop unused columns
  3. Enforce correct data types
  4. Audit & handle missing values
  5. Remove duplicate rows
  6. Detect & cap outliers
  7. Engineer new features
  8. Save clean output

Setup

library(tidyverse)
library(janitor)
library(skimr)
library(kableExtra)

1. Load Data

# !! Update path to match where your dataset lives !!
raw <- read_csv("loan_approval_dataset.csv")
cat("Rows:", nrow(raw), "| Columns:", ncol(raw))
## Rows: 4269 | Columns: 13
glimpse(raw)
## Rows: 4,269
## Columns: 13
## $ loan_id                  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ no_of_dependents         <dbl> 2, 0, 3, 3, 5, 0, 5, 2, 0, 5, 4, 2, 3, 2, 1, …
## $ education                <chr> "Graduate", "Not Graduate", "Graduate", "Grad…
## $ self_employed            <chr> "No", "Yes", "No", "No", "Yes", "Yes", "No", …
## $ income_annum             <dbl> 9600000, 4100000, 9100000, 8200000, 9800000, …
## $ loan_amount              <dbl> 29900000, 12200000, 29700000, 30700000, 24200…
## $ loan_term                <dbl> 12, 8, 20, 8, 20, 10, 4, 20, 20, 10, 2, 18, 1…
## $ cibil_score              <dbl> 778, 417, 506, 467, 382, 319, 678, 382, 782, …
## $ residential_assets_value <dbl> 2400000, 2700000, 7100000, 18200000, 12400000…
## $ commercial_assets_value  <dbl> 17600000, 2200000, 4500000, 3300000, 8200000,…
## $ luxury_assets_value      <dbl> 22700000, 8800000, 33300000, 23300000, 294000…
## $ bank_asset_value         <dbl> 8000000, 3300000, 12800000, 7900000, 5000000,…
## $ loan_status              <chr> "Approved", "Rejected", "Rejected", "Rejected…

Note: Column names have leading whitespace in the raw file — fixed in Section 2.


2. Fix Column Names & Drop Unused Columns

clean_names() strips whitespace, lowercases everything, and enforces snake_case. loan_id is dropped — it carries no analytical value.

df <- raw %>%
  clean_names() %>%
  select(-loan_id)

names(df)
##  [1] "no_of_dependents"         "education"               
##  [3] "self_employed"            "income_annum"            
##  [5] "loan_amount"              "loan_term"               
##  [7] "cibil_score"              "residential_assets_value"
##  [9] "commercial_assets_value"  "luxury_assets_value"     
## [11] "bank_asset_value"         "loan_status"

3. Data Types

Categorical columns are converted to factors. loan_status is set with Rejected as the reference level, which is the correct orientation for logistic regression.

df <- df %>%
  mutate(
    # Categorical → Factor
    education     = factor(trimws(education)),
    self_employed = factor(trimws(self_employed)),
    loan_status   = factor(trimws(loan_status),
                           levels = c("Rejected", "Approved")),

    # Numeric safety checks
    no_of_dependents         = as.integer(no_of_dependents),
    income_annum             = as.numeric(income_annum),
    loan_amount              = as.numeric(loan_amount),
    loan_term                = as.integer(loan_term),
    cibil_score              = as.numeric(cibil_score),
    residential_assets_value = as.numeric(residential_assets_value),
    commercial_assets_value  = as.numeric(commercial_assets_value),
    luxury_assets_value      = as.numeric(luxury_assets_value),
    bank_asset_value         = as.numeric(bank_asset_value)
  )
sapply(df, class) %>%
  enframe(name = "column", value = "type") %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
column type
no_of_dependents integer
education factor
self_employed factor
income_annum numeric
loan_amount numeric
loan_term integer
cibil_score numeric
residential_assets_value numeric
commercial_assets_value numeric
luxury_assets_value numeric
bank_asset_value numeric
loan_status factor

4. Missing Values

missing_summary <- df %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  mutate(pct_missing = round(n_missing / nrow(df) * 100, 2))

missing_summary %>%
  kable(col.names = c("Column", "Missing (n)", "Missing (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(missing_summary$n_missing > 0), background = "#fff3cd")
Column Missing (n) Missing (%)
no_of_dependents 0 0
education 0 0
self_employed 0 0
income_annum 0 0
loan_amount 0 0
loan_term 0 0
cibil_score 0 0
residential_assets_value 0 0
commercial_assets_value 0 0
luxury_assets_value 0 0
bank_asset_value 0 0
loan_status 0 0
has_missing <- missing_summary %>% filter(n_missing > 0)

if (nrow(has_missing) == 0) {
  cat("✓ No missing values found. No imputation required.")
} else {
  DROP_THRESHOLD <- 40

  cols_to_drop <- has_missing %>%
    filter(pct_missing > DROP_THRESHOLD) %>%
    pull(column)

  if (length(cols_to_drop) > 0) {
    cat("Dropping columns with >", DROP_THRESHOLD, "% missing:", cols_to_drop)
    df <- df %>% select(-all_of(cols_to_drop))
  }

  # Impute numeric with median
  df <- df %>%
    mutate(across(where(is.numeric),
                  ~ if_else(is.na(.), median(., na.rm = TRUE), .)))

  # Impute factor with mode
  impute_mode <- function(x) {
    mode_val <- names(sort(table(x), decreasing = TRUE))[1]
    fct_na_value_to_level(x, mode_val)
  }
  df <- df %>%
    mutate(across(where(is.factor),
                  ~ if (any(is.na(.))) impute_mode(.) else .))

  cat("✓ Imputation complete — numeric columns: median | factor columns: mode")
}
## ✓ No missing values found. No imputation required.

5. Duplicate Rows

n_before <- nrow(df)
df       <- df %>% distinct()
n_after  <- nrow(df)

tibble(
  `Rows Before` = n_before,
  `Rows After`  = n_after,
  `Removed`     = n_before - n_after
) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Rows Before Rows After Removed
4269 4269 0

6. Outlier Detection & Capping

Strategy: IQR-based Winsorization on all financial columns. Values beyond [Q1 − 1.5×IQR, Q3 + 1.5×IQR] are capped rather than removed — preserving sample size while limiting the influence of extreme values.

CIBIL score is hard-clipped to its defined range of 300–900.

cap_outliers_iqr <- function(x) {
  q1    <- quantile(x, 0.25, na.rm = TRUE)
  q3    <- quantile(x, 0.75, na.rm = TRUE)
  iqr   <- q3 - q1
  lower <- q1 - 1.5 * iqr
  upper <- q3 + 1.5 * iqr
  pmin(pmax(x, lower), upper)
}

financial_cols <- c(
  "income_annum", "loan_amount",
  "residential_assets_value", "commercial_assets_value",
  "luxury_assets_value", "bank_asset_value"
)

outlier_log <- map_dfr(financial_cols, function(col) {
  if (!col %in% names(df)) return(NULL)
  n_out <- sum(
    df[[col]] < quantile(df[[col]], 0.25) - 1.5 * IQR(df[[col]]) |
    df[[col]] > quantile(df[[col]], 0.75) + 1.5 * IQR(df[[col]]),
    na.rm = TRUE
  )
  df[[col]] <<- cap_outliers_iqr(df[[col]])
  tibble(column = col, outliers_capped = n_out)
})

# Hard-clip CIBIL score
df <- df %>%
  mutate(cibil_score = pmin(pmax(cibil_score, 300), 900))

outlier_log %>%
  kable(col.names = c("Column", "Outliers Capped")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Column Outliers Capped
income_annum 0
loan_amount 0
residential_assets_value 52
commercial_assets_value 37
luxury_assets_value 0
bank_asset_value 5

7. Feature Engineering

New features derived from the raw columns — these will be key variables in both modeling and visualization.

df <- df %>%
  mutate(
    # Classic lending risk metric
    debt_to_income = round(loan_amount / income_annum, 4),

    # Sum of all asset columns
    total_assets = residential_assets_value + commercial_assets_value +
                   luxury_assets_value + bank_asset_value,

    # How much of total assets the loan represents
    loan_to_asset = round(loan_amount / (total_assets + 1), 4),

    # Bucketed CIBIL score — useful for bar charts & group comparisons
    cibil_band = cut(cibil_score,
                     breaks = c(300, 500, 600, 700, 800, 900),
                     labels = c("Poor (300-500)", "Fair (500-600)",
                                "Good (600-700)", "Very Good (700-800)",
                                "Excellent (800-900)"),
                     include.lowest = TRUE),

    # Income tercile bracket
    income_bracket = cut(income_annum,
                         breaks = quantile(income_annum,
                                           probs = c(0, 0.33, 0.66, 1),
                                           na.rm = TRUE),
                         labels = c("Low", "Mid", "High"),
                         include.lowest = TRUE)
  )
Feature Description
debt_to_income loan_amount / income_annum — core risk signal
total_assets Sum of all 4 asset value columns
loan_to_asset loan_amount / total_assets — leverage ratio
cibil_band CIBIL score bucketed into 5 credit tiers
income_bracket Income split into Low / Mid / High terciles

8. Final Validation

Dimensions

cat("Final dataset:", nrow(df), "rows ×", ncol(df), "columns")
## Final dataset: 4269 rows × 17 columns

Target Variable Distribution

df %>%
  count(loan_status) %>%
  mutate(pct = round(n / sum(n) * 100, 1)) %>%
  kable(col.names = c("Loan Status", "Count", "%")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Loan Status Count %
Rejected 1613 37.8
Approved 2656 62.2

Factor Levels

df %>%
  select(where(is.factor)) %>%
  sapply(levels) %>%
  enframe(name = "column", value = "levels") %>%
  mutate(levels = map_chr(levels, paste, collapse = " | ")) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
column levels
education Graduate &#124; Not Graduate
self_employed No &#124; Yes
loan_status Rejected &#124; Approved
cibil_band Poor (300-500) &#124; Fair (500-600) &#124; Good (600-700) &#124; Very Good (700-800) &#124; Excellent (800-900)
income_bracket Low &#124; Mid &#124; High

Numeric Summary

df %>%
  select(where(is.numeric)) %>%
  skim() %>%
  as_tibble() %>%
  select(skim_variable, numeric.mean, numeric.sd, numeric.p0, numeric.p50, numeric.p100) %>%
  rename(variable = skim_variable, mean = numeric.mean, sd = numeric.sd,
         min = numeric.p0, median = numeric.p50, max = numeric.p100) %>%
  mutate(across(where(is.numeric), ~ round(., 2))) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
variable mean sd min median max
no_of_dependents 2.50 1.70 0.0e+00 3.00e+00 5.000e+00
income_annum 5059123.92 2806839.83 2.0e+05 5.10e+06 9.900e+06
loan_amount 15133450.46 9043362.98 3.0e+05 1.45e+07 3.950e+07
loan_term 10.90 5.71 2.0e+00 1.00e+01 2.000e+01
cibil_score 599.94 172.43 3.0e+02 6.00e+02 9.000e+02
residential_assets_value 7454860.62 6452441.57 -1.0e+05 5.60e+06 2.495e+07
commercial_assets_value 4964289.06 4363080.45 0.0e+00 3.70e+06 1.705e+07
luxury_assets_value 15126305.93 9103753.67 3.0e+05 1.46e+07 3.920e+07
bank_asset_value 4976341.06 3249158.58 0.0e+00 4.60e+06 1.430e+07
debt_to_income 2.98 0.60 1.5e+00 3.00e+00 4.000e+00
total_assets 32521796.67 19454084.10 4.0e+05 3.15e+07 8.655e+07
loan_to_asset 0.49 0.15 1.8e-01 4.70e-01 1.330e+00

9. Save Clean Data

save(df, file = "loan_clean.RData")
write_csv(df, "loan_clean.csv")

cat("✓ Saved: loan_clean.RData\n")
## ✓ Saved: loan_clean.RData
cat("✓ Saved: loan_clean.csv\n")
## ✓ Saved: loan_clean.csv
cat("\nLoad in next script with: load('loan_clean.RData')")
## 
## Load in next script with: load('loan_clean.RData')

Cleaning complete. Hand off df to 02_eda.Rmd or 03_modeling.Rmd.