This report documents the full cleaning pipeline applied to the Loan Approval Dataset prior to exploratory analysis and modeling. All steps are reproducible — simply re-knit with a fresh dataset.
Cleaning workflow:
## Rows: 4269 | Columns: 13
## Rows: 4,269
## Columns: 13
## $ loan_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ no_of_dependents <dbl> 2, 0, 3, 3, 5, 0, 5, 2, 0, 5, 4, 2, 3, 2, 1, …
## $ education <chr> "Graduate", "Not Graduate", "Graduate", "Grad…
## $ self_employed <chr> "No", "Yes", "No", "No", "Yes", "Yes", "No", …
## $ income_annum <dbl> 9600000, 4100000, 9100000, 8200000, 9800000, …
## $ loan_amount <dbl> 29900000, 12200000, 29700000, 30700000, 24200…
## $ loan_term <dbl> 12, 8, 20, 8, 20, 10, 4, 20, 20, 10, 2, 18, 1…
## $ cibil_score <dbl> 778, 417, 506, 467, 382, 319, 678, 382, 782, …
## $ residential_assets_value <dbl> 2400000, 2700000, 7100000, 18200000, 12400000…
## $ commercial_assets_value <dbl> 17600000, 2200000, 4500000, 3300000, 8200000,…
## $ luxury_assets_value <dbl> 22700000, 8800000, 33300000, 23300000, 294000…
## $ bank_asset_value <dbl> 8000000, 3300000, 12800000, 7900000, 5000000,…
## $ loan_status <chr> "Approved", "Rejected", "Rejected", "Rejected…
Note: Column names have leading whitespace in the raw file — fixed in Section 2.
clean_names() strips whitespace, lowercases everything,
and enforces snake_case. loan_id is dropped — it carries no
analytical value.
## [1] "no_of_dependents" "education"
## [3] "self_employed" "income_annum"
## [5] "loan_amount" "loan_term"
## [7] "cibil_score" "residential_assets_value"
## [9] "commercial_assets_value" "luxury_assets_value"
## [11] "bank_asset_value" "loan_status"
Categorical columns are converted to factors.
loan_status is set with Rejected as the
reference level, which is the correct orientation for logistic
regression.
df <- df %>%
mutate(
# Categorical → Factor
education = factor(trimws(education)),
self_employed = factor(trimws(self_employed)),
loan_status = factor(trimws(loan_status),
levels = c("Rejected", "Approved")),
# Numeric safety checks
no_of_dependents = as.integer(no_of_dependents),
income_annum = as.numeric(income_annum),
loan_amount = as.numeric(loan_amount),
loan_term = as.integer(loan_term),
cibil_score = as.numeric(cibil_score),
residential_assets_value = as.numeric(residential_assets_value),
commercial_assets_value = as.numeric(commercial_assets_value),
luxury_assets_value = as.numeric(luxury_assets_value),
bank_asset_value = as.numeric(bank_asset_value)
)sapply(df, class) %>%
enframe(name = "column", value = "type") %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| column | type |
|---|---|
| no_of_dependents | integer |
| education | factor |
| self_employed | factor |
| income_annum | numeric |
| loan_amount | numeric |
| loan_term | integer |
| cibil_score | numeric |
| residential_assets_value | numeric |
| commercial_assets_value | numeric |
| luxury_assets_value | numeric |
| bank_asset_value | numeric |
| loan_status | factor |
missing_summary <- df %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
mutate(pct_missing = round(n_missing / nrow(df) * 100, 2))
missing_summary %>%
kable(col.names = c("Column", "Missing (n)", "Missing (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which(missing_summary$n_missing > 0), background = "#fff3cd")| Column | Missing (n) | Missing (%) |
|---|---|---|
| no_of_dependents | 0 | 0 |
| education | 0 | 0 |
| self_employed | 0 | 0 |
| income_annum | 0 | 0 |
| loan_amount | 0 | 0 |
| loan_term | 0 | 0 |
| cibil_score | 0 | 0 |
| residential_assets_value | 0 | 0 |
| commercial_assets_value | 0 | 0 |
| luxury_assets_value | 0 | 0 |
| bank_asset_value | 0 | 0 |
| loan_status | 0 | 0 |
has_missing <- missing_summary %>% filter(n_missing > 0)
if (nrow(has_missing) == 0) {
cat("✓ No missing values found. No imputation required.")
} else {
DROP_THRESHOLD <- 40
cols_to_drop <- has_missing %>%
filter(pct_missing > DROP_THRESHOLD) %>%
pull(column)
if (length(cols_to_drop) > 0) {
cat("Dropping columns with >", DROP_THRESHOLD, "% missing:", cols_to_drop)
df <- df %>% select(-all_of(cols_to_drop))
}
# Impute numeric with median
df <- df %>%
mutate(across(where(is.numeric),
~ if_else(is.na(.), median(., na.rm = TRUE), .)))
# Impute factor with mode
impute_mode <- function(x) {
mode_val <- names(sort(table(x), decreasing = TRUE))[1]
fct_na_value_to_level(x, mode_val)
}
df <- df %>%
mutate(across(where(is.factor),
~ if (any(is.na(.))) impute_mode(.) else .))
cat("✓ Imputation complete — numeric columns: median | factor columns: mode")
}## ✓ No missing values found. No imputation required.
n_before <- nrow(df)
df <- df %>% distinct()
n_after <- nrow(df)
tibble(
`Rows Before` = n_before,
`Rows After` = n_after,
`Removed` = n_before - n_after
) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Rows Before | Rows After | Removed |
|---|---|---|
| 4269 | 4269 | 0 |
Strategy: IQR-based Winsorization on all financial
columns. Values beyond [Q1 − 1.5×IQR, Q3 + 1.5×IQR] are
capped rather than removed — preserving sample size while limiting the
influence of extreme values.
CIBIL score is hard-clipped to its defined range of 300–900.
cap_outliers_iqr <- function(x) {
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr
pmin(pmax(x, lower), upper)
}
financial_cols <- c(
"income_annum", "loan_amount",
"residential_assets_value", "commercial_assets_value",
"luxury_assets_value", "bank_asset_value"
)
outlier_log <- map_dfr(financial_cols, function(col) {
if (!col %in% names(df)) return(NULL)
n_out <- sum(
df[[col]] < quantile(df[[col]], 0.25) - 1.5 * IQR(df[[col]]) |
df[[col]] > quantile(df[[col]], 0.75) + 1.5 * IQR(df[[col]]),
na.rm = TRUE
)
df[[col]] <<- cap_outliers_iqr(df[[col]])
tibble(column = col, outliers_capped = n_out)
})
# Hard-clip CIBIL score
df <- df %>%
mutate(cibil_score = pmin(pmax(cibil_score, 300), 900))
outlier_log %>%
kable(col.names = c("Column", "Outliers Capped")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Column | Outliers Capped |
|---|---|
| income_annum | 0 |
| loan_amount | 0 |
| residential_assets_value | 52 |
| commercial_assets_value | 37 |
| luxury_assets_value | 0 |
| bank_asset_value | 5 |
New features derived from the raw columns — these will be key variables in both modeling and visualization.
df <- df %>%
mutate(
# Classic lending risk metric
debt_to_income = round(loan_amount / income_annum, 4),
# Sum of all asset columns
total_assets = residential_assets_value + commercial_assets_value +
luxury_assets_value + bank_asset_value,
# How much of total assets the loan represents
loan_to_asset = round(loan_amount / (total_assets + 1), 4),
# Bucketed CIBIL score — useful for bar charts & group comparisons
cibil_band = cut(cibil_score,
breaks = c(300, 500, 600, 700, 800, 900),
labels = c("Poor (300-500)", "Fair (500-600)",
"Good (600-700)", "Very Good (700-800)",
"Excellent (800-900)"),
include.lowest = TRUE),
# Income tercile bracket
income_bracket = cut(income_annum,
breaks = quantile(income_annum,
probs = c(0, 0.33, 0.66, 1),
na.rm = TRUE),
labels = c("Low", "Mid", "High"),
include.lowest = TRUE)
)| Feature | Description |
|---|---|
debt_to_income |
loan_amount / income_annum — core risk signal |
total_assets |
Sum of all 4 asset value columns |
loan_to_asset |
loan_amount / total_assets — leverage ratio |
cibil_band |
CIBIL score bucketed into 5 credit tiers |
income_bracket |
Income split into Low / Mid / High terciles |
## Final dataset: 4269 rows × 17 columns
df %>%
count(loan_status) %>%
mutate(pct = round(n / sum(n) * 100, 1)) %>%
kable(col.names = c("Loan Status", "Count", "%")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Loan Status | Count | % |
|---|---|---|
| Rejected | 1613 | 37.8 |
| Approved | 2656 | 62.2 |
df %>%
select(where(is.factor)) %>%
sapply(levels) %>%
enframe(name = "column", value = "levels") %>%
mutate(levels = map_chr(levels, paste, collapse = " | ")) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| column | levels |
|---|---|
| education | Graduate | Not Graduate |
| self_employed | No | Yes |
| loan_status | Rejected | Approved |
| cibil_band | Poor (300-500) | Fair (500-600) | Good (600-700) | Very Good (700-800) | Excellent (800-900) |
| income_bracket | Low | Mid | High |
df %>%
select(where(is.numeric)) %>%
skim() %>%
as_tibble() %>%
select(skim_variable, numeric.mean, numeric.sd, numeric.p0, numeric.p50, numeric.p100) %>%
rename(variable = skim_variable, mean = numeric.mean, sd = numeric.sd,
min = numeric.p0, median = numeric.p50, max = numeric.p100) %>%
mutate(across(where(is.numeric), ~ round(., 2))) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| variable | mean | sd | min | median | max |
|---|---|---|---|---|---|
| no_of_dependents | 2.50 | 1.70 | 0.0e+00 | 3.00e+00 | 5.000e+00 |
| income_annum | 5059123.92 | 2806839.83 | 2.0e+05 | 5.10e+06 | 9.900e+06 |
| loan_amount | 15133450.46 | 9043362.98 | 3.0e+05 | 1.45e+07 | 3.950e+07 |
| loan_term | 10.90 | 5.71 | 2.0e+00 | 1.00e+01 | 2.000e+01 |
| cibil_score | 599.94 | 172.43 | 3.0e+02 | 6.00e+02 | 9.000e+02 |
| residential_assets_value | 7454860.62 | 6452441.57 | -1.0e+05 | 5.60e+06 | 2.495e+07 |
| commercial_assets_value | 4964289.06 | 4363080.45 | 0.0e+00 | 3.70e+06 | 1.705e+07 |
| luxury_assets_value | 15126305.93 | 9103753.67 | 3.0e+05 | 1.46e+07 | 3.920e+07 |
| bank_asset_value | 4976341.06 | 3249158.58 | 0.0e+00 | 4.60e+06 | 1.430e+07 |
| debt_to_income | 2.98 | 0.60 | 1.5e+00 | 3.00e+00 | 4.000e+00 |
| total_assets | 32521796.67 | 19454084.10 | 4.0e+05 | 3.15e+07 | 8.655e+07 |
| loan_to_asset | 0.49 | 0.15 | 1.8e-01 | 4.70e-01 | 1.330e+00 |