knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Step 1: Read in the Data
hmeq <- read.csv("/Users/dizzysleepyhead/Desktop/Tri/Spring\ 2025/DATA/Week\ 2\ Materials/HMEQ_WK02/HMEQ_Loss.csv", stringsAsFactors = FALSE)
# Structure of the data
str(hmeq)
## 'data.frame': 5960 obs. of 14 variables:
## $ TARGET_BAD_FLAG: int 1 1 1 1 0 1 1 1 1 1 ...
## $ TARGET_LOSS_AMT: int 641 1109 767 1425 NA 335 1841 373 1217 1523 ...
## $ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
## $ MORTDUE : num 25860 70053 13500 NA 97800 ...
## $ VALUE : num 39025 68400 16700 NA 112000 ...
## $ REASON : chr "HomeImp" "HomeImp" "HomeImp" "" ...
## $ JOB : chr "Other" "Other" "Other" "" ...
## $ YOJ : num 10.5 7 4 NA 3 9 5 11 3 16 ...
## $ DEROG : int 0 0 0 NA 0 0 3 0 0 0 ...
## $ DELINQ : int 0 2 0 NA 0 0 2 0 2 0 ...
## $ CLAGE : num 94.4 121.8 149.5 NA 93.3 ...
## $ NINQ : int 1 0 1 NA 0 1 1 0 1 0 ...
## $ CLNO : int 9 14 10 NA 14 8 17 8 12 13 ...
## $ DEBTINC : num NA NA NA NA NA ...
# Summary of the data
summary(hmeq)
## TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN MORTDUE
## Min. :0.0000 Min. : 224 Min. : 1100 Min. : 2063
## 1st Qu.:0.0000 1st Qu.: 5639 1st Qu.:11100 1st Qu.: 46276
## Median :0.0000 Median :11003 Median :16300 Median : 65019
## Mean :0.1995 Mean :13415 Mean :18608 Mean : 73761
## 3rd Qu.:0.0000 3rd Qu.:17634 3rd Qu.:23300 3rd Qu.: 91488
## Max. :1.0000 Max. :78987 Max. :89900 Max. :399550
## NA's :4771 NA's :518
## VALUE REASON JOB YOJ
## Min. : 8000 Length:5960 Length:5960 Min. : 0.000
## 1st Qu.: 66076 Class :character Class :character 1st Qu.: 3.000
## Median : 89236 Mode :character Mode :character Median : 7.000
## Mean :101776 Mean : 8.922
## 3rd Qu.:119824 3rd Qu.:13.000
## Max. :855909 Max. :41.000
## NA's :112 NA's :515
## DEROG DELINQ CLAGE NINQ
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 115.1 1st Qu.: 0.000
## Median : 0.0000 Median : 0.0000 Median : 173.5 Median : 1.000
## Mean : 0.2546 Mean : 0.4494 Mean : 179.8 Mean : 1.186
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 231.6 3rd Qu.: 2.000
## Max. :10.0000 Max. :15.0000 Max. :1168.2 Max. :17.000
## NA's :708 NA's :580 NA's :308 NA's :510
## CLNO DEBTINC
## Min. : 0.0 Min. : 0.5245
## 1st Qu.:15.0 1st Qu.: 29.1400
## Median :20.0 Median : 34.8183
## Mean :21.3 Mean : 33.7799
## 3rd Qu.:26.0 3rd Qu.: 39.0031
## Max. :71.0 Max. :203.3121
## NA's :222 NA's :1267
# First six records
head(hmeq)
## TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN MORTDUE VALUE REASON JOB YOJ DEROG
## 1 1 641 1100 25860 39025 HomeImp Other 10.5 0
## 2 1 1109 1300 70053 68400 HomeImp Other 7.0 0
## 3 1 767 1500 13500 16700 HomeImp Other 4.0 0
## 4 1 1425 1500 NA NA NA NA
## 5 0 NA 1700 97800 112000 HomeImp Office 3.0 0
## 6 1 335 1700 30548 40320 HomeImp Other 9.0 0
## DELINQ CLAGE NINQ CLNO DEBTINC
## 1 0 94.36667 1 9 NA
## 2 2 121.83333 0 14 NA
## 3 0 149.46667 1 10 NA
## 4 NA NA NA NA NA
## 5 0 93.33333 0 14 NA
## 6 0 101.46600 1 8 37.11361
# Step 2: Box-Whisker Plots by TARGET_BAD_FLAG
hmeq_long <- hmeq %>%
select(where(is.numeric), TARGET_BAD_FLAG) %>%
pivot_longer(cols = -TARGET_BAD_FLAG, names_to = "Variable", values_to = "Value")
# Boxplot
ggplot(hmeq_long, aes(x = as.factor(TARGET_BAD_FLAG), y = Value, fill = as.factor(TARGET_BAD_FLAG))) +
geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2, na.rm = TRUE) +
facet_wrap(~ Variable, scales = "free_y") +
labs(title = "Boxplots of Numeric Variables by TARGET_BAD_FLAG - Diyang Yu",
x = "Target (0 = Good Loan, 1 = Bad Loan)",
y = "Value") +
theme_minimal() +
scale_fill_manual(values = c("skyblue", "yellow"))

# Step 3: Histogram + Density Plot
ggplot(hmeq, aes(x = LOAN)) +
geom_histogram(aes(y = after_stat(density)),
bins = 30,
fill = "skyblue", color = "darkblue", alpha = 0.7) +
geom_density(color = "red", linewidth = 1) +
labs(title = "Histogram with Density Curve of LOAN",
x = "Loan Amount",
y = "Density") +
theme_minimal()

# Step 4: Imputation of missing numeric values
# 1. Fix missing TARGET_LOSS_AMT
hmeq$TARGET_LOSS_AMT[is.na(hmeq$TARGET_LOSS_AMT)] <- 0
# 2. Complex Imputation for YOJ — by JOB group
# First, make sure JOB is filled temporarily for grouping
hmeq$JOB <- ifelse(is.na(hmeq$JOB), "Missing", hmeq$JOB)
# Group-wise median imputation for YOJ
if ("YOJ" %in% colnames(hmeq)) {
hmeq <- hmeq %>%
group_by(JOB) %>%
mutate(IMP_YOJ = ifelse(is.na(YOJ), median(YOJ, na.rm = TRUE), YOJ),
M_YOJ = ifelse(is.na(YOJ), 1, 0)) %>%
ungroup()
hmeq$YOJ <- NULL
}
# 3. Median imputation for the rest
vars_to_impute <- c("LOAN", "MORTDUE", "VALUE", "DEROG",
"DELINQ", "CLAGE", "NINQ", "CLNO", "DEBTINC")
for (var in vars_to_impute) {
if (var %in% colnames(hmeq)) {
imp_var <- paste0("IMP_", var)
flag_var <- paste0("M_", var)
hmeq[[flag_var]] <- ifelse(is.na(hmeq[[var]]), 1, 0)
median_val <- median(hmeq[[var]], na.rm = TRUE)
hmeq[[imp_var]] <- ifelse(is.na(hmeq[[var]]), median_val, hmeq[[var]])
hmeq[[var]] <- NULL
}
}
# 4. Summary to confirm imputation & check flags
summary(hmeq)
## TARGET_BAD_FLAG TARGET_LOSS_AMT REASON JOB
## Min. :0.0000 Min. : 0 Length:5960 Length:5960
## 1st Qu.:0.0000 1st Qu.: 0 Class :character Class :character
## Median :0.0000 Median : 0 Mode :character Mode :character
## Mean :0.1995 Mean : 2676
## 3rd Qu.:0.0000 3rd Qu.: 0
## Max. :1.0000 Max. :78987
## IMP_YOJ M_YOJ M_LOAN IMP_LOAN
## Min. : 0.00 Min. :0.00000 Min. :0 Min. : 1100
## 1st Qu.: 3.00 1st Qu.:0.00000 1st Qu.:0 1st Qu.:11100
## Median : 8.00 Median :0.00000 Median :0 Median :16300
## Mean : 8.77 Mean :0.08641 Mean :0 Mean :18608
## 3rd Qu.:12.00 3rd Qu.:0.00000 3rd Qu.:0 3rd Qu.:23300
## Max. :41.00 Max. :1.00000 Max. :0 Max. :89900
## M_MORTDUE IMP_MORTDUE M_VALUE IMP_VALUE
## Min. :0.00000 Min. : 2063 Min. :0.00000 Min. : 8000
## 1st Qu.:0.00000 1st Qu.: 48139 1st Qu.:0.00000 1st Qu.: 66490
## Median :0.00000 Median : 65019 Median :0.00000 Median : 89236
## Mean :0.08691 Mean : 73001 Mean :0.01879 Mean :101540
## 3rd Qu.:0.00000 3rd Qu.: 88200 3rd Qu.:0.00000 3rd Qu.:119005
## Max. :1.00000 Max. :399550 Max. :1.00000 Max. :855909
## M_DEROG IMP_DEROG M_DELINQ IMP_DELINQ
## Min. :0.0000 Min. : 0.0000 Min. :0.00000 Min. : 0.0000
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median :0.0000 Median : 0.0000 Median :0.00000 Median : 0.0000
## Mean :0.1188 Mean : 0.2243 Mean :0.09732 Mean : 0.4057
## 3rd Qu.:0.0000 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.: 0.0000
## Max. :1.0000 Max. :10.0000 Max. :1.00000 Max. :15.0000
## M_CLAGE IMP_CLAGE M_NINQ IMP_NINQ
## Min. :0.00000 Min. : 0.0 Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.: 117.4 1st Qu.:0.00000 1st Qu.: 0.00
## Median :0.00000 Median : 173.5 Median :0.00000 Median : 1.00
## Mean :0.05168 Mean : 179.4 Mean :0.08557 Mean : 1.17
## 3rd Qu.:0.00000 3rd Qu.: 227.1 3rd Qu.:0.00000 3rd Qu.: 2.00
## Max. :1.00000 Max. :1168.2 Max. :1.00000 Max. :17.00
## M_CLNO IMP_CLNO M_DEBTINC IMP_DEBTINC
## Min. :0.00000 Min. : 0.00 Min. :0.0000 Min. : 0.5245
## 1st Qu.:0.00000 1st Qu.:15.00 1st Qu.:0.0000 1st Qu.: 30.7632
## Median :0.00000 Median :20.00 Median :0.0000 Median : 34.8183
## Mean :0.03725 Mean :21.25 Mean :0.2126 Mean : 34.0007
## 3rd Qu.:0.00000 3rd Qu.:26.00 3rd Qu.:0.0000 3rd Qu.: 37.9499
## Max. :1.00000 Max. :71.00 Max. :1.0000 Max. :203.3122
sapply(hmeq[, grep("^M_", names(hmeq))], sum)
## M_YOJ M_LOAN M_MORTDUE M_VALUE M_DEROG M_DELINQ M_CLAGE M_NINQ
## 515 0 518 112 708 580 308 510
## M_CLNO M_DEBTINC
## 222 1267
# Step 5: One-Hot Encoding for Categorical Variables
# Fill NA in REASON and JOB with "Missing"
hmeq$REASON <- ifelse(is.na(hmeq$REASON), "Missing", hmeq$REASON)
hmeq$JOB <- ifelse(is.na(hmeq$JOB), "Missing", hmeq$JOB)
# Get unique values for REASON and JOB
reason_levels <- unique(hmeq$REASON)
job_levels <- unique(hmeq$JOB)
# Create flag variables for REASON
for (val in reason_levels) {
flag_name <- paste0("Flag_REASON_", gsub(" ", "_", val))
hmeq[[flag_name]] <- ifelse(hmeq$REASON == val, 1, 0)
}
# Create flag variables for JOB
for (val in job_levels) {
flag_name <- paste0("Flag_JOB_", gsub(" ", "_", val))
hmeq[[flag_name]] <- ifelse(hmeq$JOB == val, 1, 0)
}
# Drop original REASON and JOB
hmeq$REASON <- NULL
hmeq$JOB <- NULL
# Confirm final structure
summary(hmeq)
## TARGET_BAD_FLAG TARGET_LOSS_AMT IMP_YOJ M_YOJ M_LOAN
## Min. :0.0000 Min. : 0 Min. : 0.00 Min. :0.00000 Min. :0
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.: 3.00 1st Qu.:0.00000 1st Qu.:0
## Median :0.0000 Median : 0 Median : 8.00 Median :0.00000 Median :0
## Mean :0.1995 Mean : 2676 Mean : 8.77 Mean :0.08641 Mean :0
## 3rd Qu.:0.0000 3rd Qu.: 0 3rd Qu.:12.00 3rd Qu.:0.00000 3rd Qu.:0
## Max. :1.0000 Max. :78987 Max. :41.00 Max. :1.00000 Max. :0
## IMP_LOAN M_MORTDUE IMP_MORTDUE M_VALUE
## Min. : 1100 Min. :0.00000 Min. : 2063 Min. :0.00000
## 1st Qu.:11100 1st Qu.:0.00000 1st Qu.: 48139 1st Qu.:0.00000
## Median :16300 Median :0.00000 Median : 65019 Median :0.00000
## Mean :18608 Mean :0.08691 Mean : 73001 Mean :0.01879
## 3rd Qu.:23300 3rd Qu.:0.00000 3rd Qu.: 88200 3rd Qu.:0.00000
## Max. :89900 Max. :1.00000 Max. :399550 Max. :1.00000
## IMP_VALUE M_DEROG IMP_DEROG M_DELINQ
## Min. : 8000 Min. :0.0000 Min. : 0.0000 Min. :0.00000
## 1st Qu.: 66490 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000
## Median : 89236 Median :0.0000 Median : 0.0000 Median :0.00000
## Mean :101540 Mean :0.1188 Mean : 0.2243 Mean :0.09732
## 3rd Qu.:119005 3rd Qu.:0.0000 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :855909 Max. :1.0000 Max. :10.0000 Max. :1.00000
## IMP_DELINQ M_CLAGE IMP_CLAGE M_NINQ
## Min. : 0.0000 Min. :0.00000 Min. : 0.0 Min. :0.00000
## 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 117.4 1st Qu.:0.00000
## Median : 0.0000 Median :0.00000 Median : 173.5 Median :0.00000
## Mean : 0.4057 Mean :0.05168 Mean : 179.4 Mean :0.08557
## 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.: 227.1 3rd Qu.:0.00000
## Max. :15.0000 Max. :1.00000 Max. :1168.2 Max. :1.00000
## IMP_NINQ M_CLNO IMP_CLNO M_DEBTINC
## Min. : 0.00 Min. :0.00000 Min. : 0.00 Min. :0.0000
## 1st Qu.: 0.00 1st Qu.:0.00000 1st Qu.:15.00 1st Qu.:0.0000
## Median : 1.00 Median :0.00000 Median :20.00 Median :0.0000
## Mean : 1.17 Mean :0.03725 Mean :21.25 Mean :0.2126
## 3rd Qu.: 2.00 3rd Qu.:0.00000 3rd Qu.:26.00 3rd Qu.:0.0000
## Max. :17.00 Max. :1.00000 Max. :71.00 Max. :1.0000
## IMP_DEBTINC Flag_REASON_HomeImp Flag_REASON_ Flag_REASON_DebtCon
## Min. : 0.5245 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.: 30.7632 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median : 34.8183 Median :0.0000 Median :0.00000 Median :1.0000
## Mean : 34.0007 Mean :0.2987 Mean :0.04228 Mean :0.6591
## 3rd Qu.: 37.9499 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :203.3122 Max. :1.0000 Max. :1.00000 Max. :1.0000
## Flag_JOB_Other Flag_JOB_ Flag_JOB_Office Flag_JOB_Sales
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.4007 Mean :0.04681 Mean :0.1591 Mean :0.01829
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## Flag_JOB_Mgr Flag_JOB_ProfExe Flag_JOB_Self
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1287 Mean :0.2141 Mean :0.03238
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000