knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Step 1: Read in the Data
hmeq <- read.csv("/Users/dizzysleepyhead/Desktop/Tri/Spring\ 2025/DATA/Week\ 2\ Materials/HMEQ_WK02/HMEQ_Loss.csv", stringsAsFactors = FALSE)

    # Structure of the data
    str(hmeq)
## 'data.frame':    5960 obs. of  14 variables:
##  $ TARGET_BAD_FLAG: int  1 1 1 1 0 1 1 1 1 1 ...
##  $ TARGET_LOSS_AMT: int  641 1109 767 1425 NA 335 1841 373 1217 1523 ...
##  $ LOAN           : int  1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
##  $ MORTDUE        : num  25860 70053 13500 NA 97800 ...
##  $ VALUE          : num  39025 68400 16700 NA 112000 ...
##  $ REASON         : chr  "HomeImp" "HomeImp" "HomeImp" "" ...
##  $ JOB            : chr  "Other" "Other" "Other" "" ...
##  $ YOJ            : num  10.5 7 4 NA 3 9 5 11 3 16 ...
##  $ DEROG          : int  0 0 0 NA 0 0 3 0 0 0 ...
##  $ DELINQ         : int  0 2 0 NA 0 0 2 0 2 0 ...
##  $ CLAGE          : num  94.4 121.8 149.5 NA 93.3 ...
##  $ NINQ           : int  1 0 1 NA 0 1 1 0 1 0 ...
##  $ CLNO           : int  9 14 10 NA 14 8 17 8 12 13 ...
##  $ DEBTINC        : num  NA NA NA NA NA ...
    # Summary of the data
    summary(hmeq)
##  TARGET_BAD_FLAG  TARGET_LOSS_AMT      LOAN          MORTDUE      
##  Min.   :0.0000   Min.   :  224   Min.   : 1100   Min.   :  2063  
##  1st Qu.:0.0000   1st Qu.: 5639   1st Qu.:11100   1st Qu.: 46276  
##  Median :0.0000   Median :11003   Median :16300   Median : 65019  
##  Mean   :0.1995   Mean   :13415   Mean   :18608   Mean   : 73761  
##  3rd Qu.:0.0000   3rd Qu.:17634   3rd Qu.:23300   3rd Qu.: 91488  
##  Max.   :1.0000   Max.   :78987   Max.   :89900   Max.   :399550  
##                   NA's   :4771                    NA's   :518     
##      VALUE           REASON              JOB                 YOJ        
##  Min.   :  8000   Length:5960        Length:5960        Min.   : 0.000  
##  1st Qu.: 66076   Class :character   Class :character   1st Qu.: 3.000  
##  Median : 89236   Mode  :character   Mode  :character   Median : 7.000  
##  Mean   :101776                                         Mean   : 8.922  
##  3rd Qu.:119824                                         3rd Qu.:13.000  
##  Max.   :855909                                         Max.   :41.000  
##  NA's   :112                                            NA's   :515     
##      DEROG             DELINQ            CLAGE             NINQ       
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :   0.0   Min.   : 0.000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.: 115.1   1st Qu.: 0.000  
##  Median : 0.0000   Median : 0.0000   Median : 173.5   Median : 1.000  
##  Mean   : 0.2546   Mean   : 0.4494   Mean   : 179.8   Mean   : 1.186  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 231.6   3rd Qu.: 2.000  
##  Max.   :10.0000   Max.   :15.0000   Max.   :1168.2   Max.   :17.000  
##  NA's   :708       NA's   :580       NA's   :308      NA's   :510     
##       CLNO         DEBTINC        
##  Min.   : 0.0   Min.   :  0.5245  
##  1st Qu.:15.0   1st Qu.: 29.1400  
##  Median :20.0   Median : 34.8183  
##  Mean   :21.3   Mean   : 33.7799  
##  3rd Qu.:26.0   3rd Qu.: 39.0031  
##  Max.   :71.0   Max.   :203.3121  
##  NA's   :222    NA's   :1267
    # First six records
    head(hmeq)
##   TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN MORTDUE  VALUE  REASON    JOB  YOJ DEROG
## 1               1             641 1100   25860  39025 HomeImp  Other 10.5     0
## 2               1            1109 1300   70053  68400 HomeImp  Other  7.0     0
## 3               1             767 1500   13500  16700 HomeImp  Other  4.0     0
## 4               1            1425 1500      NA     NA                  NA    NA
## 5               0              NA 1700   97800 112000 HomeImp Office  3.0     0
## 6               1             335 1700   30548  40320 HomeImp  Other  9.0     0
##   DELINQ     CLAGE NINQ CLNO  DEBTINC
## 1      0  94.36667    1    9       NA
## 2      2 121.83333    0   14       NA
## 3      0 149.46667    1   10       NA
## 4     NA        NA   NA   NA       NA
## 5      0  93.33333    0   14       NA
## 6      0 101.46600    1    8 37.11361
# Step 2: Box-Whisker Plots by TARGET_BAD_FLAG
    hmeq_long <- hmeq %>%
      select(where(is.numeric), TARGET_BAD_FLAG) %>%  
      pivot_longer(cols = -TARGET_BAD_FLAG, names_to = "Variable", values_to = "Value")
    
    # Boxplot
    ggplot(hmeq_long, aes(x = as.factor(TARGET_BAD_FLAG), y = Value, fill = as.factor(TARGET_BAD_FLAG))) +
      geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2, na.rm = TRUE) +
      facet_wrap(~ Variable, scales = "free_y") + 
      labs(title = "Boxplots of Numeric Variables by TARGET_BAD_FLAG - Diyang Yu",
           x = "Target (0 = Good Loan, 1 = Bad Loan)",
           y = "Value") +
      theme_minimal() +
      scale_fill_manual(values = c("skyblue", "yellow"))

# Step 3: Histogram + Density Plot
    ggplot(hmeq, aes(x = LOAN)) +
      geom_histogram(aes(y = after_stat(density)),  
                     bins = 30,
                     fill = "skyblue", color = "darkblue", alpha = 0.7) +
      geom_density(color = "red", linewidth = 1) +   
      labs(title = "Histogram with Density Curve of LOAN",
           x = "Loan Amount",
           y = "Density") +
      theme_minimal()

# Step 4: Imputation of missing numeric values
    
    # 1. Fix missing TARGET_LOSS_AMT
    hmeq$TARGET_LOSS_AMT[is.na(hmeq$TARGET_LOSS_AMT)] <- 0
    
    
    # 2. Complex Imputation for YOJ — by JOB group
      # First, make sure JOB is filled temporarily for grouping
      hmeq$JOB <- ifelse(is.na(hmeq$JOB), "Missing", hmeq$JOB)
      
      # Group-wise median imputation for YOJ
      if ("YOJ" %in% colnames(hmeq)) {
        hmeq <- hmeq %>%
          group_by(JOB) %>%
          mutate(IMP_YOJ = ifelse(is.na(YOJ), median(YOJ, na.rm = TRUE), YOJ),
                 M_YOJ = ifelse(is.na(YOJ), 1, 0)) %>%
          ungroup()
        
        hmeq$YOJ <- NULL
        }
      
    # 3. Median imputation for the rest
      vars_to_impute <- c("LOAN", "MORTDUE", "VALUE", "DEROG",
                          "DELINQ", "CLAGE", "NINQ", "CLNO", "DEBTINC")
      
      for (var in vars_to_impute) {
        if (var %in% colnames(hmeq)) {
          imp_var <- paste0("IMP_", var)
          flag_var <- paste0("M_", var)
          
          hmeq[[flag_var]] <- ifelse(is.na(hmeq[[var]]), 1, 0)
          median_val <- median(hmeq[[var]], na.rm = TRUE)
          hmeq[[imp_var]] <- ifelse(is.na(hmeq[[var]]), median_val, hmeq[[var]])
          hmeq[[var]] <- NULL
          }
        }
      
    
    # 4. Summary to confirm imputation & check flags
    summary(hmeq)
##  TARGET_BAD_FLAG  TARGET_LOSS_AMT    REASON              JOB           
##  Min.   :0.0000   Min.   :    0   Length:5960        Length:5960       
##  1st Qu.:0.0000   1st Qu.:    0   Class :character   Class :character  
##  Median :0.0000   Median :    0   Mode  :character   Mode  :character  
##  Mean   :0.1995   Mean   : 2676                                        
##  3rd Qu.:0.0000   3rd Qu.:    0                                        
##  Max.   :1.0000   Max.   :78987                                        
##     IMP_YOJ          M_YOJ             M_LOAN     IMP_LOAN    
##  Min.   : 0.00   Min.   :0.00000   Min.   :0   Min.   : 1100  
##  1st Qu.: 3.00   1st Qu.:0.00000   1st Qu.:0   1st Qu.:11100  
##  Median : 8.00   Median :0.00000   Median :0   Median :16300  
##  Mean   : 8.77   Mean   :0.08641   Mean   :0   Mean   :18608  
##  3rd Qu.:12.00   3rd Qu.:0.00000   3rd Qu.:0   3rd Qu.:23300  
##  Max.   :41.00   Max.   :1.00000   Max.   :0   Max.   :89900  
##    M_MORTDUE        IMP_MORTDUE        M_VALUE          IMP_VALUE     
##  Min.   :0.00000   Min.   :  2063   Min.   :0.00000   Min.   :  8000  
##  1st Qu.:0.00000   1st Qu.: 48139   1st Qu.:0.00000   1st Qu.: 66490  
##  Median :0.00000   Median : 65019   Median :0.00000   Median : 89236  
##  Mean   :0.08691   Mean   : 73001   Mean   :0.01879   Mean   :101540  
##  3rd Qu.:0.00000   3rd Qu.: 88200   3rd Qu.:0.00000   3rd Qu.:119005  
##  Max.   :1.00000   Max.   :399550   Max.   :1.00000   Max.   :855909  
##     M_DEROG         IMP_DEROG          M_DELINQ         IMP_DELINQ     
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000   Min.   : 0.0000  
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.: 0.0000  
##  Median :0.0000   Median : 0.0000   Median :0.00000   Median : 0.0000  
##  Mean   :0.1188   Mean   : 0.2243   Mean   :0.09732   Mean   : 0.4057  
##  3rd Qu.:0.0000   3rd Qu.: 0.0000   3rd Qu.:0.00000   3rd Qu.: 0.0000  
##  Max.   :1.0000   Max.   :10.0000   Max.   :1.00000   Max.   :15.0000  
##     M_CLAGE          IMP_CLAGE          M_NINQ           IMP_NINQ    
##  Min.   :0.00000   Min.   :   0.0   Min.   :0.00000   Min.   : 0.00  
##  1st Qu.:0.00000   1st Qu.: 117.4   1st Qu.:0.00000   1st Qu.: 0.00  
##  Median :0.00000   Median : 173.5   Median :0.00000   Median : 1.00  
##  Mean   :0.05168   Mean   : 179.4   Mean   :0.08557   Mean   : 1.17  
##  3rd Qu.:0.00000   3rd Qu.: 227.1   3rd Qu.:0.00000   3rd Qu.: 2.00  
##  Max.   :1.00000   Max.   :1168.2   Max.   :1.00000   Max.   :17.00  
##      M_CLNO           IMP_CLNO       M_DEBTINC       IMP_DEBTINC      
##  Min.   :0.00000   Min.   : 0.00   Min.   :0.0000   Min.   :  0.5245  
##  1st Qu.:0.00000   1st Qu.:15.00   1st Qu.:0.0000   1st Qu.: 30.7632  
##  Median :0.00000   Median :20.00   Median :0.0000   Median : 34.8183  
##  Mean   :0.03725   Mean   :21.25   Mean   :0.2126   Mean   : 34.0007  
##  3rd Qu.:0.00000   3rd Qu.:26.00   3rd Qu.:0.0000   3rd Qu.: 37.9499  
##  Max.   :1.00000   Max.   :71.00   Max.   :1.0000   Max.   :203.3122
    sapply(hmeq[, grep("^M_", names(hmeq))], sum)
##     M_YOJ    M_LOAN M_MORTDUE   M_VALUE   M_DEROG  M_DELINQ   M_CLAGE    M_NINQ 
##       515         0       518       112       708       580       308       510 
##    M_CLNO M_DEBTINC 
##       222      1267
# Step 5: One-Hot Encoding for Categorical Variables
    
    # Fill NA in REASON and JOB with "Missing"
    hmeq$REASON <- ifelse(is.na(hmeq$REASON), "Missing", hmeq$REASON)
    hmeq$JOB <- ifelse(is.na(hmeq$JOB), "Missing", hmeq$JOB)
    
    # Get unique values for REASON and JOB
    reason_levels <- unique(hmeq$REASON)
    job_levels <- unique(hmeq$JOB)
    
    # Create flag variables for REASON
    for (val in reason_levels) {
      flag_name <- paste0("Flag_REASON_", gsub(" ", "_", val))
      hmeq[[flag_name]] <- ifelse(hmeq$REASON == val, 1, 0)
    }
    
    # Create flag variables for JOB
    for (val in job_levels) {
      flag_name <- paste0("Flag_JOB_", gsub(" ", "_", val))
      hmeq[[flag_name]] <- ifelse(hmeq$JOB == val, 1, 0)
    }
    
    # Drop original REASON and JOB
    hmeq$REASON <- NULL
    hmeq$JOB <- NULL
    
    # Confirm final structure
    summary(hmeq)
##  TARGET_BAD_FLAG  TARGET_LOSS_AMT    IMP_YOJ          M_YOJ             M_LOAN 
##  Min.   :0.0000   Min.   :    0   Min.   : 0.00   Min.   :0.00000   Min.   :0  
##  1st Qu.:0.0000   1st Qu.:    0   1st Qu.: 3.00   1st Qu.:0.00000   1st Qu.:0  
##  Median :0.0000   Median :    0   Median : 8.00   Median :0.00000   Median :0  
##  Mean   :0.1995   Mean   : 2676   Mean   : 8.77   Mean   :0.08641   Mean   :0  
##  3rd Qu.:0.0000   3rd Qu.:    0   3rd Qu.:12.00   3rd Qu.:0.00000   3rd Qu.:0  
##  Max.   :1.0000   Max.   :78987   Max.   :41.00   Max.   :1.00000   Max.   :0  
##     IMP_LOAN       M_MORTDUE        IMP_MORTDUE        M_VALUE       
##  Min.   : 1100   Min.   :0.00000   Min.   :  2063   Min.   :0.00000  
##  1st Qu.:11100   1st Qu.:0.00000   1st Qu.: 48139   1st Qu.:0.00000  
##  Median :16300   Median :0.00000   Median : 65019   Median :0.00000  
##  Mean   :18608   Mean   :0.08691   Mean   : 73001   Mean   :0.01879  
##  3rd Qu.:23300   3rd Qu.:0.00000   3rd Qu.: 88200   3rd Qu.:0.00000  
##  Max.   :89900   Max.   :1.00000   Max.   :399550   Max.   :1.00000  
##    IMP_VALUE         M_DEROG         IMP_DEROG          M_DELINQ      
##  Min.   :  8000   Min.   :0.0000   Min.   : 0.0000   Min.   :0.00000  
##  1st Qu.: 66490   1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.00000  
##  Median : 89236   Median :0.0000   Median : 0.0000   Median :0.00000  
##  Mean   :101540   Mean   :0.1188   Mean   : 0.2243   Mean   :0.09732  
##  3rd Qu.:119005   3rd Qu.:0.0000   3rd Qu.: 0.0000   3rd Qu.:0.00000  
##  Max.   :855909   Max.   :1.0000   Max.   :10.0000   Max.   :1.00000  
##    IMP_DELINQ         M_CLAGE          IMP_CLAGE          M_NINQ       
##  Min.   : 0.0000   Min.   :0.00000   Min.   :   0.0   Min.   :0.00000  
##  1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.: 117.4   1st Qu.:0.00000  
##  Median : 0.0000   Median :0.00000   Median : 173.5   Median :0.00000  
##  Mean   : 0.4057   Mean   :0.05168   Mean   : 179.4   Mean   :0.08557  
##  3rd Qu.: 0.0000   3rd Qu.:0.00000   3rd Qu.: 227.1   3rd Qu.:0.00000  
##  Max.   :15.0000   Max.   :1.00000   Max.   :1168.2   Max.   :1.00000  
##     IMP_NINQ         M_CLNO           IMP_CLNO       M_DEBTINC     
##  Min.   : 0.00   Min.   :0.00000   Min.   : 0.00   Min.   :0.0000  
##  1st Qu.: 0.00   1st Qu.:0.00000   1st Qu.:15.00   1st Qu.:0.0000  
##  Median : 1.00   Median :0.00000   Median :20.00   Median :0.0000  
##  Mean   : 1.17   Mean   :0.03725   Mean   :21.25   Mean   :0.2126  
##  3rd Qu.: 2.00   3rd Qu.:0.00000   3rd Qu.:26.00   3rd Qu.:0.0000  
##  Max.   :17.00   Max.   :1.00000   Max.   :71.00   Max.   :1.0000  
##   IMP_DEBTINC       Flag_REASON_HomeImp  Flag_REASON_     Flag_REASON_DebtCon
##  Min.   :  0.5245   Min.   :0.0000      Min.   :0.00000   Min.   :0.0000     
##  1st Qu.: 30.7632   1st Qu.:0.0000      1st Qu.:0.00000   1st Qu.:0.0000     
##  Median : 34.8183   Median :0.0000      Median :0.00000   Median :1.0000     
##  Mean   : 34.0007   Mean   :0.2987      Mean   :0.04228   Mean   :0.6591     
##  3rd Qu.: 37.9499   3rd Qu.:1.0000      3rd Qu.:0.00000   3rd Qu.:1.0000     
##  Max.   :203.3122   Max.   :1.0000      Max.   :1.00000   Max.   :1.0000     
##  Flag_JOB_Other     Flag_JOB_       Flag_JOB_Office  Flag_JOB_Sales   
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.4007   Mean   :0.04681   Mean   :0.1591   Mean   :0.01829  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   Flag_JOB_Mgr    Flag_JOB_ProfExe Flag_JOB_Self    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.1287   Mean   :0.2141   Mean   :0.03238  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000