##Missing Data Task #=======Explorasi missing data=======

# Load the loan dataset from CSV (read_csv() is the readr reader, as the
# spec_tbl_df output below shows).
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists. Consider a project-relative path.
csv_path <- "C:/Users/Muhammad Nisar/Downloads/data_missing.csv"
data_missing <- read_csv(file = csv_path)
## Rows: 614 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Loan_ID, Gender, Married, Dependents, Education, Self_Employed, Pro...
## dbl (5): ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, C...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the tibble: first rows plus the type of every column.
print(data_missing)
## # A tibble: 614 × 13
##    Loan_ID  Gender Married Dependents Education    Self_Employed ApplicantIncome
##    <chr>    <chr>  <chr>   <chr>      <chr>        <chr>                   <dbl>
##  1 LP001002 Male   No      0          Graduate     No                       5849
##  2 LP001003 Male   Yes     1          Graduate     No                       4583
##  3 LP001005 Male   Yes     0          Graduate     Yes                      3000
##  4 LP001006 Male   Yes     0          Not Graduate No                       2583
##  5 LP001008 Male   No      0          Graduate     No                       6000
##  6 LP001011 Male   Yes     2          Graduate     Yes                      5417
##  7 LP001013 Male   Yes     0          Not Graduate No                       2333
##  8 LP001014 Male   Yes     3+         Graduate     No                       3036
##  9 LP001018 Male   Yes     2          Graduate     No                       4006
## 10 LP001020 Male   Yes     1          Graduate     No                      12841
## # ℹ 604 more rows
## # ℹ 6 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## #   Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>,
## #   Loan_Status <chr>
# Inspect the structure: column classes, first values and the readr column
# specification attached to the tibble.
str(data_missing)
## spc_tbl_ [614 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Loan_ID          : chr [1:614] "LP001002" "LP001003" "LP001005" "LP001006" ...
##  $ Gender           : chr [1:614] "Male" "Male" "Male" "Male" ...
##  $ Married          : chr [1:614] "No" "Yes" "Yes" "Yes" ...
##  $ Dependents       : chr [1:614] "0" "1" "0" "0" ...
##  $ Education        : chr [1:614] "Graduate" "Graduate" "Graduate" "Not Graduate" ...
##  $ Self_Employed    : chr [1:614] "No" "No" "Yes" "No" ...
##  $ ApplicantIncome  : num [1:614] 5849 4583 3000 2583 6000 ...
##  $ CoapplicantIncome: num [1:614] 0 1508 0 2358 0 ...
##  $ LoanAmount       : num [1:614] NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : num [1:614] 360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : num [1:614] 1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : chr [1:614] "Urban" "Rural" "Urban" "Urban" ...
##  $ Loan_Status      : chr [1:614] "Y" "N" "Y" "Y" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Loan_ID = col_character(),
##   ..   Gender = col_character(),
##   ..   Married = col_character(),
##   ..   Dependents = col_character(),
##   ..   Education = col_character(),
##   ..   Self_Employed = col_character(),
##   ..   ApplicantIncome = col_double(),
##   ..   CoapplicantIncome = col_double(),
##   ..   LoanAmount = col_double(),
##   ..   Loan_Amount_Term = col_double(),
##   ..   Credit_History = col_double(),
##   ..   Property_Area = col_character(),
##   ..   Loan_Status = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary; for numeric columns this also reports the NA count.
summary(data_missing)
##    Loan_ID             Gender            Married           Dependents       
##  Length:614         Length:614         Length:614         Length:614        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Education         Self_Employed      ApplicantIncome CoapplicantIncome
##  Length:614         Length:614         Min.   :  150   Min.   :    0    
##  Class :character   Class :character   1st Qu.: 2878   1st Qu.:    0    
##  Mode  :character   Mode  :character   Median : 3812   Median : 1188    
##                                        Mean   : 5403   Mean   : 1621    
##                                        3rd Qu.: 5795   3rd Qu.: 2297    
##                                        Max.   :81000   Max.   :41667    
##                                                                         
##    LoanAmount    Loan_Amount_Term Credit_History   Property_Area     
##  Min.   :  9.0   Min.   : 12      Min.   :0.0000   Length:614        
##  1st Qu.:100.0   1st Qu.:360      1st Qu.:1.0000   Class :character  
##  Median :128.0   Median :360      Median :1.0000   Mode  :character  
##  Mean   :146.4   Mean   :342      Mean   :0.8422                     
##  3rd Qu.:168.0   3rd Qu.:360      3rd Qu.:1.0000                     
##  Max.   :700.0   Max.   :480      Max.   :1.0000                     
##  NA's   :22      NA's   :14       NA's   :50                         
##  Loan_Status       
##  Length:614        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Number of missing values in each column.
# vapply() is used instead of sapply() so the result is always an atomic
# vector of the declared type (sapply() returns a list for a zero-column data
# frame, which would break the arithmetic below).
missing_count <- vapply(data_missing, function(x) sum(is.na(x)), integer(1))
# Percentage of missing values in each column.
missing_percent <- vapply(
  data_missing,
  function(x) mean(is.na(x)),
  numeric(1)
) * 100
# Combine counts and percentages into one table and display it.
missing_summary <- data.frame(
  Variable = names(data_missing),
  Missing_Count = missing_count,
  Missing_Percent = round(missing_percent, 2)
)
print(missing_summary)
##                            Variable Missing_Count Missing_Percent
## Loan_ID                     Loan_ID             0            0.00
## Gender                       Gender            13            2.12
## Married                     Married             3            0.49
## Dependents               Dependents            15            2.44
## Education                 Education             0            0.00
## Self_Employed         Self_Employed            32            5.21
## ApplicantIncome     ApplicantIncome             0            0.00
## CoapplicantIncome CoapplicantIncome             0            0.00
## LoanAmount               LoanAmount            22            3.58
## Loan_Amount_Term   Loan_Amount_Term            14            2.28
## Credit_History       Credit_History            50            8.14
## Property_Area         Property_Area             0            0.00
## Loan_Status             Loan_Status             0            0.00
# Report the dataset dimensions as (rows, columns).
cat("Jumlah Baris dan Kolom:\n")
## Jumlah Baris dan Kolom:
print(c(nrow(data_missing), ncol(data_missing)))
## [1] 614  13

#=======Visualisasi NA Variabel Numerik (Before Imputation)=======

# Histogram of a variable, with bars filled by missing-value status.
#
# x        : vector to plot (may contain NA).
# binwidth : width of the histogram bins (default 20).
#
# Returns a ggplot object. Note that, as the stat_bin() warnings printed where
# this function is used show, the NA rows are removed before binning, so the
# missing observations themselves are not drawn as bars.
distribution <- function(x, binwidth = 20) {
  plot_data <- data.frame(value = x, is_na = is.na(x))
  status_colors <- c("FALSE" = "skyblue", "TRUE" = "red")

  histogram <- ggplot(plot_data, aes(x = value, fill = is_na)) +
    geom_histogram(binwidth = binwidth, color = "black")

  histogram +
    scale_fill_manual(
      values = status_colors,
      labels = c("Valid", "Missing"),
      name = "Status"
    ) +
    labs(
      title = "Distribusi Variabel dengan NA",
      x = "Nilai",
      y = "Frekuensi"
    ) +
    theme_minimal()
}
# Gap pattern of missing values: one point per observation, placed at
# "Valid" or "Missing" along the y axis, in index order.
#
# x : vector to inspect (may contain NA).
#
# Returns a ggplot object.
gapsize <- function(x) {
  # seq_along() instead of 1:length(x): for an empty vector 1:0 yields c(1, 0)
  # and data.frame() would fail on mismatched column lengths.
  df <- data.frame(index = seq_along(x), is_na = is.na(x))

  ggplot(df, aes(x = index, y = as.numeric(is_na))) +
    geom_point(aes(color = is_na), size = 2) +
    scale_color_manual(values = c("FALSE" = "skyblue", "TRUE" = "red"),
                       labels = c("Valid", "Missing"),
                       name = "Status") +
    # is_na is plotted as 0/1, so relabel the two ticks.
    scale_y_continuous(breaks = c(0, 1), labels = c("Valid", "Missing")) +
    labs(title = "Pola Gap (Missing vs Valid)", x = "Index", y = "") +
    theme_minimal()
}

# a) LoanAmount
# Distribution of the variable (NA rows are dropped by the histogram stat,
# see the warning below).
distribution(data_missing$LoanAmount)
## Warning: Removed 22 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Position of the missing values along the row index.
gapsize(data_missing$LoanAmount)

# b) Loan_Amount_Term
# Distribution of the variable (NA rows are dropped by the histogram stat,
# see the warning below).
distribution(data_missing$Loan_Amount_Term)
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Position of the missing values along the row index.
gapsize(data_missing$Loan_Amount_Term)

#=======Imputation Methods======= #=======1. Ad-hoc methods (such as mean, median, or mode imputation)=======

# Work on copies so the original data stays untouched.
data_median <- data_missing
data_mode   <- data_missing

# Columns to impute, split by how they are treated in this analysis.
num_vars <- c("LoanAmount", "Loan_Amount_Term")     # numeric
cat_vars <- c("Gender", "Married", "Dependents",
              "Self_Employed", "Credit_History")    # categorical (incl. binary)

# 1a. Median imputation: fill the NA entries of each numeric column with the
# median of its observed values.
for (col in num_vars) {
  median_val <- median(data_missing[[col]], na.rm = TRUE)
  current <- data_median[[col]]
  data_median[[col]] <- replace(current, is.na(current), median_val)
}
# Verify: the numeric columns should now report zero NA.
cat("Cek Data After Imputasi\n")
## Cek Data After Imputasi
print(colSums(is.na(data_median)))
##           Loan_ID            Gender           Married        Dependents 
##                 0                13                 3                15 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                32                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                 0                 0                50                 0 
##       Loan_Status 
##                 0
# 1b. Mode imputation: fill the NA entries of each categorical column with its
# most frequent observed value.
for (col in cat_vars) {
  # mfv() returns every most-frequent value, so a tie yields more than one
  # element. Keep only the first so the replacement is a single value and is
  # never recycled across the NA positions.
  mode_val <- mfv(data_missing[[col]], na_rm = TRUE)[1]
  data_mode[[col]][is.na(data_mode[[col]])] <- mode_val
}
# Verify: the categorical columns should now report zero NA.
cat("Cek Data After Imputasi\n")
## Cek Data After Imputasi
print(colSums(is.na(data_mode)))
##           Loan_ID            Gender           Married        Dependents 
##                 0                 0                 0                 0 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                 0                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                22                14                 0                 0 
##       Loan_Status 
##                 0
cat("Kalau Gabungan, mean dan mode atau median dan mode")
## Kalau Gabungan, mean dan mode atau median dan mode
# Start the combined (median + mode) imputation from a fresh copy.
data_median_mode <- data_missing

# Same numeric / categorical split as in the separate imputations above;
# reuse those vectors instead of repeating the column names.
num_var <- num_vars     # numeric
cat_var <- cat_vars     # categorical (incl. binary)

# NA counts per column before the combined imputation.
cat("Cek Data Before Imputasi\n")
## Cek Data Before Imputasi
print(colSums(is.na(data_median_mode)))
##           Loan_ID            Gender           Married        Dependents 
##                 0                13                 3                15 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                32                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                22                14                50                 0 
##       Loan_Status 
##                 0
# Numeric columns: replace NA with the median of the observed values.
for (col in num_var) {
  median_val <- median(data_missing[[col]], na.rm = TRUE)
  data_median_mode[[col]][is.na(data_median_mode[[col]])] <- median_val
}

# Categorical columns: replace NA with the most frequent observed value.
for (col in cat_var) {
  # mfv() returns all tied modes; take the first so exactly one value is
  # assigned and nothing is recycled across the NA positions.
  mode_val <- mfv(data_missing[[col]], na_rm = TRUE)[1]
  data_median_mode[[col]][is.na(data_median_mode[[col]])] <- mode_val
}

# Verify: every column should now report zero NA.
cat("Cek Data After Imputasi\n")
## Cek Data After Imputasi
print(colSums(is.na(data_median_mode)))
##           Loan_ID            Gender           Married        Dependents 
##                 0                 0                 0                 0 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                 0                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                 0                 0                 0                 0 
##       Loan_Status 
##                 0

#=======2. Multiple Imputation by Chained Equations (MICE)=======

# Work on a copy of the original data.
data_mice <- data_missing

# Convert the categorical variables to factors (required by the logreg and
# polyreg imputation methods selected below).
factor_cols <- c("Gender", "Married", "Dependents",
                 "Self_Employed", "Credit_History")
data_mice[factor_cols] <- lapply(data_mice[factor_cols], as.factor)

# Choose an imputation method per column; "" means "do not impute".
meth <- make.method(data_mice)
complete_cols <- c("Loan_ID", "ApplicantIncome", "CoapplicantIncome",
                   "Education", "Property_Area", "Loan_Status")
meth[complete_cols] <- ""
# Two-level factors: logistic regression.
meth[c("Gender", "Married", "Self_Employed", "Credit_History")] <- "logreg"
# Multi-level factor: polytomous regression.
meth["Dependents"] <- "polyreg"
# Numeric columns: predictive mean matching.
meth[c("LoanAmount", "Loan_Amount_Term")] <- "pmm"

# Run the multiple imputation, producing m = 5 imputed datasets.
imp <- mice(data_mice, m = 5, method = meth, seed = 123)
## 
##  iter imp variable
##   1   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
## Warning: Number of logged events: 4
# Extract the first of the m = 5 imputed datasets as one completed data frame.
# NOTE(review): only imputation 1 is used from here on; the other four are
# ignored, so the analysis does not reflect between-imputation variability.
# Consider fitting on all imputations and pooling the results.
completed_data <- complete(imp, 1)

# Visualise the numeric variables after imputation.
visualisasi <- data_missing

# Overlaid histograms of one variable before and after imputation.
#
# original : values before imputation (may contain NA).
# imputed  : values after imputation; must have the same length as
#            `original`.
# binwidth : width of the histogram bins (default 20).
#
# Returns a ggplot object. Stops with an error when the two vectors differ in
# length, because data.frame() would otherwise silently recycle the shorter
# one and the "Imputed" series would not show the imputed data.
imputations_hist <- function(original, imputed, binwidth = 20) {
  stopifnot(length(original) == length(imputed))

  df <- data.frame(
    value = c(original, imputed),
    status = rep(c("Original", "Imputed"), each = length(original))
  )

  ggplot(df, aes(x = value, fill = status)) +
    geom_histogram(position = "identity", alpha = 0.5,
                   binwidth = binwidth, color = "black") +
    scale_fill_manual(values = c("Original" = "skyblue", "Imputed" = "red")) +
    labs(title = "Distribusi Sebelum vs Sesudah Imputasi",
         x = "Nilai", y = "Frekuensi") +
    theme_minimal()
}

# Compare the original columns with the completed (imputed) data.
# `imp` is the object returned by mice(), not a data frame: `imp$LoanAmount`
# is NULL, so the original values were previously plotted twice (the old
# warnings reported 44 = 2 x 22 and 28 = 2 x 14 removed rows). The imputed
# values live in `completed_data`.
print(imputations_hist(visualisasi$LoanAmount, completed_data$LoanAmount))

print(imputations_hist(visualisasi$Loan_Amount_Term,
                       completed_data$Loan_Amount_Term))

#=======Model Building=======

# Shared preparation for every modelling dataset: remove the Loan_ID
# identifier column and encode Loan_Status as a two-level factor ("N", "Y").
prepare_model_data <- function(df) {
  df <- subset(df, select = -Loan_ID)
  df$Loan_Status <- factor(df$Loan_Status, levels = c("N", "Y"))
  df
}

# Complete cases only (rows with any NA removed), ad-hoc imputed data and
# MICE-imputed data, respectively.
data_drop        <- prepare_model_data(na.omit(data_missing))
data_median_mode <- prepare_model_data(data_median_mode)
completed_data   <- prepare_model_data(completed_data)

# 1. Logistic regression without imputation: fitted on data_drop, i.e. only
# the rows that had no missing value. Loan_Status is regressed on all other
# columns.
model_drop <- glm(Loan_Status ~ ., data = data_drop, family = binomial)
summary(model_drop)
## 
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = data_drop)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.429e+00  9.312e-01  -2.609  0.00909 ** 
## GenderMale              3.254e-01  3.309e-01   0.983  0.32548    
## MarriedYes              5.739e-01  2.924e-01   1.963  0.04970 *  
## Dependents1            -3.756e-01  3.460e-01  -1.085  0.27771    
## Dependents2             2.770e-01  3.782e-01   0.733  0.46378    
## Dependents3+            1.884e-01  4.874e-01   0.386  0.69915    
## EducationNot Graduate  -4.210e-01  3.033e-01  -1.388  0.16510    
## Self_EmployedYes       -1.492e-01  3.523e-01  -0.423  0.67202    
## ApplicantIncome         6.945e-06  2.862e-05   0.243  0.80827    
## CoapplicantIncome      -5.143e-05  4.307e-05  -1.194  0.23246    
## LoanAmount             -2.737e-03  1.773e-03  -1.544  0.12270    
## Loan_Amount_Term       -9.253e-04  2.032e-03  -0.455  0.64885    
## Credit_History          3.650e+00  4.331e-01   8.427  < 2e-16 ***
## Property_AreaSemiurban  9.873e-01  3.036e-01   3.253  0.00114 ** 
## Property_AreaUrban      1.511e-01  3.007e-01   0.503  0.61527    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 593.05  on 479  degrees of freedom
## Residual deviance: 435.72  on 465  degrees of freedom
## AIC: 465.72
## 
## Number of Fisher Scoring iterations: 5
# 2. Logistic regression on the ad-hoc imputed data (median for the numeric
# columns, mode for the categorical ones).
model_ad <- glm(Loan_Status ~ ., data = data_median_mode, family = binomial)

summary(model_ad)
## 
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = data_median_mode)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.450e+00  8.483e-01  -2.888 0.003873 ** 
## GenderMale             -3.164e-02  2.989e-01  -0.106 0.915701    
## MarriedYes              5.814e-01  2.527e-01   2.301 0.021408 *  
## Dependents1            -4.722e-01  2.947e-01  -1.602 0.109124    
## Dependents2             2.899e-01  3.427e-01   0.846 0.397622    
## Dependents3+            2.183e-02  4.259e-01   0.051 0.959119    
## EducationNot Graduate  -4.063e-01  2.596e-01  -1.565 0.117596    
## Self_EmployedYes       -2.604e-02  3.169e-01  -0.082 0.934506    
## ApplicantIncome         1.023e-05  2.447e-05   0.418 0.675924    
## CoapplicantIncome      -5.378e-05  3.526e-05  -1.525 0.127229    
## LoanAmount             -1.761e-03  1.602e-03  -1.099 0.271641    
## Loan_Amount_Term       -1.267e-03  1.827e-03  -0.693 0.488087    
## Credit_History          3.936e+00  4.210e-01   9.349  < 2e-16 ***
## Property_AreaSemiurban  9.075e-01  2.696e-01   3.366 0.000763 ***
## Property_AreaUrban      2.218e-01  2.597e-01   0.854 0.393134    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 762.89  on 613  degrees of freedom
## Residual deviance: 557.45  on 599  degrees of freedom
## AIC: 587.45
## 
## Number of Fisher Scoring iterations: 5
# 3. Logistic regression on the MICE-imputed data (first imputed dataset).
# Credit_History is a factor in this dataset, hence the coefficient name
# "Credit_History1" below instead of "Credit_History".
model_mice <- glm(Loan_Status ~ ., data = completed_data, family = binomial)
summary(model_mice)
## 
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = completed_data)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -1.430e+00  7.609e-01  -1.879 0.060265 .  
## GenderMale             -1.344e-01  2.905e-01  -0.463 0.643702    
## MarriedYes              6.498e-01  2.483e-01   2.617 0.008866 ** 
## Dependents1            -4.652e-01  2.898e-01  -1.605 0.108410    
## Dependents2             1.808e-01  3.228e-01   0.560 0.575496    
## Dependents3+           -1.179e-01  4.035e-01  -0.292 0.770204    
## EducationNot Graduate  -4.093e-01  2.506e-01  -1.633 0.102478    
## Self_EmployedYes       -1.135e-01  2.957e-01  -0.384 0.701096    
## ApplicantIncome         8.971e-06  2.272e-05   0.395 0.692966    
## CoapplicantIncome      -3.984e-05  3.907e-05  -1.020 0.307871    
## LoanAmount             -1.669e-03  1.518e-03  -1.099 0.271651    
## Loan_Amount_Term       -1.590e-03  1.766e-03  -0.900 0.368174    
## Credit_History1         3.072e+00  3.060e-01  10.041  < 2e-16 ***
## Property_AreaSemiurban  8.559e-01  2.588e-01   3.307 0.000944 ***
## Property_AreaUrban      2.386e-01  2.532e-01   0.943 0.345923    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 762.89  on 613  degrees of freedom
## Residual deviance: 590.42  on 599  degrees of freedom
## AIC: 620.42
## 
## Number of Fisher Scoring iterations: 4

#======== Hitung Akurasi =========

# Classification accuracy of a fitted binomial model on a dataset.
#
# model     : fitted model usable with predict(..., type = "response").
# data      : data frame holding the predictors and a `Loan_Status` column
#             coded "N" / "Y".
# threshold : predicted probability above which "Y" is predicted. Defaults to
#             0.5, the cut-off that was previously hard-coded.
#
# Returns the proportion of rows whose predicted label equals the actual one.
# When `data` is the data the model was fitted on, this is in-sample accuracy.
accuracy <- function(model, data, threshold = 0.5) {
  prob <- predict(model, newdata = data, type = "response")
  pred <- ifelse(prob > threshold, "Y", "N")
  actual <- as.character(data$Loan_Status)
  mean(pred == actual)
}

# Accuracy of each model on the dataset it was fitted on.
acc_drop <- accuracy(model_drop, data_drop)
acc_ad   <- accuracy(model_ad, data_median_mode)
acc_mice <- accuracy(model_mice, completed_data)

# Print one labelled line per imputation strategy.
accuracy_lines <- list(
  list("Accuracy tanpa imputasi (drop NA): ", acc_drop),
  list("Accuracy ad-hoc imputasi (median+modus): ", acc_ad),
  list("Accuracy MICE imputasi: ", acc_mice)
)
for (entry in accuracy_lines) {
  cat(entry[[1]], entry[[2]], "\n")
}
## Accuracy tanpa imputasi (drop NA):  0.8125
## Accuracy ad-hoc imputasi (median+modus):  0.8127036
## Accuracy MICE imputasi:  0.7964169

#====== Model Evaluation =========

# Evaluate with k-fold cross-validation (k = 5).

# Fix the random seed so the fold assignment is reproducible.
set.seed(123)

# k-fold cross-validated accuracy of a logistic regression of Loan_Status on
# all other columns.
#
# data : data frame with a two-level `Loan_Status` factor ("N" / "Y") and the
#        predictor columns.
# k    : number of folds requested from createFolds() (default 5).
#
# Returns a numeric vector with one accuracy value per fold.
cv_logreg <- function(data, k = 5) {
  folds <- createFolds(data$Loan_Status, k = k, list = TRUE)
  all_rows <- seq_len(nrow(data))
  # Preallocate, and iterate over the folds actually returned rather than
  # assuming there are exactly k of them.
  acc <- numeric(length(folds))

  for (i in seq_along(folds)) {
    test_idx <- folds[[i]]
    train <- data[setdiff(all_rows, test_idx), ]
    test  <- data[test_idx, ]

    model <- glm(Loan_Status ~ ., data = train, family = binomial)
    prob  <- predict(model, newdata = test, type = "response")
    pred  <- ifelse(prob > 0.5, "Y", "N")
    acc[i] <- mean(pred == as.character(test$Loan_Status))
  }
  acc
}

# Run the cross-validation on each dataset. The call order matters because
# all three draw from the same seeded random stream.
cv_drop <- cv_logreg(data_drop, k = 5)
cv_ad   <- cv_logreg(data_median_mode, k = 5)
cv_mice <- cv_logreg(completed_data, k = 5)

# Mean and standard deviation of the fold accuracies per method.
cv_scores <- list(cv_drop, cv_ad, cv_mice)
method_labels <- c("Tanpa Imputasi (drop NA)",
                   "Ad-hoc Imputasi (Median+Mode)",
                   "MICE Imputasi")
cv_results <- data.frame(
  Method = method_labels,
  Mean_Accuracy = vapply(cv_scores, mean, numeric(1)),
  SD_Accuracy   = vapply(cv_scores, sd, numeric(1))
)

print(cv_results)
##                          Method Mean_Accuracy SD_Accuracy
## 1      Tanpa Imputasi (drop NA)     0.7937853  0.03689531
## 2 Ad-hoc Imputasi (Median+Mode)     0.8111235  0.03152236
## 3                 MICE Imputasi     0.7964365  0.02553660
# Box plot of the cross-validation accuracies.
# Long format: one row per (method, fold). The repeat counts and fold numbers
# are derived from the actual result lengths instead of a hard-coded 5, so
# the table stays consistent if the number of folds changes.
cv_by_method <- list(
  "Tanpa Imputasi (drop NA)" = cv_drop,
  "Ad-hoc Imputasi (Median+Mode)" = cv_ad,
  "MICE Imputasi" = cv_mice
)
fold_counts <- lengths(cv_by_method, use.names = FALSE)
cv_all <- data.frame(
  Accuracy = unlist(cv_by_method, use.names = FALSE),
  Method   = rep(names(cv_by_method), times = fold_counts),
  Fold     = sequence(fold_counts)
)

# Box plot per method, with each fold's accuracy overlaid as a jittered point.
ggplot(cv_all, aes(x = Method, y = Accuracy, fill = Method)) +
  # Outliers are hidden here because every fold is already drawn as a point.
  geom_boxplot(alpha = 0.6, outlier.shape = NA) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7, color = "black") +
  labs(title = "Perbandingan Akurasi Logistic Regression (5-Fold CV)",
       x = "Metode Imputasi",
       y = "Akurasi") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 15, hjust = 1),
        legend.position = "none")