##Missing Data Task #=======Explorasi missing data=======
# Load the loan dataset from CSV (the recorded output below shows a tibble
# with 614 rows and 13 columns).
# NOTE(review): the path is absolute and machine-specific, so this line only
# works on the original author's computer; consider a project-relative path.
data_missing <- read_csv("C:/Users/Muhammad Nisar/Downloads/data_missing.csv")
## Rows: 614 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Loan_ID, Gender, Married, Dependents, Education, Self_Employed, Pro...
## dbl (5): ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, C...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the loaded data (first rows and column types).
print(data_missing)
## # A tibble: 614 × 13
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LP001002 Male No 0 Graduate No 5849
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## 7 LP001013 Male Yes 0 Not Graduate No 2333
## 8 LP001014 Male Yes 3+ Graduate No 3036
## 9 LP001018 Male Yes 2 Graduate No 4006
## 10 LP001020 Male Yes 1 Graduate No 12841
## # ℹ 604 more rows
## # ℹ 6 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## # Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>,
## # Loan_Status <chr>
# Inspect the structure: type and leading values of every column.
str(data_missing)
## spc_tbl_ [614 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Loan_ID : chr [1:614] "LP001002" "LP001003" "LP001005" "LP001006" ...
## $ Gender : chr [1:614] "Male" "Male" "Male" "Male" ...
## $ Married : chr [1:614] "No" "Yes" "Yes" "Yes" ...
## $ Dependents : chr [1:614] "0" "1" "0" "0" ...
## $ Education : chr [1:614] "Graduate" "Graduate" "Graduate" "Not Graduate" ...
## $ Self_Employed : chr [1:614] "No" "No" "Yes" "No" ...
## $ ApplicantIncome : num [1:614] 5849 4583 3000 2583 6000 ...
## $ CoapplicantIncome: num [1:614] 0 1508 0 2358 0 ...
## $ LoanAmount : num [1:614] NA 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : num [1:614] 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : num [1:614] 1 1 1 1 1 1 1 0 1 1 ...
## $ Property_Area : chr [1:614] "Urban" "Rural" "Urban" "Urban" ...
## $ Loan_Status : chr [1:614] "Y" "N" "Y" "Y" ...
## - attr(*, "spec")=
## .. cols(
## .. Loan_ID = col_character(),
## .. Gender = col_character(),
## .. Married = col_character(),
## .. Dependents = col_character(),
## .. Education = col_character(),
## .. Self_Employed = col_character(),
## .. ApplicantIncome = col_double(),
## .. CoapplicantIncome = col_double(),
## .. LoanAmount = col_double(),
## .. Loan_Amount_Term = col_double(),
## .. Credit_History = col_double(),
## .. Property_Area = col_character(),
## .. Loan_Status = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary statistics; for numeric columns this also reports the
# number of NAs (see the "NA's" row in the output below).
summary(data_missing)
## Loan_ID Gender Married Dependents
## Length:614 Length:614 Length:614 Length:614
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Education Self_Employed ApplicantIncome CoapplicantIncome
## Length:614 Length:614 Min. : 150 Min. : 0
## Class :character Class :character 1st Qu.: 2878 1st Qu.: 0
## Mode :character Mode :character Median : 3812 Median : 1188
## Mean : 5403 Mean : 1621
## 3rd Qu.: 5795 3rd Qu.: 2297
## Max. :81000 Max. :41667
##
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## Min. : 9.0 Min. : 12 Min. :0.0000 Length:614
## 1st Qu.:100.0 1st Qu.:360 1st Qu.:1.0000 Class :character
## Median :128.0 Median :360 Median :1.0000 Mode :character
## Mean :146.4 Mean :342 Mean :0.8422
## 3rd Qu.:168.0 3rd Qu.:360 3rd Qu.:1.0000
## Max. :700.0 Max. :480 Max. :1.0000
## NA's :22 NA's :14 NA's :50
## Loan_Status
## Length:614
## Class :character
## Mode :character
##
##
##
##
# Number and percentage of missing values in every column.
# The NA mask is built once and reused. vapply() replaces sapply() so the
# result type is guaranteed (one integer count / one double share per column)
# even for degenerate inputs such as a data frame with zero columns.
na_mask <- lapply(data_missing, is.na)
# Count of missing values per column
missing_count <- vapply(na_mask, sum, integer(1))
# Percentage of missing values per column
missing_percent <- vapply(na_mask, mean, numeric(1)) * 100
# Combine count and (rounded) percentage into one table, one row per column
missing_summary <- data.frame(
  Variable = names(data_missing), Missing_Count = missing_count,
  Missing_Percent = round(missing_percent, 2)
)
print(missing_summary)
## Variable Missing_Count Missing_Percent
## Loan_ID Loan_ID 0 0.00
## Gender Gender 13 2.12
## Married Married 3 0.49
## Dependents Dependents 15 2.44
## Education Education 0 0.00
## Self_Employed Self_Employed 32 5.21
## ApplicantIncome ApplicantIncome 0 0.00
## CoapplicantIncome CoapplicantIncome 0 0.00
## LoanAmount LoanAmount 22 3.58
## Loan_Amount_Term Loan_Amount_Term 14 2.28
## Credit_History Credit_History 50 8.14
## Property_Area Property_Area 0 0.00
## Loan_Status Loan_Status 0 0.00
# Report the dataset dimensions (number of rows, number of columns).
cat("Jumlah Baris dan Kolom:\n")
## Jumlah Baris dan Kolom:
print(dim(data_missing))
## [1] 614 13
#=======Visualisasi NA Variabel Numerik (Before Imputation)=======
# Histogram of a variable, with bars filled by missing-value status.
#
# Args:
#   x:        vector of values to plot.
#   binwidth: bin width handed to geom_histogram() (default 20).
# Returns: a ggplot object.
#
# Rows whose value is NA cannot be binned; in the recorded runs below
# stat_bin() removes them and emits a warning.
distribution <- function(x, binwidth = 20) {
  plot_data <- data.frame(value = x, is_na = is.na(x))
  status_colours <- c("FALSE" = "skyblue", "TRUE" = "red")

  canvas <- ggplot(plot_data, aes(x = value, fill = is_na))
  bars <- geom_histogram(binwidth = binwidth, color = "black")
  fill_scale <- scale_fill_manual(
    values = status_colours,
    labels = c("Valid", "Missing"),
    name = "Status"
  )
  titles <- labs(
    title = "Distribusi Variabel dengan NA",
    x = "Nilai", y = "Frekuensi"
  )

  canvas + bars + fill_scale + titles + theme_minimal()
}
# Gap pattern of missing values: one point per observation, plotted at its
# position in the vector, at height "Valid" or "Missing".
#
# Args:
#   x: vector to inspect for NAs.
# Returns: a ggplot object.
gapsize <- function(x) {
  # seq_along() instead of 1:length(x): for an empty x, 1:0 evaluates to
  # c(1, 0) and data.frame() would fail on the mismatched column lengths.
  df <- data.frame(index = seq_along(x), is_na = is.na(x))
  ggplot(df, aes(x = index, y = as.numeric(is_na))) +
    geom_point(aes(color = is_na), size = 2) +
    scale_color_manual(values = c("FALSE" = "skyblue", "TRUE" = "red"),
                       labels = c("Valid", "Missing"),
                       name = "Status") +
    scale_y_continuous(breaks = c(0, 1), labels = c("Valid", "Missing")) +
    labs(title = "Pola Gap (Missing vs Valid)", x = "Index", y = "") +
    theme_minimal()
}
# a) LoanAmount
# Histogram of the values; the NA rows are dropped by stat_bin() (see warning)
distribution(data_missing$LoanAmount)
## Warning: Removed 22 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Position of the missing values along the row index
gapsize(data_missing$LoanAmount)
# b) Loan_Amount_Term
# Histogram of the values; the NA rows are dropped by stat_bin() (see warning)
distribution(data_missing$Loan_Amount_Term)
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Position of the missing values along the row index
gapsize(data_missing$Loan_Amount_Term)
#=======Imputation Methods=======
#=======1. Ad-hoc methods (such as mean, median, or mode imputation)=======
# Work on copies so the raw data stays untouched.
data_mode <- data_missing
data_median <- data_missing
# Columns to impute: numeric ones get the median, categorical ones the mode.
cat_vars <- c(
  "Gender", "Married", "Dependents",
  "Self_Employed", "Credit_History"
) # categorical (binary columns included)
num_vars <- c("LoanAmount", "Loan_Amount_Term") # numeric
# 1a. Median imputation of the numeric columns
for (num_col in num_vars) {
  is_missing <- is.na(data_median[[num_col]])
  # The median is computed on the observed values only (NAs removed).
  data_median[[num_col]][is_missing] <-
    median(data_missing[[num_col]], na.rm = TRUE)
}
cat("Cek Data After Imputasi\n")
## Cek Data After Imputasi
print(colSums(is.na(data_median)))
## Loan_ID Gender Married Dependents
## 0 13 3 15
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 32 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 0 0 50 0
## Loan_Status
## 0
# 1b. Mode imputation of the categorical columns
for (col in cat_vars) {
  # mfv() returns every most-frequent value, so a tie yields several modes.
  # Keep only the first one: each NA is then filled with a single fixed value
  # instead of the tied modes being recycled across the missing positions.
  mode_val <- mfv(data_missing[[col]], na_rm = TRUE)[1]
  data_mode[[col]][is.na(data_mode[[col]])] <- mode_val
}
cat("Cek Data After Imputasi\n")
## Cek Data After Imputasi
print(colSums(is.na(data_mode)))
## Loan_ID Gender Married Dependents
## 0 0 0 0
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 0 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 22 14 0 0
## Loan_Status
## 0
# Combined ad-hoc imputation: median for the numeric columns and mode for
# the categorical ones, applied to a single dataset.
cat("Kalau Gabungan, mean dan mode atau median dan mode")
## Kalau Gabungan, mean dan mode atau median dan mode
# Start from a fresh copy of the raw data
data_median_mode <- data_missing
# Columns to impute, split into numeric and categorical
num_var <- c("LoanAmount", "Loan_Amount_Term") # numeric
cat_var <- c("Gender", "Married", "Dependents",
"Self_Employed", "Credit_History") # categorical (binary columns included)
# NA counts per column before the median + mode imputation
cat("Cek Data Before Imputasi\n")
## Cek Data Before Imputasi
print(colSums(is.na(data_median_mode)))
## Loan_ID Gender Married Dependents
## 0 13 3 15
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 32 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 22 14 50 0
## Loan_Status
## 0
for (col in num_var) { # median for the numeric columns
  median_val <- median(data_missing[[col]], na.rm = TRUE)
  data_median_mode[[col]][is.na(data_median_mode[[col]])] <- median_val
}
for (col in cat_var) { # mode for the categorical columns
  # mfv() returns all tied modes; take the first so every NA receives one
  # fixed value rather than the tied modes being recycled over the NA slots.
  mode_val <- mfv(data_missing[[col]], na_rm = TRUE)[1]
  data_median_mode[[col]][is.na(data_median_mode[[col]])] <- mode_val
}
# NA counts per column after imputation (all expected to be 0)
cat("Cek Data After Imputasi\n")
## Cek Data After Imputasi
print(colSums(is.na(data_median_mode)))
## Loan_ID Gender Married Dependents
## 0 0 0 0
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 0 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 0 0 0 0
## Loan_Status
## 0
#=======2. Multiple Imputation by Chained Equations (MICE)=======
# Work on a copy of the raw data.
data_mice <- data_missing
# The categorical columns that contain NAs are converted to factors.
na_factor_cols <- c(
  "Gender", "Married", "Dependents", "Self_Employed", "Credit_History"
)
for (factor_col in na_factor_cols) {
  data_mice[[factor_col]] <- as.factor(data_mice[[factor_col]])
}
# Imputation method per column: start from mice's defaults, then override.
meth <- make.method(data_mice)
# Columns without missing values get the empty method.
complete_cols <- c(
  "Loan_ID", "ApplicantIncome", "CoapplicantIncome",
  "Education", "Property_Area", "Loan_Status"
)
meth[complete_cols] <- ""
# Two-level factors: logistic regression; Dependents (more than two levels):
# polytomous regression; numeric columns: predictive mean matching.
meth[c("Gender", "Married", "Self_Employed", "Credit_History")] <- "logreg"
meth["Dependents"] <- "polyreg"
meth[c("LoanAmount", "Loan_Amount_Term")] <- "pmm"
# NOTE(review): the run below reports "Number of logged events: 4".
# Presumably these are the four character columns (Loan_ID, Education,
# Property_Area, Loan_Status) being dropped as predictors -- inspect
# imp$loggedEvents to confirm, and consider converting Education,
# Property_Area and Loan_Status to factors so they inform the imputations.
# Run the multiple imputation (5 imputed datasets)
imp <- mice(data_mice, m = 5, method = meth, seed = 123)
##
## iter imp variable
## 1 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 1 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 1 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 1 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 1 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 2 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 3 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 4 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 1 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 2 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 3 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 4 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## 5 5 Gender Married Dependents Self_Employed LoanAmount Loan_Amount_Term Credit_History
## Warning: Number of logged events: 4
# Extract imputed dataset number 1 from the mice result.
# NOTE(review): only this single imputation is used downstream; the other
# four imputed datasets are not used anywhere in the visible code -- confirm
# this is intended.
completed_data <- complete(imp, 1)
# Visualisation of the numeric variables with NAs (after imputation)
visualisasi <- data_missing
# Overlaid histograms comparing a variable before and after imputation.
#
# Args:
#   original: values before imputation (may contain NA).
#   imputed:  values after imputation.
#   binwidth: bin width handed to geom_histogram() (default 20).
# Returns: a ggplot object.
imputations_hist <- function(original, imputed, binwidth = 20) {
  # Label every value by the vector it came from, using the two actual
  # lengths. Assuming both vectors have length(original) made data.frame()
  # either fail or silently recycle `value` whenever `imputed` had a
  # different length (including NULL), mislabelling the bars.
  df <- data.frame(
    value = c(original, imputed),
    status = rep(c("Original", "Imputed"),
                 times = c(length(original), length(imputed)))
  )
  ggplot(df, aes(x = value, fill = status)) +
    geom_histogram(position = "identity", alpha = 0.5,
                   binwidth = binwidth, color = "black") +
    scale_fill_manual(values = c("Original" = "skyblue", "Imputed" = "red")) +
    labs(title = "Distribusi Sebelum vs Sesudah Imputasi",
         x = "Nilai", y = "Frekuensi") +
    theme_minimal()
}
# Compare each raw column with its MICE-completed counterpart.
# `imp` is the object returned by mice(), not a data frame, so
# `imp$LoanAmount` / `imp$Loan_Amount_Term` are NULL and the earlier calls
# plotted the original data twice (hence the doubled warning counts of 44 and
# 28 rows in the previous run). The imputed values live in `completed_data`.
# The NAs of the original series cannot be binned and are still expected to
# be dropped by stat_bin() with a warning.
print(imputations_hist(visualisasi$LoanAmount, completed_data$LoanAmount))
print(imputations_hist(visualisasi$Loan_Amount_Term,
                       completed_data$Loan_Amount_Term))
#=======Model Building=======
# Prepare every dataset for modelling: remove the Loan_ID column and encode
# Loan_Status as a factor with exactly the two levels "N" and "Y".
prepare_model_data <- function(df) {
  df <- subset(df, select = -Loan_ID)
  df$Loan_Status <- factor(df$Loan_Status, levels = c("N", "Y"))
  df
}
# Complete cases only (every row containing an NA is removed first)
data_drop <- prepare_model_data(na.omit(data_missing))
# Median + mode imputed data
data_median_mode <- prepare_model_data(data_median_mode)
# First MICE-imputed dataset
completed_data <- prepare_model_data(completed_data)
# 1. Logistic regression without imputation (complete cases only), using all
#    remaining columns as predictors of Loan_Status
model_drop <- glm(Loan_Status ~ ., data = data_drop, family = binomial)
summary(model_drop)
##
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = data_drop)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.429e+00 9.312e-01 -2.609 0.00909 **
## GenderMale 3.254e-01 3.309e-01 0.983 0.32548
## MarriedYes 5.739e-01 2.924e-01 1.963 0.04970 *
## Dependents1 -3.756e-01 3.460e-01 -1.085 0.27771
## Dependents2 2.770e-01 3.782e-01 0.733 0.46378
## Dependents3+ 1.884e-01 4.874e-01 0.386 0.69915
## EducationNot Graduate -4.210e-01 3.033e-01 -1.388 0.16510
## Self_EmployedYes -1.492e-01 3.523e-01 -0.423 0.67202
## ApplicantIncome 6.945e-06 2.862e-05 0.243 0.80827
## CoapplicantIncome -5.143e-05 4.307e-05 -1.194 0.23246
## LoanAmount -2.737e-03 1.773e-03 -1.544 0.12270
## Loan_Amount_Term -9.253e-04 2.032e-03 -0.455 0.64885
## Credit_History 3.650e+00 4.331e-01 8.427 < 2e-16 ***
## Property_AreaSemiurban 9.873e-01 3.036e-01 3.253 0.00114 **
## Property_AreaUrban 1.511e-01 3.007e-01 0.503 0.61527
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 593.05 on 479 degrees of freedom
## Residual deviance: 435.72 on 465 degrees of freedom
## AIC: 465.72
##
## Number of Fisher Scoring iterations: 5
# 2. Logistic regression on the ad-hoc imputed data (median + mode)
model_ad <- glm(Loan_Status ~ ., data = data_median_mode, family = binomial)
summary(model_ad)
##
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = data_median_mode)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.450e+00 8.483e-01 -2.888 0.003873 **
## GenderMale -3.164e-02 2.989e-01 -0.106 0.915701
## MarriedYes 5.814e-01 2.527e-01 2.301 0.021408 *
## Dependents1 -4.722e-01 2.947e-01 -1.602 0.109124
## Dependents2 2.899e-01 3.427e-01 0.846 0.397622
## Dependents3+ 2.183e-02 4.259e-01 0.051 0.959119
## EducationNot Graduate -4.063e-01 2.596e-01 -1.565 0.117596
## Self_EmployedYes -2.604e-02 3.169e-01 -0.082 0.934506
## ApplicantIncome 1.023e-05 2.447e-05 0.418 0.675924
## CoapplicantIncome -5.378e-05 3.526e-05 -1.525 0.127229
## LoanAmount -1.761e-03 1.602e-03 -1.099 0.271641
## Loan_Amount_Term -1.267e-03 1.827e-03 -0.693 0.488087
## Credit_History 3.936e+00 4.210e-01 9.349 < 2e-16 ***
## Property_AreaSemiurban 9.075e-01 2.696e-01 3.366 0.000763 ***
## Property_AreaUrban 2.218e-01 2.597e-01 0.854 0.393134
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 762.89 on 613 degrees of freedom
## Residual deviance: 557.45 on 599 degrees of freedom
## AIC: 587.45
##
## Number of Fisher Scoring iterations: 5
# 3. Logistic regression on the MICE-imputed data
# NOTE(review): this fits one completed dataset (imputation 1) only, so the
# standard errors below do not reflect between-imputation variability;
# consider fitting on all imputations and pooling -- confirm intent.
model_mice <- glm(Loan_Status ~ ., data = completed_data, family = binomial)
summary(model_mice)
##
## Call:
## glm(formula = Loan_Status ~ ., family = binomial, data = completed_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.430e+00 7.609e-01 -1.879 0.060265 .
## GenderMale -1.344e-01 2.905e-01 -0.463 0.643702
## MarriedYes 6.498e-01 2.483e-01 2.617 0.008866 **
## Dependents1 -4.652e-01 2.898e-01 -1.605 0.108410
## Dependents2 1.808e-01 3.228e-01 0.560 0.575496
## Dependents3+ -1.179e-01 4.035e-01 -0.292 0.770204
## EducationNot Graduate -4.093e-01 2.506e-01 -1.633 0.102478
## Self_EmployedYes -1.135e-01 2.957e-01 -0.384 0.701096
## ApplicantIncome 8.971e-06 2.272e-05 0.395 0.692966
## CoapplicantIncome -3.984e-05 3.907e-05 -1.020 0.307871
## LoanAmount -1.669e-03 1.518e-03 -1.099 0.271651
## Loan_Amount_Term -1.590e-03 1.766e-03 -0.900 0.368174
## Credit_History1 3.072e+00 3.060e-01 10.041 < 2e-16 ***
## Property_AreaSemiurban 8.559e-01 2.588e-01 3.307 0.000944 ***
## Property_AreaUrban 2.386e-01 2.532e-01 0.943 0.345923
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 762.89 on 613 degrees of freedom
## Residual deviance: 590.42 on 599 degrees of freedom
## AIC: 620.42
##
## Number of Fisher Scoring iterations: 4
# Share of rows whose predicted class equals the observed Loan_Status.
#
# Args:
#   model: fitted model accepted by predict(..., type = "response").
#   data:  data frame with the predictors and a Loan_Status column.
# Returns: a single number, the proportion of matching predictions.
accuracy <- function(model, data){
  prob_yes <- predict(model, data, type = "response")
  # Predicted values above 0.5 are classified as "Y", all others as "N".
  predicted <- c("N", "Y")[(prob_yes > 0.5) + 1L]
  observed <- as.character(data$Loan_Status)
  mean(predicted == observed)
}
# Accuracy of each model, evaluated on the same data it was fitted on
# (in-sample accuracy, not a hold-out estimate).
acc_drop <- accuracy(model_drop, data_drop)
acc_ad <- accuracy(model_ad, data_median_mode)
acc_mice <- accuracy(model_mice, completed_data)
# Print the three accuracies
cat("Accuracy tanpa imputasi (drop NA): ", acc_drop, "\n")
## Accuracy tanpa imputasi (drop NA): 0.8125
cat("Accuracy ad-hoc imputasi (median+modus): ", acc_ad, "\n")
## Accuracy ad-hoc imputasi (median+modus): 0.8127036
cat("Accuracy MICE imputasi: ", acc_mice, "\n")
## Accuracy MICE imputasi: 0.7964169
#====== Model Evaluation =========
# Evaluation uses k-fold cross-validation (k = 5)
# Seed the random number generator before the folds are created.
set.seed(123)
# Cross-validation helper for logistic regression.
#
# Fits glm(Loan_Status ~ ., family = binomial) on the training part of each
# fold and scores it on the held-out part.
#
# Args:
#   data: data frame with a Loan_Status column plus the predictor columns.
#   k:    number of folds requested from createFolds() (default 5).
# Returns: numeric vector with one accuracy per fold.
cv_logreg <- function(data, k = 5){
  folds <- createFolds(data$Loan_Status, k = k, list = TRUE)
  # seq_len() instead of 1:nrow(data), which would yield c(1, 0) for 0 rows.
  all_rows <- seq_len(nrow(data))
  # Preallocate one slot per fold actually returned (instead of growing the
  # vector from c()), and loop over those folds rather than assuming that
  # exactly k of them exist.
  acc <- numeric(length(folds))
  for (i in seq_along(folds)) {
    test_idx <- folds[[i]]
    train <- data[setdiff(all_rows, test_idx), ]
    test <- data[test_idx, ]
    model <- glm(Loan_Status ~ ., data = train, family = binomial)
    # Predicted values above 0.5 are classified as "Y", all others as "N".
    pred <- ifelse(predict(model, test, type = "response") > 0.5, "Y", "N")
    acc[i] <- mean(pred == as.character(test$Loan_Status))
  }
  acc
}
# Run the 5-fold cross-validation on each dataset. The call order is kept
# unchanged so the random number stream is consumed in the same sequence.
cv_drop <- cv_logreg(data_drop, k = 5)
cv_ad <- cv_logreg(data_median_mode, k = 5)
cv_mice <- cv_logreg(completed_data, k = 5)
# Mean and standard deviation of the fold accuracies, one row per method.
fold_accuracies <- list(cv_drop, cv_ad, cv_mice)
cv_results <- data.frame(
  Method = c(
    "Tanpa Imputasi (drop NA)",
    "Ad-hoc Imputasi (Median+Mode)",
    "MICE Imputasi"
  ),
  Mean_Accuracy = vapply(fold_accuracies, mean, numeric(1)),
  SD_Accuracy = vapply(fold_accuracies, sd, numeric(1))
)
print(cv_results)
## Method Mean_Accuracy SD_Accuracy
## 1 Tanpa Imputasi (drop NA) 0.7937853 0.03689531
## 2 Ad-hoc Imputasi (Median+Mode) 0.8111235 0.03152236
## 3 MICE Imputasi 0.7964365 0.02553660
# Box plot of the cross-validation results.
# Long-format table: one row per (method, fold) accuracy.
method_labels <- c(
  "Tanpa Imputasi (drop NA)",
  "Ad-hoc Imputasi (Median+Mode)",
  "MICE Imputasi"
)
cv_all <- data.frame(
  Accuracy = c(cv_drop, cv_ad, cv_mice),
  Method = rep(method_labels, each = 5),
  Fold = rep(1:5, times = 3)
)
# One box per method, with the individual fold accuracies overlaid as
# jittered points; the legend is hidden because the x axis already names
# the methods.
cv_plot <- ggplot(cv_all, aes(x = Method, y = Accuracy, fill = Method)) +
  geom_boxplot(alpha = 0.6, outlier.shape = NA) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7, color = "black") +
  labs(
    title = "Perbandingan Akurasi Logistic Regression (5-Fold CV)",
    x = "Metode Imputasi",
    y = "Akurasi"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 15, hjust = 1),
    legend.position = "none"
  )
cv_plot