library(readr)
train_path <- "C:/Users/ASUS/Downloads/train_u6lujuX_CVtuZ9i.csv"
train_df <- read_csv(train_path, show_col_types = FALSE)
# Inspect the data
head(train_df)
## # A tibble: 6 × 13
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LP001002 Male No 0 Graduate No 5849
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## # ℹ 6 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## # Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>,
## # Loan_Status <chr>
test_path <- "C:/Users/ASUS/Downloads/test_Y3wMUE5_7gLdaTN.csv"
test_df <- read_csv(test_path, show_col_types = FALSE)
head(test_df)
## # A tibble: 6 × 12
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LP001015 Male Yes 0 Graduate No 5720
## 2 LP001022 Male Yes 1 Graduate No 3076
## 3 LP001031 Male Yes 2 Graduate No 5000
## 4 LP001035 Male Yes 2 Graduate No 2340
## 5 LP001051 Male No 0 Not Graduate No 3276
## 6 LP001054 Male Yes 0 Not Graduate Yes 2165
## # ℹ 5 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## # Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>
# Explore the training data
cat("Training data\n")
## Training data
print(head(train_df))
## # A tibble: 6 × 13
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LP001002 Male No 0 Graduate No 5849
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## # ℹ 6 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## # Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>,
## # Loan_Status <chr>
print(dim(train_df))
## [1] 614 13
str(train_df)
## spc_tbl_ [614 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Loan_ID : chr [1:614] "LP001002" "LP001003" "LP001005" "LP001006" ...
## $ Gender : chr [1:614] "Male" "Male" "Male" "Male" ...
## $ Married : chr [1:614] "No" "Yes" "Yes" "Yes" ...
## $ Dependents : chr [1:614] "0" "1" "0" "0" ...
## $ Education : chr [1:614] "Graduate" "Graduate" "Graduate" "Not Graduate" ...
## $ Self_Employed : chr [1:614] "No" "No" "Yes" "No" ...
## $ ApplicantIncome : num [1:614] 5849 4583 3000 2583 6000 ...
## $ CoapplicantIncome: num [1:614] 0 1508 0 2358 0 ...
## $ LoanAmount : num [1:614] NA 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : num [1:614] 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : num [1:614] 1 1 1 1 1 1 1 0 1 1 ...
## $ Property_Area : chr [1:614] "Urban" "Rural" "Urban" "Urban" ...
## $ Loan_Status : chr [1:614] "Y" "N" "Y" "Y" ...
## - attr(*, "spec")=
## .. cols(
## .. Loan_ID = col_character(),
## .. Gender = col_character(),
## .. Married = col_character(),
## .. Dependents = col_character(),
## .. Education = col_character(),
## .. Self_Employed = col_character(),
## .. ApplicantIncome = col_double(),
## .. CoapplicantIncome = col_double(),
## .. LoanAmount = col_double(),
## .. Loan_Amount_Term = col_double(),
## .. Credit_History = col_double(),
## .. Property_Area = col_character(),
## .. Loan_Status = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
print(summary(train_df))
## Loan_ID Gender Married Dependents
## Length:614 Length:614 Length:614 Length:614
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Education Self_Employed ApplicantIncome CoapplicantIncome
## Length:614 Length:614 Min. : 150 Min. : 0
## Class :character Class :character 1st Qu.: 2878 1st Qu.: 0
## Mode :character Mode :character Median : 3812 Median : 1188
## Mean : 5403 Mean : 1621
## 3rd Qu.: 5795 3rd Qu.: 2297
## Max. :81000 Max. :41667
##
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## Min. : 9.0 Min. : 12 Min. :0.0000 Length:614
## 1st Qu.:100.0 1st Qu.:360 1st Qu.:1.0000 Class :character
## Median :128.0 Median :360 Median :1.0000 Mode :character
## Mean :146.4 Mean :342 Mean :0.8422
## 3rd Qu.:168.0 3rd Qu.:360 3rd Qu.:1.0000
## Max. :700.0 Max. :480 Max. :1.0000
## NA's :22 NA's :14 NA's :50
## Loan_Status
## Length:614
## Class :character
## Mode :character
##
##
##
##
print(sapply(train_df, class))
## Loan_ID Gender Married Dependents
## "character" "character" "character" "character"
## Education Self_Employed ApplicantIncome CoapplicantIncome
## "character" "character" "numeric" "numeric"
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## "numeric" "numeric" "numeric" "character"
## Loan_Status
## "character"
print(colSums(is.na(train_df)))
## Loan_ID Gender Married Dependents
## 0 13 3 15
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 32 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 22 14 50 0
## Loan_Status
## 0
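As a quick supplementary view (a small added sketch, not part of the original script), the missing-value counts can also be expressed as percentages of the 614 training rows:
# Added sketch: share of missing values per column, in percent
round(100 * colSums(is.na(train_df)) / nrow(train_df), 1)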
cat("Jumlah baris duplikat:", sum(duplicated(train_df)), "\n")
## Jumlah baris duplikat: 0
# Explore the test data
cat("Test data\n")
## Test data
print(head(test_df))
## # A tibble: 6 × 12
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LP001015 Male Yes 0 Graduate No 5720
## 2 LP001022 Male Yes 1 Graduate No 3076
## 3 LP001031 Male Yes 2 Graduate No 5000
## 4 LP001035 Male Yes 2 Graduate No 2340
## 5 LP001051 Male No 0 Not Graduate No 3276
## 6 LP001054 Male Yes 0 Not Graduate Yes 2165
## # ℹ 5 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## # Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>
print(dim(test_df))
## [1] 367 12
str(test_df)
## spc_tbl_ [367 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Loan_ID : chr [1:367] "LP001015" "LP001022" "LP001031" "LP001035" ...
## $ Gender : chr [1:367] "Male" "Male" "Male" "Male" ...
## $ Married : chr [1:367] "Yes" "Yes" "Yes" "Yes" ...
## $ Dependents : chr [1:367] "0" "1" "2" "2" ...
## $ Education : chr [1:367] "Graduate" "Graduate" "Graduate" "Graduate" ...
## $ Self_Employed : chr [1:367] "No" "No" "No" "No" ...
## $ ApplicantIncome : num [1:367] 5720 3076 5000 2340 3276 ...
## $ CoapplicantIncome: num [1:367] 0 1500 1800 2546 0 ...
## $ LoanAmount : num [1:367] 110 126 208 100 78 152 59 147 280 123 ...
## $ Loan_Amount_Term : num [1:367] 360 360 360 360 360 360 360 360 240 360 ...
## $ Credit_History : num [1:367] 1 1 1 NA 1 1 1 0 1 1 ...
## $ Property_Area : chr [1:367] "Urban" "Urban" "Urban" "Urban" ...
## - attr(*, "spec")=
## .. cols(
## .. Loan_ID = col_character(),
## .. Gender = col_character(),
## .. Married = col_character(),
## .. Dependents = col_character(),
## .. Education = col_character(),
## .. Self_Employed = col_character(),
## .. ApplicantIncome = col_double(),
## .. CoapplicantIncome = col_double(),
## .. LoanAmount = col_double(),
## .. Loan_Amount_Term = col_double(),
## .. Credit_History = col_double(),
## .. Property_Area = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
print(summary(test_df))
## Loan_ID Gender Married Dependents
## Length:367 Length:367 Length:367 Length:367
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Education Self_Employed ApplicantIncome CoapplicantIncome
## Length:367 Length:367 Min. : 0 Min. : 0
## Class :character Class :character 1st Qu.: 2864 1st Qu.: 0
## Mode :character Mode :character Median : 3786 Median : 1025
## Mean : 4806 Mean : 1570
## 3rd Qu.: 5060 3rd Qu.: 2430
## Max. :72529 Max. :24000
##
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## Min. : 28.0 Min. : 6.0 Min. :0.0000 Length:367
## 1st Qu.:100.2 1st Qu.:360.0 1st Qu.:1.0000 Class :character
## Median :125.0 Median :360.0 Median :1.0000 Mode :character
## Mean :136.1 Mean :342.5 Mean :0.8254
## 3rd Qu.:158.0 3rd Qu.:360.0 3rd Qu.:1.0000
## Max. :550.0 Max. :480.0 Max. :1.0000
## NA's :5 NA's :6 NA's :29
print(sapply(test_df, class))
## Loan_ID Gender Married Dependents
## "character" "character" "character" "character"
## Education Self_Employed ApplicantIncome CoapplicantIncome
## "character" "character" "numeric" "numeric"
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## "numeric" "numeric" "numeric" "character"
print(colSums(is.na(test_df)))
## Loan_ID Gender Married Dependents
## 0 11 0 10
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 23 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 5 6 29 0
cat("Jumlah baris duplikat:", sum(duplicated(test_df)), "\n")
## Jumlah baris duplikat: 0
PREPROCESSING
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
train_df <- train_df %>% select(-Loan_ID)
test_df <- test_df %>% select(-Loan_ID)
target_col <- "Loan_Status"
num_cols <- names(train_df)[sapply(train_df, is.numeric)]
cat_cols <- setdiff(names(train_df)[sapply(train_df, is.character)], target_col)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Impute numeric columns with the median; the train median is reused for the test set
for(col in num_cols) {
median_val <- median(train_df[[col]], na.rm = TRUE)
train_df[[col]][is.na(train_df[[col]])] <- median_val
test_df[[col]][is.na(test_df[[col]])] <- median_val
}
# Impute categorical columns with the mode (most frequent value), again using the train mode for test
get_mode <- function(v) {
uniqv <- unique(v[!is.na(v)])
uniqv[which.max(tabulate(match(v, uniqv)))]
}
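For illustration only (hypothetical values, not from the dataset), the helper returns the most frequent non-missing value:
get_mode(c("Male", "Male", "Female", NA))  # returns "Male"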
for(col in cat_cols) {
mode_val <- get_mode(train_df[[col]])
train_df[[col]][is.na(train_df[[col]])] <- mode_val
test_df[[col]][is.na(test_df[[col]])] <- mode_val
}
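A quick sanity check, added here as a sketch, confirms that imputation left no missing values in either split:
# Added check: no NA values should remain after imputation
stopifnot(!anyNA(train_df), !anyNA(test_df))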
train_df[[target_col]] <- ifelse(train_df[[target_col]] == "Y", 1, 0)
library(dplyr)
library(fastDummies)
# Stack train (without the target column) on top of test so both are encoded consistently
combined <- bind_rows(
train_df %>% select(-all_of(target_col)),
test_df
)
# One-hot encode the categorical columns, dropping the first level of each (like pandas' drop_first=True)
combined_encoded <- dummy_cols(combined, remove_selected_columns = TRUE, remove_first_dummy = TRUE)
n_train <- nrow(train_df)
X_train <- combined_encoded[1:n_train, ]
X_test <- combined_encoded[(n_train + 1):nrow(combined_encoded), ]
y_train <- train_df[[target_col]]
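Because train and test are encoded together, their dummy columns are guaranteed to line up; a small added check makes this explicit:
# Added check: train and test share exactly the same encoded feature set
stopifnot(identical(names(X_train), names(X_test)))
cat("Number of encoded features:", ncol(X_train), "\n")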
# IQR-based outlier clipping function
clip_outliers <- function(df, numeric_cols) {
for (col in numeric_cols) {
Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
df[[col]][df[[col]] < lower_bound] <- lower_bound
df[[col]][df[[col]] > upper_bound] <- upper_bound
}
return(df)
}
# Identify the numeric columns in combined_encoded
num_cols_encoded <- names(combined_encoded)[sapply(combined_encoded, is.numeric)]
# Apply clipping to the train and test rows separately to avoid data leakage
combined_encoded[1:n_train, ] <- clip_outliers(combined_encoded[1:n_train, ], num_cols_encoded)
combined_encoded[(n_train + 1):nrow(combined_encoded), ] <- clip_outliers(combined_encoded[(n_train + 1):nrow(combined_encoded), ], num_cols_encoded)
library(smotefamily)
# Build the training data frame with its label for SMOTE
train_for_smote <- cbind(X_train, Loan_Status = y_train)
# Oversample the minority class with SMOTE (smotefamily)
set.seed(42)
smote_output <- SMOTE(train_for_smote[ , !(names(train_for_smote) %in% "Loan_Status")],
train_for_smote$Loan_Status,
K = 5,
dup_size = 0) # dup_size = 0 lets SMOTE choose the duplication needed to approximately balance the classes
# smote_output$data contains the original and synthetic samples, with the class label as the last column
X_train_smote <- smote_output$data[, -ncol(smote_output$data)]
y_train_smote <- smote_output$data[, ncol(smote_output$data)]
y_train_smote <- as.factor(y_train_smote)
# Replace X_train and y_train with the SMOTE output
X_train <- X_train_smote
y_train <- y_train_smote
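A brief added check of the class balance before and after oversampling (output not shown here):
# Added check: class balance before vs. after SMOTE
print(table(train_df[[target_col]]))  # original 0/1 labels
print(table(y_train))                 # labels after SMOTE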
# Drop any duplicate rows introduced by oversampling before scaling and splitting
dupes_train <- sum(duplicated(X_train))
dupes_test <- sum(duplicated(X_test))
if (dupes_train > 0) {
  idx_unique <- !duplicated(X_train)
  X_train <- X_train[idx_unique, ]
  y_train <- y_train[idx_unique]
}
# Standardise features; the test set reuses the centring/scaling learned on train
X_train_scaled <- scale(X_train)
X_test_scaled <- scale(X_test, center = attr(X_train_scaled, "scaled:center"), scale = attr(X_train_scaled, "scaled:scale"))
# 80/20 train/validation split
library(caret)
set.seed(42)
train_index <- createDataPartition(y_train, p = 0.8, list = FALSE, times = 1)
X_tr <- X_train_scaled[train_index, ]
X_val <- X_train_scaled[-train_index, ]
y_tr <- y_train[train_index]
y_val <- y_train[-train_index]
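The resulting matrix dimensions can be confirmed with a short added check:
# Added check: dimensions of the training and validation matrices
cat("Training:  ", nrow(X_tr), "x", ncol(X_tr), "\n")
cat("Validation:", nrow(X_val), "x", ncol(X_val), "\n")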
EDA
ggplot(train_df, aes(x = Loan_Status)) +
geom_bar(fill = c("red", "green")) +
labs(title = "Distribusi Loan_Status (Sebelum Preprocessing)",
x = "Loan_Status", y = "Jumlah") +
theme_minimal()
barplot(table(y_train), col = c("red", "green"),
main = "Distribusi Loan_Status (Setelah Preprocessing dan SMOTE)",
xlab = "Loan_Status", ylab = "Jumlah")
library(ggplot2)
num_features <- c("ApplicantIncome", "LoanAmount")
for (feature in num_features) {
p <- ggplot(train_df, aes(y = .data[[feature]])) +
geom_boxplot(fill = "tomato", outlier.color = "red", outlier.shape = 16) +
labs(title = paste("Boxplot", feature, "(Before Preprocessing)"),
y = feature) +
theme_minimal()
print(p)
}
library(ggplot2)
num_features <- c("ApplicantIncome", "LoanAmount")
for (feature in num_features) {
# Find the feature's column index in X_train_scaled
idx <- which(colnames(X_train_scaled) == feature)
if (length(idx) == 0) {
# If the feature name is not found (possibly renamed by encoding), fall back to the first column as an example
idx <- 1
}
df_scaled <- data.frame(
Value = X_train_scaled[, idx]
)
p <- ggplot(df_scaled, aes(y = Value)) +
geom_boxplot(fill = "steelblue", outlier.color = "blue", outlier.shape = 16) +
labs(title = paste("Boxplot", feature, "(Setelah Scaling)"),
y = feature) +
theme_minimal()
print(p)
}
library(corrplot)
## corrplot 0.95 loaded
# Identify the numeric columns in train_df
num_cols <- names(train_df)[sapply(train_df, is.numeric)]
# Compute the correlation matrix on numeric columns only (missing values handled via complete.obs)
cor_mat <- cor(train_df[, num_cols], use = "complete.obs")
# Plot the numeric feature correlations
corrplot(cor_mat, method = "color", addCoef.col = "black",
tl.col = "black", number.cex = 0.7,
title = "Korelasi Fitur Numerik",
mar = c(0, 0, 1, 0))
library(biotools)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## ---
## biotools version 4.3
train_data <- data.frame(X_tr)
train_data$target <- as.factor(y_tr)
boxm_result <- boxM(train_data[, !(colnames(train_data) %in% "target")], train_data$target)
print(boxm_result)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: train_data[, !(colnames(train_data) %in% "target")]
## Chi-Sq (approx.) = 679.35, df = 105, p-value < 2.2e-16
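Since the p-value is far below 0.05, the hypothesis of equal covariance matrices across the two classes is rejected, so LDA's homogeneity assumption does not strictly hold here. Assuming boxM returns a standard htest object, the p-value can also be pulled out directly (added sketch):
# Added sketch: extract the Box's M p-value from the test object
boxm_result$p.value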
library(MASS)
lda_model <- lda(target ~ ., data = train_data)
print("Ringkasan Model LDA:")
## [1] "Ringkasan Model LDA:"
print(lda_model)
## Call:
## lda(target ~ ., data = train_data)
##
## Prior probabilities of groups:
## 0 1
## 0.4767802 0.5232198
##
## Group means:
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## 0 0.0009677648 0.05915311 0.065060326 -0.001337883
## 1 0.0194537121 -0.02653977 -0.005787604 -0.024771518
## Credit_History Gender_Male Married_Yes Dependents_1 Dependents_2
## 0 -0.5060363 -0.007919659 -0.0553775 0.06809195 -0.06772812
## 1 0.4773545 -0.003037477 0.1232656 -0.01315853 0.05906957
## Dependents_3. Education_Not.Graduate Self_Employed_Yes
## 0 0.05265518 0.11233729 -0.03481967
## 1 0.01045856 -0.09879705 0.01802579
## Property_Area_Semiurban Property_Area_Urban
## 0 -0.1691134 0.03053692
## 1 0.2060059 -0.07110221
##
## Coefficients of linear discriminants:
## LD1
## ApplicantIncome 0.058691137
## CoapplicantIncome -0.087767967
## LoanAmount -0.169951340
## Loan_Amount_Term -0.027643431
## Credit_History 1.071594230
## Gender_Male -0.035468131
## Married_Yes 0.279987125
## Dependents_1 -0.126251653
## Dependents_2 0.047274962
## Dependents_3. 0.006751067
## Education_Not.Graduate -0.185783374
## Self_Employed_Yes 0.081156732
## Property_Area_Semiurban 0.414123202
## Property_Area_Urban 0.137209969
val_data <- data.frame(X_val)
val_data$target <- as.factor(y_val)
lda_pred_class <- predict(lda_model, newdata = val_data)$class
lda_pred_prob <- predict(lda_model, newdata = val_data)$posterior[, 2]
val_target_factor <- as.factor(val_data$target)
cm <- confusionMatrix(lda_pred_class, val_target_factor, positive = levels(val_target_factor)[2])
cat(sprintf("Akurasi : %.4f\n", cm$overall["Accuracy"]))
## Akurasi : 0.7750
cat(sprintf("Precision : %.4f\n", cm$byClass["Precision"]))
## Precision : 0.7222
cat(sprintf("Recall : %.4f\n", cm$byClass["Recall"]))
## Recall : 0.9286
cat(sprintf("F1-Score : %.4f\n", cm$byClass["F1"]))
## F1-Score : 0.8125
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
val_target_factor <- as.factor(val_data$target)
roc_obj <- roc(val_target_factor, lda_pred_prob, levels=rev(levels(val_target_factor)))
## Setting direction: controls > cases
auc_val <- auc(roc_obj)
cat(sprintf("ROC AUC : %.4f\n", auc_val))
## ROC AUC : 0.8253
Discriminant coefficients (showing each variable's contribution):
print(lda_model$scaling)
## LD1
## ApplicantIncome 0.058691137
## CoapplicantIncome -0.087767967
## LoanAmount -0.169951340
## Loan_Amount_Term -0.027643431
## Credit_History 1.071594230
## Gender_Male -0.035468131
## Married_Yes 0.279987125
## Dependents_1 -0.126251653
## Dependents_2 0.047274962
## Dependents_3. 0.006751067
## Education_Not.Graduate -0.185783374
## Self_Employed_Yes 0.081156732
## Property_Area_Semiurban 0.414123202
## Property_Area_Urban 0.137209969
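To see which predictors dominate the discriminant function, a small added sketch ranks them by the absolute size of their LD1 coefficient (Credit_History and Property_Area_Semiurban stand out):
# Added sketch: rank variables by |LD1| coefficient
ld1 <- lda_model$scaling[, "LD1"]
sort(abs(ld1), decreasing = TRUE)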
print(lda_model$prior)
## 0 1
## 0.4767802 0.5232198
print(lda_model$means)
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## 0 0.0009677648 0.05915311 0.065060326 -0.001337883
## 1 0.0194537121 -0.02653977 -0.005787604 -0.024771518
## Credit_History Gender_Male Married_Yes Dependents_1 Dependents_2
## 0 -0.5060363 -0.007919659 -0.0553775 0.06809195 -0.06772812
## 1 0.4773545 -0.003037477 0.1232656 -0.01315853 0.05906957
## Dependents_3. Education_Not.Graduate Self_Employed_Yes
## 0 0.05265518 0.11233729 -0.03481967
## 1 0.01045856 -0.09879705 0.01802579
## Property_Area_Semiurban Property_Area_Urban
## 0 -0.1691134 0.03053692
## 1 0.2060059 -0.07110221
cm <- confusionMatrix(lda_pred_class, val_target_factor, positive = levels(val_target_factor)[2])
cm_table <- as.data.frame(cm$table)
colnames(cm_table) <- c("Prediction", "Reference", "Freq")  # cm$table is Prediction x Reference
ggplot(data = cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "blue") +
ggtitle("Confusion Matrix") +
theme_minimal()
metrics_df <- data.frame(
Metric = c("Precision", "Recall", "F1"),
Value = c(cm$byClass["Precision"], cm$byClass["Recall"], cm$byClass["F1"])
)
ggplot(metrics_df, aes(x = Metric, y = Value, fill = Metric)) +
geom_bar(stat = "identity", width = 0.6) +
geom_text(aes(label = sprintf("%.2f", Value)), vjust = -0.3, size = 5) +
ylim(0, 1) +
ggtitle("Classification Report Metrics") +
theme_minimal() +
guides(fill = "none")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve - Linear Discriminant Analysis")
abline(a=0, b=1, lty=2, col="gray")
# Load library
library(caret)
library(pROC)
library(ggplot2)
library(reshape2)
# Training and validation data
train_data <- data.frame(X_tr)
train_data$target <- as.factor(y_tr)
val_data <- data.frame(X_val)
val_data$target <- as.factor(y_val)
# Fit the logistic regression model (parameter estimation)
logreg_model <- glm(target ~ ., data = train_data, family = binomial)
# Display the model summary
summary(logreg_model)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.18765 0.12320 -1.523 0.12772
## ApplicantIncome 0.08528 0.13370 0.638 0.52357
## CoapplicantIncome -0.08334 0.09416 -0.885 0.37608
## LoanAmount -0.20758 0.11965 -1.735 0.08276 .
## Loan_Amount_Term -0.01191 0.09680 -0.123 0.90209
## Credit_History 1.68831 0.19712 8.565 < 2e-16 ***
## Gender_Male -0.06238 0.10330 -0.604 0.54594
## Married_Yes 0.34177 0.10918 3.130 0.00175 **
## Dependents_1 -0.16468 0.09807 -1.679 0.09312 .
## Dependents_2 0.05507 0.10834 0.508 0.61121
## Dependents_3. 0.01771 0.09945 0.178 0.85865
## Education_Not.Graduate -0.21892 0.09887 -2.214 0.02681 *
## Self_Employed_Yes 0.08385 0.09968 0.841 0.40024
## Property_Area_Semiurban 0.52604 0.11421 4.606 4.1e-06 ***
## Property_Area_Urban 0.16160 0.11024 1.466 0.14266
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 894.15 on 645 degrees of freedom
## Residual deviance: 654.78 on 631 degrees of freedom
## AIC: 684.78
##
## Number of Fisher Scoring iterations: 6
# Null model (intercept only)
null_model <- glm(target ~ 1, data = train_data, family = binomial)
# Likelihood ratio test
lr_test <- anova(null_model, logreg_model, test = "Chisq")
print(lr_test)
## Analysis of Deviance Table
##
## Model 1: target ~ 1
## Model 2: target ~ ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + Gender_Male + Married_Yes + Dependents_1 +
## Dependents_2 + Dependents_3. + Education_Not.Graduate + Self_Employed_Yes +
## Property_Area_Semiurban + Property_Area_Urban
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 645 894.15
## 2 631 654.78 14 239.37 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Already available in summary(logreg_model)
# Example of accessing the coefficient p-values
coef_summary <- summary(logreg_model)$coefficients
print(coef_summary)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.18764937 0.12319897 -1.5231408 1.277235e-01
## ApplicantIncome 0.08528074 0.13369867 0.6378578 5.235663e-01
## CoapplicantIncome -0.08334126 0.09415671 -0.8851335 3.760846e-01
## LoanAmount -0.20757815 0.11964912 -1.7348907 8.276016e-02
## Loan_Amount_Term -0.01190812 0.09680105 -0.1230164 9.020941e-01
## Credit_History 1.68830771 0.19712087 8.5648348 1.082309e-17
## Gender_Male -0.06237898 0.10330057 -0.6038590 5.459374e-01
## Married_Yes 0.34177076 0.10918226 3.1302774 1.746413e-03
## Dependents_1 -0.16467818 0.09807280 -1.6791423 9.312432e-02
## Dependents_2 0.05507192 0.10833594 0.5083440 6.112121e-01
## Dependents_3. 0.01771091 0.09944638 0.1780950 8.586483e-01
## Education_Not.Graduate -0.21892003 0.09886793 -2.2142674 2.681039e-02
## Self_Employed_Yes 0.08384939 0.09967876 0.8411962 4.002380e-01
## Property_Area_Semiurban 0.52604265 0.11420666 4.6060591 4.103718e-06
## Property_Area_Urban 0.16159929 0.11023562 1.4659444 1.426634e-01
# Predict positive-class probabilities on the validation set
logreg_pred_prob <- predict(logreg_model, newdata = val_data, type = "response")
# Convert probabilities to classes with a 0.5 cutoff
logreg_pred_class <- ifelse(logreg_pred_prob > 0.5, levels(val_data$target)[2], levels(val_data$target)[1])
logreg_pred_class <- factor(logreg_pred_class, levels = levels(val_data$target))
# Confusion matrix and evaluation metrics
cm <- confusionMatrix(logreg_pred_class, val_data$target, positive = levels(val_data$target)[2])
cat("=== Evaluasi Model Logistic Regression ===\n")
## === Evaluasi Model Logistic Regression ===
cat(sprintf("Akurasi : %.4f (%.2f%%)\n", cm$overall["Accuracy"], cm$overall["Accuracy"]*100))
## Akurasi : 0.7562 (75.62%)
cat(sprintf("Precision : %.4f (%.2f%%)\n", cm$byClass["Precision"], cm$byClass["Precision"]*100))
## Precision : 0.7184 (71.84%)
cat(sprintf("Recall : %.4f (%.2f%%)\n", cm$byClass["Recall"], cm$byClass["Recall"]*100))
## Recall : 0.8810 (88.10%)
cat(sprintf("F1-Score : %.4f (%.2f%%)\n", cm$byClass["F1"], cm$byClass["F1"]*100))
## F1-Score : 0.7914 (79.14%)
# ROC curve and AUC
roc_obj <- roc(val_data$target, logreg_pred_prob, levels=rev(levels(val_data$target)))
## Setting direction: controls > cases
auc_val <- auc(roc_obj)
cat(sprintf("ROC AUC : %.4f (%.2f%%)\n", auc_val, auc_val*100))
## ROC AUC : 0.8241 (82.41%)
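Instead of the fixed 0.5 cutoff used above, the ROC object can also suggest a threshold; a minimal sketch using pROC's coords with Youden's J is shown below (output not shown):
# Added sketch: Youden-optimal probability cutoff from the ROC curve
best_cut <- coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity"),
                   best.method = "youden", transpose = FALSE)
print(best_cut)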
# Extract coefficients and compute odds ratios
coef_estimates <- coef(logreg_model)
odds_ratios <- exp(coef_estimates)
# Compute 95% confidence intervals for the odds ratios
conf_int <- confint(logreg_model)
## Waiting for profiling to be done...
odds_ratios_ci <- exp(conf_int)
# Combine the results into one table
or_table <- data.frame(
Estimate = coef_estimates,
OddsRatio = odds_ratios,
CI_Lower = odds_ratios_ci[,1],
CI_Upper = odds_ratios_ci[,2]
)
print("Tabel Odds Ratio dan Interval Kepercayaan 95%:")
## [1] "Tabel Odds Ratio dan Interval Kepercayaan 95%:"
print(or_table)
## Estimate OddsRatio CI_Lower CI_Upper
## (Intercept) -0.18764937 0.8289053 0.6404098 1.0434943
## ApplicantIncome 0.08528074 1.0890228 0.8432233 1.4148253
## CoapplicantIncome -0.08334126 0.9200371 0.7504612 1.0990703
## LoanAmount -0.20757815 0.8125497 0.6411687 1.0257150
## Loan_Amount_Term -0.01190812 0.9881625 0.8121838 1.1899028
## Credit_History 1.68830771 5.4103171 3.8215434 8.3630174
## Gender_Male -0.06237898 0.9395268 0.7652567 1.1482563
## Married_Yes 0.34177076 1.4074376 1.1377624 1.7467050
## Dependents_1 -0.16467818 0.8481666 0.6998564 1.0288070
## Dependents_2 0.05507192 1.0566166 0.8566963 1.3117083
## Dependents_3. 0.01771091 1.0178687 0.8399075 1.2425463
## Education_Not.Graduate -0.21892003 0.8033860 0.6617442 0.9756884
## Self_Employed_Yes 0.08384939 1.0874651 0.8977427 1.3289250
## Property_Area_Semiurban 0.52604265 1.6922223 1.3557922 2.1228487
## Property_Area_Urban 0.16159929 1.1753892 0.9473710 1.4602172
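The table is easier to read as a forest-style plot; the sketch below (added, not in the original) shows each odds ratio with its 95% interval on a log scale, dropping the intercept:
# Added sketch: forest-style plot of odds ratios with 95% CIs (intercept excluded)
or_plot_df <- or_table[-1, ]
or_plot_df$Variable <- rownames(or_plot_df)
ggplot(or_plot_df, aes(x = reorder(Variable, OddsRatio), y = OddsRatio)) +
  geom_pointrange(aes(ymin = CI_Lower, ymax = CI_Upper)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray") +
  coord_flip() +
  scale_y_log10() +
  labs(title = "Odds Ratios with 95% CIs - Logistic Regression",
       x = NULL, y = "Odds ratio (log scale)") +
  theme_minimal()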
# Confusion Matrix heatmap
cm_table <- as.data.frame(cm$table)
colnames(cm_table) <- c("Prediction", "Reference", "Freq")  # cm$table is Prediction x Reference
ggplot(data = cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "blue") +
ggtitle("Confusion Matrix - Logistic Regression") +
theme_minimal()
# Bar plot of Precision, Recall, and F1
metrics_df <- data.frame(
Metric = c("Precision", "Recall", "F1"),
Value = c(cm$byClass["Precision"], cm$byClass["Recall"], cm$byClass["F1"])
)
ggplot(metrics_df, aes(x = Metric, y = Value, fill = Metric)) +
geom_bar(stat = "identity", width = 0.6) +
geom_text(aes(label = sprintf("%.2f", Value)), vjust = -0.3, size = 5) +
ylim(0, 1) +
ggtitle("Classification Report Metrics (Precision, Recall, F1-Score)") +
theme_minimal() +
guides(fill = "none")
# Plot ROC Curve
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve - Logistic Regression")
abline(a=0, b=1, lty=2, col="gray")