library(readr)
train_path <- "C:/Users/ASUS/Downloads/train_u6lujuX_CVtuZ9i.csv"
train_df <- read_csv(train_path, show_col_types = FALSE)
# Inspect the data
head(train_df)
## # A tibble: 6 × 13
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LP001002 Male No 0 Graduate No 5849
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## # ℹ 6 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## # Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>,
## # Loan_Status <chr>
test_path <- "C:/Users/ASUS/Downloads/test_Y3wMUE5_7gLdaTN.csv"
test_df <- read_csv(test_path, show_col_types = FALSE)
head(test_df)
## # A tibble: 6 × 12
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LP001015 Male Yes 0 Graduate No 5720
## 2 LP001022 Male Yes 1 Graduate No 3076
## 3 LP001031 Male Yes 2 Graduate No 5000
## 4 LP001035 Male Yes 2 Graduate No 2340
## 5 LP001051 Male No 0 Not Graduate No 3276
## 6 LP001054 Male Yes 0 Not Graduate Yes 2165
## # ℹ 5 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## # Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>
# Explore the training data
cat("Training data\n")
## Training data
print(head(train_df))
## # A tibble: 6 × 13
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LP001002 Male No 0 Graduate No 5849
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## # ℹ 6 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## # Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>,
## # Loan_Status <chr>
print(dim(train_df))
## [1] 614 13
str(train_df)
## spc_tbl_ [614 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Loan_ID : chr [1:614] "LP001002" "LP001003" "LP001005" "LP001006" ...
## $ Gender : chr [1:614] "Male" "Male" "Male" "Male" ...
## $ Married : chr [1:614] "No" "Yes" "Yes" "Yes" ...
## $ Dependents : chr [1:614] "0" "1" "0" "0" ...
## $ Education : chr [1:614] "Graduate" "Graduate" "Graduate" "Not Graduate" ...
## $ Self_Employed : chr [1:614] "No" "No" "Yes" "No" ...
## $ ApplicantIncome : num [1:614] 5849 4583 3000 2583 6000 ...
## $ CoapplicantIncome: num [1:614] 0 1508 0 2358 0 ...
## $ LoanAmount : num [1:614] NA 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : num [1:614] 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : num [1:614] 1 1 1 1 1 1 1 0 1 1 ...
## $ Property_Area : chr [1:614] "Urban" "Rural" "Urban" "Urban" ...
## $ Loan_Status : chr [1:614] "Y" "N" "Y" "Y" ...
## - attr(*, "spec")=
## .. cols(
## .. Loan_ID = col_character(),
## .. Gender = col_character(),
## .. Married = col_character(),
## .. Dependents = col_character(),
## .. Education = col_character(),
## .. Self_Employed = col_character(),
## .. ApplicantIncome = col_double(),
## .. CoapplicantIncome = col_double(),
## .. LoanAmount = col_double(),
## .. Loan_Amount_Term = col_double(),
## .. Credit_History = col_double(),
## .. Property_Area = col_character(),
## .. Loan_Status = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
print(summary(train_df))
## Loan_ID Gender Married Dependents
## Length:614 Length:614 Length:614 Length:614
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Education Self_Employed ApplicantIncome CoapplicantIncome
## Length:614 Length:614 Min. : 150 Min. : 0
## Class :character Class :character 1st Qu.: 2878 1st Qu.: 0
## Mode :character Mode :character Median : 3812 Median : 1188
## Mean : 5403 Mean : 1621
## 3rd Qu.: 5795 3rd Qu.: 2297
## Max. :81000 Max. :41667
##
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## Min. : 9.0 Min. : 12 Min. :0.0000 Length:614
## 1st Qu.:100.0 1st Qu.:360 1st Qu.:1.0000 Class :character
## Median :128.0 Median :360 Median :1.0000 Mode :character
## Mean :146.4 Mean :342 Mean :0.8422
## 3rd Qu.:168.0 3rd Qu.:360 3rd Qu.:1.0000
## Max. :700.0 Max. :480 Max. :1.0000
## NA's :22 NA's :14 NA's :50
## Loan_Status
## Length:614
## Class :character
## Mode :character
##
##
##
##
print(sapply(train_df, class))
## Loan_ID Gender Married Dependents
## "character" "character" "character" "character"
## Education Self_Employed ApplicantIncome CoapplicantIncome
## "character" "character" "numeric" "numeric"
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## "numeric" "numeric" "numeric" "character"
## Loan_Status
## "character"
print(colSums(is.na(train_df)))
## Loan_ID Gender Married Dependents
## 0 13 3 15
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 32 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 22 14 50 0
## Loan_Status
## 0
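As a quick supplementary view (a small added sketch, not part of the original script), the missing-value counts can also be expressed as percentages of the 614 training rows:
# Added sketch: share of missing values per column, in percent
round(100 * colSums(is.na(train_df)) / nrow(train_df), 1)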
cat("Jumlah baris duplikat:", sum(duplicated(train_df)), "\n")
## Jumlah baris duplikat: 0
# Explore the test data
cat("Test data\n")
## Test data
print(head(test_df))
## # A tibble: 6 × 12
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LP001015 Male Yes 0 Graduate No 5720
## 2 LP001022 Male Yes 1 Graduate No 3076
## 3 LP001031 Male Yes 2 Graduate No 5000
## 4 LP001035 Male Yes 2 Graduate No 2340
## 5 LP001051 Male No 0 Not Graduate No 3276
## 6 LP001054 Male Yes 0 Not Graduate Yes 2165
## # ℹ 5 more variables: CoapplicantIncome <dbl>, LoanAmount <dbl>,
## # Loan_Amount_Term <dbl>, Credit_History <dbl>, Property_Area <chr>
print(dim(test_df))
## [1] 367 12
str(test_df)
## spc_tbl_ [367 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Loan_ID : chr [1:367] "LP001015" "LP001022" "LP001031" "LP001035" ...
## $ Gender : chr [1:367] "Male" "Male" "Male" "Male" ...
## $ Married : chr [1:367] "Yes" "Yes" "Yes" "Yes" ...
## $ Dependents : chr [1:367] "0" "1" "2" "2" ...
## $ Education : chr [1:367] "Graduate" "Graduate" "Graduate" "Graduate" ...
## $ Self_Employed : chr [1:367] "No" "No" "No" "No" ...
## $ ApplicantIncome : num [1:367] 5720 3076 5000 2340 3276 ...
## $ CoapplicantIncome: num [1:367] 0 1500 1800 2546 0 ...
## $ LoanAmount : num [1:367] 110 126 208 100 78 152 59 147 280 123 ...
## $ Loan_Amount_Term : num [1:367] 360 360 360 360 360 360 360 360 240 360 ...
## $ Credit_History : num [1:367] 1 1 1 NA 1 1 1 0 1 1 ...
## $ Property_Area : chr [1:367] "Urban" "Urban" "Urban" "Urban" ...
## - attr(*, "spec")=
## .. cols(
## .. Loan_ID = col_character(),
## .. Gender = col_character(),
## .. Married = col_character(),
## .. Dependents = col_character(),
## .. Education = col_character(),
## .. Self_Employed = col_character(),
## .. ApplicantIncome = col_double(),
## .. CoapplicantIncome = col_double(),
## .. LoanAmount = col_double(),
## .. Loan_Amount_Term = col_double(),
## .. Credit_History = col_double(),
## .. Property_Area = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
print(summary(test_df))
## Loan_ID Gender Married Dependents
## Length:367 Length:367 Length:367 Length:367
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Education Self_Employed ApplicantIncome CoapplicantIncome
## Length:367 Length:367 Min. : 0 Min. : 0
## Class :character Class :character 1st Qu.: 2864 1st Qu.: 0
## Mode :character Mode :character Median : 3786 Median : 1025
## Mean : 4806 Mean : 1570
## 3rd Qu.: 5060 3rd Qu.: 2430
## Max. :72529 Max. :24000
##
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## Min. : 28.0 Min. : 6.0 Min. :0.0000 Length:367
## 1st Qu.:100.2 1st Qu.:360.0 1st Qu.:1.0000 Class :character
## Median :125.0 Median :360.0 Median :1.0000 Mode :character
## Mean :136.1 Mean :342.5 Mean :0.8254
## 3rd Qu.:158.0 3rd Qu.:360.0 3rd Qu.:1.0000
## Max. :550.0 Max. :480.0 Max. :1.0000
## NA's :5 NA's :6 NA's :29
print(sapply(test_df, class))
## Loan_ID Gender Married Dependents
## "character" "character" "character" "character"
## Education Self_Employed ApplicantIncome CoapplicantIncome
## "character" "character" "numeric" "numeric"
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## "numeric" "numeric" "numeric" "character"
print(colSums(is.na(test_df)))
## Loan_ID Gender Married Dependents
## 0 11 0 10
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 23 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 5 6 29 0
cat("Jumlah baris duplikat:", sum(duplicated(test_df)), "\n")
## Jumlah baris duplikat: 0
PREPROCESSING
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
train_df <- train_df %>% select(-Loan_ID)
test_df <- test_df %>% select(-Loan_ID)
target_col <- "Loan_Status"
num_cols <- names(train_df)[sapply(train_df, is.numeric)]
cat_cols <- setdiff(names(train_df)[sapply(train_df, is.character)], target_col)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Impute numeric columns with the median; the train median is reused for the test set
for(col in num_cols) {
median_val <- median(train_df[[col]], na.rm = TRUE)
train_df[[col]][is.na(train_df[[col]])] <- median_val
test_df[[col]][is.na(test_df[[col]])] <- median_val
}
# Impute categorical columns with the mode (most frequent value), again using the train mode for test
get_mode <- function(v) {
uniqv <- unique(v[!is.na(v)])
uniqv[which.max(tabulate(match(v, uniqv)))]
}
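For illustration only (hypothetical values, not from the dataset), the helper returns the most frequent non-missing value:
get_mode(c("Male", "Male", "Female", NA))  # returns "Male"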
for(col in cat_cols) {
mode_val <- get_mode(train_df[[col]])
train_df[[col]][is.na(train_df[[col]])] <- mode_val
test_df[[col]][is.na(test_df[[col]])] <- mode_val
}
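A quick sanity check, added here as a sketch, confirms that imputation left no missing values in either split:
# Added check: no NA values should remain after imputation
stopifnot(!anyNA(train_df), !anyNA(test_df))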
train_df[[target_col]] <- ifelse(train_df[[target_col]] == "Y", 1, 0)
library(dplyr)
library(fastDummies)
# Stack train (without the target column) on top of test so both are encoded consistently
combined <- bind_rows(
train_df %>% select(-all_of(target_col)),
test_df
)
# One-hot encode the categorical columns, dropping the first level of each (like pandas' drop_first=True)
combined_encoded <- dummy_cols(combined, remove_selected_columns = TRUE, remove_first_dummy = TRUE)
n_train <- nrow(train_df)
X_train <- combined_encoded[1:n_train, ]
X_test <- combined_encoded[(n_train + 1):nrow(combined_encoded), ]
y_train <- train_df[[target_col]]
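Because train and test are encoded together, their dummy columns are guaranteed to line up; a small added check makes this explicit:
# Added check: train and test share exactly the same encoded feature set
stopifnot(identical(names(X_train), names(X_test)))
cat("Number of encoded features:", ncol(X_train), "\n")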
# IQR-based outlier clipping function
clip_outliers <- function(df, numeric_cols) {
for (col in numeric_cols) {
Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
df[[col]][df[[col]] < lower_bound] <- lower_bound
df[[col]][df[[col]] > upper_bound] <- upper_bound
}
return(df)
}
# Identify the numeric columns in combined_encoded
num_cols_encoded <- names(combined_encoded)[sapply(combined_encoded, is.numeric)]
# Apply clipping to the train and test rows separately to avoid data leakage
combined_encoded[1:n_train, ] <- clip_outliers(combined_encoded[1:n_train, ], num_cols_encoded)
combined_encoded[(n_train + 1):nrow(combined_encoded), ] <- clip_outliers(combined_encoded[(n_train + 1):nrow(combined_encoded), ], num_cols_encoded)
library(smotefamily)
# Build the training data frame with its label for SMOTE
train_for_smote <- cbind(X_train, Loan_Status = y_train)
# Oversample the minority class with SMOTE (smotefamily)
set.seed(42)
smote_output <- SMOTE(train_for_smote[ , !(names(train_for_smote) %in% "Loan_Status")],
train_for_smote$Loan_Status,
K = 5,
dup_size = 0) # dup_size = 0 lets SMOTE choose the duplication needed to approximately balance the classes
# smote_output$data contains the original and synthetic samples, with the class label as the last column
X_train_smote <- smote_output$data[, -ncol(smote_output$data)]
y_train_smote <- smote_output$data[, ncol(smote_output$data)]
y_train_smote <- as.factor(y_train_smote)
# Replace X_train and y_train with the SMOTE output
X_train <- X_train_smote
y_train <- y_train_smote
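A brief added check of the class balance before and after oversampling (output not shown here):
# Added check: class balance before vs. after SMOTE
print(table(train_df[[target_col]]))  # original 0/1 labels
print(table(y_train))                 # labels after SMOTE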
# Drop any duplicate rows introduced by oversampling before scaling and splitting
dupes_train <- sum(duplicated(X_train))
dupes_test <- sum(duplicated(X_test))
if (dupes_train > 0) {
  idx_unique <- !duplicated(X_train)
  X_train <- X_train[idx_unique, ]
  y_train <- y_train[idx_unique]
}
# Standardise features; the test set reuses the centring/scaling learned on train
X_train_scaled <- scale(X_train)
X_test_scaled <- scale(X_test, center = attr(X_train_scaled, "scaled:center"), scale = attr(X_train_scaled, "scaled:scale"))
# 80/20 train/validation split
library(caret)
set.seed(42)
train_index <- createDataPartition(y_train, p = 0.8, list = FALSE, times = 1)
X_tr <- X_train_scaled[train_index, ]
X_val <- X_train_scaled[-train_index, ]
y_tr <- y_train[train_index]
y_val <- y_train[-train_index]
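The resulting matrix dimensions can be confirmed with a short added check:
# Added check: dimensions of the training and validation matrices
cat("Training:  ", nrow(X_tr), "x", ncol(X_tr), "\n")
cat("Validation:", nrow(X_val), "x", ncol(X_val), "\n")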
EDA
ggplot(train_df, aes(x = Loan_Status)) +
geom_bar(fill = c("red", "green")) +
labs(title = "Distribusi Loan_Status (Sebelum Preprocessing)",
x = "Loan_Status", y = "Jumlah") +
theme_minimal()
barplot(table(y_train), col = c("red", "green"),
main = "Distribusi Loan_Status (Setelah Preprocessing dan SMOTE)",
xlab = "Loan_Status", ylab = "Jumlah")
library(ggplot2)
num_features <- c("ApplicantIncome", "LoanAmount")
for (feature in num_features) {
p <- ggplot(train_df, aes(y = .data[[feature]])) +
geom_boxplot(fill = "tomato", outlier.color = "red", outlier.shape = 16) +
labs(title = paste("Boxplot", feature, "(Before Preprocessing)"),
y = feature) +
theme_minimal()
print(p)
}
library(ggplot2)
num_features <- c("ApplicantIncome", "LoanAmount")
for (feature in num_features) {
# Find the feature's column index in X_train_scaled
idx <- which(colnames(X_train_scaled) == feature)
if (length(idx) == 0) {
# If the feature name is not found (possibly renamed by encoding), fall back to the first column as an example
idx <- 1
}
df_scaled <- data.frame(
Value = X_train_scaled[, idx]
)
p <- ggplot(df_scaled, aes(y = Value)) +
geom_boxplot(fill = "steelblue", outlier.color = "blue", outlier.shape = 16) +
labs(title = paste("Boxplot", feature, "(Setelah Scaling)"),
y = feature) +
theme_minimal()
print(p)
}
library(corrplot)
## corrplot 0.95 loaded
# Identify the numeric columns in train_df
num_cols <- names(train_df)[sapply(train_df, is.numeric)]
# Compute the correlation matrix on numeric columns only (missing values handled via complete.obs)
cor_mat <- cor(train_df[, num_cols], use = "complete.obs")
# Plot the numeric feature correlations
corrplot(cor_mat, method = "color", addCoef.col = "black",
tl.col = "black", number.cex = 0.7,
title = "Korelasi Fitur Numerik",
mar = c(0, 0, 1, 0))
library(biotools)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## ---
## biotools version 4.3
train_data <- data.frame(X_tr)
train_data$target <- as.factor(y_tr)
boxm_result <- boxM(train_data[, !(colnames(train_data) %in% "target")], train_data$target)
print(boxm_result)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: train_data[, !(colnames(train_data) %in% "target")]
## Chi-Sq (approx.) = 679.35, df = 105, p-value < 2.2e-16
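Since the p-value is far below 0.05, the hypothesis of equal covariance matrices across the two classes is rejected, so LDA's homogeneity assumption does not strictly hold here. Assuming boxM returns a standard htest object, the p-value can also be pulled out directly (added sketch):
# Added sketch: extract the Box's M p-value from the test object
boxm_result$p.value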
library(MASS)
lda_model <- lda(target ~ ., data = train_data)
print("Ringkasan Model LDA:")
## [1] "Ringkasan Model LDA:"
print(lda_model)
## Call:
## lda(target ~ ., data = train_data)
##
## Prior probabilities of groups:
## 0 1
## 0.4767802 0.5232198
##
## Group means:
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## 0 0.0009677648 0.05915311 0.065060326 -0.001337883
## 1 0.0194537121 -0.02653977 -0.005787604 -0.024771518
## Credit_History Gender_Male Married_Yes Dependents_1 Dependents_2
## 0 -0.5060363 -0.007919659 -0.0553775 0.06809195 -0.06772812
## 1 0.4773545 -0.003037477 0.1232656 -0.01315853 0.05906957
## Dependents_3. Education_Not.Graduate Self_Employed_Yes
## 0 0.05265518 0.11233729 -0.03481967
## 1 0.01045856 -0.09879705 0.01802579
## Property_Area_Semiurban Property_Area_Urban
## 0 -0.1691134 0.03053692
## 1 0.2060059 -0.07110221
##
## Coefficients of linear discriminants:
## LD1
## ApplicantIncome 0.058691137
## CoapplicantIncome -0.087767967
## LoanAmount -0.169951340
## Loan_Amount_Term -0.027643431
## Credit_History 1.071594230
## Gender_Male -0.035468131
## Married_Yes 0.279987125
## Dependents_1 -0.126251653
## Dependents_2 0.047274962
## Dependents_3. 0.006751067
## Education_Not.Graduate -0.185783374
## Self_Employed_Yes 0.081156732
## Property_Area_Semiurban 0.414123202
## Property_Area_Urban 0.137209969
val_data <- data.frame(X_val)
val_data$target <- as.factor(y_val)
lda_pred_class <- predict(lda_model, newdata = val_data)$class
lda_pred_prob <- predict(lda_model, newdata = val_data)$posterior[, 2]
val_target_factor <- as.factor(val_data$target)
cm <- confusionMatrix(lda_pred_class, val_target_factor, positive = levels(val_target_factor)[2])
cat(sprintf("Akurasi : %.4f\n", cm$overall["Accuracy"]))
## Akurasi : 0.7750
cat(sprintf("Precision : %.4f\n", cm$byClass["Precision"]))
## Precision : 0.7222
cat(sprintf("Recall : %.4f\n", cm$byClass["Recall"]))
## Recall : 0.9286
cat(sprintf("F1-Score : %.4f\n", cm$byClass["F1"]))
## F1-Score : 0.8125
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
val_target_factor <- as.factor(val_data$target)
roc_obj <- roc(val_target_factor, lda_pred_prob, levels=rev(levels(val_target_factor)))
## Setting direction: controls > cases
auc_val <- auc(roc_obj)
cat(sprintf("ROC AUC : %.4f\n", auc_val))
## ROC AUC : 0.8253
Discriminant coefficients (showing each variable's contribution):
print(lda_model$scaling)
## LD1
## ApplicantIncome 0.058691137
## CoapplicantIncome -0.087767967
## LoanAmount -0.169951340
## Loan_Amount_Term -0.027643431
## Credit_History 1.071594230
## Gender_Male -0.035468131
## Married_Yes 0.279987125
## Dependents_1 -0.126251653
## Dependents_2 0.047274962
## Dependents_3. 0.006751067
## Education_Not.Graduate -0.185783374
## Self_Employed_Yes 0.081156732
## Property_Area_Semiurban 0.414123202
## Property_Area_Urban 0.137209969
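To see which predictors dominate the discriminant function, a small added sketch ranks them by the absolute size of their LD1 coefficient (Credit_History and Property_Area_Semiurban stand out):
# Added sketch: rank variables by |LD1| coefficient
ld1 <- lda_model$scaling[, "LD1"]
sort(abs(ld1), decreasing = TRUE)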
print(lda_model$prior)
## 0 1
## 0.4767802 0.5232198
print(lda_model$means)
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## 0 0.0009677648 0.05915311 0.065060326 -0.001337883
## 1 0.0194537121 -0.02653977 -0.005787604 -0.024771518
## Credit_History Gender_Male Married_Yes Dependents_1 Dependents_2
## 0 -0.5060363 -0.007919659 -0.0553775 0.06809195 -0.06772812
## 1 0.4773545 -0.003037477 0.1232656 -0.01315853 0.05906957
## Dependents_3. Education_Not.Graduate Self_Employed_Yes
## 0 0.05265518 0.11233729 -0.03481967
## 1 0.01045856 -0.09879705 0.01802579
## Property_Area_Semiurban Property_Area_Urban
## 0 -0.1691134 0.03053692
## 1 0.2060059 -0.07110221
cm <- confusionMatrix(lda_pred_class, val_target_factor, positive = levels(val_target_factor)[2])
cm_table <- as.data.frame(cm$table)
colnames(cm_table) <- c("Prediction", "Reference", "Freq")  # cm$table is Prediction x Reference
ggplot(data = cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "blue") +
ggtitle("Confusion Matrix") +
theme_minimal()
metrics_df <- data.frame(
Metric = c("Precision", "Recall", "F1"),
Value = c(cm$byClass["Precision"], cm$byClass["Recall"], cm$byClass["F1"])
)
ggplot(metrics_df, aes(x = Metric, y = Value, fill = Metric)) +
geom_bar(stat = "identity", width = 0.6) +
geom_text(aes(label = sprintf("%.2f", Value)), vjust = -0.3, size = 5) +
ylim(0, 1) +
ggtitle("Classification Report Metrics") +
theme_minimal() +
guides(fill = "none")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve - Linear Discriminant Analysis")
abline(a=0, b=1, lty=2, col="gray")
# Load library
library(caret)
library(pROC)
library(ggplot2)
library(reshape2)
# Training and validation data
train_data <- data.frame(X_tr)
train_data$target <- as.factor(y_tr)
val_data <- data.frame(X_val)
val_data$target <- as.factor(y_val)
# Fit the logistic regression model (parameter estimation)
logreg_model <- glm(target ~ ., data = train_data, family = binomial)
# Display the model summary
summary(logreg_model)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.18765 0.12320 -1.523 0.12772
## ApplicantIncome 0.08528 0.13370 0.638 0.52357
## CoapplicantIncome -0.08334 0.09416 -0.885 0.37608
## LoanAmount -0.20758 0.11965 -1.735 0.08276 .
## Loan_Amount_Term -0.01191 0.09680 -0.123 0.90209
## Credit_History 1.68831 0.19712 8.565 < 2e-16 ***
## Gender_Male -0.06238 0.10330 -0.604 0.54594
## Married_Yes 0.34177 0.10918 3.130 0.00175 **
## Dependents_1 -0.16468 0.09807 -1.679 0.09312 .
## Dependents_2 0.05507 0.10834 0.508 0.61121
## Dependents_3. 0.01771 0.09945 0.178 0.85865
## Education_Not.Graduate -0.21892 0.09887 -2.214 0.02681 *
## Self_Employed_Yes 0.08385 0.09968 0.841 0.40024
## Property_Area_Semiurban 0.52604 0.11421 4.606 4.1e-06 ***
## Property_Area_Urban 0.16160 0.11024 1.466 0.14266
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 894.15 on 645 degrees of freedom
## Residual deviance: 654.78 on 631 degrees of freedom
## AIC: 684.78
##
## Number of Fisher Scoring iterations: 6
# Null model (intercept only)
null_model <- glm(target ~ 1, data = train_data, family = binomial)
# Likelihood ratio test
lr_test <- anova(null_model, logreg_model, test = "Chisq")
print(lr_test)
## Analysis of Deviance Table
##
## Model 1: target ~ 1
## Model 2: target ~ ApplicantIncome + CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + Gender_Male + Married_Yes + Dependents_1 +
## Dependents_2 + Dependents_3. + Education_Not.Graduate + Self_Employed_Yes +
## Property_Area_Semiurban + Property_Area_Urban
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 645 894.15
## 2 631 654.78 14 239.37 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Already available in summary(logreg_model)
# Example of accessing the coefficient p-values
coef_summary <- summary(logreg_model)$coefficients
print(coef_summary)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.18764937 0.12319897 -1.5231408 1.277235e-01
## ApplicantIncome 0.08528074 0.13369867 0.6378578 5.235663e-01
## CoapplicantIncome -0.08334126 0.09415671 -0.8851335 3.760846e-01
## LoanAmount -0.20757815 0.11964912 -1.7348907 8.276016e-02
## Loan_Amount_Term -0.01190812 0.09680105 -0.1230164 9.020941e-01
## Credit_History 1.68830771 0.19712087 8.5648348 1.082309e-17
## Gender_Male -0.06237898 0.10330057 -0.6038590 5.459374e-01
## Married_Yes 0.34177076 0.10918226 3.1302774 1.746413e-03
## Dependents_1 -0.16467818 0.09807280 -1.6791423 9.312432e-02
## Dependents_2 0.05507192 0.10833594 0.5083440 6.112121e-01
## Dependents_3. 0.01771091 0.09944638 0.1780950 8.586483e-01
## Education_Not.Graduate -0.21892003 0.09886793 -2.2142674 2.681039e-02
## Self_Employed_Yes 0.08384939 0.09967876 0.8411962 4.002380e-01
## Property_Area_Semiurban 0.52604265 0.11420666 4.6060591 4.103718e-06
## Property_Area_Urban 0.16159929 0.11023562 1.4659444 1.426634e-01
# Predict positive-class probabilities on the validation set
logreg_pred_prob <- predict(logreg_model, newdata = val_data, type = "response")
# Convert probabilities to classes with a 0.5 cutoff
logreg_pred_class <- ifelse(logreg_pred_prob > 0.5, levels(val_data$target)[2], levels(val_data$target)[1])
logreg_pred_class <- factor(logreg_pred_class, levels = levels(val_data$target))
# Confusion matrix and evaluation metrics
cm <- confusionMatrix(logreg_pred_class, val_data$target, positive = levels(val_data$target)[2])
cat("=== Evaluasi Model Logistic Regression ===\n")
## === Evaluasi Model Logistic Regression ===
cat(sprintf("Akurasi : %.4f (%.2f%%)\n", cm$overall["Accuracy"], cm$overall["Accuracy"]*100))
## Akurasi : 0.7562 (75.62%)
cat(sprintf("Precision : %.4f (%.2f%%)\n", cm$byClass["Precision"], cm$byClass["Precision"]*100))
## Precision : 0.7184 (71.84%)
cat(sprintf("Recall : %.4f (%.2f%%)\n", cm$byClass["Recall"], cm$byClass["Recall"]*100))
## Recall : 0.8810 (88.10%)
cat(sprintf("F1-Score : %.4f (%.2f%%)\n", cm$byClass["F1"], cm$byClass["F1"]*100))
## F1-Score : 0.7914 (79.14%)
# ROC curve and AUC
roc_obj <- roc(val_data$target, logreg_pred_prob, levels=rev(levels(val_data$target)))
## Setting direction: controls > cases
auc_val <- auc(roc_obj)
cat(sprintf("ROC AUC : %.4f (%.2f%%)\n", auc_val, auc_val*100))
## ROC AUC : 0.8241 (82.41%)
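Instead of the fixed 0.5 cutoff used above, the ROC object can also suggest a threshold; a minimal sketch using pROC's coords with Youden's J is shown below (output not shown):
# Added sketch: Youden-optimal probability cutoff from the ROC curve
best_cut <- coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity"),
                   best.method = "youden", transpose = FALSE)
print(best_cut)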
# Extract coefficients and compute odds ratios
coef_estimates <- coef(logreg_model)
odds_ratios <- exp(coef_estimates)
# Compute 95% confidence intervals for the odds ratios
conf_int <- confint(logreg_model)
## Waiting for profiling to be done...
odds_ratios_ci <- exp(conf_int)
# Combine the results into one table
or_table <- data.frame(
Estimate = coef_estimates,
OddsRatio = odds_ratios,
CI_Lower = odds_ratios_ci[,1],
CI_Upper = odds_ratios_ci[,2]
)
print("Tabel Odds Ratio dan Interval Kepercayaan 95%:")
## [1] "Tabel Odds Ratio dan Interval Kepercayaan 95%:"
print(or_table)
## Estimate OddsRatio CI_Lower CI_Upper
## (Intercept) -0.18764937 0.8289053 0.6404098 1.0434943
## ApplicantIncome 0.08528074 1.0890228 0.8432233 1.4148253
## CoapplicantIncome -0.08334126 0.9200371 0.7504612 1.0990703
## LoanAmount -0.20757815 0.8125497 0.6411687 1.0257150
## Loan_Amount_Term -0.01190812 0.9881625 0.8121838 1.1899028
## Credit_History 1.68830771 5.4103171 3.8215434 8.3630174
## Gender_Male -0.06237898 0.9395268 0.7652567 1.1482563
## Married_Yes 0.34177076 1.4074376 1.1377624 1.7467050
## Dependents_1 -0.16467818 0.8481666 0.6998564 1.0288070
## Dependents_2 0.05507192 1.0566166 0.8566963 1.3117083
## Dependents_3. 0.01771091 1.0178687 0.8399075 1.2425463
## Education_Not.Graduate -0.21892003 0.8033860 0.6617442 0.9756884
## Self_Employed_Yes 0.08384939 1.0874651 0.8977427 1.3289250
## Property_Area_Semiurban 0.52604265 1.6922223 1.3557922 2.1228487
## Property_Area_Urban 0.16159929 1.1753892 0.9473710 1.4602172
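The table is easier to read as a forest-style plot; the sketch below (added, not in the original) shows each odds ratio with its 95% interval on a log scale, dropping the intercept:
# Added sketch: forest-style plot of odds ratios with 95% CIs (intercept excluded)
or_plot_df <- or_table[-1, ]
or_plot_df$Variable <- rownames(or_plot_df)
ggplot(or_plot_df, aes(x = reorder(Variable, OddsRatio), y = OddsRatio)) +
  geom_pointrange(aes(ymin = CI_Lower, ymax = CI_Upper)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray") +
  coord_flip() +
  scale_y_log10() +
  labs(title = "Odds Ratios with 95% CIs - Logistic Regression",
       x = NULL, y = "Odds ratio (log scale)") +
  theme_minimal()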
# Confusion Matrix heatmap
cm_table <- as.data.frame(cm$table)
colnames(cm_table) <- c("Prediction", "Reference", "Freq")  # cm$table is Prediction x Reference
ggplot(data = cm_table, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "blue") +
ggtitle("Confusion Matrix - Logistic Regression") +
theme_minimal()
# Bar plot of Precision, Recall, and F1
metrics_df <- data.frame(
Metric = c("Precision", "Recall", "F1"),
Value = c(cm$byClass["Precision"], cm$byClass["Recall"], cm$byClass["F1"])
)
ggplot(metrics_df, aes(x = Metric, y = Value, fill = Metric)) +
geom_bar(stat = "identity", width = 0.6) +
geom_text(aes(label = sprintf("%.2f", Value)), vjust = -0.3, size = 5) +
ylim(0, 1) +
ggtitle("Classification Report Metrics (Precision, Recall, F1-Score)") +
theme_minimal() +
guides(fill = "none")
# Plot ROC Curve
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve - Logistic Regression")
abline(a=0, b=1, lty=2, col="gray")