Logistic Regression Hyperparameter Tuning

Nama: Ratu Arum Rahma Gati

Dosen Pengampu: Ike Fitriyaningsih, M.Si

Mata Kuliah: Analisis Multivariat

Universitas Negeri Surabaya

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.4.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.4.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1
## ✔ purrr     1.0.4     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.4.3
## Loaded ROSE 0.0-4
library(parsnip)
## Warning: package 'parsnip' was built under R version 4.4.3
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(car)                  # Untuk uji VIF
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
library(ResourceSelection)    # Untuk Hosmer-Lemeshow
## Warning: package 'ResourceSelection' was built under R version 4.4.3
## ResourceSelection 0.3-6   2023-06-27
df <- read.csv("C:\\Users\\dell\\Downloads\\alzheimers_disease_data (1).csv")
str(df)
## 'data.frame':    2149 obs. of  35 variables:
##  $ PatientID                : int  4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 ...
##  $ Age                      : int  73 89 73 74 89 86 68 75 72 87 ...
##  $ Gender                   : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ Ethnicity                : int  0 0 3 0 0 1 3 0 1 0 ...
##  $ EducationLevel           : int  2 0 1 1 0 1 2 1 0 0 ...
##  $ BMI                      : num  22.9 26.8 17.8 33.8 20.7 ...
##  $ Smoking                  : int  0 0 0 1 0 0 1 0 0 1 ...
##  $ AlcoholConsumption       : num  13.3 4.54 19.56 12.21 18.45 ...
##  $ PhysicalActivity         : num  6.33 7.62 7.84 8.43 6.31 ...
##  $ DietQuality              : num  1.347 0.519 1.826 7.436 0.795 ...
##  $ SleepQuality             : num  9.03 7.15 9.67 8.39 5.6 ...
##  $ FamilyHistoryAlzheimers  : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ CardiovascularDisease    : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Diabetes                 : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ Depression               : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ HeadInjury               : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Hypertension             : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ SystolicBP               : int  142 115 99 118 94 168 143 117 117 130 ...
##  $ DiastolicBP              : int  72 64 116 115 117 62 88 63 119 78 ...
##  $ CholesterolTotal         : num  242 231 284 160 238 ...
##  $ CholesterolLDL           : num  56.2 193.4 153.3 65.4 92.9 ...
##  $ CholesterolHDL           : num  33.7 79 69.8 68.5 56.9 ...
##  $ CholesterolTriglycerides : num  162.2 294.6 83.6 277.6 291.2 ...
##  $ MMSE                     : num  21.46 20.61 7.36 13.99 13.52 ...
##  $ FunctionalAssessment     : num  6.52 7.12 5.9 8.97 6.05 ...
##  $ MemoryComplaints         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BehavioralProblems       : int  0 0 0 1 0 0 0 0 1 1 ...
##  $ ADL                      : num  1.7259 2.5924 7.1195 6.4812 0.0147 ...
##  $ Confusion                : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Disorientation           : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ PersonalityChanges       : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ DifficultyCompletingTasks: int  1 0 1 0 1 0 0 0 0 0 ...
##  $ Forgetfulness            : int  0 1 0 0 0 0 1 1 0 0 ...
##  $ Diagnosis                : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ DoctorInCharge           : chr  "XXXConfid" "XXXConfid" "XXXConfid" "XXXConfid" ...
cat("Jumlah duplikasi:", sum(duplicated(df)), "\n")
## Jumlah duplikasi: 0
print(colSums(is.na(df)))
##                 PatientID                       Age                    Gender 
##                         0                         0                         0 
##                 Ethnicity            EducationLevel                       BMI 
##                         0                         0                         0 
##                   Smoking        AlcoholConsumption          PhysicalActivity 
##                         0                         0                         0 
##               DietQuality              SleepQuality   FamilyHistoryAlzheimers 
##                         0                         0                         0 
##     CardiovascularDisease                  Diabetes                Depression 
##                         0                         0                         0 
##                HeadInjury              Hypertension                SystolicBP 
##                         0                         0                         0 
##               DiastolicBP          CholesterolTotal            CholesterolLDL 
##                         0                         0                         0 
##            CholesterolHDL  CholesterolTriglycerides                      MMSE 
##                         0                         0                         0 
##      FunctionalAssessment          MemoryComplaints        BehavioralProblems 
##                         0                         0                         0 
##                       ADL                 Confusion            Disorientation 
##                         0                         0                         0 
##        PersonalityChanges DifficultyCompletingTasks             Forgetfulness 
##                         0                         0                         0 
##                 Diagnosis            DoctorInCharge 
##                         0                         0
df <- df %>% dplyr::select(-PatientID, -DoctorInCharge)
str(df)
## 'data.frame':    2149 obs. of  33 variables:
##  $ Age                      : int  73 89 73 74 89 86 68 75 72 87 ...
##  $ Gender                   : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ Ethnicity                : int  0 0 3 0 0 1 3 0 1 0 ...
##  $ EducationLevel           : int  2 0 1 1 0 1 2 1 0 0 ...
##  $ BMI                      : num  22.9 26.8 17.8 33.8 20.7 ...
##  $ Smoking                  : int  0 0 0 1 0 0 1 0 0 1 ...
##  $ AlcoholConsumption       : num  13.3 4.54 19.56 12.21 18.45 ...
##  $ PhysicalActivity         : num  6.33 7.62 7.84 8.43 6.31 ...
##  $ DietQuality              : num  1.347 0.519 1.826 7.436 0.795 ...
##  $ SleepQuality             : num  9.03 7.15 9.67 8.39 5.6 ...
##  $ FamilyHistoryAlzheimers  : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ CardiovascularDisease    : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Diabetes                 : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ Depression               : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ HeadInjury               : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Hypertension             : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ SystolicBP               : int  142 115 99 118 94 168 143 117 117 130 ...
##  $ DiastolicBP              : int  72 64 116 115 117 62 88 63 119 78 ...
##  $ CholesterolTotal         : num  242 231 284 160 238 ...
##  $ CholesterolLDL           : num  56.2 193.4 153.3 65.4 92.9 ...
##  $ CholesterolHDL           : num  33.7 79 69.8 68.5 56.9 ...
##  $ CholesterolTriglycerides : num  162.2 294.6 83.6 277.6 291.2 ...
##  $ MMSE                     : num  21.46 20.61 7.36 13.99 13.52 ...
##  $ FunctionalAssessment     : num  6.52 7.12 5.9 8.97 6.05 ...
##  $ MemoryComplaints         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BehavioralProblems       : int  0 0 0 1 0 0 0 0 1 1 ...
##  $ ADL                      : num  1.7259 2.5924 7.1195 6.4812 0.0147 ...
##  $ Confusion                : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Disorientation           : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ PersonalityChanges       : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ DifficultyCompletingTasks: int  1 0 1 0 1 0 0 0 0 0 ...
##  $ Forgetfulness            : int  0 1 0 0 0 0 1 1 0 0 ...
##  $ Diagnosis                : int  0 0 0 0 0 0 0 1 0 0 ...

Pada kode di atas data dimuat, lalu dilakukan proses pembersihan data seperti pengecekan nilai kosong (NA) dan transformasi variabel jika diperlukan, misalnya mengubah tipe data variabel menjadi faktor atau numerik sesuai kebutuhan model.

df$Diagnosis <- as.factor(df$Diagnosis)
df$Diagnosis <- factor(df$Diagnosis, levels = c(0,1), labels = c("Tidak Alzheimer", "Alzheimer"))
num_cols <- sapply(df, is.numeric)
num_cols["Diagnosis"] <- FALSE
num_cols["Gender"] <- FALSE
df[num_cols] <- scale(df[num_cols])
ggplot(df, aes(x = Diagnosis)) +
  geom_bar(fill = "lightblue") +
  ggtitle("Distribusi Diagnosis Alzheimer") +
  xlab("Status Diagnosis") + ylab("Jumlah Pasien")

cat("Jumlah tiap kategori:\n")
## Jumlah tiap kategori:
print(table(df$Diagnosis))
## 
## Tidak Alzheimer       Alzheimer 
##            1389             760
label_percentages <- prop.table(table(df$Diagnosis)) * 100
cat("\nPersentase Tiap Label Diagnosis:\n")
## 
## Persentase Tiap Label Diagnosis:
print(round(label_percentages, 2))
## 
## Tidak Alzheimer       Alzheimer 
##           64.63           35.37
valid_vars <- names(df)[sapply(df, function(x) is.numeric(x) && length(unique(x)) > 1)]

anova_pvals <- sapply(valid_vars, function(var) {
  formula <- as.formula(paste0("`", var, "` ~ Diagnosis"))
  result <- tryCatch({
    aov_model <- aov(formula, data = df)
    summary(aov_model)[[1]][["Pr(>F)"]][1]
  }, error = function(e) NA)
  return(result)
})

selected_features <- names(anova_pvals[anova_pvals < 0.05])
cat("Fitur terpilih (p < 0.05):\n")
## Fitur terpilih (p < 0.05):
print(selected_features)
## [1] "EducationLevel"       "SleepQuality"         "CholesterolHDL"      
## [4] "MMSE"                 "FunctionalAssessment" "MemoryComplaints"    
## [7] "BehavioralProblems"   "ADL"
final_data <- df[, c("Diagnosis", selected_features)]

Bagian ini digunakan untuk memahami struktur data dan melihat distribusi masing-masing variabel. Proses ini mencakup visualisasi distribusi variabel numerik dan kategorikal, serta identifikasi kemungkinan adanya outlier atau ketidakseimbangan kelas (class imbalance).

UJI ASUMSI SEBELUM MODELING

1. Multikolinearitas (VIF)

model_vif <- glm(Diagnosis ~ ., data = final_data, family = binomial)
vif_values <- vif(model_vif)
print("VIF Tiap Variabel:")
## [1] "VIF Tiap Variabel:"
print(vif_values)
##       EducationLevel         SleepQuality       CholesterolHDL 
##             1.003401             1.002325             1.005188 
##                 MMSE FunctionalAssessment     MemoryComplaints 
##             1.192906             1.293479             1.250640 
##   BehavioralProblems                  ADL 
##             1.259085             1.302034

2. Outlier dan leverage plot

plot(model_vif, which = 4, caption = "Influence plot")

3. Hosmer-Lemeshow Test

hoslem <- hoslem.test(as.integer(final_data$Diagnosis == "Alzheimer"), fitted(model_vif), g = 10)
print(hoslem)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  as.integer(final_data$Diagnosis == "Alzheimer"), fitted(model_vif)
## X-squared = 62.119, df = 8, p-value = 1.787e-10

4. Plot residual deviance

plot(fitted(model_vif), residuals(model_vif, type = "deviance"),
     xlab = "Fitted values", ylab = "Deviance Residuals",
     main = "Residuals vs Fitted")
abline(h = 0, col = "red")

SPLIT DATA

set.seed(123)
train_idx <- createDataPartition(final_data$Diagnosis, p = 0.7, list = FALSE)
train_data <- final_data[train_idx, ]
test_data  <- final_data[-train_idx, ]

write.csv(train_data, "C:\\Users\\dell\\latihan\\SEMESTER 4\\ANMUL\\PROJECT\\train_data.csv", row.names = FALSE)
write.csv(test_data, "C:\\Users\\dell\\latihan\\SEMESTER 4\\ANMUL\\PROJECT\\test_data.csv", row.names = FALSE)

cat("Distribusi sebelum oversampling:\n")
## Distribusi sebelum oversampling:
print(table(train_data$Diagnosis))
## 
## Tidak Alzheimer       Alzheimer 
##             973             532
set.seed(123)
data_balanced <- ovun.sample(Diagnosis ~ ., data = train_data, method = "over",
                             N = max(table(train_data$Diagnosis)) * 2)$data

cat("Distribusi setelah oversampling:\n")
## Distribusi setelah oversampling:
print(table(data_balanced$Diagnosis))
## 
## Tidak Alzheimer       Alzheimer 
##             973             973
write.csv(data_balanced, "C:\\Users\\dell\\latihan\\SEMESTER 4\\ANMUL\\PROJECT\\train_data_oversampled.csv", row.names = FALSE)

Lalu data dibagi menjadi dua bagian yaitu data latih (training) dan data uji (testing) menggunakan fungsi createDataPartition() dari package caret. Pembagian ini bertujuan agar model dapat dilatih menggunakan sebagian data, lalu dievaluasi pada data yang belum pernah dilihat sebelumnya untuk mengukur performa secara objektif.

MODEL LOGISTIC REGRESSION

train_data <- read.csv("train_data_oversampled.csv")
train_data$Diagnosis <- factor(train_data$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))

model <- logistic_reg(mixture = 1, penalty = 0.1) %>%
  set_engine("glmnet") %>%
  set_mode("classification") %>%
  fit(Diagnosis ~ ., data = train_data)

tidy(model)
## # A tibble: 9 × 3
##   term                 estimate penalty
##   <chr>                   <dbl>   <dbl>
## 1 (Intercept)           -0.136      0.1
## 2 EducationLevel         0          0.1
## 3 SleepQuality           0          0.1
## 4 CholesterolHDL         0          0.1
## 5 MMSE                  -0.0758     0.1
## 6 FunctionalAssessment  -0.457      0.1
## 7 MemoryComplaints       0.172      0.1
## 8 BehavioralProblems     0.0147     0.1
## 9 ADL                   -0.305      0.1
head(train_data, 5)
##         Diagnosis EducationLevel SleepQuality CholesterolHDL       MMSE
## 1 Tidak Alzheimer     -1.4224508   0.05682309      0.8455334  0.6801384
## 2 Tidak Alzheimer     -1.4224508  -0.82437383     -0.1118981 -0.1436783
## 3 Tidak Alzheimer     -0.3169004   0.11957058      0.8477818  1.4817338
## 4 Tidak Alzheimer     -1.4224508  -0.73876003     -0.7082206  1.2847330
## 5 Tidak Alzheimer     -1.4224508   0.28390795      0.6408057  1.5828443
##   FunctionalAssessment MemoryComplaints BehavioralProblems        ADL
## 1            0.7047429       -0.5123573         -0.4311563 -0.8104125
## 2            0.3335878       -0.5123573         -0.4311563 -1.6842869
## 3            0.1486786       -0.5123573         -0.4311563  1.3671308
## 4            0.8006261       -0.5123573          2.3182650 -1.4328981
## 5           -1.3589699       -0.5123573          2.3182650 -0.1452872
table(train_data$Diagnosis)
## 
## Tidak Alzheimer       Alzheimer 
##             973             973
test <- read.csv("test_data.csv")
test$Diagnosis <- factor(test$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))

pred_class <- predict(model, new_data = test, type = "class")
pred_class <- factor(pred_class$.pred_class, levels = levels(test$Diagnosis))

conf_matrix <- confusionMatrix(pred_class, test$Diagnosis)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Tidak Alzheimer Alzheimer
##   Tidak Alzheimer             329        59
##   Alzheimer                    87       169
##                                          
##                Accuracy : 0.7733         
##                  95% CI : (0.739, 0.8051)
##     No Information Rate : 0.646          
##     P-Value [Acc > NIR] : 1.787e-12      
##                                          
##                   Kappa : 0.5177         
##                                          
##  Mcnemar's Test P-Value : 0.02545        
##                                          
##             Sensitivity : 0.7909         
##             Specificity : 0.7412         
##          Pos Pred Value : 0.8479         
##          Neg Pred Value : 0.6602         
##              Prevalence : 0.6460         
##          Detection Rate : 0.5109         
##    Detection Prevalence : 0.6025         
##       Balanced Accuracy : 0.7660         
##                                          
##        'Positive' Class : Tidak Alzheimer
## 

#MODEL LOGISTIC REGRESSION

library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ dials        1.4.0     ✔ tune         1.3.0
## ✔ infer        1.0.8     ✔ workflows    1.2.0
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ recipes      1.3.0     ✔ yardstick    1.3.2
## ✔ rsample      1.3.0
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ psych::%+%()             masks ggplot2::%+%()
## ✖ scales::alpha()          masks psych::alpha(), ggplot2::alpha()
## ✖ scales::discard()        masks purrr::discard()
## ✖ Matrix::expand()         masks tidyr::expand()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ recipes::fixed()         masks stringr::fixed()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ caret::lift()            masks purrr::lift()
## ✖ Matrix::pack()           masks tidyr::pack()
## ✖ yardstick::precision()   masks caret::precision()
## ✖ yardstick::recall()      masks caret::recall()
## ✖ car::recode()            masks dplyr::recode()
## ✖ MASS::select()           masks dplyr::select()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ car::some()              masks purrr::some()
## ✖ yardstick::spec()        masks readr::spec()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step()          masks stats::step()
## ✖ Matrix::unpack()         masks tidyr::unpack()
## ✖ recipes::update()        masks Matrix::update(), stats::update()
train_data$Diagnosis <- factor(train_data$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))
test_data$Diagnosis <- factor(test_data$Diagnosis, levels = c("Tidak Alzheimer", "Alzheimer"))

log_reg <- logistic_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("classification")

rec <- recipe(Diagnosis ~ ., data = train_data)

wf <- workflow() %>%
  add_model(log_reg) %>%
  add_recipe(rec)

grid_vals <- grid_regular(
  mixture(range = c(0, 1)),
  penalty(range = c(-3, 0)),  
  levels = 4
)

set.seed(123)
folds <- vfold_cv(train_data, v = 5)

log_reg_tuned <- tune_grid(
  wf,
  resamples = folds,
  grid = grid_vals,
  metrics = metric_set(roc_auc),
  control = control_grid(save_pred = TRUE)
)

best_model <- select_best(log_reg_tuned, metric = "roc_auc")
final_wf <- finalize_workflow(wf, best_model)
final_model <- fit(final_wf, data = train_data)

pred_class <- predict(final_model, test_data, type = "class")
pred_prob <- predict(final_model, test_data, type = "prob")

results <- bind_cols(test_data, pred_class, pred_prob) %>%
  rename(.pred_class = .pred_class)

roc_data <- roc_curve(results, truth = Diagnosis, .pred_Alzheimer)
autoplot(roc_data) + labs(title = "ROC Curve")

results <- results %>%
  mutate(final_pred = factor(
    if_else(.pred_Alzheimer >= 0.5, "Alzheimer", "Tidak Alzheimer"),
    levels = c("Tidak Alzheimer", "Alzheimer")
  ))

conf_mat_custom <- conf_mat(results, truth = Diagnosis, estimate = final_pred)
autoplot(conf_mat_custom)

metrics_all <- metric_set(accuracy, precision, recall, f_meas, sens, spec)

all_metrics <- metrics_all(results, truth = Diagnosis, estimate = final_pred)
print(all_metrics)
## # A tibble: 6 × 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 accuracy  binary         0.832
## 2 precision binary         0.893
## 3 recall    binary         0.841
## 4 f_meas    binary         0.866
## 5 sens      binary         0.841
## 6 spec      binary         0.816

Model regresi logistik dibangun menggunakan fungsi glm() dengan family = binomial. Variabel dependen biasanya berupa kategori biner (misalnya: sakit vs tidak sakit), dan variabel independen terdiri dari sejumlah prediktor. Model ini bertujuan mempelajari hubungan antara variabel input dengan probabilitas kejadian dari target. Setelah dilakukan hyperparameter ternyata akurasi naik menjadi 83%