Import & Cek Dataset

Library

library(mlbench)
library(readxl)
library(dplyr)
library(ggplot2)
library(caret)
library(MASS)
library(class)
library(rpart)
library(rpart.plot)
library(e1071)
library(pROC)
library(ROCR)
library(corrplot)
library(reshape2)
library(ggcorrplot)

Import Dataset

data_pdm <- read_excel("D:/Files/Anin's Docs/COLLEGE!/UNS/Semester 5/PDM/Tugas/2/gallstone-1/dataset-uci/dataset-uci.xlsx")

set.seed(123)  # memastikan replikasi hasil

Struktur Data

str(data_pdm)
## tibble [319 × 39] (S3: tbl_df/tbl/data.frame)
##  $ Gallstone Status                              : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Age                                           : num [1:319] 50 47 61 41 42 96 37 41 38 38 ...
##  $ Gender                                        : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Comorbidity                                   : num [1:319] 0 1 0 0 0 0 0 0 0 0 ...
##  $ Coronary Artery Disease (CAD)                 : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hypothyroidism                                : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyperlipidemia                                : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Diabetes Mellitus (DM)                        : num [1:319] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Height                                        : num [1:319] 185 176 171 168 178 155 185 176 186 171 ...
##  $ Weight                                        : num [1:319] 92.8 94.5 91.1 67.7 89.6 49 67.1 114 93.7 68.6 ...
##  $ Body Mass Index (BMI)                         : num [1:319] 27.1 30.5 31.2 24 28.3 20.4 19.6 36.8 27.1 23.5 ...
##  $ Total Body Water (TBW)                        : num [1:319] 52.9 43.1 47.2 41.4 51.4 34 41.8 57.6 51.9 39.5 ...
##  $ Extracellular Water (ECW)                     : num [1:319] 21.2 19.5 20.1 17 20 15.7 17.1 23.5 20.9 16.6 ...
##  $ Intracellular Water (ICW)                     : num [1:319] 31.7 23.6 27.1 24.4 31.4 18.3 24.7 34.1 31 22.9 ...
##  $ Extracellular Fluid/Total Body Water (ECF/TBW): num [1:319] 40 45 43 41 39 46 41 41 40 42 ...
##  $ Total Body Fat Ratio (TBFR) (%)               : num [1:319] 19.2 32.8 27.3 15.8 20 6.3 11.3 31.8 21.5 19.2 ...
##  $ Lean Mass (LM) (%)                            : num [1:319] 80.8 67.2 72.7 84.2 80 ...
##  $ Body Protein Content (Protein) (%)            : num [1:319] 18.9 16.7 16.4 16.9 16.8 ...
##  $ Visceral Fat Rating (VFR)                     : num [1:319] 9 15 15 6 8 12 3 16 8 6 ...
##  $ Bone Mass (BM)                                : num [1:319] 3.7 3.2 3.3 2.9 3.5 2.4 3 3.8 3.6 2.8 ...
##  $ Muscle Mass (MM)                              : num [1:319] 71.4 60.3 62.9 54.1 68.2 43.5 56.5 73.9 70 52.6 ...
##  $ Obesity (%)                                   : num [1:319] 23.4 38.8 41.7 9 28.6 7.4 10.9 67.4 23.1 6.7 ...
##  $ Total Fat Content (TFC)                       : num [1:319] 17.8 31 24.9 10.7 17.9 3.1 7.6 36.3 20.1 13.2 ...
##  $ Visceral Fat Area (VFA)                       : num [1:319] 10.6 18.4 16.2 6.5 10.4 0.9 4.6 23.7 12.7 8.2 ...
##  $ Visceral Muscle Area (VMA) (Kg)               : num [1:319] 39.7 32.7 34 29.2 37.4 25.9 31.4 37.5 37.8 28.8 ...
##  $ Hepatic Fat Accumulation (HFA)                : num [1:319] 0 0 0 1 2 0 0 3 2 0 ...
##  $ Glucose                                       : num [1:319] 102 94 103 69 109 74 94 125 93 93 ...
##  $ Total Cholesterol (TC)                        : num [1:319] 250 172 179 173 205 133 166 246 290 239 ...
##  $ Low Density Lipoprotein (LDL)                 : num [1:319] 175 108 124 73 154 ...
##  $ High Density Lipoprotein (HDL)                : num [1:319] 40 43 43 59 30 30 40 34 33 43 ...
##  $ Triglyceride                                  : num [1:319] 134 103 69 53 326 65 60 464 679 129 ...
##  $ Aspartat Aminotransferaz (AST)                : num [1:319] 20 14 18 20 27 13 15 26 68 19 ...
##  $ Alanin Aminotransferaz (ALT)                  : num [1:319] 22 13 14 12 54 13 14 28 102 34 ...
##  $ Alkaline Phosphatase (ALP)                    : num [1:319] 87 46 66 34 71 60 72 69 70 75 ...
##  $ Creatinine                                    : num [1:319] 0.82 0.87 1.25 1.02 0.82 1.46 0.77 1.3 0.91 0.91 ...
##  $ Glomerular Filtration Rate (GFR)              : num [1:319] NA 107.1 65.5 94.1 112.5 ...
##  $ C-Reactive Protein (CRP)                      : num [1:319] 0 0 0 0 0 0 0 0.11 1.57 0 ...
##  $ Hemoglobin (HGB)                              : num [1:319] 16 14.4 16.2 15.4 16.8 11 13.8 16.5 16.5 16.6 ...
##  $ Vitamin D                                     : num [1:319] 33 25 30.2 35.4 40.6 45.8 20 24.5 22.7 15.6 ...
summary(data_pdm)
##  Gallstone Status      Age            Gender        Comorbidity    
##  Min.   :0.0000   Min.   :20.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:38.50   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :49.00   Median :0.0000   Median :0.0000  
##  Mean   :0.4953   Mean   :48.07   Mean   :0.4922   Mean   :0.3354  
##  3rd Qu.:1.0000   3rd Qu.:56.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :96.00   Max.   :1.0000   Max.   :3.0000  
##                                                                    
##  Coronary Artery Disease (CAD) Hypothyroidism    Hyperlipidemia   
##  Min.   :0.00000               Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000               1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000               Median :0.00000   Median :0.00000  
##  Mean   :0.03762               Mean   :0.02821   Mean   :0.02508  
##  3rd Qu.:0.00000               3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000               Max.   :1.00000   Max.   :1.00000  
##                                                                   
##  Diabetes Mellitus (DM)     Height          Weight       Body Mass Index (BMI)
##  Min.   :0.0000         Min.   :145.0   Min.   : 42.90   Min.   :17.40        
##  1st Qu.:0.0000         1st Qu.:159.5   1st Qu.: 69.60   1st Qu.:25.25        
##  Median :0.0000         Median :168.0   Median : 78.80   Median :28.30        
##  Mean   :0.1348         Mean   :167.2   Mean   : 80.56   Mean   :28.88        
##  3rd Qu.:0.0000         3rd Qu.:175.0   3rd Qu.: 91.25   3rd Qu.:31.85        
##  Max.   :1.0000         Max.   :191.0   Max.   :143.50   Max.   :49.70        
##                                                                               
##  Total Body Water (TBW) Extracellular Water (ECW) Intracellular Water (ICW)
##  Min.   :13.00          Min.   : 9.00             Min.   :13.80            
##  1st Qu.:34.20          1st Qu.:14.80             1st Qu.:19.30            
##  Median :39.80          Median :17.10             Median :23.00            
##  Mean   :40.59          Mean   :17.07             Mean   :23.63            
##  3rd Qu.:47.00          3rd Qu.:19.40             3rd Qu.:27.55            
##  Max.   :66.20          Max.   :27.80             Max.   :57.10            
##                                                                            
##  Extracellular Fluid/Total Body Water (ECF/TBW) Total Body Fat Ratio (TBFR) (%)
##  Min.   :29.23                                  Min.   : 6.30                  
##  1st Qu.:40.08                                  1st Qu.:22.02                  
##  Median :42.00                                  Median :27.82                  
##  Mean   :42.21                                  Mean   :28.27                  
##  3rd Qu.:44.00                                  3rd Qu.:34.81                  
##  Max.   :52.00                                  Max.   :50.92                  
##                                                                                
##  Lean Mass (LM) (%) Body Protein Content (Protein) (%)
##  Min.   :48.99      Min.   : 5.56                     
##  1st Qu.:65.17      1st Qu.:14.46                     
##  Median :72.11      Median :15.87                     
##  Mean   :71.64      Mean   :15.94                     
##  3rd Qu.:77.85      3rd Qu.:17.43                     
##  Max.   :93.67      Max.   :24.81                     
##                                                       
##  Visceral Fat Rating (VFR) Bone Mass (BM)  Muscle Mass (MM)  Obesity (%)     
##  Min.   : 1.000            Min.   :1.400   Min.   : 4.70    Min.   :   0.40  
##  1st Qu.: 6.000            1st Qu.:2.400   1st Qu.:45.80    1st Qu.:  13.90  
##  Median : 9.000            Median :2.800   Median :53.90    Median :  25.60  
##  Mean   : 9.078            Mean   :2.803   Mean   :54.27    Mean   :  35.85  
##  3rd Qu.:12.000            3rd Qu.:3.200   3rd Qu.:62.60    3rd Qu.:  41.75  
##  Max.   :31.000            Max.   :4.000   Max.   :78.80    Max.   :1954.00  
##                                                                              
##  Total Fat Content (TFC) Visceral Fat Area (VFA)
##  Min.   : 3.10           Min.   : 0.90          
##  1st Qu.:17.00           1st Qu.: 8.57          
##  Median :22.60           Median :11.59          
##  Mean   :23.49           Mean   :12.17          
##  3rd Qu.:28.55           3rd Qu.:15.10          
##  Max.   :62.50           Max.   :41.00          
##                                                 
##  Visceral Muscle Area (VMA) (Kg) Hepatic Fat Accumulation (HFA)    Glucose     
##  Min.   :18.90                   Min.   :0.00                   Min.   : 69.0  
##  1st Qu.:27.25                   1st Qu.:0.00                   1st Qu.: 92.0  
##  Median :30.41                   Median :1.00                   Median : 98.0  
##  Mean   :30.40                   Mean   :1.15                   Mean   :108.7  
##  3rd Qu.:33.80                   3rd Qu.:2.00                   3rd Qu.:109.0  
##  Max.   :41.10                   Max.   :4.00                   Max.   :575.0  
##                                                                                
##  Total Cholesterol (TC) Low Density Lipoprotein (LDL)
##  Min.   : 60.0          Min.   : 11.0                
##  1st Qu.:172.0          1st Qu.:100.5                
##  Median :198.0          Median :122.0                
##  Mean   :203.5          Mean   :126.7                
##  3rd Qu.:233.0          3rd Qu.:151.0                
##  Max.   :360.0          Max.   :293.0                
##                                                      
##  High Density Lipoprotein (HDL)  Triglyceride    Aspartat Aminotransferaz (AST)
##  Min.   : 25.00                 Min.   :  1.39   Min.   :  8.00                
##  1st Qu.: 40.00                 1st Qu.: 83.00   1st Qu.: 15.00                
##  Median : 46.50                 Median :119.00   Median : 18.00                
##  Mean   : 49.48                 Mean   :144.50   Mean   : 21.68                
##  3rd Qu.: 56.00                 3rd Qu.:172.00   3rd Qu.: 23.00                
##  Max.   :273.00                 Max.   :838.00   Max.   :195.00                
##                                                                                
##  Alanin Aminotransferaz (ALT) Alkaline Phosphatase (ALP)   Creatinine    
##  Min.   :  3.00               Min.   :  7.00             Min.   :0.4600  
##  1st Qu.: 14.25               1st Qu.: 58.00             1st Qu.:0.6500  
##  Median : 19.00               Median : 71.00             Median :0.7900  
##  Mean   : 26.86               Mean   : 73.11             Mean   :0.8006  
##  3rd Qu.: 30.00               3rd Qu.: 86.00             3rd Qu.:0.9200  
##  Max.   :372.00               Max.   :197.00             Max.   :1.4600  
##                                                                          
##  Glomerular Filtration Rate (GFR) C-Reactive Protein (CRP) Hemoglobin (HGB)
##  Min.   : 10.60                   Min.   : 0.000           Min.   : 8.50   
##  1st Qu.: 94.17                   1st Qu.: 0.000           1st Qu.:13.30   
##  Median :104.00                   Median : 0.215           Median :14.40   
##  Mean   :100.78                   Mean   : 1.854           Mean   :14.42   
##  3rd Qu.:110.63                   3rd Qu.: 1.615           3rd Qu.:15.70   
##  Max.   :132.00                   Max.   :43.400           Max.   :18.80   
##  NA's   :1                                                                 
##    Vitamin D    
##  Min.   : 3.50  
##  1st Qu.:13.25  
##  Median :22.00  
##  Mean   :21.40  
##  3rd Qu.:28.06  
##  Max.   :53.10  
## 
head(data_pdm)
## # A tibble: 6 × 39
##   `Gallstone Status`   Age Gender Comorbidity `Coronary Artery Disease (CAD)`
##                <dbl> <dbl>  <dbl>       <dbl>                           <dbl>
## 1                  0    50      0           0                               0
## 2                  0    47      0           1                               0
## 3                  0    61      0           0                               0
## 4                  0    41      0           0                               0
## 5                  0    42      0           0                               0
## 6                  0    96      0           0                               0
## # ℹ 34 more variables: Hypothyroidism <dbl>, Hyperlipidemia <dbl>,
## #   `Diabetes Mellitus (DM)` <dbl>, Height <dbl>, Weight <dbl>,
## #   `Body Mass Index (BMI)` <dbl>, `Total Body Water (TBW)` <dbl>,
## #   `Extracellular Water (ECW)` <dbl>, `Intracellular Water (ICW)` <dbl>,
## #   `Extracellular Fluid/Total Body Water (ECF/TBW)` <dbl>,
## #   `Total Body Fat Ratio (TBFR) (%)` <dbl>, `Lean Mass (LM) (%)` <dbl>,
## #   `Body Protein Content (Protein) (%)` <dbl>, …

Exploratory Data Analysis

Cek Nilai Hilang
colSums(is.na(data_pdm))
##                               Gallstone Status 
##                                              0 
##                                            Age 
##                                              0 
##                                         Gender 
##                                              0 
##                                    Comorbidity 
##                                              0 
##                  Coronary Artery Disease (CAD) 
##                                              0 
##                                 Hypothyroidism 
##                                              0 
##                                 Hyperlipidemia 
##                                              0 
##                         Diabetes Mellitus (DM) 
##                                              0 
##                                         Height 
##                                              0 
##                                         Weight 
##                                              0 
##                          Body Mass Index (BMI) 
##                                              0 
##                         Total Body Water (TBW) 
##                                              0 
##                      Extracellular Water (ECW) 
##                                              0 
##                      Intracellular Water (ICW) 
##                                              0 
## Extracellular Fluid/Total Body Water (ECF/TBW) 
##                                              0 
##                Total Body Fat Ratio (TBFR) (%) 
##                                              0 
##                             Lean Mass (LM) (%) 
##                                              0 
##             Body Protein Content (Protein) (%) 
##                                              0 
##                      Visceral Fat Rating (VFR) 
##                                              0 
##                                 Bone Mass (BM) 
##                                              0 
##                               Muscle Mass (MM) 
##                                              0 
##                                    Obesity (%) 
##                                              0 
##                        Total Fat Content (TFC) 
##                                              0 
##                        Visceral Fat Area (VFA) 
##                                              0 
##                Visceral Muscle Area (VMA) (Kg) 
##                                              0 
##                 Hepatic Fat Accumulation (HFA) 
##                                              0 
##                                        Glucose 
##                                              0 
##                         Total Cholesterol (TC) 
##                                              0 
##                  Low Density Lipoprotein (LDL) 
##                                              0 
##                 High Density Lipoprotein (HDL) 
##                                              0 
##                                   Triglyceride 
##                                              0 
##                 Aspartat Aminotransferaz (AST) 
##                                              0 
##                   Alanin Aminotransferaz (ALT) 
##                                              0 
##                     Alkaline Phosphatase (ALP) 
##                                              0 
##                                     Creatinine 
##                                              0 
##               Glomerular Filtration Rate (GFR) 
##                                              1 
##                       C-Reactive Protein (CRP) 
##                                              0 
##                               Hemoglobin (HGB) 
##                                              0 
##                                      Vitamin D 
##                                              0

Korelasi untuk Variabel Numerik

num_vars <- dplyr::select_if(data_pdm, is.numeric)
corrplot(cor(num_vars), method = "color", tl.cex = 0.7)

table(data_pdm$`Gallstone Status`)
## 
##   0   1 
## 161 158
ggplot(data_pdm, aes(x = `Gallstone Status`)) + 
  geom_bar(fill = "#00BFC4") + 
  theme_minimal() +
  labs(title = "Distribusi Kelas Target")

Preprocessing

Ubah Target Menjadi Faktor
df <- data_pdm
df$y <- factor(df$`Gallstone Status`, levels = c(0,1), labels = c("No","Yes"))

Pisahkan Fitur (Selain Target)

X <- df[, !(names(df) %in% c("Gallstone Status", "y"))]

Ubah Kolom Kategorik Menjadi Numberik

X$Gender <- as.numeric(factor(X$Gender))
X$Comorbidity <- as.numeric(factor(X$Comorbidity))
X$`Hepatic Fat Accumulation (HFA)` <- as.numeric(factor(X$`Hepatic Fat Accumulation (HFA)`))

Gabungkan Kolom Target dengan Fitur

df_final <- cbind(y = df$y, X)

Bagi Data Training (80%) dan Data Testing (20%)

set.seed(123)
index <- createDataPartition(df_final$y, p = 0.8, list = FALSE)
train <- df_final[index, ]
test  <- df_final[-index, ]

Scaling Hanya Pada Fitur Numerik

pp <- preProcess(train[, -1], method = c("center", "scale"))
train_sc <- train
train_sc[, -1] <- predict(pp, train[, -1])
test_sc <- test
test_sc[, -1] <- predict(pp, test[, -1])

Cek Hasil Akhir

str(train_sc)
## 'data.frame':    256 obs. of  39 variables:
##  $ y                                             : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age                                           : num  0.166 -0.567 -0.486 3.915 -0.893 ...
##  $ Gender                                        : num  -0.952 -0.952 -0.952 -0.952 -0.952 ...
##  $ Comorbidity                                   : num  -0.647 -0.647 -0.647 -0.647 -0.647 ...
##  $ Coronary Artery Disease (CAD)                 : num  -0.155 -0.155 -0.155 -0.155 -0.155 ...
##  $ Hypothyroidism                                : num  -0.167 -0.167 -0.167 -0.167 -0.167 ...
##  $ Hyperlipidemia                                : num  -0.155 -0.155 -0.155 -0.155 -0.155 ...
##  $ Diabetes Mellitus (DM)                        : num  -0.417 -0.417 -0.417 -0.417 -0.417 ...
##  $ Height                                        : num  1.7557 0.0771 1.0645 -1.2065 1.7557 ...
##  $ Weight                                        : num  0.768 -0.841 0.563 -2.04 -0.879 ...
##  $ Body Mass Index (BMI)                         : num  -0.352 -0.945 -0.123 -1.633 -1.786 ...
##  $ Total Body Water (TBW)                        : num  1.52 0.068 1.331 -0.866 0.118 ...
##  $ Extracellular Water (ECW)                     : num  1.29605 -0.04168 0.91384 -0.45574 -0.00983 ...
##  $ Intracellular Water (ICW)                     : num  1.4428 0.0966 1.3875 -1.0283 0.1519 ...
##  $ Extracellular Fluid/Total Body Water (ECF/TBW): num  -0.635 -0.331 -0.939 1.19 -0.331 ...
##  $ Total Body Fat Ratio (TBFR) (%)               : num  -1.031 -1.43 -0.937 -2.547 -1.959 ...
##  $ Lean Mass (LM) (%)                            : num  1.049 1.444 0.952 2.563 1.973 ...
##  $ Body Protein Content (Protein) (%)            : num  1.228 0.362 0.322 1.053 1.674 ...
##  $ Visceral Fat Rating (VFR)                     : num  -0.0154 -0.7099 -0.2469 0.6791 -1.4044 ...
##  $ Bone Mass (BM)                                : num  1.734 0.168 1.342 -0.812 0.363 ...
##  $ Muscle Mass (MM)                              : num  1.5828 -0.0522 1.2804 -1.054 0.1746 ...
##  $ Obesity (%)                                   : num  -0.3104 -0.9921 -0.0642 -1.0678 -0.9021 ...
##  $ Total Fat Content (TFC)                       : num  -0.576 -1.314 -0.565 -2.105 -1.637 ...
##  $ Visceral Fat Area (VFA)                       : num  -0.281 -1.053 -0.319 -2.108 -1.411 ...
##  $ Visceral Muscle Area (VMA) (Kg)               : num  2.097 -0.316 1.568 -1.075 0.189 ...
##  $ Hepatic Fat Accumulation (HFA)                : num  -1.105 -0.181 0.744 -1.105 -1.105 ...
##  $ Glucose                                       : num  -0.1701 -0.8422 -0.0275 -0.7404 -0.333 ...
##  $ Total Cholesterol (TC)                        : num  0.927 -0.7168 -0.0336 -1.5707 -0.8662 ...
##  $ Low Density Lipoprotein (LDL)                 : num  1.186 -1.433 0.647 -0.971 -0.457 ...
##  $ High Density Lipoprotein (HDL)                : num  -0.513 0.502 -1.047 -1.047 -0.513 ...
##  $ Triglyceride                                  : num  -0.144 -0.928 1.714 -0.812 -0.861 ...
##  $ Aspartat Aminotransferaz (AST)                : num  -0.0355 -0.0355 0.7912 -0.8622 -0.626 ...
##  $ Alanin Aminotransferaz (ALT)                  : num  -0.199 -0.78 1.661 -0.722 -0.664 ...
##  $ Alkaline Phosphatase (ALP)                    : num  0.5569 -1.601 -0.0945 -0.5424 -0.0538 ...
##  $ Creatinine                                    : num  0.128 1.252 0.128 3.724 -0.153 ...
##  $ Glomerular Filtration Rate (GFR)              : num  NA -0.391 0.657 -3.262 0.986 ...
##  $ C-Reactive Protein (CRP)                      : num  -0.375 -0.375 -0.375 -0.375 -0.375 ...
##  $ Hemoglobin (HGB)                              : num  0.864 0.533 1.305 -1.892 -0.348 ...
##  $ Vitamin D                                     : num  1.121 1.356 1.864 2.371 -0.148 ...
levels(train_sc$y)
## [1] "No"  "Yes"
levels(test_sc$y)
## [1] "No"  "Yes"

Pemodelan

Decision Tree

rpart

tree_model <- rpart(y ~ ., data = train, method = "class")
rpart.plot(tree_model, type = 3, extra = 101, fallen.leaves = TRUE)

Prediksi Data Training

pred_train_class <- predict(tree_model, type = "class")         # ok pakai data training
table(pred_train_class, train$y)
##                 
## pred_train_class  No Yes
##              No  114  28
##              Yes  15  99
conf_tree_train <- caret::confusionMatrix(pred_train_class, train$y, positive = "Yes")
conf_tree_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  114  28
##        Yes  15  99
##                                           
##                Accuracy : 0.832           
##                  95% CI : (0.7805, 0.8757)
##     No Information Rate : 0.5039          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6638          
##                                           
##  Mcnemar's Test P-Value : 0.06725         
##                                           
##             Sensitivity : 0.7795          
##             Specificity : 0.8837          
##          Pos Pred Value : 0.8684          
##          Neg Pred Value : 0.8028          
##              Prevalence : 0.4961          
##          Detection Rate : 0.3867          
##    Detection Prevalence : 0.4453          
##       Balanced Accuracy : 0.8316          
##                                           
##        'Positive' Class : Yes             
## 

Prediksi Data Testing

pred_test_class <- predict(tree_model, newdata = test, type = "class")
table(pred_test_class, test$y)
##                
## pred_test_class No Yes
##             No  27  11
##             Yes  5  20
conf_tree_test <- caret::confusionMatrix(pred_test_class, test$y, positive = "Yes")
conf_tree_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  27  11
##        Yes  5  20
##                                           
##                Accuracy : 0.746           
##                  95% CI : (0.6206, 0.8473)
##     No Information Rate : 0.5079          
##     P-Value [Acc > NIR] : 9.638e-05       
##                                           
##                   Kappa : 0.4904          
##                                           
##  Mcnemar's Test P-Value : 0.2113          
##                                           
##             Sensitivity : 0.6452          
##             Specificity : 0.8438          
##          Pos Pred Value : 0.8000          
##          Neg Pred Value : 0.7105          
##              Prevalence : 0.4921          
##          Detection Rate : 0.3175          
##    Detection Prevalence : 0.3968          
##       Balanced Accuracy : 0.7445          
##                                           
##        'Positive' Class : Yes             
## 

Logistic Regression

log_model <- glm(y ~ ., data = train, family = binomial)

Prediksi Data Training

prob_train <- predict(log_model, newdata = train, type = "response")
pred_train <- ifelse(prob_train > 0.5, "Yes", "No")
pred_train <- factor(pred_train, levels = levels(train$y))
conf_matrix_train <- table(Predicted = pred_train, Actual = train$y)
conf_matrix_train
##          Actual
## Predicted  No Yes
##       No  115  22
##       Yes  13 105
accuracy_train <- round(sum(diag(conf_matrix_train)) / sum(conf_matrix_train) * 100, 2)
accuracy_train
## [1] 86.27
cat(paste0("\nAkurasi data training sebesar ", accuracy_train, "%.\n"))
## 
## Akurasi data training sebesar 86.27%.
conf_log_train <- caret::confusionMatrix(pred_train, train$y, positive = "Yes")
conf_log_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  115  22
##        Yes  13 105
##                                           
##                Accuracy : 0.8627          
##                  95% CI : (0.8143, 0.9025)
##     No Information Rate : 0.502           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7254          
##                                           
##  Mcnemar's Test P-Value : 0.1763          
##                                           
##             Sensitivity : 0.8268          
##             Specificity : 0.8984          
##          Pos Pred Value : 0.8898          
##          Neg Pred Value : 0.8394          
##              Prevalence : 0.4980          
##          Detection Rate : 0.4118          
##    Detection Prevalence : 0.4627          
##       Balanced Accuracy : 0.8626          
##                                           
##        'Positive' Class : Yes             
## 

Prediksi Data Testing

prob_test <- predict(log_model, newdata = test, type = "response")
pred_test <- ifelse(prob_test > 0.5, "Yes", "No")
pred_test <- factor(pred_test, levels = levels(test$y))
conf_matrix_test <- table(Predicted = pred_test, Actual = test$y)
conf_matrix_test
##          Actual
## Predicted No Yes
##       No  30  11
##       Yes  2  20
accuracy_test <- round(sum(diag(conf_matrix_test)) / sum(conf_matrix_test) * 100, 2)
accuracy_test
## [1] 79.37
cat(paste0("\nAkurasi data testing sebesar ", accuracy_test, "%.\n"))
## 
## Akurasi data testing sebesar 79.37%.
conf_log_test <- caret::confusionMatrix(pred_test, test$y, positive = "Yes")
conf_log_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  30  11
##        Yes  2  20
##                                          
##                Accuracy : 0.7937         
##                  95% CI : (0.673, 0.8853)
##     No Information Rate : 0.5079         
##     P-Value [Acc > NIR] : 2.719e-06      
##                                          
##                   Kappa : 0.5853         
##                                          
##  Mcnemar's Test P-Value : 0.0265         
##                                          
##             Sensitivity : 0.6452         
##             Specificity : 0.9375         
##          Pos Pred Value : 0.9091         
##          Neg Pred Value : 0.7317         
##              Prevalence : 0.4921         
##          Detection Rate : 0.3175         
##    Detection Prevalence : 0.3492         
##       Balanced Accuracy : 0.7913         
##                                          
##        'Positive' Class : Yes            
## 

K-Nearest Neighbours (KNN)

Fit Menggunakan Data Yang Sudah Di Scale

train_x <- dplyr::select(train_sc, -y) 
train_x[is.na(train_x)] <- 0
test_x  <- dplyr::select(test_sc,  -y)  
test_x[is.na(test_x)] <- 0
train_y <- train_sc$y                    
test_y  <- test_sc$y  
train_x <- as.data.frame(train_x)
train_y <- as.factor(train_y)                  

set.seed(123)
ctrl <- caret::trainControl(method = "cv", number = 10)
grid <- data.frame(k = seq(3, 31, 2))

knn_cv <- caret::train(
  x = train_x, y = train_y,
  method = "knn",
  trControl = ctrl,
  tuneGrid = grid,
  metric = "Accuracy"
)

best_k <- knn_cv$bestTune$k
print(knn_cv)
## k-Nearest Neighbors 
## 
## 256 samples
##  38 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 230, 230, 230, 231, 231, 230, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    3  0.6561538  0.3093554
##    5  0.6569231  0.3120052
##    7  0.6755385  0.3491110
##    9  0.6758462  0.3492669
##   11  0.6521538  0.3007999
##   13  0.6524615  0.3012033
##   15  0.6526154  0.3019216
##   17  0.6449231  0.2856811
##   19  0.6486154  0.2919221
##   21  0.6290769  0.2536559
##   23  0.6253846  0.2469093
##   25  0.6098462  0.2151749
##   27  0.6098462  0.2158941
##   29  0.6329231  0.2612917
##   31  0.6249231  0.2453434
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
cat(paste0("\nK terbaik hasil CV = ", best_k, "\n"))    
## 
## K terbaik hasil CV = 9

Menggunakan Data Training

pred_train_knn <- predict(knn_cv, newdata = train_x)
conf_knn_train <- caret::confusionMatrix(pred_train_knn, train_y, positive = "Yes")
conf_knn_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  111  51
##        Yes  18  76
##                                           
##                Accuracy : 0.7305          
##                  95% CI : (0.6717, 0.7838)
##     No Information Rate : 0.5039          
##     P-Value [Acc > NIR] : 1.193e-13       
##                                           
##                   Kappa : 0.4598          
##                                           
##  Mcnemar's Test P-Value : 0.000117        
##                                           
##             Sensitivity : 0.5984          
##             Specificity : 0.8605          
##          Pos Pred Value : 0.8085          
##          Neg Pred Value : 0.6852          
##              Prevalence : 0.4961          
##          Detection Rate : 0.2969          
##    Detection Prevalence : 0.3672          
##       Balanced Accuracy : 0.7294          
##                                           
##        'Positive' Class : Yes             
## 

Menggunakan Data Testing

pred_test_knn <- predict(knn_cv, newdata = test_x)
conf_knn_test <- caret::confusionMatrix(pred_test_knn, test_y, positive = "Yes")
conf_knn_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  28  19
##        Yes  4  12
##                                          
##                Accuracy : 0.6349         
##                  95% CI : (0.504, 0.7527)
##     No Information Rate : 0.5079         
##     P-Value [Acc > NIR] : 0.028824       
##                                          
##                   Kappa : 0.2641         
##                                          
##  Mcnemar's Test P-Value : 0.003509       
##                                          
##             Sensitivity : 0.3871         
##             Specificity : 0.8750         
##          Pos Pred Value : 0.7500         
##          Neg Pred Value : 0.5957         
##              Prevalence : 0.4921         
##          Detection Rate : 0.1905         
##    Detection Prevalence : 0.2540         
##       Balanced Accuracy : 0.6310         
##                                          
##        'Positive' Class : Yes            
## 

Confusion Matrix

conf_matrix_knn_test <- table(Predicted = pred_test_knn, Actual = test_y)
conf_matrix_knn_test
##          Actual
## Predicted No Yes
##       No  28  19
##       Yes  4  12
accuracy_knn_test <- round(sum(diag(conf_matrix_knn_test)) / sum(conf_matrix_knn_test) * 100, 2)
cat(paste0("\nAkurasi data testing (KNN, k=", best_k, ") sebesar ", accuracy_knn_test, "%.\n"))
## 
## Akurasi data testing (KNN, k=9) sebesar 63.49%.

Evaluasi Model

  1. { f1 <- 2 * prec * rec / (prec + rec) } else f1 <- NA_real_ } c(Accuracy = as.numeric(acc), Precision = as.numeric(prec), Recall = as.numeric(rec), F1 = as.numeric(f1)) }

m_tree <- get_metrics(conf_tree_test) m_log <- get_metrics(conf_log_test) m_knn <- get_metrics(conf_knn_test)

model_eval <- data.frame( Model = c(“Decision Tree”,“Logistic Regression”,“KNN”), Accuracy = c(m_tree[“Accuracy”], m_log[“Accuracy”], m_knn[“Accuracy”]), Precision = c(m_tree[“Precision”], m_log[“Precision”], m_knn[“Precision”]), Recall = c(m_tree[“Recall”], m_log[“Recall”], m_knn[“Recall”]), F1 = c(m_tree[“F1”], m_log[“F1”], m_knn[“F1”]) ) print(model_eval)

Model Accuracy Precision Recall F1

1 Decision Tree 0.7460317 0.8000000 0.6451613 0.7142857

2 Logistic Regression 0.7936508 0.9090909 0.6451613 0.7547170

3 KNN 0.6349206 0.7500000 0.3870968 0.5106383


<h1 style="color: black; font-size:22px"> Feature Importance </h1>

```r
library(caret)
library(ggplot2)
library(tibble)

Standarisasi Fitur (tanapa kolom)

scaler <- caret::preProcess(train[, -1], method = c("center","scale"))
train_std <- train
train_std[, -1] <- predict(scaler, train[, -1])

Fit Model Logistik dengan Data Terstandarisasi

log_model_std <- glm(y ~ ., data = train_std, family = binomial)

Ambil Koefisien dan hHtung Importance = |β|

summary_log <- summary(log_model_std)
coefs <- as.data.frame(summary_log$coefficients)
colnames(coefs) <- c("Estimate", "Std.Error", "z.value", "p.value")
coefs <- coefs[-1, , drop = FALSE]  # drop intercept

coefs$Importance <- abs(coefs$Estimate)
coefs$OR <- exp(coefs$Estimate)  # odds ratio
coefs_sorted <- coefs[order(coefs$Importance, decreasing = TRUE), ]
coefs_sorted
##                                                     Estimate   Std.Error
## Weight                                            3.30787737   3.7875910
## `Extracellular Water (ECW)`                      -3.15591628   2.5751484
## Hyperlipidemia                                    2.93303260 209.7408772
## `Intracellular Water (ICW)`                       2.62233196   3.1696649
## `Bone Mass (BM)`                                 -2.24252710   0.9687638
## `Obesity (%)`                                    -1.95781607   0.8194581
## `C-Reactive Protein (CRP)`                        1.89049116   0.6787019
## `Lean Mass (LM) (%)`                              1.73570188   2.5717544
## `Total Body Fat Ratio (TBFR) (%)`                 1.65007988   2.1172123
## `Total Fat Content (TFC)`                        -1.02899746   0.9486100
## Gender                                           -1.02266152   0.6209531
## `Muscle Mass (MM)`                               -1.00546770   1.0755186
## `Vitamin D`                                      -0.98098986   0.2392966
## `Total Body Water (TBW)`                          0.97235599   2.2273507
## `Body Protein Content (Protein) (%)`             -0.90442550   0.7854519
## Age                                               0.76297397   0.5031528
## `Visceral Fat Area (VFA)`                         0.75238005   1.0067807
## `Hemoglobin (HGB)`                               -0.73842485   0.3289968
## `Diabetes Mellitus (DM)`                          0.71398942   0.3201217
## `Body Mass Index (BMI)`                           0.71318804   2.1632437
## `Aspartat Aminotransferaz (AST)`                 -0.65468588   0.3718906
## Hypothyroidism                                   -0.62601903   0.5873089
## `Visceral Fat Rating (VFR)`                      -0.55532145   0.9622551
## `Extracellular Fluid/Total Body Water (ECF/TBW)`  0.54246097   1.2924876
## `Alanin Aminotransferaz (ALT)`                    0.49471792   0.3624082
## `High Density Lipoprotein (HDL)`                  0.48991779   0.4628680
## `Hepatic Fat Accumulation (HFA)`                  0.39193430   0.2704924
## `Glomerular Filtration Rate (GFR)`                0.38524846   0.2664003
## Triglyceride                                     -0.29530544   0.3499490
## Comorbidity                                      -0.27798272   0.3559688
## `Low Density Lipoprotein (LDL)`                  -0.25949004   0.5216240
## `Total Cholesterol (TC)`                          0.22864573   0.5639644
## `Coronary Artery Disease (CAD)`                  -0.20606698   0.2458566
## `Visceral Muscle Area (VMA) (Kg)`                 0.18777388   0.3625015
## Glucose                                          -0.11728419   0.2855388
## Creatinine                                        0.09729786   0.3099899
## `Alkaline Phosphatase (ALP)`                      0.05982663   0.2386774
## Height                                            0.04838280   1.3674660
##                                                      z.value      p.value
## Weight                                            0.87334598 3.824745e-01
## `Extracellular Water (ECW)`                      -1.22552795 2.203764e-01
## Hyperlipidemia                                    0.01398408 9.888427e-01
## `Intracellular Water (ICW)`                       0.82732152 4.080548e-01
## `Bone Mass (BM)`                                 -2.31483365 2.062204e-02
## `Obesity (%)`                                    -2.38915944 1.688697e-02
## `C-Reactive Protein (CRP)`                        2.78545135 5.345326e-03
## `Lean Mass (LM) (%)`                              0.67490965 4.997332e-01
## `Total Body Fat Ratio (TBFR) (%)`                 0.77936439 4.357651e-01
## `Total Fat Content (TFC)`                        -1.08474235 2.780358e-01
## Gender                                           -1.64692241 9.957400e-02
## `Muscle Mass (MM)`                               -0.93486775 3.498565e-01
## `Vitamin D`                                      -4.09947195 4.140938e-05
## `Total Body Water (TBW)`                          0.43655272 6.624358e-01
## `Body Protein Content (Protein) (%)`             -1.15147149 2.495383e-01
## Age                                               1.51638609 1.294218e-01
## `Visceral Fat Area (VFA)`                         0.74731272 4.548748e-01
## `Hemoglobin (HGB)`                               -2.24447425 2.480190e-02
## `Diabetes Mellitus (DM)`                          2.23036871 2.572297e-02
## `Body Mass Index (BMI)`                           0.32968455 7.416383e-01
## `Aspartat Aminotransferaz (AST)`                 -1.76042611 7.833558e-02
## Hypothyroidism                                   -1.06591106 2.864639e-01
## `Visceral Fat Rating (VFR)`                      -0.57710417 5.638691e-01
## `Extracellular Fluid/Total Body Water (ECF/TBW)`  0.41970303 6.747024e-01
## `Alanin Aminotransferaz (ALT)`                    1.36508492 1.722264e-01
## `High Density Lipoprotein (HDL)`                  1.05843963 2.898551e-01
## `Hepatic Fat Accumulation (HFA)`                  1.44896623 1.473470e-01
## `Glomerular Filtration Rate (GFR)`                1.44612631 1.481418e-01
## Triglyceride                                     -0.84385278 3.987517e-01
## Comorbidity                                      -0.78091887 4.348502e-01
## `Low Density Lipoprotein (LDL)`                  -0.49746569 6.188607e-01
## `Total Cholesterol (TC)`                          0.40542582 6.851645e-01
## `Coronary Artery Disease (CAD)`                  -0.83815921 4.019413e-01
## `Visceral Muscle Area (VMA) (Kg)`                 0.51799473 6.044619e-01
## Glucose                                          -0.41074687 6.812582e-01
## Creatinine                                        0.31387429 7.536165e-01
## `Alkaline Phosphatase (ALP)`                      0.25065896 8.020778e-01
## Height                                            0.03538136 9.717757e-01
##                                                  Importance          OR
## Weight                                           3.30787737 27.32705867
## `Extracellular Water (ECW)`                      3.15591628  0.04259935
## Hyperlipidemia                                   2.93303260 18.78451008
## `Intracellular Water (ICW)`                      2.62233196 13.76779215
## `Bone Mass (BM)`                                 2.24252710  0.10618981
## `Obesity (%)`                                    1.95781607  0.14116638
## `C-Reactive Protein (CRP)`                       1.89049116  6.62262065
## `Lean Mass (LM) (%)`                             1.73570188  5.67290813
## `Total Body Fat Ratio (TBFR) (%)`                1.65007988  5.20739577
## `Total Fat Content (TFC)`                        1.02899746  0.35736505
## Gender                                           1.02266152  0.35963649
## `Muscle Mass (MM)`                               1.00546770  0.36587348
## `Vitamin D`                                      0.98098986  0.37493978
## `Total Body Water (TBW)`                         0.97235599  2.64416675
## `Body Protein Content (Protein) (%)`             0.90442550  0.40477436
## Age                                              0.76297397  2.14464485
## `Visceral Fat Area (VFA)`                        0.75238005  2.12204459
## `Hemoglobin (HGB)`                               0.73842485  0.47786604
## `Diabetes Mellitus (DM)`                         0.71398942  2.04212191
## `Body Mass Index (BMI)`                          0.71318804  2.04048605
## `Aspartat Aminotransferaz (AST)`                 0.65468588  0.51960525
## Hypothyroidism                                   0.62601903  0.53471626
## `Visceral Fat Rating (VFR)`                      0.55532145  0.57388776
## `Extracellular Fluid/Total Body Water (ECF/TBW)` 0.54246097  1.72023510
## `Alanin Aminotransferaz (ALT)`                   0.49471792  1.64003556
## `High Density Lipoprotein (HDL)`                 0.48991779  1.63218203
## `Hepatic Fat Accumulation (HFA)`                 0.39193430  1.47984049
## `Glomerular Filtration Rate (GFR)`               0.38524846  1.46997950
## Triglyceride                                     0.29530544  0.74430421
## Comorbidity                                      0.27798272  0.75730990
## `Low Density Lipoprotein (LDL)`                  0.25949004  0.77144489
## `Total Cholesterol (TC)`                         0.22864573  1.25689667
## `Coronary Artery Disease (CAD)`                  0.20606698  0.81377857
## `Visceral Muscle Area (VMA) (Kg)`                0.18777388  1.20656066
## Glucose                                          0.11728419  0.88933242
## Creatinine                                       0.09729786  1.10218863
## `Alkaline Phosphatase (ALP)`                     0.05982663  1.06165247
## Height                                           0.04838280  1.04957236

Cetak 10 Fitur Teratas

cat("\nTop 10 fitur paling berpengaruh (berdasarkan |β|):\n")
## 
## Top 10 fitur paling berpengaruh (berdasarkan |β|):
print(head(coefs_sorted[, c("Estimate","Std.Error","z.value","p.value","OR","Importance")], 10))
##                                    Estimate   Std.Error     z.value     p.value
## Weight                             3.307877   3.7875910  0.87334598 0.382474525
## `Extracellular Water (ECW)`       -3.155916   2.5751484 -1.22552795 0.220376365
## Hyperlipidemia                     2.933033 209.7408772  0.01398408 0.988842684
## `Intracellular Water (ICW)`        2.622332   3.1696649  0.82732152 0.408054847
## `Bone Mass (BM)`                  -2.242527   0.9687638 -2.31483365 0.020622038
## `Obesity (%)`                     -1.957816   0.8194581 -2.38915944 0.016886972
## `C-Reactive Protein (CRP)`         1.890491   0.6787019  2.78545135 0.005345326
## `Lean Mass (LM) (%)`               1.735702   2.5717544  0.67490965 0.499733167
## `Total Body Fat Ratio (TBFR) (%)`  1.650080   2.1172123  0.77936439 0.435765094
## `Total Fat Content (TFC)`         -1.028997   0.9486100 -1.08474235 0.278035784
##                                            OR Importance
## Weight                            27.32705867   3.307877
## `Extracellular Water (ECW)`        0.04259935   3.155916
## Hyperlipidemia                    18.78451008   2.933033
## `Intracellular Water (ICW)`       13.76779215   2.622332
## `Bone Mass (BM)`                   0.10618981   2.242527
## `Obesity (%)`                      0.14116638   1.957816
## `C-Reactive Protein (CRP)`         6.62262065   1.890491
## `Lean Mass (LM) (%)`               5.67290813   1.735702
## `Total Body Fat Ratio (TBFR) (%)`  5.20739577   1.650080
## `Total Fat Content (TFC)`          0.35736505   1.028997

Plot dengan ggplot

top_n <- min(10, nrow(coefs_sorted))
plot_df <- head(coefs_sorted, top_n)
plot_df$Feature <- rownames(plot_df)

ggplot(plot_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "#00BFC4") +
  coord_flip() +
  labs(
    title = "Feature Importance (|β|) - Logistic Regression",
    x = NULL, y = "|Koefisien (β)| - standardized"
  ) +
  theme_minimal()

Pemodelan Ulang dengan Variabel Signifikan (Logistic Regression)

Variabel Signifikan Berdasarkan Hasil Uji p-value < 0.05
sig_features <- c("`Obesity (%)`",
                  "`Bone Mass (BM)`",
                  "`C-Reactive Protein (CRP)`",
                  "`Vitamin D`",
                  "`Hemoglobin (HGB)`",
                  "`Diabetes Mellitus (DM)`")

Buat Formula Regresi Logistik dengan Fitur Signifikan

formula_sig <- as.formula(paste("y ~", paste(sig_features, collapse = " + ")))

Fit Model Logistic Baru

log_model_sig <- glm(formula_sig, data = train, family = binomial)

Prediiksi Data Testing

prob_test_sig <- predict(log_model_sig, newdata = test, type = "response")
pred_test_sig <- ifelse(prob_test_sig > 0.5, "Yes", "No")
pred_test_sig <- factor(pred_test_sig, levels = levels(test$y))

Confusion Matrix Model Signifikan

conf_sig <- caret::confusionMatrix(pred_test_sig, test$y, positive = "Yes")
conf_sig
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  29   8
##        Yes  3  23
##                                          
##                Accuracy : 0.8254         
##                  95% CI : (0.709, 0.9095)
##     No Information Rate : 0.5079         
##     P-Value [Acc > NIR] : 1.604e-07      
##                                          
##                   Kappa : 0.6498         
##                                          
##  Mcnemar's Test P-Value : 0.2278         
##                                          
##             Sensitivity : 0.7419         
##             Specificity : 0.9062         
##          Pos Pred Value : 0.8846         
##          Neg Pred Value : 0.7838         
##              Prevalence : 0.4921         
##          Detection Rate : 0.3651         
##    Detection Prevalence : 0.4127         
##       Balanced Accuracy : 0.8241         
##                                          
##        'Positive' Class : Yes            
## 

Bandingkan Performa Model Penuh vs Model Signifikan

compare_acc <- data.frame(
  Model     = c("Full Model (All Variables)", "Reduced Model (Significant Variables)"),
  Accuracy  = c(as.numeric(conf_log_test$overall["Accuracy"]),
                as.numeric(conf_sig$overall["Accuracy"])),
  F1        = c(as.numeric(conf_log_test$byClass["F1"]),
                as.numeric(conf_sig$byClass["F1"])),
  Precision = c(as.numeric(conf_log_test$byClass["Precision"]),
                as.numeric(conf_sig$byClass["Precision"])),
  Recall    = c(as.numeric(conf_log_test$byClass["Recall"]),
                as.numeric(conf_sig$byClass["Recall"]))
)

print(compare_acc)
##                                   Model  Accuracy        F1 Precision    Recall
## 1            Full Model (All Variables) 0.7936508 0.7547170 0.9090909 0.6451613
## 2 Reduced Model (Significant Variables) 0.8253968 0.8070175 0.8846154 0.7419355

Visualisasi Perbandingan Akurasi

ggplot(compare_acc, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = round(Accuracy, 3)), vjust = -0.3, size = 4) +
  theme_minimal() +
  labs(
    title = "Perbandingan Akurasi: Model Penuh vs Model Signifikan (Logistic Regression)",
    y = "Accuracy", x = NULL
  ) +
  scale_fill_manual(values = c("#00BFC4", "#F8766D"))

Evaluasi AUC & ROC

library(pROC)
library(ggplot2)

Prediksi Probabilitas dari Model Signifikan

prob_test_sig <- predict(log_model_sig, newdata = test, type = "response")

ROC & AUC

roc_sig <- pROC::roc(
  response  = test$y,        # target (factor: No / Yes)
  predictor = prob_test_sig, # probabilitas prediksi
  levels    = c("No", "Yes"),
  direction = "<"
)

auc_sig <- pROC::auc(roc_sig)
cat("AUC Model Signifikan :", round(as.numeric(auc_sig), 3), "\n")
## AUC Model Signifikan : 0.856

Plot ROC

p <- pROC::ggroc(roc_sig, color = "#F8766D", size = 1.2) +
  geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "gray40") +
  labs(
    title = "Kurva ROC - Model Logistik (Variabel Signifikan)",
    subtitle = paste0("AUC = ", round(as.numeric(auc_sig), 3)),
    x = "False Positive Rate (1 - Spesifisitas)",
    y = "True Positive Rate (Sensitivitas)"
  ) +
  theme_minimal()

print(p)