Keterangan: data ini sudah bersih dan sudah melalui tahap preprocessing; missing value sudah diisi dengan nilai mean karena data tidak berdistribusi normal.

data_model: dataset utama penelitian yang sudah melalui proses pembersihan data dan imputasi missing value.

PENYAKIT: variabel target (kelas/label) yang menunjukkan jenis penyakit pasien.

trainData: data training (80%) yang digunakan untuk membangun model.

testData: data testing (20%) yang digunakan untuk menguji performa model.

down_train: data training hasil undersampling (mengurangi kelas mayoritas).

up_train: data training hasil oversampling (menambah kelas minoritas).

train_smote: data training hasil SMOTE (membuat data sintetis kelas minoritas).

train_weighted: data training dengan pemberian bobot pada kelas minoritas tanpa mengubah jumlah data.

1. LIBRARY

library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nnet)
## Warning: package 'nnet' was built under R version 4.4.3
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
library(recipes)
## Warning: package 'recipes' was built under R version 4.4.3
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(themis)
## Warning: package 'themis' was built under R version 4.4.3

2. IMPORT DATA

# Read the cleaned dataset and store the target column PENYAKIT as a factor.
data_model <- read_excel("D:/FINAL TA/data_bersih.xlsx") %>%
  mutate(PENYAKIT = as.factor(PENYAKIT))

3. CEK DATA

# Show the structure (dimensions and column types) of the dataset.
str(data_model)
## tibble [322 × 9] (S3: tbl_df/tbl/data.frame)
##  $ JENIS_KELAMIN    : num [1:322] 1 1 1 1 1 1 0 0 0 1 ...
##  $ KEADAAN_KELUAR   : num [1:322] 1 0 0 0 0 0 0 0 0 0 ...
##  $ PENYAKIT         : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ PENYAKIT_PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ USIA_KATEGORI    : num [1:322] 1 0 1 0 1 1 1 0 0 1 ...
##  $ KODE_LAMA_RAWAT  : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
##  $ KODE_LEOKOSIT    : num [1:322] 0 0 0 1 1 0 1 1 0 0 ...
##  $ KODE_TROMBOSIT   : num [1:322] 0 0 1 1 0 0 1 1 1 0 ...
##  $ KODE_HEMOGLOBIN  : num [1:322] 0 0 1 0 1 0 1 1 0 0 ...
# Count missing values per column.
colSums(is.na(data_model))
##     JENIS_KELAMIN    KEADAAN_KELUAR          PENYAKIT PENYAKIT_PENYERTA 
##                 0                 0                 0                 0 
##     USIA_KATEGORI   KODE_LAMA_RAWAT     KODE_LEOKOSIT    KODE_TROMBOSIT 
##                 0                 0                 0                 0 
##   KODE_HEMOGLOBIN 
##                 0
# View() needs a GUI data viewer, so only call it in an interactive session;
# this keeps Rscript / knitting runs from failing or blocking on it.
if (interactive()) {
  View(data_model)
}

4. VIF

# Auxiliary linear model used only as input to vif(): the factor target is
# converted to its integer level codes and regressed on all other columns.
model_vif <- lm(
  formula = as.numeric(PENYAKIT) ~ .,
  data = data_model
)

# Variance inflation factor of each predictor (multicollinearity check).
car::vif(model_vif)
##     JENIS_KELAMIN    KEADAAN_KELUAR PENYAKIT_PENYERTA     USIA_KATEGORI 
##          1.054866          1.044094          1.071182          1.042413 
##   KODE_LAMA_RAWAT     KODE_LEOKOSIT    KODE_TROMBOSIT   KODE_HEMOGLOBIN 
##          1.040701          1.053958          1.089823          1.097102

5. SPLIT DATA

# Fix the RNG seed so the partition below is reproducible.
set.seed(123)

# Draw 80% of the rows for training, sampled on the target column PENYAKIT.
trainIndex <- createDataPartition(
  y = data_model$PENYAKIT,
  p = 0.8,
  list = FALSE
)

# Rows selected by trainIndex form the training set; the rest the test set.
trainData <- data_model[trainIndex, ]
testData <- data_model[-trainIndex, ]
# Class counts in the training set.
count(trainData, PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT     n
##   <fct>    <int>
## 1 0          212
## 2 1           29
## 3 2           17
# Class counts in the test set.
count(testData, PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT     n
##   <fct>    <int>
## 1 0           53
## 2 1            7
## 3 2            4

MODEL UTAMA (MULTINOMIAL LOGISTIC)

# Fit a multinomial logistic regression of PENYAKIT on all other columns.
model_multinom <- multinom(formula = PENYAKIT ~ ., data = trainData)
## # weights:  30 (18 variable)
## initial  value 283.441970 
## iter  10 value 141.881038
## iter  20 value 133.187779
## final  value 133.170821 
## converged
# Predict the class of every test row.
pred_multinom <- predict(model_multinom, newdata = testData)

# Compare predictions with the true test labels; mode = "everything" also
# reports precision, recall and F1 per class.
confusionMatrix(
  data = pred_multinom,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2
##          0 50  7  4
##          1  0  0  0
##          2  3  0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7812          
##                  95% CI : (0.6603, 0.8749)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 0.8751          
##                                           
##                   Kappa : -0.0529         
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Sensitivity            0.9434   0.0000  0.00000
## Specificity            0.0000   1.0000  0.95000
## Pos Pred Value         0.8197      NaN  0.00000
## Neg Pred Value         0.0000   0.8906  0.93443
## Precision              0.8197       NA  0.00000
## Recall                 0.9434   0.0000  0.00000
## F1                     0.8772       NA      NaN
## Prevalence             0.8281   0.1094  0.06250
## Detection Rate         0.7812   0.0000  0.00000
## Detection Prevalence   0.9531   0.0000  0.04688
## Balanced Accuracy      0.4717   0.5000  0.47500

NAIVE BAYES BASELINE

# Baseline Naive Bayes trained on the unbalanced training data.
# NOTE(review): all predictors are stored as numeric 0/1 codes (see the str()
# output), and e1071 documents a Gaussian model for numeric predictors --
# confirm whether they should be converted to factors instead.
NB_base <- naiveBayes(formula = PENYAKIT ~ ., data = trainData)

# Predict the class of every test row.
pred_base <- predict(NB_base, newdata = testData)

# Evaluate against the true test labels.
confusionMatrix(
  data = pred_base,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2
##          0 48  7  2
##          1  0  0  0
##          2  5  0  2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7812          
##                  95% CI : (0.6603, 0.8749)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 0.8751          
##                                           
##                   Kappa : 0.1442          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Sensitivity            0.9057   0.0000  0.50000
## Specificity            0.1818   1.0000  0.91667
## Pos Pred Value         0.8421      NaN  0.28571
## Neg Pred Value         0.2857   0.8906  0.96491
## Precision              0.8421       NA  0.28571
## Recall                 0.9057   0.0000  0.50000
## F1                     0.8727       NA  0.36364
## Prevalence             0.8281   0.1094  0.06250
## Detection Rate         0.7500   0.0000  0.03125
## Detection Prevalence   0.8906   0.0000  0.10938
## Balanced Accuracy      0.5437   0.5000  0.70833

UNDERSAMPLING

# Undersample the training data with caret::downSample(); yname names the
# outcome column PENYAKIT directly instead of renaming the default "Class".
down_train <- downSample(
  x = dplyr::select(trainData, -PENYAKIT),
  y = trainData$PENYAKIT,
  yname = "PENYAKIT"
)

# Naive Bayes trained on the undersampled data.
NB_under <- naiveBayes(formula = PENYAKIT ~ ., data = down_train)

# Predict on the untouched test set and evaluate.
pred_under <- predict(NB_under, newdata = testData)

confusionMatrix(
  data = pred_under,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2
##          0 14  1  0
##          1 22  4  2
##          2 17  2  2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3125          
##                  95% CI : (0.2024, 0.4406)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0679          
##                                           
##  Mcnemar's Test P-Value : 6.88e-08        
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Sensitivity            0.2642   0.5714  0.50000
## Specificity            0.9091   0.5789  0.68333
## Pos Pred Value         0.9333   0.1429  0.09524
## Neg Pred Value         0.2041   0.9167  0.95349
## Precision              0.9333   0.1429  0.09524
## Recall                 0.2642   0.5714  0.50000
## F1                     0.4118   0.2286  0.16000
## Prevalence             0.8281   0.1094  0.06250
## Detection Rate         0.2188   0.0625  0.03125
## Detection Prevalence   0.2344   0.4375  0.32812
## Balanced Accuracy      0.5866   0.5752  0.59167

OVERSAMPLING

# Oversample the training data with caret::upSample(); yname names the
# outcome column PENYAKIT directly instead of renaming the default "Class".
up_train <- upSample(
  x = dplyr::select(trainData, -PENYAKIT),
  y = trainData$PENYAKIT,
  yname = "PENYAKIT"
)

# Naive Bayes trained on the oversampled data.
NB_over <- naiveBayes(formula = PENYAKIT ~ ., data = up_train)

# Predict on the untouched test set and evaluate.
pred_over <- predict(NB_over, newdata = testData)

confusionMatrix(
  data = pred_over,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2
##          0  6  1  0
##          1 33  5  2
##          2 14  1  2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2031          
##                  95% CI : (0.1128, 0.3223)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0335          
##                                           
##  Mcnemar's Test P-Value : 1.21e-09        
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Sensitivity           0.11321  0.71429  0.50000
## Specificity           0.90909  0.38596  0.75000
## Pos Pred Value        0.85714  0.12500  0.11765
## Neg Pred Value        0.17544  0.91667  0.95745
## Precision             0.85714  0.12500  0.11765
## Recall                0.11321  0.71429  0.50000
## F1                    0.20000  0.21277  0.19048
## Prevalence            0.82812  0.10938  0.06250
## Detection Rate        0.09375  0.07812  0.03125
## Detection Prevalence  0.10938  0.62500  0.26562
## Balanced Accuracy     0.51115  0.55013  0.62500

SMOTE

# Recipe that applies SMOTE to the target PENYAKIT on the training data.
# NOTE(review): SMOTE builds synthetic rows from neighbouring observations, so
# the 0/1-coded predictors may receive non-binary values -- confirm this is
# intended (themis also offers step_smotenc for categorical predictors).
rec_smote <- step_smote(
  recipe(PENYAKIT ~ ., data = trainData),
  PENYAKIT
)

# Estimate the recipe and extract the processed training set.
train_smote <- juice(prep(rec_smote))

# Naive Bayes on the SMOTE data.
# NOTE(review): e1071 documents Laplace smoothing for categorical predictors;
# every predictor here is numeric, so laplace = 1 likely has no effect --
# confirm.
NB_smote <- naiveBayes(
  formula = PENYAKIT ~ .,
  data = train_smote,
  laplace = 1
)

# Predict on the untouched test set and evaluate.
pred_smote <- predict(NB_smote, newdata = testData)

confusionMatrix(
  data = pred_smote,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2
##          0  7  2  0
##          1 27  4  2
##          2 19  1  2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2031          
##                  95% CI : (0.1128, 0.3223)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0109          
##                                           
##  Mcnemar's Test P-Value : 6.917e-09       
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Sensitivity            0.1321   0.5714  0.50000
## Specificity            0.8182   0.4912  0.66667
## Pos Pred Value         0.7778   0.1212  0.09091
## Neg Pred Value         0.1636   0.9032  0.95238
## Precision              0.7778   0.1212  0.09091
## Recall                 0.1321   0.5714  0.50000
## F1                     0.2258   0.2000  0.15385
## Prevalence             0.8281   0.1094  0.06250
## Detection Rate         0.1094   0.0625  0.03125
## Detection Prevalence   0.1406   0.5156  0.34375
## Balanced Accuracy      0.4751   0.5313  0.58333

WEIGHTED MODEL

# Class frequencies in the training set.
class_freq <- table(trainData$PENYAKIT)

# Inverse-frequency class weights: n_total / (n_classes * n_in_class), so
# rarer classes receive larger weights.
class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

# Per-row sampling weight, looked up by class NAME (not by the factor's
# integer codes) and stored as a plain numeric vector.
row_weight <- as.numeric(class_weight[as.character(trainData$PENYAKIT)])

set.seed(123)

# Weighted bootstrap: draw nrow(trainData) rows with replacement, with
# probability proportional to the class weight. The number of rows stays the
# same while minority-class rows are drawn more often. seq_len() is used
# instead of 1:nrow() so an empty data set cannot produce c(1, 0).
sampled_rows <- sample(
  seq_len(nrow(trainData)),
  size = nrow(trainData),
  replace = TRUE,
  prob = row_weight
)

train_weighted <- trainData[sampled_rows, ]

# Naive Bayes on the re-weighted sample.
# NOTE(review): e1071 documents Laplace smoothing for categorical predictors;
# every predictor here is numeric, so laplace = 1 likely has no effect --
# confirm.
NB_weight <- naiveBayes(PENYAKIT ~ ., data = train_weighted, laplace = 1)

# Predict on the untouched test set and evaluate.
pred_weight <- predict(NB_weight, testData)

confusionMatrix(pred_weight, testData$PENYAKIT, mode = "everything")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2
##          0 17  3  1
##          1 19  2  1
##          2 17  2  2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3281          
##                  95% CI : (0.2159, 0.4569)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0026         
##                                           
##  Mcnemar's Test P-Value : 8.694e-06       
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Sensitivity            0.3208  0.28571  0.50000
## Specificity            0.6364  0.64912  0.68333
## Pos Pred Value         0.8095  0.09091  0.09524
## Neg Pred Value         0.1628  0.88095  0.95349
## Precision              0.8095  0.09091  0.09524
## Recall                 0.3208  0.28571  0.50000
## F1                     0.4595  0.13793  0.16000
## Prevalence             0.8281  0.10938  0.06250
## Detection Rate         0.2656  0.03125  0.03125
## Detection Prevalence   0.3281  0.34375  0.32812
## Balanced Accuracy      0.4786  0.46742  0.59167
# Preview the first six rows of the undersampled training set.
head(down_train, n = 6L)
##   JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT_PENYERTA USIA_KATEGORI KODE_LAMA_RAWAT
## 1             1              0                 0             0               1
## 2             0              0                 0             1               0
## 3             0              0                 0             0               0
## 4             0              1                 0             0               0
## 5             1              0                 1             1               0
## 6             0              0                 0             0               0
##   KODE_LEOKOSIT KODE_TROMBOSIT KODE_HEMOGLOBIN PENYAKIT
## 1             1              1               0        0
## 2             1              0               1        0
## 3             0              0               1        0
## 4             0              0               0        0
## 5             1              0               0        0
## 6             1              0               1        0
# Preview the first six rows of the oversampled training set.
head(up_train, n = 6L)
##   JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT_PENYERTA USIA_KATEGORI KODE_LAMA_RAWAT
## 1             1              0                 0             0               0
## 2             0              0                 0             0               0
## 3             1              0                 0             1               0
## 4             1              0                 1             0               1
## 5             0              0                 0             0               0
## 6             1              0                 1             0               0
##   KODE_LEOKOSIT KODE_TROMBOSIT KODE_HEMOGLOBIN PENYAKIT
## 1             0              1               1        0
## 2             0              0               1        0
## 3             1              0               1        0
## 4             1              0               0        0
## 5             1              1               0        0
## 6             1              1               0        0
# Preview the first six rows of the SMOTE training set.
head(train_smote, n = 6L)
## # A tibble: 6 × 9
##   JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT_PENYERTA USIA_KATEGORI KODE_LAMA_RAWAT
##           <dbl>          <dbl>             <dbl>         <dbl>           <dbl>
## 1             1              1                 0             1               0
## 2             1              0                 0             0               0
## 3             1              0                 0             1               0
## 4             1              0                 0             0               0
## 5             1              0                 1             1               0
## 6             1              0                 1             1               0
## # ℹ 4 more variables: KODE_LEOKOSIT <dbl>, KODE_TROMBOSIT <dbl>,
## #   KODE_HEMOGLOBIN <dbl>, PENYAKIT <fct>
# Preview the first six rows of the weighted-resample training set.
head(train_weighted, n = 6L)
## # A tibble: 6 × 9
##   JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT PENYAKIT_PENYERTA USIA_KATEGORI
##           <dbl>          <dbl> <fct>                <dbl>         <dbl>
## 1             0              0 1                        0             0
## 2             0              0 0                        0             1
## 3             0              0 2                        0             0
## 4             1              1 1                        0             1
## 5             1              0 1                        0             0
## 6             0              0 0                        0             0
## # ℹ 4 more variables: KODE_LAMA_RAWAT <dbl>, KODE_LEOKOSIT <dbl>,
## #   KODE_TROMBOSIT <dbl>, KODE_HEMOGLOBIN <dbl>