library(readxl)

## Warning: package 'readxl' was built under R version 4.4.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the first sheet of the workbook into a tibble.
# NOTE(review): the path is absolute and specific to one machine — confirm it
# before running this script elsewhere.
data_tb <- read_excel("D:/FINAL TA/SUMBER REFRENSI/TA.xlsx", sheet = 1)

head(data_tb)

## # A tibble: 6 × 11
##      NO `TANGGAL MASUK` `JENIS KELAMIN` `UMUR TAHUN` `KEADAAN KELUAR` PENYAKIT
##   <dbl> <chr>                     <dbl>        <dbl>            <dbl>    <dbl>
## 1     1 18/02/2024                    1          124                1        1
## 2     2 21/02/2024                    1           23                0        1
## 3     3 19/04/2024                    1           62                0        1
## 4     4 20/04/2024                    1           42                0        1
## 5     5 29/04/2024                    1           71                0        1
## 6     6 45478                         1           63                0        1
## # ℹ 5 more variables: `LAMA RAWAT INAP` <dbl>, `PENYAKIT PENYERTA` <dbl>,
## #   LEOKOSIT <dbl>, TROMBOSIT <chr>, HEMOGLOBIN <dbl>

str(data_tb)

## tibble [322 × 11] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS KELAMIN    : num [1:322] 1 1 1 1 1 1 0 0 0 1 ...
##  $ UMUR TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN KELUAR   : num [1:322] 1 0 0 0 0 0 0 0 0 0 ...
##  $ PENYAKIT         : num [1:322] 1 1 1 1 1 1 1 1 1 1 ...
##  $ LAMA RAWAT INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ LEOKOSIT         : num [1:322] 0 0 0 1 1 NA 1 1 0 1 ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : num [1:322] 1 0 1 1 1 NA 1 1 NA 0 ...

# Replace the spreadsheet headers (which contain spaces) with underscore
# names so the columns can be referenced without backticks. The order must
# match the 11 columns as they were read from the sheet.
names(data_tb) <- c(
  "NO", "TANGGAL_MASUK", "JENIS_KELAMIN", "UMUR_TAHUN",
  "KEADAAN_KELUAR", "PENYAKIT", "LAMA_RAWAT_INAP", "PENYAKIT_PENYERTA",
  "LEOKOSIT", "TROMBOSIT", "HEMOGLOBIN"
)

# Derive the binary codes used later as model predictors.
#
# TROMBOSIT is read from Excel as a character column (see the str() output),
# so comparing it directly with 150 and 450 coerces the numbers to text and
# compares strings: e.g. "20" sorts between "150" and "450" and would be coded
# 0, and a non-numeric entry would be coded 1 instead of missing. Parse the
# column to numeric first; entries that are not numbers become NA and thus
# yield an NA code. The TROMBOSIT column itself is left unchanged.
# NOTE(review): outputs recorded further down were produced with the earlier
# text comparison; re-run the script to refresh them.
data_tb <- data_tb %>%
  mutate(
    # 0 = value within 150-450 (both ends inclusive), 1 = outside that range
    KODE_TROMBOSIT = ifelse(
      between(suppressWarnings(as.numeric(TROMBOSIT)), 150, 450),
      0,
      1
    ),
    # 0 = stay of at most 5, 1 = longer stay
    KODE_LAMA_RAWAT = ifelse(LAMA_RAWAT_INAP <= 5, 0, 1),
    # 0 = PENYAKIT_PENYERTA equals 0, 1 = any other value
    KODE_PENYERTA = ifelse(PENYAKIT_PENYERTA == 0, 0, 1)
  )

# Turn the numeric codes of the three categorical columns into labelled
# factors. Any value outside the listed levels becomes NA.
data_tb <- data_tb %>%
  mutate(
    # discharge status: 0 -> HIDUP, 1 -> MENINGGAL
    KEADAAN_KELUAR = factor(
      KEADAAN_KELUAR,
      levels = c(0, 1),
      labels = c("HIDUP", "MENINGGAL")
    ),
    # TB type: 0 -> TB_PARU, 1 -> TB_PLEURA, 2 -> TB_LAINNYA
    PENYAKIT = factor(
      PENYAKIT,
      levels = c(0, 1, 2),
      labels = c("TB_PARU", "TB_PLEURA", "TB_LAINNYA")
    ),
    # sex: 0 -> "P", 1 -> "L"
    JENIS_KELAMIN = factor(
      JENIS_KELAMIN,
      levels = c(0, 1),
      labels = c("P", "L")
    )
  )

summary(data_tb)

##        NO         TANGGAL_MASUK      JENIS_KELAMIN   UMUR_TAHUN   
##  Min.   :  1.00   Length:322         P:109         Min.   :  1.0  
##  1st Qu.: 81.25   Class :character   L:213         1st Qu.: 41.0  
##  Median :161.50   Mode  :character                 Median : 54.0  
##  Mean   :161.50                                    Mean   : 51.4  
##  3rd Qu.:241.75                                    3rd Qu.: 64.0  
##  Max.   :322.00                                    Max.   :124.0  
##                                                                   
##    KEADAAN_KELUAR       PENYAKIT   LAMA_RAWAT_INAP  PENYAKIT_PENYERTA
##  HIDUP    :291    TB_PARU   :265   Min.   : 1.000   Min.   :0.000    
##  MENINGGAL: 31    TB_PLEURA : 36   1st Qu.: 3.000   1st Qu.:0.000    
##                   TB_LAINNYA: 21   Median : 4.000   Median :0.000    
##                                    Mean   : 4.339   Mean   :0.205    
##                                    3rd Qu.: 5.000   3rd Qu.:0.000    
##                                    Max.   :15.000   Max.   :1.000    
##                                                                      
##     LEOKOSIT       TROMBOSIT           HEMOGLOBIN    KODE_TROMBOSIT  
##  Min.   :0.0000   Length:322         Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   Class :character   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :1.0000   Mode  :character   Median :1.000   Median :0.0000  
##  Mean   :0.5487                      Mean   :0.576   Mean   :0.2941  
##  3rd Qu.:1.0000                      3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :1.0000                      Max.   :1.000   Max.   :1.0000  
##  NA's   :14                          NA's   :39      NA's   :16      
##  KODE_LAMA_RAWAT  KODE_PENYERTA  
##  Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.000  
##  Mean   :0.1429   Mean   :0.205  
##  3rd Qu.:0.0000   3rd Qu.:0.000  
##  Max.   :1.0000   Max.   :1.000  
##

str(data_tb)

## tibble [322 × 14] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL_MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS_KELAMIN    : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
##  $ UMUR_TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN_KELUAR   : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
##  $ PENYAKIT         : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LAMA_RAWAT_INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT_PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ LEOKOSIT         : num [1:322] 0 0 0 1 1 NA 1 1 0 1 ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : num [1:322] 1 0 1 1 1 NA 1 1 NA 0 ...
##  $ KODE_TROMBOSIT   : num [1:322] 0 0 1 1 0 NA 1 1 1 0 ...
##  $ KODE_LAMA_RAWAT  : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
##  $ KODE_PENYERTA    : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...

colSums(is.na(data_tb))

##                NO     TANGGAL_MASUK     JENIS_KELAMIN        UMUR_TAHUN 
##                 0                 0                 0                 0 
##    KEADAAN_KELUAR          PENYAKIT   LAMA_RAWAT_INAP PENYAKIT_PENYERTA 
##                 0                 0                 0                 0 
##          LEOKOSIT         TROMBOSIT        HEMOGLOBIN    KODE_TROMBOSIT 
##                14                16                39                16 
##   KODE_LAMA_RAWAT     KODE_PENYERTA 
##                 0                 0

View(data_tb)

# Count of records per TB type
tb_freq <- table(data_tb$PENYAKIT)

# Share of each TB type, in percent of all counted records
tb_percent <- tb_freq / sum(tb_freq) * 100

# Combine counts and rounded percentages into one summary table.
# as.integer()/as.numeric() strip the table attributes so the data frame
# gets plain columns and default row names.
tb_deskriptif <- data.frame(
  Jenis_TBC = names(tb_freq),
  N = as.integer(tb_freq),
  Persen = round(as.numeric(tb_percent), 2)
)

tb_deskriptif

##    Jenis_TBC   N Persen
## 1    TB_PARU 265  82.30
## 2  TB_PLEURA  36  11.18
## 3 TB_LAINNYA  21   6.52

pie(tb_freq,
    main = "Diagram Lingkaran Jenis TBC",
    col = c("orange","green","yellow"))

library(dplyr)

# Build the age category from UMUR_TAHUN, split at 45; plain ASCII labels
# ("<=45", ">45") are used to avoid special symbols
data_tb <- data_tb %>%
  mutate(
    USIA_KATEGORI = ifelse(UMUR_TAHUN <= 45, "<=45", ">45")
  )

# Frequency table for one column of a data frame.
#
# data: a data frame (or tibble)
# var:  name of the column to summarise, as a string
#
# Returns a data frame with one row per distinct value and the columns
# Variabel (the column name), Kategori (the value as text), N (count) and
# Persen (percentage rounded to 2 decimals). Missing values are not counted,
# so the percentages are relative to the non-missing rows.
deskriptif <- function(data, var) {
  # Convert the column to text and strip any non-ASCII bytes so that odd
  # encodings cannot break the tabulation
  nilai <- iconv(data[[var]], "UTF-8", "ASCII", sub = "")

  hitung <- table(nilai)

  data.frame(
    Variabel = var,
    Kategori = names(hitung),
    N = as.vector(hitung),
    Persen = round(as.vector(prop.table(hitung) * 100), 2)
  )
}

# Run the frequency summary for each categorical variable
d1 <- deskriptif(data_tb, "JENIS_KELAMIN")
d2 <- deskriptif(data_tb, "USIA_KATEGORI")
d3 <- deskriptif(data_tb, "LEOKOSIT")
d4 <- deskriptif(data_tb, "KODE_TROMBOSIT")
d5 <- deskriptif(data_tb, "HEMOGLOBIN")
d6 <- deskriptif(data_tb, "KODE_PENYERTA")
d7 <- deskriptif(data_tb, "KODE_LAMA_RAWAT")
d8 <- deskriptif(data_tb, "KEADAAN_KELUAR")

# Stack the per-variable tables into one descriptive table
tabel_deskriptif <- bind_rows(d1, d2, d3, d4, d5, d6, d7, d8)

tabel_deskriptif

##           Variabel  Kategori   N Persen
## 1    JENIS_KELAMIN         L 213  66.15
## 2    JENIS_KELAMIN         P 109  33.85
## 3    USIA_KATEGORI      <=45 109  33.85
## 4    USIA_KATEGORI       >45 213  66.15
## 5         LEOKOSIT         0 139  45.13
## 6         LEOKOSIT         1 169  54.87
## 7   KODE_TROMBOSIT         0 216  70.59
## 8   KODE_TROMBOSIT         1  90  29.41
## 9       HEMOGLOBIN         0 120  42.40
## 10      HEMOGLOBIN         1 163  57.60
## 11   KODE_PENYERTA         0 256  79.50
## 12   KODE_PENYERTA         1  66  20.50
## 13 KODE_LAMA_RAWAT         0 276  85.71
## 14 KODE_LAMA_RAWAT         1  46  14.29
## 15  KEADAAN_KELUAR     HIDUP 291  90.37
## 16  KEADAAN_KELUAR MENINGGAL  31   9.63

data_no_na <- na.omit(data_tb)
head(data_tb)

## # A tibble: 6 × 15
##      NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN KEADAAN_KELUAR PENYAKIT 
##   <dbl> <chr>         <fct>              <dbl> <fct>          <fct>    
## 1     1 18/02/2024    L                    124 MENINGGAL      TB_PLEURA
## 2     2 21/02/2024    L                     23 HIDUP          TB_PLEURA
## 3     3 19/04/2024    L                     62 HIDUP          TB_PLEURA
## 4     4 20/04/2024    L                     42 HIDUP          TB_PLEURA
## 5     5 29/04/2024    L                     71 HIDUP          TB_PLEURA
## 6     6 45478         L                     63 HIDUP          TB_PLEURA
## # ℹ 9 more variables: LAMA_RAWAT_INAP <dbl>, PENYAKIT_PENYERTA <dbl>,
## #   LEOKOSIT <dbl>, TROMBOSIT <chr>, HEMOGLOBIN <dbl>, KODE_TROMBOSIT <dbl>,
## #   KODE_LAMA_RAWAT <dbl>, KODE_PENYERTA <dbl>, USIA_KATEGORI <chr>

str(data_tb)

## tibble [322 × 15] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL_MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS_KELAMIN    : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
##  $ UMUR_TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN_KELUAR   : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
##  $ PENYAKIT         : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LAMA_RAWAT_INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT_PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ LEOKOSIT         : num [1:322] 0 0 0 1 1 NA 1 1 0 1 ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : num [1:322] 1 0 1 1 1 NA 1 1 NA 0 ...
##  $ KODE_TROMBOSIT   : num [1:322] 0 0 1 1 0 NA 1 1 1 0 ...
##  $ KODE_LAMA_RAWAT  : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
##  $ KODE_PENYERTA    : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ USIA_KATEGORI    : chr [1:322] ">45" "<=45" ">45" "<=45" ...

data_tb[data_tb == "-"] <- NA
nrow(data_tb)

## [1] 322

nrow(data_no_na)

## [1] 282

data_no_na <- na.omit(data_tb)

X <- data_no_na[,c(
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]

X <- data.frame(lapply(X, function(x) as.numeric(as.factor(x))))
str(X)

## 'data.frame':    282 obs. of  8 variables:
##  $ JENIS_KELAMIN  : num  2 2 2 2 2 1 1 2 2 2 ...
##  $ USIA_KATEGORI  : num  2 1 2 1 2 2 1 2 2 2 ...
##  $ LEOKOSIT       : num  1 1 1 2 2 2 2 2 1 2 ...
##  $ KODE_TROMBOSIT : num  1 1 2 2 1 2 2 1 2 1 ...
##  $ HEMOGLOBIN     : num  2 1 2 2 2 2 2 1 2 1 ...
##  $ KODE_PENYERTA  : num  1 1 1 1 2 1 1 1 2 1 ...
##  $ KODE_LAMA_RAWAT: num  1 1 1 1 1 1 2 1 1 1 ...
##  $ KEADAAN_KELUAR : num  2 1 1 1 1 1 1 1 1 1 ...

View(data_no_na)

# Modelling table: the target (PENYAKIT) plus the eight predictors. It is
# taken from data_tb, so rows with missing values are still included.
# NOTE(review): LEOKOSIT, KODE_TROMBOSIT, HEMOGLOBIN, KODE_PENYERTA and
# KODE_LAMA_RAWAT are numeric 0/1 columns at this point, and the printed
# naiveBayes() models summarise them with two numeric columns per class
# (apparently mean and standard deviation) rather than category
# probabilities — confirm whether they should be converted to factors first.
data_model <- data_tb[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]

# Make sure the target is a factor (it was already recoded to one earlier)
data_model$PENYAKIT <- as.factor(data_model$PENYAKIT)

View(data_model)

Naive Bayes dengan Imbalanced Data FULL

library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.4.3

## Loading required package: lattice

# Fix the RNG seed so the partition below is reproducible
set.seed(1001)

# Draw 80% of the rows for training, sampling on PENYAKIT;
# list = FALSE returns the row indices as a matrix instead of a list
train_index <- createDataPartition(data_model$PENYAKIT, p = 0.80, list = FALSE)

trainData <- data_model[train_index, ]
testData  <- data_model[-train_index, ]

# Number of training rows
n <- nrow(trainData)
n

## [1] 258

# Number of test rows (n is overwritten here)
n <- nrow(testData)
n

## [1] 64

trainData %>% count(PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU      212
## 2 TB_PLEURA     29
## 3 TB_LAINNYA    17

testData %>% count(PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       53
## 2 TB_PLEURA      7
## 3 TB_LAINNYA     4

library(e1071)

## Warning: package 'e1071' was built under R version 4.4.3

## 
## Attaching package: 'e1071'

## The following object is masked from 'package:ggplot2':
## 
##     element

NBClassifier <- naiveBayes(PENYAKIT ~ ., data = trainData)

NBClassifier

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.82170543 0.11240310 0.06589147 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3301887 0.6698113
##   TB_PLEURA  0.2758621 0.7241379
##   TB_LAINNYA 0.6470588 0.3529412
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3160377 0.6839623
##   TB_PLEURA  0.3103448 0.6896552
##   TB_LAINNYA 0.7647059 0.2352941
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5671642 0.4967056
##   TB_PLEURA  0.6428571 0.4879500
##   TB_LAINNYA 0.1250000 0.3415650
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2985075 0.4587459
##   TB_PLEURA  0.4642857 0.5078745
##   TB_LAINNYA 0.1428571 0.3631365
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5591398 0.4978302
##   TB_PLEURA  0.7307692 0.4523443
##   TB_LAINNYA 0.4545455 0.5222330
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2028302 0.4030588
##   TB_PLEURA  0.1379310 0.3509312
##   TB_LAINNYA 0.4117647 0.5072997
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15566038 0.3633911
##   TB_PLEURA  0.06896552 0.2578807
##   TB_LAINNYA 0.17647059 0.3929526
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90566038 0.09433962
##   TB_PLEURA  0.86206897 0.13793103
##   TB_LAINNYA 0.94117647 0.05882353

# Predict the TB type of every test row with the model fitted on the
# unbalanced training data
testData$predicted <- predict(NBClassifier, testData)

# Observed class, kept next to the prediction for comparison
testData$actual <- testData$PENYAKIT

library(caret)

# Pass the factors as they are. Re-wrapping them in factor() drops every
# class that does not occur in the vector (here no row is predicted as
# TB_PLEURA), which leaves `data` and `reference` with different level sets
# and makes confusionMatrix() warn about mismatched levels.
confusionMatrix(
  data = testData$predicted,
  reference = testData$actual
)

## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length

## Warning in confusionMatrix.default(factor(testData$predicted),
## factor(testData$actual)): Levels are not in the same order for reference and
## data. Refactoring data to match.

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         52         7          4
##   TB_PLEURA        0         0          0
##   TB_LAINNYA       1         0          0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8125          
##                  95% CI : (0.6954, 0.8992)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 0.6998          
##                                           
##                   Kappa : -0.0199         
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.9811           0.0000           0.00000
## Specificity                  0.0000           1.0000           0.98333
## Pos Pred Value               0.8254              NaN           0.00000
## Neg Pred Value               0.0000           0.8906           0.93651
## Prevalence                   0.8281           0.1094           0.06250
## Detection Rate               0.8125           0.0000           0.00000
## Detection Prevalence         0.9844           0.0000           0.01562
## Balanced Accuracy            0.4906           0.5000           0.49167

Naive Bayes dengan Balanced data (Undersampling) FULL DATA

# Reproducible random undersampling
set.seed(1001)

# Undersample the training set with caret::downSample(). yname names the
# outcome column PENYAKIT directly instead of renaming the default "Class"
# column afterwards.
down_train <- downSample(
  x = trainData[, setdiff(colnames(trainData), "PENYAKIT")],
  y = trainData$PENYAKIT,
  yname = "PENYAKIT"
)

# Class counts after undersampling
table(down_train$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##         17         17         17

down_train %>%
  count(PENYAKIT)

##     PENYAKIT  n
## 1    TB_PARU 17
## 2  TB_PLEURA 17
## 3 TB_LAINNYA 17

library(e1071)

NBClassifier3 <- naiveBayes(PENYAKIT ~ ., data = down_train)

NBClassifier3

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.4705882 0.5294118
##   TB_PLEURA  0.2941176 0.7058824
##   TB_LAINNYA 0.6470588 0.3529412
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.2352941 0.7647059
##   TB_PLEURA  0.2352941 0.7647059
##   TB_LAINNYA 0.7647059 0.2352941
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.6470588 0.4925922
##   TB_PLEURA  0.6875000 0.4787136
##   TB_LAINNYA 0.1250000 0.3415650
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.4117647 0.5072997
##   TB_PLEURA  0.5000000 0.5163978
##   TB_LAINNYA 0.1428571 0.3631365
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.6250000 0.5000000
##   TB_PLEURA  0.8125000 0.4031129
##   TB_LAINNYA 0.4545455 0.5222330
## 
##             KODE_PENYERTA
## Y                  [,1]      [,2]
##   TB_PARU    0.05882353 0.2425356
##   TB_PLEURA  0.11764706 0.3321056
##   TB_LAINNYA 0.41176471 0.5072997
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.05882353 0.2425356
##   TB_PLEURA  0.11764706 0.3321056
##   TB_LAINNYA 0.17647059 0.3929526
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.94117647 0.05882353
##   TB_PLEURA  0.82352941 0.17647059
##   TB_LAINNYA 0.94117647 0.05882353

# Predict the test rows with the model trained on the undersampled data
testData$predicted <- predict(NBClassifier3, testData)

# Observed class, kept next to the prediction for comparison
testData$actual <- testData$PENYAKIT

library(caret)

# Pass the factors as they are: wrapping them in factor() would drop any
# class absent from the predictions and give confusionMatrix() mismatched
# level sets.
confusionMatrix(
  data = testData$predicted,
  reference = testData$actual
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         32         3          1
##   TB_PLEURA        9         2          2
##   TB_LAINNYA      12         2          1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5469          
##                  95% CI : (0.4175, 0.6718)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.0889          
##                                           
##  Mcnemar's Test P-Value : 0.0064          
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.6038          0.28571           0.25000
## Specificity                  0.6364          0.80702           0.76667
## Pos Pred Value               0.8889          0.15385           0.06667
## Neg Pred Value               0.2500          0.90196           0.93878
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.5000          0.03125           0.01562
## Detection Prevalence         0.5625          0.20312           0.23438
## Balanced Accuracy            0.6201          0.54637           0.50833

Naive Bayes dengan Balanced data (Oversampling) FULL DATA

# Reproducible random oversampling
set.seed(1001)

# Oversample the training set with caret::upSample(). yname names the
# outcome column PENYAKIT directly instead of renaming the default "Class"
# column afterwards.
up_train <- upSample(
  x = trainData[, setdiff(colnames(trainData), "PENYAKIT")],
  y = trainData$PENYAKIT,
  yname = "PENYAKIT"
)

# Class counts after oversampling
table(up_train$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        212        212        212

up_train %>%
  count(PENYAKIT)

##     PENYAKIT   n
## 1    TB_PARU 212
## 2  TB_PLEURA 212
## 3 TB_LAINNYA 212

library(e1071)
NBClassifier1 <- naiveBayes(PENYAKIT ~ ., data = up_train)

NBClassifier1

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3301887 0.6698113
##   TB_PLEURA  0.2830189 0.7169811
##   TB_LAINNYA 0.5896226 0.4103774
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3160377 0.6839623
##   TB_PLEURA  0.2688679 0.7311321
##   TB_LAINNYA 0.7594340 0.2405660
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5671642 0.4967056
##   TB_PLEURA  0.6600985 0.4748465
##   TB_LAINNYA 0.1831683 0.3877655
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2985075 0.4587459
##   TB_PLEURA  0.4384236 0.4974206
##   TB_LAINNYA 0.1525424 0.3605658
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5591398 0.4978302
##   TB_PLEURA  0.7461140 0.4363651
##   TB_LAINNYA 0.3877551 0.4889040
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2028302 0.4030588
##   TB_PLEURA  0.1273585 0.3341632
##   TB_LAINNYA 0.4245283 0.4954411
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15566038 0.3633911
##   TB_PLEURA  0.06132075 0.2404856
##   TB_LAINNYA 0.20283019 0.4030588
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90566038 0.09433962
##   TB_PLEURA  0.83490566 0.16509434
##   TB_LAINNYA 0.91981132 0.08018868

# Predict the test rows with the model trained on the oversampled data
testData$predicted <- predict(NBClassifier1, testData)

# Observed class, kept next to the prediction for comparison
testData$actual <- testData$PENYAKIT

library(caret)

# Pass the factors as they are: wrapping them in factor() would drop any
# class absent from the predictions and give confusionMatrix() mismatched
# level sets.
confusionMatrix(
  data = testData$predicted,
  reference = testData$actual
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         19         2          2
##   TB_PLEURA       25         3          1
##   TB_LAINNYA       9         2          1
## 
## Overall Statistics
##                                          
##                Accuracy : 0.3594         
##                  95% CI : (0.2432, 0.489)
##     No Information Rate : 0.8281         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 8e-04          
##                                          
##  Mcnemar's Test P-Value : 2.08e-05       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.3585          0.42857           0.25000
## Specificity                  0.6364          0.54386           0.81667
## Pos Pred Value               0.8261          0.10345           0.08333
## Neg Pred Value               0.1707          0.88571           0.94231
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.2969          0.04688           0.01562
## Detection Prevalence         0.3594          0.45312           0.18750
## Balanced Accuracy            0.4974          0.48622           0.53333

data_model_no_na <- data_no_na[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]

data_model_no_na $PENYAKIT <- factor(data_model_no_na $PENYAKIT)

library(caret) 
set.seed(123) 

index <- createDataPartition(data_model_no_na $PENYAKIT, p = 0.8, list = FALSE) 
trainData <- data_model_no_na [index, ] 
testData <- data_model_no_na [-index, ] 

n<-nrow(trainData) 
n

## [1] 227

n<-nrow(testData) 
n

## [1] 55

# sebelum
table(trainData$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        188         27         12

Naive Bayes dengan Balanced data (SMOTE) dengan data tanpa NA

library(themis)

## Warning: package 'themis' was built under R version 4.4.3

## Loading required package: recipes

## Warning: package 'recipes' was built under R version 4.4.3

## 
## Attaching package: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(recipes)
# Load e1071 before naiveBayes() is called in this section
library(e1071)

# Recipe that balances the classes of PENYAKIT with step_smotenc()
rec <- recipe(PENYAKIT ~ ., data = trainData) %>%
  step_smotenc(PENYAKIT)

# Materialise the resampled training set. bake(new_data = NULL) returns the
# processed training data and replaces the superseded juice().
train_smote <- rec %>%
  prep() %>%
  bake(new_data = NULL)

# Fit the model once (it was previously fitted twice with identical
# arguments), with laplace = 1 smoothing.
NB_TB_smote <- naiveBayes(
  PENYAKIT ~ .,
  data = train_smote,
  laplace = 1
)

NB_TB_smote

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3578947 0.6421053
##   TB_PLEURA  0.1631579 0.8368421
##   TB_LAINNYA 0.7263158 0.2736842
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3210526 0.6789474
##   TB_PLEURA  0.1368421 0.8631579
##   TB_LAINNYA 0.5947368 0.4052632
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5478723 0.4990319
##   TB_PLEURA  0.5155320 0.4599931
##   TB_LAINNYA 0.1989285 0.3544782
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2925532 0.4561497
##   TB_PLEURA  0.3486680 0.4514144
##   TB_LAINNYA 0.2760590 0.3896022
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5744681 0.4957436
##   TB_PLEURA  0.8024245 0.3704245
##   TB_LAINNYA 0.5358024 0.4326382
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1755319 0.3814372
##   TB_PLEURA  0.1315456 0.3189518
##   TB_LAINNYA 0.4160202 0.4399221
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15957447 0.3671888
##   TB_PLEURA  0.03961402 0.1645487
##   TB_LAINNYA 0.24698177 0.3770241
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.91052632 0.08947368
##   TB_PLEURA  0.94210526 0.05789474
##   TB_LAINNYA 0.98947368 0.01052632

# Predict the test rows with the model trained on the SMOTE-balanced data
testData$predicted_smote <- predict(NB_TB_smote, testData)

# Observed class, kept next to the prediction for comparison
testData$actual <- testData$PENYAKIT

library(caret)

# Pass the factors as they are: wrapping them in factor() would drop any
# class absent from the predictions and give confusionMatrix() mismatched
# level sets.
confusionMatrix(
  data = testData$predicted_smote,
  reference = testData$actual
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         14         2          1
##   TB_PLEURA       28         4          0
##   TB_LAINNYA       5         0          1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3455          
##                  95% CI : (0.2224, 0.4858)
##     No Information Rate : 0.8545          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0208          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.2979          0.66667           0.50000
## Specificity                  0.6250          0.42857           0.90566
## Pos Pred Value               0.8235          0.12500           0.16667
## Neg Pred Value               0.1316          0.91304           0.97959
## Prevalence                   0.8545          0.10909           0.03636
## Detection Rate               0.2545          0.07273           0.01818
## Detection Prevalence         0.3091          0.58182           0.10909
## Balanced Accuracy            0.4614          0.54762           0.70283

Naive Bayes dengan Balanced data (Weighted resampling) dengan data tanpa NA

library(dplyr)
library(caret)
library(e1071)

data <- data_model_no_na
table(data$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        235         33         14

prop.table(table(data$PENYAKIT))

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.83333333 0.11702128 0.04964539

class_freq <- table(data$PENYAKIT)

class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

class_weight

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##   0.400000   2.848485   6.714286

# Attach to every row the weight of its class (class_weight is indexed by the
# PENYAKIT factor).
# NOTE(review): class_weight is computed from the class frequencies of the
# whole `data`, i.e. before the train/test split below — confirm this is
# intended.
data$weight <- class_weight[data$PENYAKIT]

# Fix the RNG seed so the partition is reproducible
set.seed(123)

# Draw 80% of the rows for training, sampling on PENYAKIT
trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)

trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]

# Predictors and target of the training rows.
# NOTE(review): x_train and y_train do not appear to be used in the visible
# remainder of the script — verify before relying on them.
x_train <- trainData %>% select(-PENYAKIT, -weight)
y_train <- trainData$PENYAKIT

set.seed(123)

# Resample the training rows with replacement, keeping the same number of
# rows and drawing each row with probability proportional to its class weight
train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  )

# Check the class balance of the weighted resample that the model below is
# trained on. (This previously inspected down_train, the undersampled set from
# an earlier section, so the counts shown did not describe train_weighted.)
table(train_weighted$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##         82         72         73

train_weighted %>% count(PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       82
## 2 TB_PLEURA     72
## 3 TB_LAINNYA    73

train_weighted <- train_weighted %>%
  select(-weight)

testData <- testData %>%
  select(-weight)

model_nb <- naiveBayes(PENYAKIT ~ ., data = train_weighted)

model_nb

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3612335  0.3171806  0.3215859 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.4024390 0.5975610
##   TB_PLEURA  0.1944444 0.8055556
##   TB_LAINNYA 0.4657534 0.5342466
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3658537 0.6341463
##   TB_PLEURA  0.2361111 0.7638889
##   TB_LAINNYA 0.6438356 0.3561644
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5243902 0.5024781
##   TB_PLEURA  0.5694444 0.4986288
##   TB_LAINNYA 0.3150685 0.4677580
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2804878 0.4520021
##   TB_PLEURA  0.3194444 0.4695334
##   TB_LAINNYA 0.3972603 0.4927171
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5609756 0.4993220
##   TB_PLEURA  0.8194444 0.3873488
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1707317 0.3785899
##   TB_PLEURA  0.1111111 0.3164751
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.19512195 0.3987333
##   TB_PLEURA  0.09722222 0.2983392
##   TB_LAINNYA 0.39726027 0.4927171
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.92682927 0.07317073
##   TB_PLEURA  0.90277778 0.09722222
##   TB_LAINNYA 0.93150685 0.06849315

prediksi <- predict(model_nb, testData)
confusionMatrix(prediksi, testData$PENYAKIT)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         28         2          0
##   TB_PLEURA       15         3          0
##   TB_LAINNYA       4         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.4591, 0.7298)
##     No Information Rate : 0.8545          
##     P-Value [Acc > NIR] : 0.999999        
##                                           
##                   Kappa : 0.1896          
##                                           
##  Mcnemar's Test P-Value : 0.001868        
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.5957          0.50000           1.00000
## Specificity                  0.7500          0.69388           0.90566
## Pos Pred Value               0.9333          0.16667           0.28571
## Neg Pred Value               0.2400          0.91892           1.00000
## Prevalence                   0.8545          0.10909           0.03636
## Detection Rate               0.5091          0.05455           0.03636
## Detection Prevalence         0.5455          0.32727           0.12727
## Balanced Accuracy            0.6729          0.59694           0.95283

Naive Bayes dengan Balanced data dengan n yang ditentukan sendiri

library(dplyr)
library(e1071)
library(caret)

set.seed(1001)

# Number of rows to draw for each class.
n_sample <- 258

# Resample the training partition to `n_sample` rows per PENYAKIT class.
# Sampling is done with replacement, so any class with fewer than `n_sample`
# rows is oversampled (the `down_` prefix is kept only because later code
# refers to this object by name).
#
# The helper `weight` column, when present, is dropped first: it is computed
# from PENYAKIT itself, so leaving it in would feed the class label to the
# model as a predictor, and predict() would warn about a type mismatch because
# the test partition does not carry that column.
down_train <- trainData %>%
  select(-any_of("weight")) %>%
  group_by(PENYAKIT) %>%
  slice_sample(n = n_sample, replace = TRUE) %>%
  ungroup()

# Check the resulting class distribution.
table(down_train$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        258        258        258

# Rows per class in the resampled training set, as a tibble.
count(down_train, PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU      258
## 2 TB_PLEURA    258
## 3 TB_LAINNYA   258

# Fit naive Bayes for PENYAKIT on every other column of the resampled
# training set (no Laplace smoothing, which is the default).
NB_down <- naiveBayes(PENYAKIT ~ ., data = down_train, laplace = 0)

# Show the class priors and the per-class conditional tables.
NB_down

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3565891 0.6434109
##   TB_PLEURA  0.2209302 0.7790698
##   TB_LAINNYA 0.4767442 0.5232558
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3759690 0.6240310
##   TB_PLEURA  0.2906977 0.7093023
##   TB_LAINNYA 0.6124031 0.3875969
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5155039 0.5007309
##   TB_PLEURA  0.5930233 0.4922253
##   TB_LAINNYA 0.2441860 0.4304386
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2829457 0.4513058
##   TB_PLEURA  0.3914729 0.4890284
##   TB_LAINNYA 0.3139535 0.4649998
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5813953 0.4942892
##   TB_PLEURA  0.7635659 0.4257176
##   TB_LAINNYA 0.5271318 0.5002337
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1782946 0.3835045
##   TB_PLEURA  0.1744186 0.3802066
##   TB_LAINNYA 0.4496124 0.4984215
## 
##             KODE_LAMA_RAWAT
## Y                 [,1]      [,2]
##   TB_PARU    0.2054264 0.4047981
##   TB_PLEURA  0.1162791 0.3211823
##   TB_LAINNYA 0.3255814 0.4695024
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.93023256 0.06976744
##   TB_PLEURA  0.86821705 0.13178295
##   TB_LAINNYA 0.92635659 0.07364341
## 
##             weight
## Y                [,1] [,2]
##   TB_PARU    0.400000    0
##   TB_PLEURA  2.848485    0
##   TB_LAINNYA 6.714286    0

# Store the predicted class of each test row alongside the test data.
testData[["predicted"]] <- predict(NB_down, newdata = testData)

## Warning in predict.naiveBayes(NB_down, testData): Type mismatch between
## training and new data for variable 'weight'. Did you use factors with numeric
## labels for training, and numeric values for new data?

# Keep the observed class next to the stored prediction.
testData$actual <- testData$PENYAKIT

# Compare predictions with the observed classes. Both columns are passed as
# they are: wrapping them in factor() again would drop any class that happens
# to be absent from one of them, leaving the two arguments with different
# level sets. This also matches how the other sections call confusionMatrix().
confusionMatrix(
  data = testData$predicted,
  reference = testData$actual
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         16         2          0
##   TB_PLEURA       24         3          0
##   TB_LAINNYA       7         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3818          
##                  95% CI : (0.2541, 0.5227)
##     No Information Rate : 0.8545          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0636          
##                                           
##  Mcnemar's Test P-Value : 7.088e-06       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.3404          0.50000           1.00000
## Specificity                  0.7500          0.51020           0.84906
## Pos Pred Value               0.8889          0.11111           0.20000
## Neg Pred Value               0.1622          0.89286           1.00000
## Prevalence                   0.8545          0.10909           0.03636
## Detection Rate               0.2909          0.05455           0.03636
## Detection Prevalence         0.3273          0.49091           0.18182
## Balanced Accuracy            0.5452          0.50510           0.92453

Naive Bayes Balanced data dengan Weighted training dengan data no NA

library(dplyr)
library(caret)
library(e1071)

# Work on the data set without missing values.
data <- data_model_no_na

# Number of rows per class of the target variable.
table(data[["PENYAKIT"]])

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        235         33         14

# Share of each class in the data set.
proportions(table(data[["PENYAKIT"]]))

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.83333333 0.11702128 0.04964539

# Rows per class.
class_freq <- table(data[["PENYAKIT"]])

# Inverse-frequency class weights: total rows / (number of classes * rows in
# the class), so rarer classes receive larger weights.
class_weight <- sum(class_freq) / (class_freq * length(class_freq))

class_weight

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##   0.400000   2.848485   6.714286

# Give every row the weight of its class.
data <- mutate(data, weight = class_weight[PENYAKIT])

set.seed(123)

# Split into a training set (p = 0.8, partitioned on PENYAKIT) and a test set
# made of the remaining rows.
trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]

set.seed(123)

# Draw a training set of the same size with replacement, using the class
# weights as sampling weights, then discard the helper column so that it is
# not used as a predictor.
train_weighted <- trainData %>%
  slice_sample(n = nrow(trainData), replace = TRUE, weight_by = weight) %>%
  select(-weight)

# The test set keeps its original rows; only the helper column is removed.
testData <- select(testData, -weight)

# Class distribution after weighted resampling.
count(train_weighted, PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       82
## 2 TB_PLEURA     72
## 3 TB_LAINNYA    73

# Fit naive Bayes for PENYAKIT on every other column of the weighted
# resample, with Laplace smoothing of 1.
model_nb_weight <- naiveBayes(PENYAKIT ~ ., data = train_weighted, laplace = 1)

# Show the class priors and the per-class conditional tables.
model_nb_weight

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3612335  0.3171806  0.3215859 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.4047619 0.5952381
##   TB_PLEURA  0.2027027 0.7972973
##   TB_LAINNYA 0.4666667 0.5333333
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3780488 0.6463415
##   TB_PLEURA  0.2500000 0.7777778
##   TB_LAINNYA 0.6575342 0.3698630
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5243902 0.5024781
##   TB_PLEURA  0.5694444 0.4986288
##   TB_LAINNYA 0.3150685 0.4677580
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2804878 0.4520021
##   TB_PLEURA  0.3194444 0.4695334
##   TB_LAINNYA 0.3972603 0.4927171
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5609756 0.4993220
##   TB_PLEURA  0.8194444 0.3873488
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1707317 0.3785899
##   TB_PLEURA  0.1111111 0.3164751
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.19512195 0.3987333
##   TB_PLEURA  0.09722222 0.2983392
##   TB_LAINNYA 0.39726027 0.4927171
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.91666667 0.08333333
##   TB_PLEURA  0.89189189 0.10810811
##   TB_LAINNYA 0.92000000 0.08000000

# Predict the class of every test row.
prediksi <- predict(model_nb_weight, newdata = testData)

# Evaluate the predictions against the true test labels.
confusionMatrix(data = prediksi, reference = testData[["PENYAKIT"]])

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         28         2          0
##   TB_PLEURA       15         3          0
##   TB_LAINNYA       4         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.4591, 0.7298)
##     No Information Rate : 0.8545          
##     P-Value [Acc > NIR] : 0.999999        
##                                           
##                   Kappa : 0.1896          
##                                           
##  Mcnemar's Test P-Value : 0.001868        
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.5957          0.50000           1.00000
## Specificity                  0.7500          0.69388           0.90566
## Pos Pred Value               0.9333          0.16667           0.28571
## Neg Pred Value               0.2400          0.91892           1.00000
## Prevalence                   0.8545          0.10909           0.03636
## Detection Rate               0.5091          0.05455           0.03636
## Detection Prevalence         0.5455          0.32727           0.12727
## Balanced Accuracy            0.6729          0.59694           0.95283

Naive Bayes Balanced data dengan Weighted training dengan DATA FULL

library(dplyr)
library(caret)
library(e1071)

# Use the full data set (`data_model`), as opposed to the NA-free
# `data_model_no_na` used in the previous section.
data <- data_model

# Number of rows per class of the target variable.
table(data$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        265         36         21

# Share of each class in the data set.
proportions(table(data[["PENYAKIT"]]))

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.82298137 0.11180124 0.06521739

# Rows per class.
class_freq <- table(data[["PENYAKIT"]])

# Inverse-frequency class weights: total rows / (number of classes * rows in
# the class), so rarer classes receive larger weights.
class_weight <- sum(class_freq) / (class_freq * length(class_freq))

class_weight

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.4050314  2.9814815  5.1111111

# Give every row the weight of its class.
data <- mutate(data, weight = class_weight[PENYAKIT])

set.seed(123)

# Split into a training set (p = 0.8, partitioned on PENYAKIT) and a test set
# made of the remaining rows.
trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]

set.seed(123)

# Draw a training set of the same size with replacement, using the class
# weights as sampling weights, then discard the helper column so that it is
# not used as a predictor.
train_weighted <- trainData %>%
  slice_sample(n = nrow(trainData), replace = TRUE, weight_by = weight) %>%
  select(-weight)

# The test set keeps its original rows; only the helper column is removed.
testData <- select(testData, -weight)

# Class distribution after weighted resampling.
count(train_weighted, PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       91
## 2 TB_PLEURA     78
## 3 TB_LAINNYA    89

# Fit naive Bayes for PENYAKIT on every other column of the weighted
# resample, with Laplace smoothing of 1.
model_nb_weight <- naiveBayes(PENYAKIT ~ ., data = train_weighted, laplace = 1)

# Show the class priors and the per-class conditional tables.
model_nb_weight

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3527132  0.3023256  0.3449612 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.2903226 0.7096774
##   TB_PLEURA  0.2750000 0.7250000
##   TB_LAINNYA 0.4505495 0.5494505
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3516484 0.6703297
##   TB_PLEURA  0.3717949 0.6538462
##   TB_LAINNYA 0.6629213 0.3595506
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5647059 0.4987379
##   TB_PLEURA  0.7500000 0.4358899
##   TB_LAINNYA 0.2289157 0.4226889
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.1882353 0.3932198
##   TB_PLEURA  0.4078947 0.4947088
##   TB_LAINNYA 0.2658228 0.4445932
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5443038 0.5012157
##   TB_PLEURA  0.5616438 0.4996193
##   TB_LAINNYA 0.6363636 0.4847319
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2197802 0.4163919
##   TB_PLEURA  0.1923077 0.3966644
##   TB_LAINNYA 0.3707865 0.4857521
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.05494505 0.2291354
##   TB_PLEURA  0.02564103 0.1590850
##   TB_LAINNYA 0.23595506 0.4269999
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.89247312 0.10752688
##   TB_PLEURA  0.93750000 0.06250000
##   TB_LAINNYA 0.93406593 0.06593407

# Predict the class of every test row.
prediksi <- predict(model_nb_weight, newdata = testData)

# Evaluate the predictions against the true test labels.
confusionMatrix(data = prediksi, reference = testData[["PENYAKIT"]])

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         18         4          1
##   TB_PLEURA       20         1          1
##   TB_LAINNYA      15         2          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3281          
##                  95% CI : (0.2159, 0.4569)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0397         
##                                           
##  Mcnemar's Test P-Value : 3.582e-05       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.3396          0.14286           0.50000
## Specificity                  0.5455          0.63158           0.71667
## Pos Pred Value               0.7826          0.04545           0.10526
## Neg Pred Value               0.1463          0.85714           0.95556
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.2812          0.01562           0.03125
## Detection Prevalence         0.3594          0.34375           0.29688
## Balanced Accuracy            0.4425          0.38722           0.60833

coba model full data

samuel aditya pratama

2026-04-14

Naive Bayes dengan Imbalanced Data FULL

Naive Bayes dengan Balanced data (Undersampling) FULL DATA

Naive Bayes dengan Balanced data (Oversampling) FULL DATA

Naive Bayes dengan Balanced data (SMOTE) dengan data tanpa NA

Naive Bayes Balanced data dengan Weighted training dengan data no NA

Naive Bayes dengan Balanced data dengan n yang ditentukan sendiri

Naive Bayes Balanced data dengan Weighted training dengan data no NA

Naive Bayes Balanced data dengan Weighted training dengan DATA FULL