library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data_tb <- read_excel("D:/FINAL TA/SUMBER REFRENSI/TA.xlsx", sheet = 1)

head(data_tb)
## # A tibble: 6 × 11
##      NO `TANGGAL MASUK` `JENIS KELAMIN` `UMUR TAHUN` `KEADAAN KELUAR` PENYAKIT
##   <dbl> <chr>                     <dbl>        <dbl>            <dbl>    <dbl>
## 1     1 18/02/2024                    1          124                1        1
## 2     2 21/02/2024                    1           23                0        1
## 3     3 19/04/2024                    1           62                0        1
## 4     4 20/04/2024                    1           42                0        1
## 5     5 29/04/2024                    1           71                0        1
## 6     6 45478                         1           63                0        1
## # ℹ 5 more variables: `LAMA RAWAT INAP` <dbl>, `PENYAKIT PENYERTA` <chr>,
## #   LEOKOSIT <chr>, TROMBOSIT <chr>, HEMOGLOBIN <chr>
str(data_tb)
## tibble [322 × 11] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS KELAMIN    : num [1:322] 1 1 1 1 1 1 0 0 0 1 ...
##  $ UMUR TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN KELUAR   : num [1:322] 1 0 0 0 0 0 0 0 0 0 ...
##  $ PENYAKIT         : num [1:322] 1 1 1 1 1 1 1 1 1 1 ...
##  $ LAMA RAWAT INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT PENYERTA: chr [1:322] "0" "0" "0" "0" ...
##  $ LEOKOSIT         : chr [1:322] "0" "0" "0" "1" ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : chr [1:322] "1" "0" "1" "1" ...
data_tb <- na.omit(data_tb)

colnames(data_tb) <- c(
  "NO",
  "TANGGAL_MASUK",
  "JENIS_KELAMIN",
  "UMUR_TAHUN",
  "KEADAAN_KELUAR",
  "PENYAKIT",
  "LAMA_RAWAT_INAP",
  "PENYAKIT_PENYERTA",
  "LEOKOSIT",
  "TROMBOSIT",
  "HEMOGLOBIN"
)
data_tb <- data_tb %>%
  mutate(KODE_TROMBOSIT = ifelse(TROMBOSIT >= 150 & TROMBOSIT <= 450, 0, 1))


data_tb <- data_tb %>%
  mutate(KODE_LAMA_RAWAT = ifelse(LAMA_RAWAT_INAP <= 5, 0, 1))

data_tb <- data_tb %>%
  mutate(KODE_PENYERTA = ifelse(PENYAKIT_PENYERTA == 0, 0, 1))

data_tb$KEADAAN_KELUAR <- factor(data_tb$KEADAAN_KELUAR,
                                 levels = c(0,1),
                                 labels = c("HIDUP","MENINGGAL"))

data_tb$PENYAKIT <- factor(data_tb$PENYAKIT,
                           levels = c(0,1,2),
                           labels = c("TB_PARU","TB_PLEURA","TB_LAINNYA"))

data_tb$JENIS_KELAMIN <- factor(data_tb$JENIS_KELAMIN,
                                levels = c(0,1),
                                labels = c("P","L"))
summary(data_tb)
##        NO         TANGGAL_MASUK      JENIS_KELAMIN   UMUR_TAHUN   
##  Min.   :  1.00   Length:322         P:109         Min.   :  1.0  
##  1st Qu.: 81.25   Class :character   L:213         1st Qu.: 41.0  
##  Median :161.50   Mode  :character                 Median : 54.0  
##  Mean   :161.50                                    Mean   : 51.4  
##  3rd Qu.:241.75                                    3rd Qu.: 64.0  
##  Max.   :322.00                                    Max.   :124.0  
##    KEADAAN_KELUAR       PENYAKIT   LAMA_RAWAT_INAP  PENYAKIT_PENYERTA 
##  HIDUP    :291    TB_PARU   :265   Min.   : 1.000   Length:322        
##  MENINGGAL: 31    TB_PLEURA : 36   1st Qu.: 3.000   Class :character  
##                   TB_LAINNYA: 21   Median : 4.000   Mode  :character  
##                                    Mean   : 4.339                     
##                                    3rd Qu.: 5.000                     
##                                    Max.   :15.000                     
##    LEOKOSIT          TROMBOSIT          HEMOGLOBIN        KODE_TROMBOSIT  
##  Length:322         Length:322         Length:322         Min.   :0.0000  
##  Class :character   Class :character   Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.0000  
##                                                           Mean   :0.3292  
##                                                           3rd Qu.:1.0000  
##                                                           Max.   :1.0000  
##  KODE_LAMA_RAWAT  KODE_PENYERTA  
##  Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.000  
##  Mean   :0.1429   Mean   :0.205  
##  3rd Qu.:0.0000   3rd Qu.:0.000  
##  Max.   :1.0000   Max.   :1.000
str(data_tb)
## tibble [322 × 14] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL_MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS_KELAMIN    : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
##  $ UMUR_TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN_KELUAR   : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
##  $ PENYAKIT         : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LAMA_RAWAT_INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT_PENYERTA: chr [1:322] "0" "0" "0" "0" ...
##  $ LEOKOSIT         : chr [1:322] "0" "0" "0" "1" ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : chr [1:322] "1" "0" "1" "1" ...
##  $ KODE_TROMBOSIT   : num [1:322] 0 0 1 1 0 1 1 1 1 0 ...
##  $ KODE_LAMA_RAWAT  : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
##  $ KODE_PENYERTA    : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
colSums(is.na(data_tb))
##                NO     TANGGAL_MASUK     JENIS_KELAMIN        UMUR_TAHUN 
##                 0                 0                 0                 0 
##    KEADAAN_KELUAR          PENYAKIT   LAMA_RAWAT_INAP PENYAKIT_PENYERTA 
##                 0                 0                 0                 0 
##          LEOKOSIT         TROMBOSIT        HEMOGLOBIN    KODE_TROMBOSIT 
##                 0                 0                 0                 0 
##   KODE_LAMA_RAWAT     KODE_PENYERTA 
##                 0                 0
View(data_tb)
# Frekuensi jenis TBC
tb_freq <- table(data_tb$PENYAKIT)

# Persentase
tb_percent <- prop.table(tb_freq) * 100

# Gabungkan
tb_deskriptif <- data.frame(
  Jenis_TBC = names(tb_freq),
  N = as.vector(tb_freq),
  Persen = round(as.vector(tb_percent),2)
)

tb_deskriptif
##    Jenis_TBC   N Persen
## 1    TB_PARU 265  82.30
## 2  TB_PLEURA  36  11.18
## 3 TB_LAINNYA  21   6.52
pie(tb_freq,
    main = "Diagram Lingkaran Jenis TBC",
    col = c("orange","green","yellow"))

data_tb <- data_tb %>%
  mutate(
    USIA_KATEGORI = ifelse(UMUR_TAHUN <= 45, "≤45", ">45")
  )
library(dplyr)

deskriptif <- function(data, var){
  
  freq <- table(data[[var]])
  persen <- prop.table(freq) * 100
  
  hasil <- data.frame(
    Variabel = var,
    Kategori = names(freq),
    N = as.vector(freq),
    Persen = round(as.vector(persen), 2)
  )
  
  return(hasil)
}

d1 <- deskriptif(data_tb, "JENIS_KELAMIN")
d2 <- deskriptif(data_tb, "USIA_KATEGORI")
d3 <- deskriptif(data_tb, "LEOKOSIT")
d4 <- deskriptif(data_tb, "KODE_TROMBOSIT")
d5 <- deskriptif(data_tb, "HEMOGLOBIN")
d6 <- deskriptif(data_tb, "KODE_PENYERTA")
d7 <- deskriptif(data_tb, "KODE_LAMA_RAWAT")
d8 <- deskriptif(data_tb, "KEADAAN_KELUAR")


tabel_deskriptif <- bind_rows(d1,d2,d3,d4,d5,d6,d7,d8)

tabel_deskriptif
##           Variabel  Kategori   N Persen
## 1    JENIS_KELAMIN         P 109  33.85
## 2    JENIS_KELAMIN         L 213  66.15
## 3    USIA_KATEGORI       >45 213  66.15
## 4    USIA_KATEGORI       ≤45 109  33.85
## 5         LEOKOSIT         -  14   4.35
## 6         LEOKOSIT         0 139  43.17
## 7         LEOKOSIT         1 169  52.48
## 8   KODE_TROMBOSIT         0 216  67.08
## 9   KODE_TROMBOSIT         1 106  32.92
## 10      HEMOGLOBIN         -  39  12.11
## 11      HEMOGLOBIN         0 120  37.27
## 12      HEMOGLOBIN         1 163  50.62
## 13   KODE_PENYERTA         0 256  79.50
## 14   KODE_PENYERTA         1  66  20.50
## 15 KODE_LAMA_RAWAT         0 276  85.71
## 16 KODE_LAMA_RAWAT         1  46  14.29
## 17  KEADAAN_KELUAR     HIDUP 291  90.37
## 18  KEADAAN_KELUAR MENINGGAL  31   9.63
data_no_na <- na.omit(data_tb)
head(data_tb)
## # A tibble: 6 × 15
##      NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN KEADAAN_KELUAR PENYAKIT 
##   <dbl> <chr>         <fct>              <dbl> <fct>          <fct>    
## 1     1 18/02/2024    L                    124 MENINGGAL      TB_PLEURA
## 2     2 21/02/2024    L                     23 HIDUP          TB_PLEURA
## 3     3 19/04/2024    L                     62 HIDUP          TB_PLEURA
## 4     4 20/04/2024    L                     42 HIDUP          TB_PLEURA
## 5     5 29/04/2024    L                     71 HIDUP          TB_PLEURA
## 6     6 45478         L                     63 HIDUP          TB_PLEURA
## # ℹ 9 more variables: LAMA_RAWAT_INAP <dbl>, PENYAKIT_PENYERTA <chr>,
## #   LEOKOSIT <chr>, TROMBOSIT <chr>, HEMOGLOBIN <chr>, KODE_TROMBOSIT <dbl>,
## #   KODE_LAMA_RAWAT <dbl>, KODE_PENYERTA <dbl>, USIA_KATEGORI <chr>
str(data_tb)
## tibble [322 × 15] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL_MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS_KELAMIN    : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
##  $ UMUR_TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN_KELUAR   : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
##  $ PENYAKIT         : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LAMA_RAWAT_INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT_PENYERTA: chr [1:322] "0" "0" "0" "0" ...
##  $ LEOKOSIT         : chr [1:322] "0" "0" "0" "1" ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : chr [1:322] "1" "0" "1" "1" ...
##  $ KODE_TROMBOSIT   : num [1:322] 0 0 1 1 0 1 1 1 1 0 ...
##  $ KODE_LAMA_RAWAT  : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
##  $ KODE_PENYERTA    : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ USIA_KATEGORI    : chr [1:322] ">45" "≤45" ">45" "≤45" ...
data_tb[data_tb == "-"] <- NA
nrow(data_tb)
## [1] 322
nrow(data_no_na)
## [1] 322
data_no_na <- na.omit(data_tb)

X <- data_no_na[,c(
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]

X <- data.frame(lapply(X, function(x) as.numeric(as.factor(x))))
str(X)
## 'data.frame':    281 obs. of  8 variables:
##  $ JENIS_KELAMIN  : num  2 2 2 2 2 1 1 2 2 2 ...
##  $ USIA_KATEGORI  : num  1 2 1 2 1 1 2 1 1 1 ...
##  $ LEOKOSIT       : num  1 1 1 2 2 2 2 2 1 2 ...
##  $ KODE_TROMBOSIT : num  1 1 2 2 1 2 2 1 2 1 ...
##  $ HEMOGLOBIN     : num  2 1 2 2 2 2 2 1 2 1 ...
##  $ KODE_PENYERTA  : num  1 1 1 1 2 1 1 1 2 1 ...
##  $ KODE_LAMA_RAWAT: num  1 1 1 1 1 1 2 1 1 1 ...
##  $ KEADAAN_KELUAR : num  2 1 1 1 1 1 1 1 1 1 ...
View(data_no_na)
data_model <- data_tb[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]

data_model$PENYAKIT <- as.factor(data_model$PENYAKIT)
View(data_model)

Naive Bayes dengan Imbalanced Data FULL

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
set.seed(1001)

train_index <- createDataPartition(data_model$PENYAKIT, p = 0.80, list = FALSE)

trainData <- data_model[train_index, ]
testData  <- data_model[-train_index, ]
# Jumlah data training
n <- nrow(trainData)
n
## [1] 258
# Jumlah data training
n <- nrow(testData)
n
## [1] 64
trainData %>% count(PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU      212
## 2 TB_PLEURA     29
## 3 TB_LAINNYA    17
testData %>% count(PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       53
## 2 TB_PLEURA      7
## 3 TB_LAINNYA     4
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
NBClassifier <- naiveBayes(PENYAKIT ~ ., data = trainData)

NBClassifier
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.82170543 0.11240310 0.06589147 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3301887 0.6698113
##   TB_PLEURA  0.2758621 0.7241379
##   TB_LAINNYA 0.6470588 0.3529412
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6839623 0.3160377
##   TB_PLEURA  0.6896552 0.3103448
##   TB_LAINNYA 0.2352941 0.7647059
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4328358 0.5671642
##   TB_PLEURA  0.3571429 0.6428571
##   TB_LAINNYA 0.8750000 0.1250000
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3349057 0.4730745
##   TB_PLEURA  0.4827586 0.5085476
##   TB_LAINNYA 0.2941176 0.4696682
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4408602 0.5591398
##   TB_PLEURA  0.2692308 0.7307692
##   TB_LAINNYA 0.5454545 0.4545455
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2028302 0.4030588
##   TB_PLEURA  0.1379310 0.3509312
##   TB_LAINNYA 0.4117647 0.5072997
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15566038 0.3633911
##   TB_PLEURA  0.06896552 0.2578807
##   TB_LAINNYA 0.17647059 0.3929526
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90566038 0.09433962
##   TB_PLEURA  0.86206897 0.13793103
##   TB_LAINNYA 0.94117647 0.05882353
# Predict using Naive Bayes
testData$predicted <- predict(NBClassifier, testData)

# data aktual
testData$actual <- testData$PENYAKIT
library(caret)

confusionMatrix(
  factor(testData$predicted),
  factor(testData$actual)
)
## Warning in confusionMatrix.default(factor(testData$predicted),
## factor(testData$actual)): Levels are not in the same order for reference and
## data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         53         7          4
##   TB_PLEURA        0         0          0
##   TB_LAINNYA       0         0          0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8281         
##                  95% CI : (0.7132, 0.911)
##     No Information Rate : 0.8281         
##     P-Value [Acc > NIR] : 0.5796         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  1.0000           0.0000            0.0000
## Specificity                  0.0000           1.0000            1.0000
## Pos Pred Value               0.8281              NaN               NaN
## Neg Pred Value                  NaN           0.8906            0.9375
## Prevalence                   0.8281           0.1094            0.0625
## Detection Rate               0.8281           0.0000            0.0000
## Detection Prevalence         1.0000           0.0000            0.0000
## Balanced Accuracy            0.5000           0.5000            0.5000

Naive Bayes dengan Balanced data (Undersampling) FULL DATA

set.seed(1001)

down_train <- downSample(
  x = trainData[, !colnames(trainData) %in% "PENYAKIT"],
  y = trainData$PENYAKIT
)
names(down_train)[names(down_train) == "Class"] <- "PENYAKIT"
table(down_train$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##         17         17         17
down_train %>%
  count(PENYAKIT)
##     PENYAKIT  n
## 1    TB_PARU 17
## 2  TB_PLEURA 17
## 3 TB_LAINNYA 17
library(e1071)

NBClassifier3 <- naiveBayes(PENYAKIT ~ ., data = down_train)

NBClassifier3
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.4705882 0.5294118
##   TB_PLEURA  0.2941176 0.7058824
##   TB_LAINNYA 0.6470588 0.3529412
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.7647059 0.2352941
##   TB_PLEURA  0.7647059 0.2352941
##   TB_LAINNYA 0.2352941 0.7647059
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.3529412 0.6470588
##   TB_PLEURA  0.3125000 0.6875000
##   TB_LAINNYA 0.8750000 0.1250000
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.4117647 0.5072997
##   TB_PLEURA  0.5294118 0.5144958
##   TB_LAINNYA 0.2941176 0.4696682
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.3750000 0.6250000
##   TB_PLEURA  0.1875000 0.8125000
##   TB_LAINNYA 0.5454545 0.4545455
## 
##             KODE_PENYERTA
## Y                  [,1]      [,2]
##   TB_PARU    0.05882353 0.2425356
##   TB_PLEURA  0.11764706 0.3321056
##   TB_LAINNYA 0.41176471 0.5072997
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.05882353 0.2425356
##   TB_PLEURA  0.11764706 0.3321056
##   TB_LAINNYA 0.17647059 0.3929526
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.94117647 0.05882353
##   TB_PLEURA  0.82352941 0.17647059
##   TB_LAINNYA 0.94117647 0.05882353
testData$predicted <- predict(NBClassifier3, testData)

testData$actual <- testData$PENYAKIT
library(caret)

confusionMatrix(
  factor(testData$predicted),
  factor(testData$actual)
)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         34         3          1
##   TB_PLEURA        7         1          1
##   TB_LAINNYA      12         3          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5781          
##                  95% CI : (0.4482, 0.7006)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 0.999999        
##                                           
##                   Kappa : 0.1143          
##                                           
##  Mcnemar's Test P-Value : 0.007706        
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.6415          0.14286           0.50000
## Specificity                  0.6364          0.85965           0.75000
## Pos Pred Value               0.8947          0.11111           0.11765
## Neg Pred Value               0.2692          0.89091           0.95745
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.5312          0.01562           0.03125
## Detection Prevalence         0.5938          0.14062           0.26562
## Balanced Accuracy            0.6389          0.50125           0.62500

Naive Bayes dengan Balanced data (Oversampling) FULL DATA

set.seed(1001)

up_train <- upSample(
  x = trainData[, !colnames(trainData) %in% "PENYAKIT"],
  y = trainData$PENYAKIT
)
names(up_train)[names(up_train) == "Class"] <- "PENYAKIT"
table(up_train$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        212        212        212
up_train %>%
  count(PENYAKIT)
##     PENYAKIT   n
## 1    TB_PARU 212
## 2  TB_PLEURA 212
## 3 TB_LAINNYA 212
library(e1071)
NBClassifier1 <- naiveBayes(PENYAKIT ~ ., data = up_train)

NBClassifier1
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3301887 0.6698113
##   TB_PLEURA  0.2830189 0.7169811
##   TB_LAINNYA 0.5896226 0.4103774
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6839623 0.3160377
##   TB_PLEURA  0.7311321 0.2688679
##   TB_LAINNYA 0.2405660 0.7594340
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4328358 0.5671642
##   TB_PLEURA  0.3399015 0.6600985
##   TB_LAINNYA 0.8168317 0.1831683
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3349057 0.4730745
##   TB_PLEURA  0.4622642 0.4997540
##   TB_LAINNYA 0.2924528 0.4559658
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4408602 0.5591398
##   TB_PLEURA  0.2538860 0.7461140
##   TB_LAINNYA 0.6122449 0.3877551
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2028302 0.4030588
##   TB_PLEURA  0.1273585 0.3341632
##   TB_LAINNYA 0.4245283 0.4954411
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15566038 0.3633911
##   TB_PLEURA  0.06132075 0.2404856
##   TB_LAINNYA 0.20283019 0.4030588
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90566038 0.09433962
##   TB_PLEURA  0.83490566 0.16509434
##   TB_LAINNYA 0.91981132 0.08018868
testData$predicted <- predict(NBClassifier1, testData)

testData$actual <- testData$PENYAKIT
library(caret)

confusionMatrix(
  factor(testData$predicted),
  factor(testData$actual)
)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         15         2          1
##   TB_PLEURA       30         3          1
##   TB_LAINNYA       8         2          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3125          
##                  95% CI : (0.2024, 0.4406)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.014           
##                                           
##  Mcnemar's Test P-Value : 1.206e-06       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.2830          0.42857           0.50000
## Specificity                  0.7273          0.45614           0.83333
## Pos Pred Value               0.8333          0.08824           0.16667
## Neg Pred Value               0.1739          0.86667           0.96154
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.2344          0.04688           0.03125
## Detection Prevalence         0.2812          0.53125           0.18750
## Balanced Accuracy            0.5051          0.44236           0.66667
data_model_no_na <- data_no_na[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]
data_model_no_na $PENYAKIT <- factor(data_model_no_na $PENYAKIT)
library(caret) 
set.seed(123) 

index <- createDataPartition(data_model_no_na $PENYAKIT, p = 0.8, list = FALSE) 
trainData <- data_model_no_na [index, ] 
testData <- data_model_no_na [-index, ] 

n<-nrow(trainData) 
n 
## [1] 227
n<-nrow(testData) 
n
## [1] 54
# sebelum
table(trainData$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        188         27         12

Naive Bayes dengan Balanced data (SMOTE) dengan data tanpa NA

library(themis)
## Warning: package 'themis' was built under R version 4.4.3
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.4.3
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(recipes)

rec <- recipe(PENYAKIT ~ ., data = trainData) %>%
  step_smotenc(PENYAKIT)

train_smote <- prep(rec) %>%
  juice()

NB_TB_smote <- naiveBayes(
  PENYAKIT ~ ., 
  data = train_smote,
  laplace = 1
)
library(e1071)

NB_TB_smote <- naiveBayes(
  PENYAKIT ~ ., 
  data = train_smote,
  laplace = 1
)

NB_TB_smote
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3421053 0.6578947
##   TB_PLEURA  0.1631579 0.8368421
##   TB_LAINNYA 0.7263158 0.2736842
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6947368 0.3052632
##   TB_PLEURA  0.8631579 0.1368421
##   TB_LAINNYA 0.4052632 0.5947368
## 
##             LEOKOSIT
## Y                     0          1
##   TB_PARU    0.42631579 0.57368421
##   TB_PLEURA  0.56842105 0.43157895
##   TB_LAINNYA 0.97894737 0.02105263
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3138298 0.4652872
##   TB_PLEURA  0.3486680 0.4514144
##   TB_LAINNYA 0.2760590 0.3896022
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4263158 0.5736842
##   TB_PLEURA  0.1947368 0.8052632
##   TB_LAINNYA 0.3473684 0.6526316
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1808511 0.3859225
##   TB_PLEURA  0.1315456 0.3189518
##   TB_LAINNYA 0.4160202 0.4399221
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.14361702 0.3516374
##   TB_PLEURA  0.03961402 0.1645487
##   TB_LAINNYA 0.24698177 0.3770241
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90526316 0.09473684
##   TB_PLEURA  0.94210526 0.05789474
##   TB_LAINNYA 0.98947368 0.01052632
testData$predicted_smote <- predict(NB_TB_smote, testData)

testData$actual <- testData$PENYAKIT
library(caret)

confusionMatrix(
  factor(testData$predicted_smote),
  factor(testData$actual)
)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         12         2          1
##   TB_PLEURA       23         4          0
##   TB_LAINNYA      11         0          1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3148          
##                  95% CI : (0.1952, 0.4555)
##     No Information Rate : 0.8519          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0206          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.2609          0.66667           0.50000
## Specificity                  0.6250          0.52083           0.78846
## Pos Pred Value               0.8000          0.14815           0.08333
## Neg Pred Value               0.1282          0.92593           0.97619
## Prevalence                   0.8519          0.11111           0.03704
## Detection Rate               0.2222          0.07407           0.01852
## Detection Prevalence         0.2778          0.50000           0.22222
## Balanced Accuracy            0.4429          0.59375           0.64423

Naive Bayes Balanced data dengan Weighted training dengan data no NA

library(dplyr)
library(caret)
library(e1071)

data <- data_model_no_na
table(data$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        234         33         14
prop.table(table(data$PENYAKIT))
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.83274021 0.11743772 0.04982206
class_freq <- table(data$PENYAKIT)

class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

class_weight
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.4002849  2.8383838  6.6904762
data$weight <- class_weight[data$PENYAKIT]
set.seed(123)

trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)

trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]
x_train <- trainData %>% select(-PENYAKIT, -weight)
y_train <- trainData$PENYAKIT
set.seed(123)

train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  )
table(down_train$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##         17         17         17
down_train %>% count(PENYAKIT)
##     PENYAKIT  n
## 1    TB_PARU 17
## 2  TB_PLEURA 17
## 3 TB_LAINNYA 17
train_weighted <- train_weighted %>%
  select(-weight)

testData <- testData %>%
  select(-weight)
model_nb <- naiveBayes(PENYAKIT ~ ., data = train_weighted)

model_nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3612335  0.3171806  0.3215859 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3170732 0.6829268
##   TB_PLEURA  0.1944444 0.8055556
##   TB_LAINNYA 0.4657534 0.5342466
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6585366 0.3414634
##   TB_PLEURA  0.7638889 0.2361111
##   TB_LAINNYA 0.3561644 0.6438356
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4024390 0.5975610
##   TB_PLEURA  0.4305556 0.5694444
##   TB_LAINNYA 0.6849315 0.3150685
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3048780 0.4631887
##   TB_PLEURA  0.3194444 0.4695334
##   TB_LAINNYA 0.3972603 0.4927171
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4024390 0.5975610
##   TB_PLEURA  0.1805556 0.8194444
##   TB_LAINNYA 0.5068493 0.4931507
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1341463 0.3429068
##   TB_PLEURA  0.1111111 0.3164751
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.24390244 0.4320773
##   TB_PLEURA  0.09722222 0.2983392
##   TB_LAINNYA 0.39726027 0.4927171
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.92682927 0.07317073
##   TB_PLEURA  0.90277778 0.09722222
##   TB_LAINNYA 0.93150685 0.06849315
prediksi <- predict(model_nb, testData)
confusionMatrix(prediksi, testData$PENYAKIT)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         23         2          0
##   TB_PLEURA       15         3          0
##   TB_LAINNYA       8         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5185          
##                  95% CI : (0.3784, 0.6566)
##     No Information Rate : 0.8519          
##     P-Value [Acc > NIR] : 1.0000000       
##                                           
##                   Kappa : 0.1418          
##                                           
##  Mcnemar's Test P-Value : 0.0002812       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.5000          0.50000           1.00000
## Specificity                  0.7500          0.68750           0.82692
## Pos Pred Value               0.9200          0.16667           0.18182
## Neg Pred Value               0.2069          0.91667           1.00000
## Prevalence                   0.8519          0.11111           0.03704
## Detection Rate               0.4259          0.05556           0.03704
## Detection Prevalence         0.4630          0.33333           0.20370
## Balanced Accuracy            0.6250          0.59375           0.91346

Naive Bayes dengan Balanced data dengan n menentukan sendiri

library(dplyr)
library(e1071)
library(caret)

set.seed(1001)

# jumlah data yang diinginkan per kelas
n_sample <- 258

down_train <- trainData %>%
  group_by(PENYAKIT) %>%
  sample_n(size = n_sample, replace = TRUE) %>%
  ungroup()

# cek distribusi
table(down_train$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        258        258        258
down_train %>% count(PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU      258
## 2 TB_PLEURA    258
## 3 TB_LAINNYA   258
NB_down <- naiveBayes(PENYAKIT ~ ., data = down_train)

NB_down
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3100775 0.6899225
##   TB_PLEURA  0.2209302 0.7790698
##   TB_LAINNYA 0.4767442 0.5232558
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6705426 0.3294574
##   TB_PLEURA  0.7093023 0.2906977
##   TB_LAINNYA 0.3875969 0.6124031
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4457364 0.5542636
##   TB_PLEURA  0.4069767 0.5930233
##   TB_LAINNYA 0.7558140 0.2441860
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3217054 0.4680386
##   TB_PLEURA  0.3914729 0.4890284
##   TB_LAINNYA 0.3139535 0.4649998
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4496124 0.5503876
##   TB_PLEURA  0.2364341 0.7635659
##   TB_LAINNYA 0.4728682 0.5271318
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1976744 0.3990192
##   TB_PLEURA  0.1744186 0.3802066
##   TB_LAINNYA 0.4496124 0.4984215
## 
##             KODE_LAMA_RAWAT
## Y                 [,1]      [,2]
##   TB_PARU    0.1589147 0.3663071
##   TB_PLEURA  0.1162791 0.3211823
##   TB_LAINNYA 0.3255814 0.4695024
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.93023256 0.06976744
##   TB_PLEURA  0.86821705 0.13178295
##   TB_LAINNYA 0.92635659 0.07364341
## 
##             weight
## Y                 [,1] [,2]
##   TB_PARU    0.4002849    0
##   TB_PLEURA  2.8383838    0
##   TB_LAINNYA 6.6904762    0
testData$predicted <- predict(NB_down, testData)
## Warning in predict.naiveBayes(NB_down, testData): Type mismatch between
## training and new data for variable 'weight'. Did you use factors with numeric
## labels for training, and numeric values for new data?
testData$actual <- testData$PENYAKIT

confusionMatrix(
  factor(testData$predicted),
  factor(testData$actual)
)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         19         2          0
##   TB_PLEURA       16         3          0
##   TB_LAINNYA      11         1          2
## 
## Overall Statistics
##                                          
##                Accuracy : 0.4444         
##                  95% CI : (0.3092, 0.586)
##     No Information Rate : 0.8519         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.104          
##                                          
##  Mcnemar's Test P-Value : 4.259e-05      
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.4130          0.50000           1.00000
## Specificity                  0.7500          0.66667           0.76923
## Pos Pred Value               0.9048          0.15789           0.14286
## Neg Pred Value               0.1818          0.91429           1.00000
## Prevalence                   0.8519          0.11111           0.03704
## Detection Rate               0.3519          0.05556           0.03704
## Detection Prevalence         0.3889          0.35185           0.25926
## Balanced Accuracy            0.5815          0.58333           0.88462

Naive Bayes Balanced data dengan Weighted training dengan data no NA

library(dplyr)
library(caret)
library(e1071)

# Gunakan data tanpa missing value
data <- data_model_no_na
table(data$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        234         33         14
prop.table(table(data$PENYAKIT))
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.83274021 0.11743772 0.04982206
class_freq <- table(data$PENYAKIT)

class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

class_weight
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.4002849  2.8383838  6.6904762
data <- data %>%
  mutate(weight = class_weight[PENYAKIT])
set.seed(123)

trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)

trainData <- data[trainIndex, ]
testData  <- data[-trainIndex, ]
set.seed(123)

train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  )
train_weighted <- train_weighted %>%
  select(-weight)

testData <- testData %>%
  select(-weight)
train_weighted %>%
  count(PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       82
## 2 TB_PLEURA     72
## 3 TB_LAINNYA    73
model_nb_weight <- naiveBayes(
  PENYAKIT ~ ., 
  data = train_weighted,
  laplace = 1
)

model_nb_weight
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3612335  0.3171806  0.3215859 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3214286 0.6785714
##   TB_PLEURA  0.2027027 0.7972973
##   TB_LAINNYA 0.4666667 0.5333333
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6707317 0.3536585
##   TB_PLEURA  0.7777778 0.2500000
##   TB_LAINNYA 0.3698630 0.6575342
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4146341 0.6097561
##   TB_PLEURA  0.4444444 0.5833333
##   TB_LAINNYA 0.6986301 0.3287671
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3048780 0.4631887
##   TB_PLEURA  0.3194444 0.4695334
##   TB_LAINNYA 0.3972603 0.4927171
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4146341 0.6097561
##   TB_PLEURA  0.1944444 0.8333333
##   TB_LAINNYA 0.5205479 0.5068493
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1341463 0.3429068
##   TB_PLEURA  0.1111111 0.3164751
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.24390244 0.4320773
##   TB_PLEURA  0.09722222 0.2983392
##   TB_LAINNYA 0.39726027 0.4927171
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.91666667 0.08333333
##   TB_PLEURA  0.89189189 0.10810811
##   TB_LAINNYA 0.92000000 0.08000000
prediksi <- predict(model_nb_weight, testData)
confusionMatrix(prediksi, testData$PENYAKIT)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         20         2          0
##   TB_PLEURA       18         3          0
##   TB_LAINNYA       8         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.463           
##                  95% CI : (0.3262, 0.6039)
##     No Information Rate : 0.8519          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1082          
##                                           
##  Mcnemar's Test P-Value : 7.179e-05       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.4348          0.50000           1.00000
## Specificity                  0.7500          0.62500           0.82692
## Pos Pred Value               0.9091          0.14286           0.18182
## Neg Pred Value               0.1875          0.90909           1.00000
## Prevalence                   0.8519          0.11111           0.03704
## Detection Rate               0.3704          0.05556           0.03704
## Detection Prevalence         0.4074          0.38889           0.20370
## Balanced Accuracy            0.5924          0.56250           0.91346

Naive Bayes Balanced data dengan Weighted training dengan DATA FULL

library(dplyr)
library(caret)
library(e1071)

# Gunakan data tanpa missing value
data <- data_model
table(data$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        265         36         21
prop.table(table(data$PENYAKIT))
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.82298137 0.11180124 0.06521739
class_freq <- table(data$PENYAKIT)

class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

class_weight
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.4050314  2.9814815  5.1111111
data <- data %>%
  mutate(weight = class_weight[PENYAKIT])
set.seed(123)

trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)

trainData <- data[trainIndex, ]
testData  <- data[-trainIndex, ]
set.seed(123)

train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  )
train_weighted <- train_weighted %>%
  select(-weight)

testData <- testData %>%
  select(-weight)
train_weighted %>%
  count(PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       91
## 2 TB_PLEURA     78
## 3 TB_LAINNYA    89
model_nb_weight <- naiveBayes(
  PENYAKIT ~ ., 
  data = train_weighted,
  laplace = 1
)

model_nb_weight
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3527132  0.3023256  0.3449612 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.2903226 0.7096774
##   TB_PLEURA  0.2750000 0.7250000
##   TB_LAINNYA 0.4505495 0.5494505
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6703297 0.3516484
##   TB_PLEURA  0.6538462 0.3717949
##   TB_LAINNYA 0.3595506 0.6629213
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4470588 0.5764706
##   TB_PLEURA  0.2631579 0.7631579
##   TB_LAINNYA 0.7831325 0.2409639
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2417582 0.4305206
##   TB_PLEURA  0.4230769 0.4972452
##   TB_LAINNYA 0.3483146 0.4791357
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4683544 0.5569620
##   TB_PLEURA  0.4520548 0.5753425
##   TB_LAINNYA 0.3787879 0.6515152
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2197802 0.4163919
##   TB_PLEURA  0.1923077 0.3966644
##   TB_LAINNYA 0.3707865 0.4857521
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.05494505 0.2291354
##   TB_PLEURA  0.02564103 0.1590850
##   TB_LAINNYA 0.23595506 0.4269999
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.89247312 0.10752688
##   TB_PLEURA  0.93750000 0.06250000
##   TB_LAINNYA 0.93406593 0.06593407
prediksi <- predict(model_nb_weight, testData)
confusionMatrix(prediksi, testData$PENYAKIT)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         19         4          0
##   TB_PLEURA       20         1          2
##   TB_LAINNYA      14         2          2
## 
## Overall Statistics
##                                          
##                Accuracy : 0.3438         
##                  95% CI : (0.2295, 0.473)
##     No Information Rate : 0.8281         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.0166        
##                                          
##  Mcnemar's Test P-Value : 1.813e-05      
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.3585          0.14286           0.50000
## Specificity                  0.6364          0.61404           0.73333
## Pos Pred Value               0.8261          0.04348           0.11111
## Neg Pred Value               0.1707          0.85366           0.95652
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.2969          0.01562           0.03125
## Detection Prevalence         0.3594          0.35938           0.28125
## Balanced Accuracy            0.4974          0.37845           0.61667