library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data_tb <- read_excel("D:/FINAL TA/SUMBER REFRENSI/TA.xlsx", sheet = 1)

head(data_tb)
## # A tibble: 6 × 11
##      NO `TANGGAL MASUK` `JENIS KELAMIN` `UMUR TAHUN` `KEADAAN KELUAR` PENYAKIT
##   <dbl> <chr>                     <dbl>        <dbl>            <dbl>    <dbl>
## 1     1 18/02/2024                    1          124                1        1
## 2     2 21/02/2024                    1           23                0        1
## 3     3 19/04/2024                    1           62                0        1
## 4     4 20/04/2024                    1           42                0        1
## 5     5 29/04/2024                    1           71                0        1
## 6     6 45478                         1           63                0        1
## # ℹ 5 more variables: `LAMA RAWAT INAP` <dbl>, `PENYAKIT PENYERTA` <dbl>,
## #   LEOKOSIT <dbl>, TROMBOSIT <chr>, HEMOGLOBIN <dbl>
str(data_tb)
## tibble [322 × 11] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS KELAMIN    : num [1:322] 1 1 1 1 1 1 0 0 0 1 ...
##  $ UMUR TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN KELUAR   : num [1:322] 1 0 0 0 0 0 0 0 0 0 ...
##  $ PENYAKIT         : num [1:322] 1 1 1 1 1 1 1 1 1 1 ...
##  $ LAMA RAWAT INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ LEOKOSIT         : num [1:322] 0 0 0 1 1 NA 1 1 0 1 ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : num [1:322] 1 0 1 1 1 NA 1 1 NA 0 ...
colnames(data_tb) <- c(
  "NO",
  "TANGGAL_MASUK",
  "JENIS_KELAMIN",
  "UMUR_TAHUN",
  "KEADAAN_KELUAR",
  "PENYAKIT",
  "LAMA_RAWAT_INAP",
  "PENYAKIT_PENYERTA",
  "LEOKOSIT",
  "TROMBOSIT",
  "HEMOGLOBIN"
)
# Code the platelet count (TROMBOSIT): 0 = inside 150-450, 1 = outside.
# TROMBOSIT is read from Excel as a character column, so comparing it with a
# number directly compares strings lexicographically (e.g. "45" and "1500"
# would both fall "between" "150" and "450"). Convert it to numeric first.
# "-" placeholders are mapped to NA explicitly; any other non-numeric entry
# becomes NA with a coercion warning so it does not go unnoticed.
# NOTE(review): 150-450 is presumably the normal range in 10^3/uL - confirm.
data_tb <- data_tb %>%
  mutate(
    KODE_TROMBOSIT = ifelse(
      between(as.numeric(na_if(trimws(TROMBOSIT), "-")), 150, 450),
      0,
      1
    )
  )


# Derive two binary indicator columns in a single pass:
#   KODE_LAMA_RAWAT - 1 when LAMA_RAWAT_INAP is greater than 5, otherwise 0
#   KODE_PENYERTA   - 1 when PENYAKIT_PENYERTA is non-zero, otherwise 0
# A missing input yields NA in the corresponding indicator.
data_tb <- data_tb %>%
  mutate(
    KODE_LAMA_RAWAT = as.numeric(LAMA_RAWAT_INAP > 5),
    KODE_PENYERTA = as.numeric(PENYAKIT_PENYERTA != 0)
  )

# Replace the numeric codes of the outcome, diagnosis and sex columns with
# labelled factors. A code that is not listed in `levels` becomes NA.
# NOTE(review): the code-to-label mapping is taken as given here - verify it
# against the data dictionary of the Excel sheet.
data_tb <- data_tb %>%
  mutate(
    KEADAAN_KELUAR = factor(
      KEADAAN_KELUAR,
      levels = c(0, 1),
      labels = c("HIDUP", "MENINGGAL")
    ),
    PENYAKIT = factor(
      PENYAKIT,
      levels = c(0, 1, 2),
      labels = c("TB_PARU", "TB_PLEURA", "TB_LAINNYA")
    ),
    JENIS_KELAMIN = factor(
      JENIS_KELAMIN,
      levels = c(0, 1),
      labels = c("P", "L")
    )
  )
summary(data_tb)
##        NO         TANGGAL_MASUK      JENIS_KELAMIN   UMUR_TAHUN   
##  Min.   :  1.00   Length:322         P:109         Min.   :  1.0  
##  1st Qu.: 81.25   Class :character   L:213         1st Qu.: 41.0  
##  Median :161.50   Mode  :character                 Median : 54.0  
##  Mean   :161.50                                    Mean   : 51.4  
##  3rd Qu.:241.75                                    3rd Qu.: 64.0  
##  Max.   :322.00                                    Max.   :124.0  
##                                                                   
##    KEADAAN_KELUAR       PENYAKIT   LAMA_RAWAT_INAP  PENYAKIT_PENYERTA
##  HIDUP    :291    TB_PARU   :265   Min.   : 1.000   Min.   :0.000    
##  MENINGGAL: 31    TB_PLEURA : 36   1st Qu.: 3.000   1st Qu.:0.000    
##                   TB_LAINNYA: 21   Median : 4.000   Median :0.000    
##                                    Mean   : 4.339   Mean   :0.205    
##                                    3rd Qu.: 5.000   3rd Qu.:0.000    
##                                    Max.   :15.000   Max.   :1.000    
##                                                                      
##     LEOKOSIT       TROMBOSIT           HEMOGLOBIN    KODE_TROMBOSIT  
##  Min.   :0.0000   Length:322         Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   Class :character   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :1.0000   Mode  :character   Median :1.000   Median :0.0000  
##  Mean   :0.5487                      Mean   :0.576   Mean   :0.2941  
##  3rd Qu.:1.0000                      3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :1.0000                      Max.   :1.000   Max.   :1.0000  
##  NA's   :14                          NA's   :39      NA's   :16      
##  KODE_LAMA_RAWAT  KODE_PENYERTA  
##  Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.000  
##  Mean   :0.1429   Mean   :0.205  
##  3rd Qu.:0.0000   3rd Qu.:0.000  
##  Max.   :1.0000   Max.   :1.000  
## 
str(data_tb)
## tibble [322 × 14] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL_MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS_KELAMIN    : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
##  $ UMUR_TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN_KELUAR   : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
##  $ PENYAKIT         : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LAMA_RAWAT_INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT_PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ LEOKOSIT         : num [1:322] 0 0 0 1 1 NA 1 1 0 1 ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : num [1:322] 1 0 1 1 1 NA 1 1 NA 0 ...
##  $ KODE_TROMBOSIT   : num [1:322] 0 0 1 1 0 NA 1 1 1 0 ...
##  $ KODE_LAMA_RAWAT  : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
##  $ KODE_PENYERTA    : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
colSums(is.na(data_tb))
##                NO     TANGGAL_MASUK     JENIS_KELAMIN        UMUR_TAHUN 
##                 0                 0                 0                 0 
##    KEADAAN_KELUAR          PENYAKIT   LAMA_RAWAT_INAP PENYAKIT_PENYERTA 
##                 0                 0                 0                 0 
##          LEOKOSIT         TROMBOSIT        HEMOGLOBIN    KODE_TROMBOSIT 
##                14                16                39                16 
##   KODE_LAMA_RAWAT     KODE_PENYERTA 
##                 0                 0
View(data_tb)
# Frequency of each TB type
tb_freq <- table(data_tb$PENYAKIT)

# Share of each TB type, in percent
tb_percent <- 100 * proportions(tb_freq)

# Combine counts and percentages into one summary table and print it
tb_deskriptif <- data.frame(
  Jenis_TBC = names(tb_freq),
  N = as.vector(tb_freq),
  Persen = round(as.vector(tb_percent), digits = 2)
)

tb_deskriptif
##    Jenis_TBC   N Persen
## 1    TB_PARU 265  82.30
## 2  TB_PLEURA  36  11.18
## 3 TB_LAINNYA  21   6.52
pie(tb_freq,
    main = "Diagram Lingkaran Jenis TBC",
    col = c("orange","green","yellow"))

library(dplyr)

# buat kategori usia (hindari simbol aneh)
data_tb <- data_tb %>%
  mutate(
    USIA_KATEGORI = ifelse(UMUR_TAHUN <= 45, "<=45", ">45")
  )

# Frequency table (count and percentage) for one column of a data frame.
#
# Arguments:
#   data  A data frame (or tibble).
#   var   Name of the column to summarise, as a single string.
#
# Returns a data frame with one row per distinct non-missing value and the
# columns Variabel (the column name), Kategori (the value as text), N (count)
# and Persen (percentage of the non-missing values, rounded to 2 decimals).
# Missing values are not counted. A column with no non-missing values gives a
# zero-row data frame with the same four columns.
#
# Stops with an error when `var` is not the name of a single column of `data`.
deskriptif <- function(data, var) {
  if (!is.character(var) || length(var) != 1L || !(var %in% names(data))) {
    stop("`var` must be the name of a single column in `data`.", call. = FALSE)
  }

  # Work on text and strip anything that is not plain ASCII to avoid encoding
  # errors when tabulating; note that this also removes non-ASCII symbols
  # from the category labels.
  x <- iconv(as.character(data[[var]]), "UTF-8", "ASCII", sub = "")

  freq <- table(x)
  persen <- prop.table(freq) * 100

  # rep() keeps every column the same length, so an empty table produces a
  # zero-row result instead of a "differing number of rows" error.
  data.frame(
    Variabel = rep(var, length(freq)),
    Kategori = as.character(names(freq)),
    N = as.vector(freq),
    Persen = round(as.vector(persen), 2)
  )
}

# jalankan
d1 <- deskriptif(data_tb, "JENIS_KELAMIN")
d2 <- deskriptif(data_tb, "USIA_KATEGORI")
d3 <- deskriptif(data_tb, "LEOKOSIT")
d4 <- deskriptif(data_tb, "KODE_TROMBOSIT")
d5 <- deskriptif(data_tb, "HEMOGLOBIN")
d6 <- deskriptif(data_tb, "KODE_PENYERTA")
d7 <- deskriptif(data_tb, "KODE_LAMA_RAWAT")
d8 <- deskriptif(data_tb, "KEADAAN_KELUAR")

tabel_deskriptif <- bind_rows(d1, d2, d3, d4, d5, d6, d7, d8)

tabel_deskriptif
##           Variabel  Kategori   N Persen
## 1    JENIS_KELAMIN         L 213  66.15
## 2    JENIS_KELAMIN         P 109  33.85
## 3    USIA_KATEGORI      <=45 109  33.85
## 4    USIA_KATEGORI       >45 213  66.15
## 5         LEOKOSIT         0 139  45.13
## 6         LEOKOSIT         1 169  54.87
## 7   KODE_TROMBOSIT         0 216  70.59
## 8   KODE_TROMBOSIT         1  90  29.41
## 9       HEMOGLOBIN         0 120  42.40
## 10      HEMOGLOBIN         1 163  57.60
## 11   KODE_PENYERTA         0 256  79.50
## 12   KODE_PENYERTA         1  66  20.50
## 13 KODE_LAMA_RAWAT         0 276  85.71
## 14 KODE_LAMA_RAWAT         1  46  14.29
## 15  KEADAAN_KELUAR     HIDUP 291  90.37
## 16  KEADAAN_KELUAR MENINGGAL  31   9.63
data_no_na <- na.omit(data_tb)
head(data_tb)
## # A tibble: 6 × 15
##      NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN KEADAAN_KELUAR PENYAKIT 
##   <dbl> <chr>         <fct>              <dbl> <fct>          <fct>    
## 1     1 18/02/2024    L                    124 MENINGGAL      TB_PLEURA
## 2     2 21/02/2024    L                     23 HIDUP          TB_PLEURA
## 3     3 19/04/2024    L                     62 HIDUP          TB_PLEURA
## 4     4 20/04/2024    L                     42 HIDUP          TB_PLEURA
## 5     5 29/04/2024    L                     71 HIDUP          TB_PLEURA
## 6     6 45478         L                     63 HIDUP          TB_PLEURA
## # ℹ 9 more variables: LAMA_RAWAT_INAP <dbl>, PENYAKIT_PENYERTA <dbl>,
## #   LEOKOSIT <dbl>, TROMBOSIT <chr>, HEMOGLOBIN <dbl>, KODE_TROMBOSIT <dbl>,
## #   KODE_LAMA_RAWAT <dbl>, KODE_PENYERTA <dbl>, USIA_KATEGORI <chr>
str(data_tb)
## tibble [322 × 15] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL_MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS_KELAMIN    : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
##  $ UMUR_TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN_KELUAR   : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
##  $ PENYAKIT         : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LAMA_RAWAT_INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT_PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ LEOKOSIT         : num [1:322] 0 0 0 1 1 NA 1 1 0 1 ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : num [1:322] 1 0 1 1 1 NA 1 1 NA 0 ...
##  $ KODE_TROMBOSIT   : num [1:322] 0 0 1 1 0 NA 1 1 1 0 ...
##  $ KODE_LAMA_RAWAT  : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
##  $ KODE_PENYERTA    : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ USIA_KATEGORI    : chr [1:322] ">45" "<=45" ">45" "<=45" ...
# Turn "-" placeholders into NA first, then rebuild the complete-case data so
# that the row count printed below describes the same `data_no_na` object the
# following steps work with (not a copy made before the placeholders were
# cleaned).
data_tb[data_tb == "-"] <- NA
data_no_na <- na.omit(data_tb)
nrow(data_tb)
## [1] 322
nrow(data_no_na)
## [1] 282

X <- data_no_na[,c(
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]

X <- data.frame(lapply(X, function(x) as.numeric(as.factor(x))))
str(X)
## 'data.frame':    282 obs. of  8 variables:
##  $ JENIS_KELAMIN  : num  2 2 2 2 2 1 1 2 2 2 ...
##  $ USIA_KATEGORI  : num  2 1 2 1 2 2 1 2 2 2 ...
##  $ LEOKOSIT       : num  1 1 1 2 2 2 2 2 1 2 ...
##  $ KODE_TROMBOSIT : num  1 1 2 2 1 2 2 1 2 1 ...
##  $ HEMOGLOBIN     : num  2 1 2 2 2 2 2 1 2 1 ...
##  $ KODE_PENYERTA  : num  1 1 1 1 2 1 1 1 2 1 ...
##  $ KODE_LAMA_RAWAT: num  1 1 1 1 1 1 2 1 1 1 ...
##  $ KEADAAN_KELUAR : num  2 1 1 1 1 1 1 1 1 1 ...
View(data_no_na)
# Modelling data: the target (PENYAKIT) followed by the eight predictors, in
# this column order. Rows with missing values are kept.
# NOTE(review): LEOKOSIT, KODE_TROMBOSIT, HEMOGLOBIN, KODE_PENYERTA and
# KODE_LAMA_RAWAT stay numeric 0/1 here, so naiveBayes() summarises them by
# mean and standard deviation rather than by category counts - confirm this is
# intended.
data_model <- data_tb %>%
  select(all_of(c(
    "PENYAKIT",
    "JENIS_KELAMIN",
    "USIA_KATEGORI",
    "LEOKOSIT",
    "KODE_TROMBOSIT",
    "HEMOGLOBIN",
    "KODE_PENYERTA",
    "KODE_LAMA_RAWAT",
    "KEADAAN_KELUAR"
  ))) %>%
  mutate(PENYAKIT = as.factor(PENYAKIT))

View(data_model)

Naive Bayes dengan Imbalanced Data FULL

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
set.seed(1001)

train_index <- createDataPartition(data_model$PENYAKIT, p = 0.80, list = FALSE)

trainData <- data_model[train_index, ]
testData  <- data_model[-train_index, ]
# Number of rows in the training set
n <- nrow(trainData)
n
## [1] 258
# Number of rows in the test set (note: `n` is overwritten here)
n <- nrow(testData)
n
## [1] 64
trainData %>% count(PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU      212
## 2 TB_PLEURA     29
## 3 TB_LAINNYA    17
testData %>% count(PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       53
## 2 TB_PLEURA      7
## 3 TB_LAINNYA     4
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
NBClassifier <- naiveBayes(PENYAKIT ~ ., data = trainData)

NBClassifier
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.82170543 0.11240310 0.06589147 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3301887 0.6698113
##   TB_PLEURA  0.2758621 0.7241379
##   TB_LAINNYA 0.6470588 0.3529412
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3160377 0.6839623
##   TB_PLEURA  0.3103448 0.6896552
##   TB_LAINNYA 0.7647059 0.2352941
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5671642 0.4967056
##   TB_PLEURA  0.6428571 0.4879500
##   TB_LAINNYA 0.1250000 0.3415650
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2985075 0.4587459
##   TB_PLEURA  0.4642857 0.5078745
##   TB_LAINNYA 0.1428571 0.3631365
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5591398 0.4978302
##   TB_PLEURA  0.7307692 0.4523443
##   TB_LAINNYA 0.4545455 0.5222330
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2028302 0.4030588
##   TB_PLEURA  0.1379310 0.3509312
##   TB_LAINNYA 0.4117647 0.5072997
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15566038 0.3633911
##   TB_PLEURA  0.06896552 0.2578807
##   TB_LAINNYA 0.17647059 0.3929526
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90566038 0.09433962
##   TB_PLEURA  0.86206897 0.13793103
##   TB_LAINNYA 0.94117647 0.05882353
# Predict using Naive Bayes
testData$predicted <- predict(NBClassifier, testData)

# data aktual
testData$actual <- testData$PENYAKIT
library(caret)

# Compare predictions with the true classes. Both columns are passed as they
# are: wrapping them in factor() again drops every class that does not occur
# (e.g. a class that was never predicted), so the two inputs end up with
# different level sets and caret has to re-level them with a warning.
confusionMatrix(
  data = testData$predicted,
  reference = testData$actual
)
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(factor(testData$predicted),
## factor(testData$actual)): Levels are not in the same order for reference and
## data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         52         7          4
##   TB_PLEURA        0         0          0
##   TB_LAINNYA       1         0          0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8125          
##                  95% CI : (0.6954, 0.8992)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 0.6998          
##                                           
##                   Kappa : -0.0199         
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.9811           0.0000           0.00000
## Specificity                  0.0000           1.0000           0.98333
## Pos Pred Value               0.8254              NaN           0.00000
## Neg Pred Value               0.0000           0.8906           0.93651
## Prevalence                   0.8281           0.1094           0.06250
## Detection Rate               0.8125           0.0000           0.00000
## Detection Prevalence         0.9844           0.0000           0.01562
## Balanced Accuracy            0.4906           0.5000           0.49167

Naive Bayes dengan Balanced data (Undersampling) FULL DATA

set.seed(1001)

down_train <- downSample(
  x = trainData[, !colnames(trainData) %in% "PENYAKIT"],
  y = trainData$PENYAKIT
)
names(down_train)[names(down_train) == "Class"] <- "PENYAKIT"
table(down_train$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##         17         17         17
down_train %>%
  count(PENYAKIT)
##     PENYAKIT  n
## 1    TB_PARU 17
## 2  TB_PLEURA 17
## 3 TB_LAINNYA 17
library(e1071)

NBClassifier3 <- naiveBayes(PENYAKIT ~ ., data = down_train)

NBClassifier3
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.4705882 0.5294118
##   TB_PLEURA  0.2941176 0.7058824
##   TB_LAINNYA 0.6470588 0.3529412
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.2352941 0.7647059
##   TB_PLEURA  0.2352941 0.7647059
##   TB_LAINNYA 0.7647059 0.2352941
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.6470588 0.4925922
##   TB_PLEURA  0.6875000 0.4787136
##   TB_LAINNYA 0.1250000 0.3415650
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.4117647 0.5072997
##   TB_PLEURA  0.5000000 0.5163978
##   TB_LAINNYA 0.1428571 0.3631365
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.6250000 0.5000000
##   TB_PLEURA  0.8125000 0.4031129
##   TB_LAINNYA 0.4545455 0.5222330
## 
##             KODE_PENYERTA
## Y                  [,1]      [,2]
##   TB_PARU    0.05882353 0.2425356
##   TB_PLEURA  0.11764706 0.3321056
##   TB_LAINNYA 0.41176471 0.5072997
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.05882353 0.2425356
##   TB_PLEURA  0.11764706 0.3321056
##   TB_LAINNYA 0.17647059 0.3929526
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.94117647 0.05882353
##   TB_PLEURA  0.82352941 0.17647059
##   TB_LAINNYA 0.94117647 0.05882353
testData$predicted <- predict(NBClassifier3, testData)

testData$actual <- testData$PENYAKIT
library(caret)

# Evaluate the undersampling model. The factors are passed unchanged so that
# classes which happen not to occur keep their level; re-wrapping in factor()
# would drop them and make the level sets of the two inputs differ.
confusionMatrix(
  data = testData$predicted,
  reference = testData$actual
)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         32         3          1
##   TB_PLEURA        9         2          2
##   TB_LAINNYA      12         2          1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5469          
##                  95% CI : (0.4175, 0.6718)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.0889          
##                                           
##  Mcnemar's Test P-Value : 0.0064          
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.6038          0.28571           0.25000
## Specificity                  0.6364          0.80702           0.76667
## Pos Pred Value               0.8889          0.15385           0.06667
## Neg Pred Value               0.2500          0.90196           0.93878
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.5000          0.03125           0.01562
## Detection Prevalence         0.5625          0.20312           0.23438
## Balanced Accuracy            0.6201          0.54637           0.50833

Naive Bayes dengan Balanced data (Oversampling) FULL DATA

set.seed(1001)

up_train <- upSample(
  x = trainData[, !colnames(trainData) %in% "PENYAKIT"],
  y = trainData$PENYAKIT
)
names(up_train)[names(up_train) == "Class"] <- "PENYAKIT"
table(up_train$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        212        212        212
up_train %>%
  count(PENYAKIT)
##     PENYAKIT   n
## 1    TB_PARU 212
## 2  TB_PLEURA 212
## 3 TB_LAINNYA 212
library(e1071)
NBClassifier1 <- naiveBayes(PENYAKIT ~ ., data = up_train)

NBClassifier1
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3301887 0.6698113
##   TB_PLEURA  0.2830189 0.7169811
##   TB_LAINNYA 0.5896226 0.4103774
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3160377 0.6839623
##   TB_PLEURA  0.2688679 0.7311321
##   TB_LAINNYA 0.7594340 0.2405660
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5671642 0.4967056
##   TB_PLEURA  0.6600985 0.4748465
##   TB_LAINNYA 0.1831683 0.3877655
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2985075 0.4587459
##   TB_PLEURA  0.4384236 0.4974206
##   TB_LAINNYA 0.1525424 0.3605658
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5591398 0.4978302
##   TB_PLEURA  0.7461140 0.4363651
##   TB_LAINNYA 0.3877551 0.4889040
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2028302 0.4030588
##   TB_PLEURA  0.1273585 0.3341632
##   TB_LAINNYA 0.4245283 0.4954411
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15566038 0.3633911
##   TB_PLEURA  0.06132075 0.2404856
##   TB_LAINNYA 0.20283019 0.4030588
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90566038 0.09433962
##   TB_PLEURA  0.83490566 0.16509434
##   TB_LAINNYA 0.91981132 0.08018868
testData$predicted <- predict(NBClassifier1, testData)

testData$actual <- testData$PENYAKIT
library(caret)

# Evaluate the oversampling model. The factors are passed unchanged so that
# classes which happen not to occur keep their level; re-wrapping in factor()
# would drop them and make the level sets of the two inputs differ.
confusionMatrix(
  data = testData$predicted,
  reference = testData$actual
)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         19         2          2
##   TB_PLEURA       25         3          1
##   TB_LAINNYA       9         2          1
## 
## Overall Statistics
##                                          
##                Accuracy : 0.3594         
##                  95% CI : (0.2432, 0.489)
##     No Information Rate : 0.8281         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 8e-04          
##                                          
##  Mcnemar's Test P-Value : 2.08e-05       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.3585          0.42857           0.25000
## Specificity                  0.6364          0.54386           0.81667
## Pos Pred Value               0.8261          0.10345           0.08333
## Neg Pred Value               0.1707          0.88571           0.94231
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.2969          0.04688           0.01562
## Detection Prevalence         0.3594          0.45312           0.18750
## Balanced Accuracy            0.4974          0.48622           0.53333
data_model_no_na <- data_no_na[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]
data_model_no_na $PENYAKIT <- factor(data_model_no_na $PENYAKIT)
library(caret) 
set.seed(123) 

index <- createDataPartition(data_model_no_na $PENYAKIT, p = 0.8, list = FALSE) 
trainData <- data_model_no_na [index, ] 
testData <- data_model_no_na [-index, ] 

n<-nrow(trainData) 
n 
## [1] 227
n<-nrow(testData) 
n
## [1] 55
# sebelum
table(trainData$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        188         27         12

Naive Bayes dengan Balanced data (SMOTE) dengan data tanpa NA

library(themis)
## Warning: package 'themis' was built under R version 4.4.3
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.4.3
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(recipes)

# Balance the training data with SMOTE-NC, using PENYAKIT as the class column.
rec <- recipe(PENYAKIT ~ ., data = trainData) %>%
  step_smotenc(PENYAKIT)

# bake(new_data = NULL) returns the processed training set; it replaces the
# superseded juice().
train_smote <- prep(rec) %>%
  bake(new_data = NULL)

library(e1071)

# Fit Naive Bayes once on the balanced data (the model was previously fitted
# twice with identical arguments). laplace = 1 applies add-one smoothing to
# the categorical predictors.
NB_TB_smote <- naiveBayes(
  PENYAKIT ~ .,
  data = train_smote,
  laplace = 1
)

NB_TB_smote
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3578947 0.6421053
##   TB_PLEURA  0.1631579 0.8368421
##   TB_LAINNYA 0.7263158 0.2736842
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3210526 0.6789474
##   TB_PLEURA  0.1368421 0.8631579
##   TB_LAINNYA 0.5947368 0.4052632
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5478723 0.4990319
##   TB_PLEURA  0.5155320 0.4599931
##   TB_LAINNYA 0.1989285 0.3544782
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2925532 0.4561497
##   TB_PLEURA  0.3486680 0.4514144
##   TB_LAINNYA 0.2760590 0.3896022
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5744681 0.4957436
##   TB_PLEURA  0.8024245 0.3704245
##   TB_LAINNYA 0.5358024 0.4326382
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1755319 0.3814372
##   TB_PLEURA  0.1315456 0.3189518
##   TB_LAINNYA 0.4160202 0.4399221
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15957447 0.3671888
##   TB_PLEURA  0.03961402 0.1645487
##   TB_LAINNYA 0.24698177 0.3770241
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.91052632 0.08947368
##   TB_PLEURA  0.94210526 0.05789474
##   TB_LAINNYA 0.98947368 0.01052632
testData$predicted_smote <- predict(NB_TB_smote, testData)

testData$actual <- testData$PENYAKIT
library(caret)

# Evaluate the SMOTE model. The factors are passed unchanged so that classes
# which happen not to occur keep their level; re-wrapping in factor() would
# drop them and make the level sets of the two inputs differ.
confusionMatrix(
  data = testData$predicted_smote,
  reference = testData$actual
)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         14         2          1
##   TB_PLEURA       28         4          0
##   TB_LAINNYA       5         0          1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3455          
##                  95% CI : (0.2224, 0.4858)
##     No Information Rate : 0.8545          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0208          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.2979          0.66667           0.50000
## Specificity                  0.6250          0.42857           0.90566
## Pos Pred Value               0.8235          0.12500           0.16667
## Neg Pred Value               0.1316          0.91304           0.97959
## Prevalence                   0.8545          0.10909           0.03636
## Detection Rate               0.2545          0.07273           0.01818
## Detection Prevalence         0.3091          0.58182           0.10909
## Balanced Accuracy            0.4614          0.54762           0.70283

Naive Bayes Balanced data dengan Weighted training dengan data no NA

library(dplyr)
library(caret)
library(e1071)

data <- data_model_no_na
table(data$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        235         33         14
prop.table(table(data$PENYAKIT))
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.83333333 0.11702128 0.04964539
class_freq <- table(data$PENYAKIT)

class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

class_weight
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##   0.400000   2.848485   6.714286
data$weight <- class_weight[data$PENYAKIT]
set.seed(123)

trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)

trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]
x_train <- trainData %>% select(-PENYAKIT, -weight)
y_train <- trainData$PENYAKIT
set.seed(123)

train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  )
# Check the class balance of the weighted resample (`train_weighted`), which
# is the data the model below is trained on. Inspecting `down_train` here
# would only show the earlier undersampling result again.
table(train_weighted$PENYAKIT)
train_weighted %>% count(PENYAKIT)
train_weighted <- train_weighted %>%
  select(-weight)

testData <- testData %>%
  select(-weight)
model_nb <- naiveBayes(PENYAKIT ~ ., data = train_weighted)

model_nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3612335  0.3171806  0.3215859 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.4024390 0.5975610
##   TB_PLEURA  0.1944444 0.8055556
##   TB_LAINNYA 0.4657534 0.5342466
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3658537 0.6341463
##   TB_PLEURA  0.2361111 0.7638889
##   TB_LAINNYA 0.6438356 0.3561644
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5243902 0.5024781
##   TB_PLEURA  0.5694444 0.4986288
##   TB_LAINNYA 0.3150685 0.4677580
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2804878 0.4520021
##   TB_PLEURA  0.3194444 0.4695334
##   TB_LAINNYA 0.3972603 0.4927171
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5609756 0.4993220
##   TB_PLEURA  0.8194444 0.3873488
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1707317 0.3785899
##   TB_PLEURA  0.1111111 0.3164751
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.19512195 0.3987333
##   TB_PLEURA  0.09722222 0.2983392
##   TB_LAINNYA 0.39726027 0.4927171
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.92682927 0.07317073
##   TB_PLEURA  0.90277778 0.09722222
##   TB_LAINNYA 0.93150685 0.06849315
prediksi <- predict(model_nb, testData)
confusionMatrix(prediksi, testData$PENYAKIT)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         28         2          0
##   TB_PLEURA       15         3          0
##   TB_LAINNYA       4         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.4591, 0.7298)
##     No Information Rate : 0.8545          
##     P-Value [Acc > NIR] : 0.999999        
##                                           
##                   Kappa : 0.1896          
##                                           
##  Mcnemar's Test P-Value : 0.001868        
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.5957          0.50000           1.00000
## Specificity                  0.7500          0.69388           0.90566
## Pos Pred Value               0.9333          0.16667           0.28571
## Neg Pred Value               0.2400          0.91892           1.00000
## Prevalence                   0.8545          0.10909           0.03636
## Detection Rate               0.5091          0.05455           0.03636
## Detection Prevalence         0.5455          0.32727           0.12727
## Balanced Accuracy            0.6729          0.59694           0.95283

Naive Bayes dengan data seimbang: jumlah sampel per kelas (n) ditentukan sendiri

library(dplyr)
library(e1071)
library(caret)

set.seed(1001)

# Target number of rows per class after resampling.
n_sample <- 258

# Build a class-balanced training set by drawing `n_sample` rows per class
# WITH replacement, so classes with fewer than `n_sample` rows are
# oversampled (despite its name, `down_train` is not a pure down-sample).
#
# The helper column `weight` is dropped first: it takes a single value per
# class, so leaving it in would make `PENYAKIT ~ .` learn it as a predictor,
# and predict() would warn because testData no longer has the column.
# any_of() keeps this step safe when `weight` is already absent.
down_train <- trainData %>%
  select(-any_of("weight")) %>%
  group_by(PENYAKIT) %>%
  sample_n(size = n_sample, replace = TRUE) %>%
  ungroup()

# Check the class distribution of the balanced set.
table(down_train$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        258        258        258
# Same class-distribution check, as a tibble.
count(down_train, PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU      258
## 2 TB_PLEURA    258
## 3 TB_LAINNYA   258
# Fit Naive Bayes on the class-balanced sample, using every other column of
# `down_train` as a predictor of PENYAKIT.
NB_down <- naiveBayes(
  PENYAKIT ~ .,
  data = down_train
)

# Display the priors and conditional distributions.
NB_down
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3565891 0.6434109
##   TB_PLEURA  0.2209302 0.7790698
##   TB_LAINNYA 0.4767442 0.5232558
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3759690 0.6240310
##   TB_PLEURA  0.2906977 0.7093023
##   TB_LAINNYA 0.6124031 0.3875969
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5155039 0.5007309
##   TB_PLEURA  0.5930233 0.4922253
##   TB_LAINNYA 0.2441860 0.4304386
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2829457 0.4513058
##   TB_PLEURA  0.3914729 0.4890284
##   TB_LAINNYA 0.3139535 0.4649998
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5813953 0.4942892
##   TB_PLEURA  0.7635659 0.4257176
##   TB_LAINNYA 0.5271318 0.5002337
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1782946 0.3835045
##   TB_PLEURA  0.1744186 0.3802066
##   TB_LAINNYA 0.4496124 0.4984215
## 
##             KODE_LAMA_RAWAT
## Y                 [,1]      [,2]
##   TB_PARU    0.2054264 0.4047981
##   TB_PLEURA  0.1162791 0.3211823
##   TB_LAINNYA 0.3255814 0.4695024
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.93023256 0.06976744
##   TB_PLEURA  0.86821705 0.13178295
##   TB_LAINNYA 0.92635659 0.07364341
## 
##             weight
## Y                [,1] [,2]
##   TB_PARU    0.400000    0
##   TB_PLEURA  2.848485    0
##   TB_LAINNYA 6.714286    0
# Store the predicted class of each hold-out row alongside the data.
testData[["predicted"]] <- predict(NB_down, testData)
## Warning in predict.naiveBayes(NB_down, testData): Type mismatch between
## training and new data for variable 'weight'. Did you use factors with numeric
## labels for training, and numeric values for new data?
# Keep the true label next to the prediction for side-by-side inspection.
testData$actual <- testData$PENYAKIT

# Compare predictions with the truth. Both columns are already factors, so
# they are passed as-is: re-wrapping them in factor() would drop any class
# that happens to be absent from one side, leaving the two level sets
# different and making confusionMatrix() fail or re-factor the data.
confusionMatrix(
  data = testData$predicted,
  reference = testData$actual
)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         16         2          0
##   TB_PLEURA       24         3          0
##   TB_LAINNYA       7         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3818          
##                  95% CI : (0.2541, 0.5227)
##     No Information Rate : 0.8545          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0636          
##                                           
##  Mcnemar's Test P-Value : 7.088e-06       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.3404          0.50000           1.00000
## Specificity                  0.7500          0.51020           0.84906
## Pos Pred Value               0.8889          0.11111           0.20000
## Neg Pred Value               0.1622          0.89286           1.00000
## Prevalence                   0.8545          0.10909           0.03636
## Detection Rate               0.2909          0.05455           0.03636
## Detection Prevalence         0.3273          0.49091           0.18182
## Balanced Accuracy            0.5452          0.50510           0.92453

Naive Bayes dengan data seimbang melalui resampling berbobot (weighted), menggunakan data tanpa NA

library(dplyr)
library(caret)
library(e1071)

# Work on the data set without missing values.
data <- data_model_no_na

# Class counts before balancing.
table(data[["PENYAKIT"]])
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        235         33         14
# Relative class frequencies.
proportions(table(data[["PENYAKIT"]]))
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.83333333 0.11702128 0.04964539
# Inverse-frequency class weights: n_total / (n_classes * n_in_class), so
# rarer classes receive larger weights.
class_freq <- table(data[["PENYAKIT"]])
class_weight <- sum(class_freq) / (class_freq * length(class_freq))

class_weight
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##   0.400000   2.848485   6.714286
# Attach to every row the weight of its class.
data <- mutate(data, weight = class_weight[PENYAKIT])

# 80/20 train/test split on PENYAKIT (seeded for reproducibility).
set.seed(123)
trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)
testData <- data[-trainIndex, ]
trainData <- data[trainIndex, ]

# Weighted bootstrap of the training rows: same size as the training set,
# drawn with replacement and with `weight` as the sampling weight. The helper
# column is dropped once the sample has been drawn.
set.seed(123)
train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  ) %>%
  select(-weight)

# Remove the helper column from the hold-out set as well.
testData <- select(testData, -weight)

# Class distribution after resampling.
count(train_weighted, PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       82
## 2 TB_PLEURA     72
## 3 TB_LAINNYA    73
# Naive Bayes on the weighted resample, with Laplace smoothing requested
# via `laplace = 1`.
model_nb_weight <- naiveBayes(PENYAKIT ~ ., data = train_weighted, laplace = 1)

# Inspect the fitted model.
model_nb_weight
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3612335  0.3171806  0.3215859 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.4047619 0.5952381
##   TB_PLEURA  0.2027027 0.7972973
##   TB_LAINNYA 0.4666667 0.5333333
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3780488 0.6463415
##   TB_PLEURA  0.2500000 0.7777778
##   TB_LAINNYA 0.6575342 0.3698630
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5243902 0.5024781
##   TB_PLEURA  0.5694444 0.4986288
##   TB_LAINNYA 0.3150685 0.4677580
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2804878 0.4520021
##   TB_PLEURA  0.3194444 0.4695334
##   TB_LAINNYA 0.3972603 0.4927171
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5609756 0.4993220
##   TB_PLEURA  0.8194444 0.3873488
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1707317 0.3785899
##   TB_PLEURA  0.1111111 0.3164751
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.19512195 0.3987333
##   TB_PLEURA  0.09722222 0.2983392
##   TB_LAINNYA 0.39726027 0.4927171
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.91666667 0.08333333
##   TB_PLEURA  0.89189189 0.10810811
##   TB_LAINNYA 0.92000000 0.08000000
# Predict the class of each hold-out row, then compare with the true labels.
prediksi <- predict(model_nb_weight, newdata = testData)
confusionMatrix(data = prediksi, reference = testData$PENYAKIT)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         28         2          0
##   TB_PLEURA       15         3          0
##   TB_LAINNYA       4         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.4591, 0.7298)
##     No Information Rate : 0.8545          
##     P-Value [Acc > NIR] : 0.999999        
##                                           
##                   Kappa : 0.1896          
##                                           
##  Mcnemar's Test P-Value : 0.001868        
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.5957          0.50000           1.00000
## Specificity                  0.7500          0.69388           0.90566
## Pos Pred Value               0.9333          0.16667           0.28571
## Neg Pred Value               0.2400          0.91892           1.00000
## Prevalence                   0.8545          0.10909           0.03636
## Detection Rate               0.5091          0.05455           0.03636
## Detection Prevalence         0.5455          0.32727           0.12727
## Balanced Accuracy            0.6729          0.59694           0.95283

Naive Bayes dengan data seimbang melalui resampling berbobot (weighted), menggunakan data lengkap (DATA FULL)

library(dplyr)
library(caret)
library(e1071)

# Use the full modelling data set (`data_model`), not the NA-free subset.
# NOTE(review): this set presumably still contains missing values — confirm.
data <- data_model
# Class counts before balancing.
table(data$PENYAKIT)
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        265         36         21
# Relative class frequencies.
proportions(table(data[["PENYAKIT"]]))
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.82298137 0.11180124 0.06521739
# Inverse-frequency class weights: n_total / (n_classes * n_in_class), so
# rarer classes receive larger weights.
class_freq <- table(data[["PENYAKIT"]])
class_weight <- sum(class_freq) / (class_freq * length(class_freq))

class_weight
## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.4050314  2.9814815  5.1111111
# Attach to every row the weight of its class.
data <- mutate(data, weight = class_weight[PENYAKIT])

# 80/20 train/test split on PENYAKIT (seeded for reproducibility).
set.seed(123)
trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)
testData <- data[-trainIndex, ]
trainData <- data[trainIndex, ]

# Weighted bootstrap of the training rows: same size as the training set,
# drawn with replacement and with `weight` as the sampling weight. The helper
# column is dropped once the sample has been drawn.
set.seed(123)
train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  ) %>%
  select(-weight)

# Remove the helper column from the hold-out set as well.
testData <- select(testData, -weight)

# Class distribution after resampling.
count(train_weighted, PENYAKIT)
## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       91
## 2 TB_PLEURA     78
## 3 TB_LAINNYA    89
# Naive Bayes on the weighted resample, with Laplace smoothing requested
# via `laplace = 1`.
model_nb_weight <- naiveBayes(PENYAKIT ~ ., data = train_weighted, laplace = 1)

# Inspect the fitted model.
model_nb_weight
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3527132  0.3023256  0.3449612 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.2903226 0.7096774
##   TB_PLEURA  0.2750000 0.7250000
##   TB_LAINNYA 0.4505495 0.5494505
## 
##             USIA_KATEGORI
## Y                 <=45       >45
##   TB_PARU    0.3516484 0.6703297
##   TB_PLEURA  0.3717949 0.6538462
##   TB_LAINNYA 0.6629213 0.3595506
## 
##             LEOKOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.5647059 0.4987379
##   TB_PLEURA  0.7500000 0.4358899
##   TB_LAINNYA 0.2289157 0.4226889
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.1882353 0.3932198
##   TB_PLEURA  0.4078947 0.4947088
##   TB_LAINNYA 0.2658228 0.4445932
## 
##             HEMOGLOBIN
## Y                 [,1]      [,2]
##   TB_PARU    0.5443038 0.5012157
##   TB_PLEURA  0.5616438 0.4996193
##   TB_LAINNYA 0.6363636 0.4847319
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2197802 0.4163919
##   TB_PLEURA  0.1923077 0.3966644
##   TB_LAINNYA 0.3707865 0.4857521
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.05494505 0.2291354
##   TB_PLEURA  0.02564103 0.1590850
##   TB_LAINNYA 0.23595506 0.4269999
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.89247312 0.10752688
##   TB_PLEURA  0.93750000 0.06250000
##   TB_LAINNYA 0.93406593 0.06593407
# Predict the class of each hold-out row, then compare with the true labels.
prediksi <- predict(model_nb_weight, newdata = testData)
confusionMatrix(data = prediksi, reference = testData$PENYAKIT)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         18         4          1
##   TB_PLEURA       20         1          1
##   TB_LAINNYA      15         2          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3281          
##                  95% CI : (0.2159, 0.4569)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0397         
##                                           
##  Mcnemar's Test P-Value : 3.582e-05       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.3396          0.14286           0.50000
## Specificity                  0.5455          0.63158           0.71667
## Pos Pred Value               0.7826          0.04545           0.10526
## Neg Pred Value               0.1463          0.85714           0.95556
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.2812          0.01562           0.03125
## Detection Prevalence         0.3594          0.34375           0.29688
## Balanced Accuracy            0.4425          0.38722           0.60833