library(readxl)

## Warning: package 'readxl' was built under R version 4.4.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data_tb <- read_excel("D:/FINAL TA/SUMBER REFRENSI/TA.xlsx", sheet = 1)

head(data_tb)

## # A tibble: 6 × 11
##      NO `TANGGAL MASUK` `JENIS KELAMIN` `UMUR TAHUN` `KEADAAN KELUAR` PENYAKIT
##   <dbl> <chr>                     <dbl>        <dbl>            <dbl>    <dbl>
## 1     1 18/02/2024                    1          124                1        1
## 2     2 21/02/2024                    1           23                0        1
## 3     3 19/04/2024                    1           62                0        1
## 4     4 20/04/2024                    1           42                0        1
## 5     5 29/04/2024                    1           71                0        1
## 6     6 45478                         1           63                0        1
## # ℹ 5 more variables: `LAMA RAWAT INAP` <dbl>, `PENYAKIT PENYERTA` <chr>,
## #   LEOKOSIT <chr>, TROMBOSIT <chr>, HEMOGLOBIN <chr>

str(data_tb)

## tibble [322 × 11] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS KELAMIN    : num [1:322] 1 1 1 1 1 1 0 0 0 1 ...
##  $ UMUR TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN KELUAR   : num [1:322] 1 0 0 0 0 0 0 0 0 0 ...
##  $ PENYAKIT         : num [1:322] 1 1 1 1 1 1 1 1 1 1 ...
##  $ LAMA RAWAT INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT PENYERTA: chr [1:322] "0" "0" "0" "0" ...
##  $ LEOKOSIT         : chr [1:322] "0" "0" "0" "1" ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : chr [1:322] "1" "0" "1" "1" ...

data_tb <- na.omit(data_tb)

colnames(data_tb) <- c(
  "NO",
  "TANGGAL_MASUK",
  "JENIS_KELAMIN",
  "UMUR_TAHUN",
  "KEADAAN_KELUAR",
  "PENYAKIT",
  "LAMA_RAWAT_INAP",
  "PENYAKIT_PENYERTA",
  "LEOKOSIT",
  "TROMBOSIT",
  "HEMOGLOBIN"
)

data_tb <- data_tb %>%
  mutate(KODE_TROMBOSIT = ifelse(TROMBOSIT >= 150 & TROMBOSIT <= 450, 0, 1))


data_tb <- data_tb %>%
  mutate(KODE_LAMA_RAWAT = ifelse(LAMA_RAWAT_INAP <= 5, 0, 1))

data_tb <- data_tb %>%
  mutate(KODE_PENYERTA = ifelse(PENYAKIT_PENYERTA == 0, 0, 1))

data_tb$KEADAAN_KELUAR <- factor(data_tb$KEADAAN_KELUAR,
                                 levels = c(0,1),
                                 labels = c("HIDUP","MENINGGAL"))

data_tb$PENYAKIT <- factor(data_tb$PENYAKIT,
                           levels = c(0,1,2),
                           labels = c("TB_PARU","TB_PLEURA","TB_LAINNYA"))

data_tb$JENIS_KELAMIN <- factor(data_tb$JENIS_KELAMIN,
                                levels = c(0,1),
                                labels = c("P","L"))

summary(data_tb)

##        NO         TANGGAL_MASUK      JENIS_KELAMIN   UMUR_TAHUN   
##  Min.   :  1.00   Length:322         P:109         Min.   :  1.0  
##  1st Qu.: 81.25   Class :character   L:213         1st Qu.: 41.0  
##  Median :161.50   Mode  :character                 Median : 54.0  
##  Mean   :161.50                                    Mean   : 51.4  
##  3rd Qu.:241.75                                    3rd Qu.: 64.0  
##  Max.   :322.00                                    Max.   :124.0  
##    KEADAAN_KELUAR       PENYAKIT   LAMA_RAWAT_INAP  PENYAKIT_PENYERTA 
##  HIDUP    :291    TB_PARU   :265   Min.   : 1.000   Length:322        
##  MENINGGAL: 31    TB_PLEURA : 36   1st Qu.: 3.000   Class :character  
##                   TB_LAINNYA: 21   Median : 4.000   Mode  :character  
##                                    Mean   : 4.339                     
##                                    3rd Qu.: 5.000                     
##                                    Max.   :15.000                     
##    LEOKOSIT          TROMBOSIT          HEMOGLOBIN        KODE_TROMBOSIT  
##  Length:322         Length:322         Length:322         Min.   :0.0000  
##  Class :character   Class :character   Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.0000  
##                                                           Mean   :0.3292  
##                                                           3rd Qu.:1.0000  
##                                                           Max.   :1.0000  
##  KODE_LAMA_RAWAT  KODE_PENYERTA  
##  Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.000  
##  Mean   :0.1429   Mean   :0.205  
##  3rd Qu.:0.0000   3rd Qu.:0.000  
##  Max.   :1.0000   Max.   :1.000

str(data_tb)

## tibble [322 × 14] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL_MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS_KELAMIN    : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
##  $ UMUR_TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN_KELUAR   : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
##  $ PENYAKIT         : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LAMA_RAWAT_INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT_PENYERTA: chr [1:322] "0" "0" "0" "0" ...
##  $ LEOKOSIT         : chr [1:322] "0" "0" "0" "1" ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : chr [1:322] "1" "0" "1" "1" ...
##  $ KODE_TROMBOSIT   : num [1:322] 0 0 1 1 0 1 1 1 1 0 ...
##  $ KODE_LAMA_RAWAT  : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
##  $ KODE_PENYERTA    : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...

colSums(is.na(data_tb))

##                NO     TANGGAL_MASUK     JENIS_KELAMIN        UMUR_TAHUN 
##                 0                 0                 0                 0 
##    KEADAAN_KELUAR          PENYAKIT   LAMA_RAWAT_INAP PENYAKIT_PENYERTA 
##                 0                 0                 0                 0 
##          LEOKOSIT         TROMBOSIT        HEMOGLOBIN    KODE_TROMBOSIT 
##                 0                 0                 0                 0 
##   KODE_LAMA_RAWAT     KODE_PENYERTA 
##                 0                 0

View(data_tb)

# Frekuensi jenis TBC
tb_freq <- table(data_tb$PENYAKIT)

# Persentase
tb_percent <- prop.table(tb_freq) * 100

# Gabungkan
tb_deskriptif <- data.frame(
  Jenis_TBC = names(tb_freq),
  N = as.vector(tb_freq),
  Persen = round(as.vector(tb_percent),2)
)

tb_deskriptif

##    Jenis_TBC   N Persen
## 1    TB_PARU 265  82.30
## 2  TB_PLEURA  36  11.18
## 3 TB_LAINNYA  21   6.52

pie(tb_freq,
    main = "Diagram Lingkaran Jenis TBC",
    col = c("orange","green","yellow"))

data_tb <- data_tb %>%
  mutate(
    USIA_KATEGORI = ifelse(UMUR_TAHUN <= 45, "≤45", ">45")
  )

library(dplyr)

deskriptif <- function(data, var){
  
  freq <- table(data[[var]])
  persen <- prop.table(freq) * 100
  
  hasil <- data.frame(
    Variabel = var,
    Kategori = names(freq),
    N = as.vector(freq),
    Persen = round(as.vector(persen), 2)
  )
  
  return(hasil)
}

d1 <- deskriptif(data_tb, "JENIS_KELAMIN")
d2 <- deskriptif(data_tb, "USIA_KATEGORI")
d3 <- deskriptif(data_tb, "LEOKOSIT")
d4 <- deskriptif(data_tb, "KODE_TROMBOSIT")
d5 <- deskriptif(data_tb, "HEMOGLOBIN")
d6 <- deskriptif(data_tb, "KODE_PENYERTA")
d7 <- deskriptif(data_tb, "KODE_LAMA_RAWAT")
d8 <- deskriptif(data_tb, "KEADAAN_KELUAR")


tabel_deskriptif <- bind_rows(d1,d2,d3,d4,d5,d6,d7,d8)

tabel_deskriptif

##           Variabel  Kategori   N Persen
## 1    JENIS_KELAMIN         P 109  33.85
## 2    JENIS_KELAMIN         L 213  66.15
## 3    USIA_KATEGORI       >45 213  66.15
## 4    USIA_KATEGORI       ≤45 109  33.85
## 5         LEOKOSIT         -  14   4.35
## 6         LEOKOSIT         0 139  43.17
## 7         LEOKOSIT         1 169  52.48
## 8   KODE_TROMBOSIT         0 216  67.08
## 9   KODE_TROMBOSIT         1 106  32.92
## 10      HEMOGLOBIN         -  39  12.11
## 11      HEMOGLOBIN         0 120  37.27
## 12      HEMOGLOBIN         1 163  50.62
## 13   KODE_PENYERTA         0 256  79.50
## 14   KODE_PENYERTA         1  66  20.50
## 15 KODE_LAMA_RAWAT         0 276  85.71
## 16 KODE_LAMA_RAWAT         1  46  14.29
## 17  KEADAAN_KELUAR     HIDUP 291  90.37
## 18  KEADAAN_KELUAR MENINGGAL  31   9.63

data_no_na <- na.omit(data_tb)
head(data_tb)

## # A tibble: 6 × 15
##      NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN KEADAAN_KELUAR PENYAKIT 
##   <dbl> <chr>         <fct>              <dbl> <fct>          <fct>    
## 1     1 18/02/2024    L                    124 MENINGGAL      TB_PLEURA
## 2     2 21/02/2024    L                     23 HIDUP          TB_PLEURA
## 3     3 19/04/2024    L                     62 HIDUP          TB_PLEURA
## 4     4 20/04/2024    L                     42 HIDUP          TB_PLEURA
## 5     5 29/04/2024    L                     71 HIDUP          TB_PLEURA
## 6     6 45478         L                     63 HIDUP          TB_PLEURA
## # ℹ 9 more variables: LAMA_RAWAT_INAP <dbl>, PENYAKIT_PENYERTA <chr>,
## #   LEOKOSIT <chr>, TROMBOSIT <chr>, HEMOGLOBIN <chr>, KODE_TROMBOSIT <dbl>,
## #   KODE_LAMA_RAWAT <dbl>, KODE_PENYERTA <dbl>, USIA_KATEGORI <chr>

str(data_tb)

## tibble [322 × 15] (S3: tbl_df/tbl/data.frame)
##  $ NO               : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
##  $ TANGGAL_MASUK    : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
##  $ JENIS_KELAMIN    : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
##  $ UMUR_TAHUN       : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
##  $ KEADAAN_KELUAR   : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
##  $ PENYAKIT         : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LAMA_RAWAT_INAP  : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
##  $ PENYAKIT_PENYERTA: chr [1:322] "0" "0" "0" "0" ...
##  $ LEOKOSIT         : chr [1:322] "0" "0" "0" "1" ...
##  $ TROMBOSIT        : chr [1:322] "435" "314" "478" "538" ...
##  $ HEMOGLOBIN       : chr [1:322] "1" "0" "1" "1" ...
##  $ KODE_TROMBOSIT   : num [1:322] 0 0 1 1 0 1 1 1 1 0 ...
##  $ KODE_LAMA_RAWAT  : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
##  $ KODE_PENYERTA    : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
##  $ USIA_KATEGORI    : chr [1:322] ">45" "≤45" ">45" "≤45" ...

data_tb[data_tb == "-"] <- NA
nrow(data_tb)

## [1] 322

nrow(data_no_na)

## [1] 322

data_no_na <- na.omit(data_tb)

X <- data_no_na[,c(
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]

X <- data.frame(lapply(X, function(x) as.numeric(as.factor(x))))
str(X)

## 'data.frame':    281 obs. of  8 variables:
##  $ JENIS_KELAMIN  : num  2 2 2 2 2 1 1 2 2 2 ...
##  $ USIA_KATEGORI  : num  1 2 1 2 1 1 2 1 1 1 ...
##  $ LEOKOSIT       : num  1 1 1 2 2 2 2 2 1 2 ...
##  $ KODE_TROMBOSIT : num  1 1 2 2 1 2 2 1 2 1 ...
##  $ HEMOGLOBIN     : num  2 1 2 2 2 2 2 1 2 1 ...
##  $ KODE_PENYERTA  : num  1 1 1 1 2 1 1 1 2 1 ...
##  $ KODE_LAMA_RAWAT: num  1 1 1 1 1 1 2 1 1 1 ...
##  $ KEADAAN_KELUAR : num  2 1 1 1 1 1 1 1 1 1 ...

View(data_no_na)

data_model <- data_tb[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]

data_model$PENYAKIT <- as.factor(data_model$PENYAKIT)

View(data_model)

Naive Bayes dengan Imbalanced Data FULL

library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.4.3

## Loading required package: lattice

set.seed(1001)

train_index <- createDataPartition(data_model$PENYAKIT, p = 0.80, list = FALSE)

trainData <- data_model[train_index, ]
testData  <- data_model[-train_index, ]

# Jumlah data training
n <- nrow(trainData)
n

## [1] 258

# Jumlah data training
n <- nrow(testData)
n

## [1] 64

trainData %>% count(PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU      212
## 2 TB_PLEURA     29
## 3 TB_LAINNYA    17

testData %>% count(PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       53
## 2 TB_PLEURA      7
## 3 TB_LAINNYA     4

library(e1071)

## Warning: package 'e1071' was built under R version 4.4.3

## 
## Attaching package: 'e1071'

## The following object is masked from 'package:ggplot2':
## 
##     element

NBClassifier <- naiveBayes(PENYAKIT ~ ., data = trainData)

NBClassifier

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.82170543 0.11240310 0.06589147 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3301887 0.6698113
##   TB_PLEURA  0.2758621 0.7241379
##   TB_LAINNYA 0.6470588 0.3529412
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6839623 0.3160377
##   TB_PLEURA  0.6896552 0.3103448
##   TB_LAINNYA 0.2352941 0.7647059
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4328358 0.5671642
##   TB_PLEURA  0.3571429 0.6428571
##   TB_LAINNYA 0.8750000 0.1250000
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3349057 0.4730745
##   TB_PLEURA  0.4827586 0.5085476
##   TB_LAINNYA 0.2941176 0.4696682
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4408602 0.5591398
##   TB_PLEURA  0.2692308 0.7307692
##   TB_LAINNYA 0.5454545 0.4545455
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2028302 0.4030588
##   TB_PLEURA  0.1379310 0.3509312
##   TB_LAINNYA 0.4117647 0.5072997
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15566038 0.3633911
##   TB_PLEURA  0.06896552 0.2578807
##   TB_LAINNYA 0.17647059 0.3929526
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90566038 0.09433962
##   TB_PLEURA  0.86206897 0.13793103
##   TB_LAINNYA 0.94117647 0.05882353

# Predict using Naive Bayes
testData$predicted <- predict(NBClassifier, testData)

# data aktual
testData$actual <- testData$PENYAKIT

library(caret)

confusionMatrix(
  factor(testData$predicted),
  factor(testData$actual)
)

## Warning in confusionMatrix.default(factor(testData$predicted),
## factor(testData$actual)): Levels are not in the same order for reference and
## data. Refactoring data to match.

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         53         7          4
##   TB_PLEURA        0         0          0
##   TB_LAINNYA       0         0          0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8281         
##                  95% CI : (0.7132, 0.911)
##     No Information Rate : 0.8281         
##     P-Value [Acc > NIR] : 0.5796         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  1.0000           0.0000            0.0000
## Specificity                  0.0000           1.0000            1.0000
## Pos Pred Value               0.8281              NaN               NaN
## Neg Pred Value                  NaN           0.8906            0.9375
## Prevalence                   0.8281           0.1094            0.0625
## Detection Rate               0.8281           0.0000            0.0000
## Detection Prevalence         1.0000           0.0000            0.0000
## Balanced Accuracy            0.5000           0.5000            0.5000

Naive Bayes dengan Balanced data (Undersampling) FULL DATA

set.seed(1001)

down_train <- downSample(
  x = trainData[, !colnames(trainData) %in% "PENYAKIT"],
  y = trainData$PENYAKIT
)

names(down_train)[names(down_train) == "Class"] <- "PENYAKIT"

table(down_train$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##         17         17         17

down_train %>%
  count(PENYAKIT)

##     PENYAKIT  n
## 1    TB_PARU 17
## 2  TB_PLEURA 17
## 3 TB_LAINNYA 17

library(e1071)

NBClassifier3 <- naiveBayes(PENYAKIT ~ ., data = down_train)

NBClassifier3

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.4705882 0.5294118
##   TB_PLEURA  0.2941176 0.7058824
##   TB_LAINNYA 0.6470588 0.3529412
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.7647059 0.2352941
##   TB_PLEURA  0.7647059 0.2352941
##   TB_LAINNYA 0.2352941 0.7647059
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.3529412 0.6470588
##   TB_PLEURA  0.3125000 0.6875000
##   TB_LAINNYA 0.8750000 0.1250000
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.4117647 0.5072997
##   TB_PLEURA  0.5294118 0.5144958
##   TB_LAINNYA 0.2941176 0.4696682
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.3750000 0.6250000
##   TB_PLEURA  0.1875000 0.8125000
##   TB_LAINNYA 0.5454545 0.4545455
## 
##             KODE_PENYERTA
## Y                  [,1]      [,2]
##   TB_PARU    0.05882353 0.2425356
##   TB_PLEURA  0.11764706 0.3321056
##   TB_LAINNYA 0.41176471 0.5072997
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.05882353 0.2425356
##   TB_PLEURA  0.11764706 0.3321056
##   TB_LAINNYA 0.17647059 0.3929526
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.94117647 0.05882353
##   TB_PLEURA  0.82352941 0.17647059
##   TB_LAINNYA 0.94117647 0.05882353

testData$predicted <- predict(NBClassifier3, testData)

testData$actual <- testData$PENYAKIT

library(caret)

confusionMatrix(
  factor(testData$predicted),
  factor(testData$actual)
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         34         3          1
##   TB_PLEURA        7         1          1
##   TB_LAINNYA      12         3          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5781          
##                  95% CI : (0.4482, 0.7006)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 0.999999        
##                                           
##                   Kappa : 0.1143          
##                                           
##  Mcnemar's Test P-Value : 0.007706        
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.6415          0.14286           0.50000
## Specificity                  0.6364          0.85965           0.75000
## Pos Pred Value               0.8947          0.11111           0.11765
## Neg Pred Value               0.2692          0.89091           0.95745
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.5312          0.01562           0.03125
## Detection Prevalence         0.5938          0.14062           0.26562
## Balanced Accuracy            0.6389          0.50125           0.62500

Naive Bayes dengan Balanced data (Oversampling) FULL DATA

set.seed(1001)

up_train <- upSample(
  x = trainData[, !colnames(trainData) %in% "PENYAKIT"],
  y = trainData$PENYAKIT
)

names(up_train)[names(up_train) == "Class"] <- "PENYAKIT"

table(up_train$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        212        212        212

up_train %>%
  count(PENYAKIT)

##     PENYAKIT   n
## 1    TB_PARU 212
## 2  TB_PLEURA 212
## 3 TB_LAINNYA 212

library(e1071)
NBClassifier1 <- naiveBayes(PENYAKIT ~ ., data = up_train)

NBClassifier1

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3301887 0.6698113
##   TB_PLEURA  0.2830189 0.7169811
##   TB_LAINNYA 0.5896226 0.4103774
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6839623 0.3160377
##   TB_PLEURA  0.7311321 0.2688679
##   TB_LAINNYA 0.2405660 0.7594340
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4328358 0.5671642
##   TB_PLEURA  0.3399015 0.6600985
##   TB_LAINNYA 0.8168317 0.1831683
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3349057 0.4730745
##   TB_PLEURA  0.4622642 0.4997540
##   TB_LAINNYA 0.2924528 0.4559658
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4408602 0.5591398
##   TB_PLEURA  0.2538860 0.7461140
##   TB_LAINNYA 0.6122449 0.3877551
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2028302 0.4030588
##   TB_PLEURA  0.1273585 0.3341632
##   TB_LAINNYA 0.4245283 0.4954411
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.15566038 0.3633911
##   TB_PLEURA  0.06132075 0.2404856
##   TB_LAINNYA 0.20283019 0.4030588
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90566038 0.09433962
##   TB_PLEURA  0.83490566 0.16509434
##   TB_LAINNYA 0.91981132 0.08018868

testData$predicted <- predict(NBClassifier1, testData)

testData$actual <- testData$PENYAKIT

library(caret)

confusionMatrix(
  factor(testData$predicted),
  factor(testData$actual)
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         15         2          1
##   TB_PLEURA       30         3          1
##   TB_LAINNYA       8         2          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3125          
##                  95% CI : (0.2024, 0.4406)
##     No Information Rate : 0.8281          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.014           
##                                           
##  Mcnemar's Test P-Value : 1.206e-06       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.2830          0.42857           0.50000
## Specificity                  0.7273          0.45614           0.83333
## Pos Pred Value               0.8333          0.08824           0.16667
## Neg Pred Value               0.1739          0.86667           0.96154
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.2344          0.04688           0.03125
## Detection Prevalence         0.2812          0.53125           0.18750
## Balanced Accuracy            0.5051          0.44236           0.66667

data_model_no_na <- data_no_na[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]

data_model_no_na $PENYAKIT <- factor(data_model_no_na $PENYAKIT)

library(caret) 
set.seed(123) 

index <- createDataPartition(data_model_no_na $PENYAKIT, p = 0.8, list = FALSE) 
trainData <- data_model_no_na [index, ] 
testData <- data_model_no_na [-index, ] 

n<-nrow(trainData) 
n

## [1] 227

n<-nrow(testData) 
n

## [1] 54

# sebelum
table(trainData$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        188         27         12

Naive Bayes dengan Balanced data (SMOTE) dengan data tanpa NA

library(themis)

## Warning: package 'themis' was built under R version 4.4.3

## Loading required package: recipes

## Warning: package 'recipes' was built under R version 4.4.3

## 
## Attaching package: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(recipes)

rec <- recipe(PENYAKIT ~ ., data = trainData) %>%
  step_smotenc(PENYAKIT)

train_smote <- prep(rec) %>%
  juice()

NB_TB_smote <- naiveBayes(
  PENYAKIT ~ ., 
  data = train_smote,
  laplace = 1
)

library(e1071)

NB_TB_smote <- naiveBayes(
  PENYAKIT ~ ., 
  data = train_smote,
  laplace = 1
)

NB_TB_smote

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3421053 0.6578947
##   TB_PLEURA  0.1631579 0.8368421
##   TB_LAINNYA 0.7263158 0.2736842
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6947368 0.3052632
##   TB_PLEURA  0.8631579 0.1368421
##   TB_LAINNYA 0.4052632 0.5947368
## 
##             LEOKOSIT
## Y                     0          1
##   TB_PARU    0.42631579 0.57368421
##   TB_PLEURA  0.56842105 0.43157895
##   TB_LAINNYA 0.97894737 0.02105263
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3138298 0.4652872
##   TB_PLEURA  0.3486680 0.4514144
##   TB_LAINNYA 0.2760590 0.3896022
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4263158 0.5736842
##   TB_PLEURA  0.1947368 0.8052632
##   TB_LAINNYA 0.3473684 0.6526316
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1808511 0.3859225
##   TB_PLEURA  0.1315456 0.3189518
##   TB_LAINNYA 0.4160202 0.4399221
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.14361702 0.3516374
##   TB_PLEURA  0.03961402 0.1645487
##   TB_LAINNYA 0.24698177 0.3770241
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.90526316 0.09473684
##   TB_PLEURA  0.94210526 0.05789474
##   TB_LAINNYA 0.98947368 0.01052632

testData$predicted_smote <- predict(NB_TB_smote, testData)

testData$actual <- testData$PENYAKIT

library(caret)

confusionMatrix(
  factor(testData$predicted_smote),
  factor(testData$actual)
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         12         2          1
##   TB_PLEURA       23         4          0
##   TB_LAINNYA      11         0          1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3148          
##                  95% CI : (0.1952, 0.4555)
##     No Information Rate : 0.8519          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0206          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.2609          0.66667           0.50000
## Specificity                  0.6250          0.52083           0.78846
## Pos Pred Value               0.8000          0.14815           0.08333
## Neg Pred Value               0.1282          0.92593           0.97619
## Prevalence                   0.8519          0.11111           0.03704
## Detection Rate               0.2222          0.07407           0.01852
## Detection Prevalence         0.2778          0.50000           0.22222
## Balanced Accuracy            0.4429          0.59375           0.64423

Naive Bayes Balanced data dengan Weighted training dengan data no NA

library(dplyr)
library(caret)
library(e1071)

data <- data_model_no_na
table(data$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        234         33         14

prop.table(table(data$PENYAKIT))

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.83274021 0.11743772 0.04982206

class_freq <- table(data$PENYAKIT)

class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

class_weight

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.4002849  2.8383838  6.6904762

data$weight <- class_weight[data$PENYAKIT]

set.seed(123)

trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)

trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]

x_train <- trainData %>% select(-PENYAKIT, -weight)
y_train <- trainData$PENYAKIT

set.seed(123)

train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  )

table(down_train$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##         17         17         17

down_train %>% count(PENYAKIT)

##     PENYAKIT  n
## 1    TB_PARU 17
## 2  TB_PLEURA 17
## 3 TB_LAINNYA 17

train_weighted <- train_weighted %>%
  select(-weight)

testData <- testData %>%
  select(-weight)

model_nb <- naiveBayes(PENYAKIT ~ ., data = train_weighted)

model_nb

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3612335  0.3171806  0.3215859 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3170732 0.6829268
##   TB_PLEURA  0.1944444 0.8055556
##   TB_LAINNYA 0.4657534 0.5342466
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6585366 0.3414634
##   TB_PLEURA  0.7638889 0.2361111
##   TB_LAINNYA 0.3561644 0.6438356
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4024390 0.5975610
##   TB_PLEURA  0.4305556 0.5694444
##   TB_LAINNYA 0.6849315 0.3150685
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3048780 0.4631887
##   TB_PLEURA  0.3194444 0.4695334
##   TB_LAINNYA 0.3972603 0.4927171
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4024390 0.5975610
##   TB_PLEURA  0.1805556 0.8194444
##   TB_LAINNYA 0.5068493 0.4931507
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1341463 0.3429068
##   TB_PLEURA  0.1111111 0.3164751
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.24390244 0.4320773
##   TB_PLEURA  0.09722222 0.2983392
##   TB_LAINNYA 0.39726027 0.4927171
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.92682927 0.07317073
##   TB_PLEURA  0.90277778 0.09722222
##   TB_LAINNYA 0.93150685 0.06849315

prediksi <- predict(model_nb, testData)
confusionMatrix(prediksi, testData$PENYAKIT)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         23         2          0
##   TB_PLEURA       15         3          0
##   TB_LAINNYA       8         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5185          
##                  95% CI : (0.3784, 0.6566)
##     No Information Rate : 0.8519          
##     P-Value [Acc > NIR] : 1.0000000       
##                                           
##                   Kappa : 0.1418          
##                                           
##  Mcnemar's Test P-Value : 0.0002812       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.5000          0.50000           1.00000
## Specificity                  0.7500          0.68750           0.82692
## Pos Pred Value               0.9200          0.16667           0.18182
## Neg Pred Value               0.2069          0.91667           1.00000
## Prevalence                   0.8519          0.11111           0.03704
## Detection Rate               0.4259          0.05556           0.03704
## Detection Prevalence         0.4630          0.33333           0.20370
## Balanced Accuracy            0.6250          0.59375           0.91346

Naive Bayes dengan Balanced data dengan n menentukan sendiri

library(dplyr)
library(e1071)
library(caret)

set.seed(1001)

# jumlah data yang diinginkan per kelas
n_sample <- 258

down_train <- trainData %>%
  group_by(PENYAKIT) %>%
  sample_n(size = n_sample, replace = TRUE) %>%
  ungroup()

# cek distribusi
table(down_train$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        258        258        258

down_train %>% count(PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU      258
## 2 TB_PLEURA    258
## 3 TB_LAINNYA   258

NB_down <- naiveBayes(PENYAKIT ~ ., data = down_train)

NB_down

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3100775 0.6899225
##   TB_PLEURA  0.2209302 0.7790698
##   TB_LAINNYA 0.4767442 0.5232558
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6705426 0.3294574
##   TB_PLEURA  0.7093023 0.2906977
##   TB_LAINNYA 0.3875969 0.6124031
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4457364 0.5542636
##   TB_PLEURA  0.4069767 0.5930233
##   TB_LAINNYA 0.7558140 0.2441860
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3217054 0.4680386
##   TB_PLEURA  0.3914729 0.4890284
##   TB_LAINNYA 0.3139535 0.4649998
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4496124 0.5503876
##   TB_PLEURA  0.2364341 0.7635659
##   TB_LAINNYA 0.4728682 0.5271318
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1976744 0.3990192
##   TB_PLEURA  0.1744186 0.3802066
##   TB_LAINNYA 0.4496124 0.4984215
## 
##             KODE_LAMA_RAWAT
## Y                 [,1]      [,2]
##   TB_PARU    0.1589147 0.3663071
##   TB_PLEURA  0.1162791 0.3211823
##   TB_LAINNYA 0.3255814 0.4695024
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.93023256 0.06976744
##   TB_PLEURA  0.86821705 0.13178295
##   TB_LAINNYA 0.92635659 0.07364341
## 
##             weight
## Y                 [,1] [,2]
##   TB_PARU    0.4002849    0
##   TB_PLEURA  2.8383838    0
##   TB_LAINNYA 6.6904762    0

testData$predicted <- predict(NB_down, testData)

## Warning in predict.naiveBayes(NB_down, testData): Type mismatch between
## training and new data for variable 'weight'. Did you use factors with numeric
## labels for training, and numeric values for new data?

testData$actual <- testData$PENYAKIT

confusionMatrix(
  factor(testData$predicted),
  factor(testData$actual)
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         19         2          0
##   TB_PLEURA       16         3          0
##   TB_LAINNYA      11         1          2
## 
## Overall Statistics
##                                          
##                Accuracy : 0.4444         
##                  95% CI : (0.3092, 0.586)
##     No Information Rate : 0.8519         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.104          
##                                          
##  Mcnemar's Test P-Value : 4.259e-05      
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.4130          0.50000           1.00000
## Specificity                  0.7500          0.66667           0.76923
## Pos Pred Value               0.9048          0.15789           0.14286
## Neg Pred Value               0.1818          0.91429           1.00000
## Prevalence                   0.8519          0.11111           0.03704
## Detection Rate               0.3519          0.05556           0.03704
## Detection Prevalence         0.3889          0.35185           0.25926
## Balanced Accuracy            0.5815          0.58333           0.88462

Naive Bayes Balanced data dengan Weighted training dengan data no NA

library(dplyr)
library(caret)
library(e1071)

# Gunakan data tanpa missing value
data <- data_model_no_na

table(data$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        234         33         14

prop.table(table(data$PENYAKIT))

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.83274021 0.11743772 0.04982206

class_freq <- table(data$PENYAKIT)

class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

class_weight

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.4002849  2.8383838  6.6904762

data <- data %>%
  mutate(weight = class_weight[PENYAKIT])

set.seed(123)

trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)

trainData <- data[trainIndex, ]
testData  <- data[-trainIndex, ]

set.seed(123)

train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  )

train_weighted <- train_weighted %>%
  select(-weight)

testData <- testData %>%
  select(-weight)

train_weighted %>%
  count(PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       82
## 2 TB_PLEURA     72
## 3 TB_LAINNYA    73

model_nb_weight <- naiveBayes(
  PENYAKIT ~ ., 
  data = train_weighted,
  laplace = 1
)

model_nb_weight

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3612335  0.3171806  0.3215859 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.3214286 0.6785714
##   TB_PLEURA  0.2027027 0.7972973
##   TB_LAINNYA 0.4666667 0.5333333
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6707317 0.3536585
##   TB_PLEURA  0.7777778 0.2500000
##   TB_LAINNYA 0.3698630 0.6575342
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4146341 0.6097561
##   TB_PLEURA  0.4444444 0.5833333
##   TB_LAINNYA 0.6986301 0.3287671
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.3048780 0.4631887
##   TB_PLEURA  0.3194444 0.4695334
##   TB_LAINNYA 0.3972603 0.4927171
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4146341 0.6097561
##   TB_PLEURA  0.1944444 0.8333333
##   TB_LAINNYA 0.5205479 0.5068493
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.1341463 0.3429068
##   TB_PLEURA  0.1111111 0.3164751
##   TB_LAINNYA 0.4931507 0.5034130
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.24390244 0.4320773
##   TB_PLEURA  0.09722222 0.2983392
##   TB_LAINNYA 0.39726027 0.4927171
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.91666667 0.08333333
##   TB_PLEURA  0.89189189 0.10810811
##   TB_LAINNYA 0.92000000 0.08000000

prediksi <- predict(model_nb_weight, testData)

confusionMatrix(prediksi, testData$PENYAKIT)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         20         2          0
##   TB_PLEURA       18         3          0
##   TB_LAINNYA       8         1          2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.463           
##                  95% CI : (0.3262, 0.6039)
##     No Information Rate : 0.8519          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1082          
##                                           
##  Mcnemar's Test P-Value : 7.179e-05       
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.4348          0.50000           1.00000
## Specificity                  0.7500          0.62500           0.82692
## Pos Pred Value               0.9091          0.14286           0.18182
## Neg Pred Value               0.1875          0.90909           1.00000
## Prevalence                   0.8519          0.11111           0.03704
## Detection Rate               0.3704          0.05556           0.03704
## Detection Prevalence         0.4074          0.38889           0.20370
## Balanced Accuracy            0.5924          0.56250           0.91346

Naive Bayes Balanced data dengan Weighted training dengan DATA FULL

library(dplyr)
library(caret)
library(e1071)

# Gunakan data tanpa missing value
data <- data_model

table(data$PENYAKIT)

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##        265         36         21

prop.table(table(data$PENYAKIT))

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
## 0.82298137 0.11180124 0.06521739

class_freq <- table(data$PENYAKIT)

class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

class_weight

## 
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.4050314  2.9814815  5.1111111

data <- data %>%
  mutate(weight = class_weight[PENYAKIT])

set.seed(123)

trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)

trainData <- data[trainIndex, ]
testData  <- data[-trainIndex, ]

set.seed(123)

train_weighted <- trainData %>%
  slice_sample(
    n = nrow(trainData),
    replace = TRUE,
    weight_by = weight
  )

train_weighted <- train_weighted %>%
  select(-weight)

testData <- testData %>%
  select(-weight)

train_weighted %>%
  count(PENYAKIT)

## # A tibble: 3 × 2
##   PENYAKIT       n
##   <fct>      <int>
## 1 TB_PARU       91
## 2 TB_PLEURA     78
## 3 TB_LAINNYA    89

model_nb_weight <- naiveBayes(
  PENYAKIT ~ ., 
  data = train_weighted,
  laplace = 1
)

model_nb_weight

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    TB_PARU  TB_PLEURA TB_LAINNYA 
##  0.3527132  0.3023256  0.3449612 
## 
## Conditional probabilities:
##             JENIS_KELAMIN
## Y                    P         L
##   TB_PARU    0.2903226 0.7096774
##   TB_PLEURA  0.2750000 0.7250000
##   TB_LAINNYA 0.4505495 0.5494505
## 
##             USIA_KATEGORI
## Y                  >45       ≤45
##   TB_PARU    0.6703297 0.3516484
##   TB_PLEURA  0.6538462 0.3717949
##   TB_LAINNYA 0.3595506 0.6629213
## 
##             LEOKOSIT
## Y                    0         1
##   TB_PARU    0.4470588 0.5764706
##   TB_PLEURA  0.2631579 0.7631579
##   TB_LAINNYA 0.7831325 0.2409639
## 
##             KODE_TROMBOSIT
## Y                 [,1]      [,2]
##   TB_PARU    0.2417582 0.4305206
##   TB_PLEURA  0.4230769 0.4972452
##   TB_LAINNYA 0.3483146 0.4791357
## 
##             HEMOGLOBIN
## Y                    0         1
##   TB_PARU    0.4683544 0.5569620
##   TB_PLEURA  0.4520548 0.5753425
##   TB_LAINNYA 0.3787879 0.6515152
## 
##             KODE_PENYERTA
## Y                 [,1]      [,2]
##   TB_PARU    0.2197802 0.4163919
##   TB_PLEURA  0.1923077 0.3966644
##   TB_LAINNYA 0.3707865 0.4857521
## 
##             KODE_LAMA_RAWAT
## Y                  [,1]      [,2]
##   TB_PARU    0.05494505 0.2291354
##   TB_PLEURA  0.02564103 0.1590850
##   TB_LAINNYA 0.23595506 0.4269999
## 
##             KEADAAN_KELUAR
## Y                 HIDUP  MENINGGAL
##   TB_PARU    0.89247312 0.10752688
##   TB_PLEURA  0.93750000 0.06250000
##   TB_LAINNYA 0.93406593 0.06593407

prediksi <- predict(model_nb_weight, testData)

confusionMatrix(prediksi, testData$PENYAKIT)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   TB_PARU TB_PLEURA TB_LAINNYA
##   TB_PARU         19         4          0
##   TB_PLEURA       20         1          2
##   TB_LAINNYA      14         2          2
## 
## Overall Statistics
##                                          
##                Accuracy : 0.3438         
##                  95% CI : (0.2295, 0.473)
##     No Information Rate : 0.8281         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.0166        
##                                          
##  Mcnemar's Test P-Value : 1.813e-05      
## 
## Statistics by Class:
## 
##                      Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity                  0.3585          0.14286           0.50000
## Specificity                  0.6364          0.61404           0.73333
## Pos Pred Value               0.8261          0.04348           0.11111
## Neg Pred Value               0.1707          0.85366           0.95652
## Prevalence                   0.8281          0.10938           0.06250
## Detection Rate               0.2969          0.01562           0.03125
## Detection Prevalence         0.3594          0.35938           0.28125
## Balanced Accuracy            0.4974          0.37845           0.61667

coba model full data

samuel aditya pratama

2026-04-13

Naive Bayes dengan Imbalanced Data FULL

Naive Bayes dengan Balanced data (Undersampling) FULL DATA

Naive Bayes dengan Balanced data (Oversampling) FULL DATA

Naive Bayes dengan Balanced data (SMOTE) dengan data tanpa NA

Naive Bayes Balanced data dengan Weighted training dengan data no NA

Naive Bayes dengan Balanced data dengan n menentukan sendiri

Naive Bayes Balanced data dengan Weighted training dengan data no NA

Naive Bayes Balanced data dengan Weighted training dengan DATA FULL