library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data_tb <- read_excel("D:/FINAL TA/SUMBER REFRENSI/TA.xlsx", sheet = 1)
head(data_tb)
## # A tibble: 6 × 11
## NO `TANGGAL MASUK` `JENIS KELAMIN` `UMUR TAHUN` `KEADAAN KELUAR` PENYAKIT
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 18/02/2024 1 124 1 1
## 2 2 21/02/2024 1 23 0 1
## 3 3 19/04/2024 1 62 0 1
## 4 4 20/04/2024 1 42 0 1
## 5 5 29/04/2024 1 71 0 1
## 6 6 45478 1 63 0 1
## # ℹ 5 more variables: `LAMA RAWAT INAP` <dbl>, `PENYAKIT PENYERTA` <dbl>,
## # LEOKOSIT <dbl>, TROMBOSIT <chr>, HEMOGLOBIN <dbl>
str(data_tb)
## tibble [322 × 11] (S3: tbl_df/tbl/data.frame)
## $ NO : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
## $ TANGGAL MASUK : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
## $ JENIS KELAMIN : num [1:322] 1 1 1 1 1 1 0 0 0 1 ...
## $ UMUR TAHUN : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
## $ KEADAAN KELUAR : num [1:322] 1 0 0 0 0 0 0 0 0 0 ...
## $ PENYAKIT : num [1:322] 1 1 1 1 1 1 1 1 1 1 ...
## $ LAMA RAWAT INAP : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
## $ PENYAKIT PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
## $ LEOKOSIT : num [1:322] 0 0 0 1 1 NA 1 1 0 1 ...
## $ TROMBOSIT : chr [1:322] "435" "314" "478" "538" ...
## $ HEMOGLOBIN : num [1:322] 1 0 1 1 1 NA 1 1 NA 0 ...
colnames(data_tb) <- c(
"NO",
"TANGGAL_MASUK",
"JENIS_KELAMIN",
"UMUR_TAHUN",
"KEADAAN_KELUAR",
"PENYAKIT",
"LAMA_RAWAT_INAP",
"PENYAKIT_PENYERTA",
"LEOKOSIT",
"TROMBOSIT",
"HEMOGLOBIN"
)
data_tb <- data_tb %>%
mutate(KODE_TROMBOSIT = ifelse(TROMBOSIT >= 150 & TROMBOSIT <= 450, 0, 1))
data_tb <- data_tb %>%
mutate(KODE_LAMA_RAWAT = ifelse(LAMA_RAWAT_INAP <= 5, 0, 1))
data_tb <- data_tb %>%
mutate(KODE_PENYERTA = ifelse(PENYAKIT_PENYERTA == 0, 0, 1))
data_tb$KEADAAN_KELUAR <- factor(data_tb$KEADAAN_KELUAR,
levels = c(0,1),
labels = c("HIDUP","MENINGGAL"))
data_tb$PENYAKIT <- factor(data_tb$PENYAKIT,
levels = c(0,1,2),
labels = c("TB_PARU","TB_PLEURA","TB_LAINNYA"))
data_tb$JENIS_KELAMIN <- factor(data_tb$JENIS_KELAMIN,
levels = c(0,1),
labels = c("P","L"))
summary(data_tb)
## NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN
## Min. : 1.00 Length:322 P:109 Min. : 1.0
## 1st Qu.: 81.25 Class :character L:213 1st Qu.: 41.0
## Median :161.50 Mode :character Median : 54.0
## Mean :161.50 Mean : 51.4
## 3rd Qu.:241.75 3rd Qu.: 64.0
## Max. :322.00 Max. :124.0
##
## KEADAAN_KELUAR PENYAKIT LAMA_RAWAT_INAP PENYAKIT_PENYERTA
## HIDUP :291 TB_PARU :265 Min. : 1.000 Min. :0.000
## MENINGGAL: 31 TB_PLEURA : 36 1st Qu.: 3.000 1st Qu.:0.000
## TB_LAINNYA: 21 Median : 4.000 Median :0.000
## Mean : 4.339 Mean :0.205
## 3rd Qu.: 5.000 3rd Qu.:0.000
## Max. :15.000 Max. :1.000
##
## LEOKOSIT TROMBOSIT HEMOGLOBIN KODE_TROMBOSIT
## Min. :0.0000 Length:322 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 Class :character 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.0000 Mode :character Median :1.000 Median :0.0000
## Mean :0.5487 Mean :0.576 Mean :0.2941
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000
## NA's :14 NA's :39 NA's :16
## KODE_LAMA_RAWAT KODE_PENYERTA
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.000
## Mean :0.1429 Mean :0.205
## 3rd Qu.:0.0000 3rd Qu.:0.000
## Max. :1.0000 Max. :1.000
##
str(data_tb)
## tibble [322 × 14] (S3: tbl_df/tbl/data.frame)
## $ NO : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
## $ TANGGAL_MASUK : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
## $ JENIS_KELAMIN : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
## $ UMUR_TAHUN : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
## $ KEADAAN_KELUAR : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
## $ PENYAKIT : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ LAMA_RAWAT_INAP : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
## $ PENYAKIT_PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
## $ LEOKOSIT : num [1:322] 0 0 0 1 1 NA 1 1 0 1 ...
## $ TROMBOSIT : chr [1:322] "435" "314" "478" "538" ...
## $ HEMOGLOBIN : num [1:322] 1 0 1 1 1 NA 1 1 NA 0 ...
## $ KODE_TROMBOSIT : num [1:322] 0 0 1 1 0 NA 1 1 1 0 ...
## $ KODE_LAMA_RAWAT : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
## $ KODE_PENYERTA : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
colSums(is.na(data_tb))
## NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN
## 0 0 0 0
## KEADAAN_KELUAR PENYAKIT LAMA_RAWAT_INAP PENYAKIT_PENYERTA
## 0 0 0 0
## LEOKOSIT TROMBOSIT HEMOGLOBIN KODE_TROMBOSIT
## 14 16 39 16
## KODE_LAMA_RAWAT KODE_PENYERTA
## 0 0
View(data_tb)
# Frekuensi jenis TBC
tb_freq <- table(data_tb$PENYAKIT)
# Persentase
tb_percent <- prop.table(tb_freq) * 100
# Gabungkan
tb_deskriptif <- data.frame(
Jenis_TBC = names(tb_freq),
N = as.vector(tb_freq),
Persen = round(as.vector(tb_percent),2)
)
tb_deskriptif
## Jenis_TBC N Persen
## 1 TB_PARU 265 82.30
## 2 TB_PLEURA 36 11.18
## 3 TB_LAINNYA 21 6.52
pie(tb_freq,
main = "Diagram Lingkaran Jenis TBC",
col = c("orange","green","yellow"))

library(dplyr)
# buat kategori usia (hindari simbol aneh)
data_tb <- data_tb %>%
mutate(
USIA_KATEGORI = ifelse(UMUR_TAHUN <= 45, "<=45", ">45")
)
# fungsi deskriptif (lebih aman)
deskriptif <- function(data, var){
# pastikan character bersih
x <- data[[var]]
x <- iconv(x, "UTF-8", "ASCII", sub = "") # cegah error encoding
freq <- table(x)
persen <- prop.table(freq) * 100
hasil <- data.frame(
Variabel = var,
Kategori = names(freq),
N = as.vector(freq),
Persen = round(as.vector(persen), 2)
)
return(hasil)
}
# jalankan
d1 <- deskriptif(data_tb, "JENIS_KELAMIN")
d2 <- deskriptif(data_tb, "USIA_KATEGORI")
d3 <- deskriptif(data_tb, "LEOKOSIT")
d4 <- deskriptif(data_tb, "KODE_TROMBOSIT")
d5 <- deskriptif(data_tb, "HEMOGLOBIN")
d6 <- deskriptif(data_tb, "KODE_PENYERTA")
d7 <- deskriptif(data_tb, "KODE_LAMA_RAWAT")
d8 <- deskriptif(data_tb, "KEADAAN_KELUAR")
tabel_deskriptif <- bind_rows(d1, d2, d3, d4, d5, d6, d7, d8)
tabel_deskriptif
## Variabel Kategori N Persen
## 1 JENIS_KELAMIN L 213 66.15
## 2 JENIS_KELAMIN P 109 33.85
## 3 USIA_KATEGORI <=45 109 33.85
## 4 USIA_KATEGORI >45 213 66.15
## 5 LEOKOSIT 0 139 45.13
## 6 LEOKOSIT 1 169 54.87
## 7 KODE_TROMBOSIT 0 216 70.59
## 8 KODE_TROMBOSIT 1 90 29.41
## 9 HEMOGLOBIN 0 120 42.40
## 10 HEMOGLOBIN 1 163 57.60
## 11 KODE_PENYERTA 0 256 79.50
## 12 KODE_PENYERTA 1 66 20.50
## 13 KODE_LAMA_RAWAT 0 276 85.71
## 14 KODE_LAMA_RAWAT 1 46 14.29
## 15 KEADAAN_KELUAR HIDUP 291 90.37
## 16 KEADAAN_KELUAR MENINGGAL 31 9.63
data_no_na <- na.omit(data_tb)
head(data_tb)
## # A tibble: 6 × 15
## NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN KEADAAN_KELUAR PENYAKIT
## <dbl> <chr> <fct> <dbl> <fct> <fct>
## 1 1 18/02/2024 L 124 MENINGGAL TB_PLEURA
## 2 2 21/02/2024 L 23 HIDUP TB_PLEURA
## 3 3 19/04/2024 L 62 HIDUP TB_PLEURA
## 4 4 20/04/2024 L 42 HIDUP TB_PLEURA
## 5 5 29/04/2024 L 71 HIDUP TB_PLEURA
## 6 6 45478 L 63 HIDUP TB_PLEURA
## # ℹ 9 more variables: LAMA_RAWAT_INAP <dbl>, PENYAKIT_PENYERTA <dbl>,
## # LEOKOSIT <dbl>, TROMBOSIT <chr>, HEMOGLOBIN <dbl>, KODE_TROMBOSIT <dbl>,
## # KODE_LAMA_RAWAT <dbl>, KODE_PENYERTA <dbl>, USIA_KATEGORI <chr>
str(data_tb)
## tibble [322 × 15] (S3: tbl_df/tbl/data.frame)
## $ NO : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
## $ TANGGAL_MASUK : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
## $ JENIS_KELAMIN : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
## $ UMUR_TAHUN : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
## $ KEADAAN_KELUAR : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
## $ PENYAKIT : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ LAMA_RAWAT_INAP : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
## $ PENYAKIT_PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
## $ LEOKOSIT : num [1:322] 0 0 0 1 1 NA 1 1 0 1 ...
## $ TROMBOSIT : chr [1:322] "435" "314" "478" "538" ...
## $ HEMOGLOBIN : num [1:322] 1 0 1 1 1 NA 1 1 NA 0 ...
## $ KODE_TROMBOSIT : num [1:322] 0 0 1 1 0 NA 1 1 1 0 ...
## $ KODE_LAMA_RAWAT : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
## $ KODE_PENYERTA : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
## $ USIA_KATEGORI : chr [1:322] ">45" "<=45" ">45" "<=45" ...
data_tb[data_tb == "-"] <- NA
nrow(data_tb)
## [1] 322
nrow(data_no_na)
## [1] 282
data_no_na <- na.omit(data_tb)
X <- data_no_na[,c(
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]
X <- data.frame(lapply(X, function(x) as.numeric(as.factor(x))))
str(X)
## 'data.frame': 282 obs. of 8 variables:
## $ JENIS_KELAMIN : num 2 2 2 2 2 1 1 2 2 2 ...
## $ USIA_KATEGORI : num 2 1 2 1 2 2 1 2 2 2 ...
## $ LEOKOSIT : num 1 1 1 2 2 2 2 2 1 2 ...
## $ KODE_TROMBOSIT : num 1 1 2 2 1 2 2 1 2 1 ...
## $ HEMOGLOBIN : num 2 1 2 2 2 2 2 1 2 1 ...
## $ KODE_PENYERTA : num 1 1 1 1 2 1 1 1 2 1 ...
## $ KODE_LAMA_RAWAT: num 1 1 1 1 1 1 2 1 1 1 ...
## $ KEADAAN_KELUAR : num 2 1 1 1 1 1 1 1 1 1 ...
View(data_no_na)
data_model <- data_tb[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]
data_model$PENYAKIT <- as.factor(data_model$PENYAKIT)
View(data_model)
Naive Bayes dengan Imbalanced Data FULL
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
set.seed(1001)
train_index <- createDataPartition(data_model$PENYAKIT, p = 0.80, list = FALSE)
trainData <- data_model[train_index, ]
testData <- data_model[-train_index, ]
# Jumlah data training
n <- nrow(trainData)
n
## [1] 258
# Jumlah data training
n <- nrow(testData)
n
## [1] 64
trainData %>% count(PENYAKIT)
## # A tibble: 3 × 2
## PENYAKIT n
## <fct> <int>
## 1 TB_PARU 212
## 2 TB_PLEURA 29
## 3 TB_LAINNYA 17
testData %>% count(PENYAKIT)
## # A tibble: 3 × 2
## PENYAKIT n
## <fct> <int>
## 1 TB_PARU 53
## 2 TB_PLEURA 7
## 3 TB_LAINNYA 4
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
NBClassifier <- naiveBayes(PENYAKIT ~ ., data = trainData)
NBClassifier
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.82170543 0.11240310 0.06589147
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.3301887 0.6698113
## TB_PLEURA 0.2758621 0.7241379
## TB_LAINNYA 0.6470588 0.3529412
##
## USIA_KATEGORI
## Y <=45 >45
## TB_PARU 0.3160377 0.6839623
## TB_PLEURA 0.3103448 0.6896552
## TB_LAINNYA 0.7647059 0.2352941
##
## LEOKOSIT
## Y [,1] [,2]
## TB_PARU 0.5671642 0.4967056
## TB_PLEURA 0.6428571 0.4879500
## TB_LAINNYA 0.1250000 0.3415650
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.2985075 0.4587459
## TB_PLEURA 0.4642857 0.5078745
## TB_LAINNYA 0.1428571 0.3631365
##
## HEMOGLOBIN
## Y [,1] [,2]
## TB_PARU 0.5591398 0.4978302
## TB_PLEURA 0.7307692 0.4523443
## TB_LAINNYA 0.4545455 0.5222330
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.2028302 0.4030588
## TB_PLEURA 0.1379310 0.3509312
## TB_LAINNYA 0.4117647 0.5072997
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.15566038 0.3633911
## TB_PLEURA 0.06896552 0.2578807
## TB_LAINNYA 0.17647059 0.3929526
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.90566038 0.09433962
## TB_PLEURA 0.86206897 0.13793103
## TB_LAINNYA 0.94117647 0.05882353
# Predict using Naive Bayes
testData$predicted <- predict(NBClassifier, testData)
# data aktual
testData$actual <- testData$PENYAKIT
library(caret)
confusionMatrix(
factor(testData$predicted),
factor(testData$actual)
)
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(factor(testData$predicted),
## factor(testData$actual)): Levels are not in the same order for reference and
## data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 52 7 4
## TB_PLEURA 0 0 0
## TB_LAINNYA 1 0 0
##
## Overall Statistics
##
## Accuracy : 0.8125
## 95% CI : (0.6954, 0.8992)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 0.6998
##
## Kappa : -0.0199
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.9811 0.0000 0.00000
## Specificity 0.0000 1.0000 0.98333
## Pos Pred Value 0.8254 NaN 0.00000
## Neg Pred Value 0.0000 0.8906 0.93651
## Prevalence 0.8281 0.1094 0.06250
## Detection Rate 0.8125 0.0000 0.00000
## Detection Prevalence 0.9844 0.0000 0.01562
## Balanced Accuracy 0.4906 0.5000 0.49167
Naive Bayes dengan Balanced data (Undersampling) FULL DATA
set.seed(1001)
down_train <- downSample(
x = trainData[, !colnames(trainData) %in% "PENYAKIT"],
y = trainData$PENYAKIT
)
names(down_train)[names(down_train) == "Class"] <- "PENYAKIT"
table(down_train$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 17 17 17
down_train %>%
count(PENYAKIT)
## PENYAKIT n
## 1 TB_PARU 17
## 2 TB_PLEURA 17
## 3 TB_LAINNYA 17
library(e1071)
NBClassifier3 <- naiveBayes(PENYAKIT ~ ., data = down_train)
NBClassifier3
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.3333333 0.3333333 0.3333333
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.4705882 0.5294118
## TB_PLEURA 0.2941176 0.7058824
## TB_LAINNYA 0.6470588 0.3529412
##
## USIA_KATEGORI
## Y <=45 >45
## TB_PARU 0.2352941 0.7647059
## TB_PLEURA 0.2352941 0.7647059
## TB_LAINNYA 0.7647059 0.2352941
##
## LEOKOSIT
## Y [,1] [,2]
## TB_PARU 0.6470588 0.4925922
## TB_PLEURA 0.6875000 0.4787136
## TB_LAINNYA 0.1250000 0.3415650
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.4117647 0.5072997
## TB_PLEURA 0.5000000 0.5163978
## TB_LAINNYA 0.1428571 0.3631365
##
## HEMOGLOBIN
## Y [,1] [,2]
## TB_PARU 0.6250000 0.5000000
## TB_PLEURA 0.8125000 0.4031129
## TB_LAINNYA 0.4545455 0.5222330
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.05882353 0.2425356
## TB_PLEURA 0.11764706 0.3321056
## TB_LAINNYA 0.41176471 0.5072997
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.05882353 0.2425356
## TB_PLEURA 0.11764706 0.3321056
## TB_LAINNYA 0.17647059 0.3929526
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.94117647 0.05882353
## TB_PLEURA 0.82352941 0.17647059
## TB_LAINNYA 0.94117647 0.05882353
testData$predicted <- predict(NBClassifier3, testData)
testData$actual <- testData$PENYAKIT
library(caret)
confusionMatrix(
factor(testData$predicted),
factor(testData$actual)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 32 3 1
## TB_PLEURA 9 2 2
## TB_LAINNYA 12 2 1
##
## Overall Statistics
##
## Accuracy : 0.5469
## 95% CI : (0.4175, 0.6718)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.0889
##
## Mcnemar's Test P-Value : 0.0064
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.6038 0.28571 0.25000
## Specificity 0.6364 0.80702 0.76667
## Pos Pred Value 0.8889 0.15385 0.06667
## Neg Pred Value 0.2500 0.90196 0.93878
## Prevalence 0.8281 0.10938 0.06250
## Detection Rate 0.5000 0.03125 0.01562
## Detection Prevalence 0.5625 0.20312 0.23438
## Balanced Accuracy 0.6201 0.54637 0.50833
Naive Bayes dengan Balanced data (Oversampling) FULL DATA
set.seed(1001)
up_train <- upSample(
x = trainData[, !colnames(trainData) %in% "PENYAKIT"],
y = trainData$PENYAKIT
)
names(up_train)[names(up_train) == "Class"] <- "PENYAKIT"
table(up_train$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 212 212 212
up_train %>%
count(PENYAKIT)
## PENYAKIT n
## 1 TB_PARU 212
## 2 TB_PLEURA 212
## 3 TB_LAINNYA 212
library(e1071)
NBClassifier1 <- naiveBayes(PENYAKIT ~ ., data = up_train)
NBClassifier1
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.3333333 0.3333333 0.3333333
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.3301887 0.6698113
## TB_PLEURA 0.2830189 0.7169811
## TB_LAINNYA 0.5896226 0.4103774
##
## USIA_KATEGORI
## Y <=45 >45
## TB_PARU 0.3160377 0.6839623
## TB_PLEURA 0.2688679 0.7311321
## TB_LAINNYA 0.7594340 0.2405660
##
## LEOKOSIT
## Y [,1] [,2]
## TB_PARU 0.5671642 0.4967056
## TB_PLEURA 0.6600985 0.4748465
## TB_LAINNYA 0.1831683 0.3877655
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.2985075 0.4587459
## TB_PLEURA 0.4384236 0.4974206
## TB_LAINNYA 0.1525424 0.3605658
##
## HEMOGLOBIN
## Y [,1] [,2]
## TB_PARU 0.5591398 0.4978302
## TB_PLEURA 0.7461140 0.4363651
## TB_LAINNYA 0.3877551 0.4889040
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.2028302 0.4030588
## TB_PLEURA 0.1273585 0.3341632
## TB_LAINNYA 0.4245283 0.4954411
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.15566038 0.3633911
## TB_PLEURA 0.06132075 0.2404856
## TB_LAINNYA 0.20283019 0.4030588
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.90566038 0.09433962
## TB_PLEURA 0.83490566 0.16509434
## TB_LAINNYA 0.91981132 0.08018868
testData$predicted <- predict(NBClassifier1, testData)
testData$actual <- testData$PENYAKIT
library(caret)
confusionMatrix(
factor(testData$predicted),
factor(testData$actual)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 19 2 2
## TB_PLEURA 25 3 1
## TB_LAINNYA 9 2 1
##
## Overall Statistics
##
## Accuracy : 0.3594
## 95% CI : (0.2432, 0.489)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 1
##
## Kappa : 8e-04
##
## Mcnemar's Test P-Value : 2.08e-05
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.3585 0.42857 0.25000
## Specificity 0.6364 0.54386 0.81667
## Pos Pred Value 0.8261 0.10345 0.08333
## Neg Pred Value 0.1707 0.88571 0.94231
## Prevalence 0.8281 0.10938 0.06250
## Detection Rate 0.2969 0.04688 0.01562
## Detection Prevalence 0.3594 0.45312 0.18750
## Balanced Accuracy 0.4974 0.48622 0.53333
data_model_no_na <- data_no_na[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]
data_model_no_na $PENYAKIT <- factor(data_model_no_na $PENYAKIT)
library(caret)
set.seed(123)
index <- createDataPartition(data_model_no_na $PENYAKIT, p = 0.8, list = FALSE)
trainData <- data_model_no_na [index, ]
testData <- data_model_no_na [-index, ]
n<-nrow(trainData)
n
## [1] 227
n<-nrow(testData)
n
## [1] 55
# sebelum
table(trainData$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 188 27 12
Naive Bayes dengan Balanced data (SMOTE) dengan data tanpa NA
library(themis)
## Warning: package 'themis' was built under R version 4.4.3
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.4.3
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(recipes)
rec <- recipe(PENYAKIT ~ ., data = trainData) %>%
step_smotenc(PENYAKIT)
train_smote <- prep(rec) %>%
juice()
NB_TB_smote <- naiveBayes(
PENYAKIT ~ .,
data = train_smote,
laplace = 1
)
library(e1071)
NB_TB_smote <- naiveBayes(
PENYAKIT ~ .,
data = train_smote,
laplace = 1
)
NB_TB_smote
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.3333333 0.3333333 0.3333333
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.3578947 0.6421053
## TB_PLEURA 0.1631579 0.8368421
## TB_LAINNYA 0.7263158 0.2736842
##
## USIA_KATEGORI
## Y <=45 >45
## TB_PARU 0.3210526 0.6789474
## TB_PLEURA 0.1368421 0.8631579
## TB_LAINNYA 0.5947368 0.4052632
##
## LEOKOSIT
## Y [,1] [,2]
## TB_PARU 0.5478723 0.4990319
## TB_PLEURA 0.5155320 0.4599931
## TB_LAINNYA 0.1989285 0.3544782
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.2925532 0.4561497
## TB_PLEURA 0.3486680 0.4514144
## TB_LAINNYA 0.2760590 0.3896022
##
## HEMOGLOBIN
## Y [,1] [,2]
## TB_PARU 0.5744681 0.4957436
## TB_PLEURA 0.8024245 0.3704245
## TB_LAINNYA 0.5358024 0.4326382
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.1755319 0.3814372
## TB_PLEURA 0.1315456 0.3189518
## TB_LAINNYA 0.4160202 0.4399221
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.15957447 0.3671888
## TB_PLEURA 0.03961402 0.1645487
## TB_LAINNYA 0.24698177 0.3770241
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.91052632 0.08947368
## TB_PLEURA 0.94210526 0.05789474
## TB_LAINNYA 0.98947368 0.01052632
testData$predicted_smote <- predict(NB_TB_smote, testData)
testData$actual <- testData$PENYAKIT
library(caret)
confusionMatrix(
factor(testData$predicted_smote),
factor(testData$actual)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 14 2 1
## TB_PLEURA 28 4 0
## TB_LAINNYA 5 0 1
##
## Overall Statistics
##
## Accuracy : 0.3455
## 95% CI : (0.2224, 0.4858)
## No Information Rate : 0.8545
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0208
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.2979 0.66667 0.50000
## Specificity 0.6250 0.42857 0.90566
## Pos Pred Value 0.8235 0.12500 0.16667
## Neg Pred Value 0.1316 0.91304 0.97959
## Prevalence 0.8545 0.10909 0.03636
## Detection Rate 0.2545 0.07273 0.01818
## Detection Prevalence 0.3091 0.58182 0.10909
## Balanced Accuracy 0.4614 0.54762 0.70283
Naive Bayes Balanced data dengan Weighted training dengan data no
NA
library(dplyr)
library(caret)
library(e1071)
data <- data_model_no_na
table(data$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 235 33 14
prop.table(table(data$PENYAKIT))
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.83333333 0.11702128 0.04964539
class_freq <- table(data$PENYAKIT)
class_weight <- sum(class_freq) / (length(class_freq) * class_freq)
class_weight
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.400000 2.848485 6.714286
data$weight <- class_weight[data$PENYAKIT]
set.seed(123)
trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]
x_train <- trainData %>% select(-PENYAKIT, -weight)
y_train <- trainData$PENYAKIT
set.seed(123)
train_weighted <- trainData %>%
slice_sample(
n = nrow(trainData),
replace = TRUE,
weight_by = weight
)
table(down_train$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 17 17 17
down_train %>% count(PENYAKIT)
## PENYAKIT n
## 1 TB_PARU 17
## 2 TB_PLEURA 17
## 3 TB_LAINNYA 17
train_weighted <- train_weighted %>%
select(-weight)
testData <- testData %>%
select(-weight)
model_nb <- naiveBayes(PENYAKIT ~ ., data = train_weighted)
model_nb
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.3612335 0.3171806 0.3215859
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.4024390 0.5975610
## TB_PLEURA 0.1944444 0.8055556
## TB_LAINNYA 0.4657534 0.5342466
##
## USIA_KATEGORI
## Y <=45 >45
## TB_PARU 0.3658537 0.6341463
## TB_PLEURA 0.2361111 0.7638889
## TB_LAINNYA 0.6438356 0.3561644
##
## LEOKOSIT
## Y [,1] [,2]
## TB_PARU 0.5243902 0.5024781
## TB_PLEURA 0.5694444 0.4986288
## TB_LAINNYA 0.3150685 0.4677580
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.2804878 0.4520021
## TB_PLEURA 0.3194444 0.4695334
## TB_LAINNYA 0.3972603 0.4927171
##
## HEMOGLOBIN
## Y [,1] [,2]
## TB_PARU 0.5609756 0.4993220
## TB_PLEURA 0.8194444 0.3873488
## TB_LAINNYA 0.4931507 0.5034130
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.1707317 0.3785899
## TB_PLEURA 0.1111111 0.3164751
## TB_LAINNYA 0.4931507 0.5034130
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.19512195 0.3987333
## TB_PLEURA 0.09722222 0.2983392
## TB_LAINNYA 0.39726027 0.4927171
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.92682927 0.07317073
## TB_PLEURA 0.90277778 0.09722222
## TB_LAINNYA 0.93150685 0.06849315
prediksi <- predict(model_nb, testData)
confusionMatrix(prediksi, testData$PENYAKIT)
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 28 2 0
## TB_PLEURA 15 3 0
## TB_LAINNYA 4 1 2
##
## Overall Statistics
##
## Accuracy : 0.6
## 95% CI : (0.4591, 0.7298)
## No Information Rate : 0.8545
## P-Value [Acc > NIR] : 0.999999
##
## Kappa : 0.1896
##
## Mcnemar's Test P-Value : 0.001868
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.5957 0.50000 1.00000
## Specificity 0.7500 0.69388 0.90566
## Pos Pred Value 0.9333 0.16667 0.28571
## Neg Pred Value 0.2400 0.91892 1.00000
## Prevalence 0.8545 0.10909 0.03636
## Detection Rate 0.5091 0.05455 0.03636
## Detection Prevalence 0.5455 0.32727 0.12727
## Balanced Accuracy 0.6729 0.59694 0.95283
Naive Bayes dengan Balanced data dengan n menentukan sendiri
library(dplyr)
library(e1071)
library(caret)
set.seed(1001)
# jumlah data yang diinginkan per kelas
n_sample <- 258
down_train <- trainData %>%
group_by(PENYAKIT) %>%
sample_n(size = n_sample, replace = TRUE) %>%
ungroup()
# cek distribusi
table(down_train$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 258 258 258
down_train %>% count(PENYAKIT)
## # A tibble: 3 × 2
## PENYAKIT n
## <fct> <int>
## 1 TB_PARU 258
## 2 TB_PLEURA 258
## 3 TB_LAINNYA 258
NB_down <- naiveBayes(PENYAKIT ~ ., data = down_train)
NB_down
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.3333333 0.3333333 0.3333333
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.3565891 0.6434109
## TB_PLEURA 0.2209302 0.7790698
## TB_LAINNYA 0.4767442 0.5232558
##
## USIA_KATEGORI
## Y <=45 >45
## TB_PARU 0.3759690 0.6240310
## TB_PLEURA 0.2906977 0.7093023
## TB_LAINNYA 0.6124031 0.3875969
##
## LEOKOSIT
## Y [,1] [,2]
## TB_PARU 0.5155039 0.5007309
## TB_PLEURA 0.5930233 0.4922253
## TB_LAINNYA 0.2441860 0.4304386
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.2829457 0.4513058
## TB_PLEURA 0.3914729 0.4890284
## TB_LAINNYA 0.3139535 0.4649998
##
## HEMOGLOBIN
## Y [,1] [,2]
## TB_PARU 0.5813953 0.4942892
## TB_PLEURA 0.7635659 0.4257176
## TB_LAINNYA 0.5271318 0.5002337
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.1782946 0.3835045
## TB_PLEURA 0.1744186 0.3802066
## TB_LAINNYA 0.4496124 0.4984215
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.2054264 0.4047981
## TB_PLEURA 0.1162791 0.3211823
## TB_LAINNYA 0.3255814 0.4695024
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.93023256 0.06976744
## TB_PLEURA 0.86821705 0.13178295
## TB_LAINNYA 0.92635659 0.07364341
##
## weight
## Y [,1] [,2]
## TB_PARU 0.400000 0
## TB_PLEURA 2.848485 0
## TB_LAINNYA 6.714286 0
testData$predicted <- predict(NB_down, testData)
## Warning in predict.naiveBayes(NB_down, testData): Type mismatch between
## training and new data for variable 'weight'. Did you use factors with numeric
## labels for training, and numeric values for new data?
testData$actual <- testData$PENYAKIT
confusionMatrix(
factor(testData$predicted),
factor(testData$actual)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 16 2 0
## TB_PLEURA 24 3 0
## TB_LAINNYA 7 1 2
##
## Overall Statistics
##
## Accuracy : 0.3818
## 95% CI : (0.2541, 0.5227)
## No Information Rate : 0.8545
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0636
##
## Mcnemar's Test P-Value : 7.088e-06
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.3404 0.50000 1.00000
## Specificity 0.7500 0.51020 0.84906
## Pos Pred Value 0.8889 0.11111 0.20000
## Neg Pred Value 0.1622 0.89286 1.00000
## Prevalence 0.8545 0.10909 0.03636
## Detection Rate 0.2909 0.05455 0.03636
## Detection Prevalence 0.3273 0.49091 0.18182
## Balanced Accuracy 0.5452 0.50510 0.92453
Naive Bayes Balanced data dengan Weighted training dengan data no
NA
library(dplyr)
library(caret)
library(e1071)
# Gunakan data tanpa missing value
data <- data_model_no_na
table(data$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 235 33 14
prop.table(table(data$PENYAKIT))
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.83333333 0.11702128 0.04964539
class_freq <- table(data$PENYAKIT)
class_weight <- sum(class_freq) / (length(class_freq) * class_freq)
class_weight
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.400000 2.848485 6.714286
data <- data %>%
mutate(weight = class_weight[PENYAKIT])
set.seed(123)
trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]
set.seed(123)
train_weighted <- trainData %>%
slice_sample(
n = nrow(trainData),
replace = TRUE,
weight_by = weight
)
train_weighted <- train_weighted %>%
select(-weight)
testData <- testData %>%
select(-weight)
train_weighted %>%
count(PENYAKIT)
## # A tibble: 3 × 2
## PENYAKIT n
## <fct> <int>
## 1 TB_PARU 82
## 2 TB_PLEURA 72
## 3 TB_LAINNYA 73
model_nb_weight <- naiveBayes(
PENYAKIT ~ .,
data = train_weighted,
laplace = 1
)
model_nb_weight
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.3612335 0.3171806 0.3215859
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.4047619 0.5952381
## TB_PLEURA 0.2027027 0.7972973
## TB_LAINNYA 0.4666667 0.5333333
##
## USIA_KATEGORI
## Y <=45 >45
## TB_PARU 0.3780488 0.6463415
## TB_PLEURA 0.2500000 0.7777778
## TB_LAINNYA 0.6575342 0.3698630
##
## LEOKOSIT
## Y [,1] [,2]
## TB_PARU 0.5243902 0.5024781
## TB_PLEURA 0.5694444 0.4986288
## TB_LAINNYA 0.3150685 0.4677580
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.2804878 0.4520021
## TB_PLEURA 0.3194444 0.4695334
## TB_LAINNYA 0.3972603 0.4927171
##
## HEMOGLOBIN
## Y [,1] [,2]
## TB_PARU 0.5609756 0.4993220
## TB_PLEURA 0.8194444 0.3873488
## TB_LAINNYA 0.4931507 0.5034130
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.1707317 0.3785899
## TB_PLEURA 0.1111111 0.3164751
## TB_LAINNYA 0.4931507 0.5034130
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.19512195 0.3987333
## TB_PLEURA 0.09722222 0.2983392
## TB_LAINNYA 0.39726027 0.4927171
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.91666667 0.08333333
## TB_PLEURA 0.89189189 0.10810811
## TB_LAINNYA 0.92000000 0.08000000
prediksi <- predict(model_nb_weight, testData)
confusionMatrix(prediksi, testData$PENYAKIT)
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 28 2 0
## TB_PLEURA 15 3 0
## TB_LAINNYA 4 1 2
##
## Overall Statistics
##
## Accuracy : 0.6
## 95% CI : (0.4591, 0.7298)
## No Information Rate : 0.8545
## P-Value [Acc > NIR] : 0.999999
##
## Kappa : 0.1896
##
## Mcnemar's Test P-Value : 0.001868
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.5957 0.50000 1.00000
## Specificity 0.7500 0.69388 0.90566
## Pos Pred Value 0.9333 0.16667 0.28571
## Neg Pred Value 0.2400 0.91892 1.00000
## Prevalence 0.8545 0.10909 0.03636
## Detection Rate 0.5091 0.05455 0.03636
## Detection Prevalence 0.5455 0.32727 0.12727
## Balanced Accuracy 0.6729 0.59694 0.95283
Naive Bayes Balanced data dengan Weighted training dengan DATA
FULL
library(dplyr)
library(caret)
library(e1071)
# Gunakan data tanpa missing value
data <- data_model
table(data$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 265 36 21
prop.table(table(data$PENYAKIT))
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.82298137 0.11180124 0.06521739
class_freq <- table(data$PENYAKIT)
class_weight <- sum(class_freq) / (length(class_freq) * class_freq)
class_weight
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.4050314 2.9814815 5.1111111
data <- data %>%
mutate(weight = class_weight[PENYAKIT])
set.seed(123)
trainIndex <- createDataPartition(data$PENYAKIT, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]
set.seed(123)
train_weighted <- trainData %>%
slice_sample(
n = nrow(trainData),
replace = TRUE,
weight_by = weight
)
train_weighted <- train_weighted %>%
select(-weight)
testData <- testData %>%
select(-weight)
train_weighted %>%
count(PENYAKIT)
## # A tibble: 3 × 2
## PENYAKIT n
## <fct> <int>
## 1 TB_PARU 91
## 2 TB_PLEURA 78
## 3 TB_LAINNYA 89
model_nb_weight <- naiveBayes(
PENYAKIT ~ .,
data = train_weighted,
laplace = 1
)
model_nb_weight
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.3527132 0.3023256 0.3449612
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.2903226 0.7096774
## TB_PLEURA 0.2750000 0.7250000
## TB_LAINNYA 0.4505495 0.5494505
##
## USIA_KATEGORI
## Y <=45 >45
## TB_PARU 0.3516484 0.6703297
## TB_PLEURA 0.3717949 0.6538462
## TB_LAINNYA 0.6629213 0.3595506
##
## LEOKOSIT
## Y [,1] [,2]
## TB_PARU 0.5647059 0.4987379
## TB_PLEURA 0.7500000 0.4358899
## TB_LAINNYA 0.2289157 0.4226889
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.1882353 0.3932198
## TB_PLEURA 0.4078947 0.4947088
## TB_LAINNYA 0.2658228 0.4445932
##
## HEMOGLOBIN
## Y [,1] [,2]
## TB_PARU 0.5443038 0.5012157
## TB_PLEURA 0.5616438 0.4996193
## TB_LAINNYA 0.6363636 0.4847319
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.2197802 0.4163919
## TB_PLEURA 0.1923077 0.3966644
## TB_LAINNYA 0.3707865 0.4857521
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.05494505 0.2291354
## TB_PLEURA 0.02564103 0.1590850
## TB_LAINNYA 0.23595506 0.4269999
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.89247312 0.10752688
## TB_PLEURA 0.93750000 0.06250000
## TB_LAINNYA 0.93406593 0.06593407
prediksi <- predict(model_nb_weight, testData)
confusionMatrix(prediksi, testData$PENYAKIT)
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 18 4 1
## TB_PLEURA 20 1 1
## TB_LAINNYA 15 2 2
##
## Overall Statistics
##
## Accuracy : 0.3281
## 95% CI : (0.2159, 0.4569)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0397
##
## Mcnemar's Test P-Value : 3.582e-05
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.3396 0.14286 0.50000
## Specificity 0.5455 0.63158 0.71667
## Pos Pred Value 0.7826 0.04545 0.10526
## Neg Pred Value 0.1463 0.85714 0.95556
## Prevalence 0.8281 0.10938 0.06250
## Detection Rate 0.2812 0.01562 0.03125
## Detection Prevalence 0.3594 0.34375 0.29688
## Balanced Accuracy 0.4425 0.38722 0.60833