library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data_tb <- read_excel("D:/FINAL TA/SUMBER REFRENSI/TA.xlsx", sheet = 1)
head(data_tb)
## # A tibble: 6 × 11
## NO `TANGGAL MASUK` `JENIS KELAMIN` `UMUR TAHUN` `KEADAAN KELUAR` PENYAKIT
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 18/02/2024 1 124 1 1
## 2 2 21/02/2024 1 23 0 1
## 3 3 19/04/2024 1 62 0 1
## 4 4 20/04/2024 1 42 0 1
## 5 5 29/04/2024 1 71 0 1
## 6 6 45478 1 63 0 1
## # ℹ 5 more variables: `LAMA RAWAT INAP` <dbl>, `PENYAKIT PENYERTA` <chr>,
## # LEOKOSIT <chr>, TROMBOSIT <chr>, HEMOGLOBIN <chr>
str(data_tb)
## tibble [322 × 11] (S3: tbl_df/tbl/data.frame)
## $ NO : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
## $ TANGGAL MASUK : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
## $ JENIS KELAMIN : num [1:322] 1 1 1 1 1 1 0 0 0 1 ...
## $ UMUR TAHUN : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
## $ KEADAAN KELUAR : num [1:322] 1 0 0 0 0 0 0 0 0 0 ...
## $ PENYAKIT : num [1:322] 1 1 1 1 1 1 1 1 1 1 ...
## $ LAMA RAWAT INAP : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
## $ PENYAKIT PENYERTA: chr [1:322] "0" "0" "0" "0" ...
## $ LEOKOSIT : chr [1:322] "0" "0" "0" "1" ...
## $ TROMBOSIT : chr [1:322] "435" "314" "478" "538" ...
## $ HEMOGLOBIN : chr [1:322] "1" "0" "1" "1" ...
data_tb <- na.omit(data_tb)
colnames(data_tb) <- c(
"NO",
"TANGGAL_MASUK",
"JENIS_KELAMIN",
"UMUR_TAHUN",
"KEADAAN_KELUAR",
"PENYAKIT",
"LAMA_RAWAT_INAP",
"PENYAKIT_PENYERTA",
"LEOKOSIT",
"TROMBOSIT",
"HEMOGLOBIN"
)
data_tb <- data_tb %>%
mutate(KODE_TROMBOSIT = ifelse(TROMBOSIT >= 150 & TROMBOSIT <= 450, 0, 1))
data_tb <- data_tb %>%
mutate(KODE_LAMA_RAWAT = ifelse(LAMA_RAWAT_INAP <= 5, 0, 1))
data_tb <- data_tb %>%
mutate(KODE_PENYERTA = ifelse(PENYAKIT_PENYERTA == 0, 0, 1))
data_tb$KEADAAN_KELUAR <- factor(data_tb$KEADAAN_KELUAR,
levels = c(0,1),
labels = c("HIDUP","MENINGGAL"))
data_tb$PENYAKIT <- factor(data_tb$PENYAKIT,
levels = c(0,1,2),
labels = c("TB_PARU","TB_PLEURA","TB_LAINNYA"))
data_tb$JENIS_KELAMIN <- factor(data_tb$JENIS_KELAMIN,
levels = c(0,1),
labels = c("P","L"))
summary(data_tb)
## NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN
## Min. : 1.00 Length:322 P:109 Min. : 1.0
## 1st Qu.: 81.25 Class :character L:213 1st Qu.: 41.0
## Median :161.50 Mode :character Median : 54.0
## Mean :161.50 Mean : 51.4
## 3rd Qu.:241.75 3rd Qu.: 64.0
## Max. :322.00 Max. :124.0
## KEADAAN_KELUAR PENYAKIT LAMA_RAWAT_INAP PENYAKIT_PENYERTA
## HIDUP :291 TB_PARU :265 Min. : 1.000 Length:322
## MENINGGAL: 31 TB_PLEURA : 36 1st Qu.: 3.000 Class :character
## TB_LAINNYA: 21 Median : 4.000 Mode :character
## Mean : 4.339
## 3rd Qu.: 5.000
## Max. :15.000
## LEOKOSIT TROMBOSIT HEMOGLOBIN KODE_TROMBOSIT
## Length:322 Length:322 Length:322 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Mode :character Median :0.0000
## Mean :0.3292
## 3rd Qu.:1.0000
## Max. :1.0000
## KODE_LAMA_RAWAT KODE_PENYERTA
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.000
## Mean :0.1429 Mean :0.205
## 3rd Qu.:0.0000 3rd Qu.:0.000
## Max. :1.0000 Max. :1.000
str(data_tb)
## tibble [322 × 14] (S3: tbl_df/tbl/data.frame)
## $ NO : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
## $ TANGGAL_MASUK : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
## $ JENIS_KELAMIN : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
## $ UMUR_TAHUN : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
## $ KEADAAN_KELUAR : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
## $ PENYAKIT : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ LAMA_RAWAT_INAP : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
## $ PENYAKIT_PENYERTA: chr [1:322] "0" "0" "0" "0" ...
## $ LEOKOSIT : chr [1:322] "0" "0" "0" "1" ...
## $ TROMBOSIT : chr [1:322] "435" "314" "478" "538" ...
## $ HEMOGLOBIN : chr [1:322] "1" "0" "1" "1" ...
## $ KODE_TROMBOSIT : num [1:322] 0 0 1 1 0 1 1 1 1 0 ...
## $ KODE_LAMA_RAWAT : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
## $ KODE_PENYERTA : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
colSums(is.na(data_tb))
## NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN
## 0 0 0 0
## KEADAAN_KELUAR PENYAKIT LAMA_RAWAT_INAP PENYAKIT_PENYERTA
## 0 0 0 0
## LEOKOSIT TROMBOSIT HEMOGLOBIN KODE_TROMBOSIT
## 0 0 0 0
## KODE_LAMA_RAWAT KODE_PENYERTA
## 0 0
View(data_tb)
tb_freq <- table(data_tb$PENYAKIT)
tb_percent <- prop.table(tb_freq) * 100
tb_deskriptif <- data.frame(
Jenis_TBC = names(tb_freq),
N = as.vector(tb_freq),
Persen = round(as.vector(tb_percent),2)
)
tb_deskriptif
## Jenis_TBC N Persen
## 1 TB_PARU 265 82.30
## 2 TB_PLEURA 36 11.18
## 3 TB_LAINNYA 21 6.52
#Visualisasi distribusi data dalam bentuk diagram lingkaran.
pie(tb_freq,
main = "Diagram Lingkaran Jenis TBC",
col = c("orange","green","yellow"))
data_tb <- data_tb %>%
mutate(
USIA_KATEGORI = ifelse(UMUR_TAHUN <= 45, "≤45", ">45")
)
library(dplyr)
deskriptif <- function(data, var){
freq <- table(data[[var]])
persen <- prop.table(freq) * 100
hasil <- data.frame(
Variabel = var,
Kategori = names(freq),
N = as.vector(freq),
Persen = round(as.vector(persen), 2)
)
return(hasil)
}
d1 <- deskriptif(data_tb, "JENIS_KELAMIN")
d2 <- deskriptif(data_tb, "USIA_KATEGORI")
d3 <- deskriptif(data_tb, "LEOKOSIT")
d4 <- deskriptif(data_tb, "KODE_TROMBOSIT")
d5 <- deskriptif(data_tb, "HEMOGLOBIN")
d6 <- deskriptif(data_tb, "KODE_PENYERTA")
d7 <- deskriptif(data_tb, "KODE_LAMA_RAWAT")
d8 <- deskriptif(data_tb, "KEADAAN_KELUAR")
tabel_deskriptif <- bind_rows(d1,d2,d3,d4,d5,d6,d7,d8)
tabel_deskriptif
## Variabel Kategori N Persen
## 1 JENIS_KELAMIN P 109 33.85
## 2 JENIS_KELAMIN L 213 66.15
## 3 USIA_KATEGORI >45 213 66.15
## 4 USIA_KATEGORI ≤45 109 33.85
## 5 LEOKOSIT - 14 4.35
## 6 LEOKOSIT 0 139 43.17
## 7 LEOKOSIT 1 169 52.48
## 8 KODE_TROMBOSIT 0 216 67.08
## 9 KODE_TROMBOSIT 1 106 32.92
## 10 HEMOGLOBIN - 39 12.11
## 11 HEMOGLOBIN 0 120 37.27
## 12 HEMOGLOBIN 1 163 50.62
## 13 KODE_PENYERTA 0 256 79.50
## 14 KODE_PENYERTA 1 66 20.50
## 15 KODE_LAMA_RAWAT 0 276 85.71
## 16 KODE_LAMA_RAWAT 1 46 14.29
## 17 KEADAAN_KELUAR HIDUP 291 90.37
## 18 KEADAAN_KELUAR MENINGGAL 31 9.63
data_no_na <- na.omit(data_tb)
head(data_tb)
## # A tibble: 6 × 15
## NO TANGGAL_MASUK JENIS_KELAMIN UMUR_TAHUN KEADAAN_KELUAR PENYAKIT
## <dbl> <chr> <fct> <dbl> <fct> <fct>
## 1 1 18/02/2024 L 124 MENINGGAL TB_PLEURA
## 2 2 21/02/2024 L 23 HIDUP TB_PLEURA
## 3 3 19/04/2024 L 62 HIDUP TB_PLEURA
## 4 4 20/04/2024 L 42 HIDUP TB_PLEURA
## 5 5 29/04/2024 L 71 HIDUP TB_PLEURA
## 6 6 45478 L 63 HIDUP TB_PLEURA
## # ℹ 9 more variables: LAMA_RAWAT_INAP <dbl>, PENYAKIT_PENYERTA <chr>,
## # LEOKOSIT <chr>, TROMBOSIT <chr>, HEMOGLOBIN <chr>, KODE_TROMBOSIT <dbl>,
## # KODE_LAMA_RAWAT <dbl>, KODE_PENYERTA <dbl>, USIA_KATEGORI <chr>
str(data_tb)
## tibble [322 × 15] (S3: tbl_df/tbl/data.frame)
## $ NO : num [1:322] 1 2 3 4 5 6 7 8 9 10 ...
## $ TANGGAL_MASUK : chr [1:322] "18/02/2024" "21/02/2024" "19/04/2024" "20/04/2024" ...
## $ JENIS_KELAMIN : Factor w/ 2 levels "P","L": 2 2 2 2 2 2 1 1 1 2 ...
## $ UMUR_TAHUN : num [1:322] 124 23 62 42 71 63 51 42 27 66 ...
## $ KEADAAN_KELUAR : Factor w/ 2 levels "HIDUP","MENINGGAL": 2 1 1 1 1 1 1 1 1 1 ...
## $ PENYAKIT : Factor w/ 3 levels "TB_PARU","TB_PLEURA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ LAMA_RAWAT_INAP : num [1:322] 3 1 3 3 3 3 4 7 3 5 ...
## $ PENYAKIT_PENYERTA: chr [1:322] "0" "0" "0" "0" ...
## $ LEOKOSIT : chr [1:322] "0" "0" "0" "1" ...
## $ TROMBOSIT : chr [1:322] "435" "314" "478" "538" ...
## $ HEMOGLOBIN : chr [1:322] "1" "0" "1" "1" ...
## $ KODE_TROMBOSIT : num [1:322] 0 0 1 1 0 1 1 1 1 0 ...
## $ KODE_LAMA_RAWAT : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
## $ KODE_PENYERTA : num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
## $ USIA_KATEGORI : chr [1:322] ">45" "≤45" ">45" "≤45" ...
data_tb[data_tb == "-"] <- NA
nrow(data_tb)
## [1] 322
nrow(data_no_na)
## [1] 322
data_no_na <- na.omit(data_tb)
X <- data_no_na[,c(
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]
X <- data.frame(lapply(X, function(x) as.numeric(as.factor(x))))
str(X)
## 'data.frame': 281 obs. of 8 variables:
## $ JENIS_KELAMIN : num 2 2 2 2 2 1 1 2 2 2 ...
## $ USIA_KATEGORI : num 1 2 1 2 1 1 2 1 1 1 ...
## $ LEOKOSIT : num 1 1 1 2 2 2 2 2 1 2 ...
## $ KODE_TROMBOSIT : num 1 1 2 2 1 2 2 1 2 1 ...
## $ HEMOGLOBIN : num 2 1 2 2 2 2 2 1 2 1 ...
## $ KODE_PENYERTA : num 1 1 1 1 2 1 1 1 2 1 ...
## $ KODE_LAMA_RAWAT: num 1 1 1 1 1 1 2 1 1 1 ...
## $ KEADAAN_KELUAR : num 2 1 1 1 1 1 1 1 1 1 ...
View(data_no_na)
data_model_no_na <- data_no_na[, c(
"PENYAKIT",
"JENIS_KELAMIN",
"USIA_KATEGORI",
"LEOKOSIT",
"KODE_TROMBOSIT",
"HEMOGLOBIN",
"KODE_PENYERTA",
"KODE_LAMA_RAWAT",
"KEADAAN_KELUAR"
)]
data_model_no_na $PENYAKIT <- factor(data_model_no_na $PENYAKIT)
View(data_model_no_na)
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
set.seed(1001)
train_index <- createDataPartition(data_model_no_na$PENYAKIT, p = 0.80, list = FALSE)
trainDataNO <- data_model_no_na[train_index, ]
testDataNO <- data_model_no_na[-train_index, ]
n <- nrow(trainDataNO)
n
## [1] 227
n <- nrow(testDataNO)
n
## [1] 54
trainDataNO %>% count(PENYAKIT)
## # A tibble: 3 × 2
## PENYAKIT n
## <fct> <int>
## 1 TB_PARU 188
## 2 TB_PLEURA 27
## 3 TB_LAINNYA 12
testDataNO %>% count(PENYAKIT)
## # A tibble: 3 × 2
## PENYAKIT n
## <fct> <int>
## 1 TB_PARU 46
## 2 TB_PLEURA 6
## 3 TB_LAINNYA 2
set.seed(1001)
down_trainNO <- downSample(
x = trainDataNO[, !colnames(trainDataNO) %in% "PENYAKIT"],
y = trainDataNO$PENYAKIT
)
names(down_trainNO)[names(down_trainNO) == "Class"] <- "PENYAKIT"
table(down_trainNO$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 12 12 12
down_trainNO %>%
count(PENYAKIT)
## PENYAKIT n
## 1 TB_PARU 12
## 2 TB_PLEURA 12
## 3 TB_LAINNYA 12
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
NBClassifier0 <- naiveBayes(PENYAKIT ~ ., data = down_trainNO)
NBClassifier0
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.3333333 0.3333333 0.3333333
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.25000000 0.75000000
## TB_PLEURA 0.08333333 0.91666667
## TB_LAINNYA 0.50000000 0.50000000
##
## USIA_KATEGORI
## Y >45 ≤45
## TB_PARU 0.7500000 0.2500000
## TB_PLEURA 0.7500000 0.2500000
## TB_LAINNYA 0.3333333 0.6666667
##
## LEOKOSIT
## Y 0 1
## TB_PARU 0.5000000 0.5000000
## TB_PLEURA 0.4166667 0.5833333
## TB_LAINNYA 0.6666667 0.3333333
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.08333333 0.2886751
## TB_PLEURA 0.25000000 0.4522670
## TB_LAINNYA 0.41666667 0.5149287
##
## HEMOGLOBIN
## Y 0 1
## TB_PARU 0.75 0.25
## TB_PLEURA 0.50 0.50
## TB_LAINNYA 0.50 0.50
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.3333333 0.4923660
## TB_PLEURA 0.1666667 0.3892495
## TB_LAINNYA 0.5000000 0.5222330
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.0000000 0.000000
## TB_PLEURA 0.0000000 0.000000
## TB_LAINNYA 0.3333333 0.492366
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.83333333 0.16666667
## TB_PLEURA 0.75000000 0.25000000
## TB_LAINNYA 0.91666667 0.08333333
testDataNO$predicted <- predict(NBClassifier0, testDataNO)
testDataNO$actual <- testDataNO$PENYAKIT
library(caret)
confusionMatrix(
factor(testDataNO$predicted),
factor(testDataNO$actual)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 20 1 0
## TB_PLEURA 22 5 1
## TB_LAINNYA 4 0 1
##
## Overall Statistics
##
## Accuracy : 0.4815
## 95% CI : (0.3434, 0.6216)
## No Information Rate : 0.8519
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1467
##
## Mcnemar's Test P-Value : 2.298e-05
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.4348 0.83333 0.50000
## Specificity 0.8750 0.52083 0.92308
## Pos Pred Value 0.9524 0.17857 0.20000
## Neg Pred Value 0.2121 0.96154 0.97959
## Prevalence 0.8519 0.11111 0.03704
## Detection Rate 0.3704 0.09259 0.01852
## Detection Prevalence 0.3889 0.51852 0.09259
## Balanced Accuracy 0.6549 0.67708 0.71154
set.seed(1001)
up_trainNO <- upSample(
x = trainDataNO[, !colnames(trainDataNO) %in% "PENYAKIT"],
y = trainDataNO$PENYAKIT
)
names(up_trainNO)[names(up_trainNO) == "Class"] <- "PENYAKIT"
table(up_trainNO$PENYAKIT)
##
## TB_PARU TB_PLEURA TB_LAINNYA
## 188 188 188
up_trainNO %>%
count(PENYAKIT)
## PENYAKIT n
## 1 TB_PARU 188
## 2 TB_PLEURA 188
## 3 TB_LAINNYA 188
library(e1071)
NBClassifier1 <- naiveBayes(PENYAKIT ~ ., data = up_trainNO)
NBClassifier1
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## TB_PARU TB_PLEURA TB_LAINNYA
## 0.3333333 0.3333333 0.3333333
##
## Conditional probabilities:
## JENIS_KELAMIN
## Y P L
## TB_PARU 0.3351064 0.6648936
## TB_PLEURA 0.2287234 0.7712766
## TB_LAINNYA 0.4734043 0.5265957
##
## USIA_KATEGORI
## Y >45 ≤45
## TB_PARU 0.6861702 0.3138298
## TB_PLEURA 0.6914894 0.3085106
## TB_LAINNYA 0.2925532 0.7074468
##
## LEOKOSIT
## Y 0 1
## TB_PARU 0.4202128 0.5797872
## TB_PLEURA 0.4468085 0.5531915
## TB_LAINNYA 0.6117021 0.3882979
##
## KODE_TROMBOSIT
## Y [,1] [,2]
## TB_PARU 0.2872340 0.4536799
## TB_PLEURA 0.3670213 0.4832794
## TB_LAINNYA 0.4468085 0.4984902
##
## HEMOGLOBIN
## Y 0 1
## TB_PARU 0.4468085 0.5531915
## TB_PLEURA 0.2393617 0.7606383
## TB_LAINNYA 0.5053191 0.4946809
##
## KODE_PENYERTA
## Y [,1] [,2]
## TB_PARU 0.1808511 0.3859225
## TB_PLEURA 0.1436170 0.3516374
## TB_LAINNYA 0.4840426 0.5010797
##
## KODE_LAMA_RAWAT
## Y [,1] [,2]
## TB_PARU 0.16489362 0.3720754
## TB_PLEURA 0.07446809 0.2632321
## TB_LAINNYA 0.35638298 0.4802091
##
## KEADAAN_KELUAR
## Y HIDUP MENINGGAL
## TB_PARU 0.89893617 0.10106383
## TB_PLEURA 0.88829787 0.11170213
## TB_LAINNYA 0.93617021 0.06382979
testDataNO$predicted <- predict(NBClassifier1, testDataNO)
testDataNO$actual <- testDataNO$PENYAKIT
library(caret)
confusionMatrix(
factor(testDataNO$predicted),
factor(testDataNO$actual)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction TB_PARU TB_PLEURA TB_LAINNYA
## TB_PARU 10 1 0
## TB_PLEURA 26 4 1
## TB_LAINNYA 10 1 1
##
## Overall Statistics
##
## Accuracy : 0.2778
## 95% CI : (0.1646, 0.4164)
## No Information Rate : 0.8519
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0427
##
## Mcnemar's Test P-Value : 2.997e-07
##
## Statistics by Class:
##
## Class: TB_PARU Class: TB_PLEURA Class: TB_LAINNYA
## Sensitivity 0.2174 0.66667 0.50000
## Specificity 0.8750 0.43750 0.78846
## Pos Pred Value 0.9091 0.12903 0.08333
## Neg Pred Value 0.1628 0.91304 0.97619
## Prevalence 0.8519 0.11111 0.03704
## Detection Rate 0.1852 0.07407 0.01852
## Detection Prevalence 0.2037 0.57407 0.22222
## Balanced Accuracy 0.5462 0.55208 0.64423