Keterangan: data ini sudah bersih dan sudah melalui tahap pemrosesan; data
missing sudah diisi dengan mean karena data tidak berdistribusi normal.
data_model: dataset utama penelitian yang sudah melalui proses
pembersihan data dan imputasi missing value.
trainData: data training (80%) yang digunakan untuk membangun
model.
testData: data testing (20%) yang digunakan untuk menguji performa
model.
down_train: data training hasil undersampling (mengurangi kelas
mayoritas).
up_train: data training hasil oversampling (menambah kelas
minoritas).
train_smote: data training hasil SMOTE (membuat data sintetis kelas
minoritas).
train_weighted: data training hasil resampling berbobot kelas (baris diambil
ulang dengan pengembalian; kelas minoritas diberi peluang terpilih lebih
besar) tanpa mengubah jumlah data.
1. LIBRARY
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nnet)
## Warning: package 'nnet' was built under R version 4.4.3
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(recipes)
## Warning: package 'recipes' was built under R version 4.4.3
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(themis)
## Warning: package 'themis' was built under R version 4.4.3
2. IMPORT DATA
# Read the cleaned dataset from Excel and store the outcome column PENYAKIT
# as a factor so the classifiers below treat it as a class label.
data_model <- read_excel("D:/FINAL TA/data_bersih.xlsx") %>%
  mutate(PENYAKIT = as.factor(PENYAKIT))
3. CEK DATA
# Print the structure of the loaded data: column names, types, first values.
utils::str(data_model)
## tibble [322 × 9] (S3: tbl_df/tbl/data.frame)
## $ JENIS_KELAMIN : num [1:322] 1 1 1 1 1 1 0 0 0 1 ...
## $ KEADAAN_KELUAR : num [1:322] 1 0 0 0 0 0 0 0 0 0 ...
## $ PENYAKIT : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ PENYAKIT_PENYERTA: num [1:322] 0 0 0 0 1 1 0 0 0 0 ...
## $ USIA_KATEGORI : num [1:322] 1 0 1 0 1 1 1 0 0 1 ...
## $ KODE_LAMA_RAWAT : num [1:322] 0 0 0 0 0 0 0 1 0 0 ...
## $ KODE_LEOKOSIT : num [1:322] 0 0 0 1 1 0 1 1 0 0 ...
## $ KODE_TROMBOSIT : num [1:322] 0 0 1 1 0 0 1 1 1 0 ...
## $ KODE_HEMOGLOBIN : num [1:322] 0 0 1 0 1 0 1 1 0 0 ...
# Count the missing values in every column.
data_model %>%
  is.na() %>%
  colSums()
## JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT PENYAKIT_PENYERTA
## 0 0 0 0
## USIA_KATEGORI KODE_LAMA_RAWAT KODE_LEOKOSIT KODE_TROMBOSIT
## 0 0 0 0
## KODE_HEMOGLOBIN
## 0
# Open the spreadsheet-style viewer only in an interactive session: View() is
# a GUI side effect that produces nothing useful when the script is run in
# batch mode or rendered to a report.
if (interactive()) {
  View(data_model)
}
4. VIF
# Multicollinearity check: fit an auxiliary linear model on all predictors and
# report their variance inflation factors. PENYAKIT is coerced to its numeric
# level codes only so that lm() accepts it as a response.
model_vif <- stats::lm(as.numeric(PENYAKIT) ~ ., data = data_model)
car::vif(model_vif)
## JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT_PENYERTA USIA_KATEGORI
## 1.054866 1.044094 1.071182 1.042413
## KODE_LAMA_RAWAT KODE_LEOKOSIT KODE_TROMBOSIT KODE_HEMOGLOBIN
## 1.040701 1.053958 1.089823 1.097102
5. SPLIT DATA
# Seeded 80/20 train/test split, partitioned on the outcome PENYAKIT.
set.seed(123)
trainIndex <- createDataPartition(
  y = data_model$PENYAKIT,
  p = 0.8,
  list = FALSE
)
trainData <- data_model[trainIndex, ]
testData <- data_model[-trainIndex, ]

# Class distribution of the training partition.
count(trainData, PENYAKIT)
## # A tibble: 3 × 2
## PENYAKIT n
## <fct> <int>
## 1 0 212
## 2 1 29
## 3 2 17
# Class distribution of the test partition.
count(testData, PENYAKIT)
## # A tibble: 3 × 2
## PENYAKIT n
## <fct> <int>
## 1 0 53
## 2 1 7
## 3 2 4
MODEL UTAMA (MULTINOMIAL LOGISTIC)
# Main model: multinomial logistic regression of PENYAKIT on every other
# column of the training partition.
model_multinom <- nnet::multinom(formula = PENYAKIT ~ ., data = trainData)
## # weights: 30 (18 variable)
## initial value 283.441970
## iter 10 value 141.881038
## iter 20 value 133.187779
## final value 133.170821
## converged
# Predict a class for each held-out row and compare against the true labels.
pred_multinom <- predict(model_multinom, newdata = testData)
confusionMatrix(
  data = pred_multinom,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 50 7 4
## 1 0 0 0
## 2 3 0 0
##
## Overall Statistics
##
## Accuracy : 0.7812
## 95% CI : (0.6603, 0.8749)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 0.8751
##
## Kappa : -0.0529
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.9434 0.0000 0.00000
## Specificity 0.0000 1.0000 0.95000
## Pos Pred Value 0.8197 NaN 0.00000
## Neg Pred Value 0.0000 0.8906 0.93443
## Precision 0.8197 NA 0.00000
## Recall 0.9434 0.0000 0.00000
## F1 0.8772 NA NaN
## Prevalence 0.8281 0.1094 0.06250
## Detection Rate 0.7812 0.0000 0.00000
## Detection Prevalence 0.9531 0.0000 0.04688
## Balanced Accuracy 0.4717 0.5000 0.47500
NAIVE BAYES BASELINE
# Baseline: naive Bayes fitted on the original (imbalanced) training data and
# evaluated on the test partition.
# NOTE(review): the predictors are stored as numeric 0/1 codes, which e1071
# models with per-class Gaussian densities rather than category frequencies;
# confirm this is intended.
NB_base <- e1071::naiveBayes(PENYAKIT ~ ., data = trainData)
pred_base <- predict(NB_base, newdata = testData)
confusionMatrix(
  data = pred_base,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 48 7 2
## 1 0 0 0
## 2 5 0 2
##
## Overall Statistics
##
## Accuracy : 0.7812
## 95% CI : (0.6603, 0.8749)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 0.8751
##
## Kappa : 0.1442
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.9057 0.0000 0.50000
## Specificity 0.1818 1.0000 0.91667
## Pos Pred Value 0.8421 NaN 0.28571
## Neg Pred Value 0.2857 0.8906 0.96491
## Precision 0.8421 NA 0.28571
## Recall 0.9057 0.0000 0.50000
## F1 0.8727 NA 0.36364
## Prevalence 0.8281 0.1094 0.06250
## Detection Rate 0.7500 0.0000 0.03125
## Detection Prevalence 0.8906 0.0000 0.10938
## Balanced Accuracy 0.5437 0.5000 0.70833
UNDERSAMPLING
# Undersampling: randomly drop rows from the larger classes so the classes are
# balanced, then fit naive Bayes on the reduced set and evaluate on testData.
# The generator is seeded here because downSample() picks rows at random;
# without a seed the reduced training set (and the metrics) would depend on
# how many random numbers earlier statements happened to consume.
set.seed(123)
down_train <- downSample(
  x = dplyr::select(trainData, -PENYAKIT),
  y = trainData$PENYAKIT,
  # Name the outcome column directly instead of renaming "Class" afterwards.
  yname = "PENYAKIT"
)
NB_under <- naiveBayes(PENYAKIT ~ ., data = down_train)
pred_under <- predict(NB_under, newdata = testData)
confusionMatrix(
  data = pred_under,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 14 1 0
## 1 22 4 2
## 2 17 2 2
##
## Overall Statistics
##
## Accuracy : 0.3125
## 95% CI : (0.2024, 0.4406)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0679
##
## Mcnemar's Test P-Value : 6.88e-08
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.2642 0.5714 0.50000
## Specificity 0.9091 0.5789 0.68333
## Pos Pred Value 0.9333 0.1429 0.09524
## Neg Pred Value 0.2041 0.9167 0.95349
## Precision 0.9333 0.1429 0.09524
## Recall 0.2642 0.5714 0.50000
## F1 0.4118 0.2286 0.16000
## Prevalence 0.8281 0.1094 0.06250
## Detection Rate 0.2188 0.0625 0.03125
## Detection Prevalence 0.2344 0.4375 0.32812
## Balanced Accuracy 0.5866 0.5752 0.59167
OVERSAMPLING
# Oversampling: randomly duplicate rows of the smaller classes so the classes
# are balanced, then fit naive Bayes on the enlarged set and evaluate on
# testData.
# The generator is seeded here because upSample() draws rows at random;
# without a seed the resampled training set (and the metrics) would depend on
# how many random numbers earlier statements happened to consume.
set.seed(123)
up_train <- upSample(
  x = dplyr::select(trainData, -PENYAKIT),
  y = trainData$PENYAKIT,
  # Name the outcome column directly instead of renaming "Class" afterwards.
  yname = "PENYAKIT"
)
NB_over <- naiveBayes(PENYAKIT ~ ., data = up_train)
pred_over <- predict(NB_over, newdata = testData)
confusionMatrix(
  data = pred_over,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 6 1 0
## 1 33 5 2
## 2 14 1 2
##
## Overall Statistics
##
## Accuracy : 0.2031
## 95% CI : (0.1128, 0.3223)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0335
##
## Mcnemar's Test P-Value : 1.21e-09
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.11321 0.71429 0.50000
## Specificity 0.90909 0.38596 0.75000
## Pos Pred Value 0.85714 0.12500 0.11765
## Neg Pred Value 0.17544 0.91667 0.95745
## Precision 0.85714 0.12500 0.11765
## Recall 0.11321 0.71429 0.50000
## F1 0.20000 0.21277 0.19048
## Prevalence 0.82812 0.10938 0.06250
## Detection Rate 0.09375 0.07812 0.03125
## Detection Prevalence 0.10938 0.62500 0.26562
## Balanced Accuracy 0.51115 0.55013 0.62500
SMOTE
# SMOTE: add synthetic minority-class rows to the training data, then fit
# naive Bayes on the result and evaluate on testData.
# Seeded before the recipe is specified so the synthetic rows are reproducible
# regardless of how many random numbers earlier statements consumed.
set.seed(123)
rec_smote <- recipe(PENYAKIT ~ ., data = trainData) %>%
  step_smote(PENYAKIT)
# bake(new_data = NULL) returns the processed training set; it is the
# replacement for the superseded juice().
train_smote <- prep(rec_smote) %>%
  bake(new_data = NULL)
# NOTE(review): SMOTE interpolates between rows, so the 0/1 coded predictors
# may take fractional values here -- confirm this is acceptable.
# NOTE(review): laplace smoothing in e1071 applies to categorical predictors;
# with all-numeric predictors it presumably has no effect -- confirm.
NB_smote <- naiveBayes(PENYAKIT ~ ., data = train_smote, laplace = 1)
pred_smote <- predict(NB_smote, newdata = testData)
confusionMatrix(
  data = pred_smote,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 7 2 0
## 1 27 4 2
## 2 19 1 2
##
## Overall Statistics
##
## Accuracy : 0.2031
## 95% CI : (0.1128, 0.3223)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0109
##
## Mcnemar's Test P-Value : 6.917e-09
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.1321 0.5714 0.50000
## Specificity 0.8182 0.4912 0.66667
## Pos Pred Value 0.7778 0.1212 0.09091
## Neg Pred Value 0.1636 0.9032 0.95238
## Precision 0.7778 0.1212 0.09091
## Recall 0.1321 0.5714 0.50000
## F1 0.2258 0.2000 0.15385
## Prevalence 0.8281 0.1094 0.06250
## Detection Rate 0.1094 0.0625 0.03125
## Detection Prevalence 0.1406 0.5156 0.34375
## Balanced Accuracy 0.4751 0.5313 0.58333
WEIGHTED MODEL
# Class-weighted resampling: draw a training set of the same size, with
# replacement, where each row's selection probability is inversely
# proportional to the frequency of its class. Then fit naive Bayes on the
# resampled rows and evaluate on testData.
class_freq <- table(trainData$PENYAKIT)
class_weight <- sum(class_freq) / (length(class_freq) * class_freq)

# Look the weight up by class label rather than by factor integer code, and
# keep it as a plain numeric vector instead of a temporary data column.
row_weight <- as.numeric(class_weight[as.character(trainData$PENYAKIT)])

set.seed(123)
resampled_rows <- sample(
  seq_len(nrow(trainData)),
  size = nrow(trainData),
  replace = TRUE,
  prob = row_weight
)
train_weighted <- trainData[resampled_rows, ]

# NOTE(review): laplace smoothing in e1071 applies to categorical predictors;
# with all-numeric predictors it presumably has no effect -- confirm.
NB_weight <- naiveBayes(PENYAKIT ~ ., data = train_weighted, laplace = 1)
pred_weight <- predict(NB_weight, newdata = testData)
confusionMatrix(
  data = pred_weight,
  reference = testData$PENYAKIT,
  mode = "everything"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2
## 0 17 3 1
## 1 19 2 1
## 2 17 2 2
##
## Overall Statistics
##
## Accuracy : 0.3281
## 95% CI : (0.2159, 0.4569)
## No Information Rate : 0.8281
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0026
##
## Mcnemar's Test P-Value : 8.694e-06
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2
## Sensitivity 0.3208 0.28571 0.50000
## Specificity 0.6364 0.64912 0.68333
## Pos Pred Value 0.8095 0.09091 0.09524
## Neg Pred Value 0.1628 0.88095 0.95349
## Precision 0.8095 0.09091 0.09524
## Recall 0.3208 0.28571 0.50000
## F1 0.4595 0.13793 0.16000
## Prevalence 0.8281 0.10938 0.06250
## Detection Rate 0.2656 0.03125 0.03125
## Detection Prevalence 0.3281 0.34375 0.32812
## Balanced Accuracy 0.4786 0.46742 0.59167
# Preview the first six rows of the undersampled training set.
head(down_train, n = 6L)
## JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT_PENYERTA USIA_KATEGORI KODE_LAMA_RAWAT
## 1 1 0 0 0 1
## 2 0 0 0 1 0
## 3 0 0 0 0 0
## 4 0 1 0 0 0
## 5 1 0 1 1 0
## 6 0 0 0 0 0
## KODE_LEOKOSIT KODE_TROMBOSIT KODE_HEMOGLOBIN PENYAKIT
## 1 1 1 0 0
## 2 1 0 1 0
## 3 0 0 1 0
## 4 0 0 0 0
## 5 1 0 0 0
## 6 1 0 1 0
# Preview the first six rows of the oversampled training set.
head(up_train, n = 6L)
## JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT_PENYERTA USIA_KATEGORI KODE_LAMA_RAWAT
## 1 1 0 0 0 0
## 2 0 0 0 0 0
## 3 1 0 0 1 0
## 4 1 0 1 0 1
## 5 0 0 0 0 0
## 6 1 0 1 0 0
## KODE_LEOKOSIT KODE_TROMBOSIT KODE_HEMOGLOBIN PENYAKIT
## 1 0 1 1 0
## 2 0 0 1 0
## 3 1 0 1 0
## 4 1 0 0 0
## 5 1 1 0 0
## 6 1 1 0 0
# Preview the first six rows of the SMOTE training set.
head(train_smote, n = 6L)
## # A tibble: 6 × 9
## JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT_PENYERTA USIA_KATEGORI KODE_LAMA_RAWAT
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0 1 0
## 2 1 0 0 0 0
## 3 1 0 0 1 0
## 4 1 0 0 0 0
## 5 1 0 1 1 0
## 6 1 0 1 1 0
## # ℹ 4 more variables: KODE_LEOKOSIT <dbl>, KODE_TROMBOSIT <dbl>,
## # KODE_HEMOGLOBIN <dbl>, PENYAKIT <fct>
# Preview the first six rows of the class-weighted resampled training set.
head(train_weighted, n = 6L)
## # A tibble: 6 × 9
## JENIS_KELAMIN KEADAAN_KELUAR PENYAKIT PENYAKIT_PENYERTA USIA_KATEGORI
## <dbl> <dbl> <fct> <dbl> <dbl>
## 1 0 0 1 0 0
## 2 0 0 0 0 1
## 3 0 0 2 0 0
## 4 1 1 1 0 1
## 5 1 0 1 0 0
## 6 0 0 0 0 0
## # ℹ 4 more variables: KODE_LAMA_RAWAT <dbl>, KODE_LEOKOSIT <dbl>,
## # KODE_TROMBOSIT <dbl>, KODE_HEMOGLOBIN <dbl>