Penelitian ini bertujuan membandingkan dua metode klasifikasi: CART (Classification and Regression Tree) dan Naive Bayes.
Evaluasi menggunakan empat metrik: accuracy, precision, recall, dan F1-score.
library(rpart)
library(rpart.plot)
library(caret)
library(e1071)
library(corrplot)
library(ggplot2)
library(reshape2)
library(readxl)
# Load the malaria dataset from the local CSV file.
data <- read.csv("C:/Users/RevaA/Downloads/data_malaria.csv")

# Correct misspelled column names (misspelled name -> corrected name).
column_fixes <- c(
  severe_maleria = "severe_malaria",
  headace = "headache",
  prostraction = "prostration"
)
to_fix <- names(data) %in% names(column_fixes)
names(data)[to_fix] <- unname(column_fixes[names(data)[to_fix]])

# Inspect the first rows.
head(data)
## age sex fever cold rigor fatigue headache bitter_tongue vomitting diarrhea
## 1 3 1 1 1 0 1 1 1 0 1
## 2 3 0 1 1 1 1 1 1 0 1
## 3 3 0 1 1 1 1 1 0 0 1
## 4 4 1 1 1 0 1 0 0 0 0
## 5 4 0 1 1 1 0 1 0 0 0
## 6 4 1 0 0 0 1 1 0 0 1
## Convulsion Anemia jundice cocacola_urine hypoglycemia prostration
## 1 1 0 1 1 1 0
## 2 0 0 0 1 1 0
## 3 1 0 0 1 1 0
## 4 0 0 1 0 1 0
## 5 1 0 1 1 1 0
## 6 0 1 0 0 0 0
## hyperpyrexia severe_malaria
## 1 0 0
## 2 0 0
## 3 0 1
## 4 1 0
## 5 0 0
## 6 0 1
str(data)
## 'data.frame': 337 obs. of 18 variables:
## $ age : int 3 3 3 4 4 4 4 5 5 8 ...
## $ sex : int 1 0 0 1 0 1 0 1 0 0 ...
## $ fever : int 1 1 1 1 1 0 1 1 1 1 ...
## $ cold : int 1 1 1 1 1 0 1 0 0 1 ...
## $ rigor : int 0 1 1 0 1 0 1 1 1 1 ...
## $ fatigue : int 1 1 1 1 0 1 1 1 1 0 ...
## $ headache : int 1 1 1 0 1 1 0 0 1 1 ...
## $ bitter_tongue : int 1 1 0 0 0 0 0 0 1 1 ...
## $ vomitting : int 0 0 0 0 0 0 0 1 0 0 ...
## $ diarrhea : int 1 1 1 0 0 1 0 1 0 1 ...
## $ Convulsion : int 1 0 1 0 1 0 0 0 1 0 ...
## $ Anemia : int 0 0 0 0 0 1 0 1 0 0 ...
## $ jundice : int 1 0 0 1 1 0 0 1 0 1 ...
## $ cocacola_urine: int 1 1 1 0 1 0 0 0 0 0 ...
## $ hypoglycemia : int 1 1 1 1 1 0 0 0 1 1 ...
## $ prostration : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hyperpyrexia : int 0 0 0 1 0 0 0 0 0 0 ...
## $ severe_malaria: int 0 0 1 0 0 1 0 0 0 0 ...
summary(data)
## age sex fever cold
## Min. : 3.00 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:19.00 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :29.00 Median :1.0000 Median :1.0000 Median :1.0000
## Mean :30.35 Mean :0.5341 Mean :0.7507 Mean :0.5668
## 3rd Qu.:38.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :77.00 Max. :1.0000 Max. :1.0000 Max. :1.0000
## rigor fatigue headache bitter_tongue
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :0.0000
## Mean :0.3412 Mean :0.4837 Mean :0.7003 Mean :0.4036
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## vomitting diarrhea Convulsion Anemia
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.07418 Mean :0.3383 Mean :0.3442 Mean :0.3501
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## jundice cocacola_urine hypoglycemia prostration
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.6588 Mean :0.5401 Mean :0.8576 Mean :0.2196
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## hyperpyrexia severe_malaria
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.1395 Mean :0.3442
## 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
---
## 4. Preprocessing
``` r
# Convert the target to a factor so the models treat it as a class label.
data$severe_malaria <- as.factor(data$severe_malaria)

# Min-max scale age to [0, 1]. Fail early instead of silently producing
# NA/NaN when age has missing values or a zero range.
age_range <- range(data$age)
if (anyNA(age_range) || age_range[1] == age_range[2]) {
  stop("`age` must be non-missing and non-constant to be min-max scaled.",
       call. = FALSE)
}
data$age <- (data$age - age_range[1]) / (age_range[2] - age_range[1])

# Derive a combined complication category from the three binary indicators
# Convulsion, hypoglycemia and hyperpyrexia (in that order).
indicator_cols <- c("Convulsion", "hypoglycemia", "hyperpyrexia")
indicator_values <- unlist(data[indicator_cols], use.names = FALSE)
# The mapping below is only defined for 0/1 inputs; anything else (including
# NA) is rejected rather than being silently mislabelled.
if (!all(indicator_values %in% c(0, 1))) {
  stop("Convulsion, hypoglycemia and hyperpyrexia must be coded 0/1 ",
       "with no missing values.", call. = FALSE)
}

# Key = Convulsion, hypoglycemia, hyperpyrexia bits; value = category label.
complication_labels <- c(
  "000" = "0",
  "001" = "C1",
  "010" = "C2",
  "100" = "C3",
  "011" = "C4",
  "101" = "C5",
  "110" = "C6",
  "111" = "C7"
)
indicator_pattern <- paste0(
  as.integer(data$Convulsion),
  as.integer(data$hypoglycemia),
  as.integer(data$hyperpyrexia)
)
# unname() so the factor does not carry the lookup keys as element names.
data$complication <- as.factor(unname(complication_labels[indicator_pattern]))
# Correlation heatmap of the numeric columns of `data`.
# vapply() always returns a logical mask, and drop = FALSE keeps a data frame
# even if only one numeric column is present, so cor() always receives
# matrix-like input.
is_numeric_col <- vapply(data, is.numeric, logical(1))
data_num <- data[, is_numeric_col, drop = FALSE]
corr_matrix <- cor(data_num)
corrplot(corr_matrix,
         method = "color",
         type = "full",
         col = colorRampPalette(c("blue", "white", "red"))(200))
# Fixed seed so the train/test partition is reproducible.
set.seed(123)
train_index <- createDataPartition(
  data$severe_malaria,
  p = 0.7,
  list = FALSE
)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

# Fit a classification tree on all predictors and draw it.
model_cart <- rpart(severe_malaria ~ ., data = train_data, method = "class")
rpart.plot(model_cart)

# Predict classes for the hold-out rows and cross-tabulate against the truth.
pred_cart <- predict(model_cart, test_data, type = "class")
cm <- table(Predicted = pred_cart, Actual = test_data$severe_malaria)

# Rows are predictions, columns are actual labels; position 2 is treated as
# the positive class.
TN <- cm[1, 1]
FN <- cm[1, 2]
FP <- cm[2, 1]
TP <- cm[2, 2]

precision_cart <- TP / (TP + FP)
recall_cart <- TP / (TP + FN)
f1_cart <- 2 * precision_cart * recall_cart / (precision_cart + recall_cart)
acc_cart <- (TP + TN) / sum(cm)
cm
## Actual
## Predicted 0 1
## 0 57 28
## 1 9 6
acc_cart
## [1] 0.63
# Fit Naive Bayes on the training rows and predict the hold-out rows.
model_nb <- naiveBayes(severe_malaria ~ ., data = train_data)
pred_nb <- predict(model_nb, test_data)

# Confusion matrix: rows are predictions, columns are actual labels.
cm_nb <- table(Predicted = pred_nb, Actual = test_data$severe_malaria)

# Position 2 is treated as the positive class.
TN_nb <- cm_nb[1, 1]
FN_nb <- cm_nb[1, 2]
FP_nb <- cm_nb[2, 1]
TP_nb <- cm_nb[2, 2]

precision_nb <- TP_nb / (TP_nb + FP_nb)
recall_nb <- TP_nb / (TP_nb + FN_nb)
f1_nb <- 2 * precision_nb * recall_nb / (precision_nb + recall_nb)
acc_nb <- (TP_nb + TN_nb) / sum(cm_nb)
cm_nb
## Actual
## Predicted 0 1
## 0 49 21
## 1 17 13
acc_nb
## [1] 0.62
# Repeat the hold-out evaluation several times to see how accuracy varies
# across random train/test partitions.
set.seed(123)
n_attempts <- 3L

# Run one random 70/30 partition and return the hold-out accuracy of both
# models. All intermediate objects are local to the function, so the
# train/test sets, models and predictions of the primary analysis are not
# overwritten by the resampling.
evaluate_split <- function(df) {
  idx <- createDataPartition(df$severe_malaria, p = 0.7, list = FALSE)
  train_part <- df[idx, ]
  test_part <- df[-idx, ]

  # Keep the original fit/predict order so the random-number stream is
  # consumed in the same sequence as before.
  fit_cart <- rpart(severe_malaria ~ ., data = train_part, method = "class")
  guess_cart <- predict(fit_cart, test_part, type = "class")
  fit_nb <- naiveBayes(severe_malaria ~ ., data = train_part)
  guess_nb <- predict(fit_nb, test_part)

  c(
    cart = mean(guess_cart == test_part$severe_malaria),
    nb = mean(guess_nb == test_part$severe_malaria)
  )
}

# 2 x n_attempts matrix: one column per attempt, rows "cart" and "nb".
attempt_accuracy <- vapply(
  seq_len(n_attempts),
  function(i) evaluate_split(data),
  numeric(2)
)

results <- data.frame(
  Attempt = seq_len(n_attempts),
  Accuracy_CART = attempt_accuracy["cart", ],
  Accuracy_NB = attempt_accuracy["nb", ]
)
results
## Attempt Accuracy_CART Accuracy_NB
## 1 1 0.63 0.62
## 2 2 0.63 0.60
## 3 3 0.64 0.59
# Reshape to long format: one row per attempt and accuracy column.
results_long <- melt(results, id.vars = "Attempt")

# Side-by-side bars comparing the two accuracy columns for each attempt.
accuracy_plot <- ggplot(
  results_long,
  aes(x = factor(Attempt), y = value, fill = variable)
) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Perbandingan Akurasi CART vs Naive Bayes",
    x = "Percobaan",
    y = "Akurasi"
  ) +
  theme_minimal()
accuracy_plot
# Summary table of the evaluation metrics, one row per model.
metric_columns <- list(
  Accuracy = c(acc_cart, acc_nb),
  Precision = c(precision_cart, precision_nb),
  Recall = c(recall_cart, recall_nb),
  F1 = c(f1_cart, f1_nb)
)
hasil <- data.frame(Model = c("CART", "Naive Bayes"), metric_columns)
hasil
## Model Accuracy Precision Recall F1
## 1 CART 0.63 0.4000000 0.1764706 0.244898
## 2 Naive Bayes 0.62 0.4333333 0.3823529 0.406250
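Berdasarkan hasil analisis, akurasi kedua model hampir sama (CART 0,63; Naive Bayes 0,62), tetapi Naive Bayes memiliki recall (0,38 dibanding 0,18) dan F1-score (0,41 dibanding 0,24) yang lebih tinggi untuk kelas malaria berat.
Sehingga, model Naive Bayes lebih seimbang dalam mengenali kelas malaria berat, sedangkan CART lebih sederhana namun kurang sensitif terhadap kelas minoritas.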