1. Pendahuluan

Penelitian ini bertujuan membandingkan dua metode klasifikasi:

- CART (Classification and Regression Tree)
- Naïve Bayes

Evaluasi menggunakan:

- Accuracy
- Precision
- Recall
- F1-score


2. Setup

library(rpart)
library(rpart.plot)
library(caret)
library(e1071)
library(corrplot)
library(ggplot2)
library(reshape2)
library(readxl)

3. Input Data

# Load the malaria data set from the local CSV export.
data <- read.csv(file = "C:/Users/RevaA/Downloads/data_malaria.csv")

# Correct the misspelled column names; the anchored patterns only match a
# column whose whole name is the misspelling, every other name is untouched.
names(data) <- sub("^severe_maleria$", "severe_malaria", names(data))
names(data) <- sub("^headace$", "headache", names(data))
names(data) <- sub("^prostraction$", "prostration", names(data))

# Preview the first rows
head(data)
##   age sex fever cold rigor fatigue headache bitter_tongue vomitting diarrhea
## 1   3   1     1    1     0       1        1             1         0        1
## 2   3   0     1    1     1       1        1             1         0        1
## 3   3   0     1    1     1       1        1             0         0        1
## 4   4   1     1    1     0       1        0             0         0        0
## 5   4   0     1    1     1       0        1             0         0        0
## 6   4   1     0    0     0       1        1             0         0        1
##   Convulsion Anemia jundice cocacola_urine hypoglycemia prostration
## 1          1      0       1              1            1           0
## 2          0      0       0              1            1           0
## 3          1      0       0              1            1           0
## 4          0      0       1              0            1           0
## 5          1      0       1              1            1           0
## 6          0      1       0              0            0           0
##   hyperpyrexia severe_malaria
## 1            0              0
## 2            0              0
## 3            0              1
## 4            1              0
## 5            0              0
## 6            0              1
# Show the column types and the dimensions of the data frame
str(data)
## 'data.frame':    337 obs. of  18 variables:
##  $ age           : int  3 3 3 4 4 4 4 5 5 8 ...
##  $ sex           : int  1 0 0 1 0 1 0 1 0 0 ...
##  $ fever         : int  1 1 1 1 1 0 1 1 1 1 ...
##  $ cold          : int  1 1 1 1 1 0 1 0 0 1 ...
##  $ rigor         : int  0 1 1 0 1 0 1 1 1 1 ...
##  $ fatigue       : int  1 1 1 1 0 1 1 1 1 0 ...
##  $ headache      : int  1 1 1 0 1 1 0 0 1 1 ...
##  $ bitter_tongue : int  1 1 0 0 0 0 0 0 1 1 ...
##  $ vomitting     : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ diarrhea      : int  1 1 1 0 0 1 0 1 0 1 ...
##  $ Convulsion    : int  1 0 1 0 1 0 0 0 1 0 ...
##  $ Anemia        : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ jundice       : int  1 0 0 1 1 0 0 1 0 1 ...
##  $ cocacola_urine: int  1 1 1 0 1 0 0 0 0 0 ...
##  $ hypoglycemia  : int  1 1 1 1 1 0 0 0 1 1 ...
##  $ prostration   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hyperpyrexia  : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ severe_malaria: int  0 0 1 0 0 1 0 0 0 0 ...
# Per-column summary statistics (min, quartiles, mean, max)
summary(data)
##       age             sex             fever             cold       
##  Min.   : 3.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:19.00   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :29.00   Median :1.0000   Median :1.0000   Median :1.0000  
##  Mean   :30.35   Mean   :0.5341   Mean   :0.7507   Mean   :0.5668  
##  3rd Qu.:38.00   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :77.00   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      rigor           fatigue          headache      bitter_tongue   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.3412   Mean   :0.4837   Mean   :0.7003   Mean   :0.4036  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    vomitting          diarrhea        Convulsion         Anemia      
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.07418   Mean   :0.3383   Mean   :0.3442   Mean   :0.3501  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     jundice       cocacola_urine    hypoglycemia     prostration    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.6588   Mean   :0.5401   Mean   :0.8576   Mean   :0.2196  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##   hyperpyrexia    severe_malaria  
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000  
##  Mean   :0.1395   Mean   :0.3442  
##  3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000

---

## 4. Preprocessing


``` r
# Convert the target to a factor so the models treat it as a class label
data$severe_malaria <- factor(data$severe_malaria)

# Min-max scale age to the [0, 1] interval.
# NOTE(review): the minimum and range are taken over the whole data set,
# i.e. before the train/test split further down - confirm this is intended.
data$age <- (data$age - min(data$age)) / diff(range(data$age))

```

5. Variabel Complication

# Encode the combination of the three complication indicators as one label.
# The indicators are read as a 3-bit number (Convulsion = 4, hypoglycemia = 2,
# hyperpyrexia = 1) that indexes the label table below:
#   none -> "0", hyperpyrexia only -> "C1", hypoglycemia only -> "C2",
#   Convulsion only -> "C3", hypoglycemia + hyperpyrexia -> "C4",
#   Convulsion + hyperpyrexia -> "C5", Convulsion + hypoglycemia -> "C6",
#   all three -> "C7".
# Assumes every indicator is coded 0/1 with no missing values, as the
# summary() output of the data set shows.
data$complication <- with(
  data,
  c("0", "C1", "C2", "C4", "C3", "C5", "C6", "C7")[
    4 * Convulsion + 2 * hypoglycemia + hyperpyrexia + 1
  ]
)

# Store the label as a factor so the classifiers treat it as categorical
data$complication <- factor(data$complication)

6. Analisis Korelasi

# Keep only the numeric columns; columns already converted to factors
# (the target and the complication label) are left out of the correlation.
data_num <- data[, vapply(data, is.numeric, logical(1))]

# Pairwise correlation between the numeric columns
corr_matrix <- cor(data_num)

# Heat map of the full matrix on a blue (negative) - white - red (positive)
# scale with 200 colour steps
corrplot(
  corr_matrix,
  col = colorRampPalette(c("blue", "white", "red"))(200),
  type = "full",
  method = "color"
)


7. Split Data

# Fix the RNG state so the partition below is reproducible
set.seed(123)

# Row indices of the training set: 70% of the rows, drawn by
# createDataPartition() on the target column
train_index <- createDataPartition(
  y = data$severe_malaria,
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set, all remaining rows the test set
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

8. Model CART

# Grow a classification tree for severe_malaria using every other column
# of the training set as a candidate predictor
model_cart <- rpart(
  formula = severe_malaria ~ .,
  data = train_data,
  method = "class"
)

# Draw the fitted tree
rpart.plot(model_cart)


9. Evaluasi CART

# Predicted class label for every row of the test set
pred_cart <- predict(model_cart, newdata = test_data, type = "class")

# Confusion matrix: rows are the predictions, columns the true labels
cm <- table(Predicted = pred_cart, Actual = test_data$severe_malaria)

# Cell counts by position; position 2 (label 1 in the printed table) is
# taken as the positive class
TN <- cm[1, 1]
FN <- cm[1, 2]
FP <- cm[2, 1]
TP <- cm[2, 2]

# Accuracy is the share of the diagonal (correct) cells
acc_cart <- sum(diag(cm)) / sum(cm)
precision_cart <- TP / (TP + FP)
recall_cart <- TP / (TP + FN)
f1_cart <- 2 * precision_cart * recall_cart / (precision_cart + recall_cart)

cm
##          Actual
## Predicted  0  1
##         0 57 28
##         1  9  6
acc_cart
## [1] 0.63

10. Model Naive Bayes

# Fit Naive Bayes for severe_malaria on every other column of the training set
model_nb <- naiveBayes(formula = severe_malaria ~ ., data = train_data)

# Predicted class label for every row of the test set
pred_nb <- predict(model_nb, newdata = test_data)

# Confusion matrix: rows are the predictions, columns the true labels
cm_nb <- table(Predicted = pred_nb, Actual = test_data$severe_malaria)

# Cell counts by position; position 2 (label 1 in the printed table) is
# taken as the positive class
TN_nb <- cm_nb[1, 1]
FN_nb <- cm_nb[1, 2]
FP_nb <- cm_nb[2, 1]
TP_nb <- cm_nb[2, 2]

# Accuracy is the share of the diagonal (correct) cells
acc_nb <- sum(diag(cm_nb)) / sum(cm_nb)
precision_nb <- TP_nb / (TP_nb + FP_nb)
recall_nb <- TP_nb / (TP_nb + FN_nb)
f1_nb <- 2 * precision_nb * recall_nb / (precision_nb + recall_nb)

cm_nb
##          Actual
## Predicted  0  1
##         0 49 21
##         1 17 13
acc_nb
## [1] 0.62

11. Perbandingan Model (3 Percobaan)

# Repeated hold-out comparison of CART and Naive Bayes.
# The seed is reset to the value used for the single split earlier in the
# report, so the first attempt presumably reproduces that split (the first
# row of the recorded output matches the single-split accuracies).
set.seed(123)

# Number of 70/30 resamples to evaluate
n_attempts <- 3L

# Fit both classifiers on one fresh split of `df` and return their test-set
# accuracies as a named numeric vector c(cart = , nb = ).
# Every intermediate object (indices, splits, models, predictions) is local
# to the function, so the objects created in the earlier single-split
# sections - train_data, test_data, model_cart, pred_cart, model_nb,
# pred_nb - are no longer overwritten by the resampling loop.
evaluate_split <- function(df) {
  idx <- createDataPartition(df$severe_malaria, p = 0.7, list = FALSE)
  train_part <- df[idx, ]
  test_part <- df[-idx, ]

  cart_fit <- rpart(severe_malaria ~ ., data = train_part, method = "class")
  cart_pred <- predict(cart_fit, test_part, type = "class")

  nb_fit <- naiveBayes(severe_malaria ~ ., data = train_part)
  nb_pred <- predict(nb_fit, test_part)

  c(
    cart = mean(cart_pred == test_part$severe_malaria),
    nb = mean(nb_pred == test_part$severe_malaria)
  )
}

# One row per attempt; numeric NA placeholders are filled in by the loop
results <- data.frame(
  Attempt = seq_len(n_attempts),
  Accuracy_CART = NA_real_,
  Accuracy_NB = NA_real_
)

for (i in seq_len(n_attempts)) {
  acc <- evaluate_split(data)
  results$Accuracy_CART[i] <- acc[["cart"]]
  results$Accuracy_NB[i] <- acc[["nb"]]
}

results
##   Attempt Accuracy_CART Accuracy_NB
## 1       1          0.63        0.62
## 2       2          0.63        0.60
## 3       3          0.64        0.59

12. Visualisasi Perbandingan

# Reshape to long format: one row per (attempt, model) pair, with the
# accuracy in `value` and the source column name in `variable`
results_long <- melt(results, id.vars = "Attempt")

# Side-by-side bars per attempt, one bar for each model
ggplot(
  data = results_long,
  mapping = aes(x = factor(Attempt), y = value, fill = variable)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Perbandingan Akurasi CART vs Naive Bayes",
    x = "Percobaan",
    y = "Akurasi"
  ) +
  theme_minimal()


13. Hasil Akhir

# Final summary table: one row per model, one column per metric, built from
# the single-split metrics computed in the evaluation sections
hasil <- data.frame(
  Model = c("CART", "Naive Bayes"),
  rbind(
    c(
      Accuracy = acc_cart,
      Precision = precision_cart,
      Recall = recall_cart,
      F1 = f1_cart
    ),
    c(
      Accuracy = acc_nb,
      Precision = precision_nb,
      Recall = recall_nb,
      F1 = f1_nb
    )
  )
)

hasil
##         Model Accuracy Precision    Recall       F1
## 1        CART     0.63 0.4000000 0.1764706 0.244898
## 2 Naive Bayes     0.62 0.4333333 0.3823529 0.406250

14. Kesimpulan

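Berdasarkan hasil analisis:

- Akurasi kedua model hampir sama: CART 0.63 dan Naive Bayes 0.62 pada pembagian data pertama; pada tiga percobaan, akurasi CART berkisar 0.63–0.64 dan Naive Bayes 0.59–0.62.
- Naive Bayes lebih baik dalam mengenali kelas positif (severe malaria): precision 0.43, recall 0.38, dan F1-score 0.41, dibandingkan CART dengan precision 0.40, recall 0.18, dan F1-score 0.24.

Sehingga, model Naive Bayes lebih seimbang dalam mengenali kedua kelas, sedangkan CART lebih sederhana namun kurang sensitif terhadap kelas minoritas.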