Import Dataset yang akan digunakan

# NOTE(review): hard-coded, machine-specific working directory; the relative
# file name passed to read_excel() below is resolved against it.
setwd("D:/STIS/4SE/4. Data Mining/Tugas/Bismillah Dulu")
library(readxl)
## Warning: package 'readxl' was built under R version 4.0.5
# Read the Excel workbook into `Data`.
Data <- read_excel("dataya.xlsx")
#View(Data)

Memuat semua library yang dibutuhkan

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
library(data.table)
## Warning: package 'data.table' was built under R version 4.0.5
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(DT)
## Warning: package 'DT' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(rpart)
## Warning: package 'rpart' was built under R version 4.0.5
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 4.0.5
library(ggplot2)
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.5
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.0.5
# Convert the object read from Excel into a plain data.frame (so that
# single-column indexing such as dataset[, i] returns a vector) and inspect
# its structure.
dataset<-as.data.frame(Data)
str(dataset)
## 'data.frame':    17848 obs. of  7 variables:
##  $ V102  : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ V106  : num  1 1 1 0 3 1 1 1 3 3 ...
##  $ V190  : num  1 1 3 1 5 2 2 1 2 2 ...
##  $ V212  : num  21 21 17 22 26 18 18 18 25 25 ...
##  $ M14   : num  6 NA 9 9 7 9 NA 9 4 NA ...
##  $ M19   : num  2100 2000 3000 2500 3400 5000 4500 3000 4700 3600 ...
##  $ M14nol: num  6 0 9 9 7 9 0 9 4 0 ...
#View(dataset)

## PREPROCESSING

Pada tahap awal preprocessing, dilakukan data reduction dengan menyeleksi variabel yang akan digunakan dalam model. Dataset file Excel yang kami gunakan sudah melalui tahap data reduction sehingga tahap ini tidak dilakukan lagi. Selanjutnya, nama-nama variabel akan diubah agar data lebih mudah dipahami.

# Give the coded columns descriptive names in a single rename() call
# (new_name = old_name); M14nol keeps its original name.
dataset <- dataset %>%
  rename(
    Wilayah = V102,
    Pend    = V106,
    SK      = V190,
    Umur    = V212,
    Ante    = M14,
    Berat   = M19
  )
str(dataset)
## 'data.frame':    17848 obs. of  7 variables:
##  $ Wilayah: num  2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : num  1 1 1 0 3 1 1 1 3 3 ...
##  $ SK     : num  1 1 3 1 5 2 2 1 2 2 ...
##  $ Umur   : num  21 21 17 22 26 18 18 18 25 25 ...
##  $ Ante   : num  6 NA 9 9 7 9 NA 9 4 NA ...
##  $ Berat  : num  2100 2000 3000 2500 3400 5000 4500 3000 4700 3600 ...
##  $ M14nol : num  6 0 9 9 7 9 0 9 4 0 ...
# Treat the first three columns (Wilayah, Pend, SK) as categorical variables.
dataset[1:3] <- lapply(dataset[1:3], as.factor)
str(dataset)
## 'data.frame':    17848 obs. of  7 variables:
##  $ Wilayah: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : Factor w/ 4 levels "0","1","2","3": 2 2 2 1 4 2 2 2 4 4 ...
##  $ SK     : Factor w/ 5 levels "1","2","3","4",..: 1 1 3 1 5 2 2 1 2 2 ...
##  $ Umur   : num  21 21 17 22 26 18 18 18 25 25 ...
##  $ Ante   : num  6 NA 9 9 7 9 NA 9 4 NA ...
##  $ Berat  : num  2100 2000 3000 2500 3400 5000 4500 3000 4700 3600 ...
##  $ M14nol : num  6 0 9 9 7 9 0 9 4 0 ...
#View(dataset)

Selanjutnya, dilakukan tahapan data discretization yaitu mengklasifikasikan berat bayi lahir ke dalam tiga kategori yaitu BBLR, BBLN, dan BBLL

# Discretise birth weight (Berat) into three classes:
#   BBLR : Berat <= 2500
#   BBLN : 2500 < Berat <= 3999
#   BBLL : Berat > 3999
# Vectorised replacement for the former row-by-row loop: it produces the same
# labels in a single pass, and a missing Berat now yields NA instead of
# aborting the scalar `if` with an error.
dataset$BBL <- ifelse(
  dataset$Berat <= 2500, "BBLR",
  ifelse(dataset$Berat <= 3999, "BBLN", "BBLL")
)

# as.factor() orders the levels alphabetically: BBLL, BBLN, BBLR.
dataset$BBL <- as.factor(dataset$BBL)
str(dataset)
## 'data.frame':    17848 obs. of  8 variables:
##  $ Wilayah: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : Factor w/ 4 levels "0","1","2","3": 2 2 2 1 4 2 2 2 4 4 ...
##  $ SK     : Factor w/ 5 levels "1","2","3","4",..: 1 1 3 1 5 2 2 1 2 2 ...
##  $ Umur   : num  21 21 17 22 26 18 18 18 25 25 ...
##  $ Ante   : num  6 NA 9 9 7 9 NA 9 4 NA ...
##  $ Berat  : num  2100 2000 3000 2500 3400 5000 4500 3000 4700 3600 ...
##  $ M14nol : num  6 0 9 9 7 9 0 9 4 0 ...
##  $ BBL    : Factor w/ 3 levels "BBLL","BBLN",..: 3 3 2 3 2 1 1 2 1 2 ...

Selanjutnya, semua nilai missing value akan dibuang dan akan dilihat apakah terdapat nilai outlier pada variabel numerik yang digunakan

# Remove every row that contains at least one missing value.
dataset <- dataset %>% na.omit()

# Box plot of Umur to look for outliers.
boxplot(dataset$Umur)

# Values flagged as outliers. The outlying maternal ages at delivery still
# fall within the WUS (women of reproductive age) category and can be regarded
# as normal ages, so the outliers of this variable are kept.
boxplot(dataset$Umur, plot = FALSE)$out
##   [1] 36 37 38 43 37 37 37 36 35 37 38 37 44 36 35 35 42 36 37 36 46 35 35 36 35
##  [26] 35 36 41 36 36 39 36 35 39 37 37 38 35 39 39 35 38 41 42 39 35 35 42 37 35
##  [51] 37 39 39 37 35 35 36 44 36 38 42 35 35 37 40 37 42 43 39 41 42 35 41 35 37
##  [76] 38 38 42 38 39 42 40 38 40 36 39 39 38 38 35 37 38 39 41 37 35 37 35 37 38
## [101] 35 39 36 38 40 35 38 36 35 37 36 39 41 37 40 36 41 45 40 35 36 36 40 35 36
## [126] 38 41 35 37 36 35 35 42 41 36 37 37 36 35 36 40 38 39 43 35 38 37 37 37 37
## [151] 35 35 44 38 38 37 36 37 37 35 41 36 39 35 36 38 38 35 41 37 35 35 38 37 39
## [176] 35 37 38 38 38 35 35 37 43 42 39 35 41 40 37
# Box plot of Ante to look for outliers.
boxplot(x = dataset$Ante)

# Values flagged as outliers; the outliers of this variable will be removed.
boxplot(x = dataset$Ante, plot = FALSE)$out
##   [1] 18 21 98 98 98 36 24 26 98 98 18 28 98 98 98 18 98 98 98 18 18 21 98 98 32
##  [26] 98 18 32 98 98 98 18 17 17 18 26 17 17 18 17 17 98 17 21 20 19 18 18 17 18
##  [51] 18 26 99 25 20 98 21 98 23 21 18 32 20 18 17 98 18 18 25 32 98 20 24 18 18
##  [76] 18 19 18 17 22 98 19 17 98 33 98 18 27 98 98 99 18 18 17 40 20 18 18 18 18
## [101] 18 19 19 19 98 17 20 19 17 18 18 98 18 17 20 27 18 17 98 20 20 98 18 18 98
## [126] 21 20 21 17 18 20 17 18 17 98 98 17 99 18 24 24 20 19 20 20 22 18 21 18 20
## [151] 21 18 18 18 20 20 21 28 98 23 98 20 18 19 19 17 29 17 18 17 19 25 17 18 21
## [176] 98 18 24 18 17 32 20 19 20 17 19 21 22 20 18 17 18 18 24 17 21 18 18 21 17
## [201] 17 22 27 18 18 98 28 20 17 17 22 24 24 18 20 17 20 18 98 98 18 98 18 18 18
## [226] 18 98 18 23 98 17 21 98 98 99 98 18 98 98 98 18 27 22 24 32 17 18 17 98 17
## [251] 27 98 20 17 28 98 98 17 18 18 18 17 18 18 18 40 20 20 22 27 98 19 20 18 18
## [276] 17 18 26 28 98 98 98 24 98 98 20 98 22 98 98 27 17 25 17 99 23 18 24 40 22
## [301] 98 98 98 98 18 20 98 23 18 99 98 98 98 98 98 98 98 22 98 98 98 98 98
# Drop the rows whose Ante value is flagged as an outlier by the box plot rule.
# A logical mask is used instead of `-which(...)`: when nothing is flagged,
# which() returns integer(0) and `dataset[-integer(0), ]` selects zero rows,
# silently emptying the data set.
# NOTE(review): the flagged values include 98 and 99, which may be survey
# "don't know"/missing codes rather than real counts; confirm with the codebook.
outliers <- boxplot(dataset$Ante, plot = FALSE)$out
dataset <- dataset[!(dataset$Ante %in% outliers), ]
#View(dataset)

Berdasarkan tujuan penelitian kelompok kami, kami akan menggabungkan BBLR (Berat Bayi Lahir Rendah) dan BBLL (Berat Bayi Lahir Lebih) menjadi satu kategori yaitu BBLTN (Berat Bayi Lahir Tidak Normal)

# Collapse BBLL and BBLR into the single class BBLTN, leaving BBLN as is,
# then turn the relabelled column back into a factor.
dataset1 <- dataset
dataset1$BBL <- dataset1$BBL %>%
  gsub(pattern = "BBLL", replacement = "BBLTN") %>%
  gsub(pattern = "BBLR", replacement = "BBLTN") %>%
  as.factor()

Untuk melihat apakah dataset yang digunakan merupakan imbalance dataset maka akan dilihat grafik proporsi berat bayi lahir

# Class counts of the binary target BBL; printed to check for class imbalance.
propBBL<-table(dataset1$BBL)
propBBL
## 
##  BBLN BBLTN 
## 11282  3752
# Class proportions (rounded to two decimals) drawn as a hatched bar chart.
perc <- round(propBBL / sum(propBBL), digits = 2)
barplot(
  perc,
  names.arg = c("BBLN", "BBLTN"),
  main = "Berat Bayi Lahir",
  xlab = "Label",
  ylab = "proporsi",
  col = "brown",
  density = 10,
  angle = 45
)

Melihat jumlah data pada masing-masing kelas pada setiap variabel

# Print the frequency table of each categorical column of dataset1.
invisible(lapply(
  c("Wilayah", "Pend", "SK", "BBL"),
  function(column) print(table(dataset1[column]))
))
## Wilayah
##    1    2 
## 7384 7650 
## Pend
##    0    1    2    3 
##  197 3791 8466 2580 
## SK
##    1    2    3    4    5 
## 4014 2987 2826 2685 2522 
## BBL
##  BBLN BBLTN 
## 11282  3752

Berdasarkan grafik proporsi berat bayi lahir, terlihat bahwa terjadi permasalahan imbalance dataset, sehingga akan dilakukan upsampling untuk mengatasi permasalahan tersebut

# Copy of dataset1 that serves as the input of the resampling step below.
dataset3 <- dataset1
str(dataset3)
## 'data.frame':    15034 obs. of  8 variables:
##  $ Wilayah: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : Factor w/ 4 levels "0","1","2","3": 2 2 1 4 2 2 4 4 3 2 ...
##  $ SK     : Factor w/ 5 levels "1","2","3","4",..: 1 3 1 5 2 1 2 2 1 1 ...
##  $ Umur   : num  21 17 22 26 18 18 25 36 20 17 ...
##  $ Ante   : num  6 9 9 7 9 9 4 9 7 4 ...
##  $ Berat  : num  2100 3000 2500 3400 5000 3000 4700 4100 3500 2800 ...
##  $ M14nol : num  6 9 9 7 9 9 4 9 7 4 ...
##  $ BBL    : Factor w/ 2 levels "BBLN","BBLTN": 2 1 2 1 2 1 2 2 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2491] 2 7 10 12 26 40 43 60 91 93 ...
##   ..- attr(*, "names")= chr [1:2491] "2" "7" "10" "12" ...
# Balance the two BBL classes with caret::upSample(), which randomly re-draws
# rows of the smaller class until both classes have the same size.
# The resampling is random, so the seed is fixed here to make the balanced
# data set reproducible between runs. The target column is excluded from the
# predictors by name rather than by position.
# NOTE(review): up-sampling happens before the train/test split, so copies of
# the same row can end up in both sets and inflate the test metrics; consider
# splitting first and up-sampling the training part only.
set.seed(100)
newdf <- upSample(
  x = dataset3[, names(dataset3) != "BBL"],
  y = dataset3$BBL,
  yname = "BBL"
)
str(newdf)
## 'data.frame':    22564 obs. of  8 variables:
##  $ Wilayah: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : Factor w/ 4 levels "0","1","2","3": 2 4 2 3 2 2 3 3 3 3 ...
##  $ SK     : Factor w/ 5 levels "1","2","3","4",..: 3 5 1 1 1 2 4 2 4 2 ...
##  $ Umur   : num  17 26 18 20 17 19 18 27 21 30 ...
##  $ Ante   : num  9 7 9 7 4 7 4 9 3 8 ...
##  $ Berat  : num  3000 3400 3000 3500 2800 2800 3000 3500 3000 2800 ...
##  $ M14nol : num  9 7 9 7 4 7 4 9 3 8 ...
##  $ BBL    : Factor w/ 2 levels "BBLN","BBLTN": 1 1 1 1 1 1 1 1 1 1 ...
# Print the frequency table of each categorical column of the balanced data.
invisible(lapply(
  c("Wilayah", "Pend", "SK", "BBL"),
  function(column) print(table(newdf[column]))
))
## Wilayah
##     1     2 
## 10240 12324 
## Pend
##     0     1     2     3 
##   428  6370 12246  3520 
## SK
##    1    2    3    4    5 
## 7284 4328 3958 3666 3328 
## BBL
##  BBLN BBLTN 
## 11282 11282
# Class proportions after up-sampling (rounded to two decimals), drawn as a
# hatched bar chart.
propBBL <- table(newdf$BBL)
perc <- round(propBBL / sum(propBBL), digits = 2)
barplot(
  perc,
  names.arg = c("BBLN", "BBLTN"),
  main = "Berat Bayi Lahir",
  xlab = "Label",
  ylab = "proporsi",
  col = "brown",
  density = 10,
  angle = 45
)

## DATA MINING

Membuat Model

# Formula shared by all classifiers below: BBL is the response and Pend, SK,
# Ante, Umur and Wilayah are the predictors.
# attach(newdf) was dropped: every model call passes its data explicitly via
# `data =`, so the formula variables are resolved there and nothing has to be
# placed on the search path (where it could mask or be masked by other names).
model <- BBL ~ Pend + SK + Ante + Umur + Wilayah

Pada setiap metode yang digunakan, pembuatan data training dan data testing memakai Splitting Data Train dan Data Test dengan 70:30 dan Cross Validation

# 70:30 hold-out split of the balanced data; the fixed seed makes the drawn
# row indices reproducible.
set.seed(100)
sampling <- sample(seq_len(nrow(newdf)), 0.7 * nrow(newdf))
train_set <- newdf[sampling, ]
test_set <- newdf[-sampling, ]

# 10-fold cross-validation settings (with per-fold progress messages) passed
# to the caret::train() calls.
myControl <- trainControl(method = "cv", number = 10, verboseIter = TRUE)

## Decision Tree dengan Splitting 70:30

# Fit a classification tree on the training split and draw it.
dtree <- rpart(model, data = train_set, method = "class")
rpart.plot(dtree, extra = 106)

# Predict class labels for the hold-out split and compare them with the
# observed classes.
pred_dtree <- predict(dtree, newdata = test_set, type = "class")
confusionMatrix(data = pred_dtree, reference = test_set$BBL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLTN
##      BBLN  2359  1482
##      BBLTN 1021  1908
##                                           
##                Accuracy : 0.6303          
##                  95% CI : (0.6187, 0.6418)
##     No Information Rate : 0.5007          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2607          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6979          
##             Specificity : 0.5628          
##          Pos Pred Value : 0.6142          
##          Neg Pred Value : 0.6514          
##              Prevalence : 0.4993          
##          Detection Rate : 0.3484          
##    Detection Prevalence : 0.5674          
##       Balanced Accuracy : 0.6304          
##                                           
##        'Positive' Class : BBLN            
## 

Decision Tree dengan Cross Validation

# Decision tree tuned through caret with the 10-fold CV settings in myControl.
dtree_cv <- train(
  model,
  data = train_set,
  method = "rpart",
  trControl = myControl
)
## + Fold01: cp=0.003833 
## - Fold01: cp=0.003833 
## + Fold02: cp=0.003833 
## - Fold02: cp=0.003833 
## + Fold03: cp=0.003833 
## - Fold03: cp=0.003833 
## + Fold04: cp=0.003833 
## - Fold04: cp=0.003833 
## + Fold05: cp=0.003833 
## - Fold05: cp=0.003833 
## + Fold06: cp=0.003833 
## - Fold06: cp=0.003833 
## + Fold07: cp=0.003833 
## - Fold07: cp=0.003833 
## + Fold08: cp=0.003833 
## - Fold08: cp=0.003833 
## + Fold09: cp=0.003833 
## - Fold09: cp=0.003833 
## + Fold10: cp=0.003833 
## - Fold10: cp=0.003833 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00383 on full training set
# Evaluate the cross-validated tree on the hold-out split.
confusionMatrix(
  as.factor(predict(dtree_cv, newdata = test_set)),
  as.factor(test_set$BBL)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLTN
##      BBLN  2471  1815
##      BBLTN  909  1575
##                                           
##                Accuracy : 0.5976          
##                  95% CI : (0.5858, 0.6093)
##     No Information Rate : 0.5007          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.1956          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7311          
##             Specificity : 0.4646          
##          Pos Pred Value : 0.5765          
##          Neg Pred Value : 0.6341          
##              Prevalence : 0.4993          
##          Detection Rate : 0.3650          
##    Detection Prevalence : 0.6331          
##       Balanced Accuracy : 0.5978          
##                                           
##        'Positive' Class : BBLN            
## 

Naive Bayes dengan Splitting 70:30

# Fit a naive Bayes classifier on the training split and show its a-priori
# and conditional probability tables.
nb <- naiveBayes(model, data = train_set)
nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      BBLN     BBLTN 
## 0.5003166 0.4996834 
## 
## Conditional probabilities:
##        Pend
## Y                 0           1           2           3
##   BBLN  0.006833713 0.220956720 0.585674513 0.186535054
##   BBLTN 0.030283832 0.345919919 0.499746579 0.124049671
## 
##        SK
## Y               1         2         3         4         5
##   BBLN  0.2158947 0.1979246 0.2003290 0.1974184 0.1884333
##   BBLTN 0.4235935 0.1841105 0.1533198 0.1325393 0.1064369
## 
##        Ante
## Y           [,1]     [,2]
##   BBLN  8.021893 3.041734
##   BBLTN 6.422960 3.459346
## 
##        Umur
## Y           [,1]     [,2]
##   BBLN  22.42356 4.323949
##   BBLTN 21.46211 4.415467
## 
##        Wilayah
## Y               1         2
##   BBLN  0.5234118 0.4765882
##   BBLTN 0.3857070 0.6142930
# Predict class labels for the hold-out split and compare them with the
# observed classes.
pred_nb <- predict(nb, newdata = test_set, type = "class")
confusionMatrix(data = pred_nb, reference = test_set$BBL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLTN
##      BBLN  2257  1366
##      BBLTN 1123  2024
##                                           
##                Accuracy : 0.6323          
##                  95% CI : (0.6207, 0.6438)
##     No Information Rate : 0.5007          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2648          
##                                           
##  Mcnemar's Test P-Value : 1.23e-06        
##                                           
##             Sensitivity : 0.6678          
##             Specificity : 0.5971          
##          Pos Pred Value : 0.6230          
##          Neg Pred Value : 0.6432          
##              Prevalence : 0.4993          
##          Detection Rate : 0.3334          
##    Detection Prevalence : 0.5352          
##       Balanced Accuracy : 0.6324          
##                                           
##        'Positive' Class : BBLN            
## 

Naive Bayes dengan Cross Validation

# Naive Bayes tuned through caret with the 10-fold CV settings in myControl.
nb_cv <- train(
  model,
  data = train_set,
  method = "naive_bayes",
  trControl = myControl
)
## + Fold01: usekernel= TRUE, laplace=0, adjust=1 
## - Fold01: usekernel= TRUE, laplace=0, adjust=1 
## + Fold01: usekernel=FALSE, laplace=0, adjust=1 
## - Fold01: usekernel=FALSE, laplace=0, adjust=1 
## + Fold02: usekernel= TRUE, laplace=0, adjust=1 
## - Fold02: usekernel= TRUE, laplace=0, adjust=1 
## + Fold02: usekernel=FALSE, laplace=0, adjust=1 
## - Fold02: usekernel=FALSE, laplace=0, adjust=1 
## + Fold03: usekernel= TRUE, laplace=0, adjust=1 
## - Fold03: usekernel= TRUE, laplace=0, adjust=1 
## + Fold03: usekernel=FALSE, laplace=0, adjust=1 
## - Fold03: usekernel=FALSE, laplace=0, adjust=1 
## + Fold04: usekernel= TRUE, laplace=0, adjust=1 
## - Fold04: usekernel= TRUE, laplace=0, adjust=1 
## + Fold04: usekernel=FALSE, laplace=0, adjust=1 
## - Fold04: usekernel=FALSE, laplace=0, adjust=1 
## + Fold05: usekernel= TRUE, laplace=0, adjust=1 
## - Fold05: usekernel= TRUE, laplace=0, adjust=1 
## + Fold05: usekernel=FALSE, laplace=0, adjust=1 
## - Fold05: usekernel=FALSE, laplace=0, adjust=1 
## + Fold06: usekernel= TRUE, laplace=0, adjust=1 
## - Fold06: usekernel= TRUE, laplace=0, adjust=1 
## + Fold06: usekernel=FALSE, laplace=0, adjust=1 
## - Fold06: usekernel=FALSE, laplace=0, adjust=1 
## + Fold07: usekernel= TRUE, laplace=0, adjust=1 
## - Fold07: usekernel= TRUE, laplace=0, adjust=1 
## + Fold07: usekernel=FALSE, laplace=0, adjust=1 
## - Fold07: usekernel=FALSE, laplace=0, adjust=1 
## + Fold08: usekernel= TRUE, laplace=0, adjust=1 
## - Fold08: usekernel= TRUE, laplace=0, adjust=1 
## + Fold08: usekernel=FALSE, laplace=0, adjust=1 
## - Fold08: usekernel=FALSE, laplace=0, adjust=1 
## + Fold09: usekernel= TRUE, laplace=0, adjust=1 
## - Fold09: usekernel= TRUE, laplace=0, adjust=1 
## + Fold09: usekernel=FALSE, laplace=0, adjust=1 
## - Fold09: usekernel=FALSE, laplace=0, adjust=1 
## + Fold10: usekernel= TRUE, laplace=0, adjust=1 
## - Fold10: usekernel= TRUE, laplace=0, adjust=1 
## + Fold10: usekernel=FALSE, laplace=0, adjust=1 
## - Fold10: usekernel=FALSE, laplace=0, adjust=1 
## Aggregating results
## Selecting tuning parameters
## Fitting laplace = 0, usekernel = FALSE, adjust = 1 on full training set
# Show the resampling results for each tuning-parameter combination and the
# combination that was selected.
print(nb_cv)
## Naive Bayes 
## 
## 15794 samples
##     5 predictor
##     2 classes: 'BBLN', 'BBLTN' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 14215, 14215, 14215, 14215, 14213, 14214, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.6082052  0.2164282
##    TRUE      0.5957967  0.1917538
## 
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = FALSE
##  and adjust = 1.
# Evaluate the cross-validated naive Bayes model on the hold-out split.
confusionMatrix(
  as.factor(predict(nb_cv, newdata = test_set)),
  as.factor(test_set$BBL)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLTN
##      BBLN  2024  1235
##      BBLTN 1356  2155
##                                           
##                Accuracy : 0.6173          
##                  95% CI : (0.6056, 0.6289)
##     No Information Rate : 0.5007          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.2345          
##                                           
##  Mcnemar's Test P-Value : 0.0184          
##                                           
##             Sensitivity : 0.5988          
##             Specificity : 0.6357          
##          Pos Pred Value : 0.6210          
##          Neg Pred Value : 0.6138          
##              Prevalence : 0.4993          
##          Detection Rate : 0.2990          
##    Detection Prevalence : 0.4814          
##       Balanced Accuracy : 0.6173          
##                                           
##        'Positive' Class : BBLN            
## 

Lalu akan dibuat kurva ROC untuk membandingkan kedua metode tersebut

# Use a square plotting region for the ROC plot.
par(pty = "s")
# ROC for the decision tree. The response is coded 0 = BBLN, 1 = BBLTN and the
# predictor is the predicted probability of BBLTN. Feeding the hard 0/1 class
# label instead leaves only one threshold, which collapses the curve to a
# single point and reduces the AUC to the balanced accuracy.
dtreeROC <- roc(ifelse(test_set$BBL == "BBLN", 0, 1),
                predict(dtree, newdata = test_set, type = "prob")[, "BBLTN"],
                plot = TRUE, print.auc = TRUE, col = "green", lwd = 4,
                legacy.axes = TRUE, main = "ROC Curves")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# ROC for naive Bayes, drawn on the same axes and built the same way from the
# posterior probability of BBLTN.
nbayesROC <- roc(ifelse(test_set$BBL == "BBLN", 0, 1),
                 predict(nb, newdata = test_set, type = "raw")[, "BBLTN"],
                 plot = TRUE, print.auc = TRUE, col = "blue", lwd = 4,
                 print.auc.y = 0.4, legacy.axes = TRUE, add = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("bottomright", legend = c("Decision Tree", "Naive Bayes"),
       col = c("green", "blue"), lwd = 4)