Import Dataset

# NOTE(review): absolute, machine-specific working directory; the script only
# runs unchanged on a machine that has this folder.
setwd("D:/STIS/4SE/4. Data Mining/Tugas/Bismillah Dulu")
library(readxl)
## Warning: package 'readxl' was built under R version 4.0.5
# Read the workbook "dataya.xlsx" from the working directory into Data.
Data <- read_excel("dataya.xlsx")
#View(Data)

Load Library

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
library(data.table)
## Warning: package 'data.table' was built under R version 4.0.5
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(DT)
## Warning: package 'DT' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(rpart)
## Warning: package 'rpart' was built under R version 4.0.5
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 4.0.5
library(ggplot2)
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.5
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.0.5
# Convert the object returned by read_excel() into a plain data.frame.
dataset<-as.data.frame(Data)
# Inspect the column names and types of the imported data.
str(dataset)
## 'data.frame':    17848 obs. of  7 variables:
##  $ V102  : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ V106  : num  1 1 1 0 3 1 1 1 3 3 ...
##  $ V190  : num  1 1 3 1 5 2 2 1 2 2 ...
##  $ V212  : num  21 21 17 22 26 18 18 18 25 25 ...
##  $ M14   : num  6 NA 9 9 7 9 NA 9 4 NA ...
##  $ M19   : num  2100 2000 3000 2500 3400 5000 4500 3000 4700 3600 ...
##  $ M14nol: num  6 0 9 9 7 9 0 9 4 0 ...
#View(dataset)
# Replace the original variable codes with descriptive column names
# (new name = old name). M14nol keeps its original name.
dataset <- dataset %>%
  rename(
    Wilayah = V102,
    Pend = V106,
    SK = V190,
    Umur = V212,
    Ante = M14,
    Berat = M19
  )

str(dataset)
## 'data.frame':    17848 obs. of  7 variables:
##  $ Wilayah: num  2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : num  1 1 1 0 3 1 1 1 3 3 ...
##  $ SK     : num  1 1 3 1 5 2 2 1 2 2 ...
##  $ Umur   : num  21 21 17 22 26 18 18 18 25 25 ...
##  $ Ante   : num  6 NA 9 9 7 9 NA 9 4 NA ...
##  $ Berat  : num  2100 2000 3000 2500 3400 5000 4500 3000 4700 3600 ...
##  $ M14nol : num  6 0 9 9 7 9 0 9 4 0 ...
# Treat the first three columns (Wilayah, Pend, SK) as categorical variables.
dataset[1:3] <- lapply(dataset[1:3], as.factor)
str(dataset)
## 'data.frame':    17848 obs. of  7 variables:
##  $ Wilayah: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : Factor w/ 4 levels "0","1","2","3": 2 2 2 1 4 2 2 2 4 4 ...
##  $ SK     : Factor w/ 5 levels "1","2","3","4",..: 1 1 3 1 5 2 2 1 2 2 ...
##  $ Umur   : num  21 21 17 22 26 18 18 18 25 25 ...
##  $ Ante   : num  6 NA 9 9 7 9 NA 9 4 NA ...
##  $ Berat  : num  2100 2000 3000 2500 3400 5000 4500 3000 4700 3600 ...
##  $ M14nol : num  6 0 9 9 7 9 0 9 4 0 ...
#View(dataset)
# Derive the birth-weight class BBL from Berat:
#   Berat <= 2500         -> "BBLR"
#   2500 < Berat <= 3999  -> "BBLN"
#   Berat > 3999          -> "BBLL"
# The classification is vectorised so the column is built in a single pass and
# a missing Berat yields a missing BBL instead of aborting the script with
# "missing value where TRUE/FALSE needed".
dataset$BBL <- as.factor(
  ifelse(
    dataset$Berat <= 2500,
    "BBLR",
    ifelse(dataset$Berat <= 3999, "BBLN", "BBLL")
  )
)
str(dataset)
## 'data.frame':    17848 obs. of  8 variables:
##  $ Wilayah: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : Factor w/ 4 levels "0","1","2","3": 2 2 2 1 4 2 2 2 4 4 ...
##  $ SK     : Factor w/ 5 levels "1","2","3","4",..: 1 1 3 1 5 2 2 1 2 2 ...
##  $ Umur   : num  21 21 17 22 26 18 18 18 25 25 ...
##  $ Ante   : num  6 NA 9 9 7 9 NA 9 4 NA ...
##  $ Berat  : num  2100 2000 3000 2500 3400 5000 4500 3000 4700 3600 ...
##  $ M14nol : num  6 0 9 9 7 9 0 9 4 0 ...
##  $ BBL    : Factor w/ 3 levels "BBLL","BBLN",..: 3 3 2 3 2 1 1 2 1 2 ...

Buang missing value & cek outlier

# Drop every row that has a missing value in any column.
# NOTE(review): this also removes rows where only Ante is NA, although M14nol
# appears to hold the same values with NA replaced by 0 - confirm this is intended.
dataset <- na.omit(dataset)

# Visual check for outliers in Umur.
boxplot(dataset$Umur)

# Visual check for outliers in Ante.
boxplot(dataset$Ante)

# List the Umur values flagged as outliers by the boxplot rule (no plot drawn).
boxplot(dataset$Umur, plot=FALSE)$out
##   [1] 36 37 38 43 37 37 37 36 35 37 38 37 44 36 35 35 42 36 37 36 46 35 35 36 35
##  [26] 35 36 41 36 36 39 36 35 39 37 37 38 35 39 39 35 38 41 42 39 35 35 42 37 35
##  [51] 37 39 39 37 35 35 36 44 36 38 42 35 35 37 40 37 42 43 39 41 42 35 41 35 37
##  [76] 38 38 42 38 39 42 40 38 40 36 39 39 38 38 35 37 38 39 41 37 35 37 35 37 38
## [101] 35 39 36 38 40 35 38 36 35 37 36 39 41 37 40 36 41 45 40 35 36 36 40 35 36
## [126] 38 41 35 37 36 35 35 42 41 36 37 37 36 35 36 40 38 39 43 35 38 37 37 37 37
## [151] 35 35 44 38 38 37 36 37 37 35 41 36 39 35 36 38 38 35 41 37 35 35 38 37 39
## [176] 35 37 38 38 38 35 35 37 43 42 39 35 41 40 37
# List the Ante values flagged as outliers by the boxplot rule (no plot drawn).
boxplot(dataset$Ante, plot=FALSE)$out
##   [1] 18 21 98 98 98 36 24 26 98 98 18 28 98 98 98 18 98 98 98 18 18 21 98 98 32
##  [26] 98 18 32 98 98 98 18 17 17 18 26 17 17 18 17 17 98 17 21 20 19 18 18 17 18
##  [51] 18 26 99 25 20 98 21 98 23 21 18 32 20 18 17 98 18 18 25 32 98 20 24 18 18
##  [76] 18 19 18 17 22 98 19 17 98 33 98 18 27 98 98 99 18 18 17 40 20 18 18 18 18
## [101] 18 19 19 19 98 17 20 19 17 18 18 98 18 17 20 27 18 17 98 20 20 98 18 18 98
## [126] 21 20 21 17 18 20 17 18 17 98 98 17 99 18 24 24 20 19 20 20 22 18 21 18 20
## [151] 21 18 18 18 20 20 21 28 98 23 98 20 18 19 19 17 29 17 18 17 19 25 17 18 21
## [176] 98 18 24 18 17 32 20 19 20 17 19 21 22 20 18 17 18 18 24 17 21 18 18 21 17
## [201] 17 22 27 18 18 98 28 20 17 17 22 24 24 18 20 17 20 18 98 98 18 98 18 18 18
## [226] 18 98 18 23 98 17 21 98 98 99 98 18 98 98 98 18 27 22 24 32 17 18 17 98 17
## [251] 27 98 20 17 28 98 98 17 18 18 18 17 18 18 18 40 20 20 22 27 98 19 20 18 18
## [276] 17 18 26 28 98 98 98 24 98 98 20 98 22 98 98 27 17 25 17 99 23 18 24 40 22
## [301] 98 98 98 98 18 20 98 23 18 99 98 98 98 98 98 98 98 22 98 98 98 98 98
# Ante values flagged as outliers by the boxplot rule (computed without plotting).
outliers <- boxplot(dataset$Ante, plot = FALSE)$out

# Keep only the rows whose Ante is not an outlier. A logical mask is used rather
# than negative indexing with which(): when nothing is flagged, which() returns
# integer(0) and dataset[-integer(0), ] would drop every row.
dataset <- dataset[!(dataset$Ante %in% outliers), ]
#View(dataset)

============================ DATASET1 COBA

# Collapse the three birth-weight classes into two: "BBLN" is kept as is, while
# "BBLL" and "BBLR" are both relabelled "BBLTN".
dataset1 <- dataset
dataset1$BBL <- as.factor(
  gsub("BBLR", "BBLTN", gsub("BBLL", "BBLTN", dataset1$BBL))
)


# Class counts after the recode.
propBBL <- table(dataset1$BBL)
propBBL
## 
##  BBLN BBLTN 
## 11282  3752
# Class proportions, rounded to two decimals, drawn as a hatched bar chart.
perc <- round(prop.table(propBBL), digits = 2)
barplot(
  perc,
  main = "Berat Bayi Lahir",
  xlab = "Label",
  ylab = "proporsi",
  col = "brown",
  density = 10,
  angle = 45,
  names.arg = c("BBLN", "BBLTN")
)

# Frequency tables of the categorical variables in dataset1.
print(table(Wilayah = dataset1$Wilayah))
print(table(Pend = dataset1$Pend))
print(table(SK = dataset1$SK))
print(table(BBL = dataset1$BBL))
## Wilayah
##    1    2 
## 7384 7650 
## Pend
##    0    1    2    3 
##  197 3791 8466 2580 
## SK
##    1    2    3    4    5 
## 4014 2987 2826 2685 2522 
## BBL
##  BBLN BBLTN 
## 11282  3752
 # dataset1 <- na.omit(dataset1)
  
  # Disabled alternative: balance the classes by randomly downsampling BBLN
  # to the number of BBLTN rows.
  #BBLN <- which(dataset1$BBL == "BBLN")
  #BBLTN <- which(dataset1$BBL =="BBLTN")
  #length(BBLN)
  #length(BBLTN)
  #BBLN.downsample <- sample(BBLN,length(BBLTN))
  #dataset1.down <- dataset1[c(BBLN.downsample,BBLTN),]
  #View(dataset1.down)
#str(dataset1.down)
# caret is already attached earlier in the script; this call is a harmless repeat.
library(caret)
# dataset3 is a copy of dataset1 and is the input to the upsampling step.
dataset3 <- dataset1
str(dataset3)
## 'data.frame':    15034 obs. of  8 variables:
##  $ Wilayah: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : Factor w/ 4 levels "0","1","2","3": 2 2 1 4 2 2 4 4 3 2 ...
##  $ SK     : Factor w/ 5 levels "1","2","3","4",..: 1 3 1 5 2 1 2 2 1 1 ...
##  $ Umur   : num  21 17 22 26 18 18 25 36 20 17 ...
##  $ Ante   : num  6 9 9 7 9 9 4 9 7 4 ...
##  $ Berat  : num  2100 3000 2500 3400 5000 3000 4700 4100 3500 2800 ...
##  $ M14nol : num  6 9 9 7 9 9 4 9 7 4 ...
##  $ BBL    : Factor w/ 2 levels "BBLN","BBLTN": 2 1 2 1 2 1 2 2 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2491] 2 7 10 12 26 40 43 60 91 93 ...
##   ..- attr(*, "names")= chr [1:2491] "2" "7" "10" "12" ...
# Class counts before upsampling.
propBBL <- table(dataset3$BBL)
propBBL
## 
##  BBLN BBLTN 
## 11282  3752
# Rounded class proportions drawn as a hatched bar chart.
perc <- round(prop.table(propBBL), digits = 2)
barplot(
  perc,
  main = "Berat Bayi Lahir",
  xlab = "Label",
  ylab = "proporsi",
  col = "brown",
  density = 10,
  angle = 45,
  names.arg = c("BBLN", "BBLTN")
)

# Balance the classes with caret::upSample(): every column except the 8th (BBL)
# is passed as predictors, and the class column is returned under the name BBL.
newdf <- upSample(x = dataset3[, -8], y = dataset3$BBL, yname = "BBL")
str(newdf)
## 'data.frame':    22564 obs. of  8 variables:
##  $ Wilayah: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Pend   : Factor w/ 4 levels "0","1","2","3": 2 4 2 3 2 2 3 3 3 3 ...
##  $ SK     : Factor w/ 5 levels "1","2","3","4",..: 3 5 1 1 1 2 4 2 4 2 ...
##  $ Umur   : num  17 26 18 20 17 19 18 27 21 30 ...
##  $ Ante   : num  9 7 9 7 4 7 4 9 3 8 ...
##  $ Berat  : num  3000 3400 3000 3500 2800 2800 3000 3500 3000 2800 ...
##  $ M14nol : num  9 7 9 7 4 7 4 9 3 8 ...
##  $ BBL    : Factor w/ 2 levels "BBLN","BBLTN": 1 1 1 1 1 1 1 1 1 1 ...
# Frequency tables of the categorical variables in the upsampled data.
print(table(Wilayah = newdf$Wilayah))
print(table(Pend = newdf$Pend))
print(table(SK = newdf$SK))
print(table(BBL = newdf$BBL))
## Wilayah
##     1     2 
## 10200 12364 
## Pend
##     0     1     2     3 
##   450  6365 12208  3541 
## SK
##    1    2    3    4    5 
## 7212 4442 3961 3671 3278 
## BBL
##  BBLN BBLTN 
## 11282 11282
# Class counts after upsampling.
propBBL <- table(newdf$BBL)
propBBL
## 
##  BBLN BBLTN 
## 11282 11282
# Rounded class proportions drawn as a hatched bar chart.
perc <- round(prop.table(propBBL), digits = 2)
barplot(
  perc,
  main = "Berat Bayi Lahir",
  xlab = "Label",
  ylab = "proporsi",
  col = "brown",
  density = 10,
  angle = 45,
  names.arg = c("BBLN", "BBLTN")
)

===============================

===============================================##########################=====================

#propBBL<-table(dataset$BBL)
#propBBL
#perc<-round(prop.table(propBBL), digits=2)
#barplot(perc,
      #  main="Berat Bayi Lahir",
      #  xlab="Label",
      #  ylab="proporsi",
      #  col="brown",
      #  density = 10,
      #  angle = 45,
      #  names.arg = c("BBLL","BBLN","BBLR"))

=======================================##############################=============================

Membuat Model

# Model formula shared by every classifier in this script: BBL is predicted from
# Pend, SK, Ante, Umur and Wilayah.
# newdf is deliberately not attach()ed: each fitting call receives its data
# through `data =`, and attaching would leave a second copy of the columns on
# the search path where it can mask or be confused with other objects.
model <- BBL ~ Pend + SK + Ante + Umur + Wilayah

Membuat Splitting Data Train dan Data Test dengan 70:30 dan Cross Validation

# Hold-out split (70 % train / 30 % test) and the resampling scheme for caret.
# The split is drawn from dataset3, i.e. before any oversampling, and only the
# training rows are then balanced with upSample(). Splitting the already
# upsampled newdf would put copies of the same minority-class rows into both
# partitions, leaking training observations into the test set and inflating
# the reported performance.
# NOTE: the metrics recorded further down in this document were produced with
# the earlier split and will change when the script is re-run.
set.seed(100)
sampling <- sample(seq_len(nrow(dataset3)), 0.7 * nrow(dataset3))
train_set <- upSample(
  x = dataset3[sampling, names(dataset3) != "BBL"],
  y = dataset3$BBL[sampling],
  yname = "BBL"
)
# The test set keeps the original (unbalanced) class distribution.
test_set <- dataset3[-sampling, ]

# 10-fold cross-validation; verboseIter prints progress for every fold.
myControl <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = TRUE
)

Decision Tree Splitting 70:30

# Fit a classification tree on the training partition and draw it.
dtree <- rpart(formula = model, data = train_set, method = "class")
rpart.plot(dtree, extra = 106)

# Predict class labels for the held-out rows and compare them with the truth.
pred_dtree <- predict(object = dtree, newdata = test_set, type = "class")
confusionMatrix(data = pred_dtree, reference = test_set$BBL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLTN
##      BBLN  2594  1782
##      BBLTN  786  1608
##                                          
##                Accuracy : 0.6207         
##                  95% CI : (0.609, 0.6323)
##     No Information Rate : 0.5007         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.2417         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.7675         
##             Specificity : 0.4743         
##          Pos Pred Value : 0.5928         
##          Neg Pred Value : 0.6717         
##              Prevalence : 0.4993         
##          Detection Rate : 0.3832         
##    Detection Prevalence : 0.6464         
##       Balanced Accuracy : 0.6209         
##                                          
##        'Positive' Class : BBLN           
## 

Decision Tree Cross Validation

# Tune and fit an rpart tree with caret, using the 10-fold CV scheme in myControl.
dtree_cv <- train(
  model,
  data = train_set,
  method = "rpart",
  trControl = myControl
)
## + Fold01: cp=0.005406 
## - Fold01: cp=0.005406 
## + Fold02: cp=0.005406 
## - Fold02: cp=0.005406 
## + Fold03: cp=0.005406 
## - Fold03: cp=0.005406 
## + Fold04: cp=0.005406 
## - Fold04: cp=0.005406 
## + Fold05: cp=0.005406 
## - Fold05: cp=0.005406 
## + Fold06: cp=0.005406 
## - Fold06: cp=0.005406 
## + Fold07: cp=0.005406 
## - Fold07: cp=0.005406 
## + Fold08: cp=0.005406 
## - Fold08: cp=0.005406 
## + Fold09: cp=0.005406 
## - Fold09: cp=0.005406 
## + Fold10: cp=0.005406 
## - Fold10: cp=0.005406 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00541 on full training set
# Confusion matrix of the cross-validated tree on the held-out rows.
confusionMatrix(
  data = as.factor(predict(dtree_cv, newdata = test_set)),
  reference = as.factor(test_set$BBL)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLTN
##      BBLN  2315  1682
##      BBLTN 1065  1708
##                                          
##                Accuracy : 0.5942         
##                  95% CI : (0.5824, 0.606)
##     No Information Rate : 0.5007         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.1887         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.6849         
##             Specificity : 0.5038         
##          Pos Pred Value : 0.5792         
##          Neg Pred Value : 0.6159         
##              Prevalence : 0.4993         
##          Detection Rate : 0.3419         
##    Detection Prevalence : 0.5904         
##       Balanced Accuracy : 0.5944         
##                                          
##        'Positive' Class : BBLN           
## 
#Random Forest dengan Splitting 70:30
#rf <- randomForest(model, data = train_set)
#print(rf)
#pred_rf <- predict(rf, newdata = test_set)
#confusionMatrix(pred_rf %>% as.factor(), test_set$BBL %>% as.factor())
#Random Forest dengan Cross Validation
#rf_cv <- train(model, data=train_set,
 #              method='rf',
 #              trControl=myControl)
#confusionMatrix(predict(rf_cv, newdata=test_set) %>% as.factor(),test_set$BBL %>% as.factor())

Naive Bayes dengan Splitting 70:30

# Fit a naive Bayes classifier (e1071) on the training partition using the
# shared model formula, then print its priors and conditional distributions.
nb <- naiveBayes(model, data = train_set)
print(nb)
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      BBLN     BBLTN 
## 0.5003166 0.4996834 
## 
## Conditional probabilities:
##        Pend
## Y                 0           1           2           3
##   BBLN  0.006833713 0.220956720 0.585674513 0.186535054
##   BBLTN 0.033831728 0.348454131 0.495818550 0.121895590
## 
##        SK
## Y                1          2          3          4          5
##   BBLN  0.21589471 0.19792458 0.20032903 0.19741838 0.18843331
##   BBLTN 0.42372022 0.19386721 0.15534719 0.12886467 0.09820071
## 
##        Ante
## Y           [,1]     [,2]
##   BBLN  8.021893 3.041734
##   BBLTN 6.375697 3.437017
## 
##        Umur
## Y           [,1]     [,2]
##   BBLN  22.42356 4.323949
##   BBLTN 21.48961 4.427969
## 
##        Wilayah
## Y               1         2
##   BBLN  0.5234118 0.4765882
##   BBLTN 0.3767106 0.6232894
# Predict class labels for the held-out rows with the naive Bayes model and
# compare them with the true classes.
pred_nb <- predict(object = nb, newdata = test_set)
confusionMatrix(data = pred_nb, reference = test_set$BBL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLTN
##      BBLN  2232  1406
##      BBLTN 1148  1984
##                                           
##                Accuracy : 0.6227          
##                  95% CI : (0.6111, 0.6343)
##     No Information Rate : 0.5007          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2456          
##                                           
##  Mcnemar's Test P-Value : 3.669e-07       
##                                           
##             Sensitivity : 0.6604          
##             Specificity : 0.5853          
##          Pos Pred Value : 0.6135          
##          Neg Pred Value : 0.6335          
##              Prevalence : 0.4993          
##          Detection Rate : 0.3297          
##    Detection Prevalence : 0.5374          
##       Balanced Accuracy : 0.6228          
##                                           
##        'Positive' Class : BBLN            
## 

Naive Bayes dengan Cross Validation

# Tune and fit a naive Bayes model with caret, using the 10-fold CV scheme in myControl.
nb_cv <- train(
  model,
  data = train_set,
  method = "naive_bayes",
  trControl = myControl
)
## + Fold01: usekernel= TRUE, laplace=0, adjust=1 
## - Fold01: usekernel= TRUE, laplace=0, adjust=1 
## + Fold01: usekernel=FALSE, laplace=0, adjust=1 
## - Fold01: usekernel=FALSE, laplace=0, adjust=1 
## + Fold02: usekernel= TRUE, laplace=0, adjust=1 
## - Fold02: usekernel= TRUE, laplace=0, adjust=1 
## + Fold02: usekernel=FALSE, laplace=0, adjust=1 
## - Fold02: usekernel=FALSE, laplace=0, adjust=1 
## + Fold03: usekernel= TRUE, laplace=0, adjust=1 
## - Fold03: usekernel= TRUE, laplace=0, adjust=1 
## + Fold03: usekernel=FALSE, laplace=0, adjust=1 
## - Fold03: usekernel=FALSE, laplace=0, adjust=1 
## + Fold04: usekernel= TRUE, laplace=0, adjust=1 
## - Fold04: usekernel= TRUE, laplace=0, adjust=1 
## + Fold04: usekernel=FALSE, laplace=0, adjust=1 
## - Fold04: usekernel=FALSE, laplace=0, adjust=1 
## + Fold05: usekernel= TRUE, laplace=0, adjust=1 
## - Fold05: usekernel= TRUE, laplace=0, adjust=1 
## + Fold05: usekernel=FALSE, laplace=0, adjust=1 
## - Fold05: usekernel=FALSE, laplace=0, adjust=1 
## + Fold06: usekernel= TRUE, laplace=0, adjust=1 
## - Fold06: usekernel= TRUE, laplace=0, adjust=1 
## + Fold06: usekernel=FALSE, laplace=0, adjust=1 
## - Fold06: usekernel=FALSE, laplace=0, adjust=1 
## + Fold07: usekernel= TRUE, laplace=0, adjust=1 
## - Fold07: usekernel= TRUE, laplace=0, adjust=1 
## + Fold07: usekernel=FALSE, laplace=0, adjust=1 
## - Fold07: usekernel=FALSE, laplace=0, adjust=1 
## + Fold08: usekernel= TRUE, laplace=0, adjust=1 
## - Fold08: usekernel= TRUE, laplace=0, adjust=1 
## + Fold08: usekernel=FALSE, laplace=0, adjust=1 
## - Fold08: usekernel=FALSE, laplace=0, adjust=1 
## + Fold09: usekernel= TRUE, laplace=0, adjust=1 
## - Fold09: usekernel= TRUE, laplace=0, adjust=1 
## + Fold09: usekernel=FALSE, laplace=0, adjust=1 
## - Fold09: usekernel=FALSE, laplace=0, adjust=1 
## + Fold10: usekernel= TRUE, laplace=0, adjust=1 
## - Fold10: usekernel= TRUE, laplace=0, adjust=1 
## + Fold10: usekernel=FALSE, laplace=0, adjust=1 
## - Fold10: usekernel=FALSE, laplace=0, adjust=1 
## Aggregating results
## Selecting tuning parameters
## Fitting laplace = 0, usekernel = FALSE, adjust = 1 on full training set
# Show the cross-validated accuracy and kappa for each tuning-parameter setting.
print(nb_cv)
## Naive Bayes 
## 
## 15794 samples
##     5 predictor
##     2 classes: 'BBLN', 'BBLTN' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 14215, 14215, 14215, 14215, 14213, 14214, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.6080146  0.2160635
##    TRUE      0.5967474  0.1936645
## 
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = FALSE
##  and adjust = 1.
# Confusion matrix of the cross-validated naive Bayes model on the held-out rows.
confusionMatrix(
  data = as.factor(predict(nb_cv, newdata = test_set)),
  reference = as.factor(test_set$BBL)
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLTN
##      BBLN  1988  1237
##      BBLTN 1392  2153
##                                           
##                Accuracy : 0.6117          
##                  95% CI : (0.5999, 0.6233)
##     No Information Rate : 0.5007          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2233          
##                                           
##  Mcnemar's Test P-Value : 0.002669        
##                                           
##             Sensitivity : 0.5882          
##             Specificity : 0.6351          
##          Pos Pred Value : 0.6164          
##          Neg Pred Value : 0.6073          
##              Prevalence : 0.4993          
##          Detection Rate : 0.2936          
##    Detection Prevalence : 0.4764          
##       Balanced Accuracy : 0.6116          
##                                           
##        'Positive' Class : BBLN            
## 

Membuat Kurva ROC

# ROC curves on the held-out rows, with BBLN as the control class and BBLTN as
# the case class. The predictor passed to roc() is the predicted probability of
# BBLTN: feeding hard class labels instead would give a curve with a single
# operating point, and its AUC would only restate the balanced accuracy.
# levels and direction are set explicitly so pROC does not choose them from the data.
par(pty = "s")  # square plotting region
# ROC for the decision tree
prob_dtree <- predict(dtree, newdata = test_set, type = "prob")[, "BBLTN"]
dtreeROC <- roc(
  ifelse(test_set$BBL == "BBLN", 0, 1),
  prob_dtree,
  levels = c(0, 1),
  direction = "<",
  plot = TRUE,
  print.auc = TRUE,
  col = "green",
  lwd = 4,
  legacy.axes = TRUE,
  main = "ROC Curves"
)
# ROC for naive Bayes, added to the same plot
prob_nb <- predict(nb, newdata = test_set, type = "raw")[, "BBLTN"]
nbayesROC <- roc(
  ifelse(test_set$BBL == "BBLN", 0, 1),
  prob_nb,
  levels = c(0, 1),
  direction = "<",
  plot = TRUE,
  print.auc = TRUE,
  col = "blue",
  lwd = 4,
  print.auc.y = 0.4,
  legacy.axes = TRUE,
  add = TRUE
)
legend(
  "bottomright",
  legend = c("Decision Tree", "Naive Bayes"),
  col = c("green", "blue"),
  lwd = 4
)