Importing the Dataset

setwd("D:/STIS/4SE/4. Data Mining/Tugas/FIX BENERAN YAK GUA GEBUG LUH")
library(readxl)
## Warning: package 'readxl' was built under R version 4.0.5
Data <- read_excel("datafix.xlsx")
View(Data)
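
A more portable variant (a sketch, not part of the original run; data_dir is a hypothetical name) keeps the machine-specific part of the path in one place:

# Sketch: isolate the machine-specific folder so only one line needs editing
data_dir <- "D:/STIS/4SE/4. Data Mining/Tugas/FIX BENERAN YAK GUA GEBUG LUH"
Data <- read_excel(file.path(data_dir, "datafix.xlsx"))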

Loading the Libraries

library(dplyr)
library(tidyr)
library(data.table)
library(DT)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(e1071)
library(rmarkdown)
library(ggplot2)
library(pROC)
library(smotefamily)
dataset <- as.data.frame(Data)
str(dataset)
## 'data.frame':    2181 obs. of  7 variables:
##  $ BBL        : chr  "BBLR" "BBLR" "BBLR" "BBLR" ...
##  $ PEND       : chr  "Secondary" "Primary" "Primary" "Secondary" ...
##  $ KLASIFIKASI: chr  "Rural" "Rural" "Rural" "Rural" ...
##  $ SK         : chr  "Poorer" "Poorer" "Poorer" "Poorer" ...
##  $ ANTE       : num  9 3 3 9 0 5 1 6 9 8 ...
##  $ UMUR       : num  18 16 23 15 22 21 17 20 27 25 ...
##  $ M19        : chr  "BBLR" "BBLR" "BBLR" "BBLR" ...
# Convert the categorical columns (BBL, PEND, KLASIFIKASI, SK) and M19 to factors
for (i in 1:4) {
  dataset[, i] <- as.factor(dataset[, i])
}
dataset$M19 <- as.factor(dataset$M19)
str(dataset)
## 'data.frame':    2181 obs. of  7 variables:
##  $ BBL        : Factor w/ 2 levels "BBLN","BBLR": 2 2 2 2 2 2 2 2 2 2 ...
##  $ PEND       : Factor w/ 3 levels "Higher","Primary",..: 3 2 2 3 1 3 2 3 3 3 ...
##  $ KLASIFIKASI: Factor w/ 2 levels "Rural","Urban": 1 1 1 1 1 1 1 2 1 2 ...
##  $ SK         : Factor w/ 3 levels "Middle","Poorer",..: 2 2 2 2 2 2 2 2 3 2 ...
##  $ ANTE       : num  9 3 3 9 0 5 1 6 9 8 ...
##  $ UMUR       : num  18 16 23 15 22 21 17 20 27 25 ...
##  $ M19        : Factor w/ 3 levels "BBLL","BBLN",..: 3 3 3 3 3 3 3 3 3 3 ...
View(dataset)
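
The loop above can also be written in one step with dplyr (loaded earlier); an equivalent sketch:

# Sketch: convert all five categorical columns to factors at once
dataset <- dataset %>%
  mutate(across(c(BBL, PEND, KLASIFIKASI, SK, M19), as.factor))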
propBBL <- table(dataset$BBL)
propBBL
## 
## BBLN BBLR 
## 1454  727
perc <- round(prop.table(propBBL), digits = 2)
barplot(perc,
        main = "Birth Weight",
        xlab = "Label",
        ylab = "Proportion",
        col = "brown",
        density = 10,
        angle = 45,
        names.arg = c("BBLN", "BBLR"))

Defining the Model

attach(dataset)  # optional here: every model call below passes data = explicitly
model <- BBL ~ PEND + KLASIFIKASI + SK + ANTE + UMUR

Splitting the Data into 70% Training and 30% Test Sets, with Cross-Validation

set.seed(100)
# Draw 70% of the row indices for training; the remainder is the test set
sampling <- sample(1:nrow(dataset), 0.7 * nrow(dataset))
train_set <- dataset[sampling, ]
test_set <- dataset[-sampling, ]

myControl <- trainControl(
  method = "cv",    # 10-fold cross-validation
  number = 10,
  verboseIter = TRUE
)
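
Note that sample() draws rows without regard to class, so the 2:1 BBLN:BBLR ratio is only approximately preserved in each half. A stratified alternative (a sketch; train_strat and test_strat are hypothetical names) uses caret's createDataPartition(), which samples within each class:

# Sketch: stratified 70:30 split that preserves the class ratio
set.seed(100)
idx <- createDataPartition(dataset$BBL, p = 0.7, list = FALSE)
train_strat <- dataset[idx, ]
test_strat  <- dataset[-idx, ]
round(prop.table(table(train_strat$BBL)), 2)  # mirrors the full-data proportions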

Decision Tree with the 70:30 Split

dtree <- rpart(model, data = train_set, method = "class")
# extra = 106: show P(second class) and the percentage of observations per node
rpart.plot(dtree, extra = 106)

pred_dtree <- predict(dtree, newdata = test_set, type = "class")
confusionMatrix(pred_dtree,test_set$BBL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLR
##       BBLN  399  153
##       BBLR   38   65
##                                          
##                Accuracy : 0.7084         
##                  95% CI : (0.6719, 0.743)
##     No Information Rate : 0.6672         
##     P-Value [Acc > NIR] : 0.01332        
##                                          
##                   Kappa : 0.2434         
##                                          
##  Mcnemar's Test P-Value : < 2e-16        
##                                          
##             Sensitivity : 0.9130         
##             Specificity : 0.2982         
##          Pos Pred Value : 0.7228         
##          Neg Pred Value : 0.6311         
##              Prevalence : 0.6672         
##          Detection Rate : 0.6092         
##    Detection Prevalence : 0.8427         
##       Balanced Accuracy : 0.6056         
##                                          
##        'Positive' Class : BBLN           
## 
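
The confusion matrix shows high sensitivity for BBLN but poor specificity for BBLR. A threshold-free view is the ROC curve; a sketch using pROC (loaded above), with class probabilities from type = "prob":

# Sketch: ROC curve and AUC for the decision tree on the test set
prob_dtree <- predict(dtree, newdata = test_set, type = "prob")[, "BBLR"]
roc_dtree <- roc(test_set$BBL, prob_dtree, levels = c("BBLN", "BBLR"))
auc(roc_dtree)
plot(roc_dtree, main = "Decision tree ROC")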

Decision Tree with Cross-Validation

dtree_cv <- train(model, data=train_set,
                  method='rpart',
                  trControl=myControl)
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00786 on full training set
confusionMatrix(predict(dtree_cv, newdata = test_set) %>% as.factor(), test_set$BBL %>% as.factor())
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLR
##       BBLN  399  153
##       BBLR   38   65
##                                          
##                Accuracy : 0.7084         
##                  95% CI : (0.6719, 0.743)
##     No Information Rate : 0.6672         
##     P-Value [Acc > NIR] : 0.01332        
##                                          
##                   Kappa : 0.2434         
##                                          
##  Mcnemar's Test P-Value : < 2e-16        
##                                          
##             Sensitivity : 0.9130         
##             Specificity : 0.2982         
##          Pos Pred Value : 0.7228         
##          Neg Pred Value : 0.6311         
##              Prevalence : 0.6672         
##          Detection Rate : 0.6092         
##    Detection Prevalence : 0.8427         
##       Balanced Accuracy : 0.6056         
##                                          
##        'Positive' Class : BBLN           
## 
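
Before moving on, the tuning results behind the chosen cp are worth a look; a sketch:

# Sketch: resampled accuracy for each complexity parameter caret tried
print(dtree_cv$results)
plot(dtree_cv)  # accuracy vs. cp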

Random Forest with the 70:30 Split

rf <- randomForest(model, data = train_set)
print(rf)
## 
## Call:
##  randomForest(formula = model, data = train_set) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 33.22%
## Confusion matrix:
##      BBLN BBLR class.error
## BBLN  879  138   0.1356932
## BBLR  369  140   0.7249509
pred_rf <- predict(rf, newdata = test_set)
confusionMatrix(pred_rf %>% as.factor(), test_set$BBL %>% as.factor())
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLR
##       BBLN  375  142
##       BBLR   62   76
##                                           
##                Accuracy : 0.6885          
##                  95% CI : (0.6515, 0.7239)
##     No Information Rate : 0.6672          
##     P-Value [Acc > NIR] : 0.1312          
##                                           
##                   Kappa : 0.2277          
##                                           
##  Mcnemar's Test P-Value : 3.182e-08       
##                                           
##             Sensitivity : 0.8581          
##             Specificity : 0.3486          
##          Pos Pred Value : 0.7253          
##          Neg Pred Value : 0.5507          
##              Prevalence : 0.6672          
##          Detection Rate : 0.5725          
##    Detection Prevalence : 0.7893          
##       Balanced Accuracy : 0.6034          
##                                           
##        'Positive' Class : BBLN            
## 
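
A natural follow-up is to ask which predictors the forest leans on; a sketch using randomForest's built-in importance measures:

# Sketch: variable importance from the fitted forest
importance(rf)
varImpPlot(rf, main = "Mean decrease in Gini")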

Random Forest with Cross-Validation

rf_cv <- train(model, data=train_set,
               method='rf',
               trControl=myControl)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
confusionMatrix(predict(rf_cv, newdata = test_set) %>% as.factor(), test_set$BBL %>% as.factor())
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLR
##       BBLN  398  153
##       BBLR   39   65
##                                           
##                Accuracy : 0.7069          
##                  95% CI : (0.6704, 0.7415)
##     No Information Rate : 0.6672          
##     P-Value [Acc > NIR] : 0.01651         
##                                           
##                   Kappa : 0.2404          
##                                           
##  Mcnemar's Test P-Value : 3.49e-16        
##                                           
##             Sensitivity : 0.9108          
##             Specificity : 0.2982          
##          Pos Pred Value : 0.7223          
##          Neg Pred Value : 0.6250          
##              Prevalence : 0.6672          
##          Detection Rate : 0.6076          
##    Detection Prevalence : 0.8412          
##       Balanced Accuracy : 0.6045          
##                                           
##        'Positive' Class : BBLN            
## 
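
caret tried mtry = 2, 4, and 7 by default and settled on 2. An explicit grid narrows the search; a sketch (rf_grid and rf_tuned are hypothetical names):

# Sketch: tune mtry over an explicit grid instead of caret's default values
rf_grid <- expand.grid(mtry = 2:5)
rf_tuned <- train(model, data = train_set, method = "rf",
                  trControl = myControl, tuneGrid = rf_grid)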

Naive Bayes with the 70:30 Split

nb <- naiveBayes(model, data = train_set)
print(nb)
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      BBLN      BBLR 
## 0.6664482 0.3335518 
## 
## Conditional probabilities:
##       PEND
## Y          Higher    Primary  Secondary
##   BBLN 0.19370698 0.17404130 0.63225172
##   BBLR 0.07662083 0.32809430 0.59528487
## 
##       KLASIFIKASI
## Y          Rural     Urban
##   BBLN 0.5368732 0.4631268
##   BBLR 0.5717092 0.4282908
## 
##       SK
## Y         Middle    Poorer    Richer
##   BBLN 0.2104228 0.4650934 0.3244838
##   BBLR 0.1984283 0.5992141 0.2023576
## 
##       ANTE
## Y          [,1]     [,2]
##   BBLN 5.590954 3.221153
##   BBLR 5.137525 3.361539
## 
##       UMUR
## Y          [,1]     [,2]
##   BBLN 22.11701 4.287305
##   BBLR 21.50884 4.488658
pred_nb <- predict(nb, newdata = test_set)
confusionMatrix(pred_nb, test_set$BBL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLR
##       BBLN  393  151
##       BBLR   44   67
##                                           
##                Accuracy : 0.7023          
##                  95% CI : (0.6657, 0.7371)
##     No Information Rate : 0.6672          
##     P-Value [Acc > NIR] : 0.03019         
##                                           
##                   Kappa : 0.2356          
##                                           
##  Mcnemar's Test P-Value : 3.179e-14       
##                                           
##             Sensitivity : 0.8993          
##             Specificity : 0.3073          
##          Pos Pred Value : 0.7224          
##          Neg Pred Value : 0.6036          
##              Prevalence : 0.6672          
##          Detection Rate : 0.6000          
##    Detection Prevalence : 0.8305          
##       Balanced Accuracy : 0.6033          
##                                           
##        'Positive' Class : BBLN            
## 
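
Some of the conditional probabilities above are small; with rarer factor levels they could hit zero and veto a class outright. Laplace smoothing guards against this; a sketch (nb_smooth is a hypothetical name):

# Sketch: Naive Bayes with Laplace smoothing of the factor-level counts
nb_smooth <- naiveBayes(model, data = train_set, laplace = 1)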

Naive Bayes with Cross-Validation

nb_cv <- train(model, data=train_set,
               method='naive_bayes',
               trControl=myControl)
## Aggregating results
## Selecting tuning parameters
## Fitting laplace = 0, usekernel = TRUE, adjust = 1 on full training set
print(nb_cv)
## Naive Bayes 
## 
## 1526 samples
##    5 predictor
##    2 classes: 'BBLN', 'BBLR' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1373, 1373, 1373, 1375, 1374, 1373, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa     
##   FALSE      0.6710889  0.17381701
##    TRUE      0.6723441  0.04392668
## 
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = TRUE
##  and adjust = 1.
confusionMatrix(predict(nb_cv, newdata = test_set) %>% as.factor(), test_set$BBL %>% as.factor())
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction BBLN BBLR
##       BBLN  426  204
##       BBLR   11   14
##                                           
##                Accuracy : 0.6718          
##                  95% CI : (0.6343, 0.7076)
##     No Information Rate : 0.6672          
##     P-Value [Acc > NIR] : 0.4196          
##                                           
##                   Kappa : 0.0502          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.97483         
##             Specificity : 0.06422         
##          Pos Pred Value : 0.67619         
##          Neg Pred Value : 0.56000         
##              Prevalence : 0.66718         
##          Detection Rate : 0.65038         
##    Detection Prevalence : 0.96183         
##       Balanced Accuracy : 0.51952         
##                                           
##        'Positive' Class : BBLN            
##
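
The kernel variant won on raw accuracy, but its Kappa (0.044) and test-set specificity (about 6%) make it nearly useless for detecting BBLR, so accuracy alone is misleading on these imbalanced classes. A side-by-side look at the resampled metrics makes this visible; a sketch with caret::resamples() (strictly, the three models should share identical folds, e.g. via a common index in trainControl):

# Sketch: compare the cross-validated models on resampled Accuracy and Kappa
comp <- resamples(list(tree = dtree_cv, rf = rf_cv, nb = nb_cv))
summary(comp)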