Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
Import Dataset
library(readxl)
## Warning: package 'readxl' was built under R version 4.0.5
# Directory that holds the workbook. The full path is built explicitly instead
# of calling setwd(), so loading the data does not change the working directory
# for the rest of the session.
data_dir <- "D:/STIS/4SE/4. Data Mining/Tugas/FIX BENERAN YAK GUA GEBUG LUH"
Data <- read_excel(file.path(data_dir, "datafix.xlsx"))
# View() opens an interactive viewer; only call it in an interactive session so
# rendering or batch execution is not interrupted.
if (interactive()) {
  View(Data)
}
Load Library
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
library(data.table)
## Warning: package 'data.table' was built under R version 4.0.5
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(DT)
## Warning: package 'DT' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(rpart)
## Warning: package 'rpart' was built under R version 4.0.5
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 4.0.5
library(ggplot2)
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.5
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.0.5
# Coerce the imported table to a plain data.frame, then show its structure.
dataset <- as.data.frame(Data)
str(object = dataset)
## 'data.frame': 2181 obs. of 7 variables:
## $ BBL : chr "BBLR" "BBLR" "BBLR" "BBLR" ...
## $ PEND : chr "Secondary" "Primary" "Primary" "Secondary" ...
## $ KLASIFIKASI: chr "Rural" "Rural" "Rural" "Rural" ...
## $ SK : chr "Poorer" "Poorer" "Poorer" "Poorer" ...
## $ ANTE : num 9 3 3 9 0 5 1 6 9 8 ...
## $ UMUR : num 18 16 23 15 22 21 17 20 27 25 ...
## $ M19 : chr "BBLR" "BBLR" "BBLR" "BBLR" ...
# View(dataset)  # interactive viewer call, left disabled
# Print the structure of `dataset` (column names and types).
str(dataset)
## 'data.frame': 2181 obs. of 7 variables:
## $ BBL : chr "BBLR" "BBLR" "BBLR" "BBLR" ...
## $ PEND : chr "Secondary" "Primary" "Primary" "Secondary" ...
## $ KLASIFIKASI: chr "Rural" "Rural" "Rural" "Rural" ...
## $ SK : chr "Poorer" "Poorer" "Poorer" "Poorer" ...
## $ ANTE : num 9 3 3 9 0 5 1 6 9 8 ...
## $ UMUR : num 18 16 23 15 22 21 17 20 27 25 ...
## $ M19 : chr "BBLR" "BBLR" "BBLR" "BBLR" ...
# Convert the categorical columns to factors. Columns are selected by name
# (the first four columns plus M19 in the structure printed above) rather than
# by position, so the conversion does not depend on the column order of the
# imported sheet.
factor_cols <- c("BBL", "PEND", "KLASIFIKASI", "SK", "M19")
dataset[factor_cols] <- lapply(dataset[factor_cols], as.factor)
str(dataset)
## 'data.frame': 2181 obs. of 7 variables:
## $ BBL : Factor w/ 2 levels "BBLN","BBLR": 2 2 2 2 2 2 2 2 2 2 ...
## $ PEND : Factor w/ 3 levels "Higher","Primary",..: 3 2 2 3 1 3 2 3 3 3 ...
## $ KLASIFIKASI: Factor w/ 2 levels "Rural","Urban": 1 1 1 1 1 1 1 2 1 2 ...
## $ SK : Factor w/ 3 levels "Middle","Poorer",..: 2 2 2 2 2 2 2 2 3 2 ...
## $ ANTE : num 9 3 3 9 0 5 1 6 9 8 ...
## $ UMUR : num 18 16 23 15 22 21 17 20 27 25 ...
## $ M19 : Factor w/ 3 levels "BBLL","BBLN",..: 3 3 3 3 3 3 3 3 3 3 ...
# View() opens an interactive viewer; skip it outside interactive sessions so
# rendering or batch execution is not interrupted.
if (interactive()) {
  View(dataset)
}
# Frequency table of the response classes.
propBBL <- table(dataset$BBL)
propBBL
##
## BBLN BBLR
## 1454 727
# Class proportions of BBL, rounded to two decimals.
perc <- round(prop.table(propBBL), digits = 2)
# Bar labels are taken from the table itself so each label always matches the
# bar it sits under, whatever the factor level order is.
barplot(
  perc,
  main = "Berat Bayi Lahir",
  xlab = "Label",
  ylab = "proporsi",
  col = "brown",
  density = 10,
  angle = 45,
  names.arg = names(perc)
)
Membuat Model
# Model formula: BBL is the response; PEND, KLASIFIKASI, SK, ANTE and UMUR are
# the predictors. The fitting calls below pass `data =` explicitly, so the
# columns are resolved from that data frame and attach(dataset) is not needed
# (attaching would leave a stale copy of the columns on the search path).
model <- BBL ~ PEND + KLASIFIKASI + SK + ANTE + UMUR
Membuat Splitting Data Train dan Data Test dengan 70:30 dan Cross Validation
# Reproducible 70:30 hold-out split of `dataset`.
set.seed(100)
n_obs <- nrow(dataset)
# sample.int() draws the row indices directly (same draw as sampling from
# 1:n_obs, without the 1:0 pitfall on empty data). floor() makes explicit the
# truncation that a fractional sample size would otherwise get implicitly.
sampling <- sample.int(n_obs, floor(0.7 * n_obs))
train_set <- dataset[sampling, ]
# A logical mask is used for the complement: negative indexing with an empty
# index vector would return zero rows instead of all rows.
test_set <- dataset[!(seq_len(n_obs) %in% sampling), ]
# Resampling scheme for caret::train(): 10-fold cross-validation with
# per-fold progress messages.
myControl <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = TRUE
)
Decision Tree Splitting 70:30
# Fit a classification tree on the training split and draw it.
dtree <- rpart(formula = model, data = train_set, method = "class")
rpart.plot(dtree, extra = 106)
# Predict class labels for the hold-out rows and tabulate them against the
# observed BBL classes.
pred_dtree <- predict(dtree, newdata = test_set, type = "class")
confusionMatrix(data = pred_dtree, reference = test_set$BBL)
## Confusion Matrix and Statistics
##
## Reference
## Prediction BBLN BBLR
## BBLN 399 153
## BBLR 38 65
##
## Accuracy : 0.7084
## 95% CI : (0.6719, 0.743)
## No Information Rate : 0.6672
## P-Value [Acc > NIR] : 0.01332
##
## Kappa : 0.2434
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9130
## Specificity : 0.2982
## Pos Pred Value : 0.7228
## Neg Pred Value : 0.6311
## Prevalence : 0.6672
## Detection Rate : 0.6092
## Detection Prevalence : 0.8427
## Balanced Accuracy : 0.6056
##
## 'Positive' Class : BBLN
##
Decision Tree Cross Validation
# Decision tree tuned through caret with the resampling scheme in myControl.
dtree_cv <- train(
  model,
  data = train_set,
  method = "rpart",
  trControl = myControl
)
## + Fold01: cp=0.003274
## - Fold01: cp=0.003274
## + Fold02: cp=0.003274
## - Fold02: cp=0.003274
## + Fold03: cp=0.003274
## - Fold03: cp=0.003274
## + Fold04: cp=0.003274
## - Fold04: cp=0.003274
## + Fold05: cp=0.003274
## - Fold05: cp=0.003274
## + Fold06: cp=0.003274
## - Fold06: cp=0.003274
## + Fold07: cp=0.003274
## - Fold07: cp=0.003274
## + Fold08: cp=0.003274
## - Fold08: cp=0.003274
## + Fold09: cp=0.003274
## - Fold09: cp=0.003274
## + Fold10: cp=0.003274
## - Fold10: cp=0.003274
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00786 on full training set
# Hold-out evaluation of the cross-validated tree: predicted vs. observed BBL.
confusionMatrix(
  as.factor(predict(dtree_cv, newdata = test_set)),
  as.factor(test_set$BBL)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction BBLN BBLR
## BBLN 399 153
## BBLR 38 65
##
## Accuracy : 0.7084
## 95% CI : (0.6719, 0.743)
## No Information Rate : 0.6672
## P-Value [Acc > NIR] : 0.01332
##
## Kappa : 0.2434
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9130
## Specificity : 0.2982
## Pos Pred Value : 0.7228
## Neg Pred Value : 0.6311
## Prevalence : 0.6672
## Detection Rate : 0.6092
## Detection Prevalence : 0.8427
## Balanced Accuracy : 0.6056
##
## 'Positive' Class : BBLN
##
#Random Forest dengan Splitting 70:30
# Random forest fitted on the training split with package defaults; the printed
# summary includes the out-of-bag error estimate.
rf <- randomForest(formula = model, data = train_set)
print(rf)
##
## Call:
## randomForest(formula = model, data = train_set)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 33.22%
## Confusion matrix:
## BBLN BBLR class.error
## BBLN 879 138 0.1356932
## BBLR 369 140 0.7249509
# Predict the hold-out rows with the random forest and compare with observed BBL.
pred_rf <- predict(rf, newdata = test_set)
confusionMatrix(as.factor(pred_rf), as.factor(test_set$BBL))
## Confusion Matrix and Statistics
##
## Reference
## Prediction BBLN BBLR
## BBLN 375 142
## BBLR 62 76
##
## Accuracy : 0.6885
## 95% CI : (0.6515, 0.7239)
## No Information Rate : 0.6672
## P-Value [Acc > NIR] : 0.1312
##
## Kappa : 0.2277
##
## Mcnemar's Test P-Value : 3.182e-08
##
## Sensitivity : 0.8581
## Specificity : 0.3486
## Pos Pred Value : 0.7253
## Neg Pred Value : 0.5507
## Prevalence : 0.6672
## Detection Rate : 0.5725
## Detection Prevalence : 0.7893
## Balanced Accuracy : 0.6034
##
## 'Positive' Class : BBLN
##
#Random Forest dengan Cross Validation
# Random forest tuned through caret with the resampling scheme in myControl.
rf_cv <- train(
  model,
  data = train_set,
  method = "rf",
  trControl = myControl
)
## + Fold01: mtry=2
## - Fold01: mtry=2
## + Fold01: mtry=4
## - Fold01: mtry=4
## + Fold01: mtry=7
## - Fold01: mtry=7
## + Fold02: mtry=2
## - Fold02: mtry=2
## + Fold02: mtry=4
## - Fold02: mtry=4
## + Fold02: mtry=7
## - Fold02: mtry=7
## + Fold03: mtry=2
## - Fold03: mtry=2
## + Fold03: mtry=4
## - Fold03: mtry=4
## + Fold03: mtry=7
## - Fold03: mtry=7
## + Fold04: mtry=2
## - Fold04: mtry=2
## + Fold04: mtry=4
## - Fold04: mtry=4
## + Fold04: mtry=7
## - Fold04: mtry=7
## + Fold05: mtry=2
## - Fold05: mtry=2
## + Fold05: mtry=4
## - Fold05: mtry=4
## + Fold05: mtry=7
## - Fold05: mtry=7
## + Fold06: mtry=2
## - Fold06: mtry=2
## + Fold06: mtry=4
## - Fold06: mtry=4
## + Fold06: mtry=7
## - Fold06: mtry=7
## + Fold07: mtry=2
## - Fold07: mtry=2
## + Fold07: mtry=4
## - Fold07: mtry=4
## + Fold07: mtry=7
## - Fold07: mtry=7
## + Fold08: mtry=2
## - Fold08: mtry=2
## + Fold08: mtry=4
## - Fold08: mtry=4
## + Fold08: mtry=7
## - Fold08: mtry=7
## + Fold09: mtry=2
## - Fold09: mtry=2
## + Fold09: mtry=4
## - Fold09: mtry=4
## + Fold09: mtry=7
## - Fold09: mtry=7
## + Fold10: mtry=2
## - Fold10: mtry=2
## + Fold10: mtry=4
## - Fold10: mtry=4
## + Fold10: mtry=7
## - Fold10: mtry=7
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
# Hold-out evaluation of the cross-validated random forest.
confusionMatrix(
  as.factor(predict(rf_cv, newdata = test_set)),
  as.factor(test_set$BBL)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction BBLN BBLR
## BBLN 398 153
## BBLR 39 65
##
## Accuracy : 0.7069
## 95% CI : (0.6704, 0.7415)
## No Information Rate : 0.6672
## P-Value [Acc > NIR] : 0.01651
##
## Kappa : 0.2404
##
## Mcnemar's Test P-Value : 3.49e-16
##
## Sensitivity : 0.9108
## Specificity : 0.2982
## Pos Pred Value : 0.7223
## Neg Pred Value : 0.6250
## Prevalence : 0.6672
## Detection Rate : 0.6076
## Detection Prevalence : 0.8412
## Balanced Accuracy : 0.6045
##
## 'Positive' Class : BBLN
##
Naive Bayes dengan Splitting 70:30
# Naive Bayes classifier fitted on the training split; printing it shows the
# a-priori class probabilities and the per-predictor conditional tables.
nb <- naiveBayes(formula = model, data = train_set)
print(nb)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## BBLN BBLR
## 0.6664482 0.3335518
##
## Conditional probabilities:
## PEND
## Y Higher Primary Secondary
## BBLN 0.19370698 0.17404130 0.63225172
## BBLR 0.07662083 0.32809430 0.59528487
##
## KLASIFIKASI
## Y Rural Urban
## BBLN 0.5368732 0.4631268
## BBLR 0.5717092 0.4282908
##
## SK
## Y Middle Poorer Richer
## BBLN 0.2104228 0.4650934 0.3244838
## BBLR 0.1984283 0.5992141 0.2023576
##
## ANTE
## Y [,1] [,2]
## BBLN 5.590954 3.221153
## BBLR 5.137525 3.361539
##
## UMUR
## Y [,1] [,2]
## BBLN 22.11701 4.287305
## BBLR 21.50884 4.488658
# Predict the hold-out rows with naive Bayes and compare with observed BBL.
pred_nb <- predict(nb, newdata = test_set)
confusionMatrix(data = pred_nb, reference = test_set$BBL)
## Confusion Matrix and Statistics
##
## Reference
## Prediction BBLN BBLR
## BBLN 393 151
## BBLR 44 67
##
## Accuracy : 0.7023
## 95% CI : (0.6657, 0.7371)
## No Information Rate : 0.6672
## P-Value [Acc > NIR] : 0.03019
##
## Kappa : 0.2356
##
## Mcnemar's Test P-Value : 3.179e-14
##
## Sensitivity : 0.8993
## Specificity : 0.3073
## Pos Pred Value : 0.7224
## Neg Pred Value : 0.6036
## Prevalence : 0.6672
## Detection Rate : 0.6000
## Detection Prevalence : 0.8305
## Balanced Accuracy : 0.6033
##
## 'Positive' Class : BBLN
##
Naive Bayes dengan Cross Validation
# Naive Bayes tuned through caret with the resampling scheme in myControl.
nb_cv <- train(
  model,
  data = train_set,
  method = "naive_bayes",
  trControl = myControl
)
## + Fold01: usekernel= TRUE, laplace=0, adjust=1
## - Fold01: usekernel= TRUE, laplace=0, adjust=1
## + Fold01: usekernel=FALSE, laplace=0, adjust=1
## - Fold01: usekernel=FALSE, laplace=0, adjust=1
## + Fold02: usekernel= TRUE, laplace=0, adjust=1
## - Fold02: usekernel= TRUE, laplace=0, adjust=1
## + Fold02: usekernel=FALSE, laplace=0, adjust=1
## - Fold02: usekernel=FALSE, laplace=0, adjust=1
## + Fold03: usekernel= TRUE, laplace=0, adjust=1
## - Fold03: usekernel= TRUE, laplace=0, adjust=1
## + Fold03: usekernel=FALSE, laplace=0, adjust=1
## - Fold03: usekernel=FALSE, laplace=0, adjust=1
## + Fold04: usekernel= TRUE, laplace=0, adjust=1
## - Fold04: usekernel= TRUE, laplace=0, adjust=1
## + Fold04: usekernel=FALSE, laplace=0, adjust=1
## - Fold04: usekernel=FALSE, laplace=0, adjust=1
## + Fold05: usekernel= TRUE, laplace=0, adjust=1
## - Fold05: usekernel= TRUE, laplace=0, adjust=1
## + Fold05: usekernel=FALSE, laplace=0, adjust=1
## - Fold05: usekernel=FALSE, laplace=0, adjust=1
## + Fold06: usekernel= TRUE, laplace=0, adjust=1
## - Fold06: usekernel= TRUE, laplace=0, adjust=1
## + Fold06: usekernel=FALSE, laplace=0, adjust=1
## - Fold06: usekernel=FALSE, laplace=0, adjust=1
## + Fold07: usekernel= TRUE, laplace=0, adjust=1
## - Fold07: usekernel= TRUE, laplace=0, adjust=1
## + Fold07: usekernel=FALSE, laplace=0, adjust=1
## - Fold07: usekernel=FALSE, laplace=0, adjust=1
## + Fold08: usekernel= TRUE, laplace=0, adjust=1
## - Fold08: usekernel= TRUE, laplace=0, adjust=1
## + Fold08: usekernel=FALSE, laplace=0, adjust=1
## - Fold08: usekernel=FALSE, laplace=0, adjust=1
## + Fold09: usekernel= TRUE, laplace=0, adjust=1
## - Fold09: usekernel= TRUE, laplace=0, adjust=1
## + Fold09: usekernel=FALSE, laplace=0, adjust=1
## - Fold09: usekernel=FALSE, laplace=0, adjust=1
## + Fold10: usekernel= TRUE, laplace=0, adjust=1
## - Fold10: usekernel= TRUE, laplace=0, adjust=1
## + Fold10: usekernel=FALSE, laplace=0, adjust=1
## - Fold10: usekernel=FALSE, laplace=0, adjust=1
## Aggregating results
## Selecting tuning parameters
## Fitting laplace = 0, usekernel = TRUE, adjust = 1 on full training set
# Show the resampling summary of the cross-validated naive Bayes model.
print(nb_cv)
## Naive Bayes
##
## 1526 samples
## 5 predictor
## 2 classes: 'BBLN', 'BBLR'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1373, 1373, 1373, 1375, 1374, 1373, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.6710889 0.17381701
## TRUE 0.6723441 0.04392668
##
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = TRUE
## and adjust = 1.
# Hold-out evaluation of the cross-validated naive Bayes model.
confusionMatrix(
  as.factor(predict(nb_cv, newdata = test_set)),
  as.factor(test_set$BBL)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction BBLN BBLR
## BBLN 426 204
## BBLR 11 14
##
## Accuracy : 0.6718
## 95% CI : (0.6343, 0.7076)
## No Information Rate : 0.6672
## P-Value [Acc > NIR] : 0.4196
##
## Kappa : 0.0502
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.97483
## Specificity : 0.06422
## Pos Pred Value : 0.67619
## Neg Pred Value : 0.56000
## Prevalence : 0.66718
## Detection Rate : 0.65038
## Detection Prevalence : 0.96183
## Balanced Accuracy : 0.51952
##
## 'Positive' Class : BBLN
##