Import Packages
library(readxl)
library(lattice)
library(smotefamily)
library(caret)
## Loading required package: ggplot2
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(klaR)
## Loading required package: MASS
library(naivebayes)
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
library(ggplot2)
Import Data
s <- 1999
data_xls <- readxl::read_excel("/Users/User/Documents/RFiles/data_STA1581.xlsx")
# data_csv <- read.csv("data_STA1581.csv", sep = ";")
rbind(head(data_xls, 3), tail(data_xls, 3))
## # A tibble: 6 × 6
## AP XRay Stage Grade Age Y
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 128 0 1 1 54 1
## 2 46 0 1 1 55 0
## 3 165 0 0 1 53 1
## 4 179 1 0 1 69 1
## 5 56 1 1 0 69 0
## 6 136 1 1 0 67 1
sum(is.na(data_xls)); sum(is.null(data_xls))  # is.null() tests the whole object, so its sum is 0/1, not a count
## [1] 0
## [1] 0
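For a per-column missing-value count, a base-R sketch:
colSums(is.na(data_xls))  # NA count for each of the six columns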
set.seed(s)
r <- runif(nrow(data_xls))
data <- cbind(data_xls, r)
data <- data[order(data$r), c('AP', 'XRay', 'Stage', 'Grade', 'Age', 'Y')]
data <- data[1:200,]
head(data, 3)
## AP XRay Stage Grade Age Y
## 144 154 0 0 1 61 0
## 189 53 0 0 1 55 0
## 145 92 0 0 1 68 0
tail(data, 3)
## AP XRay Stage Grade Age Y
## 174 82 1 0 0 53 0
## 116 159 1 1 0 54 1
## 129 186 0 1 0 69 1
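Ordering by a runif() draw is one way to shuffle before keeping the first 200 rows; an equivalent sketch with sample() (illustrative only; it consumes the RNG differently, so the same seed yields a different subset, and the rest of the analysis uses the runif() route above):
set.seed(s)
data_alt <- data_xls[sample(nrow(data_xls), 200), ]  # hypothetical alternative, not used below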
Split Data
set.seed(s)
sample <- sample(c(TRUE, FALSE), nrow(data), replace=TRUE, prob=c(0.8,0.2))
train <- data[sample, ]; test <- data[!sample, ]
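The Bernoulli draw above gives only an approximately 80/20 split and ignores class balance; a stratified sketch with caret::createDataPartition (the _strat objects are illustrative and not used below):
set.seed(s)
idx <- createDataPartition(as.factor(data$Y), p = 0.8, list = FALSE)
train_strat <- data[idx, ]   # class proportions of Y preserved in both parts
test_strat  <- data[-idx, ]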
barchart(as.factor(train$Y), col='maroon')

SMOTE the Training Data
smote_train <- SMOTE(train[,-6], train$Y)  # smotefamily::SMOTE with default K = 5
newtrain <- smote_train$data               # balanced data; the response column is named 'class'
barchart(newtrain$class, col = 'navy')

newtrain[,-6] <- round(newtrain[,-6])  # SMOTE interpolates, so round synthetic values (binary predictors back to {0,1})
nrow(newtrain)
## [1] 198
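A quick check of the class balance after oversampling:
table(newtrain$class)  # counts per class in the SMOTEd training set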
Logistic Regression
log_train <- newtrain
log_test <- test
model1_log <- glm(as.factor(class)~., data = log_train, family = 'binomial')
summary(model1_log)
##
## Call:
## glm(formula = as.factor(class) ~ ., family = "binomial", data = log_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.447254 2.178821 -1.123 0.26135
## AP 0.055138 0.008195 6.728 1.71e-11 ***
## XRay 3.020336 0.723736 4.173 3.00e-05 ***
## Stage 2.295046 0.711425 3.226 0.00126 **
## Grade 1.243346 0.429910 2.892 0.00383 **
## Age -0.104864 0.038797 -2.703 0.00687 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 274.47 on 197 degrees of freedom
## Residual deviance: 144.01 on 192 degrees of freedom
## AIC: 156.01
##
## Number of Fisher Scoring iterations: 6
exp(model1_log$coefficients)
## (Intercept) AP XRay Stage Grade Age
## 0.08653086 1.05668610 20.49816780 9.92489557 3.46719684 0.90044662
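Each exponentiated coefficient is an odds ratio: a positive XRay multiplies the odds of Y = 1 by about 20.5 and each extra unit of AP by about 1.06, holding the other predictors fixed. Profile-likelihood confidence intervals for the odds ratios can be obtained with (a sketch; confint() profiles the fitted glm):
exp(confint(model1_log))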
Training Confusion Matrix
log_pred_train <- round(predict(model1_log, newdata = log_train, type = 'response'))
# Note: confusionMatrix(data, reference) expects predictions first and truth second; with the
# arguments swapped as here (and throughout), accuracy and kappa are unaffected, but the
# reported Sensitivity/PPV and Specificity/NPV pairs trade places.
confusionMatrix(as.factor(log_train$class), as.factor(log_pred_train), positive = "1", mode = "everything")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 79 19
## 1 20 80
##
## Accuracy : 0.803
## 95% CI : (0.7407, 0.856)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6061
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8081
## Specificity : 0.7980
## Pos Pred Value : 0.8000
## Neg Pred Value : 0.8061
## Precision : 0.8000
## Recall : 0.8081
## F1 : 0.8040
## Prevalence : 0.5000
## Detection Rate : 0.4040
## Detection Prevalence : 0.5051
## Balanced Accuracy : 0.8030
##
## 'Positive' Class : 1
##
Test Confusion Matrix
log_pred_test <- round(predict(model1_log, newdata = log_test, type = 'response'))
confusionMatrix(as.factor(log_test$Y), as.factor(log_pred_test), positive = "1", mode = "everything")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16 1
## 1 5 29
##
## Accuracy : 0.8824
## 95% CI : (0.7613, 0.9556)
## No Information Rate : 0.5882
## P-Value [Acc > NIR] : 4.56e-06
##
## Kappa : 0.75
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.9667
## Specificity : 0.7619
## Pos Pred Value : 0.8529
## Neg Pred Value : 0.9412
## Precision : 0.8529
## Recall : 0.9667
## F1 : 0.9062
## Prevalence : 0.5882
## Detection Rate : 0.5686
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.8643
##
## 'Positive' Class : 1
##
ROC
log_roc <- roc(log_test$Y, log_pred_test, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

log_roc$auc
## Area under the curve: 0.8971
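The curve above is built from the rounded 0/1 predictions, so it has a single operating point; a sketch using the raw predicted probabilities gives the full ROC:
log_prob_test <- predict(model1_log, newdata = log_test, type = 'response')
log_roc_prob <- roc(log_test$Y, log_prob_test, plot = TRUE)
log_roc_prob$auc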
Naive Bayes
nb_train <- newtrain
nb_test <- test
features <- setdiff(names(nb_train), "class")
nbx <- nb_train[, features]
nby <- nb_train$class
train_control <- trainControl(
  method = "cv",
  number = 10
)
# train a Naive Bayes model (klaR backend) with 10-fold CV
nb.m1 <- train(
  x = nbx,
  y = as.factor(nby),
  method = "nb",
  trControl = train_control
)
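nb.m1 is fitted here but never inspected; its resampled accuracy per tuning setting can be viewed with, e.g.:
nb.m1$results  # cross-validated Accuracy and Kappa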
# Treat the binary predictors as factors so naive_bayes fits Bernoulli rather than Gaussian tables
nb_train$XRay <- as.factor(nb_train$XRay)
nb_train$Grade <- as.factor(nb_train$Grade)
nb_train$Stage <- as.factor(nb_train$Stage)
modelnv <- naive_bayes(class ~ ., data = nb_train)
modelnv
##
## ================================= Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = class ~ ., data = nb_train)
##
## --------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## --------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.4949495 0.5050505
##
## --------------------------------------------------------------------------------
##
## Tables:
##
## --------------------------------------------------------------------------------
## :: AP (Gaussian)
## --------------------------------------------------------------------------------
##
## AP 0 1
## mean 76.22449 130.02000
## sd 28.65922 40.86143
##
## --------------------------------------------------------------------------------
## :: XRay (Bernoulli)
## --------------------------------------------------------------------------------
##
## XRay 0 1
## 0 0.6326531 0.3600000
## 1 0.3673469 0.6400000
##
## --------------------------------------------------------------------------------
## :: Stage (Bernoulli)
## --------------------------------------------------------------------------------
##
## Stage 0 1
## 0 0.5612245 0.5100000
## 1 0.4387755 0.4900000
##
## --------------------------------------------------------------------------------
## :: Grade (Bernoulli)
## --------------------------------------------------------------------------------
##
## Grade 0 1
## 0 0.5816327 0.4400000
## 1 0.4183673 0.5600000
##
## --------------------------------------------------------------------------------
## :: Age (Gaussian)
## --------------------------------------------------------------------------------
##
## Age 0 1
## mean 59.020408 58.340000
## sd 5.916915 6.131917
##
## --------------------------------------------------------------------------------
plot(modelnv)
nb_pred_train <- predict(modelnv, nb_train[,-6])
confusionMatrix(as.factor(nb_train$class), as.factor(nb_pred_train), positive = "1", mode = "everything")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 85 13
## 1 31 69
##
## Accuracy : 0.7778
## 95% CI : (0.7134, 0.8336)
## No Information Rate : 0.5859
## P-Value [Acc > NIR] : 9.882e-09
##
## Kappa : 0.5563
##
## Mcnemar's Test P-Value : 0.01038
##
## Sensitivity : 0.8415
## Specificity : 0.7328
## Pos Pred Value : 0.6900
## Neg Pred Value : 0.8673
## Precision : 0.6900
## Recall : 0.8415
## F1 : 0.7582
## Prevalence : 0.4141
## Detection Rate : 0.3485
## Detection Prevalence : 0.5051
## Balanced Accuracy : 0.7871
##
## 'Positive' Class : 1
##
nb_test$XRay <- as.factor(nb_test$XRay)
nb_test$Grade <- as.factor(nb_test$Grade)
nb_test$Stage <- as.factor(nb_test$Stage)
nb_pred_test <- predict(modelnv, nb_test[,-6])
confusionMatrix(as.factor(nb_test$Y), as.factor(nb_pred_test), positive = "1", mode = "everything")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 13 4
## 1 8 26
##
## Accuracy : 0.7647
## 95% CI : (0.6251, 0.8721)
## No Information Rate : 0.5882
## P-Value [Acc > NIR] : 0.006486
##
## Kappa : 0.5
##
## Mcnemar's Test P-Value : 0.386476
##
## Sensitivity : 0.8667
## Specificity : 0.6190
## Pos Pred Value : 0.7647
## Neg Pred Value : 0.7647
## Precision : 0.7647
## Recall : 0.8667
## F1 : 0.8125
## Prevalence : 0.5882
## Detection Rate : 0.5098
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.7429
##
## 'Positive' Class : 1
##
nb_roc <- roc(nb_test$Y, as.numeric(nb_pred_test), plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

nb_roc$auc
## Area under the curve: 0.7647
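As with the logistic model, this ROC uses hard class labels; a sketch built from the Naive Bayes posterior probabilities instead:
nb_prob_test <- predict(modelnv, nb_test[,-6], type = "prob")[, "1"]  # P(class = 1)
nb_roc_prob <- roc(nb_test$Y, nb_prob_test, plot = TRUE)
nb_roc_prob$auc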
Naive Bayes 2
nb_model <- train(as.factor(class) ~ ., data = nb_train, method = "naive_bayes", usepoisson = TRUE)
nb_model
## Naive Bayes
##
## 198 samples
## 5 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 198, 198, 198, 198, 198, 198, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.7828671 0.5647901
## TRUE 0.7704682 0.5425591
##
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = FALSE
## and adjust = 1.
nb_object <- nb_model$finalModel
class(nb_object)
## [1] "naive_bayes"
nb_grid <- expand.grid(usekernel = c(TRUE, FALSE),
                       laplace = c(0, 0.25, 0.5, 0.75, 1),
                       adjust = c(0.25, 0.5, 0.75, 1, 1.25, 1.5))
# Fit the Naive Bayes model
set.seed(s)
nb_model2 <- train(as.factor(class) ~ ., data = nb_train,
                   method = "naive_bayes",
                   usepoisson = TRUE,
                   tuneGrid = nb_grid)
# Selected tuning parameters
nb_model2$finalModel$tuneValue
## laplace usekernel adjust
## 33 0 TRUE 0.75
plot(nb_model2)

plot(nb_model2$finalModel)
nb_pred2_train <- predict(nb_model2, newdata = nb_train)
confusionMatrix(as.factor(nb_train$class), as.factor(nb_pred2_train), mode = "everything", positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 83 15
## 1 20 80
##
## Accuracy : 0.8232
## 95% CI : (0.7628, 0.8737)
## No Information Rate : 0.5202
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6466
##
## Mcnemar's Test P-Value : 0.499
##
## Sensitivity : 0.8421
## Specificity : 0.8058
## Pos Pred Value : 0.8000
## Neg Pred Value : 0.8469
## Precision : 0.8000
## Recall : 0.8421
## F1 : 0.8205
## Prevalence : 0.4798
## Detection Rate : 0.4040
## Detection Prevalence : 0.5051
## Balanced Accuracy : 0.8240
##
## 'Positive' Class : 1
##
nb_pred2_test <- predict(nb_model2, newdata = nb_test)
confusionMatrix(as.factor(nb_test$Y), as.factor(nb_pred2_test), mode = "everything", positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16 1
## 1 7 27
##
## Accuracy : 0.8431
## 95% CI : (0.7141, 0.9298)
## No Information Rate : 0.549
## P-Value [Acc > NIR] : 8.783e-06
##
## Kappa : 0.6757
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 0.9643
## Specificity : 0.6957
## Pos Pred Value : 0.7941
## Neg Pred Value : 0.9412
## Precision : 0.7941
## Recall : 0.9643
## F1 : 0.8710
## Prevalence : 0.5490
## Detection Rate : 0.5294
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.8300
##
## 'Positive' Class : 1
##
nb_roc2 <- roc(nb_test$Y, as.numeric(nb_pred2_test), plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

nb_roc2$auc
## Area under the curve: 0.8676
ROC Comparison
get_auc <- function(x) {
  return(x$auc)
}
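The helper can be applied across all fitted ROC objects, e.g.:
sapply(list(LogReg = log_roc, NaiveBayes = nb_roc, TunedNB = nb_roc2), get_auc)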
ggroc(list(LogReg = log_roc, NaiveBayes = nb_roc),
aes = c('colour')) + labs(colour = 'Method')
