Classification Prostate Cancer Data using Naive Bayes & Logistic Regression

Import Package

library(readxl)
library(lattice)
library(smotefamily)
library(caret)

## Loading required package: ggplot2

library(e1071)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(klaR)

## Loading required package: MASS

library(naivebayes)

## naivebayes 1.0.0 loaded

## For more information please visit:

## https://majkamichal.github.io/naivebayes/

library(ggplot2)

Import Data

s <- 1999
data_xls <- readxl::read_excel("/Users/User/Documents/RFiles/data_STA1581.xlsx")
# data_csv <- read.csv("data_STA1581.csv", sep = ";")

rbind(head(data_xls, 3), tail(data_xls, 3))

## # A tibble: 6 × 6
##      AP  XRay Stage Grade   Age     Y
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   128     0     1     1    54     1
## 2    46     0     1     1    55     0
## 3   165     0     0     1    53     1
## 4   179     1     0     1    69     1
## 5    56     1     1     0    69     0
## 6   136     1     1     0    67     1

sum(is.na(data_xls)); sum(is.null(data_xls))

## [1] 0

## [1] 0

set.seed(s)
r <- runif(nrow(data_xls))
data <- cbind(data_xls, r)
data <- data[order(data$r), c('AP', 'XRay', 'Stage', 'Grade', 'Age', 'Y')]
data <- data[1:200,]

head(data, 3)

##      AP XRay Stage Grade Age Y
## 144 154    0     0     1  61 0
## 189  53    0     0     1  55 0
## 145  92    0     0     1  68 0

tail(data, 3)

##      AP XRay Stage Grade Age Y
## 174  82    1     0     0  53 0
## 116 159    1     1     0  54 1
## 129 186    0     1     0  69 1

Split Data

set.seed(s)
sample <- sample(c(TRUE, FALSE), nrow(data), replace=TRUE, prob=c(0.8,0.2))
train  <- data[sample, ]; test   <- data[!sample, ]

barchart(as.factor(train$Y), col='maroon')

Smote Data Train

smote_train <- SMOTE(train[,-6], train$Y)
newtrain <- smote_train$data
barchart(newtrain$class, col = 'navy')

newtrain[,-6] <- round(newtrain[,-6])
nrow(newtrain)

## [1] 198

Logistik

log_train <- newtrain
log_test <- test

model1_log <- glm(as.factor(class)~., data = log_train, family = 'binomial')
summary(model1_log)

## 
## Call:
## glm(formula = as.factor(class) ~ ., family = "binomial", data = log_train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.447254   2.178821  -1.123  0.26135    
## AP           0.055138   0.008195   6.728 1.71e-11 ***
## XRay         3.020336   0.723736   4.173 3.00e-05 ***
## Stage        2.295046   0.711425   3.226  0.00126 ** 
## Grade        1.243346   0.429910   2.892  0.00383 ** 
## Age         -0.104864   0.038797  -2.703  0.00687 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 274.47  on 197  degrees of freedom
## Residual deviance: 144.01  on 192  degrees of freedom
## AIC: 156.01
## 
## Number of Fisher Scoring iterations: 6

exp(model1_log$coefficients)

## (Intercept)          AP        XRay       Stage       Grade         Age 
##  0.08653086  1.05668610 20.49816780  9.92489557  3.46719684  0.90044662

Train Konfusi

log_pred_train <- round(predict(model1_log, newdata = log_train, type = 'response'))
confusionMatrix(as.factor(log_train$class), as.factor(log_pred_train), positive = "1", mode = "everything")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 79 19
##          1 20 80
##                                          
##                Accuracy : 0.803          
##                  95% CI : (0.7407, 0.856)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.6061         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.8081         
##             Specificity : 0.7980         
##          Pos Pred Value : 0.8000         
##          Neg Pred Value : 0.8061         
##               Precision : 0.8000         
##                  Recall : 0.8081         
##                      F1 : 0.8040         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4040         
##    Detection Prevalence : 0.5051         
##       Balanced Accuracy : 0.8030         
##                                          
##        'Positive' Class : 1              
##

Konfusi Test

log_pred_test <- round(predict(model1_log, newdata = log_test, type = 'response'))
confusionMatrix(as.factor(log_test$Y), as.factor(log_pred_test), positive = "1", mode = "everything")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 16  1
##          1  5 29
##                                           
##                Accuracy : 0.8824          
##                  95% CI : (0.7613, 0.9556)
##     No Information Rate : 0.5882          
##     P-Value [Acc > NIR] : 4.56e-06        
##                                           
##                   Kappa : 0.75            
##                                           
##  Mcnemar's Test P-Value : 0.2207          
##                                           
##             Sensitivity : 0.9667          
##             Specificity : 0.7619          
##          Pos Pred Value : 0.8529          
##          Neg Pred Value : 0.9412          
##               Precision : 0.8529          
##                  Recall : 0.9667          
##                      F1 : 0.9062          
##              Prevalence : 0.5882          
##          Detection Rate : 0.5686          
##    Detection Prevalence : 0.6667          
##       Balanced Accuracy : 0.8643          
##                                           
##        'Positive' Class : 1               
##

ROC

log_roc <- roc(log_test$Y, log_pred_test, plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

log_roc$auc

## Area under the curve: 0.8971

Naive Bayes

nb_train <- newtrain
nb_test <- test

features <- setdiff(names(nb_train), "class")
nbx <- nb_train[, features]
nby <- nb_train$class

train_control <- trainControl(
  method = "cv", 
  number = 10
  )

# train model
nb.m1 <- train(
  x = nbx,
  y = as.factor(nby),
  method = "nb",
  trControl = train_control
  )

nb_train$XRay <- as.factor(nb_train$XRay)
nb_train$Grade <- as.factor(nb_train$Grade)
nb_train$Stage <- as.factor(nb_train$Stage)

modelnv<-naive_bayes(class~.,data=nb_train)
modelnv

## 
## ================================= Naive Bayes ==================================
## 
## Call:
## naive_bayes.formula(formula = class ~ ., data = nb_train)
## 
## -------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## -------------------------------------------------------------------------------- 
##  
## A priori probabilities: 
## 
##         0         1 
## 0.4949495 0.5050505 
## 
## -------------------------------------------------------------------------------- 
##  
## Tables: 
## 
## -------------------------------------------------------------------------------- 
## :: AP (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## AP             0         1
##   mean  76.22449 130.02000
##   sd    28.65922  40.86143
## 
## -------------------------------------------------------------------------------- 
## :: XRay (Bernoulli) 
## -------------------------------------------------------------------------------- 
##     
## XRay         0         1
##    0 0.6326531 0.3600000
##    1 0.3673469 0.6400000
## 
## -------------------------------------------------------------------------------- 
## :: Stage (Bernoulli) 
## -------------------------------------------------------------------------------- 
##      
## Stage         0         1
##     0 0.5612245 0.5100000
##     1 0.4387755 0.4900000
## 
## -------------------------------------------------------------------------------- 
## :: Grade (Bernoulli) 
## -------------------------------------------------------------------------------- 
##      
## Grade         0         1
##     0 0.5816327 0.4400000
##     1 0.4183673 0.5600000
## 
## -------------------------------------------------------------------------------- 
## :: Age (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## Age            0         1
##   mean 59.020408 58.340000
##   sd    5.916915  6.131917
## 
## --------------------------------------------------------------------------------

plot(modelnv)

nb_pred_train <- predict(modelnv, nb_train[,-6])
confusionMatrix(as.factor(nb_train$class), as.factor(nb_pred_train), positive = "1", mode = "everything")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 85 13
##          1 31 69
##                                           
##                Accuracy : 0.7778          
##                  95% CI : (0.7134, 0.8336)
##     No Information Rate : 0.5859          
##     P-Value [Acc > NIR] : 9.882e-09       
##                                           
##                   Kappa : 0.5563          
##                                           
##  Mcnemar's Test P-Value : 0.01038         
##                                           
##             Sensitivity : 0.8415          
##             Specificity : 0.7328          
##          Pos Pred Value : 0.6900          
##          Neg Pred Value : 0.8673          
##               Precision : 0.6900          
##                  Recall : 0.8415          
##                      F1 : 0.7582          
##              Prevalence : 0.4141          
##          Detection Rate : 0.3485          
##    Detection Prevalence : 0.5051          
##       Balanced Accuracy : 0.7871          
##                                           
##        'Positive' Class : 1               
##

nb_test$XRay <- as.factor(nb_test$XRay)
nb_test$Grade <- as.factor(nb_test$Grade)
nb_test$Stage <- as.factor(nb_test$Stage)
nb_pred_test <- predict(modelnv, nb_test[,-6])
confusionMatrix(as.factor(nb_test$Y), as.factor(nb_pred_test), positive = "1", mode = "everything")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 13  4
##          1  8 26
##                                           
##                Accuracy : 0.7647          
##                  95% CI : (0.6251, 0.8721)
##     No Information Rate : 0.5882          
##     P-Value [Acc > NIR] : 0.006486        
##                                           
##                   Kappa : 0.5             
##                                           
##  Mcnemar's Test P-Value : 0.386476        
##                                           
##             Sensitivity : 0.8667          
##             Specificity : 0.6190          
##          Pos Pred Value : 0.7647          
##          Neg Pred Value : 0.7647          
##               Precision : 0.7647          
##                  Recall : 0.8667          
##                      F1 : 0.8125          
##              Prevalence : 0.5882          
##          Detection Rate : 0.5098          
##    Detection Prevalence : 0.6667          
##       Balanced Accuracy : 0.7429          
##                                           
##        'Positive' Class : 1               
##

nb_roc <- roc(nb_test$Y, as.numeric(nb_pred_test), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

nb_roc$auc

## Area under the curve: 0.7647

Naive Bayes 2

nb_model <- train(as.factor(class) ~ ., data = nb_train, method = "naive_bayes", usepoisson = TRUE)
nb_model

## Naive Bayes 
## 
## 198 samples
##   5 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 198, 198, 198, 198, 198, 198, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.7828671  0.5647901
##    TRUE      0.7704682  0.5425591
## 
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = FALSE
##  and adjust = 1.

nb_object <- nb_model$finalModel
class(nb_object)

## [1] "naive_bayes"

nb_grid <-   expand.grid(usekernel = c(TRUE, FALSE),
                         laplace = c(0, 0.25, 0.5,0.75, 1), 
                         adjust = c(0.25, 0.5, 0.75, 1, 1.25, 1.5))
# Fit the Naive Bayes model 
set.seed(s)
nb_model2 <- train(as.factor(class) ~ ., data = nb_train, 
                               method = "naive_bayes",
                               usepoisson = TRUE,
                               tuneGrid = nb_grid)
# Selected tuning parameters
nb_model2$finalModel$tuneValue

##    laplace usekernel adjust
## 33       0      TRUE   0.75

plot(nb_model2)

plot(nb_model2$finalModel)

nb_pred2_train <- predict(nb_model2, newdata = nb_train)
confusionMatrix(as.factor(nb_train$class), as.factor(nb_pred2_train), mode = "everything", positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 83 15
##          1 20 80
##                                           
##                Accuracy : 0.8232          
##                  95% CI : (0.7628, 0.8737)
##     No Information Rate : 0.5202          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6466          
##                                           
##  Mcnemar's Test P-Value : 0.499           
##                                           
##             Sensitivity : 0.8421          
##             Specificity : 0.8058          
##          Pos Pred Value : 0.8000          
##          Neg Pred Value : 0.8469          
##               Precision : 0.8000          
##                  Recall : 0.8421          
##                      F1 : 0.8205          
##              Prevalence : 0.4798          
##          Detection Rate : 0.4040          
##    Detection Prevalence : 0.5051          
##       Balanced Accuracy : 0.8240          
##                                           
##        'Positive' Class : 1               
##

nb_pred2_test <- predict(nb_model2, newdata = nb_test)
confusionMatrix(as.factor(nb_test$Y), as.factor(nb_pred2_test), mode = "everything", positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 16  1
##          1  7 27
##                                           
##                Accuracy : 0.8431          
##                  95% CI : (0.7141, 0.9298)
##     No Information Rate : 0.549           
##     P-Value [Acc > NIR] : 8.783e-06       
##                                           
##                   Kappa : 0.6757          
##                                           
##  Mcnemar's Test P-Value : 0.0771          
##                                           
##             Sensitivity : 0.9643          
##             Specificity : 0.6957          
##          Pos Pred Value : 0.7941          
##          Neg Pred Value : 0.9412          
##               Precision : 0.7941          
##                  Recall : 0.9643          
##                      F1 : 0.8710          
##              Prevalence : 0.5490          
##          Detection Rate : 0.5294          
##    Detection Prevalence : 0.6667          
##       Balanced Accuracy : 0.8300          
##                                           
##        'Positive' Class : 1               
##

nb_roc2 <- roc(nb_test$Y, as.numeric(nb_pred2_test), plot = T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

nb_roc2$auc

## Area under the curve: 0.8676

ROC

get_auc <- function(x){
  return(x$auc)
}
ggroc(list(LogReg = log_roc, NaiveBayes = nb_roc), 
      aes = c('colour')) + labs(colour = 'Method')

Classification Prostate Cancer Data using Naive Bayes & Logistic Regression

2022-12-15

Import Package

Import Data

Split Data

Smote Data Train

Logistik

Train Konfusi

Konfusi Test

ROC

Naive Bayes

Naive Bayes 2

ROC