# Resampling methods to estimate model accuracy

This note works through five ways of estimating the accuracy of a naive Bayes model on the iris dataset with the caret package: a single train/test data split, the bootstrap, k-fold cross validation, repeated k-fold cross validation, and leave-one-out cross validation (LOOCV).

# Load packages
library(caret)
## Loading required package: lattice
library(klaR)
## Warning: package 'klaR' was built under R version 4.3.3
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.3.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Data Split
# Load the dataset
data("iris")
# define 80%/20% train/test split of the dataset
trainIndex <- createDataPartition(iris$Species, p = 0.80, list = FALSE)
dataTrain <- iris[trainIndex,]
dataTest <- iris[-trainIndex,]
# train a naive Bayes model
fit <- NaiveBayes(Species ~ ., data = dataTrain)
# make predictions
predictions <- predict(fit, dataTest[,1:4])
# summarize results
confusionMatrix(predictions$class, dataTest$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500
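
The split above is drawn at random, so the 96.7% accuracy will shift from run to run. As a small sketch (not part of the original output; the seed value and the `cm` name are arbitrary), the partition can be seeded and the headline metrics pulled straight out of the confusionMatrix object:

# Seed the partition so the single train/test estimate is reproducible
set.seed(7)
trainIndex <- createDataPartition(iris$Species, p = 0.80, list = FALSE)
dataTrain <- iris[trainIndex,]
dataTest <- iris[-trainIndex,]
fit <- NaiveBayes(Species ~ ., data = dataTrain)
cm <- confusionMatrix(predict(fit, dataTest[,1:4])$class, dataTest$Species)
cm$overall["Accuracy"]        # single accuracy estimate from the held-out 20%
cm$byClass[, "Sensitivity"]   # per-class recall
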
## Bootstrap
library(caret)
# load dataset
data("iris")
# define training control
trainControl <- trainControl(method = "boot", number = 100)
# evaluate the model
fit <- train(Species~., data = iris, trControl = trainControl, method = "nb")
# display results
print(fit)
## Naive Bayes 
## 
## 150 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Bootstrapped (100 reps) 
## Summary of sample sizes: 150, 150, 150, 150, 150, 150, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.9522972  0.9277221
##    TRUE      0.9552496  0.9321597
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust = 1.
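
The 0.955 reported above is the mean accuracy over the 100 bootstrap resamples. A quick sketch for looking behind that average (it assumes the fit object from the bootstrap run above; these are standard components of a caret train object):

head(fit$resample)               # Accuracy/Kappa for each individual resample
summary(fit$resample$Accuracy)   # spread of the 100 bootstrap accuracies
fit$results                      # mean and SD for every tuning-parameter combination
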
## k-fold Cross Validation
library(caret)
data("iris")
# define training control
trainControl <- trainControl(method = "cv", number = 10)
# evaluate
fit <- train(Species~., data = iris, trControl = trainControl, method = "nb")
# display
print(fit)
## Naive Bayes 
## 
## 150 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa
##   FALSE      0.9533333  0.93 
##    TRUE      0.9533333  0.93 
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust = 1.
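
With method = "cv" the folds are drawn at random inside train(). To make them reproducible, or to reuse the same folds across several models, they can be built explicitly with createFolds() and passed in through the index argument of trainControl(). A sketch (the seed and the folds name are arbitrary):

# Build the 10 training-index sets up front and hand them to trainControl()
set.seed(7)
folds <- createFolds(iris$Species, k = 10, returnTrain = TRUE)
train_control <- trainControl(method = "cv", index = folds)
fit <- train(Species ~ ., data = iris, trControl = train_control, method = "nb")
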
## Repeated k-fold Cross Validation
library(caret)
data("iris")
# define training control
trainControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# evaluate
fit <- train(Species~., data = iris, trControl = trainControl, method = "nb")
# display
print(fit)
## Naive Bayes 
## 
## 150 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.9555556  0.9333333
##    TRUE      0.9600000  0.9400000
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust = 1.
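
Repeating 10-fold cross validation three times averages the estimate over 30 held-out folds instead of 10. The repeated folds can also be generated up front with createMultiFolds(); the sketch below reuses the same arbitrary seed as earlier:

# 10 folds x 3 repeats = 30 training-index sets, reproducible across runs
set.seed(7)
repeated_folds <- createMultiFolds(iris$Species, k = 10, times = 3)
train_control <- trainControl(method = "repeatedcv", index = repeated_folds)
fit <- train(Species ~ ., data = iris, trControl = train_control, method = "nb")
nrow(fit$resample)   # one Accuracy/Kappa row per resample, 30 in total
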
## Leave One Out Cross Validation (LOOCV)
library(caret)
data("iris")
# define training control
trainControl <- trainControl(method = "LOOCV")
# evaluate
fit <- train(Species~., data = iris, trControl = trainControl, method = "nb")
# display
print(fit)
## Naive Bayes 
## 
## 150 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 149, 149, 149, 149, 149, 149, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa
##   FALSE      0.9533333  0.93 
##    TRUE      0.9600000  0.94 
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust = 1.
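
To put the four resampling estimates side by side, the same naive Bayes model can be trained once per trainControl and summarised with getTrainPerf(), which returns the resampled accuracy and kappa of the selected model. The loop below is a sketch, not part of the original output; the controls and comparison names are arbitrary:

# Compare the accuracy estimate produced by each resampling scheme
controls <- list(
  boot       = trainControl(method = "boot", number = 100),
  cv         = trainControl(method = "cv", number = 10),
  repeatedcv = trainControl(method = "repeatedcv", number = 10, repeats = 3),
  LOOCV      = trainControl(method = "LOOCV")
)
set.seed(7)
comparison <- do.call(rbind, lapply(controls, function(ctrl) {
  fit <- train(Species ~ ., data = iris, trControl = ctrl, method = "nb")
  getTrainPerf(fit)   # resampled TrainAccuracy, TrainKappa and the model method
}))
comparison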