# Demo Dataset: SPAM
library(ElemStatLearn)
## Warning: package 'ElemStatLearn' was built under R version 3.6.1
library(caret)
## Warning: package 'caret' was built under R version 3.6.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.1
# Load the `spam` data set into the workspace (presumably the copy shipped
# with ElemStatLearn, which is attached above).
data(spam)
# Number of rows and columns.
dim(spam)
## [1] 4601   58
# Column names: 57 predictors named A.1 ... A.57 followed by the `spam`
# outcome column.
names(spam)
##  [1] "A.1"  "A.2"  "A.3"  "A.4"  "A.5"  "A.6"  "A.7"  "A.8"  "A.9"  "A.10"
## [11] "A.11" "A.12" "A.13" "A.14" "A.15" "A.16" "A.17" "A.18" "A.19" "A.20"
## [21] "A.21" "A.22" "A.23" "A.24" "A.25" "A.26" "A.27" "A.28" "A.29" "A.30"
## [31] "A.31" "A.32" "A.33" "A.34" "A.35" "A.36" "A.37" "A.38" "A.39" "A.40"
## [41] "A.41" "A.42" "A.43" "A.44" "A.45" "A.46" "A.47" "A.48" "A.49" "A.50"
## [51] "A.51" "A.52" "A.53" "A.54" "A.55" "A.56" "A.57" "spam"
# Compact per-column summary (type and first few values of every column).
str(spam)
## 'data.frame':    4601 obs. of  58 variables:
##  $ A.1 : num  0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
##  $ A.2 : num  0.64 0.28 0 0 0 0 0 0 0 0.12 ...
##  $ A.3 : num  0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
##  $ A.4 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.5 : num  0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
##  $ A.6 : num  0 0.28 0.19 0 0 0 0 0 0 0.32 ...
##  $ A.7 : num  0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
##  $ A.8 : num  0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
##  $ A.9 : num  0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
##  $ A.10: num  0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
##  $ A.11: num  0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
##  $ A.12: num  0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
##  $ A.13: num  0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
##  $ A.14: num  0 0.21 0 0 0 0 0 0 0 0 ...
##  $ A.15: num  0 0.14 1.75 0 0 0 0 0 0 0.12 ...
##  $ A.16: num  0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
##  $ A.17: num  0 0.07 0.06 0 0 0 0 0 0 0 ...
##  $ A.18: num  1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
##  $ A.19: num  1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
##  $ A.20: num  0 0 0.32 0 0 0 0 0 3.53 0.06 ...
##  $ A.21: num  0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
##  $ A.22: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.23: num  0 0.43 1.16 0 0 0 0 0 0 0.19 ...
##  $ A.24: num  0 0.43 0.06 0 0 0 0 0 0.15 0 ...
##  $ A.25: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.26: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.27: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.28: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.29: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.30: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.31: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.32: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.33: num  0 0 0 0 0 0 0 0 0.15 0 ...
##  $ A.34: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.35: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.36: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.37: num  0 0.07 0 0 0 0 0 0 0 0 ...
##  $ A.38: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.39: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.40: num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ A.41: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.42: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.43: num  0 0 0.12 0 0 0 0 0 0.3 0 ...
##  $ A.44: num  0 0 0 0 0 0 0 0 0 0.06 ...
##  $ A.45: num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ A.46: num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ A.47: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.48: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.49: num  0 0 0.01 0 0 0 0 0 0 0.04 ...
##  $ A.50: num  0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
##  $ A.51: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ A.52: num  0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
##  $ A.53: num  0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
##  $ A.54: num  0 0.048 0.01 0 0 0 0 0 0.022 0 ...
##  $ A.55: num  3.76 5.11 9.82 3.54 3.54 ...
##  $ A.56: int  61 101 485 40 40 15 4 11 445 43 ...
##  $ A.57: int  278 1028 2259 191 191 54 112 49 1257 749 ...
##  $ spam: Factor w/ 2 levels "email","spam": 2 2 2 2 2 2 2 2 2 2 ...
# Create Training and Validation Datasets
# Fix the RNG seed first: createDataPartition() samples rows at random, so
# without a seed the split (and every result computed from it) changes on
# each run.
set.seed(1234)

# Stratified 80/20 split on the outcome. With list = FALSE the result is a
# one-column matrix of row indices that can be used directly for subsetting.
train_index <- createDataPartition(spam$spam, p = 0.8, list = FALSE)

spam_train <- spam[train_index, ]
spam_test <- spam[-train_index, ]

# Class counts per partition; both should show a similar email/spam ratio.
table(spam_train$spam)
## 
## email  spam 
##  2231  1451
table(spam_test$spam)
## 
## email  spam 
##   557   362
# Train the Basic NB Model
library(klaR)
## Warning: package 'klaR' was built under R version 3.6.1
## Loading required package: MASS
# Baseline naive Bayes fit on the training partition, leaving resampling and
# the tuning grid at caret's defaults. Warnings raised during fitting are
# silenced for the whole call.
model_nb1 <- suppressWarnings(
  train(spam ~ ., data = spam_train, method = "nb")
)
# Show the resampling summary and the selected tuning parameters.
print(model_nb1)
## Naive Bayes 
## 
## 3682 samples
##   57 predictor
##    2 classes: 'email', 'spam' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 3682, 3682, 3682, 3682, 3682, 3682, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.7234254  0.4756690
##    TRUE      0.5866633  0.2666212
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE
##  and adjust = 1.
# Predict with NBC
# Class labels for the held-out rows (type = "raw" yields the predicted
# class rather than class probabilities); prediction warnings are silenced.
predict_nb1 <- suppressWarnings(
  predict(model_nb1, newdata = spam_test, type = "raw")
)
# Cross-tabulate predictions against the true labels and report the derived
# statistics.
confusionMatrix(data = predict_nb1, reference = spam_test$spam)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction email spam
##      email   309   17
##      spam    248  345
##                                           
##                Accuracy : 0.7116          
##                  95% CI : (0.6812, 0.7408)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : 1.511e-11       
##                                           
##                   Kappa : 0.4568          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.5548          
##             Specificity : 0.9530          
##          Pos Pred Value : 0.9479          
##          Neg Pred Value : 0.5818          
##              Prevalence : 0.6061          
##          Detection Rate : 0.3362          
##    Detection Prevalence : 0.3547          
##       Balanced Accuracy : 0.7539          
##                                           
##        'Positive' Class : email           
## 
# Fine Tune the Models
# Two corrections compared with the earlier version of this call:
#   * The resampling specification must be passed as `trControl` (capital C).
#     Argument names are case-sensitive, so `trcontrol` was not recognised by
#     train() and the default bootstrap resampling was used instead of the
#     requested 3-fold cross-validation.
#   * The model is fitted on `spam_train` only. Fitting on the full `spam`
#     data would include the rows of `spam_test`, making the later evaluation
#     on `spam_test` optimistic (train/test leakage).
# NOTE: recorded output after this call that reports 4601 samples and
# "Bootstrapped (25 reps)" was captured before these corrections.
model_nb2 <- suppressWarnings(
  train(
    spam ~ .,
    data = spam_train,
    method = "nb",
    trControl = trainControl(method = "cv", number = 3),
    # Single parameter combination: Laplace correction fL = 1, Gaussian
    # densities (no kernel estimate), bandwidth adjustment 1.
    tuneGrid = data.frame(fL = 1, usekernel = FALSE, adjust = 1)
  )
)

model_nb2
## Naive Bayes 
## 
## 4601 samples
##   57 predictor
##    2 classes: 'email', 'spam' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 4601, 4601, 4601, 4601, 4601, 4601, ... 
## Resampling results:
## 
##   Accuracy   Kappa   
##   0.7129969  0.457764
## 
## Tuning parameter 'fL' was held constant at a value of 1
## Tuning
##  parameter 'usekernel' was held constant at a value of FALSE
## 
## Tuning parameter 'adjust' was held constant at a value of 1
# Predict with Tuned Model
# Class labels for the held-out rows from the second model; prediction
# warnings are silenced.
predict_nb2 <- suppressWarnings(
  predict(model_nb2, newdata = spam_test, type = "raw")
)
# Cross-tabulate predictions against the true labels and report the derived
# statistics.
confusionMatrix(data = predict_nb2, reference = spam_test$spam)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction email spam
##      email   307   17
##      spam    250  345
##                                           
##                Accuracy : 0.7095          
##                  95% CI : (0.6789, 0.7387)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : 3.903e-11       
##                                           
##                   Kappa : 0.4532          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.5512          
##             Specificity : 0.9530          
##          Pos Pred Value : 0.9475          
##          Neg Pred Value : 0.5798          
##              Prevalence : 0.6061          
##          Detection Rate : 0.3341          
##    Detection Prevalence : 0.3526          
##       Balanced Accuracy : 0.7521          
##                                           
##        'Positive' Class : email           
##