# Demo Dataset: SPAM
library(ElemStatLearn)
## Warning: package 'ElemStatLearn' was built under R version 3.6.1
library(caret)
## Warning: package 'caret' was built under R version 3.6.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.1
# Load the `spam` data set into the workspace (presumably the copy shipped
# with ElemStatLearn, which is attached above -- confirm if another attached
# package also provides a data set called `spam`).
data(spam)
# Rows x columns; the recorded run shows 4601 observations and 58 variables.
dim(spam)
## [1] 4601 58
# Column names: 57 predictors named A.1 ... A.57 followed by the class
# label column `spam`.
names(spam)
## [1] "A.1" "A.2" "A.3" "A.4" "A.5" "A.6" "A.7" "A.8" "A.9" "A.10"
## [11] "A.11" "A.12" "A.13" "A.14" "A.15" "A.16" "A.17" "A.18" "A.19" "A.20"
## [21] "A.21" "A.22" "A.23" "A.24" "A.25" "A.26" "A.27" "A.28" "A.29" "A.30"
## [31] "A.31" "A.32" "A.33" "A.34" "A.35" "A.36" "A.37" "A.38" "A.39" "A.40"
## [41] "A.41" "A.42" "A.43" "A.44" "A.45" "A.46" "A.47" "A.48" "A.49" "A.50"
## [51] "A.51" "A.52" "A.53" "A.54" "A.55" "A.56" "A.57" "spam"
# Per-column types and leading values; in the recorded output every
# predictor is numeric or integer and `spam` is a two-level factor
# ("email", "spam").
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ A.1 : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ A.2 : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ A.3 : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ A.4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.5 : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ A.6 : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ A.7 : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ A.8 : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ A.9 : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ A.10: num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ A.11: num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ A.12: num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ A.13: num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ A.14: num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ A.15: num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ A.16: num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ A.17: num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ A.18: num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ A.19: num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ A.20: num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ A.21: num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ A.22: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.23: num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ A.24: num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ A.25: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.26: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.27: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.28: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.29: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.30: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.31: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.32: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.33: num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ A.34: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.35: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.36: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.37: num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ A.38: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.39: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.40: num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ A.41: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.42: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.43: num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ A.44: num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ A.45: num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ A.46: num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ A.47: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.48: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.49: num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ A.50: num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ A.51: num 0 0 0 0 0 0 0 0 0 0 ...
## $ A.52: num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ A.53: num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ A.54: num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ A.55: num 3.76 5.11 9.82 3.54 3.54 ...
## $ A.56: int 61 101 485 40 40 15 4 11 445 43 ...
## $ A.57: int 278 1028 2259 191 191 54 112 49 1257 749 ...
## $ spam: Factor w/ 2 levels "email","spam": 2 2 2 2 2 2 2 2 2 2 ...
# Create Training and Validation Datasets
# Fix the RNG seed so the random split below (and any resampling performed by
# later train() calls in this session) is reproducible from run to run.
set.seed(123)

# Stratified 80/20 split on the class label: createDataPartition() samples
# within each level of spam$spam, so both partitions keep the overall
# email/spam ratio. list = FALSE returns the selected row indices in a form
# that can be used directly for row subsetting.
train_index <- createDataPartition(spam$spam, p = 0.8, list = FALSE)
spam_train <- spam[train_index, ]   # rows selected for model fitting
spam_test <- spam[-train_index, ]   # remaining rows, held out for evaluation

# Class counts in the training partition.
table(spam_train$spam)
##
## email spam
## 2231 1451
# Class counts in the held-out partition.
table(spam_test$spam)
##
## email spam
## 557 362
# Train the Basic NB Model
library(klaR)
## Warning: package 'klaR' was built under R version 3.6.1
## Loading required package: MASS
# Baseline naive Bayes classifier (caret method "nb") fitted on the training
# partition with every predictor, using caret's default resampling and tuning
# grid. Any warnings raised during fitting are silenced for the whole call.
model_nb1 <- suppressWarnings(
  train(
    spam ~ .,
    data = spam_train,
    method = "nb"
  )
)
# Auto-print the resampling summary of the fitted model.
model_nb1
## Naive Bayes
##
## 3682 samples
## 57 predictor
## 2 classes: 'email', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3682, 3682, 3682, 3682, 3682, 3682, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.7234254 0.4756690
## TRUE 0.5866633 0.2666212
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE
## and adjust = 1.
# Predict with NBC
# Predicted class labels (type = "raw") for the held-out rows, with warnings
# raised during prediction silenced.
predict_nb1 <- suppressWarnings(
  predict(model_nb1, newdata = spam_test, type = "raw")
)
# Cross-tabulate predictions against the true labels and print the derived
# statistics; with no `positive` argument the first factor level ("email")
# is reported as the positive class.
confusionMatrix(data = predict_nb1, reference = spam_test$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction email spam
## email 309 17
## spam 248 345
##
## Accuracy : 0.7116
## 95% CI : (0.6812, 0.7408)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : 1.511e-11
##
## Kappa : 0.4568
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5548
## Specificity : 0.9530
## Pos Pred Value : 0.9479
## Neg Pred Value : 0.5818
## Prevalence : 0.6061
## Detection Rate : 0.3362
## Detection Prevalence : 0.3547
## Balanced Accuracy : 0.7539
##
## 'Positive' Class : email
##
# Fine Tune the Models
# Refit naive Bayes with an explicit resampling scheme (3-fold cross-validation)
# and a single fixed parameter combination (fL = 1, usekernel = FALSE,
# adjust = 1).
#
# Two corrections relative to the earlier form of this call:
#   * The resampling argument is spelled `trControl`. The previous spelling
#     `trcontrol` did not match it, so the 3-fold CV request was silently
#     ignored and the default bootstrap (25 reps) ran instead, as the recorded
#     summary shows.
#   * The model is fitted on `spam_train`, not the full `spam` data. Fitting on
#     all rows leaked the `spam_test` rows into training and made the later
#     evaluation on `spam_test` optimistic.
#
# NOTE: the recorded output that follows this call in the file was captured
# before these corrections; re-run the script to refresh it.
model_nb2 <- suppressWarnings(
  train(
    spam ~ .,
    data = spam_train,
    method = "nb",
    trControl = trainControl(method = "cv", number = 3),
    tuneGrid = data.frame(fL = 1, usekernel = FALSE, adjust = 1)
  )
)
# Auto-print the resampling summary of the refitted model.
model_nb2
## Naive Bayes
##
## 4601 samples
## 57 predictor
## 2 classes: 'email', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 4601, 4601, 4601, 4601, 4601, 4601, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7129969 0.457764
##
## Tuning parameter 'fL' was held constant at a value of 1
## Tuning
## parameter 'usekernel' was held constant at a value of FALSE
##
## Tuning parameter 'adjust' was held constant at a value of 1
# Predict with Tuned Model
# Predicted class labels (type = "raw") from the second model for the
# held-out rows, with warnings raised during prediction silenced.
predict_nb2 <- suppressWarnings(
  predict(model_nb2, newdata = spam_test, type = "raw")
)
# Cross-tabulate predictions against the true labels and print the derived
# statistics; with no `positive` argument the first factor level ("email")
# is reported as the positive class.
confusionMatrix(data = predict_nb2, reference = spam_test$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction email spam
## email 307 17
## spam 250 345
##
## Accuracy : 0.7095
## 95% CI : (0.6789, 0.7387)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : 3.903e-11
##
## Kappa : 0.4532
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5512
## Specificity : 0.9530
## Pos Pred Value : 0.9475
## Neg Pred Value : 0.5798
## Prevalence : 0.6061
## Detection Rate : 0.3341
## Detection Prevalence : 0.3526
## Balanced Accuracy : 0.7521
##
## 'Positive' Class : email
##