# kernlab is loaded before data(spam) so the spam dataset can be found
library(kernlab)
# Fix the RNG seed so the random steps further down are reproducible
set.seed(12345)
data(spam)
First, we look at the spam dataset:
# Preview the first rows of the dataset (one column per feature, plus 'type')
head(spam)
## make address all num3d our over remove internet order mail receive
## 1 0.00 0.64 0.64 0 0.32 0.00 0.00 0.00 0.00 0.00 0.00
## 2 0.21 0.28 0.50 0 0.14 0.28 0.21 0.07 0.00 0.94 0.21
## 3 0.06 0.00 0.71 0 1.23 0.19 0.19 0.12 0.64 0.25 0.38
## 4 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31
## 5 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31
## 6 0.00 0.00 0.00 0 1.85 0.00 0.00 1.85 0.00 0.00 0.00
## will people report addresses free business email you credit your font
## 1 0.64 0.00 0.00 0.00 0.32 0.00 1.29 1.93 0.00 0.96 0
## 2 0.79 0.65 0.21 0.14 0.14 0.07 0.28 3.47 0.00 1.59 0
## 3 0.45 0.12 0.00 1.75 0.06 0.06 1.03 1.36 0.32 0.51 0
## 4 0.31 0.31 0.00 0.00 0.31 0.00 0.00 3.18 0.00 0.31 0
## 5 0.31 0.31 0.00 0.00 0.31 0.00 0.00 3.18 0.00 0.31 0
## 6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0
## num000 money hp hpl george num650 lab labs telnet num857 data num415
## 1 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## 2 0.43 0.43 0 0 0 0 0 0 0 0 0 0
## 3 1.16 0.06 0 0 0 0 0 0 0 0 0 0
## 4 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## 5 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## 6 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## num85 technology num1999 parts pm direct cs meeting original project
## 1 0 0 0.00 0 0 0.00 0 0 0.00 0
## 2 0 0 0.07 0 0 0.00 0 0 0.00 0
## 3 0 0 0.00 0 0 0.06 0 0 0.12 0
## 4 0 0 0.00 0 0 0.00 0 0 0.00 0
## 5 0 0 0.00 0 0 0.00 0 0 0.00 0
## 6 0 0 0.00 0 0 0.00 0 0 0.00 0
## re edu table conference charSemicolon charRoundbracket
## 1 0.00 0.00 0 0 0.00 0.000
## 2 0.00 0.00 0 0 0.00 0.132
## 3 0.06 0.06 0 0 0.01 0.143
## 4 0.00 0.00 0 0 0.00 0.137
## 5 0.00 0.00 0 0 0.00 0.135
## 6 0.00 0.00 0 0 0.00 0.223
## charSquarebracket charExclamation charDollar charHash capitalAve
## 1 0 0.778 0.000 0.000 3.756
## 2 0 0.372 0.180 0.048 5.114
## 3 0 0.276 0.184 0.010 9.821
## 4 0 0.137 0.000 0.000 3.537
## 5 0 0.135 0.000 0.000 3.537
## 6 0 0.000 0.000 0.000 3.000
## capitalLong capitalTotal type
## 1 61 278 spam
## 2 101 1028 spam
## 3 485 2259 spam
## 4 40 191 spam
## 5 40 191 spam
## 6 15 54 spam
The spamPredictor function uses only the ‘your’ feature of the spam dataset to classify e-mails as spam or non-spam. It scans a grid of thresholds on this single feature and reports the threshold that maximizes accuracy.
# Single-feature spam classifier built on the 'your' column.
#
# For each candidate threshold an e-mail is labelled "spam" when its 'your'
# value is strictly greater than the threshold and "nonspam" otherwise. The
# labels are cross-tabulated against spam$type and, per threshold, the function
# records the predictive value of each label, the sensitivity and specificity
# for the "nonspam" class, and the overall accuracy. Two plots are drawn as a
# side effect: the three rates against the threshold, and an ROC curve.
#
# Args:
#   spam:       data frame with a numeric 'your' column and a 'type' column
#               holding the labels "nonspam" and "spam".
#   thresholds: numeric vector of cut-offs to evaluate, in the order given.
#               Defaults to 0, 0.1, ..., 11.1.
#
# Returns:
#   A list with the threshold (*_loc) and value (*_val) of the best "nonspam"
#   predictive value (optimal_pos_*), the best "spam" predictive value
#   (optimal_neg_*) and the best accuracy (optimal_acc_*). Ties keep the first
#   threshold reached; a location stays 0 if no threshold scores above 0.
spamPredictor <- function(spam, thresholds = seq(0, 11.1, by = 0.1)) {
  classes <- c("nonspam", "spam")
  # Pin both factors to the same two levels so the confusion table is always
  # 2 x 2, even when a threshold puts every e-mail into a single class.
  actual <- factor(spam$type, levels = classes)
  n_steps <- length(thresholds)

  # Preallocate the per-threshold curves instead of growing them with c().
  ns_vals <- numeric(n_steps)
  s_vals <- numeric(n_steps)
  ns_sensitivity <- numeric(n_steps)
  ns_specificity <- numeric(n_steps)
  accuracy <- numeric(n_steps)

  optimal_pos <- 0
  optimal_neg <- 0
  optimal_loc <- 0
  pos <- 0
  neg <- 0
  optimal_acc <- 0

  # A rate is NaN when its denominator is empty (e.g. nothing predicted as
  # spam); such a value must neither win nor abort the comparison.
  beats <- function(candidate, best) !is.na(candidate) && candidate > best

  for (k in seq_along(thresholds)) {
    cutoff <- thresholds[k]
    prediction <- factor(
      ifelse(spam$your > cutoff, "spam", "nonspam"),
      levels = classes
    )
    # Rows are predictions, columns are true labels, both ordered
    # nonspam, spam; cells are proportions of all e-mails.
    x <- table(prediction, actual) / length(actual)

    ns <- x[1, 1] / (x[1, 1] + x[1, 2])
    s <- x[2, 2] / (x[2, 1] + x[2, 2])
    acc <- (x[1, 1] + x[2, 2]) / sum(x)

    ns_vals[k] <- ns
    s_vals[k] <- s
    ns_sensitivity[k] <- x[1, 1] / (x[1, 1] + x[2, 1])
    ns_specificity[k] <- x[2, 2] / (x[1, 2] + x[2, 2])
    accuracy[k] <- acc

    if (beats(ns, pos)) {
      pos <- ns
      optimal_pos <- cutoff
    }
    if (beats(s, neg)) {
      neg <- s
      optimal_neg <- cutoff
    }
    if (beats(acc, optimal_acc)) {
      optimal_acc <- acc
      optimal_loc <- cutoff
    }
  }

  plot(thresholds, ns_vals, xlab = "Number of 'your' occurrences", type = "l",
       col = 2, ylim = c(0, 1), main = "Positive and negative predictive power",
       ylab = "")
  lines(thresholds, s_vals, col = 3)
  lines(thresholds, accuracy, col = 4)
  legend(0.4, 1,
         c("Positive predictive value", "Negative predictive value", "Accuracy"),
         col = c(2, 3, 4), lty = 1, cex = 0.7)

  plot(1 - ns_specificity, ns_sensitivity, col = "green", type = "l",
       ylim = c(0, 1), xlab = "1 - specificity", ylab = "Sensitivity",
       main = "ROC")
  # Marks the y = x diagonal; points beyond the axis limits are not drawn.
  points(thresholds, thresholds)

  list(
    "optimal_pos_loc" = optimal_pos,
    "optimal_pos_val" = pos,
    "optimal_neg_loc" = optimal_neg,
    "optimal_neg_val" = neg,
    "optimal_acc_loc" = optimal_loc,
    "optimal_acc_val" = optimal_acc
  )
}
# Run the single-feature threshold search on the whole dataset and show it
results <- spamPredictor(spam)
results
## $optimal_pos_loc
## [1] 0
##
## $optimal_pos_val
## [1] 0.8406795
##
## $optimal_neg_loc
## [1] 10.8
##
## $optimal_neg_val
## [1] 1
##
## $optimal_acc_loc
## [1] 0.6
##
## $optimal_acc_val
## [1] 0.7570093
The above analysis shows that a single-variable classifier, using only the number of occurrences of ‘your’ in the email, achieves a spam vs. non-spam accuracy of 75.7% with a threshold value of 0.6. Note that this accuracy is measured on the same data that was used to choose the threshold.
Next, we use the caret package to explore several other more robust models in which all features of the spam dataset are used.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# 60% of the rows go to the training set; the remaining rows are held out
# for testing
inTrain <- createDataPartition(y = spam$type, p = 0.6, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
# Fit a GLM on every feature, silencing the warnings raised during fitting
modelFit_glm <- suppressWarnings(
  train(type ~ ., data = training, method = "glm")
)
modelFit_glm
## Generalized Linear Model
##
## 2761 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2761, 2761, 2761, 2761, 2761, 2761, ...
##
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.9178637 0.8268386 0.02124111 0.0488519
##
##
# Score the held-out rows with the GLM and compare against the true labels
predictions <- predict(modelFit_glm, newdata = testing)
confusionMatrix(data = predictions, reference = testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 1058 81
## spam 57 644
##
## Accuracy : 0.925
## 95% CI : (0.912, 0.9366)
## No Information Rate : 0.606
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.842
## Mcnemar's Test P-Value : 0.05024
##
## Sensitivity : 0.9489
## Specificity : 0.8883
## Pos Pred Value : 0.9289
## Neg Pred Value : 0.9187
## Prevalence : 0.6060
## Detection Rate : 0.5750
## Detection Prevalence : 0.6190
## Balanced Accuracy : 0.9186
##
## 'Positive' Class : nonspam
##
Using a generalized linear model with all features of the spam dataset, we achieve a bootstrap-resampled accuracy of 91.79% on the training data and an accuracy of 92.5% on the test set.
## Loading required package: gbm
## Loading required package: survival
## Loading required package: splines
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: parallel
## Loaded gbm 2.1
## Loading required package: plyr
## Stochastic Gradient Boosting
##
## 2761 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2761, 2761, 2761, 2761, 2761, 2761, ...
##
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa Accuracy SD
## 1 50 0.9165111 0.8213298 0.008362393
## 1 100 0.9323249 0.8564350 0.007510232
## 1 150 0.9364828 0.8655810 0.007193508
## 2 50 0.9313950 0.8545639 0.007997771
## 2 100 0.9392781 0.8716994 0.007063781
## 2 150 0.9425737 0.8787397 0.007805466
## 3 50 0.9357728 0.8640799 0.007935755
## 3 100 0.9430190 0.8796626 0.007880130
## 3 150 0.9452118 0.8843612 0.007048526
## Kappa SD
## 0.01833140
## 0.01588054
## 0.01491265
## 0.01676476
## 0.01463570
## 0.01618822
## 0.01654318
## 0.01650152
## 0.01483397
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3 and shrinkage = 0.1.
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 1070 70
## spam 45 655
##
## Accuracy : 0.9375
## 95% CI : (0.9255, 0.9481)
## No Information Rate : 0.606
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8683
## Mcnemar's Test P-Value : 0.02522
##
## Sensitivity : 0.9596
## Specificity : 0.9034
## Pos Pred Value : 0.9386
## Neg Pred Value : 0.9357
## Prevalence : 0.6060
## Detection Rate : 0.5815
## Detection Prevalence : 0.6196
## Balanced Accuracy : 0.9315
##
## 'Positive' Class : nonspam
##
Using stochastic gradient boosting with all features of the spam dataset, we achieve a bootstrap-resampled accuracy of 94.52% on the training data and an accuracy of 93.75% on the test set.
# Fit a linear-kernel SVM on every feature, silencing warnings from the fit
modelFit_svm <- suppressWarnings(
  train(type ~ ., data = training, method = "svmLinear")
)
modelFit_svm
## Support Vector Machines with Linear Kernel
##
## 2761 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2761, 2761, 2761, 2761, 2761, 2761, ...
##
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.9254654 0.8433606 0.005406397 0.01113216
##
## Tuning parameter 'C' was held constant at a value of 1
##
# Score the held-out rows with the SVM and compare against the true labels
predictions <- predict(modelFit_svm, newdata = testing)
confusionMatrix(data = predictions, reference = testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 1055 75
## spam 60 650
##
## Accuracy : 0.9266
## 95% CI : (0.9138, 0.9381)
## No Information Rate : 0.606
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8458
## Mcnemar's Test P-Value : 0.2282
##
## Sensitivity : 0.9462
## Specificity : 0.8966
## Pos Pred Value : 0.9336
## Neg Pred Value : 0.9155
## Prevalence : 0.6060
## Detection Rate : 0.5734
## Detection Prevalence : 0.6141
## Balanced Accuracy : 0.9214
##
## 'Positive' Class : nonspam
##
A support vector machine with a linear kernel gives a bootstrap-resampled accuracy of 92.55% on the training data and an accuracy of 92.66% on the test set.
We achieve an accuracy of over 91.9% with a random forest (results not shown).
# Fit a classification tree (rpart) on every feature
modelFit_rpart <- train(
  type ~ .,
  data = training,
  method = "rpart"
)
## Loading required package: rpart
modelFit_rpart
## CART
##
## 2761 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2761, 2761, 2761, 2761, 2761, 2761, ...
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.06985294 0.8215504 0.6221559 0.02249869 0.04696270
## 0.07444853 0.8168455 0.6142914 0.02407778 0.04864253
## 0.48437500 0.7023295 0.3051601 0.09515024 0.27662878
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.06985294.
# Score the held-out rows with the tree and compare against the true labels
predictions <- predict(modelFit_rpart, newdata = testing)
confusionMatrix(data = predictions, reference = testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 1041 267
## spam 74 458
##
## Accuracy : 0.8147
## 95% CI : (0.7962, 0.8322)
## No Information Rate : 0.606
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.593
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9336
## Specificity : 0.6317
## Pos Pred Value : 0.7959
## Neg Pred Value : 0.8609
## Prevalence : 0.6060
## Detection Rate : 0.5658
## Detection Prevalence : 0.7109
## Balanced Accuracy : 0.7827
##
## 'Positive' Class : nonspam
##
Using classification and regression trees (CART), we achieve a bootstrap-resampled accuracy of 82.16% on the training data and an accuracy of 81.47% on the test set.
# Fit a neural network on every feature and discard whatever the fit writes to
# the console. nullfile() names the platform's null device ("NUL" on Windows,
# "/dev/null" elsewhere), so no stray file called "NUL" is left behind on
# Unix-alikes.
capture.output(
  modelFit_nnet <- train(type ~ ., data = training, method = "nnet"),
  file = nullfile()
)
## Loading required package: nnet
modelFit_nnet
## Neural Network
##
## 2761 samples
## 57 predictors
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2761, 2761, 2761, 2761, 2761, 2761, ...
##
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa Accuracy SD Kappa SD
## 1 0e+00 0.8906584 0.7655596 0.066231176 0.17241292
## 1 1e-04 0.8393926 0.6243925 0.132960235 0.35363145
## 1 1e-01 0.9272355 0.8482397 0.008496003 0.01746492
## 3 0e+00 0.9225580 0.8385038 0.024232270 0.04805599
## 3 1e-04 0.9243352 0.8418648 0.011609575 0.02446403
## 3 1e-01 0.9313034 0.8562914 0.010084010 0.02100872
## 5 0e+00 0.9312791 0.8562088 0.008149483 0.01713756
## 5 1e-04 0.9295296 0.8525310 0.008558600 0.01772226
## 5 1e-01 0.9337628 0.8613387 0.008709115 0.01833183
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.1.
# Score the held-out rows with the network and compare against the true labels
predictions <- predict(modelFit_nnet, newdata = testing)
confusionMatrix(data = predictions, reference = testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 1061 55
## spam 54 670
##
## Accuracy : 0.9408
## 95% CI : (0.929, 0.9511)
## No Information Rate : 0.606
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8759
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9516
## Specificity : 0.9241
## Pos Pred Value : 0.9507
## Neg Pred Value : 0.9254
## Prevalence : 0.6060
## Detection Rate : 0.5766
## Detection Prevalence : 0.6065
## Balanced Accuracy : 0.9379
##
## 'Positive' Class : nonspam
##
Using a neural network with the default of 100 iterations, we get a bootstrap-resampled accuracy of 93.38% on the training data and an accuracy of 94.08% on the test set.