library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.3
library(pROC)
## Warning: package 'pROC' was built under R version 3.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Load the raw Spambase data. The file has no header row, so the columns
# receive R's default names (V1, V2, ...) until they are renamed.
spambase <- read.csv(file = "spambase.data.txt", header = FALSE)
# Preview the first six rows
head(spambase, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
## 1 0.00 0.64 0.64 0 0.32 0.00 0.00 0.00 0.00 0.00 0.00 0.64 0.00 0.00 0.00
## 2 0.21 0.28 0.50 0 0.14 0.28 0.21 0.07 0.00 0.94 0.21 0.79 0.65 0.21 0.14
## 3 0.06 0.00 0.71 0 1.23 0.19 0.19 0.12 0.64 0.25 0.38 0.45 0.12 0.00 1.75
## 4 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31 0.31 0.31 0.00 0.00
## 5 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31 0.31 0.31 0.00 0.00
## 6 0.00 0.00 0.00 0 1.85 0.00 0.00 1.85 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31
## 1 0.32 0.00 1.29 1.93 0.00 0.96 0 0.00 0.00 0 0 0 0 0 0 0
## 2 0.14 0.07 0.28 3.47 0.00 1.59 0 0.43 0.43 0 0 0 0 0 0 0
## 3 0.06 0.06 1.03 1.36 0.32 0.51 0 1.16 0.06 0 0 0 0 0 0 0
## 4 0.31 0.00 0.00 3.18 0.00 0.31 0 0.00 0.00 0 0 0 0 0 0 0
## 5 0.31 0.00 0.00 3.18 0.00 0.31 0 0.00 0.00 0 0 0 0 0 0 0
## 6 0.00 0.00 0.00 0.00 0.00 0.00 0 0.00 0.00 0 0 0 0 0 0 0
## V32 V33 V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48
## 1 0 0 0 0 0 0.00 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0
## 2 0 0 0 0 0 0.07 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0
## 3 0 0 0 0 0 0.00 0 0 0.06 0 0 0.12 0 0.06 0.06 0 0
## 4 0 0 0 0 0 0.00 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0
## 5 0 0 0 0 0 0.00 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0
## 6 0 0 0 0 0 0.00 0 0 0.00 0 0 0.00 0 0.00 0.00 0 0
## V49 V50 V51 V52 V53 V54 V55 V56 V57 V58
## 1 0.00 0.000 0 0.778 0.000 0.000 3.756 61 278 1
## 2 0.00 0.132 0 0.372 0.180 0.048 5.114 101 1028 1
## 3 0.01 0.143 0 0.276 0.184 0.010 9.821 485 2259 1
## 4 0.00 0.137 0 0.137 0.000 0.000 3.537 40 191 1
## 5 0.00 0.135 0 0.135 0.000 0.000 3.537 40 191 1
## 6 0.00 0.223 0 0.000 0.000 0.000 3.000 15 54 1
Assign column names
# Build the 58 column names from their three naming families plus the
# response, instead of spelling every name out in full.
# Words whose frequency is recorded in the 48 "word_freq_*" columns
word_terms <- c(
  "make", "address", "all", "3d", "our", "over", "remove", "internet",
  "order", "mail", "receive", "will", "people", "report", "addresses",
  "free", "business", "email", "you", "credit", "your", "font", "000",
  "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
  "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs",
  "meeting", "original", "project", "re", "edu", "table", "conference"
)
# Characters whose frequency is recorded in the 6 "char_freq_ch*" columns
char_symbols <- c(";", "(", "[", "!", "$", "#")
# Statistics recorded in the 3 "capital_run_length_*" columns
capital_stats <- c("average", "longest", "total")
colnames(spambase) <- c(
  paste0("word_freq_", word_terms),
  paste0("char_freq_ch", char_symbols),
  paste0("capital_run_length_", capital_stats),
  "spam"  # response column (last column of the file)
)
# Preview the first six rows with the new column names
head(spambase, n = 6L)
## word_freq_make word_freq_address word_freq_all word_freq_3d
## 1 0.00 0.64 0.64 0
## 2 0.21 0.28 0.50 0
## 3 0.06 0.00 0.71 0
## 4 0.00 0.00 0.00 0
## 5 0.00 0.00 0.00 0
## 6 0.00 0.00 0.00 0
## word_freq_our word_freq_over word_freq_remove word_freq_internet
## 1 0.32 0.00 0.00 0.00
## 2 0.14 0.28 0.21 0.07
## 3 1.23 0.19 0.19 0.12
## 4 0.63 0.00 0.31 0.63
## 5 0.63 0.00 0.31 0.63
## 6 1.85 0.00 0.00 1.85
## word_freq_order word_freq_mail word_freq_receive word_freq_will
## 1 0.00 0.00 0.00 0.64
## 2 0.00 0.94 0.21 0.79
## 3 0.64 0.25 0.38 0.45
## 4 0.31 0.63 0.31 0.31
## 5 0.31 0.63 0.31 0.31
## 6 0.00 0.00 0.00 0.00
## word_freq_people word_freq_report word_freq_addresses word_freq_free
## 1 0.00 0.00 0.00 0.32
## 2 0.65 0.21 0.14 0.14
## 3 0.12 0.00 1.75 0.06
## 4 0.31 0.00 0.00 0.31
## 5 0.31 0.00 0.00 0.31
## 6 0.00 0.00 0.00 0.00
## word_freq_business word_freq_email word_freq_you word_freq_credit
## 1 0.00 1.29 1.93 0.00
## 2 0.07 0.28 3.47 0.00
## 3 0.06 1.03 1.36 0.32
## 4 0.00 0.00 3.18 0.00
## 5 0.00 0.00 3.18 0.00
## 6 0.00 0.00 0.00 0.00
## word_freq_your word_freq_font word_freq_000 word_freq_money word_freq_hp
## 1 0.96 0 0.00 0.00 0
## 2 1.59 0 0.43 0.43 0
## 3 0.51 0 1.16 0.06 0
## 4 0.31 0 0.00 0.00 0
## 5 0.31 0 0.00 0.00 0
## 6 0.00 0 0.00 0.00 0
## word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 1 0 0 0 0.00
## 2 0 0 0 0.07
## 3 0 0 0 0.00
## 4 0 0 0 0.00
## 5 0 0 0 0.00
## 6 0 0 0 0.00
## word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 1 0 0 0.00 0
## 2 0 0 0.00 0
## 3 0 0 0.06 0
## 4 0 0 0.00 0
## 5 0 0 0.00 0
## 6 0 0 0.00 0
## word_freq_meeting word_freq_original word_freq_project word_freq_re
## 1 0 0.00 0 0.00
## 2 0 0.00 0 0.00
## 3 0 0.12 0 0.06
## 4 0 0.00 0 0.00
## 5 0 0.00 0 0.00
## 6 0 0.00 0 0.00
## word_freq_edu word_freq_table word_freq_conference char_freq_ch;
## 1 0.00 0 0 0.00
## 2 0.00 0 0 0.00
## 3 0.06 0 0 0.01
## 4 0.00 0 0 0.00
## 5 0.00 0 0 0.00
## 6 0.00 0 0 0.00
## char_freq_ch( char_freq_ch[ char_freq_ch! char_freq_ch$ char_freq_ch#
## 1 0.000 0 0.778 0.000 0.000
## 2 0.132 0 0.372 0.180 0.048
## 3 0.143 0 0.276 0.184 0.010
## 4 0.137 0 0.137 0.000 0.000
## 5 0.135 0 0.135 0.000 0.000
## 6 0.223 0 0.000 0.000 0.000
## capital_run_length_average capital_run_length_longest
## 1 3.756 61
## 2 5.114 101
## 3 9.821 485
## 4 3.537 40
## 5 3.537 40
## 6 3.000 15
## capital_run_length_total spam
## 1 278 1
## 2 1028 1
## 3 2259 1
## 4 191 1
## 5 191 1
## 6 54 1
# Class balance: count the rows for each value of the `spam` column and
# draw the counts as a bar chart.
spam_counts <- table(spambase$spam)
barplot(spam_counts, main = "Spam vs Non Spam")
In the data, most of the emails are non-spam.
# Fix the RNG seed so the shuffle and the train/test split are reproducible
set.seed(123)
# Randomly permute the row order of the data set
shuffled_rows <- sample.int(nrow(spambase))
spambase <- spambase[shuffled_rows, ]
# Recode the response as a factor so the models are fitted as classifiers
spambase[["spam"]] <- as.factor(spambase[["spam"]])
# Partition on the response: 70% of the rows for training, the rest for testing
index <- createDataPartition(y = spambase[["spam"]], p = 0.70, list = FALSE)
training <- spambase[index, ]
test <- spambase[-index, ]
I’ve used SVM, KNN and LDA for spambase classification.
# Resampling scheme shared by all three models: 10-fold cross-validation
# repeated 10 times.
train_ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
# Fit a linear SVM, LDA and k-nearest-neighbours classifier on the training
# set, each using every other column as a predictor of `spam`.
model_svm <- train(spam ~ ., data = training, method = "svmLinear",
                   trControl = train_ctrl)
model_lda <- train(spam ~ ., data = training, method = "lda",
                   trControl = train_ctrl)
# NOTE(review): kNN is distance-based and the predictors are on very different
# scales (e.g. capital_run_length_total vs. the word frequencies); consider
# adding preProcess = c("center", "scale") here. Left unchanged so the results
# printed below stay valid.
model_knn <- train(spam ~ ., data = training, method = "knn",
                   trControl = train_ctrl)
# Predict on the held-out test set. The response is dropped by name rather
# than by the hard-coded position 58, so this keeps working if the column
# order or count ever changes.
test_predictors <- test[, setdiff(names(test), "spam"), drop = FALSE]
pred_svm <- predict(model_svm, test_predictors)
pred_lda <- predict(model_lda, test_predictors)
pred_knn <- predict(model_knn, test_predictors)
# Test-set confusion matrix for the SVM: predicted classes against the true
# labels. caret reports class "0" as the 'Positive' class in the output below.
print("SVM confusion matrix")
## [1] "SVM confusion matrix"
confusionMatrix(data = pred_svm, reference = test$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 788 60
## 1 48 483
##
## Accuracy : 0.9217
## 95% CI : (0.9062, 0.9353)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8353
##
## Mcnemar's Test P-Value : 0.2898
##
## Sensitivity : 0.9426
## Specificity : 0.8895
## Pos Pred Value : 0.9292
## Neg Pred Value : 0.9096
## Prevalence : 0.6062
## Detection Rate : 0.5714
## Detection Prevalence : 0.6149
## Balanced Accuracy : 0.9160
##
## 'Positive' Class : 0
##
# Test-set confusion matrix for LDA: predicted classes against the true
# labels. caret reports class "0" as the 'Positive' class in the output below.
print("LDA confusion matrix")
## [1] "LDA confusion matrix"
confusionMatrix(data = pred_lda, reference = test$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 781 111
## 1 55 432
##
## Accuracy : 0.8796
## 95% CI : (0.8613, 0.8963)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7432
##
## Mcnemar's Test P-Value : 1.965e-05
##
## Sensitivity : 0.9342
## Specificity : 0.7956
## Pos Pred Value : 0.8756
## Neg Pred Value : 0.8871
## Prevalence : 0.6062
## Detection Rate : 0.5664
## Detection Prevalence : 0.6468
## Balanced Accuracy : 0.8649
##
## 'Positive' Class : 0
##
# Test-set confusion matrix for kNN: predicted classes against the true
# labels. caret reports class "0" as the 'Positive' class in the output below.
# (Label typo "confuxion" corrected.)
paste("KNN confusion matrix")
## [1] "KNN confusion matrix"
confusionMatrix(pred_knn, test$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 718 148
## 1 118 395
##
## Accuracy : 0.8071
## 95% CI : (0.7853, 0.8276)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.592
##
## Mcnemar's Test P-Value : 0.07539
##
## Sensitivity : 0.8589
## Specificity : 0.7274
## Pos Pred Value : 0.8291
## Neg Pred Value : 0.7700
## Prevalence : 0.6062
## Detection Rate : 0.5207
## Detection Prevalence : 0.6280
## Balanced Accuracy : 0.7931
##
## 'Positive' Class : 0
##
Out of the SVM, LDA and KNN models, SVM has the best test performance, with a test accuracy of 0.9217.
# ROC Curve
# ROC analysis on the held-out test set.
# pROC::roc() expects the true labels as `response` and the classifier output
# as `predictor`; the earlier version passed them the other way round (and
# left pred_svm unconverted), so it reported (PPV + NPV) / 2 instead of an AUC.
# The predictor is the hard class label coded numerically (1 = "0", 2 = "1"),
# so `levels` and `direction` are given explicitly: "0" is the control class,
# "1" (spam) the case class, and cases carry the higher predictor value.
# NOTE(review): hard labels give a single operating point per model. For a
# full ROC curve, predict class probabilities instead (this needs
# classProbs = TRUE in trainControl and syntactically valid factor levels).
true_class <- test$spam
roc_svm <- roc(response = true_class, predictor = as.numeric(pred_svm),
               levels = c("0", "1"), direction = "<")
roc_lda <- roc(response = true_class, predictor = as.numeric(pred_lda),
               levels = c("0", "1"), direction = "<")
roc_knn <- roc(response = true_class, predictor = as.numeric(pred_knn),
               levels = c("0", "1"), direction = "<")
# Area under the curve
# With a single operating point the AUC equals (sensitivity + specificity) / 2,
# i.e. the "Balanced Accuracy" in the confusion matrices above. The values
# shown below are derived from those matrices; re-run to confirm.
roc_svm$auc
## Area under the curve: 0.916
roc_lda$auc
## Area under the curve: 0.8649
roc_knn$auc
## Area under the curve: 0.7931
# Overlay the three curves on one plot, distinguished by colour and line type
plot(roc_svm, col = "red", lty = 1, main = "ROC curve for 3 model")
plot(roc_lda, col = "green", lty = 2, add = TRUE)
plot(roc_knn, col = "blue", lty = 3, add = TRUE)
# Legend goes in the bottom-right corner, away from where the curves run
legend("bottomright", legend = c("SVM", "LDA", "KNN"),
       col = c("red", "green", "blue"), lty = 1:3)
From the ROC curves we can see that SVM has the highest prediction performance.