In this document classification project, I use the labeled data obtained from https://spamassassin.apache.org/old/publiccorpus/ to train and test prediction models built with different classification algorithms, and then evaluate which algorithm(s) most accurately predict whether a new document (email) is spam or ham.

Load the R Packages

library(tm)
library(caret)
library(arm)
library(e1071)
library(minqa)
library(ranger)
library(SnowballC)
library(plyr)
library(dplyr)

Load the Data

# Build one corpus per class directly from the ./ham and ./spam directories.
ham <- VCorpus(DirSource("./ham"))
spam <- VCorpus(DirSource("./spam"))

# Size of the ham corpus.
length(ham)
## [1] 2551
# Size of the spam corpus.
length(spam)
## [1] 1397

Data Preparation and Preprocessing

Document Term Matrix

# Run the shared text-cleaning pipeline on a corpus and return its pruned
# document term matrix.
#
# corpus: a tm corpus holding the raw emails of one class.
# sparse: sparsity threshold handed to removeSparseTerms(); 0.95 limits the
#         matrix to terms appearing in at least 5% of documents.
clean_and_vectorize <- function(corpus, sparse = 0.95) {
  # Lowercase the text, then re-wrap every document as a PlainTextDocument.
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, PlainTextDocument)
  # Drop punctuation, English stopwords and digits, in that order.
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("en"))
  corpus <- tm_map(corpus, removeNumbers)
  # Collapse runs of whitespace left behind by the removals above.
  corpus <- tm_map(corpus, stripWhitespace)
  # Reduce every word to its stem.
  corpus <- tm_map(corpus, stemDocument)
  # Count terms per document and discard the terms that rarely appear.
  removeSparseTerms(DocumentTermMatrix(corpus), sparse)
}

# Fresh ham and spam corpora that will become the document term matrices.
ham_tdm <- VCorpus(VectorSource(ham))
spam_tdm <- VCorpus(VectorSource(spam))

# Keep a random sample of 250 documents per class; the full corpora are too
# large to classify with the computing power available on this PC.
set.seed(123)
ham_tdm <- sample(ham_tdm, 250)
set.seed(456)
spam_tdm <- sample(spam_tdm, 250)

# NOTE(review): each class gets its own vocabulary and its own sparse-term
# pruning here. If the two matrices are merged later, which columns exist may
# itself reveal the class - confirm whether a single matrix over both corpora
# was intended.
ham_tdm <- clean_and_vectorize(ham_tdm)
spam_tdm <- clean_and_vectorize(spam_tdm)

Dataset Transformation

Convert the spam and ham document term matrices to a single data frame and assign a classification label (factor) to each document.

# Convert a document term matrix to a plain matrix and append the class label
# as a trailing column named "spamorham".
append_label <- function(dtm, label) {
  labelled <- cbind(as.matrix(dtm), label)
  colnames(labelled)[ncol(labelled)] <- "spamorham"
  labelled
}

# Label "0" for ham and "1" for spam.
ham_trfm <- append_label(ham_tdm, 0)
spam_trfm <- append_label(spam_tdm, 1)

# Stack both labelled matrices; rbind.fill.matrix aligns them by column name.
spamham_trfm <- rbind.fill.matrix(ham_trfm, spam_trfm)

# rbind.fill.matrix leaves NA wherever a term column exists in only one of the
# two matrices. Such a term never occurred in those documents, so its
# frequency is 0.
spamham_trfm[is.na(spamham_trfm)] <- 0

# Convert the combined matrix to a data frame and make the 0/1 label a factor.
spamham_trfm <- as.data.frame(spamham_trfm)
spamham_trfm$spamorham <- as.factor(spamham_trfm$spamorham)

head(spamham_trfm, n = 1)
##   accept actual add address ago agre allow almost alreadi also although
## 1      0      0   0       0   0    0     0      0       1    0        0
##   alway america anoth anyon anyth around ask aug august authnlegwnnet away
## 1     0       1     0     0     0      0   0   0      0             0    0
##   back bad base begin believ best better big bit box build bulk busi call
## 1    0   0    0     0      0    0      0   0   0   0     0    0    0    0
##   can cant case cell certain chang charsetiso charsetusascii check claim
## 1   0    0    0    0       0     0          0              0     0     0
##   clean clear close code come comment communic compani complet comput
## 1     0     0     0    0    0       0        0       0       0      0
##   connect contact contentdisposit contenttransferencod contenttyp
## 1       0       0               0                    0          1
##   copyright countri cours current daili data date day debian decid
## 1         0       0     0       0     0    0    2   0      0     0
##   deliveredto develop didnt differ discuss doesnt dogmaslashnullorg done
## 1           1       0     0      0       0      0                 2    0
##   dont easi edit edt egp egwn egwnnet els email encodingutf end engin
## 1    0    0    0   0   0    0       0   0     0           1   0     0
##   enough enus error errorsto esmtp etc even ever everi exact exampl
## 1      0    0     0        0     2   0    0    0     0     0      0
##   examplecom except exchang execut exim exist exmh exmhus
## 1          0      0       0      0    0     0    0      0
##   exmhusersadminexamplecom exmhusersadminredhatcom exmhusersexamplecom
## 1                        0                       0                   0
##   exmhuserslistmanexamplecom exmhuserslistmanredhatcom exmhusersredhatcom
## 1                          0                         0                  0
##   experi express fact fail fall fals far feel fetchmail figur file find
## 1      0       0    0    0    0    0   0    0         1     0    0    0
##   first fix folk follow forc forg fork forkadminxentcom forkexamplecom
## 1     0   0    0      0    0    0    0                0              0
##   forkxentcom formatflow forteanaowneryahoogroupscom
## 1           0          0                           0
##   forteanaunsubscribeegroupscom found free freshrpm fri friend gari
## 1                             0     0    0        0   0      0    0
##   general get give gmt gnulinux good got govern great group grow guy half
## 1       0   0    0   0        0    0   0      0     0     0    0   0    0
##   hand happen hard head helo helouswsflistsourceforgenet help high hit
## 1    0      0    0    0    0                           0    0    0   1
##   hold home howev httpdocsyahoocominfoterm
## 1    0    0     0                        0
##   httplistsfreshrpmsnetmailmanlistinforpmlist
## 1                                           0
##   httplistsfreshrpmsnetmailmanlistinforpmzzzlist
## 1                                              0
##   httplistsfreshrpmsnetpipermailrpmzzzlist
## 1                                        0
##   httpsexamplesourceforgenetlistslistinfospamassassintalk
## 1                                                       0
##   httpslistmanexamplecommailmanlistinfoexmhus
## 1                                           0
##   httpslistmanexamplecommailmanprivateexmhus
## 1                                          0
##   httpslistmanredhatcommailmanlistinfoexmhus
## 1                                          0
##   httpslistssourceforgenetlistslistinfospamassassintalk
## 1                                                     0
##   httpwwwnewsisfreecomclick httpxentcommailmanlistinfofork
## 1                         1                              0
##   httpxentcompipermailfork idea ill imap import includ inform inreplyto
## 1                        0    0   0    1      0      0      0         0
##   instal instead interest internet intmxcorpexamplecom intmxcorpredhatcom
## 1      0       0        0        0                   0                  0
##   invok issu ist ive jalapeno jmasonorg jmexmhjmasonorg jmjmasonorg
## 1     0    0   2   0        2         1               0           1
##   jmlocalhost jmrpmjmasonorg jmsajmasonorg john just keep key khare kind
## 1           2              0             0    0    0    0   0     0    0
##   know lairxentcom larg last lawrenc least less let life like limit line
## 1    0           0    0    0       0     0    0   0    0    0     0    0
##   link linux list listarch listhelp listid listmanexamplecom
## 1    0     0    0        0        0      0                 0
##   listmanredhatcom listpost listsubscrib listunsubscrib littl live
## 1                0        0            0              0     0    0
##   localhost localhostlocaldomain log long look lot machin made mail
## 1         3                    0   0    0    0   0      0    0    0
##   mailinglist maillocalhost mailtoexmhusersexamplecom
## 1           0             0                         0
##   mailtoexmhusersrequestexamplecomsubjecthelp
## 1                                           0
##   mailtoexmhusersrequestredhatcomsubjectsubscrib
## 1                                              0
##   mailtoexmhusersrequestredhatcomsubjectunsubscrib mailtoforkexamplecom
## 1                                                0                    0
##   mailtoforkrequestxentcomsubjecthelp
## 1                                   0
##   mailtoforkrequestxentcomsubjectsubscrib
## 1                                       0
##   mailtoforkrequestxentcomsubjectunsubscrib
## 1                                         0
##   mailtorpmlistrequestfreshrpmsnetsubjectsubscrib
## 1                                               0
##   mailtorpmlistrequestfreshrpmsnetsubjectunsubscrib
## 1                                                 0
##   mailtorpmzzzlistfreshrpmsnet
## 1                            0
##   mailtorpmzzzlistrequestfreshrpmsnetsubjecthelp
## 1                                              0
##   mailtospamassassintalkexamplesourceforgenet
## 1                                           0
##   mailtospamassassintalkrequestexamplesourceforgenetsubjecthelp
## 1                                                             0
##   mailtospamassassintalkrequestlistssourceforgenetsubjectsubscrib
## 1                                                               0
##   mailtospamassassintalkrequestlistssourceforgenetsubjectunsubscrib
## 1                                                                 0
##   mailtozzzzteanaunsubscribeyahoogroupscom maintain make man manag mani
## 1                                        0        0    0   0     0    0
##   mark market matthia may mayb mean messag messageid mgrpscdyahoocom
## 1    0      0       0   0    0    0      0         1               0
##   microsoft might mimeol mimevers miss mon month mozilla mtagrpscdyahoocom
## 1         0     0      0        0    0   0     0       0                 0
##   much must mxexamplecom mxredhatcom name need network never new news
## 1    0    0            0           0    0    0       0     0   0    0
##   ngrpscdyahoocom night nnfmp normal noth notic now nsegwnnet number oct
## 1               0     0     0      0    0     0   0         0      0   5
##   offic old one open order organ origin outlook packag page part
## 1     0   0   0    0     0     0      0       0      0    0    0
##   particular past pdt peopl perhap person pgp phobo
## 1          0    0   0     1      0      0   0     0
##   phoboslabsnetnoteinccom phone pick place pleas point polit possibl post
## 1                       0     0    0     0     0     0     0       0    0
##   postfix power practic preced prefer pretti probabl problem process
## 1       1     0       0      0      0      0       0       0       0
##   produc provid public put qmail qmqp question quit quot rather read real
## 1      0      0      0   0     0    0        0    0    0      0    0    0
##   realli reason receiv recent record refer relat releas remov replyto
## 1      0      0      3      0      0     0     0      0     0       0
##   report requir reserv result returnpath right robert rohit rpm rpmlist
## 1      0      1      0      0          1     0      0     0   0       0
##   rpmlistadminfreshrpmsnet rpmlistfreshrpmsnet rpmzzzlistadminfreshrpmsnet
## 1                        0                   0                           0
##   rpmzzzlistfreshrpmsnet rssfeedsexamplecom rssfeedsjmasonorg run said sat
## 1                      0                  2                 1   0    0   0
##   satalk save say second see seem seen sell send sender sent
## 1      0    0   0      0   0    0    0    0    0      0    0
##   senttozzzzexamplecomreturnsgroupsyahoocom sep septemb server servic set
## 1                                         0   0       0      0      0   0
##   sever sfnet show sign signatur similar sinc singledrop site smtp softwar
## 1     0     0    0    0        0       0    0          1    0    0       0
##   someon someth sort sourc spam spamassassin spamassassintalk
## 1      0      0    0     0    0            0                0
##   spamassassintalkadminexamplesourceforgenet
## 1                                          0
##   spamassassintalkadminlistssourceforgenet
## 1                                        0
##   spamassassintalkexamplesourceforgenet
## 1                                     0
##   spamassassintalklistssourceforgenet sponsor start state still stop stori
## 1                                   0       0     0     0     0    0     0
##   stuff subject suggest sun suppli support sure system take talk tell test
## 1     0       1       0   0      0       0    0      0    0    0    0    0
##   testsawl testsawlemailattributioninreptoknownmailinglist textplain thank
## 1        0                                               0         1     0
##   that there thing think though thought three thu time today told top tri
## 1    0     0     0     0      0       0     0   0    0     0    0   0   0
##   true tue turn two uid understand unknown unsubscrib updat upon url use
## 1    0   5    0   0   0          0       0          0     0    0   1   0
##   user userag userid usual uswsffwsourceforgenet uswsflistbsourceforgenet
## 1    0      0      0     0                     0                        0
##   uswsflistsourceforgenet vamm version versioncv via want war way wed week
## 1                       0    0       0         1   0    0   0   0   0    0
##   well will window without wonder word work world write wrote
## 1    0    0      0       0      0    0    0     0     0     0
##   xacceptlanguag xapparentlyto xbeenther xegroupsreturn xentcom xloop
## 1              0             0         0              0       0     0
##   xmailer xmailmanvers xmailscann xmimeol xmsmailprior xoriginald
## 1       0            0          0       0            0          0
##   xprioriti xpyzor xsender xspamlevel xspamstatus xurl xyahooprofil yahoo
## 1         0      0       0          1           1    0            0     0
##   year yet youll your yyyyexamplecom yyyylocalhostexamplecom
## 1    0   0     0    0              1                       1
##   yyyylocalhostnetnoteinccom zzzzexamplecom zzzzlocalhost
## 1                          0              0             0
##   zzzzlocalhostexamplecom zzzzteana zzzzteanayahoogroupscom spamorham abl
## 1                       0         0                       0         0   0
##   access account act addit advantag advertis afford age aligncent
## 1      0       0   0     0        0        0      0   0         0
##   aligncenterbfont aligncenterfont aligndcent aligndcenterfont alignleft
## 1                0               0          0                0         0
##   amp approv area arial avail benefit bfont bgcolor bgcolord
## 1   0      0    0     0     0       0     0       0        0
##   bgcolordffffff bgcolorffffff bill bodi border bordercolor bordercolord
## 1              0             0    0    0      0           0            0
##   borderd brand brfont buy bythinkgeek card cash cdt cellpad cellpaddingd
## 1       0     0      0   0           0    0    0   0       0            0
##   cellspac cellspacingd center charg charsetdiso charsetwindow
## 1        0            0      0     0           0             0
##   cipheredhdssdescbcsha citi click client color colord colordff
## 1                     0    0     0      0     0      0        0
##   colordffffff colorff colspan colspand confidenti contain
## 1            0       0       0        0          0       0
##   contentdtexthtml contenttexthtml control copi cost cpunk
## 1                0               0       0    0    0     0
##   cpunkseinsteinsszcom cpunkshqpronsnet cpunkslocalhost cpunksmindernet
## 1                    0                0               0               0
##   cpunkswastemindernet credit custom cypherpunksdspronsnet
## 1                    0      0      0                     0
##   cypherpunkseinsteinsszcom cypherpunksforwarddspronsnet cypherpunksoutgo
## 1                         0                            0                0
##   cypherpunkssszcom daemonwast deal dear deathtospamdeathtospamdeathtospam
## 1                 0          0    0    0                                 0
##   debt deliveryd direct div doctyp dollar download drive earn easili
## 1    0         0      0   0      0      0        0     0    0      0
##   effect einsteinsszcom eir english enter envelopefrom everyth expens face
## 1      0              0   0       0     0            0       0      0    0
##   faceari facedari facedverdana faceverdana famili fast fastest fax featur
## 1       0        0            0           0      0    0       0   0      0
##   fee fill final financi font fontfont fontp forkspamassassintaintorg form
## 1   0    0     0       0    0        0     0                        0    0
##   format french full futur geek german guarante heaven height heightd
## 1      0      0    0     0    0      0        0      0      0       0
##   helvetica herea host hour hqpronsnet html httpeinsteinsszcomcdr
## 1         0     0    0    0          0    0                     0
##   httpequivcontenttyp httpequivdcontenttyp
## 1                   0                    0
##   httpslistssourceforgenetlistslistinfospamassassinsight
## 1                                                      0
##   httpthinkgeekcomsf hundr img immedi improv incom increas independ
## 1                  0     0   0      0      0     0       0        0
##   industri info input instruct insur intend intern invest jmnetnoteinccom
## 1        0    0     0        0     0      0      0      0               0
##   jul jun lead learn legal level licens linuxmidrangecom listsszcom loan
## 1   0   0    0     0     0     0      0                0          0    0
##   local locustmindernet longer lose loss love low lowest mailnetnoteinccom
## 1     0               0      0    0    0    0   0      0                 0
##   mailtoforkspamassassintaintorg mailwebnotenet major
## 1                              0              0     0
##   mandarklabsnetnoteinccom matter mdomlocalhost meet meta methoddpost
## 1                        0      0             0    0    0           0
##   million mime minut money mortgag multipart multipartaltern namedgener
## 1       0    0     0     0       0         0               0          0
##   natur nbsp newslett next none offer onlin oper opportun optin owner
## 1     0    0        0    0    0     0     0    0        0     0     0
##   ownercypherpunkseinsteinsszcom paid parti pay payment pbfont per pfont
## 1                              0    0     0   0       0      0   0     0
##   plan pnbspp polici pop present price privaci privat product profession
## 1    0      0      0   0       0     0       0      0       0          0
##   profit program promot protect proven purchas qualiti quick quotedprint
## 1      0       0      0       0      0       0       0     0           0
##   rate reach readi regard regist repli request research respons return
## 1    0     0     0      0      0     0       0        0       0      0
##   risk roman russian sale san sansserif search section secur share ship
## 1    0     0       0    0   0         0      0       0     0     0    0
##   simpl simpli sincer size smtpmailyahoocom smtpsvc spamassassinsight
## 1     0      0      0    0                0       0                 0
##   spamassassinsightingslistssourceforgenet spanish special sszcom style
## 1                                        0       0       0      0     0
##   submit subscrib success tabl target tax tbodi tdfont tdimg technolog
## 1      0        0       0    0      0   0     0      0     0         0
##   term text textd texthtml thousand tool topmargin total track trade
## 1    0    0     0        0        0    0         0     0     0     0
##   transitionalen trust type typedsubmit unit utc valigndtop valigntop valu
## 1              0     0    0           0    0   0          0         0    0
##   valuedsubmit verifyno versiontlsvsslv visit wait wastemindernet watch
## 1            0        0               0     0    0              0     0
##   wcdtd web webmasterefii webnotenet websit weight welcom width widthd
## 1     0   0             0          0      0      0      0     0      0
##   wish within xacceptablelanguag xkeyword xlistadmin xmailinglist
## 1    0      0                  0        0          0            0
##   xoriginalarrivaltim xstatus xunsubscriptioninfo yyyynetnoteinccom
## 1                   0       0                   0                 0
##   yyyyspamassassintaintorg
## 1                        0

Split the Dataset into Training and Testing Set

# Split the dataset into a 90% training set and a 10% testing set.
# The seed is fixed so the same rows are drawn on every run.
set.seed(567)
n_docs <- nrow(spamham_trfm)
# seq_len() stays empty for a zero-row dataset (1:0 would yield c(1, 0)), and
# floor() makes the requested sample size an explicit whole number.
spamham_trfm_index <- sample(seq_len(n_docs), floor(0.9 * n_docs))
# Sampled rows form the training set; every remaining row is held out.
spamham_train <- spamham_trfm[spamham_trfm_index, ]
spamham_test <- spamham_trfm[-spamham_trfm_index, ]

Train Prediction Model

Generalized Linear Model

# Fit a generalized linear model (caret method "glm"): the class label is the
# response and every other column of the training set is a predictor.
spamham_gl_mdl <- train(
  spamorham ~ .,
  data = spamham_train,
  method = "glm"
)
# Print the fitted model together with its resampling results.
spamham_gl_mdl
## Generalized Linear Model 
## 
## 450 samples
## 851 predictors
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 450, 450, 450, 450, 450, 450, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.6163629  0.2319075

Bayesian GLM

# Fit a Bayesian generalized linear model (caret method "bayesglm") on the
# same formula and training set.
spamham_by_mdl <- train(
  spamorham ~ .,
  data = spamham_train,
  method = "bayesglm"
)
# Print the fitted model together with its resampling results.
spamham_by_mdl
## Bayesian Generalized Linear Model 
## 
## 450 samples
## 851 predictors
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 450, 450, 450, 450, 450, 450, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9977907  0.9955553

Random Forest

# Fit a random forest (caret method "ranger") on the same formula and
# training set.
spamham_rf_mdl <- train(
  spamorham ~ .,
  data = spamham_train,
  method = "ranger"
)
# Print the fitted model together with its tuning and resampling results.
spamham_rf_mdl
## Random Forest 
## 
## 450 samples
## 851 predictors
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 450, 450, 450, 450, 450, 450, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##     2   gini        0.9802262  0.9603420
##     2   extratrees  0.9667122  0.9332207
##    41   gini        0.9978281  0.9956427
##    41   extratrees  0.9966136  0.9932103
##   851   gini        0.9964465  0.9928786
##   851   extratrees  0.9954706  0.9909185
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 41, splitrule = gini
##  and min.node.size = 1.

Test Prediction Model

Generalized Linear Model

# Classify the held-out test documents with the generalized linear model.
spamham_gl_pred <- predict(spamham_gl_mdl, newdata = spamham_test)
spamham_gl_pred
##  [1] 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 1 0 1 1
## [36] 0 1 1 1 1 1 1 0 0 1 0 1 1 1 1
## Levels: 0 1
# Cross-tabulate the predicted labels against the true test labels.
confusionMatrix(
  data = spamham_gl_pred,
  reference = spamham_test$spamorham
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 19  6
##          1  7 18
##                                           
##                Accuracy : 0.74            
##                  95% CI : (0.5966, 0.8537)
##     No Information Rate : 0.52            
##     P-Value [Acc > NIR] : 0.001221        
##                                           
##                   Kappa : 0.48            
##                                           
##  Mcnemar's Test P-Value : 1.000000        
##                                           
##             Sensitivity : 0.7308          
##             Specificity : 0.7500          
##          Pos Pred Value : 0.7600          
##          Neg Pred Value : 0.7200          
##              Prevalence : 0.5200          
##          Detection Rate : 0.3800          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.7404          
##                                           
##        'Positive' Class : 0               
## 

Bayesian GLM

# Classify the held-out test documents with the Bayesian GLM model.
spamham_by_pred <- predict(spamham_by_mdl, newdata = spamham_test)
spamham_by_pred
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## Levels: 0 1
# Cross-tabulate the predicted labels against the true test labels.
confusionMatrix(
  data = spamham_by_pred,
  reference = spamham_test$spamorham
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 26  0
##          1  0 24
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9289, 1)
##     No Information Rate : 0.52       
##     P-Value [Acc > NIR] : 6.312e-15  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.00       
##             Specificity : 1.00       
##          Pos Pred Value : 1.00       
##          Neg Pred Value : 1.00       
##              Prevalence : 0.52       
##          Detection Rate : 0.52       
##    Detection Prevalence : 0.52       
##       Balanced Accuracy : 1.00       
##                                      
##        'Positive' Class : 0          
## 

Random Forest

# Classify the held-out test documents with the random forest model.
spamham_rf_pred <- predict(spamham_rf_mdl, newdata = spamham_test)
spamham_rf_pred
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## Levels: 0 1
# Cross-tabulate the predicted labels against the true test labels.
confusionMatrix(
  data = spamham_rf_pred,
  reference = spamham_test$spamorham
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 26  0
##          1  0 24
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9289, 1)
##     No Information Rate : 0.52       
##     P-Value [Acc > NIR] : 6.312e-15  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.00       
##             Specificity : 1.00       
##          Pos Pred Value : 1.00       
##          Neg Pred Value : 1.00       
##              Prevalence : 0.52       
##          Detection Rate : 0.52       
##    Detection Prevalence : 0.52       
##       Balanced Accuracy : 1.00       
##                                      
##        'Positive' Class : 0          
## 

Discussion

From the confusion matrix results, we can see that the Bayesian GLM and Random Forest methods outperformed the Generalized Linear Model: both obtained an impressive accuracy of 100% on the test set, whereas the Generalized Linear Model reached a prediction accuracy of only 74%. It is therefore better to use either Bayesian GLM or Random Forest to classify emails into spam (1) and ham (0), especially in order to avoid false positives, which are the more costly error. With a false positive, a user might completely miss an important email because it was delivered to the spam folder, whereas with a false negative the user only needs to delete the unsolicited email.

If my laptop had more computing power, I would analyze more samples and compare more classification methods, such as a CART model, SVM, decision trees, and so on. The results above took about one and a half hours to generate.