Introduction

This assignment is to predict the class of document with spam/ham dataset. I choose following corpus from https://spamassassin.apache.org/old/publiccorpus/ for use:

20030228_easy_ham.tar.bz2

20030228_spam.tar.bz2

Data Acquisition

For project reproducibility, data is directly imported from the website and subsequently unzipped.

ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2"
ham_tmp <- tempfile(fileext = ".tar.bz2")
download.file(ham_url, ham_tmp, quiet = TRUE)
bunzip2(ham_tmp, overwrite = TRUE, remove = FALSE)
untar(gsub(".bz2", "", ham_tmp), exdir = tempdir())
ham_files_path <- file.path(tempdir(), untar(gsub(".bz2", "", ham_tmp), list = TRUE))
spam_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2"
spam_tmp <- tempfile(fileext = ".tar.bz2")
download.file(spam_url, spam_tmp, quiet = TRUE)
bunzip2(spam_tmp, overwrite = TRUE, remove = FALSE)
untar(gsub(".bz2", "", spam_tmp), exdir = tempdir())
spam_files_path <- file.path(tempdir(), untar(gsub(".bz2", "", spam_tmp), list = TRUE))
reademail <- function(path, tag){
  files <- list.files(path=path, 
                      full.names=TRUE, 
                      recursive=TRUE)
  email <- lapply(files, function(x) {
    body <- read_file(x)
    })
  email <- unlist(email)
  data <- as.data.frame(email)
  data$tag <- tag
  return (data)
}

ham_doc <- reademail(ham_files_path, tag="ham") 
spam_doc <- reademail(spam_files_path, tag="spam")
df <- rbind(ham_doc, spam_doc)
table(df$tag)
## 
##  ham spam 
## 2501  501

Now there are 2501 of “ham” and 501 of “spam” within the dataset.

Data Cleaning

Next step to preprocess text to keep what we need.

df<-df %>%
  mutate(email = str_remove_all(email, pattern = "<.*?>")) %>%
  mutate(email = str_remove_all(email, pattern = "[:digit:]")) %>%
  mutate(email = str_remove_all(email, pattern = "[:punct:]")) %>%
  mutate(email = str_remove_all(email, pattern = "[\n]")) %>%
  mutate(email = str_to_lower(email)) %>%
  unnest_tokens(output=text,input=email,
                token="paragraphs",
                format="text") %>%
  anti_join(stop_words, by=c("text"="word"))
stopworduse<-function(){
c(stopwords(),"english")
}

Content_update <- function(content){
  contentCorpus <- Corpus(VectorSource(content))
  contentCorpus <- tm_map(contentCorpus,PlainTextDocument)
  contentCorpus <- tm_map(contentCorpus, tolower)
  contentCorpus <- tm_map(contentCorpus,removeNumbers)
  contentCorpus<- tm_map(contentCorpus,removeWords,stopworduse())
  contentCorpus <- tm_map(contentCorpus,removePunctuation)
  contentCorpus <- tm_map(contentCorpus,stripWhitespace)
  return(contentCorpus)
}

corpus <- Content_update(df$text)

inspect(corpus[1:2])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 2
## 
## [1]  exmhworkersadminredhatcom thu aug returnpath deliveredto zzzzlocalhostnetnoteinccomreceived localhost localhost phoboslabsnetnoteinccom postfix esmtp id dec thu aug edtreceived phobos localhost imap fetchmail zzzzlocalhost singledrop thu aug istreceived listmanspamassassintaintorg listmanspamassassintaintorg dogmaslashnullorg esmtp id gmbyrz thu aug received listmanspamassassintaintorg localhostlocaldomain listmanredhatcom postfix esmtp id thu aug edtdeliveredto exmhworkerslistmanspamassassintaintorgreceived intmxcorpspamassassintaintorg intmxcorpspamassassintaintorg listmanredhatcom postfix esmtp id cfd thu aug edtreceived maillocalhost intmxcorpspamassassintaintorg id gmbyg exmhworkerslistmanredhatcom thu aug received mxspamassassintaintorg mxspamassassintaintorg intmxcorpredhatcom smtp id gmbyy thu aug received ratreepsuacth mxspamassassintaintorg smtp id gmbihl thu aug received deltacsmuozau deltacoepsuacth ratreepsuacth esmtp id gmbwel thu aug ictreceived munnariozau localhost deltacsmuozau esmtp id gmbqpw thu aug ictfrom robert elz chris garrigues cc exmhworkersspamassassintaintorgsubject re new sequences windowinreplyto references mimeversion contenttype textplain charsetusasciimessageid xloop exmhworkersspamassassintaintorgsender exmhworkersadminspamassassintaintorgerrorsto exmhworkersadminspamassassintaintorgxbeenthere exmhworkersspamassassintaintorgxmailmanversion precedence bulklisthelp listpost listsubscribe listid discussion list exmh developers listunsubscribe listarchive date thu aug date wed aug chris garrigues messageid cant reproduce errorfor repeatable like every time without failthis debug log pick happening pickit exec pick inbox list lbrace lbrace subject ftp rbrace rbrace sequence mercury exec pick inbox list lbrace lbrace subject ftp rbrace rbrace sequence mercury ftocpickmsgs hit marking hits tkerror syntax error expression int note run pick command hand delta pick inbox list lbrace lbrace subject ftp rbrace rbrace sequence mercury hitthats hit comes obviously version nmh imusing delta pick versionpick nmh compiled fuchsiacsmuozau sun mar ict relevant part mhprofile delta mhparam pickseq sel listsince pick command works sequence actually theone thats explicit command line search popup theone comes mhprofile get createdkreps still using version code form day ago haventbeen able reach cvs repository today local routing issue thinkexmhworkers mailing listexmhworkersredhatcomhttpslistmanredhatcommailmanlistinfoexmhworkers
## [2]  steveburtcursorsystemcom thu aug returnpath deliveredto zzzzlocalhostnetnoteinccomreceived localhost localhost phoboslabsnetnoteinccom postfix esmtp id beec thu aug edtreceived phobos localhost imap fetchmail zzzzlocalhost singledrop thu aug istreceived ngrpscdyahoocom ngrpscdyahoocom dogmaslashnullorg smtp id gmbktz thu aug xegroupsreturn senttozzzzspamassassintaintorgreturnsgroupsyahoocomreceived ngrpscdyahoocom nnfmp aug xsender steveburtcursorsystemcomxapparentlyto zzzzteanayahoogroupscomreceived egp mail aug received qmail invoked network aug received unknown mgrpscdyahoocom qmqp aug received unknown helo mailgatewaycursorsystemcom mtagrpscdyahoocom smtp aug received exchangecpslocal unverified mailgatewaycursorsystemcom content technologies smtprs esmtp id thu aug received exchangecpslocal internet mail service id thu aug messageid zzzzteanayahoogroupscom xmailer internet mail service xegroupsfrom steve burt steve burt xyahooprofile pyrusemimeversion mailinglist list zzzzteanayahoogroupscom contact forteanaowneryahoogroupscomdeliveredto mailing list zzzzteanayahoogroupscomprecedence bulklistunsubscribe date thu aug subject zzzzteana re alexanderreplyto zzzzteanayahoogroupscomcontenttype textplain charsetusasciicontenttransferencoding bitmartin postedtassos papadopoulos greek sculptor behind plan judged limestone mount kerdylio miles east salonika far mount athos monastic community ideal patriotic sculpture well alexanders granite features ft high ft wide museum restored amphitheatre car park admiring crowds areplannedso mountain limestone graniteif limestone itll weather pretty fast yahoo groups sponsor dvds free sp join nowhttpusclickyahoocomptybbnxieaamghaagsolbtm unsubscribe group send email toforteanaunsubscribeegroupscom use yahoo groups subject httpdocsyahoocominfoterms
dtm<-DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 3002, terms: 75968)>>
## Non-/sparse entries: 425606/227630330
## Sparsity           : 100%
## Maximal term length: 196615
## Weighting          : term frequency (tf)

Data split and Dimensionality Reduction

Then, split the corpus and document term matrix into training and testing sets using a 7:3 ratio. And selecting words with frequencies exceeding 100 for dimensionality reduction, as 50 may not suffice.

corpus.train<-corpus[c(1:1750,2502:2851)]
corpus.test<-corpus[c(1751:2501,2852:3002)]
dtm.train<-dtm[c(1:1750,2502:2851),]
dtm.test<-dtm[c(1751:2501,2852:3002),]
updated_dtm_train <- as.matrix(dtm.train[1751:2100,])
sum<- colSums(updated_dtm_train)
term<-names(sum)
count<-as.numeric(sum)
dataframe<-as.data.frame(cbind(term,count),row.names=NULL,optional=F)
dataframe$count<-as.numeric(dataframe$count)
head(dataframe)
##           term count
## 1         able    19
## 2     actually    16
## 3          ago     7
## 4          aug  1172
## 5 bulklisthelp     6
## 6         cant    20
wordcloud2(dataframe)
countfunction <- function(x,lowfreq=0,highfreq=Inf){
  stopifnot(inherits(x,c("DocumentTermMatrix","TermDocumentMatrix","simple_triplet_matrix")),
            is.numeric(lowfreq),is.numeric(highfreq))
  if(inherits(x,"DocumentTermMatrix"))
    x<-t(x)
  rs <- slam::row_sums(x)
  y <- which(rs >= lowfreq & rs<= highfreq)
  return(x[y,])
}
dict<-Terms(countfunction(dtm.train,100))
length(dict)
## [1] 658
train<-DocumentTermMatrix(corpus.train,list(dictionary=dict))
train
## <<DocumentTermMatrix (documents: 2100, terms: 658)>>
## Non-/sparse entries: 154504/1227296
## Sparsity           : 89%
## Maximal term length: 92
## Weighting          : term frequency (tf)
test<-DocumentTermMatrix(corpus.test,list(dictionary=dict))
test
## <<DocumentTermMatrix (documents: 902, terms: 658)>>
## Non-/sparse entries: 36127/557389
## Sparsity           : 94%
## Maximal term length: 92
## Weighting          : term frequency (tf)
convert_counts <- function(x){
  x <- ifelse(x>0,1,0)
  x <- factor(x, levels=c(0,1),labels=c("No","Yes"))
  return(x)
}
Updated_train <- apply(train, MARGIN=2, convert_counts)
Updated_test<-apply(test, MARGIN = 2, convert_counts)

Model - Naive Bayes

Here I choose to use Naive Bayes because of its robustness in classification task.

train_type<-c(rep("ham",1750),rep("spam",350))
test_type<-c(rep("ham",751),rep("spam",151))
train_type<-as.data.frame(train_type)


model<-naiveBayes(Updated_train,train_type$train_type,laplace=1)
prediction<-predict(model,Updated_test,type = "class")
CrossTable(prediction,test_type,prop.chisq=TRUE,prop.t=FALSE,dnn=c("Prediction","Origin"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  902 
## 
##  
##              | Origin 
##   Prediction |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |       732 |         5 |       737 | 
##              |    22.837 |   113.581 |           | 
##              |     0.993 |     0.007 |     0.817 | 
##              |     0.975 |     0.033 |           | 
## -------------|-----------|-----------|-----------|
##         spam |        19 |       146 |       165 | 
##              |   102.006 |   507.327 |           | 
##              |     0.115 |     0.885 |     0.183 | 
##              |     0.025 |     0.967 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       751 |       151 |       902 | 
##              |     0.833 |     0.167 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

Conclusion

Overall, the classification model demonstrated strong performance, accurately categorizing the majority of documents. Specifically, it achieved a 97.5% accuracy rate for “ham,” correctly classifying 732 out of 751 documents, and a 96.7% accuracy rate for “spam,” correctly classifying 146 out of 151 documents. These results show the effectiveness of the Naive Bayes classification model in distinguishing between “ham” and “spam” emails.