1. Introduction
4. Evaluation
5. Conclusion
require(tm)
## Loading required package: tm
## Loading required package: NLP
require(stringr)
## Loading required package: stringr
require (RTextTools)
## Loading required package: RTextTools
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
# Restore previously saved R objects from test.RData into the workspace.
# NOTE(review): presumably this supplies meta_data_ham and release_corpus_ham,
# which are used further down but are never created in this script - confirm.
load("test.RData")
The objective of this assignment is to classify new emails as ham or spam. The spam and ham data sets used in this assignment came from SpamAssassin.
I was curious to find out whether three key attributes of an email (From, Subject, Body) can be used to classify it as spam or ham. Regular expressions were used to extract From and Subject, while the body of the email appeared to begin after the first empty line. The From, Subject and Body attributes were extracted from each email and stored in a Corpus.
# Build the spam corpus and the spam metadata data frame from the raw emails.
# The same code was run with "spam" replaced by "ham" to produce the ham data.

# Parse one email given as a character vector of lines.
#
# Everything before the first empty line is treated as header; the body is the
# first empty line and everything after it. Header patterns are applied to
# header lines only, so quoted "From:" / "Subject:" lines inside the body can
# no longer overwrite the real header values. When several header lines match,
# the last match wins.
#
# Returns a list with character scalars `from_email`, `subject`, `body_text`
# (each "" when nothing was found, including for an empty file).
parse_email <- function(lines) {
  from_email <- ""
  subject <- ""

  blank_lines <- which(lines == "")
  body_start <- if (length(blank_lines) > 0) {
    blank_lines[1]
  } else {
    length(lines) + 1L
  }

  for (line in lines[seq_len(body_start - 1L)]) {
    if (str_detect(line, "From.*\\s+(.+@.+\\.\\w+)\\s+(.+)")) {
      # Envelope line: "From <address> <date>"
      from_email <- gsub("From.*\\s+(.+@.+\\.\\w+)\\s+(.+)", "\\1", line)
    } else if (str_detect(line, "From:\\s*.*\\s*<(.+)>")) {
      # "From: Display Name <address>"
      from_email <- gsub("From:\\s*.*\\s*<(.+)>", "\\1", line)
    } else if (str_detect(line, "From:\\s*(.+)")) {
      # "From: address"
      from_email <- gsub("From:\\s*(.+)", "\\1", line)
    } else if (str_detect(line, "Subject:\\s*(.+)")) {
      subject <- gsub("Subject:\\s*(.+)", "\\1", line)
    }
  }

  # Every body line (starting with the separating empty line) is prefixed
  # with a newline and concatenated.
  body_text <- ""
  if (body_start <= length(lines)) {
    body_text <- paste0("\n", lines[body_start:length(lines)], collapse = "")
  }

  list(from_email = from_email, subject = subject, body_text = body_text)
}

# NOTE(review): `pattern` is a regular expression, not a glob; "*.*" is kept
# exactly as in the original run.
filenames <- list.files("spam", pattern = "*.*")
if (length(filenames) == 0) {
  stop("No email files found in directory 'spam'", call. = FALSE)
}

# file.path() builds a portable path (the hard-coded "\\" separator only
# worked on Windows).
parsed_emails <- lapply(
  file.path("spam", filenames),
  function(path) parse_email(readLines(path))
)

# Creating Corpus: one placeholder document per email (its content is just the
# email's index); the extracted attributes are stored as document metadata.
# The corpus is built once instead of being grown with c() on every iteration.
release_corpus_spam <- VCorpus(VectorSource(seq_along(filenames)))
for (i in seq_along(parsed_emails)) {
  meta(release_corpus_spam[[i]], "from_email") <- parsed_emails[[i]]$from_email
  meta(release_corpus_spam[[i]], "subject") <- parsed_emails[[i]]$subject
  meta(release_corpus_spam[[i]], "body_text") <- parsed_emails[[i]]$body_text
}

meta_from <- meta(release_corpus_spam, type = "local", tag = "from_email")
meta_subject <- meta(release_corpus_spam, type = "local", tag = "subject")
meta_body <- meta(release_corpus_spam, type = "local", tag = "body_text")

# creating spam meta data
meta_data_spam <- data.frame(
  from = unlist(meta_from),
  subject = unlist(meta_subject),
  body = unlist(meta_body),
  spam = 1 # changed to 0 when ham was processed
)

# combining ham and spam dataframes into one dataframe
meta_ham_spam <- rbind(meta_data_ham, meta_data_spam)
The following variables were created:
1). meta_data_spam : Data frame filled with spam
2). meta_data_ham : Data frame filled with ham
3). meta_ham_spam : Data frame with spam and ham data
4). release_corpus_spam : Corpus filled with spam
5). release_corpus_ham : Corpus filled with ham
# Show a summary of the spam corpus.
print(release_corpus_spam)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 499
# Show a summary of the ham corpus.
print(release_corpus_ham)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 248
The goal here is to create the document-term matrix and the classification container. These will be used to create the training models and classifiers.
# Shuffle the combined ham/spam data set so that the training and test slices
# below both contain a mix of the two classes.
# NOTE(review): no seed is set before this shuffle, so the split (and the
# accuracies reported further down) will differ from run to run.
meta_ham_spam <- meta_ham_spam[sample(nrow(meta_ham_spam)), ]
row.names(meta_ham_spam) <- NULL

from <- meta_ham_spam$from
subject <- meta_ham_spam$subject
body <- meta_ham_spam$body
class <- meta_ham_spam$spam

# Create the document-term matrix from the three email attributes.
# create_matrix() takes all text in its FIRST argument (textColumns); passing
# subject and body as separate positional arguments would bind them to
# minDocFreq / maxDocFreq instead of treating them as text. The columns are
# therefore bound into a single matrix; as.character() guards against the
# columns being factors, which cbind() would turn into integer codes.
email_text <- cbind(
  as.character(from),
  as.character(subject),
  as.character(body)
)
dtm <- create_matrix(
  email_text,
  language = "english",
  minWordLength = 3,
  removeNumbers = TRUE,
  stemWords = FALSE,
  removePunctuation = TRUE,
  toLower = TRUE,
  weighting = weightTfIdf
)

# Create the container: rows 1-300 are used for training and rows 301-747 for
# testing (747 = 499 spam + 248 ham documents).
container <- create_container(
  dtm,
  t(as.character(class)),
  trainSize = 1:300,
  testSize = 301:747,
  virgin = FALSE
)

# Train one model per algorithm.
svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")

# Classify the test rows with each trained model.
svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)
Below is a word cloud of the terms found in the subject lines of spam emails. A term is drawn only if its total TF-IDF weight across all spam subjects is at least 5 (the `min.freq` setting used below).
# TF-IDF weighted document-term matrix built from the spam subject lines only.
dtm_subject <- create_matrix(
  meta_data_spam$subject,
  language = "english",
  minWordLength = 3,
  removeNumbers = TRUE,
  stemWords = FALSE,
  removePunctuation = TRUE,
  toLower = TRUE,
  weighting = weightTfIdf
)

# Total weight of each term, summed over all spam subjects.
freq <- colSums(as.matrix(dtm_subject))

# Seed the RNG right before plotting (presumably so the cloud layout is
# repeatable), then draw every term whose total weight is at least 5.
set.seed(142)
wordcloud(
  names(freq),
  freq,
  min.freq = 5,
  scale = c(5, .1),
  colors = brewer.pal(6, "Dark2")
)
After creating the training and classification models, I wanted to see how each model did at classifying the emails omitted from the training data set.
# Put the true label of every held-out email (rows 301-747) next to the label
# predicted by each model; the first column of each classify_model() result is
# taken as the predicted label and stored as character.
test_rows <- 301:747
predicted_label <- function(model_out) {
  as.character(model_out[, 1])
}

spam_ham <- data.frame(
  correct_classification = class[test_rows],
  svm = predicted_label(svm_out),
  tree = predicted_label(tree_out),
  maxent = predicted_label(maxent_out),
  stringsAsFactors = FALSE
)
# SVM performance: number of wrong (FALSE) and right (TRUE) predictions
table(spam_ham$correct_classification == spam_ham$svm)
##
## FALSE TRUE
## 101 346
# ... and the same as proportions
prop.table(table(spam_ham$correct_classification == spam_ham$svm))
##
## FALSE TRUE
## 0.2259508 0.7740492
# Classification tree performance (the model trained with "TREE")
table(spam_ham$correct_classification == spam_ham$tree)
##
## FALSE TRUE
## 128 319
prop.table(table(spam_ham$correct_classification == spam_ham$tree))
##
## FALSE TRUE
## 0.2863535 0.7136465
# Maximum entropy performance
table(spam_ham$correct_classification == spam_ham$maxent)
##
## FALSE TRUE
## 89 358
prop.table(table(spam_ham$correct_classification == spam_ham$maxent))
##
## FALSE TRUE
## 0.1991051 0.8008949