Email Classification

require(tm)

## Loading required package: tm
## Loading required package: NLP

require(stringr)

## Loading required package: stringr

require (RTextTools)

## Loading required package: RTextTools
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve

require(wordcloud)

## Loading required package: wordcloud
## Loading required package: RColorBrewer

# Restore previously saved R objects from test.RData into the workspace.
# NOTE(review): presumably this supplies the objects used below that this
# report never builds itself (e.g. meta_data_ham, release_corpus_ham) -- confirm.
load("test.RData")

Introduction

The objective of this assignment is to classify new emails as ham or spam. The spam and ham data sets used in this assignment came from SpamAssassin.

Parsing Emails

I was curious whether three key attributes of an email (From, Subject, and Body) could be used to classify it as spam or ham. Regular expressions were used to extract the From and Subject fields, while the body of the email was taken to begin at the first empty line. The from, subject and body attributes were extracted from each email and stored in a Corpus.

# Parse the raw spam emails into a corpus whose documents carry the sender,
# subject and body as document-level metadata.
# NOTE: the ham corpus was produced by running this same chunk with every
# "spam" name changed to "ham".
spam_dir <- "spam"
filenames <- list.files(spam_dir)

# Extract sender, subject and body text from the lines of one raw email.
#
# Header fields are only searched for above the first empty line. That empty
# line and everything after it form the body, so a "From:" or "Subject:" line
# quoted inside the body cannot overwrite the real header values.
#
# lines: character vector, one element per line of the email file.
# Returns a list with the character scalars from_email, subject and body_text
# (each "" when nothing was found).
parse_email <- function(lines) {
  from_envelope <- "From.*\\s+(.+@.+\\.\\w+)\\s+(.+)"
  from_bracket <- "From:\\s*.*\\s*<(.+)>"
  from_plain <- "From:\\s*(.+)"
  subject_line <- "Subject:\\s*(.+)"

  first_blank <- match("", lines)
  if (is.na(first_blank)) {
    header_lines <- lines
    body_lines <- character(0)
  } else {
    header_lines <- lines[seq_len(first_blank - 1)]
    body_lines <- lines[first_blank:length(lines)]
  }

  from_email <- ""
  subject <- ""
  # The patterns are tried in this order for every header line; when several
  # lines match, the last one wins.
  for (line in header_lines) {
    if (str_detect(line, from_envelope)) {
      from_email <- gsub(from_envelope, "\\1", line)
    } else if (str_detect(line, from_bracket)) {
      from_email <- gsub(from_bracket, "\\1", line)
    } else if (str_detect(line, from_plain)) {
      from_email <- gsub(from_plain, "\\1", line)
    } else if (str_detect(line, subject_line)) {
      subject <- gsub(subject_line, "\\1", line)
    }
  }

  # Every body line (including the separating empty line) is prefixed with a
  # newline; an email without a body keeps body_text == "".
  body_text <- ""
  if (length(body_lines) > 0) {
    body_text <- str_c("\n", body_lines, collapse = "")
  }

  list(from_email = from_email, subject = subject, body_text = body_text)
}

# One placeholder document per file, created in a single call instead of
# growing the corpus with c() on every iteration. VCorpus is requested
# explicitly because the per-document `meta<-` assignments below rely on it.
release_corpus_spam <- VCorpus(
  VectorSource(as.character(seq_along(filenames)))
)

for (i in seq_along(filenames)) {
  # file.path() builds a separator that works on every platform.
  email <- parse_email(readLines(file.path(spam_dir, filenames[i])))
  meta(release_corpus_spam[[i]], "from_email") <- email$from_email
  meta(release_corpus_spam[[i]], "subject") <- email$subject
  meta(release_corpus_spam[[i]], "body_text") <- email$body_text
}

# Read the three document-level tags back out of the spam corpus, one list
# entry per tag, in the order from_email / subject / body_text.
spam_tags <- c("from_email", "subject", "body_text")
spam_meta <- lapply(
  spam_tags,
  function(tag) meta(release_corpus_spam, type = "local", tag = tag)
)

meta_from <- spam_meta[[1]]
meta_subject <- spam_meta[[2]]
meta_body <- spam_meta[[3]]

# Spam metadata as a data frame: one row per email, labelled with spam = 1
# (the label was 0 when the ham emails were processed).
meta_data_spam <- data.frame(
  from = unlist(meta_from),
  subject = unlist(meta_subject),
  body = unlist(meta_body),
  spam = 1
)

# Stack the ham rows on top of the spam rows in a single data frame.
meta_ham_spam <- rbind(meta_data_ham, meta_data_spam)

The following variables were created

1). meta_data_spam : Data frame filled with spam

2). meta_data_ham : Data frame filled with ham

3). meta_ham_spam : Data frame with spam and ham data

4). release_corpus_spam : Corpus filled with spam

5). release_corpus_ham : Corpus filled with ham

release_corpus_spam

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 499

release_corpus_ham

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 248

Text Processing

The goal here is to create the document-term matrix and the classification container. These are then used to train the models and to classify the emails held out from training.

# Shuffle the combined ham/spam data set so that the train/test split below
# contains a mix of both classes. The seed makes the split reproducible.
set.seed(142)
meta_ham_spam <- meta_ham_spam[sample(nrow(meta_ham_spam)), ]
row.names(meta_ham_spam) <- NULL

from <- meta_ham_spam$from
subject <- meta_ham_spam$subject
body <- meta_ham_spam$body
class <- meta_ham_spam$spam

# Create the document-term matrix from the three email attributes.
# create_matrix() takes all text in its first argument (textColumns), so the
# columns are combined with cbind(). Passed as separate positional arguments,
# subject and body would be bound to minDocFreq and maxDocFreq and left out of
# the matrix. as.character() keeps cbind() from turning factor columns into
# integer codes.
email_text <- cbind(
  as.character(from),
  as.character(subject),
  as.character(body)
)
dtm <- create_matrix(
  email_text,
  language = "english",
  minWordLength = 3,
  removeNumbers = TRUE,
  stemWords = FALSE,
  removePunctuation = TRUE,
  toLower = TRUE,
  weighting = weightTfIdf
)

# Create the container: the first 300 shuffled emails train the models, all
# remaining emails are held out for testing.
n_docs <- nrow(meta_ham_spam)
stopifnot(n_docs > 300)
container <- create_container(
  dtm,
  as.character(class),
  trainSize = 1:300,
  testSize = 301:n_docs,
  virgin = FALSE
)

# Train the models.
svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")

# Classify the held-out emails with each trained model.
svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)

Word Analysis

Below is a word cloud of the words that occurred at least 5 times in the subject lines of the spam emails.

# Document-term matrix of the spam subject lines. Plain term-frequency
# weighting is used so that the column sums below are real occurrence counts;
# with TF-IDF weighting they would be summed weights, not frequencies.
dtm_subject <- create_matrix(
  meta_data_spam$subject,
  language = "english",
  minWordLength = 3,
  removeNumbers = TRUE,
  stemWords = FALSE,
  removePunctuation = TRUE,
  toLower = TRUE,
  weighting = weightTf
)

# Total number of occurrences of each term across all spam subjects.
freq <- colSums(as.matrix(dtm_subject))

# Fixed seed so the word layout is the same on every run.
set.seed(142)
wordcloud(
  names(freq),
  freq,
  min.freq = 5,
  scale = c(5, .1),
  colors = brewer.pal(6, "Dark2")
)

Evaluation

After creating the training and classification models, I wanted to see how each model did at classifying the emails omitted from the training data set.

# One row per held-out email (rows 301-747, the testSize range given to
# create_container): the true label next to the label predicted by each model.
spam_ham <- data.frame(
  correct_classification = class[301:747],
  svm = as.character(svm_out[, 1]),
  tree = as.character(tree_out[, 1]),
  maxent = as.character(maxent_out[, 1]),
  stringsAsFactors = FALSE
)

# SVM performance: TRUE counts the emails whose prediction matches the label.
with(spam_ham, table(correct_classification == svm))

## 
## FALSE  TRUE 
##   101   346

with(spam_ham, prop.table(table(correct_classification == svm)))

## 
##     FALSE      TRUE 
## 0.2259508 0.7740492

# Classification tree performance (the model trained with "TREE").
with(spam_ham, table(correct_classification == tree))

## 
## FALSE  TRUE 
##   128   319

with(spam_ham, prop.table(table(correct_classification == tree)))

## 
##     FALSE      TRUE 
## 0.2863535 0.7136465

# Maximum entropy performance
with(spam_ham, table(correct_classification == maxent))

## 
## FALSE  TRUE 
##    89   358

with(spam_ham, prop.table(table(correct_classification == maxent)))

## 
##     FALSE      TRUE 
## 0.1991051 0.8008949

Conclusion

The maximum entropy classifier outperformed the other classifiers, correctly classifying 358 out of 447 (about 80%) of the held-out emails.