Description of the Assignment

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents.


Attach libraries

library(tm)
library(SnowballC)
library(stringr)
library(RWeka)
library(RTextTools)


Text Processing Ham and Spam Emails


Create Corpus

The ham corpus contains 2500 emails where the first 2000 files will be used to train a classifier model, while the last 500 will be used to test the model.

The spam corpus contains 1396 emails where the first 1000 files will be used to train a classifier model, while the last 396 will be used to test the model.

The file “cmds” was removed from both source directories.

# Build one corpus per class from the raw message directories.
# `<-` is used for assignment and FALSE is spelled out: the shorthand `F`
# is an ordinary variable that can be reassigned elsewhere.
ham_corpus <- Corpus(DirSource("Week_10/easy_ham"),
                     readerControl = list(language = "english", load = FALSE))

spam_corpus <- Corpus(DirSource("Week_10/spam_2"),
                      readerControl = list(language = "english", load = FALSE))


Add meta information

# Tag every document with its known class so the label can be read back
# later with meta(<corpus>, "mail_label").
# seq_along() replaces 1:length(): on an empty corpus 1:0 would iterate over
# the indices 1 and 0 instead of doing nothing.
for (i in seq_along(ham_corpus)) {
  meta(ham_corpus[[i]], "mail_label") <- "ham"
}

for (i in seq_along(spam_corpus)) {
  meta(spam_corpus[[i]], "mail_label") <- "spam"
}


Data Cleansing

# Normalise the text of every document in a corpus.
#
# Steps, in order: drop digits, replace punctuation with spaces, lower-case,
# remove English stop words, collapse whitespace, stem.
#
# Lower-casing now happens BEFORE stop-word removal. stopwords("english") is
# an all-lower-case list, so with the previous order capitalised stop words
# such as "The" or "And" were never removed.
#
# @param corpus A tm corpus.
# @return The transformed corpus.
clean_mail_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, content_transformer(str_replace_all),
                   pattern = "[[:punct:]]", replacement = " ")
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, words = stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)
}

# Apply the identical pipeline to both classes so their vocabularies are
# produced the same way.
ham_corpus <- clean_mail_corpus(ham_corpus)
spam_corpus <- clean_mail_corpus(spam_corpus)


Create document-term matrix

# Build a term-frequency matrix per class, then prune rare terms.
# The sparsity limit is expressed through a document count: terms present in
# only about `min_doc_count` documents or fewer are dropped.
min_doc_count <- 10

ham_sparsity_limit <- 1 - (min_doc_count / length(ham_corpus))
ham_dtm <- removeSparseTerms(
  DocumentTermMatrix(ham_corpus),
  ham_sparsity_limit
)

spam_sparsity_limit <- 1 - (min_doc_count / length(spam_corpus))
spam_dtm <- removeSparseTerms(
  DocumentTermMatrix(spam_corpus),
  spam_sparsity_limit
)


Display the document-term matrix for non-spam emails

# Summary of the pruned ham matrix (dimensions, sparsity, weighting)
print(ham_dtm)
## <<DocumentTermMatrix (documents: 2500, terms: 3590)>>
## Non-/sparse entries: 347245/8627755
## Sparsity           : 96%
## Maximal term length: 70
## Weighting          : term frequency (tf)


Display the document-term matrix for spam emails

# Summary of the pruned spam matrix (dimensions, sparsity, weighting)
print(spam_dtm)
## <<DocumentTermMatrix (documents: 1396, terms: 3404)>>
## Non-/sparse entries: 244975/4507009
## Sparsity           : 95%
## Maximal term length: 56
## Weighting          : term frequency (tf)


Create estimation procedure using a decision tree model

# Create container
# Labels are read back from the per-document "mail_label" metadata.
ham_mail <- unlist(meta(ham_corpus, "mail_label"))
N <- length(ham_mail)

# Number of leading files used for training; the remainder is held out.
ham_train_n <- 2000
# Guard: if the corpus had ham_train_n documents or fewer,
# (ham_train_n + 1):N would silently count downwards and select wrong rows.
stopifnot(N > ham_train_n)

# Train using files 1-2000, test using 2001-last row
ham_container <- create_container(ham_dtm,
                                  labels = ham_mail,
                                  trainSize = seq_len(ham_train_n),
                                  testSize = (ham_train_n + 1):N,
                                  virgin = FALSE)

# NOTE(review): every document in ham_corpus was labelled "ham", so this model
# is trained on a single class and cannot learn to separate ham from spam.
ham_tree_model <- train_model(ham_container, "TREE")


# Labels are read back from the per-document "mail_label" metadata.
spam_mail <- unlist(meta(spam_corpus, "mail_label"))
S <- length(spam_mail)

# Number of leading files used for training; the remainder is held out.
spam_train_n <- 1000
# Guard: if the corpus had spam_train_n documents or fewer,
# (spam_train_n + 1):S would silently count downwards and select wrong rows.
stopifnot(S > spam_train_n)

# Train using files 1-1000, test using 1001-last row
spam_container <- create_container(spam_dtm,
                                   labels = spam_mail,
                                   trainSize = seq_len(spam_train_n),
                                   testSize = (spam_train_n + 1):S,
                                   virgin = FALSE)

# NOTE(review): every document in spam_corpus was labelled "spam", so this
# model is trained on a single class and cannot learn to separate ham from spam.
spam_tree_model <- train_model(spam_container, "TREE")


Classify test ham emails

# Score the held-out ham messages with the fitted tree model
ham_tree_out <- classify_model(
  container = ham_container,
  model = ham_tree_model
)

# Show the final six predictions for the ham test set
tail(ham_tree_out, n = 6L)
##     TREE_LABEL TREE_PROB
## 495        ham         1
## 496        ham         1
## 497        ham         1
## 498        ham         1
## 499        ham         1
## 500        ham         1


Classify test spam emails

# Score the held-out spam messages with the fitted tree model
spam_tree_out <- classify_model(
  container = spam_container,
  model = spam_tree_model
)

# Show the final six predictions for the spam test set
tail(spam_tree_out, n = 6L)
##     TREE_LABEL TREE_PROB
## 391       spam         1
## 392       spam         1
## 393       spam         1
## 394       spam         1
## 395       spam         1
## 396       spam         1

Evaluate the Model Performance

# Pair the known label of each held-out ham message with the tree's
# prediction. Columns are addressed by name instead of position, and FALSE is
# spelled out because the shorthand `F` can be reassigned.
ham_labels_out <- data.frame(
  correct_label = ham_mail[2001:N],
  tree = as.character(ham_tree_out[["TREE_LABEL"]]),
  stringsAsFactors = FALSE
)

# Tally of correct (TRUE) versus incorrect (FALSE) predictions
table(ham_labels_out[["correct_label"]] == ham_labels_out[["tree"]])
## 
## TRUE 
##  500
# Pair the known label of each held-out spam message with the tree's
# prediction. Columns are addressed by name instead of position, and FALSE is
# spelled out because the shorthand `F` can be reassigned.
spam_labels_out <- data.frame(
  correct_label = spam_mail[1001:S],
  tree = as.character(spam_tree_out[["TREE_LABEL"]]),
  stringsAsFactors = FALSE
)

# Tally of correct (TRUE) versus incorrect (FALSE) predictions
table(spam_labels_out[["correct_label"]] == spam_labels_out[["tree"]])
## 
## TRUE 
##  396

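I observed that the estimation model accurately classified the test emails as ham / spam accordingly. Note, however, that each model was trained and tested on a single class only (ham-only or spam-only), so it can only ever predict that one label; the 100% accuracy is therefore expected and does not show that the model can tell spam from ham. A more meaningful evaluation would train one model on a combined, labeled set of ham and spam emails and test it on held-out emails of both classes.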