Using RTextTools for Email Classification (Spam/Ham).

library(tidyr)
library(meta)
library(stringr)
library(tidytext)
library(RTextTools)
library(tm)

Read the corpora for the Spam and Ham collections.

Load and initialize the Spam and Easy Ham corpora.

The archives used here (`20050311_spam_2` and `20030228_easy_ham_2`) were downloaded from http://spamassassin.apache.org/old/publiccorpus/ and unpacked locally.

# NOTE(review): machine-specific working directory; the relative corpus paths
# below are resolved against it.
setwd("C:/Users/vikas/cuny/data607/projects/Project_4/")

# Reader settings shared by both corpora: plain-text reader, en_US language
# tag, load = TRUE.
reader_ctrl <- list(reader = readPlain, language = "en_US", load = TRUE)

# Spam corpus: one document per file in the unpacked spam_2 directory.
spam <- Corpus(DirSource("spam/20050311_spam_2.tar/spam_2"),
               readerControl = reader_ctrl)
length(spam)
## [1] 1396

# Easy-ham corpus: one document per file in the unpacked easy_ham_2 directory.
easy_ham <- Corpus(DirSource("easy_ham/20030228_easy_ham_2.tar/easy_ham_2"),
                   readerControl = reader_ctrl)
length(easy_ham)
## [1] 1400
# Attach a document-level "type" tag to every message in each corpus so the
# class of a document travels with it through the later matrix, container,
# training and classification steps.
meta(spam, "type") <- "spam"
meta(easy_ham, "type") <- "ham"

# Show the summary of the tagged spam corpus.
print(spam)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 1
## Content:  documents: 1396
# head(meta(spam))

# Show the summary of the tagged easy-ham corpus.
print(easy_ham)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 1
## Content:  documents: 1400
# head(meta(easy_ham))

Create aggregate term-document matrices, consisting of train and test portions obtained from the combined spam and easy_ham corpora.

# ---- Train/test split -------------------------------------------------------

l1 <- length(spam)
l2 <- length(easy_ham)


# Set aside portions for train and test.
TrainPercent <- 0.75

# Seed the RNG so the random split, and every figure derived from it, can be
# reproduced on a re-run.
set.seed(607)
i1 <- sample(l1, round(TrainPercent * l1))
i2 <- sample(l2, round(TrainPercent * l2))

spam_train <- spam[i1]
spam_test <- spam[-i1]

easy_ham_train <- easy_ham[i2]
easy_ham_test <- easy_ham[-i2]

# ---- Document-term matrices -------------------------------------------------
# The container built below selects ROWS of the matrix by index and pairs them
# with the same positions of the label vector, so each row has to be a
# document. DocumentTermMatrix() gives documents x terms; the previously used
# TermDocumentMatrix() gives terms x documents, which lined the labels up with
# terms instead of e-mails.
# Each piece is computed once and reused. The `tdm_*` names are kept so that
# any code referring to them keeps working.
dtm_spam_train <- DocumentTermMatrix(spam_train)
dtm_ham_train <- DocumentTermMatrix(easy_ham_train)
dtm_spam_test <- DocumentTermMatrix(spam_test)
dtm_ham_test <- DocumentTermMatrix(easy_ham_test)

tdm_train <- c(dtm_spam_train, dtm_ham_train)

tdm_test <- c(dtm_spam_test, dtm_ham_test)

# Row order: spam train, ham train, spam test, ham test.
tdm_all <- c(dtm_spam_train, dtm_ham_train, dtm_spam_test, dtm_ham_test)


# Label vectors, in the same document order as the matrices above.
label.train <- c(rep("spam", length(spam_train)),
                 rep("ham", length(easy_ham_train)))

label.test <- c(rep("spam", length(spam_test)),
                rep("ham", length(easy_ham_test)))

label.all <- c(rep("spam", length(spam_train)),
               rep("ham", length(easy_ham_train)),
               rep("spam", length(spam_test)),
               rep("ham", length(easy_ham_test)))

# Create a container and use it to create a Support Vector Machine model.

r1 <- length(spam_train) + length(easy_ham_train)   # number of training docs
r2 <- length(spam_test) + length(easy_ham_test)     # number of test docs

# Row indices into tdm_all / label.all.
# `:` binds tighter than `+`, so the earlier `r1+1:r1+r2` evaluated as
# r1 + (1:r1) + r2 and pointed past the test rows; build the range explicitly.
train_rows <- seq_len(r1)
test_rows <- r1 + seq_len(r2)

# One container over the combined matrix, so the models are trained and
# applied in the same term (column) space. Only the training rows and their
# labels are used for fitting.
ctr <- create_container(tdm_all,
                        labels = label.all,
                        trainSize = train_rows,
                        testSize = test_rows,
                        virgin = FALSE)

svm_model <- train_model(ctr, "SVM")

# Use the trained SVM to classify.

# The same container already carries the held-out rows; `ctr2` is kept as an
# alias for code that refers to the classification container by that name.
ctr2 <- ctr

svm_out <- classify_model(ctr2, svm_model)

head(svm_out)

# Create models using a classification tree and Maximum Entropy also. Use them
# to classify.
# Note: "TREE" is RTextTools' single classification tree; a random forest
# would be requested with "RF".

tree_model <- train_model(ctr, "TREE")
maxent_model <- train_model(ctr, "MAXENT")

tree_out <- classify_model(ctr2, tree_model)
maxent_out <- classify_model(ctr2, maxent_model)

head(tree_out)
head(maxent_out)

# Compare the results from the three models.

# Every classifier must return exactly one prediction per held-out document,
# otherwise the comparison below would be misaligned.
stopifnot(nrow(svm_out) == r2,
          nrow(tree_out) == r2,
          nrow(maxent_out) == r2)

# Predictions exist only for the test rows, so they are compared with
# label.test (not label.all, which also contains the training documents).
labels_out <- data.frame(
    correct_label = label.test,
    svm = as.character(svm_out[, 1]),
    tree = as.character(tree_out[, 1]),
    maxent = as.character(maxent_out[, 1]),
    stringsAsFactors = FALSE)

# Each table gives the share of held-out documents labelled wrongly (FALSE)
# and correctly (TRUE).
# The console output previously pasted under these calls (identical
# probabilities on every row; agreement of about 0.49, 0.48 and 0.37) came
# from the misaligned run and is omitted here; re-run to regenerate it.

## SVM Performance
prop.table(table(labels_out$correct_label == labels_out$svm))

## Classification Tree Performance
prop.table(table(labels_out$correct_label == labels_out$tree))

## Maximum Entropy Performance
prop.table(table(labels_out$correct_label == labels_out$maxent))