Spam Classification

The code snippet below will not run while knitting the R Markdown document, but if you run it interactively you can load ham_docs and spam_docs onto your own machine. Note that it downloads the archives to your working directory and then deletes them; if you already have a folder named spam_2 or easy_ham in your working directory, you may want to change your working directory first so those folders are not removed.

library(tm)   # Corpus() and DirSource() are used below

## Download and extract the easy_ham archive, then load it as a corpus
ham_messages <- "https://spamassassin.apache.org/publiccorpus/20030228_easy_ham.tar.bz2"
download.file(ham_messages, destfile = "CHunt_tmp.tar.bz2")
untar("CHunt_tmp.tar.bz2")
ham_docs <- Corpus(DirSource(file.path(getwd(), "easy_ham")))

## Download and extract the spam_2 archive, then load it as a corpus
spam_messages <- "https://spamassassin.apache.org/publiccorpus/20050311_spam_2.tar.bz2"
download.file(spam_messages, destfile = "CHunt_tmp.tar.bz2")
untar("CHunt_tmp.tar.bz2")
spam_docs <- Corpus(DirSource(file.path(getwd(), "spam_2")))

## Remove the extracted folders and the temporary archive
unlink(file.path(getwd(), "spam_2"), recursive = TRUE)
unlink(file.path(getwd(), "easy_ham"), recursive = TRUE)
file.remove("CHunt_tmp.tar.bz2")

Creating Corpus

Let's begin by combining and cleaning a corpus of spam- and ham-classified emails provided by SpamAssassin.

library(tm)
library(dplyr)     # provides the %>% pipe
library(stringr)

## Combine the two corpora (ham first, then spam) and normalize the text:
## lowercase, strip numbers, English stopwords, and punctuation, then stem
## (stemDocument requires the SnowballC package to be installed)
ham_spam_collection <- c(ham_docs, spam_docs)                     %>%
                        tm_map(content_transformer(tolower))      %>%
                        tm_map(removeNumbers)                     %>%
                        tm_map(removeWords, stopwords("english")) %>%
                        tm_map(removePunctuation)                 %>%
                        tm_map(stemDocument)
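To spot-check the cleaning, you can print the first few lines of one processed message; this inspection step is my addition, not part of the original pipeline.

# Peek at one cleaned message: lowercased, with numbers, stopwords,
# and punctuation removed, and words stemmed
writeLines(head(as.character(ham_spam_collection[[1]])))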


## Tag each document with its class: the first length(ham_docs) documents
## are ham, everything after that is spam
for (i in 1:length(ham_docs)) {
  meta(ham_spam_collection[[i]], tag = "Class") <- "Ham"
}

for (i in (length(ham_docs) + 1):length(ham_spam_collection)) {
  meta(ham_spam_collection[[i]], tag = "Class") <- "Spam"
}
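A quick sanity check, mirroring the meta() call used later, confirms that the tag counts match the sizes of the two source corpora (this check is my addition):

# Counts should equal length(ham_docs) and length(spam_docs) respectively
table(unlist(meta(ham_spam_collection, tag = "Class")))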

Creating Document-Term Matrix

Now that we have a cleaned and tagged corpus of both Ham and Spam documents, the next step is to create our document-term matrix and the container used to train our classifiers. We set aside a random fifth of the data set as our test set.

library(RTextTools)
library(knitr)   # provides kable(), used to print tables below

## Build the document-term matrix and drop terms missing from
## more than 80% of the documents
dtm_ham_spam_collection <- DocumentTermMatrix(ham_spam_collection) %>%
                           removeSparseTerms(sparse = .8)

## Pull the Ham/Spam labels back out of the document metadata
class_data <- unlist(meta(ham_spam_collection, tag = "Class"))

## Hold out a random fifth of the documents as the test set
## (calling set.seed() first would make the split reproducible)
n <- length(ham_spam_collection)
all_docs <- 1:n
testSet <- sample(all_docs, size = round(n/5, 0), replace = FALSE)
trainSet <- all_docs[! all_docs %in% testSet]

container <- create_container(dtm_ham_spam_collection, 
                              trainSize = trainSet,
                              testSize = testSet,
                              labels = class_data,
                              virgin = FALSE)
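Before training, it is worth confirming the split is well formed and noting the class balance, since a classifier that always guesses the majority class sets the baseline we need to beat. These checks are my addition:

# The train and test sets should partition the corpus with no overlap
length(trainSet) + length(testSet) == n
length(intersect(trainSet, testSet)) == 0

# Majority-class share: the accuracy a constant classifier would achieve
prop.table(table(class_data))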

Train Classifiers

Using our container, we will train the classifiers and use our test set to determine the accuracy of each model.

## Train six classifiers on the training portion of the container
SVM <- train_model(container, "SVM")
MAXENT <- train_model(container, "MAXENT")
BOOSTING <- train_model(container, "BOOSTING")
BAGGING <- train_model(container, "BAGGING")
RF <- train_model(container, "RF")
TREE <- train_model(container, "TREE")

## Classify the held-out test documents with each trained model
SVM_CLASSIFY <- classify_model(container, SVM)
MAXENT_CLASSIFY <- classify_model(container, MAXENT)
BOOSTING_CLASSIFY <- classify_model(container, BOOSTING)
BAGGING_CLASSIFY <- classify_model(container, BAGGING)
RF_CLASSIFY <- classify_model(container, RF)
TREE_CLASSIFY <- classify_model(container, TREE)

Accuracy Results

After training several different models, let's look at the accuracy results. Hopefully some models will perform well on our test set.

## The first column of each *_CLASSIFY frame holds the predicted label;
## line the predictions up against the true classes of the test set
results <- data.frame(correct = as.character(class_data[testSet]),
                      SVM = as.character(SVM_CLASSIFY[,1]),
                      MAXENT = as.character(MAXENT_CLASSIFY[,1]),
                      BOOST = as.character(BOOSTING_CLASSIFY[,1]),
                      BAGGING = as.character(BAGGING_CLASSIFY[,1]),
                      RF = as.character(RF_CLASSIFY[,1]),
                      TREE = as.character(TREE_CLASSIFY[,1]))
kable(head(results))
correct   SVM    MAXENT   BOOST   BAGGING   RF     TREE
-------   ----   ------   -----   -------   ----   ----
Spam      Ham    Ham      Ham     Ham       Ham    Ham
Ham       Ham    Ham      Ham     Ham       Ham    Ham
Ham       Spam   Ham      Spam    Spam      Spam   Spam
Ham       Ham    Ham      Ham     Ham       Ham    Ham
Ham       Ham    Ham      Ham     Ham       Ham    Ham
Ham       Ham    Ham      Ham     Ham       Ham    Ham
## For each model, the share of test documents classified correctly (TRUE)
## and incorrectly (FALSE); note this reuses the model object names for
## the proportion tables
SVM <- prop.table(table(results$correct == results$SVM))
MAXENT <- prop.table(table(results$correct == results$MAXENT))
BOOST <- prop.table(table(results$correct == results$BOOST))
BAGGING <- prop.table(table(results$correct == results$BAGGING))
RF <- prop.table(table(results$correct == results$RF))
TREE <- prop.table(table(results$correct == results$TREE))

## Stack the proportions into one table, one row per model
all_models <- data.frame(rbind(SVM, MAXENT, BOOST, BAGGING, RF, TREE))
all_models <- cbind(rownames(all_models), all_models)

colnames(all_models) <- c("Model", "Wrong", "Right")
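The per-model proportions can be rendered the same way as the results preview above (a display step I have added for completeness):

kable(all_models, row.names = FALSE)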

While our process did not yield stellar results, it still performs better than classifying spam and ham at random. The models might improve if more documents were added to the training corpus or if some of the current settings were tuned.
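For that kind of tuning, RTextTools also provides create_analytics, which reports precision, recall, and F-scores per algorithm rather than raw accuracy alone. A minimal sketch, assuming the classification objects from above are still in scope:

# Combine every classifier's output and summarize precision/recall/F-score
analytics <- create_analytics(container,
                              cbind(SVM_CLASSIFY, MAXENT_CLASSIFY,
                                    BOOSTING_CLASSIFY, BAGGING_CLASSIFY,
                                    RF_CLASSIFY, TREE_CLASSIFY))
summary(analytics)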