Document Classification

This week’s assignment tasked students with predicting the classes of documents on the basis of an already classified training dataset. I used the public corpus of spam and ham emails from the Apache SpamAssassin Project for my training and test data. I downloaded and unzipped the corpora linked below into my working directory for the assignment.

Load required packages

library(stringr)
library(dplyr)
library(tm)
library(RTextTools)
library(wordcloud)
library(DT)
library(ROCR)
library(ggplot2)

Import spam and ham datasets, assign classifications, and combine corpora

The individual spam and ham corpora were read-in by calling DirSource() within VCorpus() and pointing to the relevant subdirectories of my working directory for the assignment. These were combined into a single corpus of emails after assigning their pre-existing classifications as spam or ham as metadata values with the meta() function.

spam <- VCorpus(DirSource("spam", encoding = "UTF-8"))
easy_ham <- VCorpus(DirSource("easy_ham_2", encoding = "UTF-8"))
hard_ham <- VCorpus(DirSource("hard_ham", encoding = "UTF-8"))

meta(spam, "spam") <- 1
meta(easy_ham, "spam") <- 0
meta(hard_ham, "spam") <- 0

emails <- c(spam, easy_ham, hard_ham)

Clean combined emails corpus

Converting the character encoding of the contents of emails to “UTF-8-MAC” prevented errors in the execution of functions on the corpus in the remainder of the assignment.

emails <- tm_map(emails, content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte')))
emails <- tm_map(emails, content_transformer(tolower))
emails <- tm_map(emails, removeNumbers)
emails <- tm_map(emails, removeWords, words = stopwords("en"))
emails <- tm_map(emails, content_transformer(function(x) str_replace_all(x, "[[:punct:]]|<|>", " ")))
emails <- tm_map(emails, stripWhitespace)

Word cloud of contents of emails after cleaning

The top 500 words from the combined emails corpus were visualized in a word cloud.

wordcloud(emails, min.freq = 3, max.words = 500)

Randomize order of spam and ham in emails corpus

The order of the emails in the combined corpus was randomized using the function sample() in order to ensure balanced proportions of spam and ham between the model training and testing subsets.

set.seed(2016)
emails <- sample(emails)

props_classes <- bind_rows(data.frame(dataset = "training", prop.table(table(spam = meta(emails[1:1500]))), 
                                      stringsAsFactors = FALSE),
          data.frame(dataset = "test", prop.table(table(spam = meta(emails[1501:length(emails)]))), 
                     stringsAsFactors = FALSE))
colnames(props_classes)[3] <- "prop"

knitr::kable(props_classes)
dataset spam prop
training 0 0.7680000
training 1 0.2320000
test 0 0.7656968
test 1 0.2343032

Predict classes of test emails

dtm <- DocumentTermMatrix(emails, control = list(minWordLength = 2, minDocFreq = 5)) 
dtm
## <<DocumentTermMatrix (documents: 2153, terms: 78578)>>
## Non-/sparse entries: 513608/168664826
## Sparsity           : 100%
## Maximal term length: 123
## Weighting          : term frequency (tf)
dtm <- removeSparseTerms(dtm, 0.95)
dtm
## <<DocumentTermMatrix (documents: 2153, terms: 665)>>
## Non-/sparse entries: 244697/1187048
## Sparsity           : 83%
## Maximal term length: 19
## Weighting          : term frequency (tf)
# First 10 rows and 5 columns of document-term matrix
inspect(dtm[1:10, 1:5])
## <<DocumentTermMatrix (documents: 10, terms: 5)>>
## Non-/sparse entries: 4/46
## Sparsity           : 92%
## Maximal term length: 8
## Weighting          : term frequency (tf)
## 
##                                         Terms
## Docs                                     able accept access account
##   00388.53eae0055e66fcb7194f9cca080fdefe    0      0      0       1
##   00308.c80d7cb2a6981efac408b429d42d2b89    0      0      0       0
##   01310.7bfe2d833bc6cf1e5e1acd32de2a25bb    0      0      0       0
##   00288.8c8bc71976c3b67d900ebd8eeab8a0f5    0      0      0       0
##   00526.618ca98770b667fd66a8a278bb1b7b5c    1      0      0       0
##   00261.12b64e557e52daf5fc5a52e47df2f4e3    0      0      2       0
##   00823.d8ecd70437b16c3ecb0e8a496c34514b    0      0      0       0
##   00010.e82bd1f5f7eae426682a7f8e4cbf1ae6    0      0      0       0
##   00006.5ab5620d3d7c6c0db76234556a16f6c1    0      0      0       1
##   00115.c97af50ef7ccd816f95bbdc6f4d226b2    0      0      0       0
##                                         Terms
## Docs                                     actually
##   00388.53eae0055e66fcb7194f9cca080fdefe        0
##   00308.c80d7cb2a6981efac408b429d42d2b89        0
##   01310.7bfe2d833bc6cf1e5e1acd32de2a25bb        0
##   00288.8c8bc71976c3b67d900ebd8eeab8a0f5        0
##   00526.618ca98770b667fd66a8a278bb1b7b5c        0
##   00261.12b64e557e52daf5fc5a52e47df2f4e3        0
##   00823.d8ecd70437b16c3ecb0e8a496c34514b        0
##   00010.e82bd1f5f7eae426682a7f8e4cbf1ae6        0
##   00006.5ab5620d3d7c6c0db76234556a16f6c1        0
##   00115.c97af50ef7ccd816f95bbdc6f4d226b2        0
# Most frequent terms in document-term matrix
findFreqTerms(dtm, 1000)
##   [1] "admin"        "align="       "also"         "alt="        
##   [5] "archive"      "arial"        "aug"          "beenthere"   
##   [9] "bgcolor="     "bgcolor=d"    "bit"          "border="     
##  [13] "border=d"     "bulk"         "can"          "cellpadding="
##  [17] "cellspacing=" "center"       "click"        "color"       
##  [21] "color="       "color=d"      "colspan="     "com"         
##  [25] "content"      "date"         "delivered"    "div"         
##  [29] "dogma"        "drop"         "edt"          "egwn"        
##  [33] "email"        "encoding"     "errors"       "esmtp"       
##  [37] "example"      "face="        "face=d"       "fetchmail"   
##  [41] "ffffff"       "font"         "fork"         "free"        
##  [45] "freshrpms"    "fri"          "get"          "gif"         
##  [49] "group"        "height="      "help"         "helvetica"   
##  [53] "home"         "href="        "href=d"       "html"        
##  [57] "http"         "https"        "ilug"         "images"      
##  [61] "imap"         "img"          "information"  "input"       
##  [65] "internet"     "irish"        "ist"          "jmason"      
##  [69] "jul"          "just"         "labs"         "like"        
##  [73] "linux"        "list"         "listinfo"     "lists"       
##  [77] "localhost"    "lugh"         "mail"         "mailer"      
##  [81] "mailman"      "mailto"       "make"         "message"     
##  [85] "microsoft"    "mime"         "mon"          "name="       
##  [89] "nbsp"         "net"          "netnoteinc"   "new"         
##  [93] "news"         "newsletter"   "normal"       "now"         
##  [97] "one"          "online"       "option"       "org"         
## [101] "path"         "pdt"          "people"       "phobos"      
## [105] "php"          "plain"        "please"       "post"        
## [109] "postfix"      "precedence"   "razor"        "received"    
## [113] "redhat"       "reply"        "request"      "return"      
## [117] "right"        "root"         "rpm"          "sans"        
## [121] "sat"          "search"       "sender"       "sep"         
## [125] "serif"        "single"       "size"         "size="       
## [129] "size=d"       "slashnull"    "smtp"         "software"    
## [133] "sourceforge"  "spamassassin" "src="         "style="      
## [137] "subject"      "subscribe"    "subscription" "sun"         
## [141] "system"       "table"        "taint"        "talk"        
## [145] "text"         "thu"          "time"         "title"       
## [149] "top"          "transfer"     "tuatha"       "tue"         
## [153] "type"         "unsubscribe"  "use"          "users"       
## [157] "usw"          "verdana"      "version"      "web"         
## [161] "webnote"      "wed"          "width="       "width=d"     
## [165] "will"         "windows"      "www"          "xent"        
## [169] "yyyy"         "zzzlist"      "zzzz"
container <- create_container(dtm,
                              labels = unlist(meta(emails)),
                              trainSize = 1:1500,
                              testSize = 1501:length(emails),
                              virgin = FALSE)

svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")

svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)

Incorrect classifications

results <- cbind(meta(emails[1501:length(emails)]), svm_out, tree_out, maxent_out)
datatable(results %>% filter(spam != SVM_LABEL | spam != TREE_LABEL | spam != MAXENTROPY_LABEL),
             rownames = TRUE, options = list(dom = 'tp', scrollX = TRUE))

Summary statistics of classifiers’ performance

Summary statistics for the performance of the classifiers were obtained using the create_analytics() function from RTextTools.

analytics <- create_analytics(container, cbind(svm_out, tree_out, maxent_out))

knitr::kable(select(analytics@algorithm_summary, SVM_PRECISION:SVM_FSCORE))
SVM_PRECISION SVM_RECALL SVM_FSCORE
0 0.97 0.99 0.98
1 0.98 0.88 0.93
knitr::kable(select(analytics@algorithm_summary, TREE_PRECISION:TREE_FSCORE))
TREE_PRECISION TREE_RECALL TREE_FSCORE
0 0.99 1.00 0.99
1 0.99 0.96 0.97
knitr::kable(select(analytics@algorithm_summary, MAXENTROPY_PRECISION:MAXENTROPY_FSCORE))
MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
0 1.00 0.98 0.99
1 0.95 0.99 0.97
knitr::kable(analytics@ensemble_summary)
n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
n >= 1 1.00 0.99
n >= 2 1.00 0.99
n >= 3 0.95 1.00

ROC curves and AUC values for classifiers

The classifiers’ performance was also evaluated by plotting their ROC curves and calculating AUC values.

results$SVM_PROB[results$SVM_LABEL == 0] <- 1 - results$SVM_PROB[results$SVM_LABEL == 0] 
results$TREE_PROB[results$TREE_LABEL == 0] <- 1 - results$TREE_PROB[results$TREE_LABEL == 0] 
results$MAXENTROPY_PROB[results$MAXENTROPY_LABEL == 0] <- 
  (1 - results$MAXENTROPY_PROB[results$MAXENTROPY_LABEL == 0])

pred_svm <- prediction(results$SVM_PROB, results$spam)
pred_tree <- prediction(results$TREE_PROB, results$spam)
pred_maxent <- prediction(results$MAXENTROPY_PROB, results$spam)

prf_svm <- performance(pred_svm, measure = "tpr", x.measure = "fpr")
prf_tree <- performance(pred_tree, measure = "tpr", x.measure = "fpr")
prf_maxent <- performance(pred_maxent, measure = "tpr", x.measure = "fpr")

auc_svm <- performance(pred_svm, measure = "auc")@y.values[[1]]
auc_tree <- performance(pred_tree, measure = "auc")@y.values[[1]]
auc_maxent <- performance(pred_maxent, measure = "auc")@y.values[[1]]

legend.labels <- c(str_c("SVM AUC = ", round(auc_svm, digits = 4)),
                   str_c("Tree AUC = ", round(auc_tree, digits = 4)),
                   str_c("Max Entropy AUC = ", round(auc_maxent, digits = 4)))

model_perf <- bind_rows(data.frame(model = "SVM", 
                                  FPR = unlist(prf_svm@x.values), 
                                  TPR = unlist(prf_svm@y.values),
                                  stringsAsFactors = FALSE),
                       data.frame(model = "Tree", 
                                  FPR = unlist(prf_tree@x.values), 
                                  TPR = unlist(prf_tree@y.values),
                                  stringsAsFactors = FALSE),
                       data.frame(model = "Max Entropy", 
                                  FPR = unlist(prf_maxent@x.values), 
                                  TPR = unlist(prf_maxent@y.values),
                                  stringsAsFactors = FALSE))

model_perf$model <- factor(model_perf$model, levels = c("SVM", "Tree", "Max Entropy")) 

ggplot(model_perf, aes(FPR, TPR, color = model, group = model)) + 
  geom_line() +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  ggtitle("Performance of Spam/Ham Classifiers") +
  scale_color_discrete(name = "Classifier Model", labels = legend.labels) +
  theme(legend.position = c(0.7, 0.2))

Additional testing of models

I also downloaded the following linked corpora of ham and spam and unzipped them in my current working directory for further model testing.

The unzipped directories for these three sets of emails were renamed “test_easy_ham”, “test_hard_ham”, and “test_spam” respectively.

spam2 <- VCorpus(DirSource("test_spam", encoding = "UTF-8"))
easy_ham2 <- VCorpus(DirSource("test_easy_ham", encoding = "UTF-8"))
hard_ham2 <- VCorpus(DirSource("test_hard_ham", encoding = "UTF-8"))

meta(spam2, "spam") <- 1
meta(easy_ham2, "spam") <- 0
meta(hard_ham2, "spam") <- 0

emails2 <- c(spam2, easy_ham2, hard_ham2)

emails2 <- tm_map(emails2, content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte')))
emails2 <- tm_map(emails2, content_transformer(tolower))
emails2 <- tm_map(emails2, removeNumbers)
emails2 <- tm_map(emails2, removeWords, words = stopwords("en"))
emails2 <- tm_map(emails2, content_transformer(function(x) str_replace_all(x, "[[:punct:]]|<|>", " ")))
emails2 <- tm_map(emails2, stripWhitespace)

set.seed(1)
emails2 <- sample(emails2)

Edit create_matrix() function for compatibility between installed versions of tm and RTextTools

create_matrix_edit <- function (textColumns, language = "english", minDocFreq = 1, 
    maxDocFreq = Inf, minWordLength = 3, maxWordLength = Inf, 
    ngramLength = 1, originalMatrix = NULL, removeNumbers = FALSE, 
    removePunctuation = TRUE, removeSparseTerms = 0, removeStopwords = TRUE, 
    stemWords = FALSE, stripWhitespace = TRUE, toLower = TRUE, 
    weighting = weightTf) 
{
    stem_words <- function(x) {
        split <- strsplit(x, " ")
        return(wordStem(unlist(split), language = language))
    }
    tokenize_ngrams <- function(x, n = ngramLength) return(rownames(as.data.frame(unclass(textcnt(x, 
        method = "string", n = n)))))
    control <- list(bounds = list(local = c(minDocFreq, maxDocFreq)), 
        language = language, tolower = toLower, removeNumbers = removeNumbers, 
        removePunctuation = removePunctuation, stopwords = removeStopwords, 
        stripWhitespace = stripWhitespace, wordLengths = c(minWordLength, 
            maxWordLength), weighting = weighting)
    if (ngramLength > 1) {
        control <- append(control, list(tokenize = tokenize_ngrams), 
            after = 7)
    }
    else {
        control <- append(control, list(tokenize = scan_tokenizer), 
            after = 4)
    }
    if (stemWords == TRUE && ngramLength == 1) 
        control <- append(control, list(stemming = stem_words), 
            after = 7)
    trainingColumn <- apply(as.matrix(textColumns), 1, paste, 
        collapse = " ")
    trainingColumn <- sapply(as.vector(trainingColumn, mode = "character"), 
        iconv, to = "UTF8", sub = "byte")
    corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
    matrix <- DocumentTermMatrix(corpus, control = control)
    if (removeSparseTerms > 0) 
        matrix <- removeSparseTerms(matrix, removeSparseTerms)
    if (!is.null(originalMatrix)) {
        terms <- colnames(originalMatrix[, which(!colnames(originalMatrix) %in% 
            colnames(matrix))])
        weight <- 0
        if (attr(weighting, "acronym") == "tf-idf") 
            weight <- 1e-09
        amat <- matrix(weight, nrow = nrow(matrix), ncol = length(terms))
        colnames(amat) <- terms
        rownames(amat) <- rownames(matrix)
        fixed <- as.DocumentTermMatrix(cbind(matrix[, which(colnames(matrix) %in% 
            colnames(originalMatrix))], amat), weighting = weighting)
        matrix <- fixed
    }
    matrix <- matrix[, sort(colnames(matrix))]
    gc()
    return(matrix)
}
dtm2 <- create_matrix_edit(cbind(content(emails2)), minWordLength = 2, minDocFreq = 5, 
                      weighting = weightTf, originalMatrix = dtm) 

container2 <- create_container(dtm2,
                              labels = unlist(meta(emails2)),
                              testSize = 1:length(emails2),
                              virgin = FALSE)

svm_out2 <- classify_model(container2, svm_model)
tree_out2 <- classify_model(container2, tree_model)
maxent_out2 <- classify_model(container2, maxent_model)

analytics2 <- create_analytics(container2, cbind(svm_out2, tree_out2, maxent_out2))

knitr::kable(select(analytics2@algorithm_summary, SVM_PRECISION:SVM_FSCORE))
SVM_PRECISION SVM_RECALL SVM_FSCORE
0 0.95 0.74 0.83
1 0.35 0.79 0.49
knitr::kable(select(analytics2@algorithm_summary, TREE_PRECISION:TREE_FSCORE))
TREE_PRECISION TREE_RECALL TREE_FSCORE
0 0.92 0.58 0.71
1 0.24 0.72 0.36
knitr::kable(select(analytics2@algorithm_summary, MAXENTROPY_PRECISION:MAXENTROPY_FSCORE))
MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
0 0.95 0.81 0.87
1 0.41 0.77 0.54
knitr::kable(analytics2@ensemble_summary)
n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
n >= 1 1.00 0.81
n >= 2 1.00 0.81
n >= 3 0.57 0.80

ROC curves and AUC values for second test set

results2 <- cbind(meta(emails2[1:length(emails2)]), svm_out2, tree_out2, maxent_out2)

results2$SVM_PROB[results2$SVM_LABEL == 0] <- 1 - results2$SVM_PROB[results2$SVM_LABEL == 0] 
results2$TREE_PROB[results2$TREE_LABEL == 0] <- 1 - results2$TREE_PROB[results2$TREE_LABEL == 0] 
results2$MAXENTROPY_PROB[results2$MAXENTROPY_LABEL == 0] <- 
  (1 - results2$MAXENTROPY_PROB[results2$MAXENTROPY_LABEL == 0])

pred_svm2 <- prediction(results2$SVM_PROB, results2$spam)
pred_tree2 <- prediction(results2$TREE_PROB, results2$spam)
pred_maxent2 <- prediction(results2$MAXENTROPY_PROB, results2$spam)

prf_svm2 <- performance(pred_svm2, measure = "tpr", x.measure = "fpr")
prf_tree2 <- performance(pred_tree2, measure = "tpr", x.measure = "fpr")
prf_maxent2 <- performance(pred_maxent2, measure = "tpr", x.measure = "fpr")

auc_svm2 <- performance(pred_svm2, measure = "auc")@y.values[[1]]
auc_tree2 <- performance(pred_tree2, measure = "auc")@y.values[[1]]
auc_maxent2 <- performance(pred_maxent2, measure = "auc")@y.values[[1]]

legend.labels2 <- c(str_c("SVM AUC = ", round(auc_svm2, digits = 4)),
                   str_c("Tree AUC = ", round(auc_tree2, digits = 4)),
                   str_c("Max Entropy AUC = ", round(auc_maxent2, digits = 4)))

model_perf2 <- bind_rows(data.frame(model = "SVM", 
                                  FPR = unlist(prf_svm2@x.values), 
                                  TPR = unlist(prf_svm2@y.values),
                                  stringsAsFactors = FALSE),
                       data.frame(model = "Tree", 
                                  FPR = unlist(prf_tree2@x.values), 
                                  TPR = unlist(prf_tree2@y.values),
                                  stringsAsFactors = FALSE),
                       data.frame(model = "Max Entropy", 
                                  FPR = unlist(prf_maxent2@x.values), 
                                  TPR = unlist(prf_maxent2@y.values),
                                  stringsAsFactors = FALSE))

model_perf2$model <- factor(model_perf2$model, levels = c("SVM", "Tree", "Max Entropy")) 

ggplot(model_perf2, aes(FPR, TPR, color = model, group = model)) + 
  geom_line() +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  ggtitle("Performance of Spam/Ham Classifiers") +
  scale_color_discrete(name = "Classifier Model", labels = legend.labels2) +
  theme(legend.position = c(0.7, 0.2))

Additional testing on a new, larger set of emails reveals the superior performance of the max entropy and SVM-based classifiers relative to the tree classifier. The discrepancy between the performance of the tree classifier here and on the initial test set of emails suggests that that model was overfitted during training.