This week’s assignment tasked students with predicting the classes of documents on the basis of an already classified training dataset. I used the public corpus of spam and ham emails from the Apache SpamAssassin Project for my training and test data. I downloaded and unzipped the corpora linked below into my working directory for the assignment.
library(stringr)
library(dplyr)
library(tm)
library(RTextTools)
library(wordcloud)
library(DT)
library(ROCR)
library(ggplot2)
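The download and extraction step can itself be scripted from R. Below is a minimal sketch; the mirror URL and archive file name are assumptions rather than the exact files used.
# Sketch: fetch and extract one SpamAssassin public-corpus archive.
# The URL and archive name below are placeholders; substitute the corpora you need.
base_url <- "https://spamassassin.apache.org/old/publiccorpus/"
archive <- "20021010_spam.tar.bz2"
download.file(paste0(base_url, archive), destfile = archive, mode = "wb")
untar(archive)  # extracts a subdirectory (e.g., "spam") into the working directory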
The individual spam and ham corpora were read in by calling DirSource() within VCorpus(), pointing to the relevant subdirectories of my working directory for the assignment. After their pre-existing spam or ham classifications were assigned as metadata values with the meta() function, the corpora were combined into a single corpus of emails.
spam <- VCorpus(DirSource("spam", encoding = "UTF-8"))
easy_ham <- VCorpus(DirSource("easy_ham_2", encoding = "UTF-8"))
hard_ham <- VCorpus(DirSource("hard_ham", encoding = "UTF-8"))
meta(spam, "spam") <- 1
meta(easy_ham, "spam") <- 0
meta(hard_ham, "spam") <- 0
emails <- c(spam, easy_ham, hard_ham)
Converting the character encoding of the emails’ contents to “UTF-8-MAC” prevented errors when functions were applied to the corpus in the remainder of the assignment.
emails <- tm_map(emails, content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte')))
emails <- tm_map(emails, content_transformer(tolower))
emails <- tm_map(emails, removeNumbers)
emails <- tm_map(emails, removeWords, words = stopwords("en"))
emails <- tm_map(emails, content_transformer(function(x) str_replace_all(x, "[[:punct:]]|<|>", " ")))
emails <- tm_map(emails, stripWhitespace)
The top 500 words from the combined emails corpus were visualized in a word cloud.
wordcloud(emails, min.freq = 3, max.words = 500)
The order of the emails in the combined corpus was randomized with sample() to ensure comparable proportions of spam and ham in the model training and testing subsets.
set.seed(2016)
emails <- sample(emails)
props_classes <- bind_rows(data.frame(dataset = "training", prop.table(table(spam = meta(emails[1:1500]))),
stringsAsFactors = FALSE),
data.frame(dataset = "test", prop.table(table(spam = meta(emails[1501:length(emails)]))),
stringsAsFactors = FALSE))
colnames(props_classes)[3] <- "prop"
knitr::kable(props_classes)
| dataset | spam | prop |
|---|---|---|
| training | 0 | 0.7680000 |
| training | 1 | 0.2320000 |
| test | 0 | 0.7656968 |
| test | 1 | 0.2343032 |
A document-term matrix was then created from the shuffled corpus, and sparse terms were removed.
dtm <- DocumentTermMatrix(emails, control = list(minWordLength = 2, minDocFreq = 5))
dtm
## <<DocumentTermMatrix (documents: 2153, terms: 78578)>>
## Non-/sparse entries: 513608/168664826
## Sparsity : 100%
## Maximal term length: 123
## Weighting : term frequency (tf)
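# Drop terms missing from more than 95% of documents (i.e., keep terms that appear in at least ~5% of emails)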
dtm <- removeSparseTerms(dtm, 0.95)
dtm
## <<DocumentTermMatrix (documents: 2153, terms: 665)>>
## Non-/sparse entries: 244697/1187048
## Sparsity : 83%
## Maximal term length: 19
## Weighting : term frequency (tf)
# First 10 rows and 5 columns of document-term matrix
inspect(dtm[1:10, 1:5])
## <<DocumentTermMatrix (documents: 10, terms: 5)>>
## Non-/sparse entries: 4/46
## Sparsity : 92%
## Maximal term length: 8
## Weighting : term frequency (tf)
##
## Terms
## Docs able accept access account
## 00388.53eae0055e66fcb7194f9cca080fdefe 0 0 0 1
## 00308.c80d7cb2a6981efac408b429d42d2b89 0 0 0 0
## 01310.7bfe2d833bc6cf1e5e1acd32de2a25bb 0 0 0 0
## 00288.8c8bc71976c3b67d900ebd8eeab8a0f5 0 0 0 0
## 00526.618ca98770b667fd66a8a278bb1b7b5c 1 0 0 0
## 00261.12b64e557e52daf5fc5a52e47df2f4e3 0 0 2 0
## 00823.d8ecd70437b16c3ecb0e8a496c34514b 0 0 0 0
## 00010.e82bd1f5f7eae426682a7f8e4cbf1ae6 0 0 0 0
## 00006.5ab5620d3d7c6c0db76234556a16f6c1 0 0 0 1
## 00115.c97af50ef7ccd816f95bbdc6f4d226b2 0 0 0 0
## Terms
## Docs actually
## 00388.53eae0055e66fcb7194f9cca080fdefe 0
## 00308.c80d7cb2a6981efac408b429d42d2b89 0
## 01310.7bfe2d833bc6cf1e5e1acd32de2a25bb 0
## 00288.8c8bc71976c3b67d900ebd8eeab8a0f5 0
## 00526.618ca98770b667fd66a8a278bb1b7b5c 0
## 00261.12b64e557e52daf5fc5a52e47df2f4e3 0
## 00823.d8ecd70437b16c3ecb0e8a496c34514b 0
## 00010.e82bd1f5f7eae426682a7f8e4cbf1ae6 0
## 00006.5ab5620d3d7c6c0db76234556a16f6c1 0
## 00115.c97af50ef7ccd816f95bbdc6f4d226b2 0
# Most frequent terms in document-term matrix
findFreqTerms(dtm, 1000)
## [1] "admin" "align=" "also" "alt="
## [5] "archive" "arial" "aug" "beenthere"
## [9] "bgcolor=" "bgcolor=d" "bit" "border="
## [13] "border=d" "bulk" "can" "cellpadding="
## [17] "cellspacing=" "center" "click" "color"
## [21] "color=" "color=d" "colspan=" "com"
## [25] "content" "date" "delivered" "div"
## [29] "dogma" "drop" "edt" "egwn"
## [33] "email" "encoding" "errors" "esmtp"
## [37] "example" "face=" "face=d" "fetchmail"
## [41] "ffffff" "font" "fork" "free"
## [45] "freshrpms" "fri" "get" "gif"
## [49] "group" "height=" "help" "helvetica"
## [53] "home" "href=" "href=d" "html"
## [57] "http" "https" "ilug" "images"
## [61] "imap" "img" "information" "input"
## [65] "internet" "irish" "ist" "jmason"
## [69] "jul" "just" "labs" "like"
## [73] "linux" "list" "listinfo" "lists"
## [77] "localhost" "lugh" "mail" "mailer"
## [81] "mailman" "mailto" "make" "message"
## [85] "microsoft" "mime" "mon" "name="
## [89] "nbsp" "net" "netnoteinc" "new"
## [93] "news" "newsletter" "normal" "now"
## [97] "one" "online" "option" "org"
## [101] "path" "pdt" "people" "phobos"
## [105] "php" "plain" "please" "post"
## [109] "postfix" "precedence" "razor" "received"
## [113] "redhat" "reply" "request" "return"
## [117] "right" "root" "rpm" "sans"
## [121] "sat" "search" "sender" "sep"
## [125] "serif" "single" "size" "size="
## [129] "size=d" "slashnull" "smtp" "software"
## [133] "sourceforge" "spamassassin" "src=" "style="
## [137] "subject" "subscribe" "subscription" "sun"
## [141] "system" "table" "taint" "talk"
## [145] "text" "thu" "time" "title"
## [149] "top" "transfer" "tuatha" "tue"
## [153] "type" "unsubscribe" "use" "users"
## [157] "usw" "verdana" "version" "web"
## [161] "webnote" "wed" "width=" "width=d"
## [165] "will" "windows" "www" "xent"
## [169] "yyyy" "zzzlist" "zzzz"
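# Build the RTextTools container: the first 1,500 shuffled emails are used for training, the remainder for testing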
container <- create_container(dtm,
labels = unlist(meta(emails)),
trainSize = 1:1500,
testSize = 1501:length(emails),
virgin = FALSE)
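# Train support vector machine, decision tree, and maximum entropy classifiers on the training portion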
svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")
svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)
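# Combine the true labels of the test emails with each classifier's output and list the emails that at least one model mislabeled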
results <- cbind(meta(emails[1501:length(emails)]), svm_out, tree_out, maxent_out)
datatable(results %>% filter(spam != SVM_LABEL | spam != TREE_LABEL | spam != MAXENTROPY_LABEL),
rownames = TRUE, options = list(dom = 'tp', scrollX = TRUE))
Summary statistics for the performance of the classifiers were obtained using the create_analytics() function from RTextTools.
analytics <- create_analytics(container, cbind(svm_out, tree_out, maxent_out))
knitr::kable(select(analytics@algorithm_summary, SVM_PRECISION:SVM_FSCORE))
| class | SVM_PRECISION | SVM_RECALL | SVM_FSCORE |
|---|---|---|---|
| 0 | 0.97 | 0.99 | 0.98 |
| 1 | 0.98 | 0.88 | 0.93 |
knitr::kable(select(analytics@algorithm_summary, TREE_PRECISION:TREE_FSCORE))
| class | TREE_PRECISION | TREE_RECALL | TREE_FSCORE |
|---|---|---|---|
| 0 | 0.99 | 1.00 | 0.99 |
| 1 | 0.99 | 0.96 | 0.97 |
knitr::kable(select(analytics@algorithm_summary, MAXENTROPY_PRECISION:MAXENTROPY_FSCORE))
| class | MAXENTROPY_PRECISION | MAXENTROPY_RECALL | MAXENTROPY_FSCORE |
|---|---|---|---|
| 0 | 1.00 | 0.98 | 0.99 |
| 1 | 0.95 | 0.99 | 0.97 |
knitr::kable(analytics@ensemble_summary)
| | n-ENSEMBLE COVERAGE | n-ENSEMBLE RECALL |
|---|---|---|
| n >= 1 | 1.00 | 0.99 |
| n >= 2 | 1.00 | 0.99 |
| n >= 3 | 0.95 | 1.00 |
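As a quick sanity check on these summaries, the per-class precision and recall can be recomputed directly from the predicted labels. The sketch below does this for the SVM classifier’s spam class, assuming the results data frame built above.
# Recompute SVM precision and recall for the spam class (label 1) from the confusion matrix
conf_svm <- table(actual = results$spam, predicted = results$SVM_LABEL)
precision_spam <- conf_svm["1", "1"] / sum(conf_svm[, "1"])  # TP / (TP + FP)
recall_spam <- conf_svm["1", "1"] / sum(conf_svm["1", ])     # TP / (TP + FN)
round(c(precision = precision_spam, recall = recall_spam), 2)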
The classifiers’ performance was also evaluated by plotting their ROC curves and calculating AUC values.
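# RTextTools reports the probability of each email's predicted label; convert it to the probability of spam (class 1) so the ROCR scores share a common scale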
results$SVM_PROB[results$SVM_LABEL == 0] <- 1 - results$SVM_PROB[results$SVM_LABEL == 0]
results$TREE_PROB[results$TREE_LABEL == 0] <- 1 - results$TREE_PROB[results$TREE_LABEL == 0]
results$MAXENTROPY_PROB[results$MAXENTROPY_LABEL == 0] <-
(1 - results$MAXENTROPY_PROB[results$MAXENTROPY_LABEL == 0])
pred_svm <- prediction(results$SVM_PROB, results$spam)
pred_tree <- prediction(results$TREE_PROB, results$spam)
pred_maxent <- prediction(results$MAXENTROPY_PROB, results$spam)
prf_svm <- performance(pred_svm, measure = "tpr", x.measure = "fpr")
prf_tree <- performance(pred_tree, measure = "tpr", x.measure = "fpr")
prf_maxent <- performance(pred_maxent, measure = "tpr", x.measure = "fpr")
auc_svm <- performance(pred_svm, measure = "auc")@y.values[[1]]
auc_tree <- performance(pred_tree, measure = "auc")@y.values[[1]]
auc_maxent <- performance(pred_maxent, measure = "auc")@y.values[[1]]
legend.labels <- c(str_c("SVM AUC = ", round(auc_svm, digits = 4)),
str_c("Tree AUC = ", round(auc_tree, digits = 4)),
str_c("Max Entropy AUC = ", round(auc_maxent, digits = 4)))
model_perf <- bind_rows(data.frame(model = "SVM",
FPR = unlist(prf_svm@x.values),
TPR = unlist(prf_svm@y.values),
stringsAsFactors = FALSE),
data.frame(model = "Tree",
FPR = unlist(prf_tree@x.values),
TPR = unlist(prf_tree@y.values),
stringsAsFactors = FALSE),
data.frame(model = "Max Entropy",
FPR = unlist(prf_maxent@x.values),
TPR = unlist(prf_maxent@y.values),
stringsAsFactors = FALSE))
model_perf$model <- factor(model_perf$model, levels = c("SVM", "Tree", "Max Entropy"))
ggplot(model_perf, aes(FPR, TPR, color = model, group = model)) +
geom_line() +
geom_abline(slope = 1, intercept = 0, linetype = 2) +
ggtitle("Performance of Spam/Ham Classifiers") +
scale_color_discrete(name = "Classifier Model", labels = legend.labels) +
theme(legend.position = c(0.7, 0.2))
For further model testing, I also downloaded the additional corpora of ham and spam linked below and unzipped them into my working directory.
The unzipped directories for these three sets of emails were renamed “test_easy_ham”, “test_hard_ham”, and “test_spam”, respectively.
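The renaming step can be done from R with file.rename(); in the sketch below, the left-hand directory names are assumptions about what the archives unpack to.
# Rename the extracted directories to the names used in the code that follows;
# the original names here are assumed defaults.
file.rename(c("easy_ham", "hard_ham", "spam"),
            c("test_easy_ham", "test_hard_ham", "test_spam"))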
spam2 <- VCorpus(DirSource("test_spam", encoding = "UTF-8"))
easy_ham2 <- VCorpus(DirSource("test_easy_ham", encoding = "UTF-8"))
hard_ham2 <- VCorpus(DirSource("test_hard_ham", encoding = "UTF-8"))
meta(spam2, "spam") <- 1
meta(easy_ham2, "spam") <- 0
meta(hard_ham2, "spam") <- 0
emails2 <- c(spam2, easy_ham2, hard_ham2)
emails2 <- tm_map(emails2, content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte')))
emails2 <- tm_map(emails2, content_transformer(tolower))
emails2 <- tm_map(emails2, removeNumbers)
emails2 <- tm_map(emails2, removeWords, words = stopwords("en"))
emails2 <- tm_map(emails2, content_transformer(function(x) str_replace_all(x, "[[:punct:]]|<|>", " ")))
emails2 <- tm_map(emails2, stripWhitespace)
set.seed(1)
emails2 <- sample(emails2)
The create_matrix() function from RTextTools was edited below for compatibility between the installed versions of tm and RTextTools.
create_matrix_edit <- function (textColumns, language = "english", minDocFreq = 1,
maxDocFreq = Inf, minWordLength = 3, maxWordLength = Inf,
ngramLength = 1, originalMatrix = NULL, removeNumbers = FALSE,
removePunctuation = TRUE, removeSparseTerms = 0, removeStopwords = TRUE,
stemWords = FALSE, stripWhitespace = TRUE, toLower = TRUE,
weighting = weightTf)
{
stem_words <- function(x) {
split <- strsplit(x, " ")
return(wordStem(unlist(split), language = language))
}
tokenize_ngrams <- function(x, n = ngramLength) return(rownames(as.data.frame(unclass(textcnt(x,
method = "string", n = n)))))
control <- list(bounds = list(local = c(minDocFreq, maxDocFreq)),
language = language, tolower = toLower, removeNumbers = removeNumbers,
removePunctuation = removePunctuation, stopwords = removeStopwords,
stripWhitespace = stripWhitespace, wordLengths = c(minWordLength,
maxWordLength), weighting = weighting)
if (ngramLength > 1) {
control <- append(control, list(tokenize = tokenize_ngrams),
after = 7)
}
else {
control <- append(control, list(tokenize = scan_tokenizer),
after = 4)
}
if (stemWords == TRUE && ngramLength == 1)
control <- append(control, list(stemming = stem_words),
after = 7)
trainingColumn <- apply(as.matrix(textColumns), 1, paste,
collapse = " ")
trainingColumn <- sapply(as.vector(trainingColumn, mode = "character"),
iconv, to = "UTF8", sub = "byte")
corpus <- Corpus(VectorSource(trainingColumn), readerControl = list(language = language))
matrix <- DocumentTermMatrix(corpus, control = control)
if (removeSparseTerms > 0)
matrix <- removeSparseTerms(matrix, removeSparseTerms)
if (!is.null(originalMatrix)) {
terms <- colnames(originalMatrix[, which(!colnames(originalMatrix) %in%
colnames(matrix))])
weight <- 0
if (attr(weighting, "acronym") == "tf-idf")
weight <- 1e-09
amat <- matrix(weight, nrow = nrow(matrix), ncol = length(terms))
colnames(amat) <- terms
rownames(amat) <- rownames(matrix)
fixed <- as.DocumentTermMatrix(cbind(matrix[, which(colnames(matrix) %in%
colnames(originalMatrix))], amat), weighting = weighting)
matrix <- fixed
}
matrix <- matrix[, sort(colnames(matrix))]
gc()
return(matrix)
}
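# Create the document-term matrix for the new emails, passing originalMatrix = dtm so its columns align with the training vocabulary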
dtm2 <- create_matrix_edit(cbind(content(emails2)), minWordLength = 2, minDocFreq = 5,
weighting = weightTf, originalMatrix = dtm)
container2 <- create_container(dtm2,
labels = unlist(meta(emails2)),
testSize = 1:length(emails2),
virgin = FALSE)
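# Score the new emails with the previously trained models; no retraining is performed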
svm_out2 <- classify_model(container2, svm_model)
tree_out2 <- classify_model(container2, tree_model)
maxent_out2 <- classify_model(container2, maxent_model)
analytics2 <- create_analytics(container2, cbind(svm_out2, tree_out2, maxent_out2))
knitr::kable(select(analytics2@algorithm_summary, SVM_PRECISION:SVM_FSCORE))
| class | SVM_PRECISION | SVM_RECALL | SVM_FSCORE |
|---|---|---|---|
| 0 | 0.95 | 0.74 | 0.83 |
| 1 | 0.35 | 0.79 | 0.49 |
knitr::kable(select(analytics2@algorithm_summary, TREE_PRECISION:TREE_FSCORE))
| class | TREE_PRECISION | TREE_RECALL | TREE_FSCORE |
|---|---|---|---|
| 0 | 0.92 | 0.58 | 0.71 |
| 1 | 0.24 | 0.72 | 0.36 |
knitr::kable(select(analytics2@algorithm_summary, MAXENTROPY_PRECISION:MAXENTROPY_FSCORE))
| class | MAXENTROPY_PRECISION | MAXENTROPY_RECALL | MAXENTROPY_FSCORE |
|---|---|---|---|
| 0 | 0.95 | 0.81 | 0.87 |
| 1 | 0.41 | 0.77 | 0.54 |
knitr::kable(analytics2@ensemble_summary)
| | n-ENSEMBLE COVERAGE | n-ENSEMBLE RECALL |
|---|---|---|
| n >= 1 | 1.00 | 0.81 |
| n >= 2 | 1.00 | 0.81 |
| n >= 3 | 0.57 | 0.80 |
results2 <- cbind(meta(emails2[1:length(emails2)]), svm_out2, tree_out2, maxent_out2)
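# As before, convert the probability of the predicted label into the probability of spam (class 1) for ROCR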
results2$SVM_PROB[results2$SVM_LABEL == 0] <- 1 - results2$SVM_PROB[results2$SVM_LABEL == 0]
results2$TREE_PROB[results2$TREE_LABEL == 0] <- 1 - results2$TREE_PROB[results2$TREE_LABEL == 0]
results2$MAXENTROPY_PROB[results2$MAXENTROPY_LABEL == 0] <-
(1 - results2$MAXENTROPY_PROB[results2$MAXENTROPY_LABEL == 0])
pred_svm2 <- prediction(results2$SVM_PROB, results2$spam)
pred_tree2 <- prediction(results2$TREE_PROB, results2$spam)
pred_maxent2 <- prediction(results2$MAXENTROPY_PROB, results2$spam)
prf_svm2 <- performance(pred_svm2, measure = "tpr", x.measure = "fpr")
prf_tree2 <- performance(pred_tree2, measure = "tpr", x.measure = "fpr")
prf_maxent2 <- performance(pred_maxent2, measure = "tpr", x.measure = "fpr")
auc_svm2 <- performance(pred_svm2, measure = "auc")@y.values[[1]]
auc_tree2 <- performance(pred_tree2, measure = "auc")@y.values[[1]]
auc_maxent2 <- performance(pred_maxent2, measure = "auc")@y.values[[1]]
legend.labels2 <- c(str_c("SVM AUC = ", round(auc_svm2, digits = 4)),
str_c("Tree AUC = ", round(auc_tree2, digits = 4)),
str_c("Max Entropy AUC = ", round(auc_maxent2, digits = 4)))
model_perf2 <- bind_rows(data.frame(model = "SVM",
FPR = unlist(prf_svm2@x.values),
TPR = unlist(prf_svm2@y.values),
stringsAsFactors = FALSE),
data.frame(model = "Tree",
FPR = unlist(prf_tree2@x.values),
TPR = unlist(prf_tree2@y.values),
stringsAsFactors = FALSE),
data.frame(model = "Max Entropy",
FPR = unlist(prf_maxent2@x.values),
TPR = unlist(prf_maxent2@y.values),
stringsAsFactors = FALSE))
model_perf2$model <- factor(model_perf2$model, levels = c("SVM", "Tree", "Max Entropy"))
ggplot(model_perf2, aes(FPR, TPR, color = model, group = model)) +
geom_line() +
geom_abline(slope = 1, intercept = 0, linetype = 2) +
ggtitle("Performance of Spam/Ham Classifiers") +
scale_color_discrete(name = "Classifier Model", labels = legend.labels2) +
theme(legend.position = c(0.7, 0.2))
Additional testing on this new, larger set of emails shows the stronger performance of the maximum entropy and SVM classifiers relative to the tree classifier. The discrepancy between the tree classifier’s performance here and on the initial test set suggests that it was overfit during training.