The Task
It is often useful to classify new “test” documents using documents that have already been classified (“training” documents). A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source, such as your own spam folder).
Loaded Packages
All of the following packages loaded successfully (each returned TRUE): prettydoc, SnowballC, knitr, rvest, stringr, tm, RTextTools, caret, dplyr.
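The loading chunk itself isn’t shown, but the TRUE values suggest the packages were attached with something along these lines (the sapply/require pattern here is an assumption, not the original code):
pkgs <- c("prettydoc", "SnowballC", "knitr", "rvest", "stringr",
    "tm", "RTextTools", "caret", "dplyr")
# require() returns TRUE/FALSE per package, matching the list above
sapply(pkgs, require, character.only = TRUE)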
Download & Extract Email Files
url <- "http://spamassassin.apache.org/old/publiccorpus/"
# parse the HTML and collect every link target on the page
files <- url %>% read_html() %>% html_nodes("a") %>% html_attr("href")
# infix string-concatenation helper
"%&%" <- function(x, y) paste0(x, y)
# select the 2 archives needed (easy ham and spam)
my_files <- files[c(9, 14)]
file_urls <- url %&% my_files
# create the local data directory (recursive, since the parent may not exist)
data_dir <- "data-sets/spamham"
dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
# download and extract the bzip2-compressed tar archives
for (i in seq_along(my_files)) {
    if (!my_files[i] %in% list.files(data_dir)) {
        my_dest <- file.path(data_dir, my_files[i])
        download.file(file_urls[i], destfile = my_dest)
        # untar() detects the bzip2 compression automatically
        untar(my_dest, exdir = data_dir)
    }
}
Let’s take a look at the downloaded files & created directories:
(afile <- list.files(data_dir))
## [1] "20030228_easy_ham.tar.bz2" "20050311_spam_2.tar.bz2"
## [3] "easy_ham" "spam_2"
head(list.files(data_dir %&% "/" %&% afile[3]))
## [1] "00001.7c53336b37003a9286aba55d2945844c"
## [2] "00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "00003.860e3c3cee1b42ead714c5c874fe25f7"
## [4] "00004.864220c5b6930b209cc287c361c99af1"
## [5] "00005.bf27cdeaf0b8c4647ecd61b1d09da613"
## [6] "00006.253ea2f9a9cc36fa0b1129b04b806608"
Now let’s create 2 corpora
# Create the 2 corpora
spam_corpus <- VCorpus(DirSource(data_dir %&% "/" %&% "spam_2"), readerControl = list(language = "en"))
ham_corpus <- VCorpus(DirSource(data_dir %&% "/" %&% "easy_ham"), readerControl = list(language = "en"))
Add 2 Metadata Attributes: Email Type and Group
For the “group” attribute, we alternate between the values “test” and “train”, assigning every other document to each group.
# add 2 metadata attributes email group attribute
email_attr <- "email_type"
attr_value1 <- "spam"
attr_value2 <- "ham"
# group attribute
group_attr <- "group"
# loop through spam corpus and create metadata
for (i in seq_along(spam_corpus)) {
meta(spam_corpus[[i]], email_attr) <- attr_value1
meta(spam_corpus[[i]], group_attr) <- ifelse(i%%2 == 0, "test", "train")
}
# loop through ham corpus and create metadata
for (i in seq_along(ham_corpus)) {
meta(ham_corpus[[i]], email_attr) <- attr_value2
meta(ham_corpus[[i]], group_attr) <- ifelse(i%%2 == 0, "test", "train")
}
Let’s combine the corpora & see how many documents were assigned to the Test and Train groups.
# combine corpora
my_corpus <- c(ham_corpus, spam_corpus)
email_types <- unlist(meta(my_corpus, type = "local", tag = "email_type"))
group_list <- unlist(meta(my_corpus, type = "local", tag = "group"))
addmargins(table(email_types, group_list))
## group_list
## email_types test train Sum
## ham 1250 1251 2501
## spam 698 699 1397
## Sum 1948 1950 3898
Clean the Corpus & Create a Document-Term Matrix with Sparse Terms Removed
cleaned_corpus <- tm_map(my_corpus, removeNumbers)
cleaned_corpus <- tm_map(cleaned_corpus, content_transformer(tolower))
cleaned_corpus <- tm_map(cleaned_corpus, removeWords, words = stopwords("en"))
cleaned_corpus <- tm_map(cleaned_corpus, stemDocument)
cleaned_corpus <- tm_map(cleaned_corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
cleaned_corpus <- tm_map(cleaned_corpus, stripWhitespace)
# Create Document-Term Matrix
dtm <- DocumentTermMatrix(cleaned_corpus)
# remove sparse terms
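# the cutoff 1 - (10/length(cleaned_corpus)) = 1 - 10/3898 ~ 0.9974, so only
# terms appearing in at least ~10 of the 3,898 documents survive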
dtm <- removeSparseTerms(dtm, 1 - (10/length(cleaned_corpus)))
inspect(dtm)
## <<DocumentTermMatrix (documents: 3898, terms: 6056)>>
## Non-/sparse entries: 487070/23119218
## Sparsity : 98%
## Maximal term length: 74
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aug esmtp jmlocalhost localhost
## 00028.60393e49c90f750226bee6381eb3e69d 0 2 0 0
## 00051.8b17ce16ace4d5845e2299c0123e1f14 0 2 0 0
## 00265.d0ebd6ba8f3e2b8d71e9cdaa2ec6fd91 0 5 0 4
## 00570.d98ca90ac201b5d881f2397c95838eb2 0 5 2 5
## 00670.be029e37187b8615a231865e3663dcf9 0 5 2 4
## 00677.b957e34b4dd0d9263b56bf71b1168d8a 0 4 2 3
## 00765.ea01c46568902b1338c9685b55d77f6c 0 5 2 3
## 00907.74983d9d0d6ee3c681a48cf893f123b5 0 4 0 1
## 00942.727cb1619115cdee240fa418da19dd1f 0 4 2 3
## 01083.a6b3c50be5abf782b585995d2c11176b 0 4 0 1
## Terms
## Docs mon oct postfix received sep wed
## 00028.60393e49c90f750226bee6381eb3e69d 4 0 2 2 0 0
## 00051.8b17ce16ace4d5845e2299c0123e1f14 0 0 2 2 0 0
## 00265.d0ebd6ba8f3e2b8d71e9cdaa2ec6fd91 0 8 3 6 0 4
## 00570.d98ca90ac201b5d881f2397c95838eb2 0 0 3 6 8 0
## 00670.be029e37187b8615a231865e3663dcf9 0 0 3 6 8 0
## 00677.b957e34b4dd0d9263b56bf71b1168d8a 0 0 3 6 9 0
## 00765.ea01c46568902b1338c9685b55d77f6c 0 0 3 6 8 0
## 00907.74983d9d0d6ee3c681a48cf893f123b5 8 0 0 8 0 0
## 00942.727cb1619115cdee240fa418da19dd1f 0 9 3 6 0 4
## 01083.a6b3c50be5abf782b585995d2c11176b 0 0 0 8 0 0
Create Container, Train & Classify Models
training_indices <- which(group_list == "train")
test_indices <- which(group_list == "test")
container <- create_container(dtm, labels = email_types, trainSize = training_indices,
    testSize = test_indices, virgin = FALSE)
# slotNames(container)
# Train models
svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")
# Classify models
svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)
Aggregate the Results of the 3 Models with the Actuals
# Construct data frame with correct labels
results <- data.frame(actuals = email_types[test_indices],
    svm = as.character(svm_out[, 1]),
    tree = as.character(tree_out[, 1]),
    maxent = as.character(maxent_out[, 1]),
    stringsAsFactors = FALSE)
Create Confusion Matrices using the caret package
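The matrices below were presumably produced with caret’s confusionMatrix(); a minimal sketch, with the positive class inferred from the output shown (the tree output reports “ham” as the positive class, suggesting the default was used there):
confusionMatrix(factor(results$svm), factor(results$actuals), positive = "spam")
confusionMatrix(factor(results$tree), factor(results$actuals))  # default positive class: "ham"
confusionMatrix(factor(results$maxent), factor(results$actuals), positive = "spam")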
Support Vector Machine Results
Confusion Matrix and Statistics
Reference
Prediction ham spam
ham 1194 4
spam 4 688
Accuracy : 0.9958
95% CI : (0.9917, 0.9982)
No Information Rate : 0.6339
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9909
Mcnemar's Test P-Value : 1
Sensitivity : 0.9942
Specificity : 0.9967
Pos Pred Value : 0.9942
Neg Pred Value : 0.9967
Prevalence : 0.3661
Detection Rate : 0.3640
Detection Prevalence : 0.3661
Balanced Accuracy : 0.9954
'Positive' Class : spam
Decision Tree Results
Confusion Matrix and Statistics
Reference
Prediction ham spam
ham 1185 20
spam 13 672
Accuracy : 0.9825
95% CI : (0.9756, 0.988)
No Information Rate : 0.6339
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9623
Mcnemar's Test P-Value : 0.2963
Sensitivity : 0.9891
Specificity : 0.9711
Pos Pred Value : 0.9834
Neg Pred Value : 0.9810
Prevalence : 0.6339
Detection Rate : 0.6270
Detection Prevalence : 0.6376
Balanced Accuracy : 0.9801
'Positive' Class : ham
Maximum Entropy Results
Confusion Matrix and Statistics
Reference
Prediction ham spam
ham 1196 8
spam 2 684
Accuracy : 0.9947
95% CI : (0.9903, 0.9975)
No Information Rate : 0.6339
P-Value [Acc > NIR] : <2e-16
Kappa : 0.9886
Mcnemar's Test P-Value : 0.1138
Sensitivity : 0.9884
Specificity : 0.9983
Pos Pred Value : 0.9971
Neg Pred Value : 0.9934
Prevalence : 0.3661
Detection Rate : 0.3619
Detection Prevalence : 0.3630
Balanced Accuracy : 0.9934
'Positive' Class : spam
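As an optional cross-check, RTextTools can summarize all three classifiers in a single report; a minimal sketch, assuming the container and the three *_out objects created above:
# combine the three classification outputs into one analytics object
analytics <- create_analytics(container, cbind(svm_out, tree_out, maxent_out))
summary(analytics)  # per-algorithm precision, recall, and F-scores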
Conclusions
Overall, all 3 supervised classifier models correctly classified nearly all of the documents as spam or ham, with balanced accuracies ranging from 0.9801 to 0.9954.
Of the 3 models, the Support Vector Machine was the most accurate, followed by Maximum Entropy and then the decision tree (RTextTools’ “TREE” algorithm).
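To score genuinely new messages (e.g., from your own spam folder, as suggested in The Task), a DTM restricted to the training vocabulary can be classified as “virgin” data. A sketch, assuming a hypothetical new_corpus of unlabeled e-mails prepared with the same cleaning steps as above:
# restrict the new DTM to the terms the models were trained on
new_dtm <- DocumentTermMatrix(new_corpus, control = list(dictionary = Terms(dtm)))
# virgin = TRUE marks the documents as unlabeled; the labels are placeholders
new_container <- create_container(new_dtm, labels = rep(0, nrow(new_dtm)),
    testSize = 1:nrow(new_dtm), virgin = TRUE)
classify_model(new_container, svm_model)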
# Close connections & delete local data
closeAllConnections()
# unlink(data_dir, recursive = TRUE)