This week’s assignment parses and classifies a sample of spam and non-spam e-mails using the tm and RTextTools packages, as described in the course book. The labeled files are provided at https://spamassassin.apache.org/publiccorpus/
library(stringr)
library(tm)
First, create the corpus of spam e-mails:
if (!dir.exists("spam")) {
  download.file(url = "https://spamassassin.apache.org/publiccorpus/20021010_spam.tar.bz2",
                destfile = "20021010_spam.tar.bz2")
  untar("20021010_spam.tar.bz2", compressed = "bzip2")
}
spam_files = list.files(path = "spam", full.names = TRUE)
# Initialize the corpus by reading the first relevant document
# (the first file in the spam folder is a non-relevant index file, so start at index 2)
tmp = readLines(con = spam_files[2])
tmp = str_c(tmp, collapse = "")
spam_corpus = Corpus(VectorSource(tmp))
meta(spam_corpus[[1]], "label") = "spam"
# Add the remaining documents from the folder
for (i in 3:length(spam_files)) {
  tmp = readLines(con = spam_files[i])
  tmp = str_c(tmp, collapse = "")
  # After collapsing, tmp is a single string, so test its character count
  if (nchar(tmp) > 0) {
    tmp_corpus = Corpus(VectorSource(tmp))
    meta(tmp_corpus[[1]], "label") = "spam"
    spam_corpus = c(spam_corpus, tmp_corpus)
  }
}
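As an aside, the corpus does not have to be grown one document at a time inside the loop, which gets slow as the corpus grows. A minimal loop-free sketch using the same tm primitives (the helper read_mail is hypothetical, not part of any package):
# Hypothetical loop-free equivalent: read every file first, then build one corpus
read_mail = function(f) str_c(readLines(f), collapse = "")
docs = vapply(spam_files[-1], read_mail, character(1))
spam_corpus2 = Corpus(VectorSource(docs))  # one document per file
for (i in seq_along(spam_corpus2)) meta(spam_corpus2[[i]], "label") = "spam"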
Use the same approach to build the non-spam (or “ham”) corpus. Note that the archive extracts into a folder named easy_ham, so that is the directory to test for and read from:
if (!dir.exists("easy_ham")) {
  download.file(url = "https://spamassassin.apache.org/publiccorpus/20021010_easy_ham.tar.bz2",
                destfile = "20021010_easy_ham.tar.bz2")
  untar("20021010_easy_ham.tar.bz2", compressed = "bzip2")
}
ham_files = list.files(path = "easy_ham", full.names = TRUE)
# Initialize the corpus by reading the first document
tmp = readLines(con = ham_files[1])
tmp = str_c(tmp, collapse = "")
ham_corpus = Corpus(VectorSource(tmp))
meta(ham_corpus[[1]], "label") = "ham"
# Add the remaining documents from the folder
for (i in 2:length(ham_files)) {
  tmp = readLines(con = ham_files[i])
  tmp = str_c(tmp, collapse = "")
  # After collapsing, tmp is a single string, so test its character count
  if (nchar(tmp) > 0) {
    tmp_corpus = Corpus(VectorSource(tmp))
    meta(tmp_corpus[[1]], "label") = "ham"
    ham_corpus = c(ham_corpus, tmp_corpus)
  }
}
Combine the two corpora into a single corpus that will be used for the rest of the processing.
total_corpus = c(ham_corpus,spam_corpus)
# Check that the spam/ham labels have been applied correctly
meta_data = data.frame(unlist(meta(total_corpus, "label")))
table(meta_data)
## meta_data
## ham spam
## 2551 500
The spam and ham label counts match the number of files read from each source folder.
The collected text in the corpus is very messy. Here is an example of a single “spam” document:
total_corpus[[3000]][1]
## $content
## [1] "From home_loans@eudoramail.com Tue Sep 24 10:52:24 2002Return-Path: <home_loans@eudoramail.com>Delivered-To: zzzz@localhost.jmason.orgReceived: from localhost (jalapeno [127.0.0.1])\tby zzzzason.org (Postfix) with ESMTP id C986816F16\tfor <zzzz@localhost>; Tue, 24 Sep 2002 10:52:23 +0100 (IST)Received: from jalapeno [127.0.0.1]\tby localhost with IMAP (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Tue, 24 Sep 2002 10:52:23 +0100 (IST)Received: from email1.micrel.com ([65.218.208.2]) by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g8O1gsC15674 for <webmaster@efi.ie>; Tue, 24 Sep 2002 02:42:55 +0100Received: from mx1.eudoramail.com ([202.9.153.104]) by email1.micrel.com with Microsoft SMTPSVC(5.0.2195.4905); Mon, 23 Sep 2002 18:43:15 -0700Message-Id: <000045ce0ef9$00002ab2$000040ed@mx1.eudoramail.com>To: <Undisclosed.Recipients@dogma.slashnull.org>From: home_loans@eudoramail.comSubject: Lenders WILL COMPETE for your mortgage IVXDate: Tue, 24 Sep 2002 07:13:20 -0700MIME-Version: 1.0Content-Type: text/plain; charset=\"Windows-1252\"Content-Transfer-Encoding: 7bitX-Originalarrivaltime: 24 Sep 2002 01:43:16.0846 (UTC) FILETIME=[C0E1A8E0:01C2636B]Dear Homeowner, Interest Rates are at their lowest point in 40 years!We help you find the best rate for your situation bymatching your needs with hundreds of lenders!Home Improvement, Refinance, Second Mortgage,Home Equity Loans, and More! Even with less thanperfect credit!This service is 100% FREE to home owners and newhome buyers without any obligation. Just fill out a quick, simple form and jump-startyour future plans today!Visit http://61.145.116.186/user0201/index.asp?Afft=QM10To unsubscribe, please visit:http://61.145.116.186/light/watch.asp"
Apply the cleaning functions of the tm package, together with some custom text transformers, to each document in the corpus:
# Remove numbers
total_corpus = tm_map(total_corpus, content_transformer(removeNumbers))
# Replace punctuation, anything between <brackets>, and tab characters with spaces, as these parts are not informative
total_corpus = tm_map(total_corpus, content_transformer(function(x)
str_replace_all(x,pattern = "[[:punct:]]|\\<.+?\\>|\\t", replacement = " ")))
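# Toy illustration of the transformer above (example string, not from the corpus):
# punctuation, tabs, and <...> spans in a header-like line all become spaces
str_replace_all("Date: Tue, 24 Sep\t<token>", "[[:punct:]]|\\<.+?\\>|\\t", " ")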
# Apply further cleaning transformations
total_corpus = tm_map(total_corpus, content_transformer(tolower))
total_corpus = tm_map(total_corpus, content_transformer(stripWhitespace))
total_corpus = tm_map(total_corpus, content_transformer(removePunctuation))
total_corpus = tm_map(total_corpus, content_transformer(function(x)
removeWords(x, stopwords("english"))))
Now the proportion of informative content in the text is much higher:
total_corpus[[3000]][1]
## $content
## [1] " home loans eudoramail com tue sep return path delivered zzzz localhost jmason orgreceived localhost jalapeno zzzzason org postfix esmtp id cf tue sep ist received jalapeno localhost imap fetchmail zzzz localhost single drop tue sep ist received email micrel com dogma slashnull org esmtp id gogsc tue sep received mx eudoramail com email micrel com microsoft smtpsvc mon sep message id home loans eudoramail comsubject lenders will compete mortgage ivxdate tue sep mime version content type text plain charset windows content transfer encoding bitx originalarrivaltime sep utc filetime ceae cb dear homeowner interest rates lowest point years help find best rate situation bymatching needs hundreds lenders home improvement refinance second mortgage home equity loans even less thanperfect credit service free home owners newhome buyers without obligation just fill quick simple form jump startyour future plans today visit http user index asp afftqmto unsubscribe please visit http light watch asp"
Before modeling, we randomly permute the corpus documents: the combined corpus is ordered ham first, then spam, so the positional training/test split used by the create_container function below would otherwise separate the two classes.
# Generate a random permutation of corpus documents
set.seed(123)
total_corpus1 = sample(total_corpus)
# Extract the labels
labels = unlist(meta(total_corpus1, "label"))
head(labels,20)
## 1 1 1 1 1 1 1 1 1 1
## "ham" "ham" "ham" "spam" "spam" "ham" "ham" "spam" "ham" "ham"
## 1 1 1 1 1 1 1 1 1 1
## "spam" "ham" "ham" "ham" "ham" "spam" "ham" "ham" "ham" "spam"
Now we can create a document-term matrix. For weighting, we use the TF-IDF measure, as described in the Data Science for Business book. We also drop words shorter than 3 characters (uninformative) or longer than 10 characters (mostly words pasted together by mistake).
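As a toy illustration of the weighting: tm's weightTfIdf multiplies each (length-normalized) term frequency by log2(N/df), so a term appearing in every document gets weight zero, while rarer terms are boosted:
toy = Corpus(VectorSource(c("free money free", "meeting notes", "free meeting")))
toy_dtm = DocumentTermMatrix(toy, control = list(weighting = weightTfIdf))
as.matrix(toy_dtm)  # "free" and "meeting" occur in 2 of 3 docs, so df damps them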
Using the DTM, we can apply three built-in models from the RTextTools library to predict the class (“ham” or “spam”):
# Create a dtm for the new corpus
dtm = DocumentTermMatrix(total_corpus1, control = list(stemming = TRUE,
                                                       weighting = weightTfIdf,
                                                       wordLengths = c(3, 10)))
# Remove sparse terms that appear in fewer than 10 documents
dtm = removeSparseTerms(dtm, 1-(10/length(total_corpus1)))
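# Optional sanity check: how much did the pruning shrink the matrix?
dim(dtm)                # documents x surviving terms
inspect(dtm[1:3, 1:5])  # peek at a corner of the weighted matrix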
# Create container
library(RTextTools)
N = length(labels)
trainsize = round(N*0.75) # use 75% for training and the rest for testing
container <- create_container(
dtm,
labels = labels,
trainSize = 1:trainsize,
testSize = (trainsize+1):N,
virgin = FALSE
)
# Train three models
models = train_models(container, algorithms=c("MAXENT","SVM","BOOSTING"))
# Generate results for the test data
results = classify_models(container, models)
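# classify_models returns one predicted-label column and one probability column
# per algorithm (MAXENTROPY_LABEL, MAXENTROPY_PROB, SVM_LABEL, ...)
head(results, 3)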
Now we can compare the model predictions with the true labels for the test data, using precision and recall as metrics with spam as the positive class: precision = TP/(TP + FP) is the share of messages flagged as spam that really are spam, and recall = TP/(TP + FN) is the share of actual spam that gets flagged.
validation = data.frame(MAXENT = results$MAXENTROPY_LABEL,
SVM=results$SVM_LABEL,
BOOST=results$LOGITBOOST_LABEL,
truelabel = labels[(trainsize+1):N],
stringsAsFactors = F)
max_perf = table(validation$MAXENT,validation$truelabel,dnn=c("MAXENT","True"))
svm_perf = table(validation$SVM,validation$truelabel,dnn=c("SVM","True"))
boost_perf = table(validation$BOOST,validation$truelabel,dnn=c("BOOST","True"))
# Helper metrics for our confusion tables (rows = predicted, columns = true;
# row/column 2 corresponds to "spam", the positive class)
precision = function(x) {x[2,2]/(x[2,2]+x[2,1])} # TP / (TP + FP)
recall = function(x) {x[2,2]/(x[2,2]+x[1,2])}    # TP / (TP + FN)
# Calculate and visualize the metrics
models_perf = data.frame(Model = c("MAXENT","SVM","BOOST"),
Precision = unlist(lapply(list(max_perf,svm_perf,boost_perf),
FUN = precision)),
Recall = unlist(lapply(list(max_perf,svm_perf,boost_perf),
FUN = recall)))
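# Optional cross-check: RTextTools' create_analytics summarizes precision,
# recall, and F-scores per algorithm for the same container and results
analytics = create_analytics(container, results)
summary(analytics)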
library(plotly)
plot_ly(data=models_perf, x = ~Model, y = ~Precision, type = "bar",
name = "Precision") %>%
add_trace(data=models_perf, x = ~Model, y = ~Recall, type = "bar",
name = "Recall") %>%
layout(title = "Classification Model Performance on Test Data",yaxis = list(title="Value"))
We see that the boosting model performs suspiciously well, classifying every test message correctly; this may be a sign of overfitting. The maximum entropy model comes second. The SVM model performed worst, with a recall of 93%, meaning that about 7% of spam messages would still land in the main inbox after classification by this model.
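One way to probe the suspicion about the boosting model is n-fold cross-validation over the whole labeled set. A sketch using RTextTools' cross_validate helper, which reports per-fold accuracy rather than precision and recall:
# 5-fold cross-validation of the boosting model on the full container;
# uniformly perfect fold accuracies would reinforce the overfitting concern
set.seed(123)
cv_boost = cross_validate(container, nfold = 5, algorithm = "BOOSTING")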