Load libraries
library(stringr)
library(tm)
library(RTextTools)
library(SnowballC)
library(RCurl)
library(magrittr)   # provides the %>% pipe used below
Set up the file URLs. The file names of all email files are stored in the third column of the cmds file in the corresponding folder.
ham_folder<-"https://raw.githubusercontent.com/ezaccountz/Data_607_Project_4/master/ham/"
spam_folder<-"https://raw.githubusercontent.com/ezaccountz/Data_607_Project_4/master/spam/"
ham_file_table <- read.table(str_c(ham_folder, "cmds"),stringsAsFactors = FALSE)
spam_file_table <- read.table(str_c(spam_folder, "cmds"),stringsAsFactors = FALSE)
ham_files <- str_c(ham_folder, ham_file_table$V3)
spam_files <- str_c(spam_folder, spam_file_table$V3)
Create two character vectors, one for the ham emails and one for the spam emails. Also check that each file exists before downloading it.
ham_doc <- character()
spam_doc <- character()
for (file_url in ham_files) {
  if (url.exists(file_url)) ham_doc <- append(ham_doc, getURL(file_url))
}
for (file_url in spam_files) {
  if (url.exists(file_url)) spam_doc <- append(spam_doc, getURL(file_url))
}
Check that all files were downloaded successfully.
print(str_c(length(ham_doc), " of ", str_c(length(ham_files), " ham files are loaded")))
## [1] "2500 of 2500 ham files are loaded"
print(str_c(length(spam_doc), " of ", str_c(length(spam_files), " spam files are loaded")))
## [1] "500 of 500 spam files are loaded"
Calculate the number of documents used for training; the remainder will be used for model testing. In this case, I withhold 20% of the documents for testing.
test_case_withhold <- 0.2
num_train_ham <- as.integer((1 - test_case_withhold) * length(ham_doc))
num_train_spam <- as.integer((1 - test_case_withhold) * length(spam_doc))
Combine the ham and spam documents into one vector and create the labels (ham or spam) for the documents. The first 80% of the documents (as defined above) are for training and the remainder are for testing.
corpus_doc <- character()
corpus_label <- character()
corpus_doc <- append(corpus_doc, ham_doc[1:num_train_ham])
corpus_label <- append(corpus_label, rep("ham",num_train_ham))
corpus_doc <- append(corpus_doc, spam_doc[1:num_train_spam])
corpus_label <- append(corpus_label, rep("spam",num_train_spam))
corpus_doc <- append(corpus_doc, ham_doc[(num_train_ham + 1): length(ham_doc)])
corpus_label <- append(corpus_label, rep("ham",length(ham_doc) - num_train_ham))
corpus_doc <- append(corpus_doc, spam_doc[(num_train_spam + 1): length(spam_doc)])
corpus_label <- append(corpus_label, rep("spam",length(spam_doc) - num_train_spam))
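A quick sanity check on the split, using the counts printed above (2,500 ham and 500 spam with 20% withheld):
table(corpus_label[1:(num_train_ham + num_train_spam)])     # training labels: 2000 ham, 400 spam
table(corpus_label[-(1:(num_train_ham + num_train_spam))])  # test labels: 500 ham, 100 spam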
Separate the header information from the body of the email. The header and the body of an email are separated by an empty line in the email file, which can be found with the regex “\n{2}”.
corpus_header <- character()
corpus_content <- character()
for (doc in corpus_doc) {
  temp <- str_locate(doc, "\n{2}")[1]
  corpus_header <- append(corpus_header, str_sub(doc, 1, temp))
  corpus_content <- append(corpus_content, str_sub(doc, temp + 2))
}
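To see the split on a made-up email string:
toy <- "From: a@b.c\nSubject: hi\n\nThe body starts here."
pos <- str_locate(toy, "\n{2}")[1]
str_sub(toy, 1, pos)   # the header block
str_sub(toy, pos + 2)  # "The body starts here."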
Set up a function to check the readability of a string or document. A readable string should consist mostly of English alphabet characters. The function calculates the proportion of English alphabet characters in an input string.
check_readable <- function(doc_str) {
  temp <- str_remove_all(doc_str, " ")        # drop spaces
  temp <- unlist(str_split(temp, ""))         # split into single characters
  temp2 <- str_detect(temp, "[A-Za-z]")       # flag English alphabet characters
  return(length(temp[temp2]) / length(temp))  # proportion of alphabet characters
}
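For example, on made-up strings:
check_readable("Meeting at noon tomorrow")  # 1: all non-space characters are letters
check_readable("$$$100%FREE!!!$$$")         # about 0.24: mostly symbols and digits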
Check if the emails contain subjects and extract the subjects if they do.
Check if the subjects contain an exclamation mark, which shows up in many spam advertising emails.
Calculate the readability of the subjects.
Check if the emails are reply emails; a reply email from someone is usually a non-spam email.
corpus_subject <- str_extract(corpus_header, "Subject:.*\n")
corpus_subject <- str_remove(corpus_subject, "Subject:")
corpus_subject <- str_remove(corpus_subject, "\n")
corpus_no_subject <- is.na(corpus_subject) | str_detect(corpus_subject, "^[[:space:]]*$")
corpus_subject_exclamation <- !is.na(corpus_subject) & str_detect(corpus_subject, "!")  # guard against emails with no Subject header
corpus_subject_readability <- sapply(corpus_subject, check_readable, USE.NAMES = FALSE)
corpus_reply_to <- str_detect(corpus_header, "Reply-To:.*\n")
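On a made-up header, the extraction works like this:
toy_header <- "Return-Path: <x@y.z>\nSubject: Make money fast!\nDate: Mon\n"
str_extract(toy_header, "Subject:.*\n")  # "Subject: Make money fast!\n"
str_detect(toy_header, "Reply-To:.*\n")  # FALSE: not a reply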
Replace the newline characters with a space character. Also replace the HTML entity “&nbsp;” with an actual space character.
corpus_content_modified <- corpus_content %>%
  str_replace_all("\n", " ") %>%
  str_replace_all("&nbsp;", " ")
Check if there is a link to a web page in the content of the email.
Calculate the readability of the contents.
corpus_content_link <- str_detect(corpus_content_modified, regex("https?:",ignore_case=TRUE))
corpus_content_readability <- sapply(corpus_content_modified, check_readable, USE.NAMES = FALSE)
Remove all HTML tags and all URL links.
corpus_content_modified <- corpus_content_modified %>%
  str_remove_all("<.*?>") %>%
  str_remove_all(regex("https?:.*?[[:space:]]", ignore_case = TRUE))  # non-greedy: stop at the first whitespace after the link
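A made-up string shows the effect of the clean-up above:
s <- "<p>Click <b>here</b>: https://example.com/win now!</p> "
s %>% str_remove_all("<.*?>") %>%
  str_remove_all(regex("https?:.*?[[:space:]]", ignore_case = TRUE))
# "Click here: now! "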
Based on some of the common characteristics of spam emails, the following indicators are added to the contents of the emails:
Add a pseudo token “nosubject” to the content of the emails if they have no subject.
Add a pseudo token “exclamationsubject” to the content of the emails if the subject contains an exclamation mark.
Add a pseudo token “unreadablesubject” to the content of the emails if the readability of the subject is less than 0.7.
Add a pseudo token “nonreplyemail” to the content of the emails if they are not reply emails.
Add a pseudo token “containlink” to the content of the emails if they contain a link to a web page.
Add a pseudo token “unreadablecontent” to the content of the emails if the readability of the content is less than 0.5.
corpus_content_modified[corpus_no_subject] <- str_c(corpus_content_modified[corpus_no_subject], " nosubject")
corpus_content_modified[corpus_subject_exclamation] <- str_c(corpus_content_modified[corpus_subject_exclamation], " exclamationsubject")
corpus_content_modified[!is.na(corpus_subject_readability) & corpus_subject_readability < 0.7] <-
str_c(corpus_content_modified[!is.na(corpus_subject_readability) & corpus_subject_readability < 0.7], " unreadablesubject")
corpus_content_modified[!corpus_reply_to] <- str_c(corpus_content_modified[!corpus_reply_to], " nonreplyemail")
corpus_content_modified[corpus_content_link] <- str_c(corpus_content_modified[corpus_content_link], " containlink")
corpus_content_modified[!is.na(corpus_content_readability) & corpus_content_readability < 0.5] <-
str_c(corpus_content_modified[!is.na(corpus_content_readability) & corpus_content_readability < 0.5], " unreadablecontent")
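A quick way to see how often each indicator fires (the counts depend on the data):
sum(corpus_no_subject)           # emails with no subject
sum(corpus_subject_exclamation)  # subjects containing an exclamation mark
sum(!corpus_reply_to)            # non-reply emails
sum(corpus_content_link)         # contents containing a link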
Create the document corpus
corpus_content_modified <- Corpus(VectorSource(corpus_content_modified))
Clean up the document texts and create a document term matrix for modeling
tdm <- corpus_content_modified %>%
  tm_map(removeNumbers) %>%
  tm_map(str_replace_all, pattern = "[[:punct:]]", replacement = " ") %>%
  tm_map(removeWords, words = stopwords("en")) %>%
  tm_map(tolower) %>%
  tm_map(stemDocument) %>%
  DocumentTermMatrix() %>%
  removeSparseTerms(1 - (10/length(corpus_content_modified)))
tdm
## <<DocumentTermMatrix (documents: 3000, terms: 2772)>>
## Non-/sparse entries: 151697/8164303
## Sparsity : 98%
## Maximal term length: 67
## Weighting : term frequency (tf)
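As a sanity check on the matrix, tm’s findFreqTerms() lists the terms that occur at least a given number of times (the threshold of 500 here is arbitrary):
findFreqTerms(tdm, lowfreq = 500)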
Create a container for modeling. As defined above, the first 80% of the documents are used for model training and the remaining 20% for testing.
container <- create_container(tdm, labels = corpus_label,
                              trainSize = 1:(num_train_ham + num_train_spam),
                              testSize = (num_train_ham + num_train_spam + 1):length(corpus_content),
                              virgin = FALSE)
In chapter 10 of the textbook “Automated Data Collection with R”, three models are introduced: Support Vector Machines, Random Forest, and Maximum Entropy. However, the book later uses the Tree model in place of the Random Forest model, so I train all four here to compare their performance.
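As an aside, RTextTools can print the full list of algorithms that train_model() accepts:
print_algorithms()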
svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")
rf_model <- train_model(container, "RF")
Notice that the run time of the training process for the Random Forest model is significantly higher than for the other models, which is consistent with the fact that the model builds many decision trees and aggregates their predictions.
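To quantify the difference, the training calls can be wrapped in system.time() (timings vary by machine):
system.time(train_model(container, "TREE"))
system.time(train_model(container, "RF"))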
Finally, we test the performance of the models.
svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)
rf_out <- classify_model(container, rf_model)
labels_out <- data.frame(
  correct_label = corpus_label[(num_train_ham + num_train_spam + 1):length(corpus_content)],
  svm = as.character(svm_out[,1]),
  tree = as.character(tree_out[,1]),
  maxent = as.character(maxent_out[,1]),
  rf = as.character(rf_out[,1])
)
labels_out[496:505,]
## correct_label svm tree maxent rf
## 496 ham ham spam ham ham
## 497 ham ham ham ham ham
## 498 ham ham ham ham ham
## 499 ham ham spam ham ham
## 500 ham ham ham ham ham
## 501 spam spam spam spam spam
## 502 spam spam spam spam spam
## 503 spam spam spam spam spam
## 504 spam spam ham spam ham
## 505 spam spam spam spam spam
Compare the accuracy of the models.
print("Support Vector Machines")
## [1] "Support Vector Machines"
table(labels_out[,1] == labels_out[,2])
##
## FALSE TRUE
## 10 590
print("Tree")
## [1] "Tree"
table(labels_out[,1] == labels_out[,3])
##
## FALSE TRUE
## 70 530
print("Maximum Entropy")
## [1] "Maximum Entropy"
table(labels_out[,1] == labels_out[,4])
##
## FALSE TRUE
## 8 592
print("Random Forest")
## [1] "Random Forest"
table(labels_out[,1] == labels_out[,5])
##
## FALSE TRUE
## 12 588
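Equivalently, the accuracies can be computed in one line; the values follow from the tables above (roughly 0.983, 0.883, 0.987 and 0.980):
sapply(labels_out[,-1], function(pred) mean(as.character(pred) == as.character(labels_out$correct_label)))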
As we can see, the single Tree model is significantly worse than the other models. The Random Forest model does much better, and the other two models have roughly the same accuracy.
For a better comparison, let’s look at the confusion matrices:
print("Support Vector Machines")
## [1] "Support Vector Machines"
table(labels_out[,1], labels_out[,1] == labels_out[,2])
##
## FALSE TRUE
## ham 0 500
## spam 10 90
print("Tree")
## [1] "Tree"
table(labels_out[,1], labels_out[,1] == labels_out[,3])
##
## FALSE TRUE
## ham 51 449
## spam 19 81
print("Maximum Entropy")
## [1] "Maximum Entropy"
table(labels_out[,1], labels_out[,1] == labels_out[,4])
##
## FALSE TRUE
## ham 4 496
## spam 4 96
print("Random Forest")
## [1] "Random Forest"
table(labels_out[,1], labels_out[,1] == labels_out[,5])
##
## FALSE TRUE
## ham 0 500
## spam 12 88
Though Maximum Entropy has the highest accuracy, it classifies 4 ham emails as spam. There is little or no cost to misclassifying spam as ham, but the cost of misclassifying ham as spam may be critical, as the email may contain important messages. Comparing Support Vector Machines to Random Forest, SVM performs slightly better and the run time of Random Forest is significantly higher. Therefore, SVM is the best model in this case.
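As a final note, RTextTools also bundles these diagnostics: create_analytics() produces precision, recall, and F-scores for each trained algorithm. A quick sketch:
analytics <- create_analytics(container, cbind(svm_out, tree_out, maxent_out, rf_out))
summary(analytics)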