The assignment was to take a set of email messages and build a classifier that separates them into spam and ham. I used a set of messages from Kaggle (Wessel van Lit, https://www.kaggle.com/veleon/ham-and-spam-dataset, 2019). I built the Bayesian model by hand, rather than feeding the data into canned functions, to gain a better understanding of how the process works. My approach was as follows:
Create directory structure and split messages into five folds for training and holdout data.
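The library() calls are not shown in this section; the chunks below assume roughly the following packages are attached. The list is reconstructed from the functions that are called, so treat it as an assumption rather than the exact original setup.
# Packages assumed to be attached for the chunks in this section (inferred, not original)
library(dplyr)       # mutate, filter, count, joins, the pipe
library(tidyr)       # spread
library(stringr)     # str_locate, str_match, str_sub, str_replace
library(readtext)    # readtext
library(tidytext)    # unnest_tokens, stop_words
library(textstem)    # lemmatize_words
library(ggplot2)     # ROC, cumulative response, and lift plots
library(gridExtra)   # grid.arrange
library(grid)        # textGrob, gpar
library(bayestestR)  # area_under_curve (assumed source of this function)
library(wordcloud)   # wordcloud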
# Set parameters
# Number of folds to partition the data into
numfolds <- 5
# Use only the top [maxwords] terms to decrease runtime;
# using every single tokenized word resulted in near-perfect models, which
# seemed suspect and not reasonable for practical use.
maxwords <- 200
# See if the dataset has been unzipped yet; if not, automatically unzip it.
if (!dir.exists("hamnspam")) {
if (file.exists("kaggle_spam_ham.zip")) {
print("Unzipping")
unzip("kaggle_spam_ham.zip", overwrite = T)
}
}
# Partition into folds for training and holdout sets
for (k in seq(-1, 1, 2)) {
# Set variable indicating if this is the "ham" loop or the "spam" loop
hamorspam <- "ham"
if (k == 1) {
hamorspam <- "spam"
}
print(paste0("Partitioning ", hamorspam," directory"))
# Iterate over the number of folds
for (i in c(0:(numfolds - 1))) {
# Create the ham or spam folder if it doesn't exist
if (!dir.exists(paste0(hamorspam, "/", i))) {
dir.create(paste0(hamorspam, "/", i))
}
}
# Generate list of files and iterate through them, moving them into a different fold
tmpfiles <- list.files(path=hamorspam, full.names=F, recursive=F, include.dirs=F)
ct <- 0
for (f in tmpfiles) {
fromfile <- paste0(hamorspam, "/", f)
tofile <- paste0(hamorspam, "/", as.character(ct %% numfolds), "/", f)
if (file.info(fromfile)$isdir) {
next
}
ct <- ct + 1
if (ct %% 100 == 0) {
print(paste0(" moving file ", ct, " of ", length(tmpfiles), ": ", fromfile, " -> ", tofile))
}
file.rename(from = fromfile, to = tofile)
}
}
## [1] "Partitioning ham directory"
## [1] "Partitioning spam directory"
# Count total ham and spam
print("Getting file counts")
## [1] "Getting file counts"
totalham <- 0
totalspam <- 0
for (k in seq(-1, 1, 2)) {
# Set variable indicating if this is the "ham" loop or the "spam" loop
hamorspam <- "ham"
if (k == 1) {
hamorspam <- "spam"
}
tmpfiles <- list.files(path=hamorspam, full.names=F, recursive=T, include.dirs=F)
if(k == 1) {
totalspam <- length(tmpfiles)
} else {
totalham <- length(tmpfiles)
}
}
Ingest the data and tokenize it into terms.
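As a quick illustration of what the tokenize/lemmatize/filter pipeline inside processFold() produces, here is a minimal sketch run on a made-up one-line message (not part of the dataset); it assumes dplyr, tidytext, and textstem are attached.
# Toy example of the tokenization pipeline (illustrative only; the message text is made up)
toy <- data.frame(text = "Subject: CHEAP watches!! Buy 100 watches today")
toy %>%
  unnest_tokens(word, text, token = "ngrams", n_min = 1, n = 1) %>%  # lowercase unigram tokens
  mutate(word = lemmatize_words(word)) %>%                           # e.g. "watches" -> "watch"
  filter(!grepl('\\d', word))                                        # drop tokens containing digits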
# To keep the training set balanced, cap each class (ham or spam) at the size of the smaller one
maxset <- totalham
if (totalspam < totalham) {
maxset <- totalspam
}
maxset <- maxset / numfolds
# Iterate through each file in the specified fold
processFold <- function(custom_stopwords, k, foldnum, holdoutflag, maxset) {
# Set variable indicating if this is the "ham" loop or the "spam" loop
hamorspam <- "ham"
if (k == 1) {
hamorspam <- "spam"
}
print(paste0(" Fold #", foldnum, ", reading ", hamorspam," directory"))
# Generate the list of files in this fold and iterate through them, reading and tokenizing each one
current_path <- paste0(hamorspam, "/", foldnum)
tmpfiles <- list.files(path=current_path, full.names=F, recursive=F, include.dirs=F)
ctf <- 0 #file counter
for (f in tmpfiles) {
# Exit as soon as we reach maxset to maintain a balanced sample set from each fold
if (ctf >= maxset) {
break
}
# Increment counters
ct <<- ct + 1
ctf <- ctf + 1
if (ct %% 100 == 0) {
print(paste0(" reading file #", ct, ": ", current_path, "/", f))
}
# Read file
filename <- paste0(current_path, "/", f)
txt <- readtext(filename, verbosity=0)
# Extract subject from header
header_end <- str_locate(txt, '\n\n')
m <- str_match(str_sub(txt, 1, header_end[1] - 1), 'Subject: *(.+?)\n')
subj <- m[2]
# Extract message body, discarding header
msgbody <- str_sub(txt, header_end[2] + 1)
# Tokenize
dftmp <- data.frame(text = paste(subj, msgbody))
dftmp <- dftmp %>% unnest_tokens(word, text, token = "ngrams", n_min = 1, n = 1)
# Remove leading and trailing underscores
dftmp <- dftmp %>% mutate(word = str_replace(word, '^_+(.+)$', '\\1'))
dftmp <- dftmp %>% mutate(word = str_replace(word, '^(.+)_+$', '\\1'))
# Lemmatize words
dftmp <- dftmp %>% mutate(word = lemmatize_words(word))
# Remove words with numbers
dftmp <- dftmp %>% filter(!grepl('\\d', word))
# Remove tokens consisting only of underscores
dftmp <- dftmp %>% filter(!grepl('^_+$', word))
# Remove stopwords
dftmp <- dftmp %>% anti_join(custom_stopwords, by = c("word"))
# Compute term frequency (TF) for this document
dftmp <- dftmp %>% count(word, sort = F) %>%
rename(tf = n)
# Add doc_id, spam, and fold fields
dftmp <- dftmp %>% mutate(doc_id = f, spam = k, fold = foldnum)
# Bind to main data frame
df <<- bind_rows(df, dftmp)
} # for each file
}
# Load standard stop_words and read in custom stop_words from file
data(stop_words)
custom_stopwords <- read.delim("custom_stopwords.txt", sep = '\n', header = F) %>%
rename(word = V1) %>% mutate(lexicon = "custom")
custom_stopwords <- bind_rows(custom_stopwords, stop_words)
# Create main data frame
df <- data.frame(matrix(vector(), 0, 5), stringsAsFactors=F)
colnames(df) <- c("doc_id", "word", "tf", "spam", "fold")
# Seed the tables to set the variable types
df <- df %>% mutate(doc_id = "text", word = "text", tf = 0, spam = 0, fold = 0)
# Do this twice - once for ham, once for spam
ct <- 0
for (k in seq(-1, 1, 2)) {
# Iterate over folds
for (foldnum in c(0:(numfolds-1))) {
# Read and tokenize files
processFold(custom_stopwords, k, foldnum, 0, maxset)
} # for each fold
} # once for ham, once for spam
## [1] " Fold #0, reading ham directory"
## [1] " reading file #100: ham/0/0496.397e015dabee7ac3c6e655ad9bf66052"
## [1] " Fold #1, reading ham directory"
## [1] " reading file #200: ham/1/0492.3aa3aaa0ac9343fd1aa11864e8c281b1"
## [1] " Fold #2, reading ham directory"
## [1] " reading file #300: ham/2/0488.8fddac859ba93d27a041fe770be65d29"
## [1] " Fold #3, reading ham directory"
## [1] " reading file #400: ham/3/0484.9c63b894f85b12f32ad5e5b97cd5c49c"
## [1] " Fold #4, reading ham directory"
## [1] " reading file #500: ham/4/0480.ac223ce8c5be563214bdc8e3af2a18b6"
## [1] " Fold #0, reading spam directory"
## [1] " reading file #600: spam/0/0470.b9e513715695ea1c79c1e5af0fb0eea9"
## [1] " Fold #1, reading spam directory"
## [1] " reading file #700: spam/1/0466.11bc31540055c320b62e5886ef27c4b2"
## [1] " Fold #2, reading spam directory"
## [1] " reading file #800: spam/2/0467.b59a3337d0979ba8d587bf4c166db8b1"
## [1] " Fold #3, reading spam directory"
## [1] " reading file #900: spam/3/0468.8edb99340b9a96a81813be2d3362605d"
## [1] " Fold #4, reading spam directory"
## [1] " reading file #1000: spam/4/0469.c1d9ab5918d50dac4242ef53c4aaf678"
Train the model by iterating over the folds, reserving a holdout fold each iteration.
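For each document in the holdout fold, the loop below computes an (un-normalized) naive Bayes score per class: the class prior multiplied by the product of the per-word probabilities estimated from the training folds,
$$\mathrm{score}(c \mid d) = P(c)\prod_{w \in d} P(w \mid c), \qquad c \in \{\text{ham}, \text{spam}\},$$
where P(w | c) is the fraction of class-c training documents containing w (the p column of dfwords, limited to the top maxwords terms per class) and P(c) is pham or pspam. The two class scores are then collapsed into a single p_spam value that is used to rank documents for the ROC analysis.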
# Iterate over training folds
dfscores <- NULL
for (holdoutfold in c((numfolds-1):0)) {
print(paste0("Set holdout fold to ", holdoutfold))
# Get total number of docs by ham or spam, along with the probabilities of ham or spam
print(" Calculating counts")
numham <- count(df %>%
filter(fold != holdoutfold) %>%
filter(spam == -1) %>%
group_by(doc_id) %>%
summarize(n = n()))[[1]]
numspam <- count(df %>%
filter(fold != holdoutfold) %>%
filter(spam == 1) %>%
group_by(doc_id) %>%
summarize(n = n()))[[1]]
pham <- numham / (numham + numspam)
pspam <- numspam / (numham + numspam)
# Calculate the probability of seeing each word in the training set (which excludes the holdout fold)
print(" Calculating p_word")
dfwords <- df %>%
filter(fold != holdoutfold) %>%
group_by(spam, word) %>%
summarise(docct = n()) %>%
mutate(p = ifelse(spam == 1, docct / numspam, docct / numham)) %>%
rename(spam_of_word = spam) %>%
slice_max(n = maxwords, order_by = p, with_ties = F)
# Using the holdout fold, calculate Bayes probabilities of ham and spam
print(" Calculating Bayes probabilities")
dfjoin <- df %>%
filter(fold == holdoutfold) %>%
inner_join(dfwords, by = c("word")) %>%
group_by(doc_id, spam, spam_of_word) %>%
summarise(prod_p = prod(p)) %>%
mutate(prod_p = ifelse(spam_of_word == 1, prod_p * pspam, prod_p * pham))
# Calculate final probability scores
print(" Calculating final probability scores")
dfresult <- dfjoin %>%
spread(spam_of_word, prod_p) %>%
rename(prod_p_ham = `-1`, prod_p_spam = `1`) %>%
mutate(prod_p_ham = ifelse(is.na(prod_p_ham), 0, prod_p_ham)) %>%
mutate(prod_p_spam = ifelse(is.na(prod_p_spam), 0, prod_p_spam)) %>%
mutate(p_spam = ifelse(prod_p_spam > prod_p_ham, prod_p_spam, 1 - prod_p_ham)) %>%
mutate(is_spam = ifelse(spam == 1, 1, 0)) %>%
mutate(is_ham = ifelse(spam == -1, 1, 0)) %>%
arrange(desc(p_spam))
# Generate thresholds for ROC curves
print(" Generating thresholds for ROC curves")
currentThreshold <- 1
dfthreshold <- dfresult
for (i in 1:nrow(dfresult)) {
if (dfthreshold[i, 'is_spam'] == 1 & dfthreshold[i, 'p_spam'] >= currentThreshold) {
dfthreshold[i, 'true_pos'] <- 1
dfthreshold[i, 'false_pos'] <- 0
}
else if (dfthreshold[i, 'is_spam'] == 1 & dfthreshold[i, 'p_spam'] < currentThreshold) {
dfthreshold[i, 'true_pos'] <- 0
dfthreshold[i, 'false_pos'] <- 1
}
else {
dfthreshold[i, 'true_pos'] <- 0
dfthreshold[i, 'false_pos'] <- 0
}
dfthreshold[i, 'pct_instances'] <- i / nrow(dfresult)
currentThreshold <- dfthreshold[i, 'p_spam']
}
print(" Calculating cumulative rates")
dfthreshold$cum_true_pos <- cumsum(dfthreshold$true_pos)
dfthreshold$cum_false_pos <- cumsum(dfthreshold$false_pos)
dfthreshold$cum_true_pos_rate <- dfthreshold$cum_true_pos / sum(dfthreshold$true_pos)
dfthreshold$cum_false_pos_rate <- dfthreshold$cum_false_pos / sum(dfthreshold$false_pos)
dfthreshold$lift <- dfthreshold$cum_true_pos_rate / dfthreshold$pct_instances
dfthreshold$holdoutfold <- holdoutfold
# Append this fold's results so they can be referenced outside of the for loop
if (exists('dfscores') && is.data.frame(get('dfscores'))) {
dfscores <- dfscores %>%
bind_rows(dfthreshold)
} else {
dfscores <- dfthreshold
}
# Generate ROC curve
print(" Ploting ROC curves")
if (!exists('rocplot')) {
rocplot <- NULL
}
rocplot[[holdoutfold + 1]] <- dfthreshold %>%
ggplot() +
theme(text = element_text(size = 10)) +
geom_point(mapping = aes(x = cum_false_pos_rate, y = cum_true_pos_rate)) +
ggtitle(paste0("Holdout fold #", holdoutfold)) +
xlab("False positive rate") +
ylab("True positive rate")
if (!exists('auc_result')) {
auc_result <- NULL
}
auc_result[holdoutfold + 1] <- area_under_curve(dfthreshold$cum_false_pos_rate,
dfthreshold$cum_true_pos_rate, method = "trapezoid")
print(paste0(" AUC (holdout fold #", holdoutfold, ") = ", auc_result[holdoutfold + 1]))
} # for each cross-validation set
## [1] "Set holdout fold to 4"
## [1] " Calculating counts"
## [1] " Calculating p_word"
## [1] " Calculating Bayes probabilities"
## [1] " Calculating final probability scores"
## [1] " Generating thresholds for ROC curves"
## [1] " Calculating cumulative rates"
## [1] " Ploting ROC curves"
## [1] " AUC (holdout fold #4) = 0.871783088235294"
## [1] "Set holdout fold to 3"
## [1] " Calculating counts"
## [1] " Calculating p_word"
## [1] " Calculating Bayes probabilities"
## [1] " Calculating final probability scores"
## [1] " Generating thresholds for ROC curves"
## [1] " Calculating cumulative rates"
## [1] " Ploting ROC curves"
## [1] " AUC (holdout fold #3) = 0.917157275021026"
## [1] "Set holdout fold to 2"
## [1] " Calculating counts"
## [1] " Calculating p_word"
## [1] " Calculating Bayes probabilities"
## [1] " Calculating final probability scores"
## [1] " Generating thresholds for ROC curves"
## [1] " Calculating cumulative rates"
## [1] " Ploting ROC curves"
## [1] " AUC (holdout fold #2) = 0.950363714163457"
## [1] "Set holdout fold to 1"
## [1] " Calculating counts"
## [1] " Calculating p_word"
## [1] " Calculating Bayes probabilities"
## [1] " Calculating final probability scores"
## [1] " Generating thresholds for ROC curves"
## [1] " Calculating cumulative rates"
## [1] " Ploting ROC curves"
## [1] " AUC (holdout fold #1) = 0.911993097497843"
## [1] "Set holdout fold to 0"
## [1] " Calculating counts"
## [1] " Calculating p_word"
## [1] " Calculating Bayes probabilities"
## [1] " Calculating final probability scores"
## [1] " Generating thresholds for ROC curves"
## [1] " Calculating cumulative rates"
## [1] " Ploting ROC curves"
## [1] " AUC (holdout fold #0) = 0.852112676056338"
Present the results.
# Plot ROC curves
grid.arrange(ncol = 2, rocplot[[1]], rocplot[[2]], rocplot[[3]], rocplot[[4]], rocplot[[5]],
top = textGrob("ROC curves",gp=gpar(fontsize=12,font=3)))
# Show AUC values
df_auc <- data.frame(holdoutfold = c(0:(numfolds-1)), auc = auc_result)
df_auc
# Plot cumulative response curves
dfscores %>%
ggplot(aes(x = 100 * pct_instances, y = 100 * cum_true_pos_rate,
color = as.character(holdoutfold), linetype = as.character(holdoutfold))) +
theme(text = element_text(size = 10)) +
geom_point(alpha = 0.01) +
geom_line() + geom_abline(slope = 1, intercept = 0) +
xlab("Percentage of test instances") +
ylab("Percentage of positives") +
ggtitle("Cumulative resposne curve")
dfscores %>% ggplot(aes(x = 100 * pct_instances, y = lift,
color = as.character(holdoutfold), linetype = as.character(holdoutfold))) +
theme(text = element_text(size = 10)) +
geom_point(alpha = 0.01) +
geom_line() +
xlab("Percentage of test instances") +
ylab("Lift") +
ggtitle("Lift curve")
Display the most common hammy and spammy words.
# Top hammy words
dftopwords <- df %>%
filter(spam == -1) %>%
count(word, spam, sort = T) %>%
ungroup()
head(dftopwords, 20)
# Top spammy words
dftopwords <- df %>%
filter(spam == 1) %>%
count(word, spam, sort = T) %>%
ungroup()
head(dftopwords, 20)
# Word cloud - hammy words
df %>%
filter(spam == -1) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 75))
# Word cloud - spammy words
df %>%
filter(spam == 1) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 75))
As shown in the ROC plots, AUC table, and cumulative response curves, the model that performed best was the one in which holdout fold #2 was excluded from the training data and used for testing. That model performs very well, with an AUC of 0.95.
It should be noted that when the full tokenized word set from each fold was used for training, some models achieved an AUC of 1.000, but at the expense of runtimes of well over two minutes. To balance model performance with resource utilization, I introduced a “maxwords” variable and set it to 200. While I didn’t do any benchmarking to evaluate the effect of maxwords on resource consumption, setting maxwords to 200 still achieved a respectable 0.95 AUC while reducing the runtime to about 30 seconds.
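If that benchmarking were added later, a rough sketch (hypothetical, not run as part of this analysis) would be to wrap the cross-validation loop in system.time() for a few candidate values of maxwords:
# Hypothetical benchmarking sketch (not part of the original analysis)
for (mw in c(50, 200, 1000)) {
  maxwords <- mw
  elapsed <- system.time({
    # ... re-run the training/holdout loop above with this value of maxwords ...
  })["elapsed"]
  print(paste0("maxwords = ", mw, ": ", round(elapsed, 1), " seconds"))
}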