Spam Filter using Naive Bayes Classifier

This is from Chapter 3 of Machine Learning for Hackers book.

Create Paths for reading text first

Use library(tm) for text mining

# Load libraries
library("tm")
library("ggplot2")

# Set the global paths
# Every corpus lives under the same data directory, so build that directory
# once and derive the per-corpus paths from it; moving the data then only
# requires changing this one line.
data.path <- file.path("~", "Dropbox", "Data Science", "ML_for_Hackers", "03-Classification", 
    "data")
spam.path <- file.path(data.path, "spam")
spam2.path <- file.path(data.path, "spam_2")
easyham.path <- file.path(data.path, "easy_ham")
easyham2.path <- file.path(data.path, "easy_ham_2")
hardham.path <- file.path(data.path, "hard_ham")
hardham2.path <- file.path(data.path, "hard_ham_2")

Write a Function to Read and Store the Email Content

For each “corpus”, we use this function to read an email file and return its message body as a single string.
The message always begins after the first full line break (“null line”)
rt argument means “read as text”
readLines() returns each line as a separate element of a character vector
- read in all of the lines
- locate the first empty line of the text: which(text == "")[1]
- extract all elements afterwards
The message always begins after the first full line break (“null line”)

Write a Function to Read and Store the Email Content

For each “corpus”, we use this function to read an email file and return its message body as a single string.

# Read one email file and return its message body as a single string.
#
# path: location of the email file.
# Returns the lines that follow the first empty line (the separator between
# header and body), joined with "\n". Returns "" when the file has no empty
# line or when nothing follows the first empty line.
get.msg <- function(path) {
    con <- file(path, open = "rt", encoding = "latin1")
    # Registered before reading so the connection is closed even if
    # readLines() fails part-way through.
    on.exit(close(con), add = TRUE)
    text <- readLines(con)
    # Index of the first empty line; NA when there is none.
    first.blank <- which(text == "")[1]
    if (is.na(first.blank) || first.blank >= length(text)) {
        return("")
    }
    paste(text[seq(first.blank + 1, length(text), 1)], collapse = "\n")
}

Use Function to Store Contents from Spam Emails

dir() gets all file names in the directory
Need to get rid of cmds file which is a long list of Unix based commands to move files in these directories
Use sapply() and pass an anonymous function so that we can use the “value” in the “cell”

# List the spam training files, leaving out the "cmds" helper file.
spam.docs <- setdiff(dir(spam.path), "cmds")
# Read the body of every spam message; sapply() names each element after the
# file name it was read from.
all.spam <- sapply(spam.docs, function(doc) {
    get.msg(file.path(spam.path, doc))
})

Term Document Matrix (TDM)

Term Document Matrix: Rows contain all the terms found in all of the documents. Columns are the emails (each document). The tm package is very handy for text distilling.

# Build a term-document matrix from a character vector of documents.
#
# doc.vec: character vector, one element per document.
# Returns the result of TermDocumentMatrix() for a corpus made from doc.vec.
get.tdm <- function(doc.vec) {
    # Construct Corpus using vector of emails
    email.corpus <- Corpus(VectorSource(doc.vec))
    # Text-cleaning options handed to TermDocumentMatrix().
    # NOTE(review): minDocFreq may not be recognised by recent tm releases -
    # confirm against the installed version.
    tdm.options <- list(stopwords = TRUE, removePunctuation = TRUE, removeNumbers = TRUE, 
        minDocFreq = 2)
    TermDocumentMatrix(email.corpus, tdm.options)
}
# Term-document matrix for the spam training messages
spam.tdm <- get.tdm(all.spam)

Use TDM to build Training Data for Spam

# Plain count matrix for the spam training set: one row per term, one column
# per message.
spam.matrix <- as.matrix(spam.tdm)
# Total number of times each term appears across all spam messages.
spam.counts <- rowSums(spam.matrix)
# Build the columns with their final types directly; going through cbind()
# would turn the counts into strings that then have to be converted back.
spam.df <- data.frame(term = names(spam.counts), frequency = as.numeric(spam.counts), 
    stringsAsFactors = FALSE)
## % of emails in which a given term occurs GIVEN THAT IT'S SPAM (CONDITIONAL
## PROBABILITY)
# Vectorised count of the messages with a non-zero entry in each row. This
# avoids 1:nrow(), which yields c(1, 0) for a matrix with no rows. unname()
# keeps the result a plain vector, as the row-by-row version produced.
spam.occurrence <- unname(rowSums(spam.matrix > 0, na.rm = TRUE))/ncol(spam.matrix)
## Frequency of the term within ENTIRE CORPUS (PRIOR PROBABILITY)
spam.density <- spam.df$frequency/sum(spam.df$frequency)

Add density and occurrence rate to the Spam data frame

Use transform() to add density (PRIOR PROB) and occurrence rate (CONDITIONAL PROB) to spam.df

# Attach the density and occurrence columns to the spam term table.
spam.df$density <- spam.density
spam.df$occurrence <- spam.occurrence
# Show the terms with the highest occurrence rate first.
head(spam.df[order(-spam.df$occurrence), ])

##         term frequency  density occurrence
## 5701   email       692 0.006356      0.496
## 13917 please       360 0.003306      0.452
## 10933   list       364 0.003343      0.396
## 2109    body       341 0.003132      0.388
## 20144   will       696 0.006392      0.380
## 8585    html       372 0.003417      0.372

Now Build TDM for Ham emails

Only use “Easy Ham” emails in training set for sake of simplicity
We should probably use “Hard Ham” as well in training for a production/more accurate classifier

# List the easy-ham training files, leaving out the "cmds" helper file.
easyham.docs <- dir(easyham.path)
easyham.docs <- easyham.docs[easyham.docs != "cmds"]
# Take at most as many ham messages as there are spam messages. head() stops
# at the files that actually exist, whereas indexing with 1:length(spam.docs)
# would produce NA file names if there were fewer ham files than spam files.
all.easyham <- sapply(head(easyham.docs, length(spam.docs)), function(p) get.msg(file.path(easyham.path, 
    p)))

# Term-document matrix for the easy-ham training messages
easyham.tdm <- get.tdm(all.easyham)

As with Spam, So with Ham!

# Plain count matrix for the easy-ham training set: one row per term, one
# column per message.
easyham.matrix <- as.matrix(easyham.tdm)
# Total number of times each term appears across all easy-ham messages.
easyham.counts <- rowSums(easyham.matrix)
# Build the columns with their final types directly; going through cbind()
# would turn the counts into strings that then have to be converted back.
easyham.df <- data.frame(term = names(easyham.counts), frequency = as.numeric(easyham.counts), 
    stringsAsFactors = FALSE)
# CONDITIONAL PROBABILITY
# Vectorised count of the messages with a non-zero entry in each row. This
# avoids 1:nrow(), which yields c(1, 0) for a matrix with no rows. unname()
# keeps the result a plain vector, as the row-by-row version produced.
easyham.occurrence <- unname(rowSums(easyham.matrix > 0, na.rm = TRUE))/ncol(easyham.matrix)
# PRIOR PROBABILITY
easyham.density <- easyham.df$frequency/sum(easyham.df$frequency)

Frequency table for Ham training data

# Attach the density and occurrence columns to the easy-ham term table.
easyham.df$density <- easyham.density
easyham.df$occurrence <- easyham.occurrence
# Show the terms with the highest occurrence rate first.
head(easyham.df[order(-easyham.df$occurrence), ])

##        term frequency  density occurrence
## 12201 wrote       237 0.003788      0.378
## 1449    can       321 0.005131      0.352
## 6547   list       235 0.003756      0.346
## 11608   use       244 0.003900      0.346
## 4674  group       205 0.003277      0.336
## 7770    one       311 0.004971      0.320

CLASSIFIER FUNCTION

For terms that are NOT in the training sets, we have to provide a conditional probability, otherwise, the Bayes formula would be 0! Hence, c = 1e-6
We are also assuming spams and hams are equally likely as a prior probability (prior=0.5). Although this assumption is questionable.

# Score one email against a training table.
#
# path:        file to classify.
# training.df: data frame with a `term` column and an `occurrence` column.
# prior:       prior probability of the class that training.df represents.
# c:           probability used for each message term missing from training.df.
# Returns prior multiplied by the occurrence of every message term found in
# training.df and by c once for every message term that is not found.
classify.email <- function(path, training.df, prior = 0.5, c = 1e-06) {
    term.counts <- rowSums(as.matrix(get.tdm(get.msg(path))))
    known.terms <- intersect(names(term.counts), training.df$term)
    if (length(known.terms) >= 1) {
        known.probs <- training.df$occurrence[match(known.terms, training.df$term)]
        n.unknown <- length(term.counts) - length(known.terms)
        score <- prior * prod(known.probs) * c^n.unknown
    } else {
        # No term of the message is in the training table.
        score <- prior * c^(length(term.counts))
    }
    score
}

Use Classifier and Spam and Ham Training model

# Hard-ham test files, leaving out the "cmds" helper file.
hardham.docs <- setdiff(dir(hardham.path), "cmds")
# Score every hard-ham email against the spam table and against the ham table.
hardham.spamtest <- sapply(hardham.docs, function(doc) {
    classify.email(file.path(hardham.path, doc), training.df = spam.df)
})
hardham.hamtest <- sapply(hardham.docs, function(doc) {
    classify.email(file.path(hardham.path, doc), training.df = easyham.df)
})
# TRUE when the spam score is not above the ham score, FALSE otherwise.
hardham.res <- !(hardham.spamtest > hardham.hamtest)
summary(hardham.res)

##    Mode   FALSE    TRUE    NA's 
## logical       8     241       0

Develop the Spam Classifier

ADJUSTING THE PRIOR PROBABILITIES WILL IMPROVE RESULTS GREATLY!
In reality, 90%-10% ham-to-spam ratio

# Classify one email as spam or ham.
#
# path: file to classify.
# Returns c(spam score, ham score, class), where class is 1 when the spam
# score is strictly greater than the ham score and 0 otherwise.
spam.classifier <- function(path) {
    # Ham is assumed to be far more common than spam, so spam gets the smaller
    # prior and ham the larger one.
    pr.spam <- classify.email(path, spam.df, prior = 0.2)
    pr.ham <- classify.email(path, easyham.df, prior = 0.8)
    # Both scores are single numbers, so a plain comparison replaces ifelse().
    c(pr.spam, pr.ham, as.numeric(pr.spam > pr.ham))
}

Apply Classifier to ALL Emails

# Test-set file listings, each leaving out the "cmds" helper file.
easyham2.docs <- setdiff(dir(easyham2.path), "cmds")

hardham2.docs <- setdiff(dir(hardham2.path), "cmds")

spam2.docs <- setdiff(dir(spam2.path), "cmds")

# Run the classifier over every file of each test set, with warnings silenced.
# Each result is a list holding one spam.classifier() vector per email.
easyham2.class <- suppressWarnings(lapply(file.path(easyham2.path, easyham2.docs), 
    spam.classifier))
hardham2.class <- suppressWarnings(lapply(file.path(hardham2.path, hardham2.docs), 
    spam.classifier))
spam2.class <- suppressWarnings(lapply(file.path(spam2.path, spam2.docs), 
    spam.classifier))

Create a single, final, data frame

# Stack the per-email result vectors of each test set into one matrix per set.
easyham2.matrix <- do.call(rbind, easyham2.class)
hardham2.matrix <- do.call(rbind, hardham2.class)
spam2.matrix <- do.call(rbind, spam2.class)

# Tag every row with the test set it came from. Binding a string onto the
# numeric matrix turns the whole matrix into character.
easyham2.final <- cbind(easyham2.matrix, "EASYHAM")
hardham2.final <- cbind(hardham2.matrix, "HARDHAM")
spam2.final <- cbind(spam2.matrix, "SPAM")

# One data frame for all test emails.
class.matrix <- rbind(easyham2.final, hardham2.final, spam2.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
colnames(class.df) <- c("Pr.SPAM", "Pr.HAM", "Class", "Type")
# Convert the character columns back to their intended types.
class.df <- transform(class.df, Pr.SPAM = as.numeric(Pr.SPAM), Pr.HAM = as.numeric(Pr.HAM), 
    Class = as.logical(as.numeric(Class)), Type = factor(Type))

Results

The results are not great…
Spam filtering works only slightly better than 50%
Perhaps did something wrong. It's very different from the book.

# Summarise a vector of TRUE/FALSE labels.
#
# bool.vector: vector of labels.
# Returns c(share equal to FALSE, share equal to TRUE). Both shares are taken
# over the full length of bool.vector, so NA entries count towards neither.
get.results <- function(bool.vector) {
    n.total <- length(bool.vector)
    n.false <- sum(bool.vector == FALSE, na.rm = TRUE)
    n.true <- sum(bool.vector == TRUE, na.rm = TRUE)
    c(n.false/n.total, n.true/n.total)
}

# Share of emails labelled not-spam / spam within each test set.
easyham2.col <- get.results(class.df$Class[which(class.df$Type == "EASYHAM")])
hardham2.col <- get.results(class.df$Class[which(class.df$Type == "HARDHAM")])
spam2.col <- get.results(class.df$Class[which(class.df$Type == "SPAM")])

# One row per test set; rbind() takes the row names from the variable names.
class.res <- rbind(easyham2.col, hardham2.col, spam2.col)
dimnames(class.res)[[2]] <- c("NOT SPAM", "SPAM")
print(class.res)

##              NOT SPAM    SPAM
## easyham2.col   0.9871 0.01286
## hardham2.col   0.9637 0.03629
## spam2.col      0.4538 0.54617

Plotting the Results

# Scatter every test email by its log ham score (x) against its log spam
# score (y), with one point shape per email type. The y = x reference line is
# drawn with geom_abline(), since stat_abline() is not available in current
# ggplot2 releases; emails above the line have a higher spam than ham score.
ggplot(class.df, aes(x = log(Pr.HAM), y = log(Pr.SPAM))) + 
    geom_point(aes(shape = Type, alpha = 0.5)) + 
    geom_abline(intercept = 0, slope = 1) + 
    scale_shape_manual(values = c(EASYHAM = 1, HARDHAM = 2, SPAM = 3), name = "Email Type") + 
    scale_alpha(guide = "none") + 
    xlab("log[Pr(HAM)]") + ylab("log[Pr(SPAM)]") + 
    theme_bw() + 
    theme(axis.text.x = element_blank(), axis.text.y = element_blank())

plot of chunk unnamed-chunk-16