Spam Filter using Naive Bayes Classifier

This is based on Chapter 3 of the book Machine Learning for Hackers.

Create Paths for reading text first

# Load libraries
library("tm")
library("ggplot2")

# Set the global paths
# Every corpus directory sits under the same data directory, so that directory
# is built once here and the location only has to be edited in one place.
data.path <- file.path("~", "Dropbox", "Data Science", "ML_for_Hackers", "03-Classification", 
    "data")
spam.path <- file.path(data.path, "spam")
spam2.path <- file.path(data.path, "spam_2")
easyham.path <- file.path(data.path, "easy_ham")
easyham2.path <- file.path(data.path, "easy_ham_2")
hardham.path <- file.path(data.path, "hard_ham")
hardham2.path <- file.path(data.path, "hard_ham_2")

Write a Function to Read and Store the Email Content

The function below returns the body of an email: every line after the first blank line, joined into a single string.

# Read one email file and return its body as a single string.
#
# The body is every line after the first empty line of the file; the lines are
# joined with "\n". If the file has no empty line, or its first empty line is
# also its last line, the empty string "" is returned.
#
# path: path of the email file; it is opened as latin1 text.
get.msg <- function(path) {
    con <- file(path, open = "rt", encoding = "latin1")
    # Release the connection even when readLines() stops with an error.
    on.exit(close(con), add = TRUE)
    text <- readLines(con)
    # Position of the first empty line; NA when the file has none.
    first.blank <- which(text == "")[1]
    if (!is.na(first.blank) && first.blank < length(text)) {
        msg <- text[seq(first.blank + 1, length(text), 1)]
    } else {
        msg <- ""
    }
    return(paste(msg, collapse = "\n"))
}

Use Function to Store Contents from Spam Emails

# List the files of the spam corpus, skipping the entry named "cmds" so that it
# is not parsed as a message.
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[spam.docs != "cmds"]
# One message body per file, named by file name. vapply() always yields a
# character vector, whereas sapply() would return an empty list when the
# directory contains no files.
all.spam <- vapply(spam.docs, function(p) get.msg(file.path(spam.path, p)), 
    character(1))

Term Document Matrix (TDM)

# Build a term-document matrix from a vector of documents.
#
# doc.vec: vector of documents, one document per element.
# Returns the result of TermDocumentMatrix() for a corpus made from doc.vec,
# called with the control options listed in the body.
get.tdm <- function(doc.vec) {
    # Wrap the raw documents in a corpus
    email.corpus <- Corpus(VectorSource(doc.vec))
    # Options passed to TermDocumentMatrix() as its control list
    # NOTE(review): whether minDocFreq is honoured may depend on the installed
    # tm version - confirm.
    tdm.options <- list(
        stopwords = TRUE,
        removePunctuation = TRUE,
        removeNumbers = TRUE,
        minDocFreq = 2
    )
    TermDocumentMatrix(email.corpus, tdm.options)
}
# Term-document matrix built from the spam message bodies
spam.tdm <- get.tdm(all.spam)

Use TDM to build Training Data for Spam

# Plain matrix form of the TDM; rows are treated as terms and columns as emails
spam.matrix <- as.matrix(spam.tdm)
# Total count of each term over all spam emails
spam.counts <- rowSums(spam.matrix)
# Build the table from typed columns directly, rather than binding everything
# into a character matrix and converting the counts back to numbers afterwards.
spam.df <- data.frame(term = names(spam.counts), frequency = as.numeric(spam.counts), 
    stringsAsFactors = FALSE)
## Fraction of spam emails in which each term occurs at least once, i.e. how
## likely the term is to appear GIVEN THAT THE EMAIL IS SPAM (CONDITIONAL
## PROBABILITY). rowSums() over the logical matrix replaces a per-row loop over
## 1:nrow(), which fails when the matrix has no rows; unname() keeps the result
## an unnamed vector.
spam.occurrence <- unname(rowSums(spam.matrix > 0))/ncol(spam.matrix)
## Share of all term occurrences in the spam corpus that belongs to each term
spam.density <- spam.df$frequency/sum(spam.df$frequency)

Add density and occurrence rate to the Spam data frame

Use transform() to add density (PRIOR PROB) and occurrence rate (CONDITIONAL PROB) to spam.df

# Attach both per-term rates to the spam training table
spam.df <- transform(
    spam.df,
    density = spam.density,
    occurrence = spam.occurrence
)
# Show the six terms with the highest occurrence rate
head(spam.df[order(-spam.df$occurrence), ])
##         term frequency  density occurrence
## 5701   email       692 0.006356      0.496
## 13917 please       360 0.003306      0.452
## 10933   list       364 0.003343      0.396
## 2109    body       341 0.003132      0.388
## 20144   will       696 0.006392      0.380
## 8585    html       372 0.003417      0.372

Now Build TDM for Ham emails

# List the files of the easy-ham corpus, skipping the entry named "cmds"
easyham.docs <- dir(easyham.path)
easyham.docs <- easyham.docs[easyham.docs != "cmds"]
# Use at most as many ham emails as there are spam emails. head() stops at the
# number of files available, whereas indexing with 1:length(spam.docs) produces
# NA file names when there are fewer ham than spam files, and selects one file
# when there are no spam files at all.
all.easyham <- vapply(head(easyham.docs, length(spam.docs)), function(p) get.msg(file.path(easyham.path, 
    p)), character(1))

# Term-document matrix built from the selected ham message bodies
easyham.tdm <- get.tdm(all.easyham)

As with Spam, So with Ham!

# Plain matrix form of the TDM; rows are treated as terms and columns as emails
easyham.matrix <- as.matrix(easyham.tdm)
# Total count of each term over all ham emails
easyham.counts <- rowSums(easyham.matrix)
# Build the table from typed columns directly, rather than binding everything
# into a character matrix and converting the counts back to numbers afterwards.
easyham.df <- data.frame(term = names(easyham.counts), frequency = as.numeric(easyham.counts), 
    stringsAsFactors = FALSE)
# CONDITIONAL PROBABILITY: fraction of ham emails in which each term occurs at
# least once. rowSums() over the logical matrix replaces a per-row loop over
# 1:nrow(), which fails when the matrix has no rows; unname() keeps the result
# an unnamed vector.
easyham.occurrence <- unname(rowSums(easyham.matrix > 0))/ncol(easyham.matrix)
# Share of all term occurrences in the ham corpus that belongs to each term
easyham.density <- easyham.df$frequency/sum(easyham.df$frequency)

Frequency table for Ham training data

# Attach both per-term rates to the ham training table
easyham.df <- transform(
    easyham.df,
    density = easyham.density,
    occurrence = easyham.occurrence
)
# Show the six terms with the highest occurrence rate
head(easyham.df[order(-easyham.df$occurrence), ])
##        term frequency  density occurrence
## 12201 wrote       237 0.003788      0.378
## 1449    can       321 0.005131      0.352
## 6547   list       235 0.003756      0.346
## 11608   use       244 0.003900      0.346
## 4674  group       205 0.003277      0.336
## 7770    one       311 0.004971      0.320

CLASSIFIER FUNCTION

# Score one email against a training table.
#
# path:        path of the email file to score.
# training.df: data frame with a `term` column and an `occurrence` column.
# prior:       factor the score is multiplied by.
# c:           value used in place of the occurrence rate for every term of the
#              email that is absent from training.df.
#
# Returns prior times the product of the occurrence rates of the email's terms
# found in training.df, times c raised to the number of terms not found.
classify.email <- function(path, training.df, prior = 0.5, c = 1e-06) {
    # Term counts of this single email, named by term
    term.counts <- rowSums(as.matrix(get.tdm(get.msg(path))))
    known.terms <- intersect(names(term.counts), training.df$term)
    # No term of the email is in the training table: every term contributes c
    if (length(known.terms) < 1) {
        return(prior * c^(length(term.counts)))
    }
    known.probs <- training.df$occurrence[match(known.terms, training.df$term)]
    n.unknown <- length(term.counts) - length(known.terms)
    prior * prod(known.probs) * c^n.unknown
}

Use the Classifier with the Spam and Ham Training Models

# List the files of the hard-ham corpus, skipping the entry named "cmds"
hardham.docs <- dir(hardham.path)
hardham.docs <- hardham.docs[hardham.docs != "cmds"]
# Score every hard-ham email against each training table. vapply() always
# yields a numeric vector, whereas sapply() would return an empty list when the
# directory contains no files.
hardham.spamtest <- vapply(hardham.docs, function(p) classify.email(file.path(hardham.path, 
    p), training.df = spam.df), numeric(1))
hardham.hamtest <- vapply(hardham.docs, function(p) classify.email(file.path(hardham.path, 
    p), training.df = easyham.df), numeric(1))
# TRUE when the email is NOT scored higher as spam than as ham
hardham.res <- !(hardham.spamtest > hardham.hamtest)
summary(hardham.res)
##    Mode   FALSE    TRUE    NA's 
## logical       8     241       0

Develop the Spam Classifier

# Score one email against both training tables and decide between them.
#
# path: path of the email file.
# Returns a numeric vector holding the spam score (prior 0.8), the ham score
# (prior 0.2), and 1 if the spam score is strictly greater than the ham score,
# otherwise 0.
spam.classifier <- function(path) {
    spam.score <- classify.email(path, spam.df, prior = 0.8)
    ham.score <- classify.email(path, easyham.df, prior = 0.2)
    is.spam <- as.numeric(spam.score > ham.score)
    c(spam.score, ham.score, is.spam)
}

Apply Classifier to ALL Emails

# File listings of the three corpora to classify, each without its "cmds" entry
easyham2.docs <- Filter(function(f) f != "cmds", dir(easyham2.path))

hardham2.docs <- Filter(function(f) f != "cmds", dir(hardham2.path))

spam2.docs <- Filter(function(f) f != "cmds", dir(spam2.path))

# Run spam.classifier() on every listed file of each corpus; the result is one
# list per corpus with one score vector per email. Warnings raised while
# classifying are suppressed.
easyham2.class <- suppressWarnings(
    lapply(file.path(easyham2.path, easyham2.docs), spam.classifier)
)
hardham2.class <- suppressWarnings(
    lapply(file.path(hardham2.path, hardham2.docs), spam.classifier)
)
spam2.class <- suppressWarnings(
    lapply(file.path(spam2.path, spam2.docs), spam.classifier)
)

Create a single, final, data frame

# Stack the per-email score vectors into one matrix per corpus
easyham2.matrix <- do.call(rbind, easyham2.class)
hardham2.matrix <- do.call(rbind, hardham2.class)
spam2.matrix <- do.call(rbind, spam2.class)

# Append the corpus label as an extra column; binding a string to the scores
# makes each of these a character matrix
easyham2.final <- cbind(easyham2.matrix, "EASYHAM")
hardham2.final <- cbind(hardham2.matrix, "HARDHAM")
spam2.final <- cbind(spam2.matrix, "SPAM")

# Combine the three corpora into a single table
class.matrix <- rbind(easyham2.final, hardham2.final, spam2.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.SPAM", "Pr.HAM", "Class", "Type")

# Every column is character at this point; convert each back to its real type
class.df[c("Pr.SPAM", "Pr.HAM")] <- lapply(class.df[c("Pr.SPAM", "Pr.HAM")], as.numeric)
class.df[["Class"]] <- as.logical(as.numeric(class.df[["Class"]]))
class.df[["Type"]] <- as.factor(class.df[["Type"]])

Results

# Proportions of FALSE and TRUE values in a vector.
#
# bool.vector: vector of logical values.
# Returns c(share of FALSE, share of TRUE). Both shares are taken over the full
# length of bool.vector, so NA entries count towards the denominator only.
get.results <- function(bool.vector) {
    n.false <- sum(bool.vector == FALSE, na.rm = TRUE)
    n.true <- sum(bool.vector == TRUE, na.rm = TRUE)
    c(n.false, n.true)/length(bool.vector)
}

# Share of NOT SPAM / SPAM verdicts within each corpus
easyham2.col <- get.results(with(class.df, Class[which(Type == "EASYHAM")]))
hardham2.col <- get.results(with(class.df, Class[which(Type == "HARDHAM")]))
spam2.col <- get.results(with(class.df, Class[which(Type == "SPAM")]))

# One row per corpus, one column per verdict
class.res <- rbind(easyham2.col, hardham2.col, spam2.col)
colnames(class.res) <- c("NOT SPAM", "SPAM")
print(class.res)
##              NOT SPAM    SPAM
## easyham2.col   0.9871 0.01286
## hardham2.col   0.9637 0.03629
## spam2.col      0.4538 0.54617

Plotting the Results

# Scatter plot of log(Pr.SPAM) against log(Pr.HAM), one point per classified
# email, with the point shape showing the corpus the email came from. The
# reference line y = x is drawn with geom_abline(): stat_abline() was removed
# in ggplot2 2.0.0, so a call to it fails on current releases. Points above the
# line have a higher spam score than ham score.
ggplot(class.df, aes(x = log(Pr.HAM), y = log(Pr.SPAM))) + geom_point(aes(shape = Type, 
    alpha = 0.5)) + geom_abline(intercept = 0, slope = 1) + scale_shape_manual(values = c(EASYHAM = 1, 
    HARDHAM = 2, SPAM = 3), name = "Email Type") + scale_alpha(guide = "none") + 
    xlab("log[Pr(HAM)]") + ylab("log[Pr(SPAM)]") + theme_bw() + theme(axis.text.x = element_blank(), 
    axis.text.y = element_blank())

plot of chunk unnamed-chunk-16