# This code is from Chapter 3 of the book "Machine Learning for Hackers".
# library(tm) is used for text mining.
# Load libraries
library("tm")
library("ggplot2")
# Set the global paths.
# Every corpus lives under the same data directory, so the shared root is
# spelled out once; edit `data.path` to relocate the whole data set.
data.path <- file.path("~", "Dropbox", "Data Science", "ML_for_Hackers",
  "03-Classification", "data")
spam.path <- file.path(data.path, "spam")
spam2.path <- file.path(data.path, "spam_2")
easyham.path <- file.path(data.path, "easy_ham")
easyham2.path <- file.path(data.path, "easy_ham_2")
hardham.path <- file.path(data.path, "hard_ham")
hardham2.path <- file.path(data.path, "hard_ham_2")
# The "rt" argument means "read as text".
# readLines() returns each line as a separate element of a character vector.
# which(text == "")[1] is the index of the first empty line in the file.
#
# Read one message file and return everything after its first empty line.
#
# Args:
#   path: path of the file to read (opened as text with latin1 encoding).
# Returns:
#   A single string: the lines following the first empty line, joined with
#   "\n". Returns "" when the file has no empty line or when the first empty
#   line is the last line of the file.
get.msg <- function(path) {
  con <- file(path, open = "rt", encoding = "latin1")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)
  # NA when the file contains no empty line at all.
  first.blank <- which(text == "")[1]
  if (is.na(first.blank) || first.blank >= length(text)) {
    return("")
  }
  paste(text[seq(first.blank + 1, length(text))], collapse = "\n")
}
# dir() gets all file names in the directory.
# Apply over the file names with an anonymous function so that the "value"
# in each "cell" (the file name) can be used to build the full path.
spam.docs <- dir(spam.path)
# Leave out the directory entry named "cmds".
spam.docs <- spam.docs[spam.docs != "cmds"]
# vapply() always returns a character vector (named by file name), even when
# the directory is empty.
all.spam <- vapply(
  spam.docs,
  function(p) get.msg(file.path(spam.path, p)),
  character(1)
)
# The tm package is very handy for text distilling.
#
# Build a term-document matrix from a character vector of documents.
#
# Args:
#   doc.vec: character vector; each element becomes one document.
# Returns:
#   The object produced by tm's TermDocumentMatrix() for the corpus. Callers
#   in this script treat rows as terms and columns as documents.
get.tdm <- function(doc.vec) {
  # NOTE(review): `minDocFreq` may not be recognised by current tm releases
  # - confirm against the installed version before relying on it.
  tdm.control <- list(stopwords = TRUE, removePunctuation = TRUE,
    removeNumbers = TRUE, minDocFreq = 2)
  # Construct Corpus using vector of emails
  doc.corpus <- Corpus(VectorSource(doc.vec))
  TermDocumentMatrix(doc.corpus, tdm.control)
}
spam.tdm <- get.tdm(all.spam)
spam.matrix <- as.matrix(spam.tdm)
# Total count of each term over all spam messages.
spam.counts <- rowSums(spam.matrix)
# Build the table directly from typed columns instead of going through a
# character matrix and converting the counts back to numbers.
spam.df <- data.frame(
  term = names(spam.counts),
  frequency = as.numeric(spam.counts),
  stringsAsFactors = FALSE
)
## % of emails in which a given term occurs GIVEN THAT IT'S SPAM (CONDITIONAL
## PROBABILITY)
# Vectorised: number of documents with a positive count, per term, divided by
# the number of documents. unname() keeps the result a plain numeric vector.
spam.occurrence <- unname(rowSums(spam.matrix > 0, na.rm = TRUE)) /
  ncol(spam.matrix)
## Frequency of the term within ENTIRE CORPUS (PRIOR PROBABILITY)
spam.density <- spam.df$frequency / sum(spam.df$frequency)
# Use transform() to add the density (prior probability) and the occurrence
# rate (conditional probability) to spam.df.
# Attach the two per-term statistics as new columns of spam.df.
spam.df$density <- spam.density
spam.df$occurrence <- spam.occurrence
# Preview the terms that occur in the largest share of spam messages.
by.occurrence <- order(-spam.df$occurrence)
head(spam.df[by.occurrence, ])
## term frequency density occurrence
## 5701 email 692 0.006356 0.496
## 13917 please 360 0.003306 0.452
## 10933 list 364 0.003343 0.396
## 2109 body 341 0.003132 0.388
## 20144 will 696 0.006392 0.380
## 8585 html 372 0.003417 0.372
easyham.docs <- dir(easyham.path)
# Leave out the directory entry named "cmds".
easyham.docs <- easyham.docs[easyham.docs != "cmds"]
# Use at most as many easy-ham messages as there are spam messages.
# head() never produces NA entries when fewer files are available and
# returns an empty vector when spam.docs is empty.
all.easyham <- vapply(
  head(easyham.docs, length(spam.docs)),
  function(p) get.msg(file.path(easyham.path, p)),
  character(1)
)
easyham.tdm <- get.tdm(all.easyham)
easyham.matrix <- as.matrix(easyham.tdm)
# Total count of each term over all easy-ham messages.
easyham.counts <- rowSums(easyham.matrix)
# Build the table directly from typed columns instead of going through a
# character matrix and converting the counts back to numbers.
easyham.df <- data.frame(
  term = names(easyham.counts),
  frequency = as.numeric(easyham.counts),
  stringsAsFactors = FALSE
)
# CONDITIONAL PROBABILITY
# Share of easy-ham documents in which each term has a positive count.
easyham.occurrence <- unname(rowSums(easyham.matrix > 0, na.rm = TRUE)) /
  ncol(easyham.matrix)
# PRIOR PROBABILITY
# Share of all term occurrences in the corpus that belong to each term.
easyham.density <- easyham.df$frequency / sum(easyham.df$frequency)
easyham.df <- transform(easyham.df, density = easyham.density,
  occurrence = easyham.occurrence)
# Preview the terms that occur in the largest share of easy-ham messages.
head(easyham.df[with(easyham.df, order(-occurrence)), ])
## term frequency density occurrence
## 12201 wrote 237 0.003788 0.378
## 1449 can 321 0.005131 0.352
## 6547 list 235 0.003756 0.346
## 11608 use 244 0.003900 0.346
## 4674 group 205 0.003277 0.336
## 7770 one 311 0.004971 0.320
# c = 1e-6: the probability used for every term of the message that is not
# found in the training data.
# prior = 0.5: the default prior for the class, although this assumption is
# questionable.
#
# Score one message file against a training table.
#
# Args:
#   path: path of the message file (read with get.msg()).
#   training.df: data frame with a `term` column and an `occurrence` column.
#   prior: prior probability multiplied into the score.
#   c: probability used for each message term absent from training.df.
# Returns:
#   prior * product of the occurrence rates of the matched terms *
#   c ^ (number of message terms without a match).
classify.email <- function(path, training.df, prior = 0.5, c = 1e-06) {
  msg <- get.msg(path)
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))
  # Terms of this message that also appear in the training table.
  msg.match <- intersect(names(msg.freq), training.df$term)
  match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
  # prod() of an empty vector is 1, so a message with no matched term scores
  # prior * c^(number of terms) without needing a separate branch.
  # NOTE(review): this is a plain product of small numbers and can underflow
  # to 0 for long messages; summing logs would avoid that but would change
  # the returned scale.
  prior * prod(match.probs) * c^(length(msg.freq) - length(msg.match))
}
hardham.docs <- dir(hardham.path)
# Leave out the directory entry named "cmds".
hardham.docs <- hardham.docs[hardham.docs != "cmds"]
# Score every hard-ham message against the spam table and the easy-ham table.
# vapply() guarantees one numeric score per file, even for an empty directory.
hardham.spamtest <- vapply(
  hardham.docs,
  function(p) classify.email(file.path(hardham.path, p), training.df = spam.df),
  numeric(1)
)
hardham.hamtest <- vapply(
  hardham.docs,
  function(p) classify.email(file.path(hardham.path, p), training.df = easyham.df),
  numeric(1)
)
# TRUE where the spam score does not exceed the ham score.
hardham.res <- hardham.spamtest <= hardham.hamtest
summary(hardham.res)
## Mode FALSE TRUE NA's
## logical 8 241 0
# Score one message against both training tables and pick a class.
#
# Args:
#   path: path of the message file.
#   spam.prior: prior passed to classify.email() for the spam table.
#   ham.prior: prior passed to classify.email() for the easy-ham table.
# Returns:
#   Numeric vector c(spam score, ham score, flag), where flag is 1 when the
#   spam score is strictly greater than the ham score and 0 otherwise.
#
# NOTE(review): the defaults weight spam (0.8) above ham (0.2); confirm this
# is the intended direction.
spam.classifier <- function(path, spam.prior = 0.8, ham.prior = 0.2) {
  pr.spam <- classify.email(path, spam.df, prior = spam.prior)
  pr.ham <- classify.email(path, easyham.df, prior = ham.prior)
  c(pr.spam, pr.ham, as.numeric(pr.spam > pr.ham))
}
# List the files of each second-set directory, leaving out the entry named
# "cmds".
easyham2.docs <- setdiff(dir(easyham2.path), "cmds")
hardham2.docs <- setdiff(dir(hardham2.path), "cmds")
spam2.docs <- setdiff(dir(spam2.path), "cmds")

# Run the classifier on every file; each list element is the numeric vector
# returned by spam.classifier(). Warnings raised while classifying are
# silenced.
easyham2.class <- suppressWarnings(
  lapply(file.path(easyham2.path, easyham2.docs), spam.classifier)
)
hardham2.class <- suppressWarnings(
  lapply(file.path(hardham2.path, hardham2.docs), spam.classifier)
)
spam2.class <- suppressWarnings(
  lapply(file.path(spam2.path, spam2.docs), spam.classifier)
)
# Stack the per-message result vectors into one numeric matrix per data set
# (columns: spam score, ham score, 0/1 flag).
easyham2.matrix <- do.call(rbind, easyham2.class)
hardham2.matrix <- do.call(rbind, hardham2.class)
spam2.matrix <- do.call(rbind, spam2.class)
# Labelled copies; appending a string turns these into character matrices.
easyham2.final <- cbind(easyham2.matrix, "EASYHAM")
hardham2.final <- cbind(hardham2.matrix, "HARDHAM")
spam2.final <- cbind(spam2.matrix, "SPAM")
class.matrix <- rbind(easyham2.final, hardham2.final, spam2.final)
# Take the scores from the numeric matrices rather than parsing them back out
# of the character matrix, so no precision is lost in the text round trip.
score.matrix <- rbind(easyham2.matrix, hardham2.matrix, spam2.matrix)
class.df <- data.frame(
  Pr.SPAM = score.matrix[, 1],
  Pr.HAM = score.matrix[, 2],
  Class = as.logical(score.matrix[, 3]),
  Type = as.factor(class.matrix[, 4]),
  stringsAsFactors = FALSE
)
# Share of FALSE and of TRUE values in a logical vector.
#
# Args:
#   bool.vector: logical vector of classification flags.
# Returns:
#   Numeric vector of length 2: c(share of FALSE, share of TRUE). Both shares
#   are taken over the full length of the input, so NA entries count towards
#   the denominator only.
get.results <- function(bool.vector) {
  total <- length(bool.vector)
  n.false <- sum(bool.vector == FALSE, na.rm = TRUE)
  n.true <- sum(bool.vector == TRUE, na.rm = TRUE)
  c(n.false / total, n.true / total)
}
# Share of messages flagged not-spam / spam within each email type.
# which() drops rows whose comparison is NA, as subset() does.
easyham2.col <- get.results(class.df$Class[which(class.df$Type == "EASYHAM")])
hardham2.col <- get.results(class.df$Class[which(class.df$Type == "HARDHAM")])
spam2.col <- get.results(class.df$Class[which(class.df$Type == "SPAM")])
# One row per email type; the row labels come from the variable names.
class.res <- rbind(easyham2.col, hardham2.col, spam2.col)
colnames(class.res) <- c("NOT SPAM", "SPAM")
print(class.res)
## NOT SPAM SPAM
## easyham2.col 0.9871 0.01286
## hardham2.col 0.9637 0.03629
## spam2.col 0.4538 0.54617
# Scatter plot of the log ham score (x) against the log spam score (y), one
# point per classified message, with the point shape showing the email type.
ggplot(class.df, aes(x = log(Pr.HAM), y = log(Pr.SPAM))) +
  # alpha is a constant, so it is set outside aes(); it is then applied as
  # given and needs no scale or legend.
  geom_point(aes(shape = Type), alpha = 0.5) +
  # Line y = x: points above it have a spam score greater than the ham score.
  # geom_abline() replaces stat_abline(), which is no longer part of ggplot2.
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(
    values = c(EASYHAM = 1, HARDHAM = 2, SPAM = 3),
    name = "Email Type"
  ) +
  xlab("log[Pr(HAM)]") +
  ylab("log[Pr(SPAM)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())