This reprises “R-Bloggers” contributor Dennis Lee’s example on building a spam classifier using RTextTools and compares different model outputs. The exercise largely recreates Dennis’s process in order to study the algorithm and compare it with a similar test in “Automated Data Collection” pg. 310-313. The container object from that exercise partitions its document matrix by organizational labels whereas this exercise is binary, either spam or ham.
It was unclear from Lee’s description which benchmark statistics were used for comparison with the output from the models.
Lee comments that this is a largely unexplored area, as there are NOT many example models built using the RTextTools package; thus he “explores the feasibility of building a model used to classify large text, i.e. raw text without ANY features.”
http://www.r-bloggers.com/classifying-emails-as-spam-or-ham-using-rtexttools/
# Read one raw e-mail file and return its body as a single string.
#
# The body is every line after the first empty line (the separator between
# the headers and the message text); those lines are joined with "\n".
#
# Args:
#   path.dir: path of the message file. It is opened as latin1 text.
#
# Returns:
#   A length-one character vector with the body. When the first empty line
#   is also the last line of the file, the body is the empty string "".
#
# Raises:
#   An error when the file contains no empty line (including an empty
#   file), because the start of the body cannot be located.
get.msg <- function(path.dir)
{
  con <- file(path.dir, open="rt", encoding="latin1")
  # Release the connection on every exit path, including a failed read.
  on.exit(close(con), add=TRUE)
  text <- readLines(con)
  sep.idx <- which(text == "")[1]
  if (is.na(sep.idx))
  {
    stop("no blank header/body separator line found in ", path.dir,
         call.=FALSE)
  }
  # A logical mask (instead of seq(sep.idx + 1, length(text), 1)) stays
  # valid when the separator is the final line: it simply selects nothing.
  msg <- text[seq_along(text) > sep.idx]
  return(paste(msg, collapse="\n"))
}
# Fault-tolerant variant of get.msg(): read one raw e-mail file and return
# its body, but return a sentinel string instead of failing when the body
# cannot be extracted.
#
# Args:
#   path.dir: path of the message file. It is opened as latin1 text.
#
# Returns:
#   A length-one character vector:
#     * the lines after the first empty line, joined with "\n";
#     * "9999" when extracting those lines raises an error (for example the
#       file has no empty line, or the empty line is the last line);
#     * "Error" when the first body line starts with the text "Error".
#
# Unlike earlier revisions this function no longer sets options(warn=-1):
# that switched warnings off for the whole session and was never undone.
#
# NOTE(review): the "Error" check below looks like it was meant to detect
# the failure sentinel, but the handler yields 9999, so failures come back
# as "9999". The observable values are kept as they were; confirm with the
# callers which sentinel they actually filter on.
get.msg.try <- function(path.dir)
{
  con <- file(path.dir, open="rt", encoding="latin1")
  # Release the connection on every exit path, including a failed read.
  on.exit(close(con), add=TRUE)
  text <- readLines(con)
  msg <- tryCatch( text[seq(which(text=="")[1]+1,length(text),1)],
                   error=function(e) { 9999 } )
  # Test only the first line: `if` needs a length-one condition, and a
  # multi-line body would otherwise be an error under R >= 4.2.
  if( substr(msg[1], 1, 5)=="Error" )
  {
    return("Error")
  }
  else
  {
    return(paste(msg, collapse="\n"))
  }
}
# Read the body of every matching message file in a directory.
#
# Args:
#   path.dir:      directory to scan. File paths are built with
#                  paste0(path.dir, name), so it must end with a path
#                  separator.
#   filter.string: pattern handed to dir() to select file names.
#
# Returns (invisibly):
#   The result of sapply() over the selected file names, calling get.msg()
#   on each; a file named exactly "cmds" is skipped.
get.all <- function(path.dir,filter.string)
{
  candidates <- dir(path.dir, filter.string)
  wanted <- candidates[candidates != "cmds"]
  read.one <- function(p)
  {
    get.msg(paste0(path.dir, p))
  }
  invisible(sapply(wanted, read.one))
}
# Fault-tolerant counterpart of get.all(): read the body of every matching
# message file in a directory using get.msg.try().
#
# Args:
#   path.dir:      directory to scan. File paths are built with
#                  paste0(path.dir, name), so it must end with a path
#                  separator.
#   filter.string: pattern handed to dir() to select file names.
#
# Returns (invisibly):
#   The result of sapply() over the selected file names, calling
#   get.msg.try() on each; a file named exactly "cmds" is skipped.
get.all.try <- function(path.dir,filter.string)
{
  candidates <- dir(path.dir, filter.string)
  wanted <- candidates[candidates != "cmds"]
  read.one <- function(p)
  {
    get.msg.try(paste0(path.dir, p))
  }
  invisible(sapply(wanted, read.one))
}
# Label every corpus for training/evaluation (2 = ham, 4 = spam) and give
# each frame the same two column names so the frames can be row-bound.
easy_ham.dfr$outcome <- 2
names(easy_ham.dfr) <- c("text", "outcome")

easy_ham_2.dfr$outcome <- 2
names(easy_ham_2.dfr) <- c("text", "outcome")

spam.dfr$outcome <- 4
names(spam.dfr) <- c("text", "outcome")

spam_2.dfr$outcome <- 4
names(spam_2.dfr) <- c("text", "outcome")

# The first ham + spam corpora come first; their row count is remembered in
# train.num so that later code can split the combined frame at that row.
train.data <- rbind(easy_ham.dfr, spam.dfr)
train.num <- nrow(train.data)

# The second ham + spam corpora are appended after row train.num.
train.data <- rbind(train.data, easy_ham_2.dfr, spam_2.dfr)
#set.seed(100)

# Separate the labels from the raw text.
train_out.data <- train.data[["outcome"]]
train_txt.data <- train.data[["text"]]
# Build the document-term matrix from the raw message text with TF-IDF
# weighting. Numbers and punctuation are removed, whitespace is stripped,
# words are not stemmed, and minWordLength is set to 3.
# The object is called `matrix` (the same name as the base R function)
# because the container is later built from a variable with this name.
matrix <- create_matrix(
  train_txt.data,
  language = "english",
  weighting = tm::weightTfIdf,
  minWordLength = 3,
  stemWords = FALSE,
  removeNumbers = TRUE,
  removePunctuation = TRUE,
  stripWhitespace = TRUE
)

# Outcome codes, transposed with t() and passed through unlist().
spam_labels <- unlist(t(train_out.data))
# Create container
# Rows 1..train.num are used for training; the rows after train.num, up to
# the last row of train.data, are used for testing.
# virgin = FALSE: presumably tells RTextTools that the test rows carry
# known labels - confirm against the create_container documentation.
container <- create_container(
  matrix,
  t(train_out.data),
  trainSize = seq(1, train.num),
  testSize = seq(train.num + 1, nrow(train.data)),
  virgin = FALSE
)

# Show the names of the container's slots as a one-column table.
slotnames <- setNames(as.data.frame(slotNames(container)), "SLOT NAMES")
kable(slotnames)
training_matrix
classification_matrix
training_codes
testing_codes
column_names
virgin
# Maximum entropy model (a probability-distribution based classifier),
# fitted on the container's training rows.
maxent.model <- train_model(container, algorithm = "MAXENT")

# Support vector machine (a non-probabilistic binary linear classifier),
# fitted on the same container.
svm.model <- train_model(container, algorithm = "SVM")
The Maximum Entropy model shows a higher probability of identifying spam than the Support Vector Machine model.
# Classify the container's documents with the fitted SVM and preview the
# first rows of the result (predicted label and its probability).
svm.result <- classify_model(container, model = svm.model)
kable(head(svm.result), align = "l")
SVM_LABEL | SVM_PROB |
---|---|
2 | 0.9492005 |
2 | 0.9025327 |
2 | 0.6902503 |
2 | 0.9326576 |
2 | 0.9607400 |
2 | 0.9188272 |
# Score the SVM predictions and pull out the per-document summary.
svm.analytic <- create_analytics(container, svm.result)
svm.doc <- slot(svm.analytic, "document_summary")

# Split the summary by the manually assigned code: 4 = spam, 2 = ham.
svm_spam.doc <- svm.doc[svm.doc[["MANUAL_CODE"]] == 4, ]
svm_ham.doc <- svm.doc[svm.doc[["MANUAL_CODE"]] == 2, ]

# Spam is the positive class: share of true spam predicted as spam (true
# positive) or as ham (false negative).
svm.true.pos <-
  nrow(svm_spam.doc[svm_spam.doc[["CONSENSUS_CODE"]] == 4, ]) / nrow(svm_spam.doc)
svm.false.neg <-
  nrow(svm_spam.doc[svm_spam.doc[["CONSENSUS_CODE"]] == 2, ]) / nrow(svm_spam.doc)

# Ham is the negative class: share of true ham predicted as ham (true
# negative) or as spam (false positive).
svm.true.neg <-
  nrow(svm_ham.doc[svm_ham.doc[["CONSENSUS_CODE"]] == 2, ]) / nrow(svm_ham.doc)
svm.false.pos <-
  nrow(svm_ham.doc[svm_ham.doc[["CONSENSUS_CODE"]] == 4, ]) / nrow(svm_ham.doc)

# Start from an empty three-column frame and fill one row per class.
# c() mixes a label with numbers, so the rates are stored as text.
output <- data.frame(SVM_CLASS = NA, "TRUE" = NA, "FALSE" = NA)[0, ]
output[1, ] <- c("SPAM (POS)", svm.true.pos, svm.false.neg)
output[2, ] <- c("HAM (NEG)", svm.true.neg, svm.false.pos)
kable(output, align = "l")
SVM_CLASS | TRUE. | FALSE. |
---|---|---|
SPAM (POS) | 0.98 | 0.02 |
HAM (NEG) | 0.99 | 0.01 |
# Classify the container's documents with the fitted maximum entropy model
# and preview the first rows of the result (predicted label and its
# probability).
maxent.result <- classify_model(container, model = maxent.model)
kable(head(maxent.result), align = "l")
MAXENTROPY_LABEL | MAXENTROPY_PROB |
---|---|
2 | 0.9900302 |
2 | 0.9791415 |
2 | 0.9484637 |
2 | 0.9874170 |
2 | 0.9902596 |
2 | 0.9764587 |
# Score the maximum entropy predictions and pull out the per-document
# summary.
maxent.analytic <- create_analytics(container, maxent.result)
maxent.doc <- slot(maxent.analytic, "document_summary")

# Split the summary by the manually assigned code: 4 = spam, 2 = ham.
maxent_spam.doc <- maxent.doc[maxent.doc[["MANUAL_CODE"]] == 4, ]
maxent_ham.doc <- maxent.doc[maxent.doc[["MANUAL_CODE"]] == 2, ]

# Spam is the positive class: share of true spam predicted as spam (true
# positive) or as ham (false negative).
maxent.true.pos <-
  nrow(maxent_spam.doc[maxent_spam.doc[["CONSENSUS_CODE"]] == 4, ]) / nrow(maxent_spam.doc)
maxent.false.neg <-
  nrow(maxent_spam.doc[maxent_spam.doc[["CONSENSUS_CODE"]] == 2, ]) / nrow(maxent_spam.doc)

# Ham is the negative class: share of true ham predicted as ham (true
# negative) or as spam (false positive).
maxent.true.neg <-
  nrow(maxent_ham.doc[maxent_ham.doc[["CONSENSUS_CODE"]] == 2, ]) / nrow(maxent_ham.doc)
maxent.false.pos <-
  nrow(maxent_ham.doc[maxent_ham.doc[["CONSENSUS_CODE"]] == 4, ]) / nrow(maxent_ham.doc)

# Start from an empty three-column frame and fill one row per class.
# c() mixes a label with numbers, so the rates are stored as text.
output <- data.frame(MAXENT_CLASS = NA, "TRUE" = NA, "FALSE" = NA)[0, ]
output[1, ] <- c("SPAM (POS)", maxent.true.pos, maxent.false.neg)
output[2, ] <- c("HAM (NEG)", maxent.true.neg, maxent.false.pos)
kable(output, align = "l")
MAXENT_CLASS | TRUE. | FALSE. |
---|---|---|
SPAM (POS) | 0.91 | 0.09 |
HAM (NEG) | 1 | 0 |