This reprises "R-Bloggers" contributor Dennis Lee's example of building a spam classifier with RTextTools and compares the output of the different models. The exercise largely recreates Lee's process in order to study the algorithms and to compare them with a similar test in "Automated Data Collection", pp. 310-313. The container object in that exercise partitions its document matrix by organizational labels, whereas the labels here are binary: spam or ham.

It was unclear from Lee's description which benchmark statistics were used for comparison with the models' output.

Lee comments that this is a largely unexplored area, as there are not many example models built with the RTextTools package; he therefore "explores the feasibility of building a model used to classify large text, i.e. raw text without any features."

http://www.r-bloggers.com/classifying-emails-as-spam-or-ham-using-rtexttools/

1 Load RTextTools & supporting packages
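
The post does not show the exact library calls; a minimal loading sketch covering the packages used below (RTextTools for the classifiers, tm for the TF-IDF weighting, knitr for the kable tables):

library(RTextTools)
library(tm)
library(knitr)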

2 Functions: collect & append lines from a list of files

# Read one raw email file and return its body: everything after the first
# blank line, which separates the header from the message.
get.msg <- function(path.dir)
{
  con <- file(path.dir, open="rt", encoding="latin1")
  text <- readLines(con)
  msg <- text[seq(which(text=="")[1]+1, length(text), 1)]
  close(con)
  return(paste(msg, collapse="\n"))
}
# As get.msg(), but fault tolerant: files whose body cannot be extracted
# return the string "Error" instead of stopping the run.
get.msg.try <- function(path.dir)
{
  con <- file(path.dir, open="rt", encoding="latin1")
  text <- readLines(con)
  options(warn=-1)   # silence warnings from malformed files
  msg <- tryCatch( text[seq(which(text=="")[1]+1, length(text), 1)],
                      error=function(e) { "Error" }, finally={} )
  close(con)
  if( substr(msg[1], 1, 5)=="Error" )
  {
    return("Error")
  }
  else
  {
    return(paste(msg, collapse="\n"))
  }
}
# Apply get.msg() to every file in a directory (skipping the corpus "cmds"
# file) and return a named character vector of message bodies.
# path.dir is expected to end with "/".
get.all <- function(path.dir, filter.string)
{
  all.file <- dir(path.dir, filter.string)
  all.file <- all.file[which(all.file!="cmds")]
  msg.all <- sapply(all.file, function(p) get.msg(paste0(path.dir, p)))
}
# Same as get.all(), but uses the fault-tolerant get.msg.try().
get.all.try <- function(path.dir, filter.string)
{
  all.file <- dir(path.dir, filter.string)
  all.file <- all.file[which(all.file!="cmds")]
  msg.all <- sapply(all.file, function(p) get.msg.try(paste0(path.dir, p)))
}

3 Create & split data into training/test sets
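
The excerpt never shows where the easy_ham.dfr, spam.dfr, easy_ham_2.dfr and spam_2.dfr data frames come from. A sketch of how they might be built with the helper functions from section 2, assuming the SpamAssassin public corpus has been unpacked into a local data/ directory (the paths, the "." filter pattern and the data.frame construction are assumptions, not Lee's exact code):

# assumed locations of the unpacked SpamAssassin corpus
easy_ham.path   <- "data/easy_ham/"
easy_ham_2.path <- "data/easy_ham_2/"
spam.path       <- "data/spam/"
spam_2.path     <- "data/spam_2/"

# one-column data frames of message bodies; the fault-tolerant readers are
# used for the spam folders, which contain some malformed files
easy_ham.dfr   <- data.frame(text=get.all(easy_ham.path, "."), stringsAsFactors=FALSE)
easy_ham_2.dfr <- data.frame(text=get.all(easy_ham_2.path, "."), stringsAsFactors=FALSE)
spam.dfr       <- data.frame(text=get.all.try(spam.path, "."), stringsAsFactors=FALSE)
spam_2.dfr     <- data.frame(text=get.all.try(spam_2.path, "."), stringsAsFactors=FALSE)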

# label the outcomes for training/evaluation: 2 = ham, 4 = spam
easy_ham.dfr$outcome    <- 2
easy_ham_2.dfr$outcome  <- 2
spam.dfr$outcome        <- 4
spam_2.dfr$outcome      <- 4

names(easy_ham.dfr)   <- c("text", "outcome")
names(easy_ham_2.dfr) <- c("text", "outcome")
names(spam.dfr)       <- c("text", "outcome")
names(spam_2.dfr)     <- c("text", "outcome")

# the first train.num rows are the training set; the appended rows become the test set
train.data  <- rbind(easy_ham.dfr, spam.dfr)
train.num   <- nrow(train.data)
train.data  <- rbind(train.data, easy_ham_2.dfr, spam_2.dfr)

4 Build model

#set.seed(100)
train_out.data <- train.data$outcome
train_txt.data <- train.data$text
matrix <- create_matrix(
    train_txt.data,language="english", 
    minWordLength=3, 
    removeNumbers=TRUE, 
    stemWords=FALSE, 
    removePunctuation=TRUE, 
    stripWhitespace=TRUE,
    weighting=tm::weightTfIdf
)
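
# optional sanity check (not in the original post): size of the
# document-term matrix, rows = documents, columns = terms
dim(matrix)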

spam_labels <- unlist(t(train_out.data))   # outcome codes as a plain vector

# Create container
container <- create_container(
    matrix,
    t(train_out.data),
    trainSize=1:train.num, 
    testSize=(train.num+1):nrow(train.data), 
    virgin=FALSE
)

slotnames <- as.data.frame(slotNames(container))
colnames(slotnames) <- c("SLOT NAMES")
kable(slotnames)

4.1 SLOT NAMES

training_matrix
classification_matrix
training_codes
testing_codes
column_names
virgin

#maximum entropy probability distribution
maxent.model    <- train_model(container, "MAXENT")

#non-probabilistic binary linear classifier
svm.model       <- train_model(container, "SVM")
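
Not used in this write-up, but RTextTools can also train and classify several algorithms in a single batch. A sketch, equivalent to the two train_model calls above:

models.batch  <- train_models(container, algorithms=c("MAXENT", "SVM"))
results.batch <- classify_models(container, models.batch)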

5 Compare model output

The Maximum Entropy model returns higher classification probabilities (confidence scores) than the Support Vector Machine model.

svm.result <- classify_model(container, svm.model)
kable(head(svm.result), align = 'l')
SVM_LABEL SVM_PROB
2 0.9492005
2 0.9025327
2 0.6902503
2 0.9326576
2 0.9607400
2 0.9188272
svm.analytic  <- create_analytics(container, svm.result)
svm.doc       <- svm.analytic@document_summary
svm_spam.doc  <- svm.doc[svm.doc$MANUAL_CODE==4, ]
svm_ham.doc   <- svm.doc[svm.doc$MANUAL_CODE==2, ]

# per-class rates: fraction of true spam classified as spam/ham, and of true ham classified as ham/spam
svm.true.pos  <- nrow(svm_spam.doc[svm_spam.doc$CONSENSUS_CODE==4,]) / nrow(svm_spam.doc)
svm.false.neg <- nrow(svm_spam.doc[svm_spam.doc$CONSENSUS_CODE==2,]) / nrow(svm_spam.doc)

svm.true.neg  <- nrow(svm_ham.doc[svm_ham.doc$CONSENSUS_CODE==2,]) / nrow(svm_ham.doc)
svm.false.pos <- nrow(svm_ham.doc[svm_ham.doc$CONSENSUS_CODE==4,]) / nrow(svm_ham.doc)

output <- data.frame(SVM_CLASS=NA, "TRUE"=NA, "FALSE"=NA)[numeric(0), ]
output[1,] <- c('SPAM (POS)',svm.true.pos, svm.false.neg)
output[2,] <- c('HAM (NEG)',svm.true.neg,svm.false.pos)
kable(output,align= 'l')
SVM_CLASS TRUE. FALSE.
SPAM (POS) 0.98 0.02
HAM (NEG) 0.99 0.01
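
The same per-class figures can be cross-checked against the package's own summary; a sketch using RTextTools' create_precisionRecallSummary (not part of Lee's post):

# precision, recall and F-score per label for the SVM results
kable(create_precisionRecallSummary(container, svm.result))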
maxent.result   <- classify_model(container, maxent.model)
kable(head(maxent.result), align = 'l')
MAXENTROPY_LABEL MAXENTROPY_PROB
2 0.9900302
2 0.9791415
2 0.9484637
2 0.9874170
2 0.9902596
2 0.9764587
maxent.analytic <- create_analytics(container, maxent.result)
maxent.doc      <- maxent.analytic@document_summary
maxent_spam.doc <- maxent.doc[maxent.doc$MANUAL_CODE==4, ]
maxent_ham.doc  <- maxent.doc[maxent.doc$MANUAL_CODE==2, ]

maxent.true.pos <- nrow(maxent_spam.doc[maxent_spam.doc$CONSENSUS_CODE==4,]) / nrow(maxent_spam.doc)
maxent.false.neg<- nrow(maxent_spam.doc[maxent_spam.doc$CONSENSUS_CODE==2,]) / nrow(maxent_spam.doc)
maxent.true.neg <- nrow(maxent_ham.doc[maxent_ham.doc$CONSENSUS_CODE==2,]) / nrow(maxent_ham.doc)
maxent.false.pos<- nrow(maxent_ham.doc[maxent_ham.doc$CONSENSUS_CODE==4,]) / nrow(maxent_ham.doc)      
   
output <- data.frame(MAXENT_CLASS=NA, "TRUE"=NA, "FALSE"=NA)[numeric(0), ]
output[1,] <- c('SPAM (POS)',maxent.true.pos, maxent.false.neg)
output[2,] <- c('HAM (NEG)',maxent.true.neg,maxent.false.pos)
kable(output,align= 'l')      
MAXENT_CLASS TRUE. FALSE.
SPAM (POS) 0.91 0.09
HAM (NEG) 1 0
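
To back the probability comparison at the top of this section, a short check (assuming the svm.result and maxent.result objects from above) that summarizes each model's confidence scores:

# distribution of the label probabilities returned by each model
summary(svm.result$SVM_PROB)
summary(maxent.result$MAXENTROPY_PROB)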