This reprises "R-Bloggers" contributor Dennis Lee's example of building a spam classifier with RTextTools and compares the output of the different models. The exercise largely recreates Lee's process in order to study the algorithms and to compare them with a similar test in "Automated Data Collection", pp. 310-313. The container object in that exercise partitions its document matrix by organizational labels, whereas the labels here are binary: spam or ham.

It was unclear from Lee's description which benchmark statistics were used for comparison with the models' output.

Lee comments that this is a largely unexplored area, as there are not many example models built with the RTextTools package; he therefore "explores the feasibility of building a model used to classify large text, i.e. raw text without any features."

http://www.r-bloggers.com/classifying-emails-as-spam-or-ham-using-rtexttools/

1 Load RTextTools & supporting packages
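
The post does not show the exact library calls; a minimal loading sketch covering the packages used below (RTextTools for the classifiers, tm for the TF-IDF weighting, knitr for the kable tables):

library(RTextTools)
library(tm)
library(knitr)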

2 Functions: collect & append lines from a list of files

# Read one raw email file and return its body: everything after the first
# blank line, which separates the header from the message.
get.msg <- function(path.dir)
{
  con <- file(path.dir, open="rt", encoding="latin1")
  text <- readLines(con)
  msg <- text[seq(which(text=="")[1]+1, length(text), 1)]
  close(con)
  return(paste(msg, collapse="\n"))
}
# As get.msg(), but fault tolerant: files whose body cannot be extracted
# return the string "Error" instead of stopping the run.
get.msg.try <- function(path.dir)
{
  con <- file(path.dir, open="rt", encoding="latin1")
  text <- readLines(con)
  options(warn=-1)   # silence warnings from malformed files
  msg <- tryCatch( text[seq(which(text=="")[1]+1, length(text), 1)],
                      error=function(e) { "Error" }, finally={} )
  close(con)
  if( substr(msg[1], 1, 5)=="Error" )
  {
    return("Error")
  }
  else
  {
    return(paste(msg, collapse="\n"))
  }
}
# Apply get.msg() to every file in a directory (skipping the corpus "cmds"
# file) and return a named character vector of message bodies.
# path.dir is expected to end with "/".
get.all <- function(path.dir, filter.string)
{
  all.file <- dir(path.dir, filter.string)
  all.file <- all.file[which(all.file!="cmds")]
  msg.all <- sapply(all.file, function(p) get.msg(paste0(path.dir, p)))
}
# Same as get.all(), but uses the fault-tolerant get.msg.try().
get.all.try <- function(path.dir, filter.string)
{
  all.file <- dir(path.dir, filter.string)
  all.file <- all.file[which(all.file!="cmds")]
  msg.all <- sapply(all.file, function(p) get.msg.try(paste0(path.dir, p)))
}

3 Create & split data into training/test sets
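
The excerpt never shows where the easy_ham.dfr, spam.dfr, easy_ham_2.dfr and spam_2.dfr data frames come from. A sketch of how they might be built with the helper functions from section 2, assuming the SpamAssassin public corpus has been unpacked into a local data/ directory (the paths, the "." filter pattern and the data.frame construction are assumptions, not Lee's exact code):

# assumed locations of the unpacked SpamAssassin corpus
easy_ham.path   <- "data/easy_ham/"
easy_ham_2.path <- "data/easy_ham_2/"
spam.path       <- "data/spam/"
spam_2.path     <- "data/spam_2/"

# one-column data frames of message bodies; the fault-tolerant readers are
# used for the spam folders, which contain some malformed files
easy_ham.dfr   <- data.frame(text=get.all(easy_ham.path, "."), stringsAsFactors=FALSE)
easy_ham_2.dfr <- data.frame(text=get.all(easy_ham_2.path, "."), stringsAsFactors=FALSE)
spam.dfr       <- data.frame(text=get.all.try(spam.path, "."), stringsAsFactors=FALSE)
spam_2.dfr     <- data.frame(text=get.all.try(spam_2.path, "."), stringsAsFactors=FALSE)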

# label the outcomes for training/evaluation: 2 = ham, 4 = spam
easy_ham.dfr$outcome    <- 2
easy_ham_2.dfr$outcome  <- 2
spam.dfr$outcome        <- 4
spam_2.dfr$outcome      <- 4

names(easy_ham.dfr)   <- c("text", "outcome")
names(easy_ham_2.dfr) <- c("text", "outcome")
names(spam.dfr)       <- c("text", "outcome")
names(spam_2.dfr)     <- c("text", "outcome")

# the first train.num rows are the training set; the appended rows become the test set
train.data  <- rbind(easy_ham.dfr, spam.dfr)
train.num   <- nrow(train.data)
train.data  <- rbind(train.data, easy_ham_2.dfr, spam_2.dfr)

4 Build model

#set.seed(100)
train_out.data <- train.data$outcome
train_txt.data <- train.data$text
matrix <- create_matrix(
    train_txt.data,language="english", 
    minWordLength=3, 
    removeNumbers=TRUE, 
    stemWords=FALSE, 
    removePunctuation=TRUE, 
    stripWhitespace=TRUE,
    weighting=tm::weightTfIdf
)
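
# optional sanity check (not in the original post): size of the
# document-term matrix, rows = documents, columns = terms
dim(matrix)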

spam_labels <- unlist(t(train_out.data))   # outcome codes as a plain vector

# Create container
container <- create_container(
    matrix,
    t(train_out.data),
    trainSize=1:train.num, 
    testSize=(train.num+1):nrow(train.data), 
    virgin=FALSE
)

slotnames <- as.data.frame(slotNames(container))
colnames(slotnames) <- c("SLOT NAMES")
kable(slotnames)

4.1 SLOT NAMES

training_matrix
classification_matrix
training_codes
testing_codes
column_names
virgin

#maximum entropy probability distribution
maxent.model    <- train_model(container, "MAXENT")

#non-probabilistic binary linear classifier
svm.model       <- train_model(container, "SVM")
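
Not used in this write-up, but RTextTools can also train and classify several algorithms in a single batch. A sketch, equivalent to the two train_model calls above:

models.batch  <- train_models(container, algorithms=c("MAXENT", "SVM"))
results.batch <- classify_models(container, models.batch)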

5 Compare model output

The Maximum Entropy model returns higher classification probabilities (confidence scores) than the Support Vector Machine model.

svm.result <- classify_model(container, svm.model)
kable(head(svm.result), align = 'l')
SVM_LABEL SVM_PROB
2 0.9492005
2 0.9025327
2 0.6902503
2 0.9326576
2 0.9607400
2 0.9188272
svm.analytic  <- create_analytics(container, svm.result)
svm.doc       <- svm.analytic@document_summary
svm_spam.doc  <- svm.doc[svm.doc$MANUAL_CODE==4, ]
svm_ham.doc   <- svm.doc[svm.doc$MANUAL_CODE==2, ]

# per-class rates: fraction of true spam classified as spam/ham, and of true ham classified as ham/spam
svm.true.pos  <- nrow(svm_spam.doc[svm_spam.doc$CONSENSUS_CODE==4,]) / nrow(svm_spam.doc)
svm.false.neg <- nrow(svm_spam.doc[svm_spam.doc$CONSENSUS_CODE==2,]) / nrow(svm_spam.doc)

svm.true.neg  <- nrow(svm_ham.doc[svm_ham.doc$CONSENSUS_CODE==2,]) / nrow(svm_ham.doc)
svm.false.pos <- nrow(svm_ham.doc[svm_ham.doc$CONSENSUS_CODE==4,]) / nrow(svm_ham.doc)

output <- data.frame(SVM_CLASS=NA, "TRUE"=NA, "FALSE"=NA)[numeric(0), ]
output[1,] <- c('SPAM (POS)',svm.true.pos, svm.false.neg)
output[2,] <- c('HAM (NEG)',svm.true.neg,svm.false.pos)
kable(output,align= 'l')
SVM_CLASS TRUE. FALSE.
SPAM (POS) 0.98 0.02
HAM (NEG) 0.99 0.01
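
The same per-class figures can be cross-checked against the package's own summary; a sketch using RTextTools' create_precisionRecallSummary (not part of Lee's post):

# precision, recall and F-score per label for the SVM results
kable(create_precisionRecallSummary(container, svm.result))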
maxent.result   <- classify_model(container, maxent.model)
kable(head(maxent.result), align = 'l')
MAXENTROPY_LABEL MAXENTROPY_PROB
2 0.9900302
2 0.9791415
2 0.9484637
2 0.9874170
2 0.9902596
2 0.9764587
maxent.analytic <- create_analytics(container, maxent.result)
maxent.doc      <- maxent.analytic@document_summary
maxent_spam.doc <- maxent.doc[maxent.doc$MANUAL_CODE==4, ]
maxent_ham.doc  <- maxent.doc[maxent.doc$MANUAL_CODE==2, ]

maxent.true.pos <- nrow(maxent_spam.doc[maxent_spam.doc$CONSENSUS_CODE==4,]) / nrow(maxent_spam.doc)
maxent.false.neg<- nrow(maxent_spam.doc[maxent_spam.doc$CONSENSUS_CODE==2,]) / nrow(maxent_spam.doc)
maxent.true.neg <- nrow(maxent_ham.doc[maxent_ham.doc$CONSENSUS_CODE==2,]) / nrow(maxent_ham.doc)
maxent.false.pos<- nrow(maxent_ham.doc[maxent_ham.doc$CONSENSUS_CODE==4,]) / nrow(maxent_ham.doc)      
   
output <- data.frame(MAXENT_CLASS=NA, "TRUE"=NA, "FALSE"=NA)[numeric(0), ]
output[1,] <- c('SPAM (POS)',maxent.true.pos, maxent.false.neg)
output[2,] <- c('HAM (NEG)',maxent.true.neg,maxent.false.pos)
kable(output,align= 'l')      
MAXENT_CLASS TRUE. FALSE.
SPAM (POS) 0.91 0.09
HAM (NEG) 1 0
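
To back the probability comparison at the top of this section, a short check (assuming the svm.result and maxent.result objects from above) that summarizes each model's confidence scores:

# distribution of the label probabilities returned by each model
summary(svm.result$SVM_PROB)
summary(maxent.result$MAXENTROPY_PROB)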