Document Classification: Spam or Ham

It is often useful to classify new “test” documents using documents that have already been classified (“training” documents). A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new message is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
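The code below assumes the four corpus archives have already been downloaded and extracted into the working directory. A minimal sketch of that step follows; the archive file names are assumptions based on the public corpus listing and may differ between snapshots of the corpus.

# Sketch: fetch and extract the corpus archives into the working directory.
# The archive names below are assumptions and may differ between corpus snapshots.
base.url <- "https://spamassassin.apache.org/publiccorpus/"
archives <- c("20021010_easy_ham.tar.bz2", "20021010_spam.tar.bz2",
              "20030228_easy_ham_2.tar.bz2", "20030228_spam_2.tar.bz2")
for (f in archives) {
  if (!file.exists(f)) download.file(paste0(base.url, f), destfile = f)
  untar(f)  # creates easy_ham/, spam/, easy_ham_2/, spam_2/
}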

library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(tm)
## Loading required package: NLP
# Read one e-mail file and return the message body: everything after the
# first blank line, which separates the header from the body.
get.msg.try <- function(path.dir)
{
  con <- file(path.dir, open="rt", encoding="latin1")
  text <- readLines(con, warn=FALSE)
  close(con)
  # If the file is empty or has no blank line, seq() fails; mark it "Error".
  msg <- tryCatch( text[seq(which(text=="")[1]+1, length(text), 1)],
                   error=function(e) { "Error" } )
  if( length(msg)==1 && msg=="Error" )
  {
    return("Error")
  }
  else
  {
    return(paste(msg, collapse="\n"))
  }
}
# Read every message in a directory (skipping the corpus "cmds" file) and
# return a named character vector of message bodies.
get.all.try <- function(path.dir)
{
  all.file <- dir(path.dir)
  all.file <- all.file[which(all.file!="cmds")]
  msg.all  <- sapply(all.file, function(p) get.msg.try(paste0(path.dir, p)))
  return(msg.all)
}
easy_ham.all    <- get.all.try("easy_ham/")
easy_ham_2.all  <- get.all.try("easy_ham_2/")
spam.all        <- get.all.try("spam/")
spam_2.all      <- get.all.try("spam_2/")
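Any file that failed to parse was marked "Error" above. As an optional sanity step (a sketch; in this corpus snapshot there may be no such files), those entries can be dropped before building the data frames:

# Drop any messages that could not be parsed (marked "Error" above).
easy_ham.all   <- easy_ham.all[easy_ham.all != "Error"]
easy_ham_2.all <- easy_ham_2.all[easy_ham_2.all != "Error"]
spam.all       <- spam.all[spam.all != "Error"]
spam_2.all     <- spam_2.all[spam_2.all != "Error"]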

# Keep the message text as character strings (not factors).
easy_ham.dfr    <- as.data.frame(easy_ham.all,   stringsAsFactors=FALSE)
easy_ham_2.dfr  <- as.data.frame(easy_ham_2.all, stringsAsFactors=FALSE)
spam.dfr        <- as.data.frame(spam.all,       stringsAsFactors=FALSE)
spam_2.dfr      <- as.data.frame(spam_2.all,     stringsAsFactors=FALSE)
rownames(easy_ham.dfr)    <- NULL
rownames(easy_ham_2.dfr)  <- NULL
rownames(spam.dfr)        <- NULL
rownames(spam_2.dfr)      <- NULL
# 0 represents ham, and 1 represents spam
easy_ham.dfr$outcome    <- 0
easy_ham_2.dfr$outcome  <- 0
spam.dfr$outcome        <- 1
spam_2.dfr$outcome      <- 1
names(easy_ham.dfr)   <- c("text", "outcome")
names(easy_ham_2.dfr) <- c("text", "outcome")
names(spam.dfr)       <- c("text", "outcome")
names(spam_2.dfr)     <- c("text", "outcome")
# The first train.num rows (easy_ham + spam) are the training set;
# easy_ham_2 and spam_2 are appended afterwards as the held-out test set.
train.data  <- rbind(easy_ham.dfr, spam.dfr)
train.num   <- nrow(train.data)
train.data  <- rbind(train.data, easy_ham_2.dfr, spam_2.dfr)
names(train.data) <- c("text", "outcome")

set.seed(2012)
train_out.data <- train.data$outcome
train_txt.data <- train.data$text
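A quick sanity check (output not shown) that both the training rows and the held-out rows contain a mix of ham and spam:

# Class counts in the training portion and in the held-out portion.
table(train_out.data[1:train.num])
table(train_out.data[(train.num+1):nrow(train.data)])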

matrix <- create_matrix(train_txt.data, language="english", minWordLength=3,
                        removeNumbers=TRUE, stemWords=FALSE,
                        removePunctuation=TRUE, weighting=weightTfIdf)
container <- create_container(matrix, t(train_out.data),
                              trainSize=1:train.num,
                              testSize=(train.num+1):nrow(train.data),
                              virgin=FALSE)

# maxent model
maxent.model    <- train_model(container, "MAXENT")
maxent.result   <- classify_model(container, maxent.model)
maxent.analytic <- create_analytics(container, maxent.result)
maxent.doc      <- maxent.analytic@document_summary
maxent_spam.doc <- maxent.doc[maxent.doc$MANUAL_CODE==1, ]
maxent_ham.doc  <- maxent.doc[maxent.doc$MANUAL_CODE==0, ]
(maxent.true.pos  <- nrow(maxent_spam.doc[maxent_spam.doc$CONSENSUS_CODE==1,]) / nrow(maxent_spam.doc))
## [1] 0.9040115
(maxent.false.neg<- nrow(maxent_spam.doc[maxent_spam.doc$CONSENSUS_CODE==0,]) / nrow(maxent_spam.doc))
## [1] 0.09598854
(maxent.true.neg <- nrow(maxent_ham.doc[maxent_ham.doc$CONSENSUS_CODE==0,]) / nrow(maxent_ham.doc))
## [1] 0.9928571
(maxent.false.pos<- nrow(maxent_ham.doc[maxent_ham.doc$CONSENSUS_CODE==1,]) / nrow(maxent_ham.doc))
## [1] 0.007142857
(maxent.acc <- (nrow(maxent_spam.doc[maxent_spam.doc$CONSENSUS_CODE==1,]) + nrow(maxent_ham.doc[maxent_ham.doc$CONSENSUS_CODE==0,]))/nrow(maxent.doc))
## [1] 0.9484979
df.maxent <- cbind(data.frame(model = 'MAXENT', 
                        TP = nrow(maxent_spam.doc[maxent_spam.doc$CONSENSUS_CODE==1,]),
                        FP = nrow(maxent_ham.doc[maxent_ham.doc$CONSENSUS_CODE==1,]),
                        TN = nrow(maxent_ham.doc[maxent_ham.doc$CONSENSUS_CODE==0,]),
                        FN = nrow(maxent_spam.doc[maxent_spam.doc$CONSENSUS_CODE==0,]),
                        TPR = maxent.true.pos,
                        FNR = maxent.false.neg,
                        TNR = maxent.true.neg,
                        FPR = maxent.false.pos,
                        ACC = maxent.acc), 
                        maxent.analytic@algorithm_summary[2, ])
names(df.maxent)[11:13] <- c("PRECISION", "RECALL", "FSCORE")
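The cell counts computed above can also be cross-checked in a single call (a sketch; output not shown):

# Confusion matrix: rows are the true labels, columns the consensus predictions.
table(maxent.doc$MANUAL_CODE, maxent.doc$CONSENSUS_CODE)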

# svm model
svm.model       <- train_model(container, "SVM")
svm.result    <- classify_model(container, svm.model)
svm.analytic  <- create_analytics(container, svm.result)
svm.doc       <- svm.analytic@document_summary
svm_spam.doc  <- svm.doc[svm.doc$MANUAL_CODE==1, ]
svm_ham.doc   <- svm.doc[svm.doc$MANUAL_CODE==0, ]
(svm.true.pos  <- nrow(svm_spam.doc[svm_spam.doc$CONSENSUS_CODE==1,]) / nrow(svm_spam.doc))
## [1] 0.9255014
(svm.false.neg <- nrow(svm_spam.doc[svm_spam.doc$CONSENSUS_CODE==0,]) / nrow(svm_spam.doc))
## [1] 0.07449857
(svm.true.neg  <- nrow(svm_ham.doc[svm_ham.doc$CONSENSUS_CODE==0,]) / nrow(svm_ham.doc))
## [1] 0.9871429
(svm.false.pos <- nrow(svm_ham.doc[svm_ham.doc$CONSENSUS_CODE==1,]) / nrow(svm_ham.doc))
## [1] 0.01285714
(svm.acc <- (nrow(svm_spam.doc[svm_spam.doc$CONSENSUS_CODE==1,]) + nrow(svm_ham.doc[svm_ham.doc$CONSENSUS_CODE==0,]))/nrow(svm.doc))
## [1] 0.9563662
df.svm <- cbind(data.frame(model = 'SVM', 
                        TP = nrow(svm_spam.doc[svm_spam.doc$CONSENSUS_CODE==1,]),
                        FP = nrow(svm_ham.doc[svm_ham.doc$CONSENSUS_CODE==1,]),
                        TN = nrow(svm_ham.doc[svm_ham.doc$CONSENSUS_CODE==0,]),
                        FN = nrow(svm_spam.doc[svm_spam.doc$CONSENSUS_CODE==0,]),
                        TPR = svm.true.pos,
                        FNR = svm.false.neg,
                        TNR = svm.true.neg,
                        FPR = svm.false.pos,
                        ACC = svm.acc), 
                svm.analytic@algorithm_summary[2, ])
names(df.svm)[11:13] <- c("PRECISION", "RECALL", "FSCORE")
(df <- rbind(df.maxent, df.svm))
##     model   TP FP   TN  FN       TPR        FNR       TNR         FPR
## 1  MAXENT 1262 10 1390 134 0.9040115 0.09598854 0.9928571 0.007142857
## 11    SVM 1292 18 1382 104 0.9255014 0.07449857 0.9871429 0.012857143
##          ACC PRECISION RECALL FSCORE
## 1  0.9484979      0.99   0.90   0.94
## 11 0.9563662      0.99   0.93   0.96
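As a cross-check, the F-scores in the table follow directly from the reported precision and recall, F = 2PR/(P + R); a small sketch using the rounded values from the table:

# F-score from precision and recall, using the rounded values reported above.
fscore <- function(p, r) 2 * p * r / (p + r)
fscore(0.99, 0.90)  # MAXENT, approximately 0.94
fscore(0.99, 0.93)  # SVM, approximately 0.96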

Conclusion:

  1. In filtering out spam, the accuracies of the maxent and SVM models are close, and the F-score is slightly higher for the SVM model (96% vs. 94%). However, filtering a ham message out as spam is more costly than letting a few spam messages through, because important e-mail could be lost; false positives should therefore be weighted more heavily than false negatives. The maxent model's false positive rate is 0.7% versus about 1.3% for the SVM, so I select the maxent model over the SVM.