TextMining - Document Classification

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

Installing necessary packages

#install.packages("tm")
#install.packages("tidytext")
#install.packages("RTextTools")

Analyzing Spam Email and uncovering Key Terms that can help with spam classification

Defining spam and ham directory paths and building function for extracting only the message content (no email headers)

setwd("C:/DATA/HHP/Personal/Degrees/Ms. Data Science (CUNY)/R Working Dir")
spam_path <- "C:/DATA/HHP/Personal/Degrees/Ms. Data Science (CUNY)/R Working Dir/spam_2/"
ham_path <- "C:/DATA/HHP/Personal/Degrees/Ms. Data Science (CUNY)/R Working Dir/easy_ham/"
get_msg <- function(path) {
  con <- file(path,open="rt",encoding="latin1")
  text <- readLines(con)
  msg <- text[seq(which(text=="")[1]+1,length(text))]  
  close(con)
  return(paste(msg,collapse="\n"))
}

Building spam and ham email datasets

spam_files <- dir(spam_path)
spam_ds <- sapply(spam_files, function(p) get_msg(paste(spam_path, p, sep="")))
ham_files <- dir(ham_path)
ham_ds <- sapply(ham_files, function(p) get_msg(paste(ham_path, p, sep="")))
length(spam_ds)
## [1] 1396
length(ham_ds)
## [1] 2500

Create Corpus and Term-Document Matrices for spam and ham data sets

library(tm)
## Loading required package: NLP
control <- list(stopwords=TRUE, removePunctuation=TRUE, removeNumbers=TRUE, minDocFreq=2)
spam_corpus <- Corpus(VectorSource(spam_ds))
spam_tdm <- TermDocumentMatrix(spam_corpus, control)
ham_corpus <- Corpus(VectorSource(ham_ds))
ham_tdm <- TermDocumentMatrix(ham_corpus, control)
spam_tdm
## <<TermDocumentMatrix (terms: 34788, documents: 1396)>>
## Non-/sparse entries: 200489/48363559
## Sparsity           : 100%
## Maximal term length: 121
## Weighting          : term frequency (tf)
ham_tdm
## <<TermDocumentMatrix (terms: 25280, documents: 2500)>>
## Non-/sparse entries: 208429/62991571
## Sparsity           : 100%
## Maximal term length: 76
## Weighting          : term frequency (tf)

Removing sparse terms (80% of sparse percentage of empty)

spam_tdm_unsprsd <- removeSparseTerms(spam_tdm, 0.8)
ham_tdm_unsprsd <- removeSparseTerms(ham_tdm, 0.8)
spam_tdm_unsprsd
## <<TermDocumentMatrix (terms: 79, documents: 1396)>>
## Non-/sparse entries: 37731/72553
## Sparsity           : 66%
## Maximal term length: 11
## Weighting          : term frequency (tf)
ham_tdm_unsprsd
## <<TermDocumentMatrix (terms: 24, documents: 2500)>>
## Non-/sparse entries: 19242/40758
## Sparsity           : 68%
## Maximal term length: 8
## Weighting          : term frequency (tf)

Top terms by frequency (mentioned at least 50 times)

length(findFreqTerms(spam_tdm_unsprsd,50))
## [1] 79
spam_topterms <- findFreqTerms(spam_tdm_unsprsd,50)
spam_topterms
##  [1] "address"     "business"    "click"       "email"       "form"       
##  [6] "free"        "get"         "http"        "information" "just"       
## [11] "like"        "list"        "listinfo"    "mailing"     "may"        
## [16] "message"     "money"       "new"         "one"         "please"     
## [21] "receive"     "send"        "subject"     "time"        "type"       
## [26] "use"         "will"        "www"         "body"        "can"        
## [31] "center"      "color"       "com"         "don"         "font"       
## [36] "home"        "href"        "html"        "mail"        "mailto"     
## [41] "make"        "name"        "now"         "right"       "today"      
## [46] "remove"      "removed"     "want"        "nbsp"        "size"       
## [51] "table"       "width"       "align"       "bgcolor"     "border"     
## [56] "cellpadding" "cellspacing" "charset"     "content"     "div"        
## [61] "equiv"       "face"        "ffffff"      "head"        "height"     
## [66] "meta"        "net"         "text"        "title"       "arial"      
## [71] "best"        "internet"    "top"         "link"        "helvetica"  
## [76] "sans"        "serif"       "img"         "src"
length(findFreqTerms(ham_tdm_unsprsd,50))
## [1] 24
ham_topterms <- findFreqTerms(ham_tdm_unsprsd,50)
ham_topterms
##  [1] "can"      "com"      "date"     "get"      "like"     "list"    
##  [7] "listinfo" "mailing"  "mailman"  "one"      "time"     "email"   
## [13] "http"     "now"      "use"      "new"      "www"      "just"    
## [19] "will"     "wrote"    "lists"    "net"      "don"      "url"

Find top associations for the top 10 terms (lower correlation limit of 0.4)

spam_assocs <- findAssocs(spam_tdm_unsprsd, spam_topterms[1:10], 0.4) ## more consistent association patterns found in spam
spam_assocs
## $address
##     send     mail     make      one     will     list      can internet 
##     0.67     0.66     0.56     0.52     0.47     0.44     0.44     0.43 
##      now     time 
##     0.42     0.41 
## 
## $business
## money  html  home 
##  0.55  0.46  0.42 
## 
## $click
## ffffff 
##   0.42 
## 
## $email
## receive 
##    0.43 
## 
## $form
##    name    type   width   right charset   align 
##    0.67    0.64    0.50    0.42    0.42    0.40 
## 
## $free
##   can money   one  will  home   don today  best  want 
##  0.63  0.61  0.56  0.54  0.51  0.47  0.46  0.45  0.41 
## 
## $get
##    money      can     will      one     home     make      don     send 
##     0.74     0.72     0.68     0.66     0.57     0.56     0.54     0.48 
##     want     like     time     font internet     best      com      now 
##     0.47     0.46     0.46     0.46     0.46     0.45     0.43     0.42 
##     face      use    today 
##     0.42     0.41     0.41 
## 
## $http
##      www     href      com      div     size     font    align      net 
##     0.97     0.94     0.76     0.74     0.71     0.56     0.54     0.53 
##    color internet     nbsp 
##     0.46     0.46     0.43 
## 
## $information
##      one      can     will      use      don     like     make     time 
##     0.62     0.59     0.57     0.54     0.53     0.50     0.50     0.48 
##     send     want  message      com   mailto    today      may internet 
##     0.47     0.46     0.45     0.45     0.45     0.45     0.44     0.43 
##  subject 
##     0.42 
## 
## $just
##      can      one     will     make     like     send      don     time 
##     0.80     0.79     0.73     0.71     0.69     0.67     0.64     0.59 
##      use      now     want internet      com    today    money   mailto 
##     0.55     0.52     0.52     0.52     0.51     0.50     0.49     0.47 
##     best  receive     list  message 
##     0.47     0.43     0.41     0.40
ham_assocs <- findAssocs(ham_tdm_unsprsd, ham_topterms[1:10], 0.4) ## no consistent association patterns found in ham
ham_assocs
## $can
## will  new time just  use 
## 0.72 0.63 0.59 0.50 0.49 
## 
## $com
##  www http 
## 0.86 0.80 
## 
## $date
##  url 
## 0.45 
## 
## $get
## numeric(0)
## 
## $like
## just 
## 0.43 
## 
## $list
##  net 
## 0.54 
## 
## $listinfo
##   net lists 
##  0.53  0.44 
## 
## $mailing
##   net lists 
##  0.57  0.51 
## 
## $mailman
## numeric(0)
## 
## $one
##  now 
## 0.45

Generate Master list of spam identification terms and create a wordcloud

spam_id_terms <- setdiff(spam_topterms, ham_topterms)
spam_id_terms
##  [1] "address"     "business"    "click"       "form"        "free"       
##  [6] "information" "may"         "message"     "money"       "please"     
## [11] "receive"     "send"        "subject"     "type"        "body"       
## [16] "center"      "color"       "font"        "home"        "href"       
## [21] "html"        "mail"        "mailto"      "make"        "name"       
## [26] "right"       "today"       "remove"      "removed"     "want"       
## [31] "nbsp"        "size"        "table"       "width"       "align"      
## [36] "bgcolor"     "border"      "cellpadding" "cellspacing" "charset"    
## [41] "content"     "div"         "equiv"       "face"        "ffffff"     
## [46] "head"        "height"      "meta"        "text"        "title"      
## [51] "arial"       "best"        "internet"    "top"         "link"       
## [56] "helvetica"   "sans"        "serif"       "img"         "src"
library(wordcloud)
## Loading required package: RColorBrewer
spam_tdm_cloud <- as.matrix(spam_tdm_unsprsd)
v <- sort(rowSums(spam_tdm_cloud),decreasing=TRUE)
d <- data.frame(word=names(v),freq=v)   
wordcloud(d$word,d$freq,max.words=50, min.freq=50, colors=brewer.pal(8, 'Dark2'))

Classification of emails (Spam and Ham)

Creating a combined/scrambled spam+ham corpus and assigning labels

spam_corpus <- VCorpus(VectorSource(spam_ds))
for (i in 1:length(spam_corpus)) {
  meta(spam_corpus[[i]], tag="emailtype") <- c("Spam")
}
ham_corpus <- VCorpus(VectorSource(ham_ds))
for (i in 1:length(ham_corpus)) {
  meta(ham_corpus[[i]], tag="emailtype") <- c("Ham")
}
email_corpus <- c(spam_corpus, ham_corpus)
email_corpus <- email_corpus[sample(c(1:length(email_corpus)))]

Creating DTM and removing sparse terms (80% of sparse percentage of empty)

controldtm <- list(stopwords=TRUE, removePunctuation=TRUE, removeNumbers=TRUE)
email_dtm <- DocumentTermMatrix(email_corpus, controldtm)
email_dtm
## <<DocumentTermMatrix (documents: 3896, terms: 70801)>>
## Non-/sparse entries: 407584/275433112
## Sparsity           : 100%
## Maximal term length: 868
## Weighting          : term frequency (tf)
email_dtm_unsprsd <- removeSparseTerms(email_dtm, 0.8)
email_dtm_unsprsd
## <<DocumentTermMatrix (documents: 3896, terms: 17)>>
## Non-/sparse entries: 17927/48305
## Sparsity           : 73%
## Maximal term length: 7
## Weighting          : term frequency (tf)
inspect(email_dtm_unsprsd[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 22/78
## Sparsity           : 78%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   can date email free get just like list mailing message
##   1139   0    0     0    0   0    2    0    2       1       0
##   1217   0    0     0    0   1    0    0    0       0       0
##   1677   0    0     4    0   0    0    1    0       0       1
##   1769   2    0     0    0   0    0    0    0       0       1
##   2008   0    1     0    0   0    0    0    0       0       0
##   212    0    0     0    0   2    0    0    2       0       5
##   2355   0    1     0    0   0    0    0    0       0       0
##   256    0    0     2    4   0    0    0    2       2       3
##   785    0    0     0    0   0    0    0    0       0       0
##   974    0    0     0    0   0    0    1    1       1       0

Creating Container with all relevant information to use in the classification procedure

library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
email_labels <- unlist(meta(email_corpus, "emailtype"))
N <- length(email_labels)
splitN <- round(0.7*N)
table(email_labels)
## email_labels
##  Ham Spam 
## 2500 1396
container_email <- create_container(email_dtm_unsprsd, labels = email_labels, trainSize = 1:splitN, testSize = splitN:N, virgin = FALSE)

Training the models (Support Vector Machines, Random Forest and MaxEntropy)

svm_model <- train_model(container_email, "SVM")
tree_model <- train_model(container_email, "TREE")
maxent_model <- train_model(container_email, "MAXENT")

Scoring Emails (Spam and Ham)

svm_score <- classify_model(container_email, svm_model)
tree_score <- classify_model(container_email, tree_model)
maxent_score <- classify_model(container_email, maxent_model)

Presenting Results

library(knitr)
table(email_labels[splitN:N])
## 
##  Ham Spam 
##  751  419
table(as.character(svm_score[,1]))
## 
##  Ham Spam 
##  742  428
table(as.character(tree_score[,1]))
## 
##  Ham Spam 
##  742  428
table(as.character(maxent_score[,1]))
## 
##  Ham Spam 
##  803  367
results <- data.frame(
  email_labels[splitN:N],
  as.character(svm_score[,1]),
  as.character(tree_score[,1]),
  as.character(maxent_score[,1]))

colnames(results) <- c("CorrectLabel", "SVM", "DTree", "MaxEntropy")
kable(summary(results))
CorrectLabel SVM DTree MaxEntropy
Ham :751 Ham :742 Ham :742 Ham :803
Spam:419 Spam:428 Spam:428 Spam:367

Better classification performance shown by the MaxEntropy classifier, followed by SVM. Random Forest showed the worst of the three