TextMining - Document Classification
It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
Installing necessary packages
#install.packages("tm")
#install.packages("tidytext")
#install.packages("RTextTools")
Analyzing Spam Email and uncovering Key Terms that can help with spam classification
Defining spam and ham directory paths and building function for extracting only the message content (no email headers)
setwd("C:/DATA/HHP/Personal/Degrees/Ms. Data Science (CUNY)/R Working Dir")
spam_path <- "C:/DATA/HHP/Personal/Degrees/Ms. Data Science (CUNY)/R Working Dir/spam_2/"
ham_path <- "C:/DATA/HHP/Personal/Degrees/Ms. Data Science (CUNY)/R Working Dir/easy_ham/"
get_msg <- function(path) {
con <- file(path,open="rt",encoding="latin1")
text <- readLines(con)
msg <- text[seq(which(text=="")[1]+1,length(text))]
close(con)
return(paste(msg,collapse="\n"))
}
Building spam and ham email datasets
spam_files <- dir(spam_path)
spam_ds <- sapply(spam_files, function(p) get_msg(paste(spam_path, p, sep="")))
ham_files <- dir(ham_path)
ham_ds <- sapply(ham_files, function(p) get_msg(paste(ham_path, p, sep="")))
length(spam_ds)
## [1] 1396
length(ham_ds)
## [1] 2500
Create Corpus and Term-Document Matrices for spam and ham data sets
library(tm)
## Loading required package: NLP
control <- list(stopwords=TRUE, removePunctuation=TRUE, removeNumbers=TRUE, minDocFreq=2)
spam_corpus <- Corpus(VectorSource(spam_ds))
spam_tdm <- TermDocumentMatrix(spam_corpus, control)
ham_corpus <- Corpus(VectorSource(ham_ds))
ham_tdm <- TermDocumentMatrix(ham_corpus, control)
spam_tdm
## <<TermDocumentMatrix (terms: 34788, documents: 1396)>>
## Non-/sparse entries: 200489/48363559
## Sparsity : 100%
## Maximal term length: 121
## Weighting : term frequency (tf)
ham_tdm
## <<TermDocumentMatrix (terms: 25280, documents: 2500)>>
## Non-/sparse entries: 208429/62991571
## Sparsity : 100%
## Maximal term length: 76
## Weighting : term frequency (tf)
Removing sparse terms (80% of sparse percentage of empty)
spam_tdm_unsprsd <- removeSparseTerms(spam_tdm, 0.8)
ham_tdm_unsprsd <- removeSparseTerms(ham_tdm, 0.8)
spam_tdm_unsprsd
## <<TermDocumentMatrix (terms: 79, documents: 1396)>>
## Non-/sparse entries: 37731/72553
## Sparsity : 66%
## Maximal term length: 11
## Weighting : term frequency (tf)
ham_tdm_unsprsd
## <<TermDocumentMatrix (terms: 24, documents: 2500)>>
## Non-/sparse entries: 19242/40758
## Sparsity : 68%
## Maximal term length: 8
## Weighting : term frequency (tf)
Top terms by frequency (mentioned at least 50 times)
length(findFreqTerms(spam_tdm_unsprsd,50))
## [1] 79
spam_topterms <- findFreqTerms(spam_tdm_unsprsd,50)
spam_topterms
## [1] "address" "business" "click" "email" "form"
## [6] "free" "get" "http" "information" "just"
## [11] "like" "list" "listinfo" "mailing" "may"
## [16] "message" "money" "new" "one" "please"
## [21] "receive" "send" "subject" "time" "type"
## [26] "use" "will" "www" "body" "can"
## [31] "center" "color" "com" "don" "font"
## [36] "home" "href" "html" "mail" "mailto"
## [41] "make" "name" "now" "right" "today"
## [46] "remove" "removed" "want" "nbsp" "size"
## [51] "table" "width" "align" "bgcolor" "border"
## [56] "cellpadding" "cellspacing" "charset" "content" "div"
## [61] "equiv" "face" "ffffff" "head" "height"
## [66] "meta" "net" "text" "title" "arial"
## [71] "best" "internet" "top" "link" "helvetica"
## [76] "sans" "serif" "img" "src"
length(findFreqTerms(ham_tdm_unsprsd,50))
## [1] 24
ham_topterms <- findFreqTerms(ham_tdm_unsprsd,50)
ham_topterms
## [1] "can" "com" "date" "get" "like" "list"
## [7] "listinfo" "mailing" "mailman" "one" "time" "email"
## [13] "http" "now" "use" "new" "www" "just"
## [19] "will" "wrote" "lists" "net" "don" "url"
Find top associations for the top 10 terms (lower correlation limit of 0.4)
spam_assocs <- findAssocs(spam_tdm_unsprsd, spam_topterms[1:10], 0.4) ## more consistent association patterns found in spam
spam_assocs
## $address
## send mail make one will list can internet
## 0.67 0.66 0.56 0.52 0.47 0.44 0.44 0.43
## now time
## 0.42 0.41
##
## $business
## money html home
## 0.55 0.46 0.42
##
## $click
## ffffff
## 0.42
##
## $email
## receive
## 0.43
##
## $form
## name type width right charset align
## 0.67 0.64 0.50 0.42 0.42 0.40
##
## $free
## can money one will home don today best want
## 0.63 0.61 0.56 0.54 0.51 0.47 0.46 0.45 0.41
##
## $get
## money can will one home make don send
## 0.74 0.72 0.68 0.66 0.57 0.56 0.54 0.48
## want like time font internet best com now
## 0.47 0.46 0.46 0.46 0.46 0.45 0.43 0.42
## face use today
## 0.42 0.41 0.41
##
## $http
## www href com div size font align net
## 0.97 0.94 0.76 0.74 0.71 0.56 0.54 0.53
## color internet nbsp
## 0.46 0.46 0.43
##
## $information
## one can will use don like make time
## 0.62 0.59 0.57 0.54 0.53 0.50 0.50 0.48
## send want message com mailto today may internet
## 0.47 0.46 0.45 0.45 0.45 0.45 0.44 0.43
## subject
## 0.42
##
## $just
## can one will make like send don time
## 0.80 0.79 0.73 0.71 0.69 0.67 0.64 0.59
## use now want internet com today money mailto
## 0.55 0.52 0.52 0.52 0.51 0.50 0.49 0.47
## best receive list message
## 0.47 0.43 0.41 0.40
ham_assocs <- findAssocs(ham_tdm_unsprsd, ham_topterms[1:10], 0.4) ## no consistent association patterns found in ham
ham_assocs
## $can
## will new time just use
## 0.72 0.63 0.59 0.50 0.49
##
## $com
## www http
## 0.86 0.80
##
## $date
## url
## 0.45
##
## $get
## numeric(0)
##
## $like
## just
## 0.43
##
## $list
## net
## 0.54
##
## $listinfo
## net lists
## 0.53 0.44
##
## $mailing
## net lists
## 0.57 0.51
##
## $mailman
## numeric(0)
##
## $one
## now
## 0.45
Generate Master list of spam identification terms and create a wordcloud
spam_id_terms <- setdiff(spam_topterms, ham_topterms)
spam_id_terms
## [1] "address" "business" "click" "form" "free"
## [6] "information" "may" "message" "money" "please"
## [11] "receive" "send" "subject" "type" "body"
## [16] "center" "color" "font" "home" "href"
## [21] "html" "mail" "mailto" "make" "name"
## [26] "right" "today" "remove" "removed" "want"
## [31] "nbsp" "size" "table" "width" "align"
## [36] "bgcolor" "border" "cellpadding" "cellspacing" "charset"
## [41] "content" "div" "equiv" "face" "ffffff"
## [46] "head" "height" "meta" "text" "title"
## [51] "arial" "best" "internet" "top" "link"
## [56] "helvetica" "sans" "serif" "img" "src"
library(wordcloud)
## Loading required package: RColorBrewer
spam_tdm_cloud <- as.matrix(spam_tdm_unsprsd)
v <- sort(rowSums(spam_tdm_cloud),decreasing=TRUE)
d <- data.frame(word=names(v),freq=v)
wordcloud(d$word,d$freq,max.words=50, min.freq=50, colors=brewer.pal(8, 'Dark2'))

Classification of emails (Spam and Ham)
Creating a combined/scrambled spam+ham corpus and assigning labels
spam_corpus <- VCorpus(VectorSource(spam_ds))
for (i in 1:length(spam_corpus)) {
meta(spam_corpus[[i]], tag="emailtype") <- c("Spam")
}
ham_corpus <- VCorpus(VectorSource(ham_ds))
for (i in 1:length(ham_corpus)) {
meta(ham_corpus[[i]], tag="emailtype") <- c("Ham")
}
email_corpus <- c(spam_corpus, ham_corpus)
email_corpus <- email_corpus[sample(c(1:length(email_corpus)))]
Creating DTM and removing sparse terms (80% of sparse percentage of empty)
controldtm <- list(stopwords=TRUE, removePunctuation=TRUE, removeNumbers=TRUE)
email_dtm <- DocumentTermMatrix(email_corpus, controldtm)
email_dtm
## <<DocumentTermMatrix (documents: 3896, terms: 70801)>>
## Non-/sparse entries: 407584/275433112
## Sparsity : 100%
## Maximal term length: 868
## Weighting : term frequency (tf)
email_dtm_unsprsd <- removeSparseTerms(email_dtm, 0.8)
email_dtm_unsprsd
## <<DocumentTermMatrix (documents: 3896, terms: 17)>>
## Non-/sparse entries: 17927/48305
## Sparsity : 73%
## Maximal term length: 7
## Weighting : term frequency (tf)
inspect(email_dtm_unsprsd[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 22/78
## Sparsity : 78%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs can date email free get just like list mailing message
## 1139 0 0 0 0 0 2 0 2 1 0
## 1217 0 0 0 0 1 0 0 0 0 0
## 1677 0 0 4 0 0 0 1 0 0 1
## 1769 2 0 0 0 0 0 0 0 0 1
## 2008 0 1 0 0 0 0 0 0 0 0
## 212 0 0 0 0 2 0 0 2 0 5
## 2355 0 1 0 0 0 0 0 0 0 0
## 256 0 0 2 4 0 0 0 2 2 3
## 785 0 0 0 0 0 0 0 0 0 0
## 974 0 0 0 0 0 0 1 1 1 0
Training the models (Support Vector Machines, Random Forest and MaxEntropy)
svm_model <- train_model(container_email, "SVM")
tree_model <- train_model(container_email, "TREE")
maxent_model <- train_model(container_email, "MAXENT")
Scoring Emails (Spam and Ham)
svm_score <- classify_model(container_email, svm_model)
tree_score <- classify_model(container_email, tree_model)
maxent_score <- classify_model(container_email, maxent_model)
Presenting Results
library(knitr)
table(email_labels[splitN:N])
##
## Ham Spam
## 751 419
table(as.character(svm_score[,1]))
##
## Ham Spam
## 742 428
table(as.character(tree_score[,1]))
##
## Ham Spam
## 742 428
table(as.character(maxent_score[,1]))
##
## Ham Spam
## 803 367
results <- data.frame(
email_labels[splitN:N],
as.character(svm_score[,1]),
as.character(tree_score[,1]),
as.character(maxent_score[,1]))
colnames(results) <- c("CorrectLabel", "SVM", "DTree", "MaxEntropy")
kable(summary(results))
|
Ham :751 |
Ham :742 |
Ham :742 |
Ham :803 |
|
Spam:419 |
Spam:428 |
Spam:428 |
Spam:367 |
Better classification performance shown by the MaxEntropy classifier, followed by SVM. Random Forest showed the worst of the three