It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents.
library(tm)
library(SnowballC)
library(stringr)
library(RWeka)
library(RTextTools)
The ham corpus contains 2500 emails: the first 2000 files are used to train a classifier model, and the last 500 are used to test it.
The spam corpus contains 1396 emails: the first 1000 files are used to train a classifier model, and the last 396 are used to test it.
The file “cmds” was removed from both source directories.
# Build one tm corpus per class from the e-mail files in each directory.
# `<-` replaces `=` for assignment and `FALSE` replaces the reassignable
# shorthand `F`; the reader settings themselves are unchanged.
# NOTE(review): `load` is forwarded in readerControl exactly as before --
# confirm the installed tm version still honours that option.
ham_corpus <- Corpus(
  DirSource("Week_10/easy_ham"),
  readerControl = list(language = "english", load = FALSE)
)
spam_corpus <- Corpus(
  DirSource("Week_10/spam_2"),
  readerControl = list(language = "english", load = FALSE)
)
# Tag every document with its known class in the document-level metadata
# field "mail_label".
# seq_len(length(x)) is used instead of 1:length(x) so that an empty corpus
# yields zero iterations rather than looping over the indices 1 and 0.
for (i in seq_len(length(ham_corpus))) {
  meta(ham_corpus[[i]], "mail_label") <- "ham"
}
for (i in seq_len(length(spam_corpus))) {
  meta(spam_corpus[[i]], "mail_label") <- "spam"
}
# Shared text-cleaning pipeline applied identically to both corpora.
#
# Steps, in order:
#   1. drop digits
#   2. replace punctuation with a space
#   3. lower-case the text
#   4. remove English stop words
#   5. collapse runs of whitespace
#   6. stem the remaining words
#
# Lower-casing now happens BEFORE stop-word removal. The stop-word list from
# stopwords("english") is lower-case, so with the previous order
# (removeWords first, tolower later) capitalised stop words such as "The"
# were not matched and stayed in the text.
# NOTE(review): this changes the resulting term counts, so any printed
# document-term-matrix summaries need to be regenerated (re-knit).
#
# @param corpus A tm corpus.
# @return The transformed corpus.
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(
    corpus,
    content_transformer(str_replace_all),
    pattern = "[[:punct:]]",
    replacement = " "
  )
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, words = stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)
}

ham_corpus <- clean_corpus(ham_corpus)
spam_corpus <- clean_corpus(spam_corpus)
# Build a document-term matrix per corpus and prune rare terms in one step.
# The sparsity threshold is 1 - 10 / (number of documents), which discards
# terms that occur in only a handful of documents (roughly ten or fewer;
# see the removeSparseTerms documentation for the exact cut-off rule).
ham_dtm <- removeSparseTerms(
  DocumentTermMatrix(ham_corpus),
  1 - (10 / length(ham_corpus))
)
spam_dtm <- removeSparseTerms(
  DocumentTermMatrix(spam_corpus),
  1 - (10 / length(spam_corpus))
)
Display the document-term matrix for non-spam emails
print(ham_dtm)
## <<DocumentTermMatrix (documents: 2500, terms: 3590)>>
## Non-/sparse entries: 347245/8627755
## Sparsity : 96%
## Maximal term length: 70
## Weighting : term frequency (tf)
Display the document-term matrix for spam emails
print(spam_dtm)
## <<DocumentTermMatrix (documents: 1396, terms: 3404)>>
## Non-/sparse entries: 244975/4507009
## Sparsity : 95%
## Maximal term length: 56
## Weighting : term frequency (tf)
# Flatten the per-document "mail_label" metadata into a plain label vector.
ham_mail <- unlist(meta(ham_corpus, "mail_label"))
N <- length(ham_mail)

# Container for the ham e-mails: rows 1-2000 train the model and rows
# 2001-N are held out for testing.
# NOTE(review): every document in ham_corpus was labelled "ham", so this
# container holds a single class -- the tree has nothing to discriminate
# against. Confirm whether a combined ham + spam container was intended.
ham_container <- create_container(
  ham_dtm,
  labels = ham_mail,
  trainSize = seq_len(2000),
  testSize = 2001:N,
  virgin = FALSE
)

# Fit a classification tree on the training rows.
ham_tree_model <- train_model(ham_container, "TREE")
# Flatten the per-document "mail_label" metadata into a plain label vector.
spam_mail <- unlist(meta(spam_corpus, "mail_label"))
S <- length(spam_mail)

# Container for the spam e-mails: rows 1-1000 train the model and rows
# 1001-S are held out for testing.
# NOTE(review): every document in spam_corpus was labelled "spam", so this
# container holds a single class -- the tree has nothing to discriminate
# against. Confirm whether a combined ham + spam container was intended.
spam_container <- create_container(
  spam_dtm,
  labels = spam_mail,
  trainSize = seq_len(1000),
  testSize = 1001:S,
  virgin = FALSE
)

# Fit a classification tree on the training rows.
spam_tree_model <- train_model(spam_container, "TREE")
# Run the fitted tree over the held-out ham rows of the container.
ham_tree_out <- classify_model(ham_container, ham_tree_model)

# Show the final six classified test ham emails.
tail(ham_tree_out, n = 6L)
## TREE_LABEL TREE_PROB
## 495 ham 1
## 496 ham 1
## 497 ham 1
## 498 ham 1
## 499 ham 1
## 500 ham 1
# Run the fitted tree over the held-out spam rows of the container.
spam_tree_out <- classify_model(spam_container, spam_tree_model)

# Show the final six classified test spam emails.
tail(spam_tree_out, n = 6L)
## TREE_LABEL TREE_PROB
## 391 spam 1
## 392 spam 1
## 393 spam 1
## 394 spam 1
## 395 spam 1
## 396 spam 1
# Pair the known labels of the held-out ham emails (rows 2001-N) with the
# labels predicted by the tree model (first column of ham_tree_out).
# `FALSE` replaces the reassignable shorthand `F`.
ham_labels_out <- data.frame(
  correct_label = ham_mail[2001:N],
  tree = as.character(ham_tree_out[, 1]),
  stringsAsFactors = FALSE
)

# Count how many predictions agree with the known label; columns are
# referenced by name rather than by position.
table(ham_labels_out[["correct_label"]] == ham_labels_out[["tree"]])
##
## TRUE
## 500
# Pair the known labels of the held-out spam emails (rows 1001-S) with the
# labels predicted by the tree model (first column of spam_tree_out).
# `FALSE` replaces the reassignable shorthand `F`.
spam_labels_out <- data.frame(
  correct_label = spam_mail[1001:S],
  tree = as.character(spam_tree_out[, 1]),
  stringsAsFactors = FALSE
)

# Count how many predictions agree with the known label; columns are
# referenced by name rather than by position.
table(spam_labels_out[["correct_label"]] == spam_labels_out[["tree"]])
##
## TRUE
## 396
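I observed that both models labeled every held-out test email with its expected class (500 of 500 ham emails and 396 of 396 spam emails). Note, however, that each model was trained and tested on a corpus containing only one class, so this agreement does not by itself show that the models can distinguish ham from spam; a meaningful evaluation would train a single classifier on a combined, labeled ham and spam corpus and test it on held-out emails of both classes.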