It can be useful to classify new “test” documents using documents that have already been classified (“training” documents). A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new message is spam.
For this project, we start with a spam/ham dataset and then predict the class of new documents, either withheld from the training dataset or drawn from another source such as your own spam folder.
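Concretely, the idea is to hold out part of the labeled data, fit a model on the rest, and check how well it recovers the withheld labels. A minimal sketch of that split, using made-up messages and labels rather than the data used below:
## Toy illustration of the hold-out idea (invented example data)
msgs   <- c("win a free prize now", "meeting moved to 3pm",
            "cheap meds online", "lunch tomorrow?")
labels <- c("SPAM", "HAM", "SPAM", "HAM")
set.seed(1)
train_idx <- sample(seq_along(msgs), size = 0.75 * length(msgs))
train_msgs <- msgs[train_idx];  train_labels <- labels[train_idx]
test_msgs  <- msgs[-train_idx]; test_labels  <- labels[-train_idx]
## a classifier fit on train_msgs/train_labels is then judged against test_labels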
library(tm)
## Loading required package: NLP
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(stringr)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
spam.dir <- file.path("C:/Users/Patrizia/Desktop/Ambra MSDA/W9/spam")
ham.dir<- file.path("C:/Users/Patrizia/Desktop/Ambra MSDA/W9/easy_ham_2")
spamcorpus <- VCorpus(DirSource(spam.dir))
hamcorpus <- VCorpus(DirSource(ham.dir))
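A quick sanity check on what DirSource() picked up; the exact counts depend on which SpamAssassin folders were downloaded:
length(spamcorpus)                           # number of files read from spam.dir
length(hamcorpus)                            # number of files read from ham.dir
substr(content(spamcorpus[[1]])[1], 1, 80)   # first line of the first spam message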
meta(spamcorpus[[501]])
## author : character(0)
## datetimestamp: 2017-04-16 13:34:49
## description : character(0)
## heading : character(0)
## id : cmds
## language : en
## origin : character(0)
idx <- which(meta(spamcorpus, "id") == 'cmds')
idxh <- which(meta(hamcorpus, "id") == 'cmds')
spamcorpus <- spamcorpus[-idx]
hamcorpus <- hamcorpus[-idxh]
library(wordcloud)
## Loading required package: RColorBrewer
## Clean up the spam corpus and remove boilerplate phrases found at the start and end of each email
spamcorpus1<-tm_map(spamcorpus, content_transformer(function(x) str_replace_all(x, "If you wish to leave this list please use the link below", " ")))
## Remove URLs, email addresses, and extra header text
spamcorpus1<-tm_map(spamcorpus1, content_transformer(function(x) str_replace_all(x, "http.* *|\\S+@\\S+", " ")))
spamcorpus1<- spamcorpus1 %>% tm_map(content_transformer(tolower))
## (?m) makes $ match end-of-line so each header line is dropped; the text is already lowercased, so the patterns are lowercase too
spamcorpus1 <- tm_map(spamcorpus1, content_transformer(function(x) str_replace_all(x, "(?m)from.*$|return-path.*$|delivered.*$|received.*$|reply.*$|to:.*$|date:.*$|x-mailer:.*$|content-transfer-encoding:.*$|x-mime-autoconverted.*$|content-type.*$|mime-version.*$|message-id:.*$|\t.*$|sender:.*$|precedence:.*$|list-id:.*$|x-mailman-version:.*$|x-beenthere:.*$|list maintainer:.*$|subject|font|href|src|nbsp|esmtp|smtp|img|widthd|heightd", " ")))
## Remove HTML tags
spamcorpus1<-tm_map(spamcorpus1, content_transformer(function(x) str_replace_all(x, "<.*?>", " ")))
spamcorpus1<- spamcorpus1 %>%
tm_map(removeNumbers) %>%
tm_map(removeWords, stopwords(kind="en")) %>%
tm_map(removePunctuation) %>%
tm_map(stripWhitespace)
spamcorpus1tdm <- as.matrix(TermDocumentMatrix(spamcorpus1))
word.freq <- sort(rowSums(spamcorpus1tdm), decreasing = TRUE)
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 30,
          max.words = 50, random.order = FALSE)
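The same frequencies behind the word cloud can be listed directly, which is sometimes easier to read than the plot:
## Ten most frequent terms in the cleaned spam corpus
head(data.frame(term = names(word.freq), freq = word.freq, row.names = NULL), 10)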
for(i in seq(length(hamcorpus))){
meta(hamcorpus[[i]], tag = "type")<- "HAM"
}
for(i in seq(length(spamcorpus))){
meta(spamcorpus[[i]], tag = "type")<- "SPAM"
}
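Before combining the two corpora, a quick check that the tag was actually attached:
meta(spamcorpus[[1]], "type")   # should return "SPAM"
meta(hamcorpus[[1]], "type")    # should return "HAM"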
emails<-c(spamcorpus, hamcorpus, recursive=T)
emails<-sample(emails)
emails<- emails %>% tm_map(content_transformer(tolower)) %>%
tm_map(content_transformer(removePunctuation)) %>%
tm_map(content_transformer(stemDocument)) %>%
tm_map(content_transformer(removeNumbers))
emails<-tm_map(emails,removeWords, words = stopwords("en"))
spam_tags <- factor(unlist(meta(emails, "type")))
table(spam_tags)
## spam_tags
## HAM SPAM
## 1400 500
emailstdm<-TermDocumentMatrix(emails)
emailstdm<-removeSparseTerms(emailstdm, .99)
email.df <- as.data.frame(data.matrix(emailstdm),stringsAsFactors=FALSE)
## Keep only terms with a total frequency greater than 3
email.df<- email.df[rowSums(email.df) > 3, ]
## Extract the spam/ham tag from each document's metadata
spam<-c()
for(i in 1:length(emails)){
spam<-c(spam,emails[[i]]$meta$type)
}
N<-nDocs(emailstdm)
## set up model container using a 75/25 split between training and test data. Reference code as per chapter 10 of "Automated Data Collection with R"
container <- create_container(
email.df,
labels = spam,
trainSize = 1:(0.75*N),
testSize = (0.75*N+1):N,
virgin = FALSE
)
slotNames(container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
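One caveat: create_container() treats each row of the supplied matrix as a document, while a TermDocumentMatrix (and email.df built from it) has terms as rows and documents as columns. A hedged sketch of the document-per-row alternative using a DocumentTermMatrix; it is not run here, so the results that follow are unchanged:
## Alternative feature matrix with one row per document (sketch, not run)
emailsdtm <- removeSparseTerms(DocumentTermMatrix(emails), .99)
email.df2 <- as.data.frame(data.matrix(emailsdtm), stringsAsFactors = FALSE)
container2 <- create_container(email.df2, labels = spam,
                               trainSize = 1:(0.75 * N),
                               testSize = (0.75 * N + 1):N,
                               virgin = FALSE)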
emailsdf_train <- email.df[1:1425,]
emailsdf_test <- email.df[1426:1900,]
## Use an SVM classifier to make predictions
svm.model <- train_model(container, "SVM")
svm.output <- classify_model(container, svm.model)
head(svm.output)
## SVM_LABEL SVM_PROB
## 1 HAM 0.7140429
## 2 HAM 0.7118988
## 3 HAM 0.7486656
## 4 HAM 0.7554778
## 5 HAM 0.7206037
## 6 HAM 0.7670432
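RTextTools can also fit several algorithms on the same container and summarize precision and recall with create_analytics(). A sketch of that comparison (not run here; "MAXENT" and "TREE" are standard RTextTools algorithm names):
## Sketch: compare multiple classifiers on the same container (not run)
models    <- train_models(container, algorithms = c("SVM", "MAXENT", "TREE"))
results   <- classify_models(container, models)
analytics <- create_analytics(container, results)
summary(analytics)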
## Since we know the correct labels, we can check how often the algorithm misclassified the emails.
labels_out <- data.frame( correct_label = spam[1426:N], svm = as.character(svm.output[,1]), stringsAsFactors = F)
round(prop.table(table(labels_out[,1] == labels_out[,2])), digits = 3)
##
## FALSE TRUE
## 0.259 0.741
## The SVM classified about 74% of the test emails correctly as either HAM or SPAM
## Performance by type
head(labels_out)
## correct_label svm
## 1 HAM HAM
## 2 HAM HAM
## 3 HAM HAM
## 4 HAM HAM
## 5 HAM HAM
## 6 SPAM HAM
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## annotate(): ggplot2, NLP
## filter(): dplyr, stats
## lag(): dplyr, stats
labels_out_ham <- labels_out %>% filter(correct_label == "HAM")
## SVM performance for ham
table(labels_out_ham[,1] == labels_out_ham[,2])
##
## TRUE
## 352
round(prop.table(table(labels_out_ham[,1] == labels_out_ham[,2])), digits = 3)
##
## TRUE
## 1
labels_out_spam <- labels_out %>% filter(correct_label == "SPAM")
## SVM performance for spam
table(labels_out_spam[,1] == labels_out_spam[,2])
##
## FALSE
## 123
round(prop.table(table(labels_out_spam[,1] == labels_out_spam[,2])), digits = 3)
##
## FALSE
## 1
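The two per-class tables can be collapsed into a single confusion matrix of actual versus predicted labels:
## Rows = actual label, columns = SVM prediction
table(actual = labels_out$correct_label, predicted = labels_out$svm)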
## Yes: all 123 spam messages in the test set were labeled HAM, while all 352 ham messages were classified correctly. One plausible cause is the orientation of email.df noted above: create_container() expects documents as rows, but a TermDocumentMatrix puts terms in the rows, so the labels may not line up with what is actually being classified.