It is often useful to classify new “test” documents using a set of already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam. For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
#Load required packages: tm for corpus handling, RTextTools for training and classification
library(tm)
library(RTextTools)
#Set the working directory for downloads and extraction
getwd()
setwd("C:/Users/rkothari/Documents/MSDA/DATA 607/Assignment_spam_ham/")
#URLs for the SpamAssassin public corpus archives
spam_url <- "https://spamassassin.apache.org/publiccorpus/20030228_spam.tar.bz2"
spam2_url <- "https://spamassassin.apache.org/publiccorpus/20050311_spam_2.tar.bz2"
hard_ham_url <- "https://spamassassin.apache.org/publiccorpus/20030228_hard_ham.tar.bz2"
easy_ham_url <- "https://spamassassin.apache.org/publiccorpus/20030228_easy_ham.tar.bz2"
#Download tarballs (mode = "wb" prevents corrupting binary files on Windows)
download.file(spam_url, destfile="spam.tar.bz2", mode="wb")
download.file(spam2_url, destfile="spam2.tar.bz2", mode="wb")
download.file(hard_ham_url, destfile="hardham.tar.bz2", mode="wb")
download.file(easy_ham_url, destfile="easyham.tar.bz2", mode="wb")
#Extract tarballs (untar detects the bzip2 compression automatically)
untar("spam.tar.bz2")
untar("spam2.tar.bz2")
untar("hardham.tar.bz2")
untar("easyham.tar.bz2")
#Remove the "cmds" index file that ships with each archive, so it is not read into the corpora as a document
if (file.exists("spam/cmds")) file.remove("spam/cmds")
if (file.exists("spam_2/cmds")) file.remove("spam_2/cmds")
if (file.exists("easy_ham/cmds")) file.remove("easy_ham/cmds")
if (file.exists("hard_ham/cmds")) file.remove("hard_ham/cmds")
#Create spam and ham corpora
spam <- Corpus(DirSource("spam"), readerControl = list(language="en"))
spam2 <- Corpus(DirSource("spam_2"), readerControl = list(language="en"))
easy_ham <- Corpus(DirSource("easy_ham"), readerControl = list(language="en"))
hard_ham <- Corpus(DirSource("hard_ham"), readerControl = list(language="en"))
length(spam)
length(spam2)
length(easy_ham)
length(hard_ham)
#Preprocessing options for the document-term matrices built below
tdm_dtm_opts <- list(removePunctuation=TRUE, removeNumbers=TRUE, tolower=TRUE, stopwords=TRUE, wordLengths=c(2, Inf))
#Add meta labels
meta(spam, tag = "type") <- "spam"
meta(easy_ham, tag = "type") <- "easy_ham"
meta(hard_ham, tag = "type") <- "hard_ham"
#Combine corpus objects.
all_corpus <- c(spam, easy_ham, hard_ham, recursive=TRUE)
easy_corpus <- c(spam, easy_ham, recursive=TRUE)
hard_corpus <- c(spam, hard_ham, recursive=TRUE)
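As a quick sanity check, the label distribution of the combined corpus can be tabulated. This mirrors the meta() retrieval pattern used for spam_all_no below:

#Tabulate document labels in the combined corpus
table(unlist(meta(all_corpus, "type")[,1]))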
#Create reduced and randomized corpora (seed fixed for reproducibility)
set.seed(123)
all_corpus_sample <- sample(all_corpus, 500)
easy_corpus_sample <- sample(easy_corpus, 500)
hard_corpus_sample <- sample(hard_corpus, 500)
#Build document-term matrices using the preprocessing options defined above
spam_tdm_all <- DocumentTermMatrix(all_corpus_sample, control = tdm_dtm_opts)
spam_tdm_easy <- DocumentTermMatrix(easy_corpus_sample, control = tdm_dtm_opts)
spam_tdm_hard <- DocumentTermMatrix(hard_corpus_sample, control = tdm_dtm_opts)
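Email document-term matrices tend to be extremely sparse, which can slow model training considerably. An optional step is to drop the rarest terms with tm's removeSparseTerms; the 0.99 threshold and the spam_tdm_all_trim name here are illustrative assumptions, not tuned values:

#Drop terms absent from more than 99% of documents and compare dimensions
dim(spam_tdm_all)
spam_tdm_all_trim <- removeSparseTerms(spam_tdm_all, 0.99)
dim(spam_tdm_all_trim)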
#Collect meta labels
spam_all_no <- unlist(meta(all_corpus_sample, "type")[,1])
head(spam_all_no)
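A quick check of the class balance across the 500 sampled documents (spam vs. easy_ham vs. hard_ham):

#Check how the sampled documents split across classes
table(spam_all_no)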
#Split the 500 sampled documents into training (1:250) and test (251:500) sets;
#virgin = FALSE tells RTextTools that the test documents have known labels
Container_all <- create_container(spam_tdm_all, labels = spam_all_no, trainSize = 1:250, testSize = 251:length(spam_all_no), virgin = FALSE)
#Train three models on the training split
svm_model <- train_model(Container_all, "SVM")
rf_model <- train_model(Container_all, "RF")
maxent_model <- train_model(Container_all, "MAXENT")
#Classify the held-out test documents with each trained model
svm_classify <- classify_model(Container_all, svm_model)
rf_classify <- classify_model(Container_all, rf_model)
maxent_classify <- classify_model(Container_all, maxent_model)
head(svm_classify)
head(rf_classify)
head(maxent_classify)
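The head() output shows each model's predicted label and confidence for the first few test documents. To compare the three models more systematically, RTextTools can summarize precision, recall, and F-scores across the whole test split; a minimal sketch:

#Summarize performance of all three models on the test split
analytics <- create_analytics(Container_all,
                              cbind(svm_classify, rf_classify, maxent_classify))
summary(analytics)

Finally, to score genuinely new documents as described at the outset (for example, messages from your own spam folder), the trained models can be applied to unlabeled data. This is a sketch under stated assumptions: the new_mail directory is hypothetical, and it assumes create_matrix's originalMatrix argument will align the new documents to the training vocabulary.

#Read new, unlabeled messages and project them onto the training vocabulary
new_docs <- sapply(list.files("new_mail", full.names = TRUE),
                   function(f) paste(readLines(f, warn = FALSE), collapse = " "))
new_dtm <- create_matrix(new_docs, originalMatrix = spam_tdm_all)
#virgin = TRUE marks these documents as having no known labels
new_container <- create_container(new_dtm, labels = rep(0, length(new_docs)),
                                  testSize = 1:length(new_docs), virgin = TRUE)
classify_model(new_container, svm_model)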
I referenced the links below, which were very helpful in explaining the text classification process.
http://www.svm-tutorial.com/2014/11/svm-classify-text-r/
http://www.svm-tutorial.com/text-classification-prepare-data/
https://www.r-bloggers.com/classifying-emails-as-spam-or-ham-using-rtexttools/