Setup

#Load the packages used throughout: tm for the corpora and document-term matrices, RTextTools for the containers and models
library(tm)
library(RTextTools)

Assignment

It is often useful to classify new “test” documents using documents that have already been classified (the “training” set). A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam. For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or drawn from another source, such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

Dataload

#Check and set the working directory (adjust this path for your own machine)
getwd()
setwd("C:/Users/rkothari/Documents/MSDA/DATA 607/Assignment_spam_ham/")

spam_url <- "https://spamassassin.apache.org/publiccorpus/20030228_spam.tar.bz2"
spam2_url <- "https://spamassassin.apache.org/publiccorpus/20050311_spam_2.tar.bz2"
hard_ham_url <- "https://spamassassin.apache.org/publiccorpus/20030228_hard_ham.tar.bz2"
easy_ham_url <- "https://spamassassin.apache.org/publiccorpus/20030228_easy_ham.tar.bz2"

#Download tarballs (mode="wb" so the binary archives are not corrupted on Windows)
download.file(spam_url, destfile="spam.tar.bz2", mode="wb")
download.file(spam2_url, destfile="spam2.tar.bz2", mode="wb")
download.file(hard_ham_url, destfile="hardham.tar.bz2", mode="wb")
download.file(easy_ham_url, destfile="easyham.tar.bz2", mode="wb")

#Extract tarballs
untar("spam.tar.bz2")
untar("spam2.tar.bz2")
untar("hardham.tar.bz2")
untar("easyham.tar.bz2")

#Remove the cmds index file shipped in each archive now, before the corpora are built, so it is not read in as a document
if (file.exists("spam/cmds")) file.remove("spam/cmds")
if (file.exists("spam_2/cmds")) file.remove("spam_2/cmds")
if (file.exists("easy_ham/cmds")) file.remove("easy_ham/cmds")
if (file.exists("hard_ham/cmds")) file.remove("hard_ham/cmds")

#Create spam and ham corpora (the messages are in English)
spam <- Corpus(DirSource("spam"), readerControl = list(language="en"))
spam2 <- Corpus(DirSource("spam_2"), readerControl = list(language="en"))
easy_ham <- Corpus(DirSource("easy_ham"), readerControl = list(language="en"))
hard_ham <- Corpus(DirSource("hard_ham"), readerControl = list(language="en"))
length(spam)
length(spam2)
length(easy_ham)
length(hard_ham)
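
To sanity-check what was read in, it can help to peek at one raw message; inspect() is tm's built-in viewer, and the index 1 is arbitrary:

#Inspect the first spam message to confirm the e-mails were read correctly
inspect(spam[1])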

Cleaning and filtering text

#Text-cleaning options for the document-term matrices below; wordLengths=c(2, Inf) is tm's replacement for the old minWordLength option
tdm_dtm_opts <- list(removePunctuation=TRUE, removeNumbers=TRUE, tolower=TRUE, stopwords=TRUE, wordLengths=c(2, Inf))

#Add meta labels
meta(spam, tag = "type") <- "spam"
meta(easy_ham, tag = "type") <- "easy_ham"
meta(hard_ham, tag = "type") <- "hard_ham"
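
A quick check, not part of the original steps, that the labels were attached; meta() returns the per-document metadata as a data frame, which is also how the labels are collected again below:

#Confirm each document now carries a type label
head(meta(spam, "type"))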

Creating document-term matrices

#Combine the corpora into single corpus objects (recursive=TRUE flattens the pieces into one corpus)

all_corpus <- c(spam, easy_ham, hard_ham, recursive=TRUE)
easy_corpus <- c(spam, easy_ham, recursive=TRUE)
hard_corpus <- c(spam, hard_ham, recursive=TRUE)

#Create reduced, randomized corpora (fix the RNG seed so the 500-document samples are reproducible; the seed value is arbitrary)

set.seed(123)
all_corpus_sample <- sample(all_corpus, 500)
easy_corpus_sample <- sample(easy_corpus, 500)
hard_corpus_sample <- sample(hard_corpus, 500)

#Build the document-term matrices, passing in the cleaning options defined earlier

spam_tdm_all <- DocumentTermMatrix(all_corpus_sample, control = tdm_dtm_opts)
spam_tdm_easy <- DocumentTermMatrix(easy_corpus_sample, control = tdm_dtm_opts)
spam_tdm_hard <- DocumentTermMatrix(hard_corpus_sample, control = tdm_dtm_opts)
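
These matrices are typically very sparse. As an optional extra step, tm's removeSparseTerms() can drop terms that appear in almost no documents; the 0.99 threshold and the _small name are illustrative choices:

#Drop terms absent from more than 99% of documents and compare dimensions
spam_tdm_all_small <- removeSparseTerms(spam_tdm_all, 0.99)
dim(spam_tdm_all)
dim(spam_tdm_all_small)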

Creating containers and dividing the data into training and testing sets

#Collect meta labels

spam_all_no <- unlist(meta(all_corpus_sample, "type")[,1])
head(spam_all_no)
#First 250 sampled documents train the models, the rest are held out for testing; virgin = FALSE tells RTextTools the test labels are known, so the results can be scored
Container_all <- create_container(spam_tdm_all, labels = spam_all_no,
                                  trainSize = 1:250, testSize = 251:length(spam_all_no),
                                  virgin = FALSE)
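
Because sample() shuffles the combined corpus, the first 250 documents should already contain a mix of spam and ham, but it is worth verifying before training; this check is an addition to the original script:

#Check the class mix in the training and test splits
table(spam_all_no[1:250])
table(spam_all_no[251:length(spam_all_no)])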

Training and classification

Training

#Train three models on the training split: a support vector machine, a random forest, and a maximum entropy classifier

svm_model <- train_model(Container_all, "SVM")
rf_model <- train_model(Container_all, "RF")
maxent_model <- train_model(Container_all, "MAXENT")
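
As an optional check that is not in the original workflow, RTextTools can also k-fold cross-validate an algorithm directly on the container; the fold count of 4 is an arbitrary choice:

#4-fold cross-validation of the SVM on the same container
svm_cv <- cross_validate(Container_all, 4, "SVM")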

Classification

#Classify the held-out test documents with each trained model
svm_classify <- classify_model(Container_all, svm_model)
rf_classify <- classify_model(Container_all, rf_model)
maxent_classify <- classify_model(Container_all, maxent_model)
head(svm_classify)
head(rf_classify)
head(maxent_classify)
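
A natural next step, sketched here as an addition to the original write-up, is to score the three classifiers against the true test labels with RTextTools' create_analytics(); summary() then reports precision, recall, and F-scores for each algorithm:

#Compare all three sets of predictions against the true labels of the test documents
analytics <- create_analytics(Container_all, cbind(svm_classify, rf_classify, maxent_classify))
summary(analytics)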