Project 4

Goal

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

Select the dataset and unzip them

url_spam <- "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
file_spam <- "20050311_spam_2.tar.bz2"
file_spam2<-"20050311_spam_2.tar"

url_ham <- "http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2"
file_ham <- "20030228_easy_ham_2.tar.bz2"
file_ham2 <- "20030228_easy_ham_2.tar"

download.file(url_spam, file_spam)
download.file(url_ham, file_ham)

#Found the below function here : https://rdrr.io/github/jefferislab/jimpipeline/src/R/bunzip2.R 
bunzip2 <- function(f, keep=TRUE) {
  args=shQuote(path.expand(f))
  if (keep) args=c("--keep", args)
  bunzip2cmd=Sys.which('bunzip2')
  if(!nzchar(bunzip2cmd)) stop("Unable to find bunzip2!")
  system2(bunzip2cmd, args = args)
  invisible(tools::file_path_sans_ext(f))
}

bunzip2(file_spam)
bunzip2(file_ham)
untar(file_ham2, exdir="spamham")
untar(file_spam2, exdir = "spamham")

Get the filename

spam_dir<-"spamham\\spam_2\\"
ham_dir<-"spamham\\easy_ham_2\\"
spam_list=list.files(spam_dir)
ham_list=list.files(ham_dir)

Remove .cmd files

spam_list = spam_list[which(spam_list!="cmds")]
ham_list=ham_list[which(ham_list!="cmds")]

Functions for creating, cleaning and adding tags to the corpus

vcorpus_df <- function(file_path) {
  corpus <- file_path %>%                            
    paste(., list.files(.), sep = "/") %>%          # Create a vector of file paths 
    lapply(readLines) %>%                           # Read the text in each file
    VectorSource() %>%                              # Turn into VectorSource
    VCorpus()                                       # Turn into VCorpus
  return(corpus)
}

clean <- function(corpus) {
    corpus <- corpus %>%
    tm_map(removeNumbers) %>%                       # Remove numbers
    tm_map(removePunctuation) %>%                   # Remove punctuation symbols 
    tm_map(tolower) %>%                             # Transform  to lowercase
    tm_map(PlainTextDocument) %>%                   # Transform back to PlainTextDocument
    tm_map(removeWords, stopwords("en")) %>%        # Remove stopwords
    tm_map(stripWhitespace) %>%                     # Remove white spaces
    tm_map(stemDocument)                            # Reduce to stems
  return(corpus)
}

tag_df <- function(corpus, tag, value){
  for (i in 1:length(corpus)){
    meta(corpus[[i]], tag) <- value                    # Add the value to the specified tag
  }
  return(corpus)
}

Creating corpus for Spam and Ham

The functions defined above are used to create the Spam and Ham corpus

ham_corpus <- ham_dir%>%
   vcorpus_df %>% 
   clean  %>% 
   tag_df(tag = "ham_spam", value = "ham")


spam_corpus <- spam_dir %>%
  vcorpus_df %>%
  clean %>%
  tag_df(tag = "ham_spam", value = "spam")


spamham_corpus <- c(ham_corpus, spam_corpus) #Join both the corpus

Check proportion of Spam and Ham

spamham_corpus_p <- spamham_corpus %>%
  meta(tag = "ham_spam") %>%
  unlist() %>%
  table() 
spamham_corpus_p

## .
##  ham spam 
## 1401 1397

Word Cloud

wordcloud(ham_corpus,max.words = 100, min.freq=1000)

wordcloud(spam_corpus,max.words = 100, min.freq=1000)

Training the data

create the document term matrix and remove sparse terms. Terms that appear in 10 or more documents are only included.

A container is created with 70% of the data.

spamham_dtm <- spamham_corpus %>% 
                    DocumentTermMatrix() %>% 
                    removeSparseTerms(1-(10/length(spamham_corpus)))

spamham_label <- unlist(meta(spamham_corpus, "ham_spam"))


N <- length(spamham_label)
split <- round(0.7*N) 
container <- create_container(spamham_dtm, labels = spamham_label, trainSize = 1:split, testSize = (split+1):N, virgin = F)

Using the training algorithms - SVM, TREE, BAGGING, BOOSTING, GLMNET, RF and SLDA

spamham_svm <- train_model(container, "SVM")
spamham_tree <- train_model(container, "TREE")
spamham_bag <- train_model(container, "BAGGING")
spamham_boost <- train_model(container, "BOOSTING")
spamham_glmnet <- train_model(container, "GLMNET")
spamham_rf <- train_model(container, "RF")
spamham_slda <- train_model(container, "SLDA")

Testing

# Classifying using the trained models
spamham_svm_class <- classify_model(container, spamham_svm)
spamham_tree_class <- classify_model(container, spamham_tree)
spamham_bag_class <- classify_model(container, spamham_bag)
spamham_boost_class <- classify_model(container, spamham_boost)
spamham_glmnet_class <- classify_model(container, spamham_glmnet)
spamham_rd_class <- classify_model(container, spamham_rf)
spamham_slda_class <- classify_model(container, spamham_slda)


# Collect the classification results into a data frame 
spamham_class <- data.frame(
  original = spamham_label[(split+1):N],
  svm = as.character(spamham_svm_class[,1]),
  tree = as.character(spamham_tree_class[,1]),
  bag = as.character(spamham_bag_class[,1]),
  boost = as.character(spamham_boost_class[,1]),
  glmnet = as.character(spamham_glmnet_class[,1]),
  rd = as.character(spamham_rd_class[,1]),
  slda = as.character(spamham_slda_class[,1]))
  
# Print results
for (i in 1:8){
  print(names(spamham_class)[i])
  table(spamham_class[,1] == spamham_class[,i]) %>% 
    print() %>% 
    prop.table() %>% 
    round(2) %>% 
    kable()
}

## [1] "original"
## 
## TRUE 
##  839 
## [1] "svm"
## 
## FALSE  TRUE 
##   295   544 
## [1] "tree"
## 
## FALSE  TRUE 
##   562   277 
## [1] "bag"
## 
## FALSE  TRUE 
##   502   337 
## [1] "boost"
## 
## FALSE  TRUE 
##   224   615 
## [1] "glmnet"
## 
## FALSE  TRUE 
##   476   363 
## [1] "rd"
## 
## FALSE  TRUE 
##   281   558 
## [1] "slda"
## 
## FALSE  TRUE 
##   319   520

Conclusion

From the above results it is observed that BOOST results are most accurate having 73% accuracy.