It can be useful to classify new “test” documents using already-classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
require(knitr)
## Loading required package: knitr
require(plyr)
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 4.0.4
require(tm)
## Loading required package: tm
## Warning: package 'tm' was built under R version 4.0.4
## Loading required package: NLP
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.5
require(class)
## Loading required package: class
require(tidyr)
## Loading required package: tidyr
require(tidyverse)
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v dplyr 1.0.4
## v tibble 3.0.6 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
require(caret)
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
require(RTextTools)
## Loading required package: RTextTools
## Warning: package 'RTextTools' was built under R version 4.0.5
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 4.0.4
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
I downloaded and extracted the docs into a folder named spamham on my local D: drive.
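The download and extraction step can also be scripted. A minimal sketch, assuming the archive file names currently shown in the listing at the URL above (adjust them if the listing changes); untar() handles the .tar.bz2 compression:
# Download and extract the two corpora (file names assumed from the archive listing)
dir.create("D:/spamham", showWarnings = FALSE)
base_url <- "https://spamassassin.apache.org/old/publiccorpus"
download.file(paste0(base_url, "/20050311_spam_2.tar.bz2"),
              "D:/spamham/spam_2.tar.bz2", mode = "wb")
download.file(paste0(base_url, "/20030228_easy_ham.tar.bz2"),
              "D:/spamham/easy_ham.tar.bz2", mode = "wb")
untar("D:/spamham/spam_2.tar.bz2", exdir = "D:/spamham")   # creates spam_2/
untar("D:/spamham/easy_ham.tar.bz2", exdir = "D:/spamham") # creates easy_ham/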
# Directories
spam_folder <- 'D:/spamham/spam_2'
ham_folder <- 'D:/spamham/easy_ham'
# Full paths to the individual docs
spam_docs <- list.files(path = spam_folder, full.names = TRUE)
ham_docs <- list.files(path = ham_folder, full.names = TRUE)
# Ignore the 'cmds' file that ships with the corpus folders; list.files()
# returns full paths here, so compare against the base file name
spam_docs <- spam_docs[basename(spam_docs) != "cmds"]
ham_docs <- ham_docs[basename(ham_docs) != "cmds"]
# Count of spam and ham docs
length(spam_docs)
## [1] 1397
length(ham_docs)
## [1] 2500
# Function to create a corpus from a vector of document paths
# (so the 'cmds' filtering above is actually respected)
corpusCreate <- function(doc_paths) {
  corpus <- doc_paths %>%
    lapply(readLines, warn = FALSE) %>% # Read the text of each doc
    VectorSource() %>%                  # Create a VectorSource from the read text
    VCorpus()                           # Create a VCorpus
  return(corpus)
}
# Function to clean the data
fileClean <- function(corpus) {
  corpus <- corpus %>%
    tm_map(removeNumbers) %>%                # Remove numbers
    tm_map(removePunctuation) %>%            # Remove punctuation symbols
    tm_map(content_transformer(tolower)) %>% # Lowercase; content_transformer()
                                             # keeps docs as PlainTextDocuments,
                                             # so no re-wrapping is needed
    tm_map(removeWords, stopwords("en")) %>% # Remove English stopwords
    tm_map(stripWhitespace) %>%              # Collapse extra white space
    tm_map(stemDocument)                     # Reduce words to their stems
  return(corpus)
}
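As a quick sanity check, the cleaning pipeline can be run on a small made-up corpus (hypothetical example documents, not part of the dataset):
# Toy corpus to illustrate what fileClean() does
toy <- VCorpus(VectorSource(c("Get 1000 FREE dollars NOW!!!",
                              "Meeting notes for the Tuesday review.")))
toy_clean <- fileClean(toy)
content(toy_clean[[1]]) # e.g. "get free dollar now": numbers/punctuation gone, lowercased, stemmed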
# Function to set a metadata tag to a given value on every document in a corpus
tagValue <- function(corpus, tag, value){
  for (i in seq_along(corpus)){
    meta(corpus[[i]], tag) <- value
  }
  return(corpus)
}
# Create the two corpora
# ham corpus
ham_corpus <- ham_docs %>%
  corpusCreate %>%
  fileClean %>%
  tagValue(tag = "ham_spam", value = "ham")
# spam corpus
spam_corpus <- spam_docs %>%
  corpusCreate %>%
  fileClean %>%
  tagValue(tag = "ham_spam", value = "spam")
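A quick check that the metadata tag was applied to the documents:
meta(spam_corpus[[1]], "ham_spam") # should return "spam"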
# Join the two corpora
combined_corpus <- c(ham_corpus, spam_corpus)
# Scramble the order of the documents
combined_corpus <- combined_corpus[sample(length(combined_corpus))]
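Note that the shuffle (and therefore the 80/20 split below) is random. For a reproducible run, a seed could be fixed before the sample() call above, e.g.:
set.seed(123) # hypothetical seed; any fixed value makes the shuffle repeatable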
# Create the document-term matrix and remove sparse terms
corpus_dtm <- combined_corpus %>%
  DocumentTermMatrix() %>%
  removeSparseTerms(1 - (10/length(combined_corpus))) # Terms appearing in fewer than 10 documents are dropped
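The dimensions of the resulting matrix give a quick sanity check (the row count should equal the corpus length; the column count depends on the sparsity cutoff):
dim(corpus_dtm)                 # rows = documents, columns = retained terms
findFreqTerms(corpus_dtm, 1000) # spot-check: terms appearing at least 1000 times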
# Retrieve the spam/ham labels
corpus_labels <- unlist(meta(combined_corpus, "ham_spam"))
# Create the container and split the dataset: 80% for training, 20% for testing
label_num <- length(corpus_labels)
split <- round(0.8 * label_num)
container <- create_container(
  corpus_dtm,
  labels = corpus_labels,
  trainSize = 1:split,
  testSize = (split + 1):label_num,
  virgin = FALSE
)
I will train three different classification algorithms: a decision tree (TREE), boosting (BOOSTING), and a support vector machine (SVM).
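(RTextTools supports several other learners as well; the full list can be printed with print_algorithms().)
print_algorithms() # lists every algorithm accepted by train_model()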
corpus_tree_model <- train_model(container, "TREE")
corpus_boosting_model <- train_model(container, "BOOSTING")
corpus_svm_model <- train_model(container, "SVM")
# Classify the test documents using the trained models
corpus_tree_out <- classify_model(container, corpus_tree_model)
corpus_boosting_out <- classify_model(container, corpus_boosting_model)
corpus_svm_out <- classify_model(container, corpus_svm_model)
# Collect the predicted labels alongside the true labels of the test split
corpus_out_labels <- data.frame(
  correct_label = corpus_labels[(split+1):label_num],
  tree = as.character(corpus_tree_out[,1]),
  boosting = as.character(corpus_boosting_out[,1]),
  svm = as.character(corpus_svm_out[,1]))
# Print, for each model, the counts and proportions of correct predictions
for (i in 2:4){
  print(names(corpus_out_labels)[i])
  table(corpus_out_labels[,1] == corpus_out_labels[,i]) %>%
    print() %>%
    prop.table() %>%
    round(2) %>%
    print()
}
## [1] "tree"
##
## FALSE TRUE
## 10 770
##
## FALSE TRUE
## 0.01 0.99
## [1] "boosting"
##
## FALSE TRUE
## 4 776
##
## FALSE TRUE
## 0.01 0.99
## [1] "svm"
##
## FALSE TRUE
## 3 777
##
## FALSE TRUE
## 0 1
All three classifiers achieved roughly 99% accuracy on the 780 withheld test documents (tree: 770 correct, boosting: 776, SVM: 777).
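Since overall accuracy can hide asymmetric errors (spam slipping through as ham versus legitimate mail blocked as spam), a natural follow-up is a per-class breakdown. A sketch using confusionMatrix() from caret, which is already loaded:
# Per-class breakdown (sensitivity/specificity) for the SVM predictions
confusionMatrix(
  factor(corpus_out_labels$svm, levels = c("ham", "spam")),
  factor(corpus_out_labels$correct_label, levels = c("ham", "spam"))
)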