library(tm)
library(knitr)
library(dplyr)
library(tidytext)
library(ggplot2)
library(kableExtra)
library(wordcloud)
library(tidyverse)
library(readtext)
library(caTools)
library(randomForest)

spam_directory <- "/Users/stephenhaslett/Data607/data_607_project_4/spam_ham_training/spam"
ham_directory <- "/Users/stephenhaslett/Data607/data_607_project_4/spam_ham_training/easy_ham"

Assignment Overview

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

List the first 6 Ham files in the “spam_ham_training/easy_ham” directory.

ham_training_files <- list.files(ham_directory)
head(ham_training_files)
## [1] "00001.7c53336b37003a9286aba55d2945844c"
## [2] "00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "00003.860e3c3cee1b42ead714c5c874fe25f7"
## [4] "00004.864220c5b6930b209cc287c361c99af1"
## [5] "00005.bf27cdeaf0b8c4647ecd61b1d09da613"
## [6] "00006.253ea2f9a9cc36fa0b1129b04b806608"

List the first 6 Spam files in the “spam_ham_training/spam” directory.

spam_training_files <- list.files(spam_directory)
head(spam_training_files)
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"

Define a function that removes email headers so that we are left with clean data for analysis.

#' Strips header data from emails.
#'
#' Removes irrelevant header information from emails so we
#' are left with clean body text for accurate test/training comparison.
#'
#' @param email Email content.
#'
#' @return Email body content free from header data.
#'
strip_email_headers <- function(email) {
  message <- str_split(email, "\n\n") %>% unlist()
  email_body_content <- paste(message[2:length(message)], collapse = ' ')

  return(email_body_content)
}
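
A quick sanity check (a minimal sketch with a made-up email; the address and subject are hypothetical) shows the headers being dropped:

raw_email <- "From: sender@example.com\nSubject: Hello\n\nThis is the body text."
strip_email_headers(raw_email)
## [1] "This is the body text."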

Define a function that retrieves data from the training files and returns a data frame that we can use for further analysis.

#' Retrieves data from spam and ham training files.
#'
#' Pulls in email training data from the given directory, cleans the data for analysis,
#' and returns it as a data frame.
#'
#' @param directory String: The path to the ham/spam training files directory.
#' @param type String: The type of email data being passed.
#'
#' @return A data frame of email training file data.
#'
fetch_training_file_data <- function(directory, type) {
  training_files <- list.files(directory)
  message_content <- list()

  for (file in seq_along(training_files)) {
    file_path <- paste0(directory, '/', training_files[file])
    training_email <- suppressWarnings(readtext(file_path)$text)
    training_email_body <- strip_email_headers(training_email)
    training_email_body <- gsub("<.*?>", " ", training_email_body)
    message <- list(paste(training_email_body, collapse = '\n'))
    message_content <- c(message_content, message)
  }
  
  # When we perform our prediction analysis of the data, the randomForest() function expects a
  # numeric label, so we create a new column called "document_class", and populate it with
  # numeric values based on the value of the "type" parameter (0 for ham, 1 for spam).
  if (type == 'ham') {
    document_class <- 0
  } else if (type == 'spam') {
    document_class <- 1
  }

  training_emails <- as.data.frame(unlist(message_content), stringsAsFactors = FALSE)
  training_emails$message_type <- type
  training_emails$document_class <- document_class
  colnames(training_emails) <- c('message', 'message_type', 'document_class')
 
  return(training_emails)
}
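
As a quick structural check (a sketch; the number of rows depends on the directory contents):

str(fetch_training_file_data(ham_directory, 'ham'))

This should report one row per email, with a character message column, a character message_type column, and a numeric document_class column.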

Define a function that converts a vector of email messages into a clean corpus.

#' Converts spam/ham message text to a corpus.
#'
#' Pulls in a vector of email message texts, cleans the data,
#' and returns it as a corpus.
#'
#' @param email_data Character vector: The ham/spam email message texts.
#' @param type String: The type of email data being passed.
#'
#' @return A corpus of training email data.
#'
create_training_corpus <- function(email_data, type) {
  # Create a corpus of email content and clean the data for analysis purposes.
  training_corpus <- VCorpus(VectorSource(email_data))
  
  if (type != 'combined') {
    meta(training_corpus, tag = 'messageType') <- type
  }

  training_corpus <- tm_map(training_corpus, content_transformer(function(x) iconv(x, "UTF-8", "ASCII", sub = "")))
  training_corpus <- tm_map(training_corpus, content_transformer(tolower))
  training_corpus <- tm_map(training_corpus, stripWhitespace)
  training_corpus <- tm_map(training_corpus, PlainTextDocument)
  training_corpus <- tm_map(training_corpus, removePunctuation)
  training_corpus <- tm_map(training_corpus, removeNumbers)
  training_corpus <- tm_map(training_corpus, removeWords, stopwords("english"))

  return(training_corpus)
}
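
To make the effect of the cleaning pipeline concrete, here is a minimal sketch on a made-up two-message vector (the messages and variable names are hypothetical):

toy_messages <- c('Click HERE to WIN $1000!!!', 'Meeting moved to 3pm, see you there.')
toy_corpus <- create_training_corpus(toy_messages, 'combined')
content(toy_corpus[[1]])

The first document should come back as roughly "click win": lowercased, with punctuation, numbers, and stopwords removed (a few extra spaces remain because removeWords runs after stripWhitespace).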

Calculate the frequency of ham terms and display them in a table.

# Create the Ham dataframe.
ham_dataframe <- fetch_training_file_data(ham_directory, 'ham')

# Create the ham corpus and term matrix.
# Build the corpus from the message text only; passing the whole data frame
# would treat each column (including the labels) as a document.
ham_corpus <- create_training_corpus(ham_dataframe$message, 'ham')
  
ham_term_matrix <- DocumentTermMatrix(ham_corpus)
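# Keep only terms that appear in at least ~1% of the documents (sparsity below 0.99).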
ham_term_matrix <- removeSparseTerms(ham_term_matrix, 0.99)
ham_ordered <- as.matrix(ham_term_matrix)
ham_term_frequency <- colSums(ham_ordered)
ham_term_frequency <- sort(ham_term_frequency, decreasing = T)
ham_data <- head(ham_term_frequency, 35)
kable(ham_data, "html", escape = F) %>%
  kable_styling("striped", full_width = T) %>%
  column_spec(1, bold = T)
Term         Frequency
ham               2600
can               1371
list              1211
will              1181
just              1076
get               1007
one                993
use                989
like               927
wrote              871
dont               830
date               796
people             778
mailing            777
new                765
time               714
now                681
email              678
message            654
url                639
also               603
think              556
said               548
way                547
even               518
make               493
work               471
using              466
know               450
see                445
spam               437
first              435
good               427
sep                425
world              419

Display ham term frequencies in a bar plot.

ham_term_freq <- data.frame(Term = names(ham_term_frequency), Frequency = ham_term_frequency)
ham_term_plot <- ggplot(subset(ham_term_freq, Frequency > 500), aes(x = reorder(Term, - Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = '#4CAF50') +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) + 
  theme(panel.background = element_rect(fill = '#FFFFFF'))

ham_term_plot

Display ham term frequencies in a word cloud.

ham_terms <- names(ham_term_frequency)
wordcloud(ham_terms[1:50], ham_term_frequency[1:50], random.color = TRUE, colors = palette())

Calculate the frequency of spam terms and display them in a table.

# Create the Spam dataframe.
spam_dataframe <- fetch_training_file_data(spam_directory, 'spam')

# Create the spam corpus and term matrix.
# As with the ham data, build the corpus from the message text only.
spam_corpus <- create_training_corpus(spam_dataframe$message, 'spam')

spam_term_matrix <- DocumentTermMatrix(spam_corpus)
spam_term_matrix <- removeSparseTerms(spam_term_matrix, 0.99)
spam_ordered <- as.matrix(spam_term_matrix)
spam_term_frequency <- colSums(spam_ordered)
spam_term_frequency <- sort(spam_term_frequency, decreasing = T)
spam_data <- head(spam_term_frequency, 35)
kable(spam_data, "html", escape = F) %>%
  kable_styling("striped", full_width = T) %>%
  column_spec(1, bold = T)
Term         Frequency
email             2664
will              2253
nbsp              1817
free              1590
spam              1500
can               1431
click             1423
please            1174
business          1154
get               1122
list              1010
money              940
information        886
now                834
receive            820
one                806
order              788
address            754
new                744
just               728
people             713
name               687
time               670
send               661
make               631
credit             623
home               621
mailing            603
want               578
removed            576
message            561
internet           558
dont               530
web                530
may                521

Display spam term frequencies in a bar plot.

spam_term_freq <- data.frame(Term = names(spam_term_frequency), Frequency = spam_term_frequency)
spam_term_plot <- ggplot(subset(spam_term_freq, Frequency > 500), aes(x = reorder(Term, - Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = '#DC143C') +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) + 
  theme(panel.background = element_rect(fill = '#FFFFFF'))

spam_term_plot

Display spam term frequencies in a word cloud.

spam_terms <- names(spam_term_frequency)
wordcloud(spam_terms[1:50], spam_term_frequency[1:50], random.color = TRUE, colors = palette())

Prepare the data for prediction analysis by merging the spam and ham data frames.

combined_dataframe <- rbind(spam_dataframe, ham_dataframe)
combined_corpus <- create_training_corpus(combined_dataframe$message, 'combined') 

combined_term_matrix <- DocumentTermMatrix(combined_corpus)
combined_term_matrix <- removeSparseTerms(combined_term_matrix, 0.99)

combined_dataset <- as.data.frame(as.matrix(combined_term_matrix))
combined_dataset$document_class <- combined_dataframe$document_class

Randomize the newly combined data frame and split it into two separate datasets, training and testing (sample.split preserves the spam/ham class ratio in both subsets).

# Shuffle the rows (not the columns) so spam and ham examples are interleaved.
randomized_dataset <- combined_dataset[sample(nrow(combined_dataset)), ]

split <- sample.split(randomized_dataset$document_class, SplitRatio = 0.8)
training_dataset <- subset(randomized_dataset, split == TRUE)
testing_dataset <- subset(randomized_dataset, split == FALSE)

Perform the prediction analysis using the Random Forest model.

# Identify the label column by name so it can be excluded from the predictors.
label_column <- which(colnames(training_dataset) == 'document_class')
classification <- randomForest(x = training_dataset[-label_column], y = training_dataset$document_class, ntree = 3)

spam_prediction <- predict(classification, newdata = testing_dataset[-label_column])

# The forest is fit in regression mode on a 0/1 label, so its predictions are
# scores between 0 and 1; classify scores above 0.5 as spam.
confusion_matrix <- table(spam_prediction > 0.5, testing_dataset$document_class)
colnames(confusion_matrix) <- c('HAM', 'SPAM')

confusion_matrix
##        
##         HAM SPAM
##   FALSE 245    0
##   TRUE  255  280
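
Because document_class is numeric, randomForest() runs here in regression mode. As an alternative (a minimal sketch, not run above), converting the label to a factor makes the forest run in classification mode, so predict() returns class labels directly:

classification_rf <- randomForest(x = training_dataset[-label_column],
                                  y = as.factor(training_dataset$document_class),
                                  ntree = 3)
class_prediction <- predict(classification_rf, newdata = testing_dataset[-label_column])
table(class_prediction, testing_dataset$document_class)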

Calculate the accuracy of the model. Correct predictions are the 245 ham messages in the FALSE row plus the 280 spam messages in the TRUE row, out of the 780 messages in the test set (525 / 780 ≈ 67.31%).

success <- confusion_matrix['TRUE', 2] + confusion_matrix['FALSE', 1]
accuracy_percentage <- success / nrow(testing_dataset) * 100
accuracy <- round(accuracy_percentage, 2)
paste('Our prediction model has an accuracy of ', accuracy, '%.', sep = '')
## [1] "Our prediction model has an accuracy of 67.31%."