library(tm)
library(knitr)
library(dplyr)
library(tidytext)
library(ggplot2)
library(kableExtra)
library(wordcloud)
library(tidyverse)
library(readtext)
library(caTools)
library(randomForest)

spam_directory <- "/Users/stephenhaslett/Data607/data_607_project_4/spam_ham_training/spam"
ham_directory <- "/Users/stephenhaslett/Data607/data_607_project_4/spam_ham_training/easy_ham"

Assignment Overview

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

List the first 6 Ham files in the “spam_ham_training/easy_ham” directory.

ham_training_files <- list.files(ham_directory)
head(ham_training_files)
## [1] "00001.7c53336b37003a9286aba55d2945844c"
## [2] "00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "00003.860e3c3cee1b42ead714c5c874fe25f7"
## [4] "00004.864220c5b6930b209cc287c361c99af1"
## [5] "00005.bf27cdeaf0b8c4647ecd61b1d09da613"
## [6] "00006.253ea2f9a9cc36fa0b1129b04b806608"

List the first 6 Spam files in the “spam_ham_training/spam” directory.

spam_training_files <- list.files(spam_directory)
head(spam_training_files)
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"

Define a function that removes email headers so that we are left with clean data for analysis.

#' Strips header data from emails.
#'
#' Removes irrelevant header information from emails so we
#' are left with clean body text for accurate test/training comparison.
#'
#' @param email Email content.
#'
#' @return Email body content free from header data.
#'
strip_email_headers <- function(email) {
  message <- str_split(email, "\n\n") %>% unlist()
  email_body_content <- paste(message[2:length(message)], collapse = ' ')

  return(email_body_content)
}
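
A quick sanity check (a minimal sketch with a made-up email; the address and subject are hypothetical) shows the headers being dropped:

raw_email <- "From: sender@example.com\nSubject: Hello\n\nThis is the body text."
strip_email_headers(raw_email)
## [1] "This is the body text."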

Define a function that retrieves data from the training files and returns a data frame that we can use for further analysis.

#' Retrieves data from spam and ham training files.
#'
#' Pulls in email training data from the given directory, cleans the data for analysis,
#' and returns it as a data frame.
#'
#' @param directory String: The path to the ham/spam training files directory.
#' @param type String: The type of email data being passed.
#'
#' @return A data frame of email training file data.
#'
fetch_training_file_data <- function(directory, type) {
  training_files <- list.files(directory)
  message_content <- list()

  for (file in seq_along(training_files)) {
    file_path <- paste0(directory, '/', training_files[file])
    training_email <- suppressWarnings(readtext(file_path)$text)
    training_email_body <- strip_email_headers(training_email)
    training_email_body <- gsub("<.*?>", " ", training_email_body)
    message <- list(paste(training_email_body, collapse = '\n'))
    message_content <- c(message_content, message)
  }
  
  # When we perform our prediction analysis of the data, the randomForest() function expects a
  # numeric label, so we create a new column called "document_class", and populate it with
  # numeric values based on the value of the "type" parameter (0 for ham, 1 for spam).
  if (type == 'ham') {
    document_class <- 0
  } else if (type == 'spam') {
    document_class <- 1
  }

  training_emails <- as.data.frame(unlist(message_content), stringsAsFactors = FALSE)
  training_emails$message_type <- type
  training_emails$document_class <- document_class
  colnames(training_emails) <- c('message', 'message_type', 'document_class')
 
  return(training_emails)
}
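
As a quick structural check (a sketch; the number of rows depends on the directory contents):

str(fetch_training_file_data(ham_directory, 'ham'))

This should report one row per email, with a character message column, a character message_type column, and a numeric document_class column.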

Define a function that converts a vector of email messages into a clean corpus.

#' Converts spam/ham message text to a corpus.
#'
#' Pulls in a vector of email message texts, cleans the data,
#' and returns it as a corpus.
#'
#' @param email_data Character vector: The ham/spam email message texts.
#' @param type String: The type of email data being passed.
#'
#' @return A corpus of training email data.
#'
create_training_corpus <- function(email_data, type) {
  # Create a corpus of email content and clean the data for analysis purposes.
  training_corpus <- VCorpus(VectorSource(email_data))
  
  if (type != 'combined') {
    meta(training_corpus, tag = 'messageType') <- type
  }

  training_corpus <- tm_map(training_corpus, content_transformer(function(x) iconv(x, "UTF-8", "ASCII", sub = "")))
  training_corpus <- tm_map(training_corpus, content_transformer(tolower))
  training_corpus <- tm_map(training_corpus, stripWhitespace)
  training_corpus <- tm_map(training_corpus, PlainTextDocument)
  training_corpus <- tm_map(training_corpus, removePunctuation)
  training_corpus <- tm_map(training_corpus, removeNumbers)
  training_corpus <- tm_map(training_corpus, removeWords, stopwords("english"))

  return(training_corpus)
}
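
To make the effect of the cleaning pipeline concrete, here is a minimal sketch on a made-up two-message vector (the messages and variable names are hypothetical):

toy_messages <- c('Click HERE to WIN $1000!!!', 'Meeting moved to 3pm, see you there.')
toy_corpus <- create_training_corpus(toy_messages, 'combined')
content(toy_corpus[[1]])

The first document should come back as roughly "click win": lowercased, with punctuation, numbers, and stopwords removed (a few extra spaces remain because removeWords runs after stripWhitespace).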

Calculate the frequency of ham terms and display them in a table.

# Create the Ham dataframe.
ham_dataframe <- fetch_training_file_data(ham_directory, 'ham')

# Create the ham corpus and term matrix.
# Build the corpus from the message text only; passing the whole data frame
# would treat each column (including the labels) as a document.
ham_corpus <- create_training_corpus(ham_dataframe$message, 'ham')
  
ham_term_matrix <- DocumentTermMatrix(ham_corpus)
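# Keep only terms that appear in at least ~1% of the documents (sparsity below 0.99).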
ham_term_matrix <- removeSparseTerms(ham_term_matrix, 0.99)
ham_ordered <- as.matrix(ham_term_matrix)
ham_term_frequency <- colSums(ham_ordered)
ham_term_frequency <- sort(ham_term_frequency, decreasing = T)
ham_data <- head(ham_term_frequency, 35)
kable(ham_data, "html", escape = F) %>%
  kable_styling("striped", full_width = T) %>%
  column_spec(1, bold = T)
Term         Frequency
ham               2600
can               1371
list              1211
will              1181
just              1076
get               1007
one                993
use                989
like               927
wrote              871
dont               830
date               796
people             778
mailing            777
new                765
time               714
now                681
email              678
message            654
url                639
also               603
think              556
said               548
way                547
even               518
make               493
work               471
using              466
know               450
see                445
spam               437
first              435
good               427
sep                425
world              419

Display ham term frequencies in a bar plot.

ham_term_freq <- data.frame(Term = names(ham_term_frequency), Frequency = ham_term_frequency)
ham_term_plot <- ggplot(subset(ham_term_freq, Frequency > 500), aes(x = reorder(Term, - Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = '#4CAF50') +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) + 
  theme(panel.background = element_rect(fill = '#FFFFFF'))

ham_term_plot

Display ham term frequencies in a word cloud.

ham_terms <- names(ham_term_frequency)
wordcloud(ham_terms[1:50], ham_term_frequency[1:50], random.color = TRUE, colors = palette())

Calculate the frequency of spam terms and display them in a table.

# Create the Spam dataframe.
spam_dataframe <- fetch_training_file_data(spam_directory, 'spam')

# Create the spam corpus and term matrix.
# As with the ham data, build the corpus from the message text only.
spam_corpus <- create_training_corpus(spam_dataframe$message, 'spam')

spam_term_matrix <- DocumentTermMatrix(spam_corpus)
spam_term_matrix <- removeSparseTerms(spam_term_matrix, 0.99)
spam_ordered <- as.matrix(spam_term_matrix)
spam_term_frequency <- colSums(spam_ordered)
spam_term_frequency <- sort(spam_term_frequency, decreasing = T)
spam_data <- head(spam_term_frequency, 35)
kable(spam_data, "html", escape = F) %>%
  kable_styling("striped", full_width = T) %>%
  column_spec(1, bold = T)
Term         Frequency
email             2664
will              2253
nbsp              1817
free              1590
spam              1500
can               1431
click             1423
please            1174
business          1154
get               1122
list              1010
money              940
information        886
now                834
receive            820
one                806
order              788
address            754
new                744
just               728
people             713
name               687
time               670
send               661
make               631
credit             623
home               621
mailing            603
want               578
removed            576
message            561
internet           558
dont               530
web                530
may                521

Display spam term frequencies in a bar plot.

spam_term_freq <- data.frame(Term = names(spam_term_frequency), Frequency = spam_term_frequency)
spam_term_plot <- ggplot(subset(spam_term_freq, Frequency > 500), aes(x = reorder(Term, - Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = '#DC143C') +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) + 
  theme(panel.background = element_rect(fill = '#FFFFFF'))

spam_term_plot

Display spam term frequencies in a word cloud.

spam_terms <- names(spam_term_frequency)
wordcloud(spam_terms[1:50], spam_term_frequency[1:50], random.color = TRUE, colors = palette())

Prepare the data for prediction analysis by merging the spam and ham data frames.

combined_dataframe <- rbind(spam_dataframe, ham_dataframe)
combined_corpus <- create_training_corpus(combined_dataframe$message, 'combined') 

combined_term_matrix <- DocumentTermMatrix(combined_corpus)
combined_term_matrix <- removeSparseTerms(combined_term_matrix, 0.99)

combined_dataset <- as.data.frame(as.matrix(combined_term_matrix))
combined_dataset$document_class <- combined_dataframe$document_class

Randomize the newly combined data frame and split it into two separate datasets, training and testing (sample.split preserves the spam/ham class ratio in both subsets).

# Shuffle the rows (not the columns) so spam and ham examples are interleaved.
randomized_dataset <- combined_dataset[sample(nrow(combined_dataset)), ]

split <- sample.split(randomized_dataset$document_class, SplitRatio = 0.8)
training_dataset <- subset(randomized_dataset, split == TRUE)
testing_dataset <- subset(randomized_dataset, split == FALSE)

Perform the prediction analysis using the Random Forest model.

# Identify the label column by name so it can be excluded from the predictors.
label_column <- which(colnames(training_dataset) == 'document_class')
classification <- randomForest(x = training_dataset[-label_column], y = training_dataset$document_class, ntree = 3)

spam_prediction <- predict(classification, newdata = testing_dataset[-label_column])

# The forest is fit in regression mode on a 0/1 label, so its predictions are
# scores between 0 and 1; classify scores above 0.5 as spam.
confusion_matrix <- table(spam_prediction > 0.5, testing_dataset$document_class)
colnames(confusion_matrix) <- c('HAM', 'SPAM')

confusion_matrix
##        
##         HAM SPAM
##   FALSE 245    0
##   TRUE  255  280
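
Because document_class is numeric, randomForest() runs here in regression mode. As an alternative (a minimal sketch, not run above), converting the label to a factor makes the forest run in classification mode, so predict() returns class labels directly:

classification_rf <- randomForest(x = training_dataset[-label_column],
                                  y = as.factor(training_dataset$document_class),
                                  ntree = 3)
class_prediction <- predict(classification_rf, newdata = testing_dataset[-label_column])
table(class_prediction, testing_dataset$document_class)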

Calculate the accuracy of the model. Correct predictions are the 245 ham messages in the FALSE row plus the 280 spam messages in the TRUE row, out of the 780 messages in the test set (525 / 780 ≈ 67.31%).

success <- confusion_matrix['TRUE', 2] + confusion_matrix['FALSE', 1]
accuracy_percentage <- success / nrow(testing_dataset) * 100
accuracy <- round(accuracy_percentage, 2)
paste('Our prediction model has an accuracy of ', accuracy, '%.', sep = '')
## [1] "Our prediction model has an accuracy of 67.31%."