library(tm)
library(knitr)
library(dplyr)
library(tidytext)
library(ggplot2)
library(kableExtra)
library(wordcloud)
library(tidyverse)
library(readtext)
library(caTools)
library(randomForest)
spam_directory <- "/Users/stephenhaslett/Data607/data_607_project_4/spam_ham_training/spam"
ham_directory <- "/Users/stephenhaslett/Data607/data_607_project_4/spam_ham_training/easy_ham"
It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
ham_training_files <- list.files(ham_directory)
head(ham_training_files)
## [1] "00001.7c53336b37003a9286aba55d2945844c"
## [2] "00002.9c4069e25e1ef370c078db7ee85ff9ac"
## [3] "00003.860e3c3cee1b42ead714c5c874fe25f7"
## [4] "00004.864220c5b6930b209cc287c361c99af1"
## [5] "00005.bf27cdeaf0b8c4647ecd61b1d09da613"
## [6] "00006.253ea2f9a9cc36fa0b1129b04b806608"
spam_training_files <- list.files(spam_directory)
head(spam_training_files)
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
#' Strips header data from emails.
#'
#' Removes irrelevant header information from emails so we
#' are left with clean body text for accurate test/training comparison.
#'
#' @param email Email content.
#'
#' @return Email body content free from header data.
#'
strip_email_headers <- function(email) {
message <- str_split(email,"\n\n") %>% unlist()
email_body_content <- paste(message[2:length(message)], collapse = ' ')
return(email_body_content)
}
#' Retrieves data from spam and ham training files.
#'
#' Pulls in email training data from the given directory, cleans the data for analysis,
#' and returns it as a data frame.
#'
#' @param directory String: The path to the ham/spam training files directory.
#' @param type String: The type of email data being passed.
#'
#' @return A data frame of email training file data.
#'
fetch_training_file_data <- function(directory, type) {
training_files <- list.files(directory)
message_content <- NA
count <- 1
for (file in 1:length(training_files)) {
file_path <- paste0(directory, '/', training_files[file])
training_email <-suppressWarnings(warning(readtext(file_path, TRUE)))
training_email_body <- strip_email_headers(training_email)
training_email_body <- gsub("<.*?>", " ", training_email_body)
message <- list(paste(training_email_body, collapse = '\n'))
message_content <- c(message_content, message)
count <- count + 1
}
# When we perform our prediction analysis of the data, the randomForest() function expects a
# numeric label, so we create a new column called "document_class", and populate it with
# numeric values based on the value of the "type" parameter (0 for ham, 1 for spam).
if (type == 'ham') {
document_class <- 0
} else if (type == 'spam') {
document_class <- 1
}
training_emails <- data.frame()
training_emails <- as.data.frame(unlist(message_content), stringsAsFactors = FALSE)
training_emails$message_type <- type
training_emails$document_class <- document_class
colnames(training_emails) <- c('message', 'message_type', 'document_class')
return (training_emails)
}
#' Converts spam/ham data frames to corpuses.
#'
#' Pulls in an email training dataframe, cleans the data,
#' and returns it as a corpus.
#'
#' @param email_data Dataframe: A dataframe containing ham/spam email data.
#' @param type String: The type of email data being passed.
#'
#' @return A corpus of training email data.
#'
create_training_corpus <- function(email_data, type) {
# Create a corpus of email content and clean the data for analysis purposes.
training_corpus <- VCorpus(VectorSource(email_data))
if (type != 'combined') {
meta(training_corpus, tag = 'messageType') <- type
}
training_corpus <- tm_map(training_corpus, content_transformer(function(x) iconv(x, "UTF-8", "ASCII")))
training_corpus <- tm_map(training_corpus, content_transformer(tolower))
training_corpus <- tm_map(training_corpus, stripWhitespace)
training_corpus <- tm_map(training_corpus, PlainTextDocument)
training_corpus <- tm_map(training_corpus, removePunctuation)
training_corpus <- tm_map(training_corpus, removeNumbers)
training_corpus <- tm_map(training_corpus, content_transformer(removeWords), stopwords("english"))
training_corpus <- tm_map(training_corpus, removeWords, stopwords("english"))
return(training_corpus)
}
# Create the Ham dataframe.
ham_directory <- "/Users/stephenhaslett/Data607/data_607_project_4/spam_ham_training/easy_ham"
ham_dataframe <- fetch_training_file_data(ham_directory, 'ham')
# Create the ham corpus and term matrix.
ham_corpus <- create_training_corpus(ham_dataframe, 'ham')
ham_term_matrix <- DocumentTermMatrix(ham_corpus)
ham_term_matrix <- removeSparseTerms(ham_term_matrix, 0.99)
ham_ordered <- as.matrix(ham_term_matrix)
ham_term_frequency <- colSums(ham_ordered)
ham_term_frequency <- sort(ham_term_frequency, decreasing = T)
ham_data <- head(ham_term_frequency, 35)
kable(ham_data, "html", escape = F) %>%
kable_styling("striped", full_width = T) %>%
column_spec(1, bold = T)
| x | |
|---|---|
| ham | 2600 |
| can | 1371 |
| list | 1211 |
| will | 1181 |
| just | 1076 |
| get | 1007 |
| one | 993 |
| use | 989 |
| like | 927 |
| wrote | 871 |
| dont | 830 |
| date | 796 |
| people | 778 |
| mailing | 777 |
| new | 765 |
| time | 714 |
| now | 681 |
| 678 | |
| message | 654 |
| url | 639 |
| also | 603 |
| think | 556 |
| said | 548 |
| way | 547 |
| even | 518 |
| make | 493 |
| work | 471 |
| using | 466 |
| know | 450 |
| see | 445 |
| spam | 437 |
| first | 435 |
| good | 427 |
| sep | 425 |
| world | 419 |
ham_term_freq <- data.frame(Term = names(ham_term_frequency), Frequency = ham_term_frequency)
ham_term_plot <- ggplot(subset(ham_term_freq, Frequency > 500), aes(x = reorder(Term, - Frequency), y = Frequency)) +
geom_bar(stat = "identity", fill = '#4CAF50') +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
theme(panel.background = element_rect(fill = '#FFFFFF'))
ham_term_plot
ham_terms <- names(ham_term_frequency)
wordcloud(ham_terms[1:50], ham_term_frequency[1:50], random.color = TRUE, colors = palette())
# Create the Spam dataframe.
spam_directory <- "/Users/stephenhaslett/Data607/data_607_project_4/spam_ham_training/spam"
spam_dataframe <- fetch_training_file_data(spam_directory, 'spam')
# Create the spam corpus and term matrix.
spam_corpus <- create_training_corpus(spam_dataframe, 'spam')
spam_term_matrix <- DocumentTermMatrix(spam_corpus)
spam_term_matrix <- removeSparseTerms(spam_term_matrix, 0.99)
spam_ordered <- as.matrix(spam_term_matrix)
spam_term_frequency <- colSums(spam_ordered)
spam_term_frequency <- sort(spam_term_frequency, decreasing = T)
spam_data <- head(spam_term_frequency, 35)
kable(spam_data, "html", escape = F) %>%
kable_styling("striped", full_width = T) %>%
column_spec(1, bold = T)
| x | |
|---|---|
| 2664 | |
| will | 2253 |
| nbsp | 1817 |
| free | 1590 |
| spam | 1500 |
| can | 1431 |
| click | 1423 |
| please | 1174 |
| business | 1154 |
| get | 1122 |
| list | 1010 |
| money | 940 |
| information | 886 |
| now | 834 |
| receive | 820 |
| one | 806 |
| order | 788 |
| address | 754 |
| new | 744 |
| just | 728 |
| people | 713 |
| name | 687 |
| time | 670 |
| send | 661 |
| make | 631 |
| credit | 623 |
| home | 621 |
| mailing | 603 |
| want | 578 |
| removed | 576 |
| message | 561 |
| internet | 558 |
| dont | 530 |
| web | 530 |
| may | 521 |
spam_term_freq <- data.frame(Term = names(spam_term_frequency), Frequency = spam_term_frequency)
spam_term_plot <- ggplot(subset(spam_term_freq, Frequency > 500), aes(x = reorder(Term, - Frequency), y = Frequency)) +
geom_bar(stat = "identity", fill = '#DC143C') +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
theme(panel.background = element_rect(fill = '#FFFFFF'))
spam_term_plot
spam_terms <- names(spam_term_frequency)
wordcloud(spam_terms[1:50], spam_term_frequency[1:50], random.color = TRUE, colors = palette())
combined_dataframe <- rbind(spam_dataframe, ham_dataframe)
combined_corpus <- create_training_corpus(combined_dataframe$message, 'combined')
combined_term_matrix <- DocumentTermMatrix(combined_corpus)
combined_term_matrix <- removeSparseTerms(combined_term_matrix, 0.99)
combined_dataset = as.data.frame(as.matrix(combined_term_matrix))
combined_dataset$document_class <- combined_dataframe$document_class
randomized_dataset <- combined_dataset[sample(c(1:length(combined_dataset)))]
split <- sample.split(randomized_dataset$document_class, SplitRatio = 0.8)
training_dataset <- subset(randomized_dataset, split == TRUE)
testing_dataset = subset(randomized_dataset, split == FALSE)
sample_size <- ncol(training_dataset) -1
classification <- randomForest(x = training_dataset[-sample_size], y = training_dataset$document_class, ntree = 3)
spam_prediction = predict(classification, newdata = testing_dataset[-sample_size])
confusion_matrix <- table(spam_prediction > 0, testing_dataset$document_class)
colnames(confusion_matrix) <- c('HAM', 'SPAM')
confusion_matrix
##
## HAM SPAM
## FALSE 245 0
## TRUE 255 280
success <- confusion_matrix['TRUE', 2] + confusion_matrix['FALSE', 1]
accuracy_percentage <- success / nrow(testing_dataset) * 100
accuracy <- round(accuracy_percentage, 2)
paste('Our prediction model has an accuracy of ', accuracy, '%.', sep = '')
## [1] "Our prediction model has an accuracy of 67.31%."