DATA 607 Project 4

Load in Libraries

library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(e1071)
library(RColorBrewer)   
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(caret)
## Loading required package: lattice

Read in all of the ham files: point to the directory that holds them, loop over the files to read each one, and then combine the messages into a data frame

# Create Ham Dataframe
ham_dir <- "U:/spamham/easy_ham"
hamFileNames <- list.files(ham_dir)

# Fail early with a clear message rather than trying to open a bogus path
# when the directory is missing or empty.
if (length(hamFileNames) == 0) {
  stop("No ham files found in ", ham_dir, call. = FALSE)
}

# List of docs: one element per file, each holding the whole message as a
# single newline-joined string. The list is preallocated, so no NA
# placeholder entry is needed (a placeholder would turn into an NA row in
# the data frame).
ham_docs_list <- vector("list", length(hamFileNames))
for (i in seq_along(hamFileNames)) {
  # Index with i so that every file is read; a constant index would re-read
  # the first file on each iteration.
  filepath <- file.path(ham_dir, hamFileNames[i])
  text <- readLines(filepath)
  ham_docs_list[[i]] <- paste(text, collapse = "\n")
}

# ham data frame: one row per message, labelled "ham"
hamDF <- data.frame(
  text = unlist(ham_docs_list),
  type = "ham",
  stringsAsFactors = FALSE
)

Read in all of the spam files: point to the directory that holds them, loop over the files to read each one, and then combine the messages into a data frame

# Create Spam Dataframe
spam_dir <- "U:/spamham/spam_2"
spamFileNames <- list.files(spam_dir)

# Fail early with a clear message rather than trying to open a bogus path
# when the directory is missing or empty.
if (length(spamFileNames) == 0) {
  stop("No spam files found in ", spam_dir, call. = FALSE)
}

# One element per file, each holding the whole message as a single
# newline-joined string. The list is preallocated, so no NA placeholder
# entry is needed (a placeholder would turn into an NA row in the data
# frame).
spam_docs_list <- vector("list", length(spamFileNames))
for (i in seq_along(spamFileNames)) {
  # Index with i so that every file is read; a constant index would re-read
  # the first file on each iteration.
  filepath <- file.path(spam_dir, spamFileNames[i])
  text <- readLines(filepath)
  spam_docs_list[[i]] <- paste(text, collapse = "\n")
}

# spam data frame: one row per message, labelled "spam"
spamDF <- data.frame(
  text = unlist(spam_docs_list),
  type = "spam",
  stringsAsFactors = FALSE
)

Create a combined dataframe

# Stack the ham and spam messages into a single data frame
spam_ham_df <- do.call(rbind, list(hamDF, spamDF))

# Record each message's character count and print its summary statistics
spam_ham_df[["Text_Length"]] <- nchar(spam_ham_df[["text"]])
summary(spam_ham_df[["Text_Length"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    4720    4720    5215    5038    5215    5215       2
# Histogram of message length, filled by label, to show how text length
# relates to whether a message is spam
ggplot(data = spam_ham_df, mapping = aes(x = Text_Length, fill = type)) +
  geom_histogram(binwidth = 5) +
  labs(
    title = "Text Length by Label",
    x = "Length of Text",
    y = "Text Count"
  ) +
  theme_bw()
## Warning: Removed 2 rows containing non-finite values (stat_bin).

Build Word Clouds for Visualization Purposes

# Word cloud over every message (ham and spam together).
# The seed is fixed so the layout is reproducible between runs.
set.seed(1234)
wordcloud(
  words = spam_ham_df$text,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(n = 8, name = "Dark2")
)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : phoboslabsnetnoteinccom could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : xmailmanversion could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : w142z064000057nycnydslcncnet could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : 10298824683116tmdadeepeddyvirciocom could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : 132581030015585munnariozau could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : 1721652254 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : compiled could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : d03e543c36 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : developers could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : exmhworkers could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : exmhworkerslistmanspamassassintaintorg could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : ftocpickmsgs could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : fuchsiacsmuozau could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : g7mbqpw13260 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : g7mby7g11259 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : g7mby7y11255 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : g7mbyrz04811 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : hand could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : happening could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : httpslistmanredhatcommailmanlistinfoexmhworkers could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : httpslistmanspamassassintaintorgmailmanprivateexmhworkers could not
## be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : listsubscribe could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : listunsubscribe could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : maillocalhost could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : mailtoexmhworkersrequestredhatcomsubjectunsubscribe could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : mhparam could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : obviously could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : popup could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : reach could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : repeatable could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : reproduce could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : sequences could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spam_ham_df$text, min.freq = 1, max.words =
## 200, : zzzzlocalhostnetnoteinccom could not be fit on page. It will not be
## plotted.

# Word cloud restricted to the spam messages.
# The seed is fixed so the layout is reproducible between runs.
set.seed(1234)
wordcloud(
  words = spamDF$text,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(n = 8, name = "Dark2")
)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## g72lqwv13294 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## greetings could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## httpwwwlinuxiemailmanlistinfoilug could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## income could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## insanity could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## jmilugjmasonorg could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## landed could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## lifechanging could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## lugh could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## mailing could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## maintainer could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## meaning could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## message could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## messageid could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## mimeversion could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## mistake could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## money could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## most could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## necessary could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## numerous could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## opportunities could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## page could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## past could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## period could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## permission could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## phobos could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## pitch could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## postfix could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## precedence could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## returnpath could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## sacred could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## sender could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## services could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## siddhi could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## signup could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## smtpd32706 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## something could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## start could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## startnow2002hotmailcom could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## state could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## stop could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## textplain could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## think could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## truth could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## unsolicitated could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## unsolicited could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## unsubscription could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## used could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## way could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## why could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## with could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## within could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## wonder could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## words could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## xbeenthere could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words = 200, :
## xmailmanversion could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = spamDF$text, min.freq = 1, max.words =
## 200, : yyyylocalhostnetnoteinccom could not be fit on page. It will not be
## plotted.

# Word cloud restricted to the ham messages.
# The seed is fixed so the layout is reproducible between runs.
set.seed(1234)
wordcloud(
  words = hamDF$text,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(n = 8, name = "Dark2")
)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## httpslistmanspamassassintaintorgmailmanlistinfoexmhworkers could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## 102994306626919tmdadeepeddyvirciocom could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## 1029944441398tmdadeepeddyvirciocom could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## 132581030015585munnariozau could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## deltacoepsuacth could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## developers could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## dogmaslashnullorg could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## exmhworkers could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## exmhworkerslistmanspamassassintaintorg could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## ftocpickmsgs could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## fuchsiacsmuozau could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## g7mbihl25223 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## g7mbqpw13260 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## g7mbwel29762 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## g7mby7g11259 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## g7mby7y11255 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## g7mbyrz04811 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## happening could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## httpslistmanredhatcommailmanlistinfoexmhworkers could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## httpslistmanspamassassintaintorgmailmanprivateexmhworkers could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## intmx1corpredhatcom could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## kremunnariozau could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## listarchive could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## listhelp could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## listsubscribe could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## listunsubscribe could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## mailing could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## maillocalhost could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## mailtoexmhworkersrequestredhatcomsubjectsubscribe could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## mailtoexmhworkersrequestredhatcomsubjectunsubscribe could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## mailtoexmhworkersrequestspamassassintaintorgsubjecthelp could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## mailtoexmhworkersspamassassintaintorg could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## marking could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## mhparam could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## mimeversion could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## munnariozau could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## obviously could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## popup could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## precedence could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## reach could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## references could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## relevant could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## repeatable could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## repository could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## reproduce could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## returnpath could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## robert could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## routing could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## sequences could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## since could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## singledrop could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## syntax could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## textplain could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## tkerror could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## today could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## window could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## without could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## xbeenthere could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## xmailmanversion could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## zzzzexmhspamassassintaintorg could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = hamDF$text, min.freq = 1, max.words = 200, :
## zzzzlocalhostnetnoteinccom could not be fit on page. It will not be
## plotted.

Subset the data into training and test sets

# Model to assess spam and ham
# Stratified 70/30 split on the label, using the caret package
# (classification and regression training)
set.seed(32984)
# With list = FALSE the call gives back the row numbers of the training set
indexes <- createDataPartition(
  y = spam_ham_df$type,
  times = 1,
  p = 0.7,
  list = FALSE
)

# Rows listed in `indexes` form the training data; the rest form the test data
train_spam_ham <- spam_ham_df[indexes, ]
test_spam_ham <- spam_ham_df[-indexes, ]

# Verify the ham/spam proportions in the training data
prop.table(table(train_spam_ham[["type"]]))
## 
##       ham      spam 
## 0.6415233 0.3584767
# Verify the ham/spam proportions in the test data
prop.table(table(test_spam_ham[["type"]]))
## 
##      ham     spam 
## 0.641574 0.358426

Create a corpus for both the training and the test data, clean each one, and build the document-term matrices

# Create corpus for training and test data
train_email_corpus <- Corpus(VectorSource(train_spam_ham$text))
test_email_corpus <- Corpus(VectorSource(test_spam_ham$text))

# Apply one cleaning pipeline to a corpus so the training and test data are
# always processed identically. Steps, in order: lower-case, drop digits,
# drop punctuation, drop English stop words, collapse runs of whitespace.
# Returns the cleaned corpus.
clean_email_corpus <- function(corpus) {
  # Lower-case first: stopwords() is all lower case and removeWords()
  # matches case-sensitively, so capitalised stop words ("The", "And")
  # would otherwise survive and reappear as terms once DocumentTermMatrix()
  # lower-cases the text.
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords())
  tm_map(corpus, stripWhitespace)
}

# NOTE: in the recorded run, every tm_map() call on these corpora emitted the
# warning "transformation drops documents".
train_clean_corpus <- clean_email_corpus(train_email_corpus)
test_clean_corpus <- clean_email_corpus(test_email_corpus)

# Document-term matrices: one row per message, one column per term
train_email_dtm <- DocumentTermMatrix(train_clean_corpus)
test_email_dtm <- DocumentTermMatrix(test_clean_corpus)

Use a naive Bayes classifier

# Presence indicator: turns each count into the factor level "Yes" when it is
# greater than zero and "No" otherwise (NA stays NA). Names on the input are
# carried over to the result.
convert_count <- function(x) {
  factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}

# Recode both term matrices, column by column, into "No"/"Yes" indicators
train_set <- apply(X = train_email_dtm, MARGIN = 2, FUN = convert_count)
test_set <- apply(X = test_email_dtm, MARGIN = 2, FUN = convert_count)

# Fit the naive Bayes model on the training indicators and their labels
classifier <- naiveBayes(x = train_set, y = factor(train_spam_ham$type))

# Predicted labels for the held-out test messages
test_pred <- predict(object = classifier, newdata = test_set)

# Cross-tabulate the predictions (rows) against the true labels (columns)
table(test_pred, test_spam_ham$type)
##          
## test_pred ham spam
##      ham  750    1
##      spam   0  418

Corey Arnouts

November 5, 2018