This markdown takes on the task of classifying emails as spam or ham. Sample directories are available online that contain labeled spam and ham emails. After downloading locally, we can determine characteristics through word tokenization.
Bring in packages
library(tidytext)
library(textdata)
library(dplyr)
library(stringr)
library(tidyr)
library(wordcloud)
Get list of files from local directory
set.seed(5547)
spamfiles <- sample(dir("corpus/spam_2/"),100, replace = FALSE)
hamfiles <- sample(dir("corpus/easy_ham_2/"),100, replace = FALSE)
Read all of the spam files into one text (using the keyword “Subject” as a reliable divider between the header and body)
spamdir <- "corpus/spam_2/"
spamtraining <- NA
for (i in spamfiles) {
spam <- readLines(paste0(spamdir,i))
bodymarker <- str_which(spam,"Subject")
spam <- spam %>%
data.frame(email = i)%>%
rename(text = 1)%>%
slice(-(1:bodymarker))%>%
summarise(test = toString(text))%>%
str_remove_all("<..*>|[:punct:]|[:digit:]")
spamtraining <- c(spamtraining, spam)
}
Read all of the ham files into one text
hamdir <- "corpus/easy_ham_2/"
hamtraining <- NA
for (i in hamfiles) {
ham <- readLines(paste0(hamdir,i))
bodymarker <- str_which(ham,"Subject")
ham <- ham %>%
data.frame(email = i)%>%
rename(text = 1)%>%
slice(-(1:bodymarker))%>%
summarise(test = toString(text))%>%
str_remove_all("<..*>|[:punct:]|[:digit:]")
hamtraining <- c(hamtraining, ham)
}
Get top 30 words for Spam
spamcorpus <- spamtraining%>%
unlist()%>%
data.frame(class="spam")%>%
rename(text = 1)%>%
unnest_tokens(word,text)%>%
anti_join(stop_words)
spamtop30<- spamcorpus%>%
count(word, sort = TRUE)%>%
rename(spamwords = n)%>%
head(30)
wordcloud(spamtop30$word,spamtop30$spamwords)
Get top 30 words for Spam
hamcorpus <- hamtraining%>%
unlist()%>%
data.frame(class="ham")%>%
rename(text = 1)%>%
unnest_tokens(word,text)%>%
anti_join(stop_words)
hamtop30<- hamcorpus%>%
count(word, sort = TRUE)%>%
rename(hamwords = n)%>%
head(30)
wordcloud(hamtop30$word,hamtop30$hamwords)
Bring in a different sample to test
set.seed(7894)
spamfiles <- sample(dir("corpus/spam_2/"),100, replace = FALSE)
hamfiles <- sample(dir("corpus/easy_ham_2/"),100, replace = FALSE)
testresults <- NA
for (i in spamfiles)
{
test <- readLines(paste0(spamdir,i))%>%
data_frame()%>%
rename(text = 1)%>%
unnest_tokens(word, text)%>%
left_join(spamtop30)%>%
left_join(hamtop30)%>%
summarise(hams = length(unique(hamwords)),spams = length(unique(spamwords)))
testresults <- union_all(testresults,test)%>%
mutate(spams > hams)
}
Correctly predicted 85 out of 100 spam emails
summary(testresults$`spams > hams`)
## Mode FALSE TRUE NA's
## logical 15 85 1
testresultsham <- NA
for (i in hamfiles)
{
test <- readLines(paste0(hamdir,i))%>%
data_frame()%>%
rename(text = 1)%>%
unnest_tokens(word, text)%>%
left_join(spamtop30)%>%
left_join(hamtop30)%>%
summarise(hams = length(unique(hamwords)),spams = length(unique(spamwords)))
testresultsham <- union_all(testresults,test)%>%
mutate(spams > hams)
}
Correctly predicted 85 out of 100 ham emails
summary(testresultsham$`spams > hams`)
## Mode FALSE TRUE NA's
## logical 16 85 1