Introduction

This markdown takes on the task of classifying emails as spam or ham. Sample directories are available online that contain labeled spam and ham emails. After downloading locally, we can determine characteristics through word tokenization.

Bring in packages

library(tidytext)
library(textdata)
library(dplyr)
library(stringr)
library(tidyr)
library(wordcloud)

Get list of files from local directory

set.seed(5547)
spamfiles <- sample(dir("corpus/spam_2/"),100, replace = FALSE)
hamfiles <- sample(dir("corpus/easy_ham_2/"),100, replace = FALSE)

Reading the files

Read all of the spam files into one text (using the keyword “Subject” as a reliable divider between the header and body)

spamdir <- "corpus/spam_2/"
spamtraining <- NA
for (i in spamfiles) {
  
spam <- readLines(paste0(spamdir,i))

bodymarker <- str_which(spam,"Subject")

spam <- spam %>%
  data.frame(email = i)%>%
  rename(text = 1)%>%
  slice(-(1:bodymarker))%>%
  summarise(test = toString(text))%>%
  str_remove_all("<..*>|[:punct:]|[:digit:]")

spamtraining <- c(spamtraining, spam)
}

Read all of the ham files into one text

hamdir <- "corpus/easy_ham_2/"
hamtraining <- NA
for (i in hamfiles) {
  
ham <- readLines(paste0(hamdir,i))

bodymarker <- str_which(ham,"Subject")

ham <- ham %>%
  data.frame(email = i)%>%
  rename(text = 1)%>%
  slice(-(1:bodymarker))%>%
  summarise(test = toString(text))%>%
  str_remove_all("<..*>|[:punct:]|[:digit:]")

hamtraining <- c(hamtraining, ham)
}

Word Scores

Get top 30 words for Spam

spamcorpus <- spamtraining%>%
  unlist()%>%
  data.frame(class="spam")%>%
  rename(text = 1)%>%
  unnest_tokens(word,text)%>%
  anti_join(stop_words)

spamtop30<- spamcorpus%>%
  count(word, sort = TRUE)%>%
  rename(spamwords = n)%>%
  head(30)
wordcloud(spamtop30$word,spamtop30$spamwords)

Get top 30 words for Spam

hamcorpus <- hamtraining%>%
  unlist()%>%
  data.frame(class="ham")%>%
  rename(text = 1)%>%
  unnest_tokens(word,text)%>%
  anti_join(stop_words)

hamtop30<- hamcorpus%>%
  count(word, sort = TRUE)%>%
  rename(hamwords = n)%>%
  head(30)
wordcloud(hamtop30$word,hamtop30$hamwords)

Test spam vs ham

Bring in a different sample to test

set.seed(7894)
spamfiles <- sample(dir("corpus/spam_2/"),100, replace = FALSE)
hamfiles <- sample(dir("corpus/easy_ham_2/"),100, replace = FALSE)

testresults <- NA
for (i in spamfiles) 
{
test <- readLines(paste0(spamdir,i))%>%
  data_frame()%>%
  rename(text = 1)%>%
  unnest_tokens(word, text)%>%
  left_join(spamtop30)%>%
  left_join(hamtop30)%>%
  summarise(hams = length(unique(hamwords)),spams = length(unique(spamwords)))

testresults <- union_all(testresults,test)%>%
  mutate(spams > hams)
}

Correctly predicted 85 out of 100 spam emails

summary(testresults$`spams > hams`)

##    Mode   FALSE    TRUE    NA's 
## logical      15      85       1

testresultsham <- NA
for (i in hamfiles) 
{
test <- readLines(paste0(hamdir,i))%>%
  data_frame()%>%
  rename(text = 1)%>%
  unnest_tokens(word, text)%>%
  left_join(spamtop30)%>%
  left_join(hamtop30)%>%
  summarise(hams = length(unique(hamwords)),spams = length(unique(spamwords)))

testresultsham <- union_all(testresults,test)%>%
  mutate(spams > hams)
}

Correctly predicted 85 out of 100 ham emails

summary(testresultsham$`spams > hams`)

##    Mode   FALSE    TRUE    NA's 
## logical      16      85       1

Document Classification

Mustafa Telab

11/10/2020

Introduction

Reading the files

Word Scores

Test spam vs ham