Intro

library(tidytext)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.2
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate

Loading the Corpus

# Root folder that contains the `easy_ham` and `spam` corpus folders.
# Defaults to the original author's location; set the SPAM_CORPUS_ROOT
# environment variable to run the analysis on another machine.
corpus_root <- Sys.getenv(
  "SPAM_CORPUS_ROOT",
  unset = "/Users/genesismiddleton/Desktop/Data Acquisition & Management/Project 4"
)
ham_dir <- file.path(corpus_root, "easy_ham")
spam_dir <- file.path(corpus_root, "spam")

# list.files() returns character(0) without complaint for a missing folder,
# which would only surface later as a confusing downstream error, so fail here.
stopifnot(dir.exists(ham_dir), dir.exists(spam_dir))

# Base names (no directory prefix) of every entry in each folder.
ham_files <- list.files(ham_dir)
spam_files <- list.files(spam_dir)

Emails -> Dataframe

# Read every email in `dir` and return one row per body line.
#
# dir:   folder containing the raw message files.
# files: base names of the files to read, relative to `dir`.
# label: class label recorded on every row (e.g. "ham" or "spam").
#
# Returns a tibble with columns `content` (one body line), `label` and `file`.
read_email_bodies <- function(dir, files, label) {
  per_file <- lapply(files, function(file) {
    lines <- readLines(file.path(dir, file))
    # The first blank line is taken as the end of the headers.
    first_blank <- which(lines == "")[1]
    if (is.na(first_blank)) {
      # No blank line at all: use every line.
      content <- lines
    } else {
      # Negative indexing stays correct when the blank line is the last line;
      # (first_blank + 1):length(lines) would count backwards there and
      # return an NA followed by the blank line itself.
      content <- lines[-seq_len(first_blank)]
    }
    tibble(content = content, label = label, file = file)
  })
  # Bind once at the end instead of growing a tibble inside a loop.
  bind_rows(per_file)
}

# transforming "ham" emails into a tibble
ham <- read_email_bodies(ham_dir, ham_files, "ham")

# transforming "spam" emails into a tibble
spam <- read_email_bodies(spam_dir, spam_files, "spam")

# binding into 1 tibble
all_emails <- bind_rows(ham, spam)

# Assign one sequential id per email, in order of first appearance.
# Emails are identified by (label, file) rather than by file name alone, so
# two messages that share a base name across the ham and spam folders do not
# collapse into a single id.
file_numbers <- all_emails %>%
  distinct(label, file) %>%
  mutate(file_number = row_number())

# Attach the id to every body line, split the lines into one lower-cased
# token per row (`word`), drop tokens containing a digit or an underscore,
# and remove tidytext's stop words.
all_emails <- all_emails %>%
  left_join(file_numbers, by = c("label", "file")) %>%
  unnest_tokens(word, content) %>%
  filter(!str_detect(word, "(\\d)|(_)")) %>%
  anti_join(stop_words, by = "word")
## Joining with `by = join_by(word)`
# Convert to a plain data frame, move the last column to the front, and drop
# the raw file name now that `file_number` identifies each email.
# last_col() replaces selecting by the numeric position ncol(all_emails).
# Stop words were already removed by the anti_join() right after tokenising,
# so the second anti_join(stop_words) pass that used to follow was a no-op
# and is omitted.
all_emails <- all_emails %>%
  data.frame() %>%
  select(last_col(), everything(), -file)
## Joining with `by = join_by(word)`
# Tokens to discard in addition to the stop words: HTML/CSS markup terms plus
# a hand-picked set of other frequent words.
html_words <- c(
  "font", "td", "br", "size", "tr", "width", "http", "color", "align",
  "height", "nbsp", "center", "table", "border", "arial", "option", "img",
  "src", "style", "href", "alt", "iiq.us", "receive", "left", "form",
  "tbody", "title", "valign", "information", "head", "tahoma", "li",
  "report", "input", "body", "mv", "cellspacing", "cellpadding", "list",
  "colspan", "click", "class", "gif", "helvetica", "ffffff", "images",
  "email", "div", "top", "blockquote", "margin", "mail", "verdana",
  "bgcolor", "type", "content", "html", "span", "text", "free", "serif",
  "sans", "rpm", "listinfo", "spamassassin", "exmh", "url", "https"
)

# Keep only the rows whose token is not on the discard list.
is_markup <- all_emails$word %in% html_words
all_emails <- all_emails[!is_markup, ]
# Count how often each word occurs within each email (per label), then keep a
# single row per (word, label). add_count(sort = TRUE) orders rows by `n`
# descending, so distinct() retains the email where the word is most frequent.
all_emails <- all_emails %>%
  group_by(label) %>%
  add_count(file_number, word, sort = TRUE) %>%
  ungroup() %>%
  distinct(word, label, .keep_all = TRUE)

# Per label, add the sum of the retained counts (`total`) and each word's
# share of it (`percentage`).
# reframe() replaces the multi-row summarise() that dplyr 1.1.0 deprecated.
# It returns the same rows in the same order but ungrouped, so the `label`
# grouping is restored explicitly to leave downstream code unaffected.
all_emails <- all_emails %>%
  group_by(label) %>%
  reframe(word, file_number, n, total = sum(n)) %>%
  mutate(percentage = n / total) %>%
  group_by(label)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'label'. You can override using the
## `.groups` argument.
# Print the ten ham rows with the largest count `n`.
all_emails %>%
  filter(label == "ham") %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
## # A tibble: 10 × 6
## # Groups:   label [1]
##    label word      file_number     n total percentage
##    <chr> <chr>           <int> <int> <int>      <dbl>
##  1 ham   alb               507   119 49934    0.00238
##  2 ham   united            737    88 49934    0.00176
##  3 ham   echo             1464    82 49934    0.00164
##  4 ham   bush              627    78 49934    0.00156
##  5 ham   president         627    76 49934    0.00152
##  6 ham   configure        1464    72 49934    0.00144
##  7 ham   ximian             98    63 49934    0.00126
##  8 ham   unseen             28    61 49934    0.00122
##  9 ham   lists              28    61 49934    0.00122
## 10 ham   msgs               28    59 49934    0.00118
# Print the ten spam rows with the largest count `n`.
all_emails %>%
  filter(label == "spam") %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)
## # A tibble: 10 × 6
## # Groups:   label [1]
##    label word       file_number     n total percentage
##    <chr> <chr>            <int> <int> <int>      <dbl>
##  1 spam  enenkio           2807   126 31237    0.00403
##  2 spam  mso               2693   104 31237    0.00333
##  3 spam  padding           2799   104 31237    0.00333
##  4 spam  de                2819   102 31237    0.00327
##  5 spam  kingdom           2807    93 31237    0.00298
##  6 spam  islands           2807    92 31237    0.00295
##  7 spam  bottom            2998    77 31237    0.00247
##  8 spam  family            2799    71 31237    0.00227
##  9 spam  atoll             2807    70 31237    0.00224
## 10 spam  background        2799    69 31237    0.00221

Quick Sentiment Analysis

# Print the AFINN lexicon: one row per word with its numeric `value`.
get_sentiments(lexicon = "afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
# Keep only the rows labelled "ham".
ham_sentiment <- all_emails %>%
  filter(label == "ham")

# Attach each word's AFINN `value` and list the words from the lowest to the
# highest score. The join key is spelled out so the join no longer depends on
# whichever column names the two tables happen to share.
# NOTE(review): `sentiment` sums `value` per word and is not weighted by the
# count `n` -- confirm this is the intended measure.
ham_sentiment %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(word) %>%
  summarise(
    label,
    sentiment = sum(value),
    file_number,
    n,
    total,
    percentage
  ) %>%
  arrange(sentiment)
## Joining with `by = join_by(word)`
## # A tibble: 1,423 × 7
##    word         label sentiment file_number     n total percentage
##    <chr>        <chr>     <dbl>       <int> <int> <int>      <dbl>
##  1 bastard      ham          -5         295     1 49934  0.0000200
##  2 bastards     ham          -5          10     1 49934  0.0000200
##  3 bitch        ham          -5         125     1 49934  0.0000200
##  4 bitches      ham          -5        1826     1 49934  0.0000200
##  5 cock         ham          -5         244     1 49934  0.0000200
##  6 cunt         ham          -5         682     1 49934  0.0000200
##  7 ass          ham          -4         728     4 49934  0.0000801
##  8 asshole      ham          -4         699     1 49934  0.0000200
##  9 bullshit     ham          -4         699     1 49934  0.0000200
## 10 catastrophic ham          -4         730     2 49934  0.0000401
## # ℹ 1,413 more rows
# Keep only the rows labelled "spam".
spam_sentiment <- all_emails %>%
  filter(label == "spam")

# Attach each word's AFINN `value` and list the words from the lowest to the
# highest score. The join key is spelled out so the join no longer depends on
# whichever column names the two tables happen to share.
# NOTE(review): `sentiment` sums `value` per word and is not weighted by the
# count `n` -- confirm this is the intended measure.
spam_sentiment %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(word) %>%
  summarise(
    label,
    sentiment = sum(value),
    file_number,
    n,
    total,
    percentage
  ) %>%
  arrange(sentiment)
## Joining with `by = join_by(word)`
## # A tibble: 700 × 7
##    word       label sentiment file_number     n total percentage
##    <chr>      <chr>     <dbl>       <int> <int> <int>      <dbl>
##  1 cock       spam         -5        2599     1 31237  0.0000320
##  2 niggas     spam         -5        2881     1 31237  0.0000320
##  3 nigger     spam         -5        2881     1 31237  0.0000320
##  4 ass        spam         -4        2937    14 31237  0.000448 
##  5 fraud      spam         -4        2807     4 31237  0.000128 
##  6 fraudulent spam         -4        2807    14 31237  0.000448 
##  7 fucking    spam         -4        2601     1 31237  0.0000320
##  8 hell       spam         -4        2812     2 31237  0.0000640
##  9 torture    spam         -4        2785     1 31237  0.0000320
## 10 abuse      spam         -3        2807     7 31237  0.000224 
## # ℹ 690 more rows
# Sum the AFINN `value` of every scored row within each label and show the
# labels from the lowest to the highest total. The join key is explicit so
# the join does not depend on shared column names.
# NOTE(review): each row contributes its score once, regardless of its count
# `n` -- confirm an unweighted total is intended.
all_emails %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(label) %>%
  summarise(sentiment = sum(value)) %>%
  arrange(sentiment)
## Joining with `by = join_by(word)`
## # A tibble: 2 × 2
##   label sentiment
##   <chr>     <dbl>
## 1 ham        -648
## 2 spam        -19

Training and Testing

# Fix the random number generator state so the shuffle below is reproducible.
set.seed(160)

# One uniform random draw per row of all_emails; used only as a sort key.
h <- runif(nrow(all_emails))

# Reorder the rows by the random key, i.e. put them in a random order.
all_emails_r <- all_emails[order(h), ]
# Print the shuffled table.
all_emails_r
## # A tibble: 40,260 × 6
## # Groups:   label [2]
##    label word         file_number     n total percentage
##    <chr> <chr>              <int> <int> <int>      <dbl>
##  1 spam  easy                3013     4 31237  0.000128 
##  2 ham   scoot                828     1 49934  0.0000200
##  3 spam  avenue              2972     1 31237  0.0000320
##  4 ham   specop              1021     1 49934  0.0000200
##  5 spam  dressed             2881     1 31237  0.0000320
##  6 ham   canvas               515     1 49934  0.0000200
##  7 spam  introduce           2752     2 31237  0.0000640
##  8 ham   additionally          36     1 49934  0.0000200
##  9 ham   magne               1172     1 49934  0.0000200
## 10 ham   contributors         315     1 49934  0.0000200
## # ℹ 40,250 more rows
# Sizes of the training and test sets, taken from the top of the shuffled
# table: rows 1..n_train train the model, the next n_test rows evaluate it.
# NOTE(review): with 70 and 30 only the first 100 shuffled rows are used.
# If a 70%/30% split of all rows was intended, derive these sizes from
# nrow(all_emails_r) instead -- confirm.
n_train <- 70L
n_test <- 30L

# Indexing past the last row would silently yield all-NA rows, so check first.
stopifnot(nrow(all_emails_r) >= n_train + n_test)

train <- all_emails_r[seq_len(n_train), ]
test <- all_emails_r[n_train + seq_len(n_test), ]
# Apply one fixed sequence of tm clean-up transformations to a corpus:
# remove digits, remove punctuation, remove tm's default stopwords() list,
# and collapse whitespace. Returns the transformed corpus.
# Using a single helper guarantees the training and test corpora are
# preprocessed identically.
clean_word_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords())
  tm_map(corpus, stripWhitespace)
}

# Create corpus for training and test data: one tm document per row, holding
# that row's `word`.
train_word_corpus <- Corpus(VectorSource(train$word))
test_word_corpus <- Corpus(VectorSource(test$word))

# NOTE(review): when this report was originally run, every tm_map() call on
# these corpora emitted the warning "transformation drops documents" from
# tm_map.SimpleCorpus(). The same calls are made here, so expect the same
# warnings.
train_clean_corpus <- clean_word_corpus(train_word_corpus)
test_clean_corpus <- clean_word_corpus(test_word_corpus)
# Document-term matrix for the training documents: one row per document, one
# column per term.
train_word_dtm <- DocumentTermMatrix(train_clean_corpus)

# Build the test matrix over the training vocabulary so that its columns are
# exactly the features the classifier is trained on. Built without a
# dictionary, the test matrix gets its own set of terms and its columns do not
# line up with the training features.
test_word_dtm <- DocumentTermMatrix(
  test_clean_corpus,
  control = list(dictionary = Terms(train_word_dtm))
)
# Turn term counts into a presence indicator.
#
# x: numeric vector of counts.
#
# Returns a factor with levels "No" (count not above zero) and "Yes" (count
# above zero); NA counts stay NA and element names are kept.
convert_count <- function(x) {
  present <- x > 0
  factor(present, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}

# Recode every term column of each document-term matrix with convert_count(),
# column by column (MARGIN = 2).
train_x <- apply(as.matrix(train_word_dtm), MARGIN = 2, FUN = convert_count)
test_x <- apply(as.matrix(test_word_dtm), MARGIN = 2, FUN = convert_count)

library(e1071)
# Get the number of elements in train$label
n <- length(train$label)

# The feature matrix must have exactly one row per training label. The
# previous train_x[1:n, ] / train$label[1:n] truncations left both objects
# unchanged in that case and would drop a one-column matrix to a plain
# vector, so the requirement is checked explicitly instead.
stopifnot(NROW(train_x) == n)

# classification of email
classifier <- naiveBayes(train_x, train$label)
test_pred <- predict(classifier, newdata = test_x)

# Confusion matrix: rows are the predicted labels, columns the labels
# recorded in `test`.
table(test_pred, test$label)
##          
## test_pred ham spam
##      ham   20    9
##      spam   1    0

Conclusion