Intro
library(tidytext)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ dplyr 1.1.2
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
Loading the Corpus
# Folders holding the raw corpora; list.files() returns the bare file names
# found in each folder (no directory prefix).
spam_dir <- "/Users/genesismiddleton/Desktop/Data Acquisition & Management/Project 4/spam"
ham_dir <- "/Users/genesismiddleton/Desktop/Data Acquisition & Management/Project 4/easy_ham"
spam_files <- list.files(path = spam_dir)
ham_files <- list.files(path = ham_dir)
Emails -> Dataframe
# Read one raw email and return its body as a tibble with one row per line.
#
# dir   - directory that contains the email file
# file  - file name inside `dir`
# label - class label stored with every line ("ham" or "spam")
#
# The header is taken to end at the first blank line; only the lines after it
# are kept. A file without any blank line is kept whole.
read_email_body <- function(dir, file, label) {
  lines <- readLines(file.path(dir, file))
  first_blank <- which(lines == "")[1]
  if (!is.na(first_blank)) {
    # Negative indexing yields an empty body when the blank line is the last
    # line; `(first_blank + 1):length(lines)` counts backwards in that case and
    # returns an NA plus the blank line itself.
    lines <- lines[-seq_len(first_blank)]
  }
  tibble(content = lines, label = label, file = file)
}

# Read every file of a folder and stack the per-file tibbles once at the end,
# instead of growing the result with rbind() on every loop iteration.
read_email_dir <- function(dir, files, label) {
  bind_rows(lapply(files, function(file) read_email_body(dir, file, label)))
}

# One tibble per class (columns: content, label, file), then both combined.
ham <- read_email_dir(ham_dir, ham_files, "ham")
spam <- read_email_dir(spam_dir, spam_files, "spam")
all_emails <- bind_rows(ham, spam)
# Lookup table: every distinct file name gets a sequential integer id, in order
# of first appearance.
file_numbers <- tibble(file = unique(all_emails$file))
file_numbers <- mutate(file_numbers, file_number = row_number())

# Tag each line with its file id, split the text into one word per row, discard
# tokens that contain a digit or an underscore, then remove stop words.
emails_with_id <- left_join(all_emails, file_numbers, by = "file")
email_tokens <- unnest_tokens(emails_with_id, word, content)
email_tokens <- filter(email_tokens, !str_detect(word, "(\\d)|(_)"))
all_emails <- anti_join(email_tokens, stop_words)
## Joining with `by = join_by(word)`
# Convert to a base data frame, move `word` to the front and drop the file
# name (file_number identifies the source file from here on).
# `word` is selected by name rather than by position, so this step does not
# depend on where the tokenising step happened to place the column.
# Stop words were already removed right after tokenising, so that anti-join is
# not repeated here.
all_emails <- data.frame(all_emails) %>%
  select(word, everything(), -file)
# Words excluded from the analysis (largely HTML/CSS markup tokens and other
# boilerplate terms).
html_words <- c(
  "font", "td", "br", "size", "tr", "width", "http", "color", "align",
  "height", "nbsp", "center", "table", "border", "arial", "option", "img",
  "src", "style", "href", "alt", "iiq.us", "receive", "left", "form", "tbody",
  "title", "valign", "information", "head", "tahoma", "li", "report", "input",
  "body", "mv", "cellspacing", "cellpadding", "list", "colspan", "click",
  "class", "gif", "helvetica", "ffffff", "images", "email", "div", "top",
  "blockquote", "margin", "mail", "verdana", "bgcolor", "type", "content",
  "html", "span", "text", "free", "serif", "sans", "rpm", "listinfo",
  "spamassassin", "exmh", "url", "https"
)

# Drop every row whose word is on the exclusion list.
is_excluded <- all_emails$word %in% html_words
all_emails <- all_emails[!is_excluded, ]
# Within each label, attach n = the number of rows that share the same
# (file_number, word) pair; sort = TRUE orders the rows by descending n.
all_emails <- all_emails %>%
group_by(label) %>%
add_count(file_number, word, sort = TRUE) %>%
ungroup() %>%
# Keep only the first row for every (word, label) pair; after the sort above
# that is the row carrying the largest n for the pair.
# NOTE(review): n therefore looks like the word's highest count within a single
# file, not its total across the whole label - confirm this is intended.
distinct(word, label, .keep_all = TRUE)
# Add each label's total of n and every word's share of that total.
# reframe() replaces the former multi-row summarise(), which dplyr 1.1.0
# deprecated. reframe() always returns ungrouped data, so the label grouping is
# restored at the end to hand later steps the same grouped tibble as before.
all_emails <- all_emails %>%
  group_by(label) %>%
  reframe(
    word = word,
    file_number = file_number,
    n = n,
    total = sum(n)
  ) %>%
  mutate(percentage = n / total) %>%
  group_by(label)
# Show the ten ham rows with the highest n.
ham_rows <- filter(all_emails, label == "ham")
head(arrange(ham_rows, desc(n)), 10)
## # A tibble: 10 × 6
## # Groups: label [1]
## label word file_number n total percentage
## <chr> <chr> <int> <int> <int> <dbl>
## 1 ham alb 507 119 49934 0.00238
## 2 ham united 737 88 49934 0.00176
## 3 ham echo 1464 82 49934 0.00164
## 4 ham bush 627 78 49934 0.00156
## 5 ham president 627 76 49934 0.00152
## 6 ham configure 1464 72 49934 0.00144
## 7 ham ximian 98 63 49934 0.00126
## 8 ham unseen 28 61 49934 0.00122
## 9 ham lists 28 61 49934 0.00122
## 10 ham msgs 28 59 49934 0.00118
# Show the ten spam rows with the highest n.
spam_rows <- filter(all_emails, label == "spam")
head(arrange(spam_rows, desc(n)), 10)
## # A tibble: 10 × 6
## # Groups: label [1]
## label word file_number n total percentage
## <chr> <chr> <int> <int> <int> <dbl>
## 1 spam enenkio 2807 126 31237 0.00403
## 2 spam mso 2693 104 31237 0.00333
## 3 spam padding 2799 104 31237 0.00333
## 4 spam de 2819 102 31237 0.00327
## 5 spam kingdom 2807 93 31237 0.00298
## 6 spam islands 2807 92 31237 0.00295
## 7 spam bottom 2998 77 31237 0.00247
## 8 spam family 2799 71 31237 0.00227
## 9 spam atoll 2807 70 31237 0.00224
## 10 spam background 2799 69 31237 0.00221
Quick Sentiment Analysis
# Preview the AFINN lexicon: one row per word with a numeric sentiment value
# (columns `word` and `value`, as printed below).
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
# Ham rows only.
ham_sentiment <- filter(all_emails, label == "ham")

# Attach AFINN values to the ham words, sum the value per word, and list the
# words from most negative to most positive.
ham_scored <- inner_join(ham_sentiment, get_sentiments("afinn"))
ham_by_word <- group_by(ham_scored, word)
ham_word_sentiment <- summarise(
  ham_by_word,
  label, sentiment = sum(value), file_number, n, total, percentage
)
arrange(ham_word_sentiment, sentiment)
## Joining with `by = join_by(word)`
## # A tibble: 1,423 × 7
## word label sentiment file_number n total percentage
## <chr> <chr> <dbl> <int> <int> <int> <dbl>
## 1 bastard ham -5 295 1 49934 0.0000200
## 2 bastards ham -5 10 1 49934 0.0000200
## 3 bitch ham -5 125 1 49934 0.0000200
## 4 bitches ham -5 1826 1 49934 0.0000200
## 5 cock ham -5 244 1 49934 0.0000200
## 6 cunt ham -5 682 1 49934 0.0000200
## 7 ass ham -4 728 4 49934 0.0000801
## 8 asshole ham -4 699 1 49934 0.0000200
## 9 bullshit ham -4 699 1 49934 0.0000200
## 10 catastrophic ham -4 730 2 49934 0.0000401
## # ℹ 1,413 more rows
# Spam rows only.
spam_sentiment <- filter(all_emails, label == "spam")

# Attach AFINN values to the spam words, sum the value per word, and list the
# words from most negative to most positive.
spam_scored <- inner_join(spam_sentiment, get_sentiments("afinn"))
spam_by_word <- group_by(spam_scored, word)
spam_word_sentiment <- summarise(
  spam_by_word,
  label, sentiment = sum(value), file_number, n, total, percentage
)
arrange(spam_word_sentiment, sentiment)
## Joining with `by = join_by(word)`
## # A tibble: 700 × 7
## word label sentiment file_number n total percentage
## <chr> <chr> <dbl> <int> <int> <int> <dbl>
## 1 cock spam -5 2599 1 31237 0.0000320
## 2 niggas spam -5 2881 1 31237 0.0000320
## 3 nigger spam -5 2881 1 31237 0.0000320
## 4 ass spam -4 2937 14 31237 0.000448
## 5 fraud spam -4 2807 4 31237 0.000128
## 6 fraudulent spam -4 2807 14 31237 0.000448
## 7 fucking spam -4 2601 1 31237 0.0000320
## 8 hell spam -4 2812 2 31237 0.0000640
## 9 torture spam -4 2785 1 31237 0.0000320
## 10 abuse spam -3 2807 7 31237 0.000224
## # ℹ 690 more rows
# Sum the AFINN values of the matched words per label and list the labels from
# lowest to highest total. Each row counts once; n is not used as a weight.
scored_words <- inner_join(all_emails, get_sentiments("afinn"))
scored_by_label <- group_by(scored_words, label)
label_sentiment <- summarise(scored_by_label, sentiment = sum(value))
arrange(label_sentiment, sentiment)
## Joining with `by = join_by(word)`
## # A tibble: 2 × 2
## label sentiment
## <chr> <dbl>
## 1 ham -648
## 2 spam -19
Training and Testing
# Shuffle the rows reproducibly: draw one uniform random key per row under a
# fixed seed, then reorder the rows by that key.
set.seed(160)
h <- runif(n = nrow(all_emails))
row_order <- order(h)
all_emails_r <- all_emails[row_order, ]
all_emails_r
## # A tibble: 40,260 × 6
## # Groups: label [2]
## label word file_number n total percentage
## <chr> <chr> <int> <int> <int> <dbl>
## 1 spam easy 3013 4 31237 0.000128
## 2 ham scoot 828 1 49934 0.0000200
## 3 spam avenue 2972 1 31237 0.0000320
## 4 ham specop 1021 1 49934 0.0000200
## 5 spam dressed 2881 1 31237 0.0000320
## 6 ham canvas 515 1 49934 0.0000200
## 7 spam introduce 2752 2 31237 0.0000640
## 8 ham additionally 36 1 49934 0.0000200
## 9 ham magne 1172 1 49934 0.0000200
## 10 ham contributors 315 1 49934 0.0000200
## # ℹ 40,250 more rows
# Only the first 100 shuffled rows are used: rows 1-70 for training and rows
# 71-100 for testing.
train_rows <- 1:70
test_rows <- 71:100
train <- all_emails_r[train_rows, ]
test <- all_emails_r[test_rows, ]
# Apply the clean-up steps to a tm corpus, in this order: remove numbers,
# remove punctuation, remove tm's default stop words, collapse whitespace.
# Keeping the steps in one helper guarantees that the training and test
# corpora are always cleaned identically.
#
# corpus - a corpus created with Corpus()
# Returns the cleaned corpus.
# Note: on these corpora tm warns "transformation drops documents" for each
# tm_map() step.
scrub_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords())
  tm_map(corpus, stripWhitespace)
}

# Training data: one document per word row -> cleaned corpus -> term counts.
train_word_corpus <- Corpus(VectorSource(train$word))
train_clean_corpus <- scrub_corpus(train_word_corpus)
train_word_dtm <- DocumentTermMatrix(train_clean_corpus)

# Test data: same steps.
test_word_corpus <- Corpus(VectorSource(test$word))
test_clean_corpus <- scrub_corpus(test_word_corpus)
test_word_dtm <- DocumentTermMatrix(test_clean_corpus)
# Turn counts into a presence flag: "Yes" where the count is positive, "No"
# otherwise. Returns a factor with levels "No" and "Yes".
convert_count <- function(x) {
  present <- (x > 0) * 1
  factor(present, levels = c(0, 1), labels = c("No", "Yes"))
}
# Recode every term column of both document-term matrices with convert_count,
# i.e. as "No"/"Yes" presence flags.
test_x <- apply(test_word_dtm, MARGIN = 2, FUN = convert_count)
train_x <- apply(train_word_dtm, MARGIN = 2, FUN = convert_count)
library(e1071)
# Keep the feature matrix aligned with the training labels.
# seq_len() with drop = FALSE keeps train_x a matrix even when it has a single
# term column, and selects no rows (rather than rows 1 and 0) when n is 0.
# The labels are left untouched: taking their first length(label) elements
# returned the same vector.
n <- length(train$label)
train_x <- train_x[seq_len(n), , drop = FALSE]

# Fit naive Bayes on the training flags, predict the test rows, and
# cross-tabulate the predictions (rows) against the true labels (columns).
classifier <- naiveBayes(train_x, train$label)
test_pred <- predict(classifier, newdata = test_x)
table(test_pred, test$label)
##
## test_pred ham spam
## ham 20 9
## spam 1 0
Conclusion