library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# Root folder that holds the extracted spam/ham email directories. Set the
# SPAMHAM_DIR environment variable to run this on another machine; when it is
# unset, the original hard-coded location is used.
spamham_dir <- Sys.getenv(
  "SPAMHAM_DIR",
  unset = "C:/Users/ddebo/Downloads/spamham"
)
# Directory of spam messages and directory of ham messages, respectively.
allspam <- file.path(spamham_dir, "20050311_spam_2", "spam_2")
allham <- file.path(spamham_dir, "20030228_easy_ham_2", "easy_ham_2")
#' Read every file in a directory into a one-row-per-file data frame.
#'
#' @param dir Path to a directory; each regular file directly inside it is
#'   read as one document.
#' @param label Value stored in the `label` column of every row.
#' @return A data frame with a character column `text` (the file's lines
#'   joined with single spaces) and a column `label`. Row names are the file
#'   paths. The frame has zero rows when `dir` contains no files.
read_emails <- function(dir, label) {
  files <- list.files(dir, full.names = TRUE)
  # Non-recursive listings include subdirectories, which readLines() cannot
  # open, so keep regular entries only.
  files <- files[!dir.exists(files)]

  # vapply() guarantees exactly one string per file. sapply(files, readLines)
  # would simplify to a matrix whenever all files have the same number of
  # lines, producing one row per *line* instead of one row per file.
  text <- vapply(
    files,
    function(path) paste(readLines(path, warn = FALSE), collapse = " "),
    character(1)
  )

  data.frame(
    text = text,
    # Recycle the label explicitly so an empty directory gives a 0-row frame.
    label = rep_len(label, length(text)),
    stringsAsFactors = FALSE
  )
}
# Load both folders, tagging every message with its class.
spam_data <- read_emails(dir = allspam, label = "spam")
ham_data <- read_emails(dir = allham, label = "ham")
# Stack the spam rows on top of the ham rows, then count messages per class.
emails <- do.call(rbind, list(spam_data, ham_data))
table(emails[["label"]])
##
## ham spam
## 1401 1397
It looks like we have an almost equal number of each type of email. Because neither class dominates the data, this should result in a stronger model.
Because raw emails contain a lot of content beyond the words of the message itself (mixed case, punctuation, numbers, and common stopwords), that noise needs to be removed before analysis.
# Re-encode the raw text as UTF-8. With sub = "byte", bytes that cannot be
# converted are kept as "<xx>" hex escapes instead of turning the whole
# message into NA.
emails$text <- iconv(emails$text, from = "", to = "UTF-8", sub = "byte")

# Corpus() on a VectorSource builds a SimpleCorpus, and tm_map() on a
# SimpleCorpus warns "transformation drops documents" whenever the content
# vector is unnamed, even though no document is actually removed. Naming the
# documents avoids those spurious warnings. The ids "1", "2", ... are chosen
# to match the sequential ids tm is expected to assign to unnamed documents.
doc_ids <- as.character(seq_along(emails$text))
corpus <- Corpus(VectorSource(setNames(emails$text, doc_ids)))

# NOTE(review): punctuation is stripped before stop words are removed, so
# contractions in stopwords("en") (e.g. "don't") may no longer match their
# de-punctuated form -- confirm whether this order is intended.
corpus_clean <- tm_map(corpus, content_transformer(tolower)) # convert to lowercase
corpus_clean <- tm_map(corpus_clean, removePunctuation) # remove punctuation
corpus_clean <- tm_map(corpus_clean, removeNumbers) # remove numbers
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords("en")) # remove stopwords (the, and, you, etc)
corpus_clean <- tm_map(corpus_clean, stripWhitespace) # clean extra spaces
After going through the cleaning process, the text needs to be stored in a document-term matrix.
# Turn the cleaned corpus into a document-term matrix, then drop every term
# whose sparsity exceeds 0.99.
dtm <- corpus_clean %>%
  DocumentTermMatrix() %>%
  removeSparseTerms(0.99)
# Densify into a plain data frame (one column per remaining term) and append
# the class of each email as a factor column.
email_dtm <- dtm %>%
  as.matrix() %>%
  as.data.frame()
email_dtm[["label"]] <- as.factor(emails[["label"]])
# Fix the RNG so the split is reproducible, then sample 80% of the row
# indices (partitioned on the class label) for training.
set.seed(68105)
train_index <- createDataPartition(
  y = email_dtm$label,
  p = 0.8,
  list = FALSE
)
# Every row not sampled above is held out for testing.
test_data <- email_dtm[-train_index, ]
train_data <- email_dtm[train_index, ]
library(e1071)
# Fit a Naive Bayes classifier on the training rows, using every other column
# as a predictor of `label`.
model <- naiveBayes(label ~ ., data = train_data)
# Score the held-out rows and cross-tabulate predicted vs. actual classes.
predictions <- predict(model, test_data)
confusionMatrix(data = predictions, reference = test_data$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 276 23
## spam 4 256
##
## Accuracy : 0.9517
## 95% CI : (0.9305, 0.9679)
## No Information Rate : 0.5009
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9034
##
## Mcnemar's Test P-Value : 0.000532
##
## Sensitivity : 0.9857
## Specificity : 0.9176
## Pos Pred Value : 0.9231
## Neg Pred Value : 0.9846
## Prevalence : 0.5009
## Detection Rate : 0.4937
## Detection Prevalence : 0.5349
## Balanced Accuracy : 0.9516
##
## 'Positive' Class : ham
##
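With this Naive Bayes model, we have 95% accuracy in labeling. Another sign that this model is a step in the right direction is that, treating spam as the positive class, we have far fewer type 1 errors (4 legitimate emails flagged as spam) than type 2 errors (23 spam messages classified as ham). Note that the caret output above reports ham as the 'Positive' class, so its sensitivity and specificity are stated from ham's point of view. As specified in class, it is preferable that the occasional spam message slip through the filter than that an actual important email wind up in the spam folder.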