Load packages
library(stringr)
library(dplyr)
library(rvest)
library(tm)
library(caret)
library(randomForest)
library(e1071)
Download ham and spam files
# ham
ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20021010_hard_ham.tar.bz2"
ham_file <- tempfile(fileext = ".tar.bz2")
download.file(ham_url, destfile = ham_file, mode = "wb")
# spam
spam_url <- "https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2"
spam_file <- tempfile(fileext = ".tar.bz2")
download.file(spam_url, destfile = spam_file, mode = "wb")
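The archives then need to be unpacked and read into the email_data data frame used below. The original skips this step, so the following is a minimal sketch: untar() can read the .tar.bz2 files directly, and the read_emails() helper is a hypothetical name introduced here.
extract_dir <- tempfile()
untar(ham_file, exdir = file.path(extract_dir, "ham"))
untar(spam_file, exdir = file.path(extract_dir, "spam"))
# Hypothetical helper: read every extracted message into one labeled data frame
read_emails <- function(dir, label) {
  files <- list.files(dir, recursive = TRUE, full.names = TRUE)
  texts <- vapply(files, function(f) {
    paste(readLines(f, warn = FALSE), collapse = " ")
  }, character(1))
  data.frame(text = texts, label = label, stringsAsFactors = FALSE)
}
email_data <- rbind(
  read_emails(file.path(extract_dir, "ham"), "ham"),
  read_emails(file.path(extract_dir, "spam"), "spam")
)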
Clean corpus
corpus <- VCorpus(VectorSource(email_data$text))
# Clean corpus
safe_tolower <- content_transformer(function(x) {
  tryCatch(tolower(x), error = function(e) "")
})
corpus_clean <- tm_map(corpus, safe_tolower) # convert to lowercase
corpus_clean <- tm_map(corpus_clean, removePunctuation) # remove punctuation
corpus_clean <- tm_map(corpus_clean, removeNumbers) # remove numbers
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords("en")) # remove stopwords (the, and, you, etc)
corpus_clean <- tm_map(corpus_clean, stripWhitespace) # clean extra spaces
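A quick spot check of the first cleaned document (not part of the original output) helps confirm the transformations took effect:
# Print the first document after cleaning
writeLines(as.character(corpus_clean[[1]]))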
# Drop empty or unreadable emails (and their labels) before building the matrix
keep <- sapply(corpus_clean, function(doc) {
  txt <- tryCatch(as.character(doc), error = function(e) "")
  sum(nchar(txt)) > 0
})
corpus_clean <- corpus_clean[keep]
# Create document-term matrix & df
dtm <- DocumentTermMatrix(corpus_clean)
# Remove sparse terms (keep only terms appearing in at least 1% of documents)
dtm <- removeSparseTerms(dtm, 0.99)
dtm_df <- as.data.frame(as.matrix(dtm))
dtm_df$label <- email_data$label[keep]
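As a further sanity check (also not in the original output), the resulting feature table can be sized up:
dim(dtm_df)         # documents x (retained terms + label)
table(dtm_df$label) # overall class balance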
Split data into training and test set
set.seed(123123)
# Split off 80% of the emails for training; createDataPartition stratifies the split by label
train_idx <- createDataPartition(dtm_df$label, p = 0.8, list = FALSE)
train_data <- dtm_df[train_idx, ]
test_data <- dtm_df[-train_idx, ]
train_data$label <- factor(train_data$label)
Train Naive Bayes model
# Fit the Naive Bayes classifier on the training set
model <- naiveBayes(label ~ ., data = train_data)
# Check the class balance of ham vs spam in the training set
table(train_data$label)
##
## ham spam
## 200 401
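Note that e1071's naiveBayes() treats numeric columns as Gaussian, so word counts are modeled by their per-class mean and variance. A common alternative, sketched here but not fit in the original analysis, is to recode counts as presence/absence factors, which is where Laplace smoothing applies:
# Hypothetical variant: binary term features with laplace = 1 smoothing
to_factor <- function(x) factor(ifelse(x > 0, "Yes", "No"), levels = c("No", "Yes"))
train_cat <- as.data.frame(lapply(train_data[, names(train_data) != "label"], to_factor))
train_cat$label <- train_data$label
model_cat <- naiveBayes(label ~ ., data = train_cat, laplace = 1)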
Test on the remaining emails
test_data$label <- factor(test_data$label, levels = c("ham", "spam"))
# Predictions based on test data
pred <- predict(model, newdata = test_data)
# Convert predictions to factor
pred <- factor(pred, levels = c("ham", "spam"))
confusionMatrix(pred, test_data$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 29 0
## spam 21 100
##
## Accuracy : 0.86
## 95% CI : (0.794, 0.9112)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 6.039e-08
##
## Kappa : 0.648
##
## Mcnemar's Test P-Value : 1.275e-05
##
## Sensitivity : 0.5800
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8264
## Prevalence : 0.3333
## Detection Rate : 0.1933
## Detection Prevalence : 0.1933
## Balanced Accuracy : 0.7900
##
## 'Positive' Class : ham
##
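The sensitivity of 0.58 reflects the 21 ham emails misclassified as spam. Since predict() for naiveBayes can also return posterior probabilities, the decision threshold could be tuned instead of relying on the default most-probable class; a sketch not run in the original:
# Posterior probabilities per class; lowering the spam cutoff would recover
# more ham at the cost of letting more spam through
probs <- predict(model, newdata = test_data, type = "raw")
head(probs)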