library(tm)
## Warning: package 'tm' was built under R version 4.4.2
## Loading required package: NLP
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
library(text)
## Warning: package 'text' was built under R version 4.4.2
## This is text (version 1.2.3).
## Text is new and still rapidly improving.
##
## Newer versions may have improved functions and updated defaults to reflect current understandings of the state-of-the-art.
## Please send us feedback based on your experience.
##
## For more information about the package see www.r-text.org.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.2
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.4.2
## Loading required package: RColorBrewer
# Directories holding the ham and spam emails (one email per file).
# The defaults are the original machine-specific locations. Set the
# EASY_HAM_DIR / SPAM_DIR environment variables to point the script at
# another copy of the corpus without editing the code.
easy_ham_path <- Sys.getenv(
  "EASY_HAM_DIR",
  unset = "C:/Users/poiso/Downloads/easy_ham"
)
spam_path <- Sys.getenv(
  "SPAM_DIR",
  unset = "C:/Users/poiso/Downloads/spam"
)
# Read every file in a directory into a character vector of email texts.
#
# path: directory containing one email per file.
# Returns a character vector with one element per file (the file's lines
# joined by single spaces), named by the full file path. A directory with
# no files yields a zero-length character vector.
# Stops with an error if `path` is not an existing directory.
read_emails <- function(path) {
  if (!dir.exists(path)) {
    stop("Email directory not found: ", path, call. = FALSE)
  }
  files <- list.files(path, full.names = TRUE)
  # list.files() also lists sub-directories, which readLines() cannot open
  files <- files[!dir.exists(files)]
  # vapply() guarantees a character result even when there are no files;
  # sapply() would return an empty list in that case
  vapply(
    files,
    function(x) paste(readLines(x, warn = FALSE), collapse = " "),
    character(1)
  )
}
# Load both corpora from disk
spam <- read_emails(spam_path)
easy_ham <- read_emails(easy_ham_path)
# Assemble one labelled data frame: ham rows first, then spam rows
data <- data.frame(
  text = c(easy_ham, spam),
  label = factor(
    rep(c("ham", "spam"), times = c(length(easy_ham), length(spam)))
  )
)
str(data)
## 'data.frame': 3052 obs. of 2 variables:
## $ text : chr "From exmh-workers-admin@redhat.com Thu Aug 22 12:36:23 2002 Return-Path: <exmh-workers-admin@example.com> Deli"| __truncated__ "From Steve_Burt@cursor-system.com Thu Aug 22 12:46:39 2002 Return-Path: <Steve_Burt@cursor-system.com> Deliver"| __truncated__ "From timc@2ubh.com Thu Aug 22 13:52:59 2002 Return-Path: <timc@2ubh.com> Delivered-To: zzzz@localhost.netnotei"| __truncated__ "From irregulars-admin@tb.tf Thu Aug 22 14:23:39 2002 Return-Path: <irregulars-admin@tb.tf> Delivered-To: zzzz@"| __truncated__ ...
## $ label: Factor w/ 2 levels "ham","spam": 1 1 1 1 1 1 1 1 1 1 ...
# Normalise raw email text before tokenisation.
#
# text: character vector of raw email texts.
# Returns the texts re-encoded to UTF-8, lower-cased, with digits, English
# stop words and punctuation removed and whitespace runs collapsed.
clean_text <- function(text) {
  # Input is treated as latin1; bytes that cannot be converted are dropped
  text <- iconv(text, from = "latin1", to = "UTF-8", sub = "")
  text <- tolower(text)
  text <- removeNumbers(text)
  # Stop words are removed BEFORE punctuation: the English stop-word list
  # contains contractions such as "don't", which can no longer match once
  # the apostrophes have been stripped.
  text <- removeWords(text, stopwords("en"))
  text <- removePunctuation(text)
  stripWhitespace(text)
}
# Clean every email. clean_text() is built from vectorised string helpers,
# so it is applied to the whole column at once; going through sapply() would
# additionally name each result with its entire raw email text.
data$text <- clean_text(data$text)
# Hold out 20% of the emails for testing; the other 80% form the training set
set.seed(123)
train_index <- createDataPartition(y = data$label, p = 0.8, list = FALSE)
test_data <- data[-train_index, ]
train_data <- data[train_index, ]
# Undersample ham so it contributes exactly as many training rows as spam
# (the raw training set is heavily skewed towards ham)
ham_sample <- sample_n(
  filter(train_data, label == "ham"),
  size = sum(train_data$label == "spam")
)
spam_sample <- filter(train_data, label == "spam")
# Stack the two equally sized subsets: ham rows first, then spam rows
balanced_train_data <- bind_rows(ham_sample, spam_sample)
# Confirm that the class counts now match
table(balanced_train_data$label)
##
## ham spam
## 401 401
# Wrap the balanced training texts in a tm corpus
balanced_corpus <- balanced_train_data$text %>%
  VectorSource() %>%
  Corpus()
# Build the document-term matrix (terms of any length are kept), then prune
# terms using a sparsity threshold of 0.99
balanced_dtm <- balanced_corpus %>%
  DocumentTermMatrix(control = list(wordLengths = c(1, Inf))) %>%
  removeSparseTerms(sparse = 0.99)
# Dense data frame of the DTM plus the class label column
train_matrix <- balanced_dtm %>%
  as.matrix() %>%
  as.data.frame()
train_matrix[["label"]] <- balanced_train_data$label
# Build the test document-term matrix restricted to the training vocabulary
test_corpus <- Corpus(VectorSource(test_data$text))
# wordLengths must match the training DTM: without it tm's default minimum
# word length of 3 applies, so every 1-2 character term kept during training
# would always be counted as zero here.
# NOTE: the recorded confusion-matrix output further down was produced
# before this setting was added and may differ slightly on a re-run.
test_dtm <- DocumentTermMatrix(
  test_corpus,
  control = list(
    dictionary = Terms(balanced_dtm),
    wordLengths = c(1, Inf)
  )
)
test_matrix <- as.data.frame(as.matrix(test_dtm))
test_matrix$label <- test_data$label
# Train the Naive Bayes model on the balanced training features.
# NOTE(review): e1071::naiveBayes() appears to model numeric predictors as
# Gaussian; for word counts a presence/absence encoding may classify
# better - confirm before relying on this model.
model <- naiveBayes(label ~ ., data = train_matrix)
# Predict on the held-out test set
predictions <- predict(model, newdata = test_matrix)
# Evaluate with spam as the positive class so that Sensitivity and
# Pos Pred Value describe spam detection; by default caret uses the first
# factor level ("ham"). Accuracy and the cell counts are unaffected.
# NOTE: the recorded output below was generated with the default ("ham").
conf_matrix <- confusionMatrix(
  predictions,
  test_matrix$label,
  positive = "spam"
)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 502 31
## spam 8 69
##
## Accuracy : 0.9361
## 95% CI : (0.9136, 0.9541)
## No Information Rate : 0.8361
## P-Value [Acc > NIR] : 8.57e-14
##
## Kappa : 0.743
##
## Mcnemar's Test P-Value : 0.000427
##
## Sensitivity : 0.9843
## Specificity : 0.6900
## Pos Pred Value : 0.9418
## Neg Pred Value : 0.8961
## Prevalence : 0.8361
## Detection Rate : 0.8230
## Detection Prevalence : 0.8738
## Balanced Accuracy : 0.8372
##
## 'Positive' Class : ham
##
# Report the overall test-set accuracy on its own line
cat("Accuracy:", conf_matrix$overall[["Accuracy"]], "\n")
## Accuracy: 0.9360656
# Example new emails to classify
new_docs <- c(
  "Congratulations! You have won a $1,000 Amazon gift card. Click here to claim now!",
  "Hi team, please review the annual reports here."
)
# Clean the new documents with the same pipeline as the training data;
# clean_text() is called on the whole vector rather than through sapply()
new_docs_clean <- clean_text(new_docs)
# Build a corpus and a DTM restricted to the training vocabulary.
# wordLengths mirrors the training DTM; otherwise tm's default minimum word
# length of 3 would zero out every 1-2 character training term.
new_corpus <- Corpus(VectorSource(new_docs_clean))
new_dtm <- DocumentTermMatrix(
  new_corpus,
  control = list(
    dictionary = Terms(balanced_dtm),
    wordLengths = c(1, Inf)
  )
)
# Convert the new DTM to a data frame with one column per training term
new_matrix <- as.data.frame(as.matrix(new_dtm))
# Predict labels for the new documents and show them next to the raw text
new_predictions <- predict(model, newdata = new_matrix)
print(data.frame(Document = new_docs, Prediction = new_predictions))
## Document
## 1 Congratulations! You have won a $1,000 Amazon gift card. Click here to claim now!
## 2 Hi team, please review the annual reports here.
## Prediction
## 1 ham
## 2 ham
# Bar chart of how many emails carry each label
ggplot(count(data, label), aes(x = label, y = n, fill = label)) +
  geom_bar(stat = "identity") +
  labs(x = "Label", y = "Count", title = "Spam vs Ham Distribution") +
  theme_minimal()

# Word cloud of the cleaned spam emails (up to 100 words)
spam_corpus <- data$text[data$label == "spam"] %>%
  VectorSource() %>%
  Corpus()
wordcloud(
  words = spam_corpus,
  scale = c(3, 0.5),
  max.words = 100,
  random.order = FALSE
)

# Word cloud of the cleaned ham emails (up to 100 words)
ham_corpus <- data$text[data$label == "ham"] %>%
  VectorSource() %>%
  Corpus()
wordcloud(
  words = ham_corpus,
  scale = c(3, 0.5),
  max.words = 100,
  random.order = FALSE
)

# Render the confusion-matrix counts as a striped HTML table
conf_matrix_table <- as.data.frame(conf_matrix$table)
kable_styling(
  kable(conf_matrix_table, format = "html", caption = "Confusion Matrix"),
  bootstrap_options = "striped",
  full_width = FALSE
)
Confusion Matrix

| Prediction | Reference | Freq |
|:-----------|:----------|-----:|
| ham        | ham       |  502 |
| spam       | ham       |    8 |
| ham        | spam      |   31 |
| spam       | spam      |   69 |