In this project, I aim to develop a machine learning model that classifies emails as either spam or legitimate (ham). I use a dataset containing examples of both types of email, which I preprocess by cleaning the text and converting it into a format suitable for analysis. Using a Naive Bayes classifier, I train the model on a portion of the dataset and evaluate its performance on unseen data. By computing a confusion matrix and the overall accuracy, I assess how effectively the model distinguishes between spam and ham emails.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
library(SnowballC)
# Download and unpack the SpamAssassin public corpus archives.
# Names are the local archive files, values are the URLs they come from.
corpus_archives <- c(
  "easy_ham.tar.bz2" =
    "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2",
  "spam.tar.bz2" =
    "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2"
)

for (archive in names(corpus_archives)) {
  # mode = "wb": the archives are binary files and must not be altered in
  # transit on platforms that distinguish text from binary writes.
  download_status <- download.file(
    corpus_archives[[archive]], archive,
    mode = "wb"
  )
  if (download_status != 0) {
    stop("Failed to download ", archive, call. = FALSE)
  }

  # untar() does not depend on a `tar` binary being on the PATH, and its status
  # is checked so a failed extraction stops the script instead of surfacing
  # later as an empty directory listing.
  extract_status <- untar(archive)
  if (extract_status != 0) {
    stop("Failed to extract ", archive, call. = FALSE)
  }
}
# Normalise raw email text for tokenisation.
#
# text: character vector (one element per line or per document).
# Returns a character vector of the same length, lower-cased, containing only
# the letters a-z and single spaces.
clean_text <- function(text) {
  # Convert text to lowercase
  text <- tolower(text)
  # Remove special characters, numbers, and punctuation.
  # [:space:] is used instead of \\s: with R's default regex engine, \\s is not
  # treated as the whitespace class inside a bracket expression, so the old
  # pattern "[^a-z\\s]" also deleted every space and glued words together.
  text <- gsub("[^a-z[:space:]]", "", text)
  # Collapse runs of whitespace (spaces, tabs, ...) into a single space
  text <- gsub("[[:space:]]+", " ", text)
  text
}
# Read every message file in `dir`, clean it, and return a character vector
# with exactly one string per email.
#
# dir: path to a directory of raw email files.
read_emails <- function(dir) {
  files <- list.files(dir, full.names = TRUE)
  # NOTE(review): the SpamAssassin archives appear to ship a `cmds` index file
  # next to the messages; it is not an email, so it is skipped. Confirm against
  # the extracted directories.
  files <- files[basename(files) != "cmds"]
  vapply(files, function(file) {
    text <- readLines(file, encoding = "latin1", warn = FALSE)
    # readLines() yields one element per line; join them so each email is a
    # single document rather than a vector of lines.
    paste(clean_text(text), collapse = " ")
  }, character(1), USE.NAMES = FALSE)
}

# Read and clean ham emails
ham_emails <- read_emails("easy_ham")
# Read and clean spam emails
spam_emails <- read_emails("spam")

# Combined data set: emails[i] is labelled by labels[i]
emails <- c(ham_emails, spam_emails)
labels <- c(rep("ham", length(ham_emails)), rep("spam", length(spam_emails)))
# Random 80/20 train/test split; the seed is fixed so the split is reproducible.
set.seed(123)
n_emails <- length(emails)
sample_indices <- sample(n_emails, size = round(0.8 * n_emails))

# Sampled positions form the training set ...
train_emails <- emails[sample_indices]
train_labels <- labels[sample_indices]

# ... and every remaining position forms the test set.
test_emails <- emails[-sample_indices]
test_labels <- labels[-sample_indices]
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.2

# ---- Naive Bayes model: preprocessing, training, evaluation ----
# NOTE(review): the confusion matrix and accuracy captured further down in this
# document were produced before this section was revised; re-run to refresh.

# Build a cleaned tm corpus from a set of emails.
#
# texts: character vector, or list of character vectors (one per email).
# Returns a VCorpus that has been lower-cased, stripped of numbers, punctuation
# and English stop words, and stemmed. Training and test data both go through
# this one function so their preprocessing cannot drift apart.
build_corpus <- function(texts) {
  # An email may arrive as a vector of lines; collapse it so that every
  # document handed to VectorSource() is a single string.
  texts <- vapply(texts, paste, character(1), collapse = " ", USE.NAMES = FALSE)
  corpus <- VCorpus(VectorSource(texts))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("en"))
  tm_map(corpus, stemDocument)
}

# Turn a document-term matrix of counts into a data frame of No/Yes factors
# ("does the term occur in the document?").
#
# naiveBayes() models numeric columns as Gaussian, which is a poor fit for
# sparse word counts; factor columns are modelled as categorical instead and
# can be Laplace-smoothed. Both levels are always declared so that training
# and test data share the same coding even when a term never (or always)
# occurs in one of them.
as_presence <- function(term_matrix) {
  presence <- as.data.frame(as.matrix(term_matrix) > 0)
  presence[] <- lapply(
    presence, factor,
    levels = c(FALSE, TRUE), labels = c("No", "Yes")
  )
  presence
}

# Training features
train_corpus <- build_corpus(train_emails)
dtm <- DocumentTermMatrix(train_corpus)
# Drop terms absent from more than 99% of training emails: they carry little
# signal and keep the dense matrix built below to a manageable size.
dtm <- removeSparseTerms(dtm, 0.99)

# laplace = 1 keeps a term unseen in one class from zeroing that class out.
nb_model <- naiveBayes(as_presence(dtm), factor(train_labels), laplace = 1)

# Preprocess test data, restricted to the vocabulary the model was trained on
test_corpus <- build_corpus(test_emails)
test_dtm <- DocumentTermMatrix(
  test_corpus,
  control = list(dictionary = Terms(dtm))
)
predicted_labels <- predict(nb_model, as_presence(test_dtm))

# Rows: predicted label, columns: true label. The true labels are given the
# same factor levels as the predictions so the table is always square and
# diag() picks out the correct predictions.
conf_matrix <- table(
  predicted_labels = predicted_labels,
  test_labels = factor(test_labels, levels = levels(predicted_labels))
)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
# Show the confusion matrix (rows: predicted label, columns: true label).
print(conf_matrix)
## test_labels
## predicted_labels ham spam
## ham 0 0
## spam 492 108
# NOTE(review): in the output captured above the "ham" row is all zeros, i.e.
# every one of the 600 test emails was predicted as spam, so the accuracy
# printed below is simply the share of spam in the test set (108 / 600).
# Report overall accuracy: correct predictions (matrix diagonal) divided by
# the total number of predictions.
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.18"
To conclude, while this model is a first attempt at email classification, its performance falls well short of expectations, with an accuracy of only 18%. The confusion matrix shows that every test email was classified as spam, so the model has not yet learned to separate the two classes. This underscores the need for further refinement and enhancement.