Introduction

In this project, I aim to develop a machine learning model to classify emails as either spam or legitimate (ham). I utilize a dataset containing examples of both types of emails, where I preprocess by cleaning and converting into a suitable format for analysis. Leveraging the Naive Bayes classifier, I train the model on a portion of the dataset and evaluate its performance on unseen data. By computing a confusion matrix and accuracy metrics, I assess the effectiveness of the model in accurately distinguishing between spam and ham emails.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
library(SnowballC)

Download the dataset

download.file("https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2", "easy_ham.tar.bz2")
download.file("https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2", "spam.tar.bz2")

Extract the dataset

system("tar -xjf easy_ham.tar.bz2")
## [1] 0
system("tar -xjf spam.tar.bz2")
## [1] 0

Data Cleaning

clean_text <- function(text) {
  # Convert text to lowercase
  text <- tolower(text)
  # Remove special characters, numbers, and punctuation
  text <- gsub("[^a-z\\s]", "", text)
  # Remove extra whitespaces
  text <- gsub("\\s+", " ", text)
  return(text)
}

Read and clean ham email

ham_emails <- lapply(list.files("easy_ham", full.names = TRUE), function(file) {
  text <- readLines(file, encoding = "latin1", warn = FALSE)
  clean_text(text)
})

Read and clean spam email

# Read and clean spam emails
spam_emails <- lapply(list.files("spam", full.names = TRUE), function(file) {
  text <- readLines(file, encoding = "latin1", warn = FALSE)
  clean_text(text)
})

Combine ham and spam emials into a single dataset

emails <- c(ham_emails, spam_emails)
labels <- c(rep("ham", length(ham_emails)), rep("spam", length(spam_emails)))

Split data into Training and Testing sets

set.seed(123) # for reproducibility
sample_indices <- sample(length(emails), size = round(0.8 * length(emails)))
train_emails <- emails[sample_indices]
test_emails <- emails[-sample_indices]
train_labels <- labels[sample_indices]
test_labels <- labels[-sample_indices]

Create a corpus

train_corpus <- Corpus(VectorSource(train_emails))

Preprocessing: Remove numbers, punctuation, stopwords, and perform stemming

train_corpus <- tm_map(train_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(train_corpus, content_transformer(tolower)):
## transformation drops documents
train_corpus <- tm_map(train_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(train_corpus, removeNumbers): transformation
## drops documents
train_corpus <- tm_map(train_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(train_corpus, removePunctuation): transformation
## drops documents
train_corpus <- tm_map(train_corpus, removeWords, stopwords("en"))
## Warning in tm_map.SimpleCorpus(train_corpus, removeWords, stopwords("en")):
## transformation drops documents
train_corpus <- tm_map(train_corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(train_corpus, stemDocument): transformation
## drops documents

Create a document-term matrix

dtm <- DocumentTermMatrix(train_corpus)

Train a Naive Bayes classifiers

library(e1071)
## Warning: package 'e1071' was built under R version 4.3.2
nb_model <- naiveBayes(as.matrix(dtm), train_labels)

Test the trained model

# Preprocess test data
test_corpus <- Corpus(VectorSource(test_emails))
test_corpus <- tm_map(test_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(test_corpus, content_transformer(tolower)):
## transformation drops documents
test_corpus <- tm_map(test_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(test_corpus, removeNumbers): transformation
## drops documents
test_corpus <- tm_map(test_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(test_corpus, removePunctuation): transformation
## drops documents
test_corpus <- tm_map(test_corpus, removeWords, stopwords("en"))
## Warning in tm_map.SimpleCorpus(test_corpus, removeWords, stopwords("en")):
## transformation drops documents
test_corpus <- tm_map(test_corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(test_corpus, stemDocument): transformation drops
## documents

Create document-term matrix for test data

test_dtm <- DocumentTermMatrix(test_corpus, control = list(dictionary = Terms(dtm)))

Predict using the trained model

predicted_labels <- predict(nb_model, as.matrix(test_dtm))

Evaluate model performance

conf_matrix <- table(predicted_labels, test_labels)
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(conf_matrix)
##                 test_labels
## predicted_labels ham spam
##             ham    0    0
##             spam 492  108
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.18"

Conclusion

To conclude, while the developed machine learning model shows initial efforts in email classification, its performance falls short of expectations with an accuracy of only 18%. This underscores the need for further refinement and enhancement.