Introduction

This project classifies emails as spam or ham using a Naive Bayes classifier. The dataset consists of labeled spam and ham emails, and the goal is to build a model with high accuracy and to validate its performance with additional metrics such as precision, recall, and F1-score.

Data Preparation

Load Required Libraries

# Load necessary libraries
if (!require(tm)) install.packages("tm")
if (!require(e1071)) install.packages("e1071")
if (!require(caret)) install.packages("caret")
if (!require(wordcloud)) install.packages("wordcloud")
if (!require(text2vec)) install.packages("text2vec")
if (!require(ROSE)) install.packages("ROSE")

library(tm)
library(e1071)
library(caret)
library(wordcloud)
library(text2vec)
library(ROSE)

Load and Inspect Data

# Define directories for spam and ham emails
spam_dir <- "/Users/aribarazzaq/Desktop/Project 4 Data 607/spam_2"
ham_dir <- "/Users/aribarazzaq/Desktop/Project 4 Data 607/easy_ham"

# Load spam and ham emails
spam_emails <- VCorpus(DirSource(spam_dir, encoding = "UTF-8"), readerControl = list(reader = readPlain))
ham_emails <- VCorpus(DirSource(ham_dir, encoding = "UTF-8"), readerControl = list(reader = readPlain))

# Check the structure of the corpus
length(spam_emails)
## [1] 1397
length(ham_emails)
## [1] 2551
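
The two source folders differ in size, so the classes are imbalanced from the start. A quick check using the corpus lengths reported above (this proportion is only a sanity check, not part of the modeling pipeline):

# Quantify the class balance from the corpus sizes above (1397 spam vs. 2551 ham)
n_spam <- length(spam_emails)
n_ham <- length(ham_emails)
round(n_spam / (n_spam + n_ham), 3)  # proportion of spam, roughly 0.35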

Data Cleaning

# Function to clean invalid UTF-8 characters
fix_encoding <- function(corpus) {
  tm_map(corpus, content_transformer(function(x) {
    iconv(x, from = "UTF-8", to = "UTF-8", sub = "")  # Replace invalid characters
  }))
}

# Apply encoding fix to both corpora
spam_emails <- fix_encoding(spam_emails)
ham_emails <- fix_encoding(ham_emails)

# Updated cleaning function
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(function(x) {
    gsub("[^[:print:]]", "", x)  # Remove non-printable characters
  }))
  corpus <- tm_map(corpus, content_transformer(tolower))            # Convert to lowercase
  corpus <- tm_map(corpus, removePunctuation)                      # Remove punctuation
  corpus <- tm_map(corpus, removeNumbers)                          # Remove numbers
  corpus <- tm_map(corpus, removeWords, stopwords("en"))           # Remove stopwords
  corpus <- tm_map(corpus, stripWhitespace)                        # Remove extra whitespace
  return(corpus)
}

# Apply cleaning to both corpora
spam_emails <- clean_corpus(spam_emails)
ham_emails <- clean_corpus(ham_emails)

# Inspect a sample email (short snippet); collapse the lines first so substr()
# returns a single 200-character excerpt rather than 200 characters per line
cat("Sample spam email content:\n",
    substr(paste(content(spam_emails[[1]]), collapse = " "), 1, 200), "...\n")
## Sample spam email content:
##  ilugadminlinuxie tue aug  returnpath ilugadminlinuxie deliveredto yyyylocalhostnetnoteinccom received localhost localhost   phoboslabsnetnoteinccom postfix esmtp id efdd  jmlocalhost tue aug edt ...
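
One optional cleaning step not applied here is stemming, which collapses inflected forms (for example "received" and "receiving") into a common stem and shrinks the vocabulary. Below is a minimal sketch using tm's stemDocument, which relies on the SnowballC package; the stemmed copies are given new names so the pipeline above is unchanged:

# Optional sketch (not used below): stem the cleaned corpora to merge word variants
if (!require(SnowballC)) install.packages("SnowballC")
library(SnowballC)
spam_stemmed <- tm_map(spam_emails, stemDocument)
ham_stemmed <- tm_map(ham_emails, stemDocument)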

Feature Engineering

Create Document-Term Matrix (DTM)

# Combine corpora
combined_corpus <- c(spam_emails, ham_emails)

# Create Document-Term Matrix
dtm <- DocumentTermMatrix(combined_corpus)

# Remove sparse terms to reduce noise
dtm <- removeSparseTerms(dtm, 0.99)

# Convert DTM to a data frame
dtm_data <- as.data.frame(as.matrix(dtm))

# Add labels: the first length(spam_emails) documents are spam, the rest ham
dtm_data$label <- factor(c(rep("spam", length(spam_emails)), rep("ham", length(ham_emails))))

# Display the structure of the data frame
cat("Structure of the DTM data frame:\n")
## Structure of the DTM data frame:
str(dtm_data, list.len = 5)
## 'data.frame':    3948 obs. of  2272 variables:
##  $ ability                                                            : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ able                                                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ absolutely                                                         : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ abuse                                                              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ accept                                                             : num  1 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
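
One caveat worth noting: e1071's naiveBayes() models numeric predictors with a Gaussian density, which fits sparse term counts poorly. A common alternative, sketched below under that assumption, recodes each term as a presence/absence factor so the classifier estimates categorical conditional probabilities instead (dtm_binary is a name introduced here purely for illustration):

# Sketch: recode term counts as categorical presence/absence features
convert_counts <- function(x) factor(ifelse(x > 0, "Yes", "No"), levels = c("No", "Yes"))
dtm_binary <- as.data.frame(lapply(dtm_data[, -ncol(dtm_data)], convert_counts))
dtm_binary$label <- dtm_data$label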

Model Training and Evaluation

Split Data into Training and Test Sets

# Split data
set.seed(123)
train_indices <- createDataPartition(dtm_data$label, p = 0.8, list = FALSE)
train_data <- dtm_data[train_indices, ]
test_data <- dtm_data[-train_indices, ]
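
ROSE is loaded at the top but never used in the pipeline. Since spam is the minority class (about 35% of documents), one way it could be applied, sketched here as an optional variant, is to oversample spam in the training partition with ovun.sample() before fitting:

# Check the class balance preserved by createDataPartition
prop.table(table(train_data$label))

# Optional sketch (not used below): oversample the minority class (spam) with ROSE
balanced_train <- ovun.sample(label ~ ., data = train_data,
                              method = "over", seed = 123)$data
table(balanced_train$label)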

Train Naive Bayes Classifier

# Train the Naive Bayes model
nb_model <- naiveBayes(label ~ ., data = train_data)

# Make predictions
predictions <- predict(nb_model, newdata = test_data)

# Evaluate performance
confusion_matrix <- confusionMatrix(predictions, test_data$label)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix$table)
##           Reference
## Prediction ham spam
##       ham  493    1
##       spam  17  278
cat("\nMetrics:\n")
## 
## Metrics:
print(confusion_matrix$overall)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   9.771863e-01   9.507347e-01   9.641835e-01   9.864244e-01   6.463878e-01 
## AccuracyPValue  McnemarPValue 
##  1.093432e-118   4.069520e-04
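
The baseline fit uses naiveBayes() defaults. One common refinement is Laplace smoothing, which avoids zero conditional probabilities for terms that never co-occur with a class in training; e1071 exposes this through the laplace argument, but it only affects categorical predictors, so the sketch below pairs it with the hypothetical presence/absence encoding (dtm_binary) from the Feature Engineering section:

# Sketch: refit with Laplace smoothing on the presence/absence features
train_bin <- dtm_binary[train_indices, ]
test_bin <- dtm_binary[-train_indices, ]
nb_smoothed <- naiveBayes(label ~ ., data = train_bin, laplace = 1)
confusionMatrix(predict(nb_smoothed, newdata = test_bin), test_bin$label)$overall["Accuracy"]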

Cross-Validation

# Define train control for cross-validation
train_control <- trainControl(method = "cv", number = 5)

# Train Naive Bayes model with 5-fold cross-validation
# (caret's "naive_bayes" method requires the naivebayes package to be installed)
nb_tuned <- train(label ~ ., data = train_data, method = "naive_bayes", trControl = train_control)

# Make predictions and evaluate
nb_predictions <- predict(nb_tuned, newdata = test_data)
tuned_confusion_matrix <- confusionMatrix(nb_predictions, test_data$label)
cat("Cross-Validated Confusion Matrix:\n")
## Cross-Validated Confusion Matrix:
print(tuned_confusion_matrix$table)
##           Reference
## Prediction ham spam
##       ham  493    1
##       spam  17  278
cat("\nCross-Validated Metrics:\n")
## 
## Cross-Validated Metrics:
print(tuned_confusion_matrix$overall)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   9.771863e-01   9.507347e-01   9.641835e-01   9.864244e-01   6.463878e-01 
## AccuracyPValue  McnemarPValue 
##  1.093432e-118   4.069520e-04
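
The confusion matrix above evaluates the cross-validated model on the same held-out test set as the baseline, which is why the two tables match. The resampling estimates produced during cross-validation itself can be read directly off the caret object, as sketched below:

# Sketch: inspect the 5-fold cross-validation estimates (accuracy and kappa per fold)
print(nb_tuned$results)
print(nb_tuned$resample)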

Results and Discussion

Visualize Important Features

# Exclude label column
dtm_features <- dtm_data[, -ncol(dtm_data)]

# Calculate term frequencies
spam_terms <- colSums(as.matrix(dtm_features[dtm_data$label == "spam", ]))
ham_terms <- colSums(as.matrix(dtm_features[dtm_data$label == "ham", ]))

# Generate word cloud for spam (wordcloud() has no title argument, so add one with title())
wordcloud(names(spam_terms), spam_terms, max.words = 100, scale = c(3, 0.5), colors = "red")
title("Spam Words")

# Generate word cloud for ham
wordcloud(names(ham_terms), ham_terms, max.words = 100, scale = c(3, 0.5), colors = "blue")
title("Ham Words")
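
As a complement to the word clouds, the most frequent terms in each class can be listed directly from the term-frequency vectors computed above; a quick sketch:

# Sketch: top ten terms by frequency in each class
head(sort(spam_terms, decreasing = TRUE), 10)
head(sort(ham_terms, decreasing = TRUE), 10)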

Analyze Model Performance

# Additional metrics
cat("Additional Metrics:\n")
## Additional Metrics:
print(confusion_matrix$byClass)  # Precision, Recall, F1-score
##          Sensitivity          Specificity       Pos Pred Value 
##            0.9666667            0.9964158            0.9979757 
##       Neg Pred Value            Precision               Recall 
##            0.9423729            0.9979757            0.9666667 
##                   F1           Prevalence       Detection Rate 
##            0.9820717            0.6463878            0.6248416 
## Detection Prevalence    Balanced Accuracy 
##            0.6261090            0.9815412
  • Accuracy: 97.72%
  • Precision (ham, the positive class): 99.80%
  • Recall (ham): 96.67%
  • F1-score (ham): 0.98
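
Because "ham" sorts first alphabetically, caret treats it as the positive class, so the class-specific figures above describe ham. To report spam-oriented precision and recall directly, the positive class can be set explicitly; a short sketch:

# Sketch: recompute class-specific metrics with spam as the positive class
spam_cm <- confusionMatrix(predictions, test_data$label, positive = "spam")
spam_cm$byClass[c("Precision", "Recall", "F1")]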

Discussion

The model achieved high accuracy, driven largely by the preprocessing and feature-extraction steps described above. The cross-validated model produced the same test-set results as the baseline fit, so this comparison alone shows no sign of overfitting; however, because every evaluation uses a single split of one corpus, validation on external datasets is still needed before the model can be trusted more broadly.

Conclusion

This project demonstrated the ability to classify emails as spam or ham using a Naive Bayes classifier. Future improvements include testing with TF-IDF, n-grams, and external datasets to ensure robustness.
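
As a concrete starting point for that future work, TF-IDF weighting can be applied when the document-term matrix is built; the sketch below reuses the combined_corpus from the Feature Engineering section and tm's weightTfIdf, and is offered as an untested variant rather than part of the pipeline above:

# Sketch: rebuild the DTM with TF-IDF weighting instead of raw term counts
dtm_tfidf <- DocumentTermMatrix(combined_corpus, control = list(weighting = weightTfIdf))
dtm_tfidf <- removeSparseTerms(dtm_tfidf, 0.99)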