This project classifies emails as either spam or ham using a Naive Bayes classifier. The dataset consists of labeled emails, and the goal is to build a model with high accuracy and to validate its performance with additional metrics such as precision, recall, and F1-score.
# Load necessary libraries
if (!require(tm)) install.packages("tm")
if (!require(e1071)) install.packages("e1071")
if (!require(caret)) install.packages("caret")
if (!require(wordcloud)) install.packages("wordcloud")
if (!require(text2vec)) install.packages("text2vec")
if (!require(ROSE)) install.packages("ROSE")
library(tm)
library(e1071)
library(caret)
library(wordcloud)
library(text2vec)
library(ROSE)
# Define directories for spam and ham emails
spam_dir <- "/Users/aribarazzaq/Desktop/Project 4 Data 607/spam_2"
ham_dir <- "/Users/aribarazzaq/Desktop/Project 4 Data 607/easy_ham"
# Load spam and ham emails
spam_emails <- VCorpus(DirSource(spam_dir, encoding = "UTF-8"), readerControl = list(reader = readPlain))
ham_emails <- VCorpus(DirSource(ham_dir, encoding = "UTF-8"), readerControl = list(reader = readPlain))
# Check the structure of the corpus
length(spam_emails)
## [1] 1397
length(ham_emails)
## [1] 2551
# Function to clean invalid UTF-8 characters
fix_encoding <- function(corpus) {
  tm_map(corpus, content_transformer(function(x) {
    iconv(x, from = "UTF-8", to = "UTF-8", sub = "") # Replace invalid characters
  }))
}
# Apply encoding fix to both corpora
spam_emails <- fix_encoding(spam_emails)
ham_emails <- fix_encoding(ham_emails)
# Updated cleaning function
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(function(x) {
    gsub("[^[:print:]]", "", x)                           # Remove non-printable characters
  }))
  corpus <- tm_map(corpus, content_transformer(tolower))  # Convert to lowercase
  corpus <- tm_map(corpus, removePunctuation)             # Remove punctuation
  corpus <- tm_map(corpus, removeNumbers)                 # Remove numbers
  corpus <- tm_map(corpus, removeWords, stopwords("en"))  # Remove English stopwords
  corpus <- tm_map(corpus, stripWhitespace)               # Collapse extra whitespace
  return(corpus)
}
# Apply cleaning to both corpora
spam_emails <- clean_corpus(spam_emails)
ham_emails <- clean_corpus(ham_emails)
# Inspect a sample email (first 200 characters; collapse the lines so substr() returns a single snippet)
cat("Sample spam email content:\n", substr(paste(content(spam_emails[[1]]), collapse = " "), 1, 200), "...\n")
## Sample spam email content:
## ilugadminlinuxie tue aug returnpath ilugadminlinuxie deliveredto yyyylocalhostnetnoteinccom received localhost localhost phoboslabsnetnoteinccom postfix esmtp id efdd jmlocalhost tue aug edt received ...
# Combine corpora
combined_corpus <- c(spam_emails, ham_emails)
# Create Document-Term Matrix
dtm <- DocumentTermMatrix(combined_corpus)
# Remove sparse terms to reduce noise (drop terms absent from more than 99% of documents)
dtm <- removeSparseTerms(dtm, 0.99)
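This pruning sharply reduces the vocabulary; a quick dimension check (consistent with the data-frame structure printed below) confirms what is left:
# Confirm the size of the pruned matrix
dim(dtm)  # 3948 documents x 2271 retained terms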
# Convert DTM to a data frame
dtm_data <- as.data.frame(as.matrix(dtm))
# Add labels: the first length(spam_emails) rows are spam, the remaining rows are ham
dtm_data$label <- factor(c(rep("spam", length(spam_emails)), rep("ham", length(ham_emails))))
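As a quick sanity check, the label counts can be compared against the corpus sizes reported earlier (2551 ham and 1397 spam):
# Verify the label distribution matches the corpus sizes
table(dtm_data$label)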
# Display the structure of the data frame
cat("Structure of the DTM data frame:\n")
## Structure of the DTM data frame:
str(dtm_data, list.len = 5)
## 'data.frame': 3948 obs. of 2272 variables:
## $ ability : num 0 0 1 1 0 0 0 0 0 0 ...
## $ able : num 0 0 0 0 0 0 0 0 0 0 ...
## $ absolutely : num 0 0 1 1 0 0 0 0 0 0 ...
## $ abuse : num 0 0 0 0 0 0 0 0 0 0 ...
## $ accept : num 1 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
# Split data
set.seed(123)
train_indices <- createDataPartition(dtm_data$label, p = 0.8, list = FALSE)
train_data <- dtm_data[train_indices, ]
test_data <- dtm_data[-train_indices, ]
# Train the Naive Bayes model (e1071::naiveBayes models the numeric term counts as Gaussian features)
nb_model <- naiveBayes(label ~ ., data = train_data)
# Make predictions
predictions <- predict(nb_model, newdata = test_data)
# Evaluate performance
confusion_matrix <- confusionMatrix(predictions, test_data$label)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix$table)
## Reference
## Prediction ham spam
## ham 493 1
## spam 17 278
cat("\nMetrics:\n")
##
## Metrics:
print(confusion_matrix$overall)
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.771863e-01 9.507347e-01 9.641835e-01 9.864244e-01 6.463878e-01
## AccuracyPValue McnemarPValue
## 1.093432e-118 4.069520e-04
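As a sanity check, the reported accuracy can be recomputed directly from the confusion-matrix cells:
# Accuracy = (correctly predicted ham + correctly predicted spam) / all test emails
(493 + 278) / (493 + 1 + 17 + 278)  # 0.9771863, matching confusion_matrix$overall["Accuracy"]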
# Define train control for 5-fold cross-validation
train_control <- trainControl(method = "cv", number = 5)
# Train Naive Bayes with cross-validation (caret's "naive_bayes" method requires the naivebayes package)
nb_tuned <- train(label ~ ., data = train_data, method = "naive_bayes", trControl = train_control)
# Make predictions and evaluate
nb_predictions <- predict(nb_tuned, newdata = test_data)
tuned_confusion_matrix <- confusionMatrix(nb_predictions, test_data$label)
cat("Cross-Validated Confusion Matrix:\n")
## Cross-Validated Confusion Matrix:
print(tuned_confusion_matrix$table)
## Reference
## Prediction ham spam
## ham 493 1
## spam 17 278
cat("\nCross-Validated Metrics:\n")
##
## Cross-Validated Metrics:
print(tuned_confusion_matrix$overall)
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.771863e-01 9.507347e-01 9.641835e-01 9.864244e-01 6.463878e-01
## AccuracyPValue McnemarPValue
## 1.093432e-118 4.069520e-04
# Exclude label column
dtm_features <- dtm_data[, -ncol(dtm_data)]
# Calculate term frequencies
spam_terms <- colSums(as.matrix(dtm_features[dtm_data$label == "spam", ]))
ham_terms <- colSums(as.matrix(dtm_features[dtm_data$label == "ham", ]))
# Generate word cloud for spam (wordcloud() does not draw a title itself, so add one with title())
wordcloud(names(spam_terms), spam_terms, max.words = 100, scale = c(3, 0.5), colors = "red")
title("Spam Words")
# Generate word cloud for ham
wordcloud(names(ham_terms), ham_terms, max.words = 100, scale = c(3, 0.5), colors = "blue")
title("Ham Words")
# Additional metrics
cat("Additional Metrics:\n")
## Additional Metrics:
print(confusion_matrix$byClass) # Sensitivity, specificity, precision, recall, F1 (positive class is "ham")
## Sensitivity Specificity Pos Pred Value
## 0.9666667 0.9964158 0.9979757
## Neg Pred Value Precision Recall
## 0.9423729 0.9979757 0.9666667
## F1 Prevalence Detection Rate
## 0.9820717 0.6463878 0.6248416
## Detection Prevalence Balanced Accuracy
## 0.6261090 0.9815412
The model achieved high accuracy (about 97.7%) on the held-out test set, helped by the preprocessing and sparse-term pruning steps. The cross-validated model produced an identical confusion matrix on the same test set, so there is no direct sign of overfitting here, but evaluation on external datasets is still needed to confirm the model generalizes beyond this corpus.
This project demonstrated the ability to classify emails as spam or ham using a Naive Bayes classifier. Future improvements include testing TF-IDF weighting and n-gram features (sketched below) and evaluating on external datasets to ensure robustness.
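A minimal sketch of those two extensions, assuming the combined_corpus object built above; the variable names dtm_tfidf, texts, and dtm_ngram are illustrative and not part of the original pipeline:
# TF-IDF-weighted document-term matrix with tm
dtm_tfidf <- DocumentTermMatrix(combined_corpus, control = list(weighting = weightTfIdf))
dtm_tfidf <- removeSparseTerms(dtm_tfidf, 0.99)

# Unigram + bigram features with text2vec (loaded earlier but unused above)
texts <- sapply(combined_corpus, function(d) paste(content(d), collapse = " "))
it <- itoken(texts, progressbar = FALSE)
vocab <- create_vocabulary(it, ngram = c(1L, 2L))            # unigrams and bigrams
vocab <- prune_vocabulary(vocab, doc_proportion_min = 0.01)  # keep terms in at least 1% of documents
vectorizer <- vocab_vectorizer(vocab)
dtm_ngram <- create_dtm(it, vectorizer)
Either matrix could then replace the count-based dtm above before the train/test split and model fitting.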