Required packages and libraries:
options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages(c("tm", "caret", "e1071", "stringr"))
library(tm)
## Loading required package: NLP
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Loading required package: lattice
library(e1071)
library(stringr)
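Calling install.packages() on every knit re-downloads the packages and can fail with "Permission denied" warnings on Windows when the packages are already loaded. An optional alternative (a sketch, not required for the analysis) installs only what is missing before loading:

# Install only the packages that are not already available, then load them all
pkgs <- c("tm", "caret", "e1071", "stringr")
missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing) > 0) install.packages(missing)
invisible(lapply(pkgs, library, character.only = TRUE))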
Loading and Labeling the Emails:
# Set paths to spam and ham folders
spam_dir <- "C:/Users/zahid/OneDrive/Desktop/Data Science/Project 607/Project 4/spam"
ham_dir <- "C:/Users/zahid/OneDrive/Desktop/Data Science/Project 607/Project 4/easy_ham"
# Read files
read_emails <- function(dir, label) {
  files <- list.files(dir, full.names = TRUE)
  texts <- sapply(files, readLines, warn = FALSE)
  data.frame(text = sapply(texts, paste, collapse = " "), label = label, stringsAsFactors = FALSE)
}
spam_data <- read_emails(spam_dir, "spam")
ham_data <- read_emails(ham_dir, "ham")
# Combine datasets
emails <- rbind(spam_data, ham_data)
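Before modeling, it is worth checking how many emails fall into each class, since the spam/ham balance matters when interpreting the confusion matrix later (the exact counts depend on which SpamAssassin snapshot is in the folders):

# Check the class balance of the combined data set
table(emails$label)
prop.table(table(emails$label))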
Text Preprocessing:
# Convert to UTF-8 to avoid invalid multibyte characters
emails$text <- iconv(emails$text, from = "", to = "UTF-8", sub = "byte")
# Create a corpus
corpus <- Corpus(VectorSource(emails$text))
# Clean text
corpus_clean <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus_clean <- tm_map(corpus_clean, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus_clean, removePunctuation): transformation
## drops documents
corpus_clean <- tm_map(corpus_clean, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus_clean, removeNumbers): transformation
## drops documents
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords("en"))
## Warning in tm_map.SimpleCorpus(corpus_clean, removeWords, stopwords("en")):
## transformation drops documents
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus_clean, stripWhitespace): transformation
## drops documents
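The repeated "transformation drops documents" warnings come from tm_map() being applied to a SimpleCorpus, which Corpus() creates for a VectorSource; they are harmless here. If desired, they can be avoided by building a VCorpus instead, leaving the rest of the cleaning chain unchanged:

# Optional: a VCorpus does not emit the "transformation drops documents" warning
corpus <- VCorpus(VectorSource(emails$text))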
# Create document-term matrix
dtm <- DocumentTermMatrix(corpus_clean)
# Keep frequent terms
dtm <- removeSparseTerms(dtm, 0.99)
# Convert to data frame
email_dtm <- as.data.frame(as.matrix(dtm))
email_dtm$label <- as.factor(emails$label)
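An optional sanity check on the reduced matrix: its dimensions and a few of the most frequent terms.

# Documents x terms retained after removeSparseTerms()
dim(dtm)
# Terms that appear at least 100 times across the corpus
findFreqTerms(dtm, lowfreq = 100)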
Split into Training and Test Sets:
set.seed(123)
train_index <- createDataPartition(email_dtm$label, p = 0.8, list = FALSE)
train_data <- email_dtm[train_index, ]
test_data <- email_dtm[-train_index, ]
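createDataPartition() samples within each class, so the spam/ham proportions should be roughly the same in both splits; this can be verified directly:

# Stratified split: label proportions should match across train and test
prop.table(table(train_data$label))
prop.table(table(test_data$label))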
Train a Model (Naive Bayes Example):
model <- naiveBayes(label ~ ., data = train_data)
predictions <- predict(model, newdata = test_data)
confusionMatrix(predictions, test_data$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ham spam
## ham 37 0
## spam 463 100
##
## Accuracy : 0.2283
## 95% CI : (0.1953, 0.264)
## No Information Rate : 0.8333
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0259
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.07400
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.17762
## Prevalence : 0.83333
## Detection Rate : 0.06167
## Detection Prevalence : 0.06167
## Balanced Accuracy : 0.53700
##
## 'Positive' Class : ham
##
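To make the summary statistics concrete, the headline numbers can be recomputed by hand from the four cells of the table above (ham is the positive class):

# Cells of the confusion matrix: rows are predictions, columns the reference
tp <- 37   # ham predicted as ham
fp <- 0    # spam predicted as ham
fn <- 463  # ham predicted as spam
tn <- 100  # spam predicted as spam
(tp + tn) / (tp + fp + fn + tn)        # accuracy    = 137 / 600 ~ 0.228
tp / (tp + fn)                         # sensitivity = 37 / 500  = 0.074
tn / (tn + fp)                         # specificity = 100 / 100 = 1.000
(tp / (tp + fn) + tn / (tn + fp)) / 2  # balanced accuracy       = 0.537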
The key metrics show very poor performance: only 22.8% of predictions were correct, far below the no-information rate of 83.3% that would be achieved by always predicting ham. Sensitivity is 0.074, so only 7.4% of actual ham emails were identified correctly, while specificity is 1.00, meaning every spam email was caught. The Kappa of 0.026 indicates almost no agreement between predicted and true classes beyond chance, and the balanced accuracy of 0.537 (the average of sensitivity and specificity) is barely better than guessing because the model fails to detect ham. In short, the model is heavily biased toward predicting spam even though ham is the majority class (about 83% of the test set), so an imbalance toward spam cannot be the explanation; a more likely contributor is that naiveBayes() treats the raw term counts in the document-term matrix as continuous (Gaussian) predictors, which fits sparse text counts poorly (see the sketch after the comment below).
Comment: The model correctly identified 37 ham emails and incorrectly predicted 463 ham emails as spam. It correctly identified all 100 spam emails and misclassified none of them as ham.
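One common remedy worth trying (a sketch, not run here): convert the term counts to categorical presence/absence features so that naiveBayes() treats each term as a discrete predictor instead of fitting a Gaussian to sparse counts. The object names follow the chunks above.

# Convert counts to "Yes"/"No" presence indicators for every term column
convert_counts <- function(x) factor(ifelse(x > 0, "Yes", "No"), levels = c("No", "Yes"))
term_cols <- setdiff(names(train_data), "label")
train_x <- as.data.frame(lapply(train_data[, term_cols], convert_counts))
test_x <- as.data.frame(lapply(test_data[, term_cols], convert_counts))
# Refit with Laplace smoothing and evaluate on the same test split
model_bin <- naiveBayes(train_x, train_data$label, laplace = 1)
pred_bin <- predict(model_bin, test_x)
confusionMatrix(pred_bin, test_data$label)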