library(tidyverse)
library(tidytext)
library(tidymodels)
library(textrecipes)
library(discrim)
library(themis)
library(tm)
library(wordcloud)
library(wordcloud2)
library(RColorBrewer)
MA_Data607_Project4
Approach
First, I need to read the ham and spam files into a data frame. I will combine the spam and ham data sets and then begin preprocessing the text. I plan to use a Naive Bayes model because my reading and research show that it is a classic model for spam classification. Because I plan to use this model, I need to separate the files into words and count the number of times each word appears in the ham and spam files.
Challenges
One major challenge will be fine-tuning the preprocessing steps — deciding which tokens to keep, how aggressively to filter rare words, and how to handle the class imbalance between spam and ham. Removing noise such as punctuation, HTML artifacts, and email headers is also an important preprocessing step that requires careful handling.
Packages
Reading the Data
# Load every file in a folder into a tibble tagged with a class label.
#
# Arguments:
#   folder_path  Directory that holds one document per file.
#   label        Value stored in the `label` column of every returned row.
#
# Returns a tibble with one row per file and the columns `filepath`,
# `filename`, `label`, and `text` (the whole file read as a single string,
# decoded as latin1).
load_label <- function(folder_path, label) {
  paths <- list.files(folder_path, full.names = TRUE)
  # A non-recursive list.files() also returns subdirectories, which
  # read_file() cannot read, so keep regular files only.
  paths <- paths[!dir.exists(paths)]

  tibble(filepath = paths) |>
    mutate(
      filename = basename(filepath),
      label = label,
      text = map_chr(
        filepath,
        \(f) read_file(f, locale = locale(encoding = "latin1"))
      )
    )
}
# Read the spam and ham folders, stack them, and derive the outcome columns:
# `label` as a factor (ham first, spam second) and `label_encoded` as 0/1
spam_ham_df <- list(
  load_label("C:/Users/typem/Documents/GitHub/spam", "spam"),
  load_label("C:/Users/typem/Documents/GitHub/easy_ham", "ham")
) |>
  bind_rows() |>
  mutate(
    label = factor(label, levels = c("ham", "spam")),
    label_encoded = as.integer(label == "spam"),
    .after = label
  ) |>
  select(!filepath)
spam_ham_df |> count(label)
# A tibble: 2 × 2
label n
<fct> <int>
1 ham 2551
2 spam 501
# Check for empty files: list any rows whose text has zero characters
filter(spam_ham_df, nchar(text) == 0)
# A tibble: 0 × 4
# ℹ 4 variables: filename <chr>, label <fct>, label_encoded <int>, text <chr>
# Drop any file whose contents begin with "mv " (presumably a shell command
# listing shipped with the corpus rather than an email; verify against the
# data folder), then re-check the totals
spam_ham_df <- filter(
  spam_ham_df,
  str_detect(text, "^mv ", negate = TRUE)
)
nrow(spam_ham_df)
[1] 3051
spam_ham_df |> count(label)
# A tibble: 2 × 2
label n
<fct> <int>
1 ham 2551
2 spam 500
Creating a Corpus
The corpus is built from the cleaned dataframe using the tm package. It is used here for exploratory visualization via a Document Term Matrix.
# Normalise the raw email text before tokenisation: lower-case it, strip HTML
# tags, URLs, email addresses, punctuation, and digits, then collapse
# whitespace.
# Tags, URLs, and addresses are replaced with a space rather than deleted so
# that the words on either side are not glued into a single token
# (e.g. "free<br>offer" becomes "free offer", not "freeoffer"); the final
# str_squish() removes the extra spaces this introduces.
spam_ham_df <- spam_ham_df |>
  mutate(
    text = str_to_lower(text),
    # `[^>]*` also spans line breaks, so tags wrapped over several lines are
    # removed as well; `.` does not match a newline by default
    text = str_replace_all(text, "<[^>]*>", " "),
    text = str_replace_all(text, "https?://\\S+", " "),
    text = str_replace_all(text, "\\S+@\\S+\\.\\S+", " "),
    text = str_remove_all(text, "[^\\w\\s]"),
    text = str_remove_all(text, "\\d+"),
    text = str_squish(text)
  )
# Build a tm corpus with one document per cleaned email, then drop English
# stop words, stem the remaining words, and collapse repeated whitespace
corpus <- VCorpus(VectorSource(spam_ham_df$text)) |>
  tm_map(removeWords, stopwords("english")) |>
  tm_map(stemDocument) |>
  tm_map(stripWhitespace)

# Document-term matrix, with terms above the 0.99 sparsity limit removed
dtm <- corpus |>
  DocumentTermMatrix() |>
  removeSparseTerms(0.99)
dim(dtm)
[1] 3051 1856
Exploratory Data Analysis
# Wordcloud for spam emails
# Word counts for the spam class, excluding stop words, tokens that contain
# a digit, and tokens of one or two characters
spam_words <- spam_ham_df |>
  filter(label == "spam") |>
  unnest_tokens(word, text) |>
  filter(
    !word %in% stop_words$word,
    !str_detect(word, "\\d"),
    nchar(word) > 2
  ) |>
  count(word, sort = TRUE)

# Plot at most the 50 most frequent words
with(
  spam_words,
  wordcloud(
    words = word,
    freq = n,
    max.words = 50,
    colors = brewer.pal(8, "Accent"),
    scale = c(3, 0.5)
  )
)
# Wordcloud for ham emails
# Word counts for the ham class, excluding stop words, tokens that contain
# a digit, and tokens of one or two characters
ham_words <- spam_ham_df |>
  filter(label == "ham") |>
  unnest_tokens(word, text) |>
  filter(
    !word %in% stop_words$word,
    !str_detect(word, "\\d"),
    nchar(word) > 2
  ) |>
  count(word, sort = TRUE)

# Plot at most the 50 most frequent words
with(
  ham_words,
  wordcloud(
    words = word,
    freq = n,
    max.words = 50,
    colors = brewer.pal(8, "Paired"),
    scale = c(3, 0.5)
  )
)
Train/Test Split
# Fix the RNG so the partition is reproducible, then make an 80/20 split
# stratified on the class label
set.seed(1234)
split <- spam_ham_df |>
  initial_split(prop = 0.8, strata = label)
train_df <- split |> training()
test_df <- split |> testing()
train_df |> count(label)
# A tibble: 2 × 2
label n
<fct> <int>
1 ham 2040
2 spam 400
# Class balance of the held-out test set
test_df |> count(label)
# A tibble: 2 × 2
label n
<fct> <int>
1 ham 511
2 spam 100
# Preprocessing recipe: predict `label` from the email text in `train_df`
nb_recipe <-
  recipe(label ~ text, data = train_df) |>
  # Split the text into tokens, drop stop words, and stem what remains
  step_tokenize(text) |>
  step_stopwords(text) |>
  step_stem(text) |>
  # Keep at most 2000 tokens, each seen at least 5 times
  step_tokenfilter(text, min_times = 5, max_tokens = 2000) |>
  # One term-frequency column per retained token
  step_tf(text) |>
  # Upsample the minority class to a 1:1 ratio with the majority class
  step_upsample(label, over_ratio = 1)
Model
# Naive Bayes classification model using the naivebayes engine
nb_model <- naive_Bayes(mode = "classification", engine = "naivebayes")

# Bundle the recipe and the model so they are fitted together
nb_workflow <- workflow(preprocessor = nb_recipe, spec = nb_model)

# Fit the whole workflow on the training split
nb_fit <- fit(nb_workflow, data = train_df)
Evaluation
# Add the model's predictions to the test rows
nb_preds <- nb_fit |> augment(new_data = test_df)

# Metrics to report together: F1, precision, recall, and ROC AUC
class_metrics <- metric_set(f_meas, precision, recall, roc_auc)

# Evaluate
# "spam" is the second level of `label`, so it is scored as the event
class_metrics(
  nb_preds,
  truth = label,
  estimate = .pred_class,
  .pred_spam,
  event_level = "second"
)
# A tibble: 4 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 f_meas binary 0.131
2 precision binary 1
3 recall binary 0.07
4 roc_auc binary 0.984
# Confusion matrix heat map
# Cross-tabulate true against predicted labels and draw the table as a heat map
autoplot(
  conf_mat(nb_preds, truth = label, estimate = .pred_class),
  type = "heatmap"
)
Conclusion
This project built a Naive Bayes spam classifier using a tidymodels pipeline. After loading and cleaning the raw email text files, the data was preprocessed by removing HTML tags, URLs, punctuation, numbers, and stop words, followed by stemming and tokenization. Class imbalance between ham and spam was addressed using upsampling in the recipe. Even after upsampling, there is a clear imbalance in the results, likely due to the sample size difference between ham and spam.
The model was evaluated using F1 score, precision, recall, and ROC-AUC rather than accuracy, since the dataset is imbalanced and accuracy alone would be misleading. The confusion matrix shows the false positives (ham flagged as spam, of which there were zero) and the false negatives (spam that slipped through, of which there were unfortunately 93), which are the most important errors in a real spam filter. If I had more time to fix this, I would probably try finding a bigger sample of spam data.