Project 4

Below, I load, clean, and tokenize spam and ham corpuses. I use word clouds, LDA, and histograms to understand typical document length and language. I then use the Quanteda package to create a prediction model using the Naive Bayes classifier.

Reading the files from my local drive:

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readtext)
library(tidytext)
library(textdata)
library(wordcloud)

Loading required package: RColorBrewer

library(tidyr)
library(reshape2)


Attaching package: 'reshape2'

The following object is masked from 'package:tidyr':

    smiths

#load the spam files from my local drive
spam_df <- readtext("C:/Users/Sam Barbaro/Desktop/607 - data warehousing/spamham/20021010_spam/spam")

spam_df <- spam_df |>
  mutate(label = "spam")

spam <- spam_df |>
  select(text, label) |>
  mutate(email_id = row_number())

#loading the ham files

ham_df <- readtext("C:/Users/Sam Barbaro/Desktop/607 - data warehousing/spamham/20030228_easy_ham/easy_ham")

ham_df <- ham_df |>
  mutate(label = "ham")

ham <- ham_df |>
  select(text, label) |>
  mutate(email_id = row_number())

Unnesting the text

tidy_spam <- spam %>%
  group_by(email_id) %>%
  mutate(
    linenumber = row_number(),
    email_id = cur_group_id() 
  ) %>%
  ungroup() %>%
    unnest_tokens(word, text)

tidy_ham <- ham %>%
  group_by(email_id) %>%
  mutate(
    linenumber = row_number(),
    email_id = cur_group_id() 
  ) %>%
  ungroup() %>%
    unnest_tokens(word, text)

Word cloud

What are some of the most common words in spam and ham emails?

tidy_spam %>%
  anti_join(stop_words, by = join_by(word)) %>%
  count(word) %>%
  with(wordcloud::wordcloud(word, n, max.words = 100))

tidy_ham %>%
  anti_join(stop_words, by = join_by(word)) %>%
  count(word) %>%
  with(wordcloud::wordcloud(word, n, max.words = 100))

This does suggest that I need to get rid of a few more common words beyond stop words (like “from” and “zzz”).

Join dataframes

library(topicmodels)

Warning: package 'topicmodels' was built under R version 4.5.3

library(tm)

Warning: package 'tm' was built under R version 4.5.3

Loading required package: NLP

Warning: package 'NLP' was built under R version 4.5.2


Attaching package: 'NLP'

The following object is masked from 'package:ggplot2':

    annotate

#combine data frames and remove words that are just numbers
#which occupied the first ~5,000 rows the first time I did this 
#also removing stop words
combined_tidy <- rbind(tidy_ham, tidy_spam) %>%
  anti_join(stop_words, by = join_by(word)) %>%
  filter(!str_detect(word, "[0-9]"))

    
tidy_counts <- combined_tidy %>%
  count(email_id, word)

dtm <- tidy_counts %>%
  cast_dtm(email_id, word, n)

# Run LDA to create a two-topic LDA model (span and ham)
spam_lda <- LDA(dtm, k = 2, control = list(seed = 1234))

Word-topic probabilities

spam_topics <- tidy(spam_lda, matrix = "beta")
spam_topics

# A tibble: 79,578 × 3
   topic term                                                beta
   <int> <chr>                                              <dbl>
 1     1 _______________________________________________ 0.000498
 2     2 _______________________________________________ 0.00124 
 3     1 admin                                           0.00379 
 4     2 admin                                           0.0105  
 5     1 ago                                             0.000181
 6     2 ago                                             0.000224
 7     1 archive                                         0.000765
 8     2 archive                                         0.00264 
 9     1 ascii                                           0.000845
10     2 ascii                                           0.00191 
# ℹ 79,568 more rows

Graphing the top terms by topic

#2 is ham and 1 is spam

spam_top_terms <- spam_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>% 
  ungroup() %>%
  arrange(topic, -beta)

spam_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

This doesn’t give us a ton more than the word cloud did.

Analyzing email length

set.seed(100)

#taking a sample because this takes a long time
unique_ids <- unique(tidy_spam$email_id)

sampled_ids <- sample(unique_ids, 0.4 * length(unique_ids))

#plotting histograms

tidy_spam |> filter(email_id %in% sampled_ids) |>
  count(email_id, name = "total_words") |>
  ggplot(aes(x = total_words)) +
  geom_histogram() + 
  labs(Title = "Spam Email Length Frequency")

Ignoring unknown labels:
• Title : "Spam Email Length Frequency"
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.

unique_ids <- unique(tidy_ham$email_id)

sampled_ids <- sample(unique_ids, 0.4 * length(unique_ids))

tidy_ham |> filter(email_id %in% sampled_ids) |>
  count(email_id, name = "total_words") |>
  ggplot(aes(x = total_words)) +
  geom_histogram() +
  labs(Title = "Ham Email Length Frequency")

Ignoring unknown labels:
• Title : "Ham Email Length Frequency"
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.

#mean word counts
spam_counts <- tidy_spam |>
  count(email_id, name = "total_words") 

spam_counts |> summarise(mean(total_words))

# A tibble: 1 × 1
  `mean(total_words)`
                <dbl>
1                862.

ham_counts <- tidy_ham |>
  count(email_id, name = "total_words") 

ham_counts |> summarise(mean(total_words))

# A tibble: 1 × 1
  `mean(total_words)`
                <dbl>
1                494.

On average, Spam emails tend to be longer.

Let’s build a model.

Quanteda Naive Bayes classifier

This Quanteda tutorial uses movie review data in a format similar to the spam/ham corpus. I’ve modified it below to only use a portion of the data.

library(caret)

Warning: package 'caret' was built under R version 4.5.2

Loading required package: lattice


Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

library(quanteda)

Warning: package 'quanteda' was built under R version 4.5.3

Package version: 4.4
Unicode version: 15.1
ICU version: 74.1

Parallel computing: 16 of 16 threads used.

See https://quanteda.io for tutorials and examples.


Attaching package: 'quanteda'

The following object is masked from 'package:tm':

    stopwords

The following objects are masked from 'package:NLP':

    meta, meta<-

The following object is masked from 'package:readtext':

    texts

library(quanteda.textmodels)

Warning: package 'quanteda.textmodels' was built under R version 4.5.3

#data prep
#combined spam_df and ham_df (these have the full text in a single cell)
#create a new email id so everything has its own id once tokenized

spam_ham <- rbind(spam_df, ham_df)

spam_ham <- spam_ham |>
  select(text, label) |>
  mutate(email_id = row_number())

#format as corpus (part of the quanteda package)
spam_corpus <- corpus(spam_ham, text_field = "text")

#this data set is huge, so we'll use about 10% to test 
#generate 300 numbers without replacement (10% of the data)
set.seed(300)
id_train <- sample(1:3000, 300, replace = FALSE)
head(id_train, 10)

 [1] 2638  874  789  553 1875 1705  272  461 2828 1383

#separate set of 100 numbers for test set, so we don't test on training data
remaining_ids <- setdiff(1:3000, id_train)


id_test <- sample(remaining_ids, 100, replace = FALSE)


spam_corpus$email_id <- 1:ndoc(spam_corpus)

# tokenize texts
toks_spam <- tokens(spam_corpus, remove_punct = TRUE, remove_number = TRUE) %>% 
               tokens_remove(pattern = stopwords("en")) %>% 
               tokens_wordstem()

dfmt_spam <- dfm(toks_spam)

# get training set
spam_training <- dfm_subset(dfmt_spam, email_id %in% id_train)

# get test set 
spam_test <- dfm_subset(dfmt_spam, email_id %in% id_test)

Train the model:

tmod_nb <- textmodel_nb(spam_training, spam_training$label)

summary(tmod_nb)


Call:
textmodel_nb.dfm(x = spam_training, y = spam_training$label)

Class Priors:
(showing first 2 elements)
 ham spam 
 0.5  0.5 

Estimated Feature Scores:
            mv bfc8d64d12b325ff385cca8d07b84288
ham  6.117e-06                        6.117e-06
spam 8.566e-06                        8.566e-06
     00010.7f5fb525755c45eb78efc18d7c9ea5aa c60d1c697136b07c947fa180ba3e0441
ham                               6.117e-06                        6.117e-06
spam                              8.566e-06                        8.566e-06
     00101.2dfd7ee79ae439b8d9c38e783a137efa
ham                               6.117e-06
spam                              8.566e-06
     00102.2e3969075728dde7a328e05d19b35976
ham                               6.117e-06
spam                              8.566e-06
     00103.8c39bfed2079f865e9dfb75f4416a468
ham                               6.117e-06
spam                              8.566e-06
     00104.886f4a22362f4d3528c3e675878f17f7
ham                               6.117e-06
spam                              8.566e-06
     00105.9790e1c57fcbf7885b7cd1719fb4681b fa6df8609cebb6f0f37aec3f70aa5b9a
ham                               6.117e-06                        6.117e-06
spam                              8.566e-06                        8.566e-06
     f1d4194b57840ea6587b9a73ed88e075 00108.4506c2ef846b80b9a7beb90315b22701
ham                         6.117e-06                              6.117e-06
spam                        8.566e-06                              8.566e-06
     00109.601a9cd8272f22236b27e95dbe2fa22d
ham                               6.117e-06
spam                              8.566e-06
     00011.2a1247254a535bac29c476b86c708901
ham                               6.117e-06
spam                              8.566e-06
     00110.20934dc65c9a88fc9c6afda9952ce2c5 a163d41592b3a52747d7521341a961af
ham                               6.117e-06                        6.117e-06
spam                              8.566e-06                        8.566e-06
     ec411d26d1f4decc16af7ef73e69a227 ff113297f0ed07536d288c7b2193a8ec
ham                         6.117e-06                        6.117e-06
spam                        8.566e-06                        8.566e-06
     c104ada3a249e1e1846c0cd156a303e9 d7c257361675ee5d45baa552205fb472
ham                         6.117e-06                        6.117e-06
spam                        8.566e-06                        8.566e-06
     00116.8e13644b995f98dbab198b71e26f67ec
ham                               6.117e-06
spam                              8.566e-06
     00117.33011fddf61efe5f453a14468ff7e629
ham                               6.117e-06
spam                              8.566e-06
     00118.4be8b50c2a818c62b62e70c4b5456113
ham                               6.117e-06
spam                              8.566e-06
     00119.07aedc59172c0c25ef617188ada9b80f
ham                               6.117e-06
spam                              8.566e-06
     00012.7bc8e619ad0264979edce15083e70a02
ham                               6.117e-06
spam                              8.566e-06
     00120.4312b48b82c3d018d2d4ccf5b8e9c167
ham                               6.117e-06
spam                              8.566e-06
     00121.772c3ccd1b6c1a2e0e2ec0356082c77b
ham                               6.117e-06
spam                              8.566e-06
     00122.21b041c1ad2be417102d7f5d3f0b7045
ham                               6.117e-06
spam                              8.566e-06
     00123.68e87f8b736959b1ab5c4b5f2ce7484a
ham                               6.117e-06
spam                              8.566e-06
     00124.37afd066a74d18b7f14bea0b1fb43d4d
ham                               6.117e-06
spam                              8.566e-06

#test the model
dfmat_matched <- dfm_match(spam_test, features = featnames(spam_training))

#inspect how well the classification worked.
#Also, the point at which I stopped changing the labels from the sample code

actual_class <- dfmat_matched$label
predicted_class <- predict(tmod_nb, newdata = dfmat_matched)

#the mini confusion matrix
tab_class <- table(actual_class, predicted_class)
tab_class

            predicted_class
actual_class ham spam
        ham   79    2
        spam   8   11

Confusion matrix (positive class is ham):

confusionMatrix(tab_class)

Confusion Matrix and Statistics

            predicted_class
actual_class ham spam
        ham   79    2
        spam   8   11
                                         
               Accuracy : 0.9            
                 95% CI : (0.8238, 0.951)
    No Information Rate : 0.87           
    P-Value [Acc > NIR] : 0.2337         
                                         
                  Kappa : 0.6305         
                                         
 Mcnemar's Test P-Value : 0.1138         
                                         
            Sensitivity : 0.9080         
            Specificity : 0.8462         
         Pos Pred Value : 0.9753         
         Neg Pred Value : 0.5789         
             Prevalence : 0.8700         
         Detection Rate : 0.7900         
   Detection Prevalence : 0.8100         
      Balanced Accuracy : 0.8771         
                                         
       'Positive' Class : ham

confusionMatrix(tab_class, mode = "everything")

Confusion Matrix and Statistics

            predicted_class
actual_class ham spam
        ham   79    2
        spam   8   11
                                         
               Accuracy : 0.9            
                 95% CI : (0.8238, 0.951)
    No Information Rate : 0.87           
    P-Value [Acc > NIR] : 0.2337         
                                         
                  Kappa : 0.6305         
                                         
 Mcnemar's Test P-Value : 0.1138         
                                         
            Sensitivity : 0.9080         
            Specificity : 0.8462         
         Pos Pred Value : 0.9753         
         Neg Pred Value : 0.5789         
              Precision : 0.9753         
                 Recall : 0.9080         
                     F1 : 0.9405         
             Prevalence : 0.8700         
         Detection Rate : 0.7900         
   Detection Prevalence : 0.8100         
      Balanced Accuracy : 0.8771         
                                         
       'Positive' Class : ham

90% accuracy, 79 true positives, 8 false negatives, 2 false positives, and 11 true negatives. As a spam filter, 8 non-spam emails are going into the spam folder and 2 spam messages sneak through. No one wants 9 legitimate emails out of 87 to go in their spam filter. I wonder if the results woud be better if the sample size were larger.

In retrospect, I could have taken other steps to prep the data. I didn’t filter out numbers, short words (though these are filtered by stop words), or nonsense strings, which appear frequently in older email messages.

Sources

Silge, J., & Robinson, D. (2017). Text mining with R: A tidy approach. O’Reilly Media.

Watanabe, K & Müller, S. Quanteda Tutorials. https://tutorials.quanteda.io/machine-learning/nb/.

Google Gemini. (2026). Gemini 3 Flash [Large language model].https://gemini.google.com. Accessed May 1, 2026.