Introduction

Working with test document data from ‘Spam Assassin’, this project will classify documents whether it is spam or non-spam email. A training and validation set will be created and pushed into a decision tree and logistic regression model.

Required Libraries

library(tidyverse)
library(tidytext)
library(purrr)
library(rpart)
library(rpart.plot)

Data Prep Non-Spam Emails

Import Non-Spam Emails

Importing Multiple Document files was assisted with the following code:

‘Multiple Text Files’, ‘Unpacking Lists’

# Create list of text files
txt_files_ls = list.files(path="ham", pattern="*") 

# Read the files in
read_files_list <- lapply(paste0('ham/',txt_files_ls), readLines)

# Transform from lists to Data Frame
non_spam_df <- map_df(read_files_list, ~as.data.frame(.x), .id="email") |> 
  rename(text = ".x")

knitr::kable(head(non_spam_df))
email text
1 From Thu Aug 22 12:36:23 2002
1 Return-Path:
1 Delivered-To:
1 Received: from localhost (localhost [127.0.0.1])
1 by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36
1 for ; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)

Tokenize

Tokenize words from spam email and select words that provoke either positive or negative sentiment. This removes filler words such as ‘the’ or punctuations.

tokenize_non_spam <- 
  non_spam_df |> 
  unnest_tokens(word, text) |> 
  mutate(word = str_split(word, "[:punct:]")) |> 
  unnest(word) |> 
  inner_join(get_sentiments("bing"))

knitr::kable(head(tokenize_non_spam))
email word sentiment
1 errors negative
1 error negative
1 like positive
1 fail negative
1 error negative
1 mar negative

Frequency

Determine frequency usage per non-spam email. Due to local storage and processing constraints for the models, I will be selecting the top 50 words found.

non_spam_freq <- 
  tokenize_non_spam |> 
  group_by(email, word) |> 
  summarise(count = n(), .groups = 'drop') |>  # determine freq/word
  group_by(email) |> 
  mutate(pct_freq = (count / sum(count))) |> 
  mutate(is_blank_node = ifelse(word == "" | str_detect(word, '^\\s*$'), 1, 0)) |> 
  filter(is_blank_node != 1) |> 
  group_by(word) |> 
  mutate(total_count = n()) |> 
  ungroup() |> 
  mutate(rank = dense_rank(desc(total_count))) |> 
  filter(rank <= 50) |> 
  select(!c(count,is_blank_node, total_count, rank))

knitr::kable(head(non_spam_freq))
email word pct_freq
1 error 0.2500
1 errors 0.1250
1 issue 0.1250
1 like 0.1250
1 works 0.1250
10 available 0.0125

Pivot

Pivot Words to Wide Format to Evaluate in Models

non_spam_freq <- 
  non_spam_freq |> 
  pivot_wider(names_from = word, values_from = pct_freq, values_fill = 0) |> 
  mutate(email = 'non_spam')

Data Prep Import Non-Spam Emails

Import Spam Emails

# Create list of text files
txt_files_ls = list.files(path="spam", pattern="*") 

# Read the files in
read_files_list <- lapply(paste0('spam/',txt_files_ls), readLines)

# Transform from lists to Data Frame
spam_df <- map_df(read_files_list, ~as.data.frame(.x), .id="email") |> 
  rename(text = ".x")

Tokenize

Tokenize words from spam email and select words that provoke either positive or negative sentiment. This removes filler words such as ‘the’ or punctuations.

tokenize_spam <- 
  spam_df |> 
  unnest_tokens(word, text) |> 
  mutate(word = str_split(word, "[:punct:]")) |> 
  unnest(word) |> 
  inner_join(get_sentiments("bing"))

Frequency

Determine frequency usage per spam email. Due to local storage and processing constraints for the models, I will be selecting the top 50 words found.

spam_freq <- 
  tokenize_spam |> 
  group_by(email, word) |> 
  summarise(count = n(), .groups = 'drop') |>  # determine freq/word
  group_by(email) |> 
  mutate(pct_freq = (count / sum(count))) |> 
  mutate(is_blank_node = ifelse(word == "" | str_detect(word, '^\\s*$'), 1, 0)) |> 
  filter(is_blank_node != 1) |> 
  group_by(word) |> 
  mutate(total_count = n()) |> 
  ungroup() |> 
  mutate(rank = dense_rank(desc(total_count))) |> 
  filter(rank <= 50) |> 
  select(!c(count,is_blank_node, total_count, rank))

Pivot

Pivot Words to Wide Format to Evaluate in Models

spam_freq <- 
  spam_freq |> 
  pivot_wider(names_from = word, values_from = pct_freq, values_fill = 0) |> 
  mutate(email = 'spam')

Export Data to CSV

spam_ham_df <- 
  non_spam_freq |> 
  bind_rows(spam_freq) |> 
  mutate(across(everything(), ~replace_na(.x, 0))) |> 
  mutate(email = as.factor(email))

write_csv(spam_ham_df, 'spam_ham.csv')

knitr::kable(head(spam_ham_df |> select(1:6)))
email error errors issue like works
non_spam 0.2500000 0.1250000 0.125 0.1250000 0.125
non_spam 0.0000000 0.0000000 0.000 0.0250000 0.000
non_spam 0.0909091 0.0909091 0.000 0.0909091 0.000
non_spam 0.0000000 0.0526316 0.000 0.0526316 0.000
non_spam 0.0000000 0.1250000 0.000 0.2500000 0.000
non_spam 0.0000000 0.1666667 0.000 0.0000000 0.000

Generate Training and Validation Sets

set.seed(123)

trainIndex  <- sample(1:nrow(spam_ham_df), size = round(0.7*nrow(spam_ham_df)), replace = F)

trainingSet <- spam_ham_df[trainIndex,]
validationSet  <- spam_ham_df[-trainIndex,]

Set Up Decision Tree

MyTree <- rpart(email ~ ., data=trainingSet, method="class", 
              control=rpart.control(minsplit=10, cp=0.005))
prunedTree <- prune(MyTree, cp=MyTree$cptable[which.min(MyTree$cptable[,"xerror"]),"CP"])

Generating Confusion Matrices for the training and validation sets

predTraining <- predict(MyTree, trainingSet, type="class") 
predValidation <- predict(MyTree, validationSet, type="class")

# Confusion Matrix for the training set
Predicted <- table(Predicted=predTraining)
Observed <- table(Observed=trainingSet[, 1])

t1 <- rbind(Predicted, Observed)
knitr::kable(t1, caption = 'Confusion Matrix for the training set')
Confusion Matrix for the training set
non_spam spam
Predicted 1636 901
Observed 1595 942
# Confusion Matrix for the validation set
Predicted <- table(Predicted=predValidation)
Observed <- table(Observed=validationSet[, 1])

t2 <- rbind(Predicted, Observed)
knitr::kable(t2, caption = 'Confusion Matrix for the validation set')
Confusion Matrix for the validation set
non_spam spam
Predicted 712 375
Observed 689 398
# Classification Rates
predRateTraining <- mean(predTraining == trainingSet$email)
predRateValidation <- mean(predValidation == validationSet$email)

Plot Decision Tree

prp(prunedTree, main=paste("Decision Tree\n(Correct classification rate ",
                           round(predRateTraining,4)*100,
                           "% for the training set\n ",
                           round(predRateValidation,4)*100,
                           "% for the validation set)"), 
    type=4, extra=6, faclen=0, under=TRUE)


Logistic Regression Model

lm_mod <-  glm(email ~ ., family = 'binomial', data = spam_ham_df)
summary(lm_mod)
## 
## Call:
## glm(formula = email ~ ., family = "binomial", data = spam_ham_df)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.235   0.000   0.000   0.000   3.482  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.858e+00  2.282e-01  -8.141 3.91e-16 ***
## error        3.678e+00  1.510e+00   2.435 0.014907 *  
## errors      -2.808e+00  9.056e-01  -3.101 0.001930 ** 
## issue       -9.129e+02  2.335e+05  -0.004 0.996880    
## like        -4.334e-01  8.766e-01  -0.494 0.621062    
## works       -7.794e-01  2.896e+00  -0.269 0.787853    
## available    8.336e+00  1.936e+00   4.306 1.66e-05 ***
## bad          9.547e-01  2.012e+00   0.474 0.635169    
## free         5.991e+00  8.305e-01   7.213 5.46e-13 ***
## good         8.329e-01  9.577e-01   0.870 0.384479    
## hard         1.094e+00  3.296e+00   0.332 0.739915    
## unknown      1.199e+00  6.756e-01   1.774 0.076003 .  
## worth       -1.856e+03  3.944e+05  -0.005 0.996245    
## clean       -1.098e+03  1.097e+05  -0.010 0.992015    
## problem     -9.359e+00  4.418e+00  -2.118 0.034152 *  
## problems     5.905e-01  1.470e+00   0.402 0.687894    
## enough       1.024e+00  2.235e+00   0.458 0.646805    
## tired       -4.762e+02  3.663e+05  -0.001 0.998963    
## false       -4.336e+02  1.793e+05  -0.002 0.998070    
## great        2.244e+00  1.956e+00   1.147 0.251262    
## pretty      -5.968e+02  2.307e+05  -0.003 0.997936    
## forged      -2.124e+00  1.400e+00  -1.518 0.129089    
## top          4.244e+00  1.077e+00   3.942 8.09e-05 ***
## interesting -6.514e+02  4.979e+05  -0.001 0.998956    
## work         1.177e+00  9.851e-01   1.194 0.232318    
## happy       -2.836e+00  4.426e+00  -0.641 0.521566    
## won          3.021e+00  1.822e+00   1.658 0.097331 .  
## best         4.274e+00  9.958e-01   4.292 1.77e-05 ***
## issues      -3.647e+02  2.519e+05  -0.001 0.998845    
## nice        -6.578e+02  1.834e+05  -0.004 0.997138    
## support     -2.087e+00  3.165e+00  -0.659 0.509652    
## well        -1.210e+01  4.944e+00  -2.448 0.014368 *  
## worked      -3.164e+02  2.280e+05  -0.001 0.998893    
## unconfirmed -4.676e+02  2.675e+05  -0.002 0.998606    
## important    3.311e+00  1.573e+00   2.104 0.035336 *  
## bs          -4.711e+02  4.182e+05  -0.001 0.999101    
## easy         8.336e+00  2.455e+00   3.396 0.000685 ***
## respect     -1.284e+03  2.052e+06  -0.001 0.999501    
## right        2.300e+00  1.441e+00   1.596 0.110388    
## nonsense    -3.646e+02  4.770e+04  -0.008 0.993902    
## slack       -2.768e+02  9.102e+04  -0.003 0.997574    
## bug         -4.188e+02  1.302e+05  -0.003 0.997434    
## dense       -1.020e+03  2.386e+05  -0.004 0.996589    
## lost        -1.990e+03  3.584e+05  -0.006 0.995569    
## better       1.029e+00  1.558e+00   0.660 0.508964    
## love         7.083e+00  1.839e+00   3.852 0.000117 ***
## wrong       -5.395e+02  1.970e+05  -0.003 0.997816    
## clear       -1.023e+03  2.076e+06   0.000 0.999607    
## fast        -8.879e-01  3.028e+00  -0.293 0.769376    
## fine        -9.336e+02  3.428e+05  -0.003 0.997827    
## fall        -6.783e+02  2.554e+05  -0.003 0.997881    
## useful      -7.107e+02  5.008e+05  -0.001 0.998868    
## warning      1.863e+00  1.202e+00   1.550 0.121211    
## welcome      2.188e+00  2.636e+00   0.830 0.406594    
## heaven       5.935e+00  2.282e+00   2.601 0.009295 ** 
## easier       6.758e+02  3.848e+05   0.002 0.998599    
## sincerely    4.455e+02  2.232e+05   0.002 0.998408    
## taint        1.097e+03  1.048e+05   0.010 0.991652    
## lose         2.919e+02  1.802e+05   0.002 0.998707    
## secure       1.683e+02  4.395e+05   0.000 0.999694    
## trust        5.463e+02  3.484e+05   0.002 0.998749    
## risk         4.881e+02  2.084e+05   0.002 0.998132    
## success      2.770e+02  2.782e+05   0.001 0.999206    
## waste        1.125e+02  1.523e+05   0.001 0.999411    
## powerful     7.428e+02  3.276e+05   0.002 0.998191    
## unlimited    4.906e+02  2.499e+05   0.002 0.998433    
## safe         3.618e+02  3.172e+05   0.001 0.999090    
## fastest     -2.361e+02  5.296e+05   0.000 0.999644    
## jabber       6.287e+02  6.781e+05   0.001 0.999260    
## valuable     4.039e+02  3.695e+05   0.001 0.999128    
## bonus        1.525e+01  8.283e+04   0.000 0.999853    
## toll         7.690e+02  3.574e+05   0.002 0.998283    
## debt         1.458e+02  4.471e+05   0.000 0.999740    
## improvement  8.022e+01  5.431e+05   0.000 0.999882    
## strong       5.742e+02  1.203e+05   0.005 0.996192    
## loss         2.436e+02  2.920e+05   0.001 0.999335    
## proven       6.287e+02  3.279e+05   0.002 0.998470    
## thank        3.539e+02  1.631e+05   0.002 0.998269    
## guarantee    1.900e+02  1.618e+05   0.001 0.999063    
## abuse        2.535e+02  1.645e+05   0.002 0.998770    
## successful   4.292e+01  6.292e+05   0.000 0.999946    
## advantage    1.355e+02  1.651e+05   0.001 0.999345    
## solid        5.208e+02  4.749e+05   0.001 0.999125    
## limited      8.571e+02  1.286e+06   0.001 0.999468    
## benefits     4.602e+02  4.026e+05   0.001 0.999088    
## effective    1.644e+03  1.503e+06   0.001 0.999127    
## hot          4.868e+02  2.311e+05   0.002 0.998320    
## super        2.576e+02  2.140e+05   0.001 0.999040    
## excite       1.118e+03  1.750e+05   0.006 0.994901    
## protect      2.751e+02  1.871e+05   0.001 0.998826    
## ready        3.144e+02  2.419e+05   0.001 0.998963    
## savings      1.640e+03  2.762e+05   0.006 0.995263    
## win          2.053e+02  1.812e+05   0.001 0.999096    
## amazing      4.391e+02  3.629e+05   0.001 0.999035    
## perfect      3.746e+02  2.493e+05   0.002 0.998801    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4775.17  on 3623  degrees of freedom
## Residual deviance:  762.81  on 3529  degrees of freedom
## AIC: 952.81
## 
## Number of Fisher Scoring iterations: 24