Working with document data from the ‘Spam Assassin’ public corpus, this project classifies each email as spam or non-spam (ham). A training set and a validation set are created and fed into a decision tree model and a logistic regression model.
library(tidyverse)
library(tidytext)
library(purrr)
library(rpart)
library(rpart.plot)
Import Non-Spam Emails
Importing multiple document files was assisted by the following references:
‘Multiple Text Files’, ‘Unpacking Lists’
# Create a list of the text files in the ham directory
txt_files_ls <- list.files(path = "ham")
# Read each file in as a character vector of lines
read_files_list <- lapply(paste0("ham/", txt_files_ls), readLines)
# Transform the list into a data frame, one row per line,
# keeping the list index as the email id
non_spam_df <- map_df(read_files_list, ~as.data.frame(.x), .id = "email") |>
  rename(text = ".x")
knitr::kable(head(non_spam_df))
| email | text |
|---|---|
| 1 | From exmh-workers-admin@redhat.com Thu Aug 22 12:36:23 2002 |
| 1 | Return-Path: exmh-workers-admin@example.com |
| 1 | Delivered-To: zzzz@localhost.netnoteinc.com |
| 1 | Received: from localhost (localhost [127.0.0.1]) |
| 1 | by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36 |
| 1 | for zzzz@localhost; Thu, 22 Aug 2002 07:36:16 -0400 (EDT) |
Tokenize
Tokenize the words from each non-spam email and keep only those that carry positive or negative sentiment according to the Bing lexicon. This drops filler words such as ‘the’ as well as punctuation.
tokenize_non_spam <-
  non_spam_df |>
  unnest_tokens(word, text) |>                     # one row per word
  mutate(word = str_split(word, "[[:punct:]]")) |> # split tokens on punctuation
  unnest(word) |>
  inner_join(get_sentiments("bing"), by = "word")  # keep sentiment-bearing words
knitr::kable(head(tokenize_non_spam))
| email | word | sentiment |
|---|---|---|
| 1 | errors | negative |
| 1 | error | negative |
| 1 | like | positive |
| 1 | fail | negative |
| 1 | error | negative |
| 1 | mar | negative |
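As a quick illustration, here is the same tokenize-and-join pipeline run on a single made-up sentence (a toy example, not part of the corpus):
# Toy example (made-up sentence): unnest_tokens lowercases and splits
# the text into words, and the inner join keeps only words found in
# the Bing sentiment lexicon; filler words such as 'this' drop out
tibble(email = "toy", text = "This great offer is too good to fail") |>
  unnest_tokens(word, text) |>
  inner_join(get_sentiments("bing"), by = "word")
# expected rows: great (positive), good (positive), fail (negative)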
Frequency
Determine each word’s frequency of use per non-spam email. Due to local storage and processing constraints for the models, only the 50 most widespread words (those appearing in the most emails) are kept.
non_spam_freq <-
  tokenize_non_spam |>
  group_by(email, word) |>
  summarise(count = n(), .groups = 'drop') |>   # frequency per (email, word) pair
  group_by(email) |>
  mutate(pct_freq = count / sum(count)) |>      # share of each email's sentiment words
  mutate(is_blank_node = ifelse(word == "" | str_detect(word, '^\\s*$'), 1, 0)) |>
  filter(is_blank_node != 1) |>                 # drop empty tokens left by the punctuation split
  group_by(word) |>
  mutate(total_count = n()) |>                  # number of emails containing the word
  ungroup() |>
  mutate(rank = dense_rank(desc(total_count))) |>
  filter(rank <= 50) |>                         # keep the 50 most widespread words
  select(!c(count, is_blank_node, total_count, rank))
knitr::kable(head(non_spam_freq))
| email | word | pct_freq |
|---|---|---|
| 1 | error | 0.2500 |
| 1 | errors | 0.1250 |
| 1 | issue | 0.1250 |
| 1 | like | 0.1250 |
| 1 | works | 0.1250 |
| 10 | available | 0.0125 |
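For reference, the same top-50 cut can also be computed in a standalone step; a minimal sketch using the tokenized data from above (top_words is just an illustrative name; the logic mirrors the pipeline, so the resulting word set should match):
# Count how many emails each non-blank word appears in, then keep the
# words ranked 50 or better by that count (ties kept, as above)
top_words <- tokenize_non_spam |>
  filter(word != "", !str_detect(word, "^\\s*$")) |>
  distinct(email, word) |>
  count(word, name = "total_count") |>
  mutate(rank = dense_rank(desc(total_count))) |>
  filter(rank <= 50)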
Pivot
Pivot the words to wide format so that each word becomes a feature column for the models.
non_spam_freq <-
non_spam_freq |>
pivot_wider(names_from = word, values_from = pct_freq, values_fill = 0) |>
mutate(email = 'non_spam')
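To illustrate what the pivot does, a toy example with made-up values:
# Toy data: long (email, word, pct_freq) rows become one row per email
# with one column per word; words absent from an email are filled with 0
tibble(email = c("1", "1", "2"),
       word = c("error", "like", "like"),
       pct_freq = c(0.25, 0.125, 0.05)) |>
  pivot_wider(names_from = word, values_from = pct_freq, values_fill = 0)
# email '1' gets error = 0.25, like = 0.125; email '2' gets error = 0, like = 0.05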
Import Spam Emails
# Create a list of the text files in the spam directory
txt_files_ls <- list.files(path = "spam")
# Read each file in as a character vector of lines
read_files_list <- lapply(paste0("spam/", txt_files_ls), readLines)
# Transform the list into a data frame, one row per line,
# keeping the list index as the email id
spam_df <- map_df(read_files_list, ~as.data.frame(.x), .id = "email") |>
  rename(text = ".x")
Tokenize
Tokenize the words from each spam email and keep only those that carry positive or negative sentiment according to the Bing lexicon. This drops filler words such as ‘the’ as well as punctuation.
tokenize_spam <-
  spam_df |>
  unnest_tokens(word, text) |>                     # one row per word
  mutate(word = str_split(word, "[[:punct:]]")) |> # split tokens on punctuation
  unnest(word) |>
  inner_join(get_sentiments("bing"), by = "word")  # keep sentiment-bearing words
Frequency
Determine each word’s frequency of use per spam email. Due to local storage and processing constraints for the models, only the 50 most widespread words are kept.
spam_freq <-
  tokenize_spam |>
  group_by(email, word) |>
  summarise(count = n(), .groups = 'drop') |>   # frequency per (email, word) pair
  group_by(email) |>
  mutate(pct_freq = count / sum(count)) |>      # share of each email's sentiment words
  mutate(is_blank_node = ifelse(word == "" | str_detect(word, '^\\s*$'), 1, 0)) |>
  filter(is_blank_node != 1) |>                 # drop empty tokens left by the punctuation split
  group_by(word) |>
  mutate(total_count = n()) |>                  # number of emails containing the word
  ungroup() |>
  mutate(rank = dense_rank(desc(total_count))) |>
  filter(rank <= 50) |>                         # keep the 50 most widespread words
  select(!c(count, is_blank_node, total_count, rank))
Pivot
Pivot the words to wide format so that each word becomes a feature column for the models.
spam_freq <-
spam_freq |>
pivot_wider(names_from = word, values_from = pct_freq, values_fill = 0) |>
mutate(email = 'spam')
Export Data to CSV
spam_ham_df <-
non_spam_freq |>
bind_rows(spam_freq) |>
mutate(across(everything(), ~replace_na(.x, 0))) |>
mutate(email = as.factor(email))
write_csv(spam_ham_df, 'spam_ham.csv')
knitr::kable(head(spam_ham_df |> select(1:6)))
| email | error | errors | issue | like | works |
|---|---|---|---|---|---|
| non_spam | 0.2500000 | 0.1250000 | 0.125 | 0.1250000 | 0.125 |
| non_spam | 0.0000000 | 0.0000000 | 0.000 | 0.0250000 | 0.000 |
| non_spam | 0.0909091 | 0.0909091 | 0.000 | 0.0909091 | 0.000 |
| non_spam | 0.0000000 | 0.0526316 | 0.000 | 0.0526316 | 0.000 |
| non_spam | 0.0000000 | 0.1250000 | 0.000 | 0.2500000 | 0.000 |
| non_spam | 0.0000000 | 0.1666667 | 0.000 | 0.0000000 | 0.000 |
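Before splitting, a quick sanity check on the combined data set (the exact numbers depend on the corpus snapshot used, so none are assumed here):
# Rows = emails; columns = email label plus one column per selected word
dim(spam_ham_df)
# Class balance between non_spam and spam
table(spam_ham_df$email)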
set.seed(123)
# 70/30 random split into training and validation sets
trainIndex <- sample(1:nrow(spam_ham_df), size = round(0.7 * nrow(spam_ham_df)), replace = FALSE)
trainingSet <- spam_ham_df[trainIndex, ]
validationSet <- spam_ham_df[-trainIndex, ]
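Because this is a simple random split rather than a stratified one, it is worth confirming that both classes appear in similar proportions in each set; a minimal check:
# Class proportions in each set; with a 70/30 random split of a data
# set this size they should be close to the overall balance
prop.table(table(trainingSet$email))
prop.table(table(validationSet$email))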
# Fit a classification tree, then prune it back to the complexity
# parameter (cp) with the lowest cross-validated error
MyTree <- rpart(email ~ ., data = trainingSet, method = "class",
                control = rpart.control(minsplit = 10, cp = 0.005))
prunedTree <- prune(MyTree, cp = MyTree$cptable[which.min(MyTree$cptable[, "xerror"]), "CP"])
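The prune() call above selects the cp value with the lowest cross-validated error from the tree's cp table, which can be inspected directly:
# Cross-validated error (xerror) for each candidate cp value; the
# pruning step keeps the cp with the smallest xerror
printcp(MyTree)
plotcp(MyTree) # visual version of the same table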
Comparing predicted and observed class counts for the training and validation sets
# Note: these predictions use the unpruned tree (MyTree)
predTraining <- predict(MyTree, trainingSet, type = "class")
predValidation <- predict(MyTree, validationSet, type = "class")
# Predicted vs. observed class counts for the training set
Predicted <- table(Predicted = predTraining)
Observed <- table(Observed = trainingSet$email)
t1 <- rbind(Predicted, Observed)
knitr::kable(t1, caption = 'Predicted vs. observed class counts for the training set')
| | non_spam | spam |
|---|---|---|
| Predicted | 1636 | 901 |
| Observed | 1595 | 942 |
# Predicted vs. observed class counts for the validation set
Predicted <- table(Predicted = predValidation)
Observed <- table(Observed = validationSet$email)
t2 <- rbind(Predicted, Observed)
knitr::kable(t2, caption = 'Predicted vs. observed class counts for the validation set')
| | non_spam | spam |
|---|---|---|
| Predicted | 712 | 375 |
| Observed | 689 | 398 |
# Classification Rates
predRateTraining <- mean(predTraining == trainingSet$email)
predRateValidation <- mean(predValidation == validationSet$email)
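The tables above compare only the marginal class counts. A full confusion matrix cross-tabulating predictions against the actual labels shows the per-class errors directly; a minimal sketch:
# Rows: predicted class; columns: observed class. The diagonal holds
# correctly classified emails; the off-diagonal cells are the errors
table(Predicted = predTraining, Observed = trainingSet$email)
table(Predicted = predValidation, Observed = validationSet$email)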
Plot Decision Tree
prp(prunedTree, main=paste("Decision Tree\n(Correct classification rate ",
round(predRateTraining,4)*100,
"% for the training set\n ",
round(predRateValidation,4)*100,
"% for the validation set)"),
type=4, extra=6, faclen=0, under=TRUE)
# Logistic regression fit on the full data set (training + validation combined)
lm_mod <- glm(email ~ ., family = 'binomial', data = spam_ham_df)
summary(lm_mod)
##
## Call:
## glm(formula = email ~ ., family = "binomial", data = spam_ham_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.235 0.000 0.000 0.000 3.482
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.858e+00 2.282e-01 -8.141 3.91e-16 ***
## error 3.678e+00 1.510e+00 2.435 0.014907 *
## errors -2.808e+00 9.056e-01 -3.101 0.001930 **
## issue -9.129e+02 2.335e+05 -0.004 0.996880
## like -4.334e-01 8.766e-01 -0.494 0.621062
## works -7.794e-01 2.896e+00 -0.269 0.787853
## available 8.336e+00 1.936e+00 4.306 1.66e-05 ***
## bad 9.547e-01 2.012e+00 0.474 0.635169
## free 5.991e+00 8.305e-01 7.213 5.46e-13 ***
## good 8.329e-01 9.577e-01 0.870 0.384479
## hard 1.094e+00 3.296e+00 0.332 0.739915
## unknown 1.199e+00 6.756e-01 1.774 0.076003 .
## worth -1.856e+03 3.944e+05 -0.005 0.996245
## clean -1.098e+03 1.097e+05 -0.010 0.992015
## problem -9.359e+00 4.418e+00 -2.118 0.034152 *
## problems 5.905e-01 1.470e+00 0.402 0.687894
## enough 1.024e+00 2.235e+00 0.458 0.646805
## tired -4.762e+02 3.663e+05 -0.001 0.998963
## false -4.336e+02 1.793e+05 -0.002 0.998070
## great 2.244e+00 1.956e+00 1.147 0.251262
## pretty -5.968e+02 2.307e+05 -0.003 0.997936
## forged -2.124e+00 1.400e+00 -1.518 0.129089
## top 4.244e+00 1.077e+00 3.942 8.09e-05 ***
## interesting -6.514e+02 4.979e+05 -0.001 0.998956
## work 1.177e+00 9.851e-01 1.194 0.232318
## happy -2.836e+00 4.426e+00 -0.641 0.521566
## won 3.021e+00 1.822e+00 1.658 0.097331 .
## best 4.274e+00 9.958e-01 4.292 1.77e-05 ***
## issues -3.647e+02 2.519e+05 -0.001 0.998845
## nice -6.578e+02 1.834e+05 -0.004 0.997138
## support -2.087e+00 3.165e+00 -0.659 0.509652
## well -1.210e+01 4.944e+00 -2.448 0.014368 *
## worked -3.164e+02 2.280e+05 -0.001 0.998893
## unconfirmed -4.676e+02 2.675e+05 -0.002 0.998606
## important 3.311e+00 1.573e+00 2.104 0.035336 *
## bs -4.711e+02 4.182e+05 -0.001 0.999101
## easy 8.336e+00 2.455e+00 3.396 0.000685 ***
## respect -1.284e+03 2.052e+06 -0.001 0.999501
## right 2.300e+00 1.441e+00 1.596 0.110388
## nonsense -3.646e+02 4.770e+04 -0.008 0.993902
## slack -2.768e+02 9.102e+04 -0.003 0.997574
## bug -4.188e+02 1.302e+05 -0.003 0.997434
## dense -1.020e+03 2.386e+05 -0.004 0.996589
## lost -1.990e+03 3.584e+05 -0.006 0.995569
## better 1.029e+00 1.558e+00 0.660 0.508964
## love 7.083e+00 1.839e+00 3.852 0.000117 ***
## wrong -5.395e+02 1.970e+05 -0.003 0.997816
## clear -1.023e+03 2.076e+06 0.000 0.999607
## fast -8.879e-01 3.028e+00 -0.293 0.769376
## fine -9.336e+02 3.428e+05 -0.003 0.997827
## fall -6.783e+02 2.554e+05 -0.003 0.997881
## useful -7.107e+02 5.008e+05 -0.001 0.998868
## warning 1.863e+00 1.202e+00 1.550 0.121211
## welcome 2.188e+00 2.636e+00 0.830 0.406594
## heaven 5.935e+00 2.282e+00 2.601 0.009295 **
## easier 6.758e+02 3.848e+05 0.002 0.998599
## sincerely 4.455e+02 2.232e+05 0.002 0.998408
## taint 1.097e+03 1.048e+05 0.010 0.991652
## lose 2.919e+02 1.802e+05 0.002 0.998707
## secure 1.683e+02 4.395e+05 0.000 0.999694
## trust 5.463e+02 3.484e+05 0.002 0.998749
## risk 4.881e+02 2.084e+05 0.002 0.998132
## success 2.770e+02 2.782e+05 0.001 0.999206
## waste 1.125e+02 1.523e+05 0.001 0.999411
## powerful 7.428e+02 3.276e+05 0.002 0.998191
## unlimited 4.906e+02 2.499e+05 0.002 0.998433
## safe 3.618e+02 3.172e+05 0.001 0.999090
## fastest -2.361e+02 5.296e+05 0.000 0.999644
## jabber 6.287e+02 6.781e+05 0.001 0.999260
## valuable 4.039e+02 3.695e+05 0.001 0.999128
## bonus 1.525e+01 8.283e+04 0.000 0.999853
## toll 7.690e+02 3.574e+05 0.002 0.998283
## debt 1.458e+02 4.471e+05 0.000 0.999740
## improvement 8.022e+01 5.431e+05 0.000 0.999882
## strong 5.742e+02 1.203e+05 0.005 0.996192
## loss 2.436e+02 2.920e+05 0.001 0.999335
## proven 6.287e+02 3.279e+05 0.002 0.998470
## thank 3.539e+02 1.631e+05 0.002 0.998269
## guarantee 1.900e+02 1.618e+05 0.001 0.999063
## abuse 2.535e+02 1.645e+05 0.002 0.998770
## successful 4.292e+01 6.292e+05 0.000 0.999946
## advantage 1.355e+02 1.651e+05 0.001 0.999345
## solid 5.208e+02 4.749e+05 0.001 0.999125
## limited 8.571e+02 1.286e+06 0.001 0.999468
## benefits 4.602e+02 4.026e+05 0.001 0.999088
## effective 1.644e+03 1.503e+06 0.001 0.999127
## hot 4.868e+02 2.311e+05 0.002 0.998320
## super 2.576e+02 2.140e+05 0.001 0.999040
## excite 1.118e+03 1.750e+05 0.006 0.994901
## protect 2.751e+02 1.871e+05 0.001 0.998826
## ready 3.144e+02 2.419e+05 0.001 0.998963
## savings 1.640e+03 2.762e+05 0.006 0.995263
## win 2.053e+02 1.812e+05 0.001 0.999096
## amazing 4.391e+02 3.629e+05 0.001 0.999035
## perfect 3.746e+02 2.493e+05 0.002 0.998801
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4775.17 on 3623 degrees of freedom
## Residual deviance: 762.81 on 3529 degrees of freedom
## AIC: 952.81
##
## Number of Fisher Scoring iterations: 24
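Many coefficients have enormous magnitudes and standard errors with p-values near 1, and the fit needed 24 Fisher scoring iterations, which are classic signs of quasi-complete separation (words that occur in only one class). Note also that this model was fit on the full data set rather than the training set. A sketch of how it could instead be fit on the training set and scored on the validation set (log_mod, valProbs, and valPred are illustrative names):
# Fit on the training set only, then classify the validation set with
# a 0.5 probability cutoff; glm() treats the second factor level
# ('spam') as the positive class
log_mod <- glm(email ~ ., family = "binomial", data = trainingSet)
valProbs <- predict(log_mod, newdata = validationSet, type = "response")
valPred <- ifelse(valProbs > 0.5, "spam", "non_spam")
mean(valPred == validationSet$email) # validation accuracy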