```r
library(tidyverse)
library(caret)
library(quanteda)
library(future)
library(future.apply)
library(quanteda.textmodels)
# Read up to `file_count` email files from a directory into a tibble
# with one row per file (name = file name, data = raw message text).
load_files_to_df <- function(file_path, file_count = 1) {
  file_names <- list.files(path = file_path, full.names = TRUE, pattern = "^\\d+.*")
  file_names <- head(file_names, file_count)
  text_data <- set_names(file_names, nm = basename(file_names)) %>%
    imap_dfr(~ tibble(name = .y, data = read_file(.x)))
  return(text_data)
}
# Clean the raw text, tokenize it, and build a trimmed document-feature matrix.
preprocess_and_create_dfm <- function(df, cache_filename) {
  message("### preprocessing ...")

  message("#### setting up parallel text processing ...")
  plan(multisession, workers = 4)

  message("#### cleaning ...")
  clean_data <- future.apply::future_sapply(df$data, FUN = function(text) {
    text %>%
      str_replace_all("\\n", " ") %>%
      str_squish()
  },
  USE.NAMES = FALSE)

  message("#### tokenizing and creating dfm/document feature matrix ... ")
  tokens <- tokens(clean_data, what = "word",
                   remove_punct = TRUE, remove_symbols = TRUE)
  tokens <- tokens_remove(tokens, stopwords("en"))

  dfm <- dfm(tokens)

  message("#### removing sparse terms ...")
  dfm <- dfm_trim(dfm, sparsity = 0.95)

  return(dfm)
}
message("## Loading email files")
# Make sure these are here!
mail <- load_files_to_df("nogit_easy_ham/", file_count = 100)
spam <- load_files_to_df("nogit_spam/", file_count = 100)

# Label ham as 1 and spam as 0, then shuffle the combined set.
all_mail <- bind_rows(
  mail %>% mutate(label = 1),
  spam %>% mutate(label = 0)
) %>%
  sample_frac(size = 1)
message("## Split into test and training")
split_index <- createDataPartition(all_mail$label, p = 0.8, list = FALSE)
train_set <- all_mail[split_index, ]
test_set <- all_mail[-split_index, ]
message("## Building training dataset dtm")
train_set_dfm <- preprocess_and_create_dfm(train_set,
                                           cache_filename = "train_set_dfm.rds")
message("## Building test dataset dtm")
test_set_dfm <- preprocess_and_create_dfm(test_set,
                                          cache_filename = "test_set_dfm.rds")

# Align the test dfm to the training features so predict() sees the same columns.
test_set_dfm <- dfm_match(test_set_dfm, featnames(train_set_dfm))
message("## Model and predict")
model <- textmodel_nb(train_set_dfm, y = factor(train_set$label))
predictions <- predict(model, newdata = test_set_dfm)
true_labels <- factor(test_set$label)
```
# A spam classifier

## Assignment

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).
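To make that last point concrete, here is a minimal sketch of scoring a single new message with the model trained in the script above. The path `my_inbox/some_message.txt` is a made-up example, and the cleaning and tokenizing steps simply mirror what `preprocess_and_create_dfm()` does.

```r
# Hypothetical new message (path is an example only): clean it the same way
# as the training data, align it to the training features, then predict.
new_text <- read_file("my_inbox/some_message.txt") %>%
  str_replace_all("\\n", " ") %>%
  str_squish()

new_dfm <- tokens(new_text, what = "word",
                  remove_punct = TRUE, remove_symbols = TRUE) %>%
  tokens_remove(stopwords("en")) %>%
  dfm() %>%
  dfm_match(featnames(train_set_dfm))  # same feature space as the training dfm

predict(model, newdata = new_dfm)      # predicted label: 0 = spam, 1 = ham
```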
## To run locally

If you’d like to run this code yourself, you’ll have to download the spam and ham email zips and uncompress them separately on your local computer. The extracted files need to be moved into the following directories (a sketch of one way to script this follows the list).

- Real emails: `./nogit_easy_ham`
- Spam emails: `./nogit_spam`
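For example, assuming the two archives have already been downloaded into the project folder (the zip file names below are placeholders, not the actual corpus file names), a few lines of R can unpack them into the expected directories:

```r
# Placeholder archive names -- substitute whatever you actually downloaded.
ham_zip  <- "easy_ham.zip"
spam_zip <- "spam.zip"

# Unzip each archive into the directory the script expects. If an archive
# contains a top-level folder, move its contents up one level afterwards.
unzip(ham_zip,  exdir = "nogit_easy_ham")
unzip(spam_zip, exdir = "nogit_spam")

# Sanity check: both directories should now contain the numbered email files.
length(list.files("nogit_easy_ham"))
length(list.files("nogit_spam"))
```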
## The code

The full script is listed at the top of this post.
## Confusion matrix results
This is a pretty effective and fairly consistent algorithm and program. I’ve run it at least two dozen times; the lowest accuracies were in the low 92% range and the highest in the low 94% range.

With respect to turning dials and making tweaks, the biggest one was finding that about 100 messages from each folder is all it needed to reach roughly that same level of consistent predictions. A sketch for repeating the run and checking that spread follows.
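To check that spread yourself, here is a rough sketch that reuses the functions defined above and repeats the split/train/predict cycle a few times; the run count `n_runs` is arbitrary.

```r
# Repeat the split/train/predict cycle and collect each run's accuracy.
n_runs <- 5  # increase (e.g. to 24) to reproduce the spread described above

accuracies <- map_dbl(seq_len(n_runs), function(i) {
  idx   <- createDataPartition(all_mail$label, p = 0.8, list = FALSE)
  train <- all_mail[idx, ]
  test  <- all_mail[-idx, ]

  train_dfm <- preprocess_and_create_dfm(train, cache_filename = NULL)
  test_dfm  <- preprocess_and_create_dfm(test,  cache_filename = NULL)
  test_dfm  <- dfm_match(test_dfm, featnames(train_dfm))

  fit   <- textmodel_nb(train_dfm, y = factor(train$label))
  preds <- predict(fit, newdata = test_dfm)
  mean(as.character(preds) == as.character(test$label))
})

summary(accuracies)
```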
When reading the results, remember to look at the accuracy, the confidence interval, and the reference/prediction block that shows the correct and incorrect predictions. Here is how to read that chart, keeping in mind that caret treats class 0 (spam) as the positive class in this output; a quick arithmetic check follows the table.
| Prediction \ Reference | 0 (spam) | 1 (ham) |
|---|---|---|
| 0 (spam) | True Positive | False Positive (Type I error) |
| 1 (ham) | False Negative (Type II error) | True Negative |
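As a quick sanity check of that layout, plugging the counts from the output below into it reproduces the reported figures:

```r
# Counts from the printed matrix below: TP = 1726, FP = 21, FN = 274, TN = 1979
(1726 + 1979) / (1726 + 21 + 274 + 1979)  # accuracy = 0.92625, reported as 0.9262
1726 / (1726 + 274)                       # sensitivity = 0.863
1979 / (1979 + 21)                        # specificity = 0.9895
```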
```r
# print the confusion matrix
conf_mat <- confusionMatrix(predictions, true_labels)
print(conf_mat)
```
```
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1726   21
         1  274 1979

               Accuracy : 0.9262
                 95% CI : (0.9177, 0.9342)
    No Information Rate : 0.5
    P-Value [Acc > NIR] : < 2.2e-16

                  Kappa : 0.8525

 Mcnemar's Test P-Value : < 2.2e-16

            Sensitivity : 0.8630
            Specificity : 0.9895
         Pos Pred Value : 0.9880
         Neg Pred Value : 0.8784
             Prevalence : 0.5000
         Detection Rate : 0.4315
   Detection Prevalence : 0.4368
      Balanced Accuracy : 0.9263

       'Positive' Class : 0
```
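If you’d rather pull these numbers out of the result object than read them off the printed block, `confusionMatrix()` returns them in named components; for example:

```r
# The 2x2 counts shown in the Reference/Prediction block.
conf_mat$table

# Overall statistics, including accuracy and its 95% confidence interval.
conf_mat$overall[c("Accuracy", "AccuracyLower", "AccuracyUpper", "Kappa")]

# Per-class statistics for the positive class (0 = spam).
conf_mat$byClass[c("Sensitivity", "Specificity", "Pos Pred Value", "Neg Pred Value")]
```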