Objective

Use word frequency features to build a model that predicts whether an email is spam.

Load Dataset

email_data <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-08-15/spam.csv')
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Initial Exploration & Cleaning

skim(email_data)
Data summary
Name email_data
Number of rows 4601
Number of columns 7
_______________________
Column type frequency:
character 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
yesno 0 1 1 1 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
crl.tot 0 1 283.29 606.35 1 35 95 266.00 15841.00 ▇▁▁▁▁
dollar 0 1 0.08 0.25 0 0 0 0.05 6.00 ▇▁▁▁▁
bang 0 1 0.27 0.82 0 0 0 0.32 32.48 ▇▁▁▁▁
money 0 1 0.09 0.44 0 0 0 0.00 12.50 ▇▁▁▁▁
n000 0 1 0.10 0.35 0 0 0 0.00 5.45 ▇▁▁▁▁
make 0 1 0.10 0.31 0 0 0 0.00 4.54 ▇▁▁▁▁
table(email_data$yesno)
## 
##    n    y 
## 2788 1813
email_prepped <- email_data %>%
  mutate(yesno = factor(yesno, levels = c("y", "n")))

Visualizations

email_prepped %>%
  ggplot(aes(x = yesno)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Spam vs Non-Spam Emails")

email_prepped %>%
  ggplot(aes(x = yesno, y = crl.tot)) +
  geom_boxplot() +
  labs(title = "CRL Total by Spam Label")

Correlation Analysis via Funnel

email_binarized <- email_prepped %>%
  binarize()

cor_results <- email_binarized %>%
  correlate(yesno__y) %>%
  arrange(desc(correlation))

cor_results %>%
  correlationfunnel::plot_correlation_funnel()

Train-Test Split

set.seed(42)
email_split <- initial_split(email_prepped, strata = yesno)
train_set <- training(email_split)
test_set <- testing(email_split)

folds <- vfold_cv(train_set, v = 5, strata = yesno)

Recipe

email_recipe <- recipe(yesno ~ ., data = train_set) %>%
  step_YeoJohnson(all_numeric_predictors())

XGBoost Specification

model_spec <- boost_tree(trees = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

model_workflow <- workflow() %>%
  add_recipe(email_recipe) %>%
  add_model(model_spec)

Tune Model

registerDoParallel()

set.seed(2024)
tuned_model <- tune_grid(
  model_workflow,
  resamples = folds,
  grid = 5,
  control = control_grid(save_pred = TRUE)
)

Evaluate Performance

collect_metrics(tuned_model)
## # A tibble: 15 × 7
##    trees .metric     .estimator   mean     n std_err .config             
##    <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>               
##  1    19 accuracy    binary     0.879      5 0.00680 Preprocessor1_Model1
##  2    19 brier_class binary     0.0947     5 0.00404 Preprocessor1_Model1
##  3    19 roc_auc     binary     0.917      5 0.00575 Preprocessor1_Model1
##  4   467 accuracy    binary     0.865      5 0.00717 Preprocessor1_Model2
##  5   467 brier_class binary     0.111      5 0.00572 Preprocessor1_Model2
##  6   467 roc_auc     binary     0.903      5 0.00654 Preprocessor1_Model2
##  7  1120 accuracy    binary     0.860      5 0.00750 Preprocessor1_Model3
##  8  1120 brier_class binary     0.118      5 0.00567 Preprocessor1_Model3
##  9  1120 roc_auc     binary     0.896      5 0.00659 Preprocessor1_Model3
## 10  1554 accuracy    binary     0.859      5 0.00692 Preprocessor1_Model4
## 11  1554 brier_class binary     0.121      5 0.00584 Preprocessor1_Model4
## 12  1554 roc_auc     binary     0.894      5 0.00669 Preprocessor1_Model4
## 13  1631 accuracy    binary     0.859      5 0.00737 Preprocessor1_Model5
## 14  1631 brier_class binary     0.121      5 0.00583 Preprocessor1_Model5
## 15  1631 roc_auc     binary     0.893      5 0.00672 Preprocessor1_Model5
collect_predictions(tuned_model) %>%
  group_by(id) %>%
  roc_curve(yesno, .pred_y) %>%
  autoplot()

Final Fit

final_model <- model_workflow %>%
  finalize_workflow(select_best(tuned_model, metric = "accuracy")) %>%
  last_fit(email_split)
## Warning: package 'xgboost' was built under R version 4.4.2
collect_metrics(final_model)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary        0.883  Preprocessor1_Model1
## 2 roc_auc     binary        0.927  Preprocessor1_Model1
## 3 brier_class binary        0.0923 Preprocessor1_Model1
collect_predictions(final_model) %>%
  conf_mat(yesno, .pred_class) %>%
  autoplot()

Variable Importance

final_model %>%
  extract_fit_engine() %>%
  vip()