Objective

Use word frequency features to build a model that predicts whether an email is spam.

Load Dataset

email_data <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-08-15/spam.csv')

## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Initial Exploration & Cleaning

skim(email_data)

Data summary
Name	email_data
Number of rows	4601
Number of columns	7
_______________________
Column type frequency:
character	1
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
yesno	0	1	1	1	0	2	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
crl.tot	1	283.29	606.35	1	35	95	266.00	15841.00	▇▁▁▁▁
dollar	1	0.08	0.25	0	0	0	0.05	6.00	▇▁▁▁▁
bang	1	0.27	0.82	0	0	0	0.32	32.48	▇▁▁▁▁
money	1	0.09	0.44	0	0	0	0.00	12.50	▇▁▁▁▁
n000	1	0.10	0.35	0	0	0	0.00	5.45	▇▁▁▁▁
make	1	0.10	0.31	0	0	0	0.00	4.54	▇▁▁▁▁

table(email_data$yesno)

## 
##    n    y 
## 2788 1813

email_prepped <- email_data %>%
  mutate(yesno = factor(yesno, levels = c("y", "n")))

Visualizations

email_prepped %>%
  ggplot(aes(x = yesno)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Spam vs Non-Spam Emails")

email_prepped %>%
  ggplot(aes(x = yesno, y = crl.tot)) +
  geom_boxplot() +
  labs(title = "CRL Total by Spam Label")

Correlation Analysis via Funnel

email_binarized <- email_prepped %>%
  binarize()

cor_results <- email_binarized %>%
  correlate(yesno__y) %>%
  arrange(desc(correlation))

cor_results %>%
  correlationfunnel::plot_correlation_funnel()

Train-Test Split

set.seed(42)
email_split <- initial_split(email_prepped, strata = yesno)
train_set <- training(email_split)
test_set <- testing(email_split)

folds <- vfold_cv(train_set, v = 5, strata = yesno)

Recipe

email_recipe <- recipe(yesno ~ ., data = train_set) %>%
  step_YeoJohnson(all_numeric_predictors())

XGBoost Specification

model_spec <- boost_tree(trees = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

model_workflow <- workflow() %>%
  add_recipe(email_recipe) %>%
  add_model(model_spec)

Tune Model

registerDoParallel()

set.seed(2024)
tuned_model <- tune_grid(
  model_workflow,
  resamples = folds,
  grid = 5,
  control = control_grid(save_pred = TRUE)
)

Evaluate Performance

collect_metrics(tuned_model)

## # A tibble: 15 × 7
##    trees .metric     .estimator   mean     n std_err .config             
##    <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>               
##  1    19 accuracy    binary     0.879      5 0.00680 Preprocessor1_Model1
##  2    19 brier_class binary     0.0947     5 0.00404 Preprocessor1_Model1
##  3    19 roc_auc     binary     0.917      5 0.00575 Preprocessor1_Model1
##  4   467 accuracy    binary     0.865      5 0.00717 Preprocessor1_Model2
##  5   467 brier_class binary     0.111      5 0.00572 Preprocessor1_Model2
##  6   467 roc_auc     binary     0.903      5 0.00654 Preprocessor1_Model2
##  7  1120 accuracy    binary     0.860      5 0.00750 Preprocessor1_Model3
##  8  1120 brier_class binary     0.118      5 0.00567 Preprocessor1_Model3
##  9  1120 roc_auc     binary     0.896      5 0.00659 Preprocessor1_Model3
## 10  1554 accuracy    binary     0.859      5 0.00692 Preprocessor1_Model4
## 11  1554 brier_class binary     0.121      5 0.00584 Preprocessor1_Model4
## 12  1554 roc_auc     binary     0.894      5 0.00669 Preprocessor1_Model4
## 13  1631 accuracy    binary     0.859      5 0.00737 Preprocessor1_Model5
## 14  1631 brier_class binary     0.121      5 0.00583 Preprocessor1_Model5
## 15  1631 roc_auc     binary     0.893      5 0.00672 Preprocessor1_Model5

collect_predictions(tuned_model) %>%
  group_by(id) %>%
  roc_curve(yesno, .pred_y) %>%
  autoplot()

Final Fit

final_model <- model_workflow %>%
  finalize_workflow(select_best(tuned_model, metric = "accuracy")) %>%
  last_fit(email_split)

## Warning: package 'xgboost' was built under R version 4.4.2

collect_metrics(final_model)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary        0.883  Preprocessor1_Model1
## 2 roc_auc     binary        0.927  Preprocessor1_Model1
## 3 brier_class binary        0.0923 Preprocessor1_Model1

collect_predictions(final_model) %>%
  conf_mat(yesno, .pred_class) %>%
  autoplot()

Variable Importance

final_model %>%
  extract_fit_engine() %>%
  vip()

Apply 7

Savannah Ricker

2025-04-09

Objective

Load Dataset

Initial Exploration & Cleaning

Visualizations

Correlation Analysis via Funnel

Train-Test Split

Recipe

XGBoost Specification

Tune Model

Evaluate Performance

Final Fit

Variable Importance