Goal: Predict spam emails based on word- and character-frequency features

Import Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## Warning: package 'correlationfunnel' was built under R version 4.4.2

## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.4.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.2
## ✔ recipes      1.1.0

## Warning: package 'dials' was built under R version 4.4.2

## Warning: package 'infer' was built under R version 4.4.2

## Warning: package 'modeldata' was built under R version 4.4.2

## Warning: package 'parsnip' was built under R version 4.4.2

## Warning: package 'tune' was built under R version 4.4.2

## Warning: package 'workflows' was built under R version 4.4.2

## Warning: package 'workflowsets' was built under R version 4.4.2

## Warning: package 'yardstick' was built under R version 4.4.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

library(themis)

## Warning: package 'themis' was built under R version 4.4.3

# Download the TidyTuesday (2023-08-15) spam dataset straight from GitHub.
spam_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-08-15/spam.csv"
spam <- readr::read_csv(file = spam_url)

## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean Data

# Summarise every column of `spam` (type, missingness, distribution) to spot
# data-quality issues before cleaning.
skimr::skim(spam)

Data summary
Name	spam
Number of rows	4601
Number of columns	7
_______________________
Column type frequency:
character	1
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
yesno	0	1	1	1	0	2	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
crl.tot	1	283.29	606.35	1	35	95	266.00	15841.00	▇▁▁▁▁
dollar	1	0.08	0.25	0	0	0	0.05	6.00	▇▁▁▁▁
bang	1	0.27	0.82	0	0	0	0.32	32.48	▇▁▁▁▁
money	1	0.09	0.44	0	0	0	0.00	12.50	▇▁▁▁▁
n000	1	0.10	0.35	0	0	0	0.00	5.45	▇▁▁▁▁
make	1	0.10	0.31	0	0	0	0.00	4.54	▇▁▁▁▁

Issues with Data:

No missing values
The target variable is yesno (spam or not spam)
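Numeric predictors: word/character frequencies (dollar, bang, money, n000, make) plus crl.tot, the total length of uninterrupted runs of capital letters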
No apparent ID variables to drop
Need to convert the target variable from character to a factor, with the spam class ("y") as the first level

# Recode the character target as a factor whose first level is "y" (spam),
# then print the cleaned tibble.
spam_clean <- mutate(
  spam,
  yesno = factor(yesno, levels = c("y", "n"))
)

spam_clean

## # A tibble: 4,601 × 7
##    crl.tot dollar  bang money  n000  make yesno
##      <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
##  1     278  0     0.778  0     0     0    y    
##  2    1028  0.18  0.372  0.43  0.43  0.21 y    
##  3    2259  0.184 0.276  0.06  1.16  0.06 y    
##  4     191  0     0.137  0     0     0    y    
##  5     191  0     0.135  0     0     0    y    
##  6      54  0     0      0     0     0    y    
##  7     112  0.054 0.164  0     0     0    y    
##  8      49  0     0      0     0     0    y    
##  9    1257  0.203 0.181  0.15  0     0.15 y    
## 10     749  0.081 0.244  0     0.19  0.06 y    
## # ℹ 4,591 more rows

Explore Data

# Class balance of the target.
count(spam_clean, yesno)

## # A tibble: 2 × 2
##   yesno     n
##   <fct> <int>
## 1 y      1813
## 2 n      2788

# Bar chart of the number of emails in each class.
ggplot(spam_clean, aes(x = yesno)) +
  geom_bar()

Spam vs. CRL Total

# Distribution of crl.tot within each class.
ggplot(spam_clean, aes(x = yesno, y = crl.tot)) +
  geom_boxplot()

Correlation Analysis

# Step 1: Binarize
# Expand every column (target included) into 0/1 indicator columns.
spam_binarized <- binarize(spam_clean)

glimpse(spam_binarized)

## Rows: 4,601
## Columns: 17
## $ `crl.tot__-Inf_35`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0…
## $ crl.tot__35_95       <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ crl.tot__95_266      <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1…
## $ crl.tot__266_Inf     <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `dollar__-Inf_0.052` <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1…
## $ dollar__0.052_Inf    <dbl> 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0…
## $ `bang__-Inf_0.315`   <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0…
## $ bang__0.315_Inf      <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1…
## $ money__0             <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1…
## $ `money__-OTHER`      <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0…
## $ n000__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1…
## $ `n000__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ make__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1…
## $ make__0.1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `make__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ yesno__y             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ yesno__n             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

# Step 2: Correlation
# Correlate every indicator column with the spam indicator `yesno__y`.
spam_correlation <- correlate(spam_binarized, target = yesno__y)

# Step 3: Sort and Analyze Correlations
# Strongest positive association first.
spam_correlation_sorted <- arrange(spam_correlation, desc(correlation))

spam_correlation_sorted

## # A tibble: 17 × 3
##    feature bin        correlation
##    <fct>   <chr>            <dbl>
##  1 yesno   y               1     
##  2 dollar  0.052_Inf       0.566 
##  3 bang    0.315_Inf       0.490 
##  4 money   -OTHER          0.475 
##  5 n000    -OTHER          0.419 
##  6 crl.tot 266_Inf         0.299 
##  7 make    -OTHER          0.223 
##  8 crl.tot 95_266          0.145 
##  9 make    0.1             0.0803
## 10 crl.tot 35_95          -0.0818
## 11 make    0              -0.239 
## 12 crl.tot -Inf_35        -0.360 
## 13 n000    0              -0.419 
## 14 money   0              -0.475 
## 15 bang    -Inf_0.315     -0.490 
## 16 dollar  -Inf_0.052     -0.566 
## 17 yesno   n              -1

# Step 4: Plot
# Draw the correlation funnel from the sorted correlations.
correlationfunnel::plot_correlation_funnel(spam_correlation_sorted)

Split Data

# Seed the RNG once so both the train/test split and the CV folds below are
# reproducible.
set.seed(1234)
spam_split <- spam_clean %>% initial_split(strata = yesno)
spam_train <- spam_split %>% training()
spam_test <- spam_split %>% testing()

# Cross-validation folds, stratified on the target, built from the training
# set only.
spam_cv <- spam_train %>% vfold_cv(strata = yesno)
spam_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [3104/346]> Fold01
##  2 <split [3105/345]> Fold02
##  3 <split [3105/345]> Fold03
##  4 <split [3105/345]> Fold04
##  5 <split [3105/345]> Fold05
##  6 <split [3105/345]> Fold06
##  7 <split [3105/345]> Fold07
##  8 <split [3105/345]> Fold08
##  9 <split [3105/345]> Fold09
## 10 <split [3106/344]> Fold10

Preprocess Data

# Recipe: model `yesno` from all remaining columns and apply a Yeo-Johnson
# transformation to every numeric predictor.
# NOTE(review): tree-based boosters are generally insensitive to monotone
# transformations, so this step may be optional for xgboost; confirm.
spam_recipe <- recipe(yesno ~ ., data = spam_train) %>%
    step_YeoJohnson(all_numeric_predictors())

# Preview the processed training data. bake(new_data = NULL) is the current
# replacement for the superseded juice() and returns the same training set.
spam_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()

## Rows: 3,450
## Columns: 7
## $ crl.tot <dbl> 3.260223, 4.132217, 7.347801, 3.541443, 4.935058, 3.330126, 2.…
## $ dollar  <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000,…
## $ bang    <dbl> 0.10652716, 0.15135901, 0.00000000, 0.00000000, 0.00000000, 0.…
## $ money   <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
## $ n000    <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
## $ make    <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
## $ yesno   <fct> n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,…

Specify Model

# Boosted-tree classifier on the xgboost engine; the number of trees is left
# as a tuning parameter.
spam_xgboost_spec <-
  boost_tree(trees = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the preprocessing recipe and the model spec into a single workflow.
spam_xgboost_workflow <- workflow(
  preprocessor = spam_recipe,
  spec = spam_xgboost_spec
)

Tune Hyperparameters

# Register a parallel backend for the resampling work.
# NOTE(review): newer tune releases favour the future framework over foreach;
# confirm this backend is still honoured before upgrading tune.
doParallel::registerDoParallel()

# Evaluate 5 candidate values of `trees` on the CV folds, keeping the
# out-of-fold predictions for later diagnostics.
set.seed(43931)
spam_xgboost_tune <- spam_xgboost_workflow %>%
  tune_grid(
    resamples = spam_cv,
    grid = 5,
    control = control_grid(save_pred = TRUE)
  )

Model Evaluation

Identify optimal values for hyperparameters

# Resampled performance for every candidate value of `trees`.
spam_xgboost_tune %>% collect_metrics()

## # A tibble: 15 × 7
##    trees .metric     .estimator  mean     n std_err .config             
##    <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>               
##  1   356 accuracy    binary     0.868    10 0.00429 Preprocessor1_Model1
##  2   356 brier_class binary     0.106    10 0.00317 Preprocessor1_Model1
##  3   356 roc_auc     binary     0.909    10 0.00334 Preprocessor1_Model1
##  4   425 accuracy    binary     0.868    10 0.00352 Preprocessor1_Model2
##  5   425 brier_class binary     0.107    10 0.00312 Preprocessor1_Model2
##  6   425 roc_auc     binary     0.907    10 0.00341 Preprocessor1_Model2
##  7   909 accuracy    binary     0.865    10 0.00430 Preprocessor1_Model3
##  8   909 brier_class binary     0.113    10 0.00311 Preprocessor1_Model3
##  9   909 roc_auc     binary     0.901    10 0.00320 Preprocessor1_Model3
## 10  1505 accuracy    binary     0.863    10 0.00436 Preprocessor1_Model4
## 11  1505 brier_class binary     0.117    10 0.00321 Preprocessor1_Model4
## 12  1505 roc_auc     binary     0.899    10 0.00342 Preprocessor1_Model4
## 13  1733 accuracy    binary     0.863    10 0.00456 Preprocessor1_Model5
## 14  1733 brier_class binary     0.118    10 0.00322 Preprocessor1_Model5
## 15  1733 roc_auc     binary     0.898    10 0.00341 Preprocessor1_Model5

# Per-fold ROC curves for the selected candidate only.
# Without the `parameters` filter, collect_predictions() returns the held-out
# predictions of all tuning candidates, so each fold's curve would pool
# several different models.
collect_predictions(
    spam_xgboost_tune,
    parameters = select_best(spam_xgboost_tune, metric = "accuracy")
) %>%
    group_by(id) %>%
    roc_curve(yesno, .pred_y) %>%
    autoplot()

Fit the model for the last time

# Lock in the `trees` value with the best cross-validated accuracy, then fit
# on the training set and evaluate once on the held-out test set.
spam_best_params <- select_best(spam_xgboost_tune, metric = "accuracy")
spam_xgboost_final <- finalize_workflow(
  spam_xgboost_workflow,
  spam_best_params
)
spam_xgboost_last <- last_fit(spam_xgboost_final, spam_split)

## Warning: package 'xgboost' was built under R version 4.4.2

# Test-set metrics of the final fit.
spam_xgboost_last %>% collect_metrics()

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.867 Preprocessor1_Model1
## 2 roc_auc     binary         0.913 Preprocessor1_Model1
## 3 brier_class binary         0.102 Preprocessor1_Model1

# Confusion matrix of the test-set predictions, drawn as a plot.
spam_xgboost_last %>%
  collect_predictions() %>%
  yardstick::conf_mat(truth = yesno, estimate = .pred_class) %>%
  autoplot()

Variable Importance

library(vip)

## Warning: package 'vip' was built under R version 4.4.3

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Pull the underlying engine fit out of the final workflow and plot its
# variable importance.
spam_xgboost_engine <- workflows::extract_fit_engine(spam_xgboost_last)
vip(spam_xgboost_engine)

Apply 7

Adam Cilley

2025-04-01

Goal: Predict spam emails based on word frequency features

Import Data

Clean Data

Issues with Data:

Explore Data

Spam vs. CRL Total

Correlation Analysis

Split Data

Preprocess Data

Specify Model

Tune Hyperparameters

Model Evaluation

Identify optimal values for hyperparameters

Fit the model for the last time

Variable Importance