Goal is to predict the spam emails

Import data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-08-15/spam.csv')

## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data %>% skimr::skim()

Data summary
Name	Piped data
Number of rows	4601
Number of columns	7
_______________________
Column type frequency:
character	1
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
yesno	0	1	1	1	0	2	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
crl.tot	1	283.29	606.35	1	35	95	266.00	15841.00	▇▁▁▁▁
dollar	1	0.08	0.25	0	0	0	0.05	6.00	▇▁▁▁▁
bang	1	0.27	0.82	0	0	0	0.32	32.48	▇▁▁▁▁
money	1	0.09	0.44	0	0	0	0.00	12.50	▇▁▁▁▁
n000	1	0.10	0.35	0	0	0	0.00	5.45	▇▁▁▁▁
make	1	0.10	0.31	0	0	0	0.00	4.54	▇▁▁▁▁

clean data

data_clean <- data %>% 
    
    # Address factors imported as numeric 
    # mutate(across(where(is.character), as.factor)) %>%
    mutate(yesno = factor(yesno, levels = c("y", "n"))) %>%
    
    # Recode yesno
     mutate(yesno = if_else(yesno == "y", "Spam", yesno))

data_clean

## # A tibble: 4,601 × 7
##    crl.tot dollar  bang money  n000  make yesno
##      <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
##  1     278  0     0.778  0     0     0    Spam 
##  2    1028  0.18  0.372  0.43  0.43  0.21 Spam 
##  3    2259  0.184 0.276  0.06  1.16  0.06 Spam 
##  4     191  0     0.137  0     0     0    Spam 
##  5     191  0     0.135  0     0     0    Spam 
##  6      54  0     0      0     0     0    Spam 
##  7     112  0.054 0.164  0     0     0    Spam 
##  8      49  0     0      0     0     0    Spam 
##  9    1257  0.203 0.181  0.15  0     0.15 Spam 
## 10     749  0.081 0.244  0     0.19  0.06 Spam 
## # ℹ 4,591 more rows

skimr::skim(data)

Data summary
Name	data
Number of rows	4601
Number of columns	7
_______________________
Column type frequency:
character	1
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
yesno	0	1	1	1	0	2	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
crl.tot	1	283.29	606.35	1	35	95	266.00	15841.00	▇▁▁▁▁
dollar	1	0.08	0.25	0	0	0	0.05	6.00	▇▁▁▁▁
bang	1	0.27	0.82	0	0	0	0.32	32.48	▇▁▁▁▁
money	1	0.09	0.44	0	0	0	0.00	12.50	▇▁▁▁▁
n000	1	0.10	0.35	0	0	0	0.00	5.45	▇▁▁▁▁
make	1	0.10	0.31	0	0	0	0.00	4.54	▇▁▁▁▁

Explore data

data %>% count(yesno)

## # A tibble: 2 × 2
##   yesno     n
##   <chr> <int>
## 1 n      2788
## 2 y      1813

data %>% 
    ggplot(aes(yesno)) +
    geom_bar()

yesno vs. dollar

data %>%
    ggplot(aes(yesno, dollar)) + 
    geom_boxplot()

correlation plot

# step 1: binarize
data_binarized <- data %>% 
    select(-money) %>% 
    binarize()

data_binarized %>% glimpse()

## Rows: 4,601
## Columns: 15
## $ `crl.tot__-Inf_35`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0…
## $ crl.tot__35_95       <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ crl.tot__95_266      <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1…
## $ crl.tot__266_Inf     <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `dollar__-Inf_0.052` <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1…
## $ dollar__0.052_Inf    <dbl> 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0…
## $ `bang__-Inf_0.315`   <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0…
## $ bang__0.315_Inf      <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1…
## $ n000__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1…
## $ `n000__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ make__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1…
## $ make__0.1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `make__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ yesno__n             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ yesno__y             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…

# step 2: correlation
data_correlation <- data_binarized %>% 
    correlate(yesno__y)

data_correlation

## # A tibble: 15 × 3
##    feature bin        correlation
##    <fct>   <chr>            <dbl>
##  1 yesno   n              -1     
##  2 yesno   y               1     
##  3 dollar  -Inf_0.052     -0.566 
##  4 dollar  0.052_Inf       0.566 
##  5 bang    -Inf_0.315     -0.490 
##  6 bang    0.315_Inf       0.490 
##  7 n000    0              -0.419 
##  8 n000    -OTHER          0.419 
##  9 crl.tot -Inf_35        -0.360 
## 10 crl.tot 266_Inf         0.299 
## 11 make    0              -0.239 
## 12 make    -OTHER          0.223 
## 13 crl.tot 95_266          0.145 
## 14 crl.tot 35_95          -0.0818
## 15 make    0.1             0.0803

# step 3: plot 
data_correlation %>% 
    correlationfunnel::plot_correlation_funnel()

Model building

Split data

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.7     ✔ rsample      1.2.1
## ✔ dials        1.4.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.3.0     ✔ yardstick    1.3.2
## ✔ recipes      1.1.0

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

set.seed(1234)
data_clean <- data_clean %>% sample_n(100)

data_split <- initial_split(data_clean, strata = yesno)
data_train <- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train, strata = yesno)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [66/8]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [66/8]> Fold04
##  5 <split [67/7]> Fold05
##  6 <split [67/7]> Fold06
##  7 <split [67/7]> Fold07
##  8 <split [67/7]> Fold08
##  9 <split [67/7]> Fold09
## 10 <split [67/7]> Fold10

Preprocess data

library(themis)

xgboost_rec <- recipes::recipe(yesno ~ ., data = data_train) %>% 
    step_YeoJohnson(all_numeric_predictors()) %>% 
    step_normalize(all_numeric_predictors())
    # step_pca(all_numeric_predictors(), num_comp = 3)

xgboost_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 74
## Columns: 7
## $ crl.tot <dbl> -0.3738610, -1.3525904, -1.4794323, -1.1150125, -1.1150125, -0…
## $ dollar  <dbl> -0.2138871, -0.2138871, -0.2138871, -0.2138871, -0.2138871, -0…
## $ bang    <dbl> 0.2560991, -0.8397250, -0.8397250, -0.8397250, -0.8397250, -0.…
## $ money   <dbl> -0.24509002, -0.24509002, -0.24509002, -0.24509002, -0.2450900…
## $ n000    <dbl> -0.2691836, -0.2691836, -0.2691836, -0.2691836, -0.2691836, -0…
## $ make    <dbl> -0.3425374, -0.3425374, -0.3425374, -0.3425374, -0.3425374, -0…
## $ yesno   <fct> n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,…

Specify model

xgboost_spec <- 
  boost_tree(trees = tune(), tree_depth = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_rec) %>% 
  add_model(xgboost_spec)

Tune hyperparameters

tree_grid <- grid_regular(trees(),
                          tree_depth(),
                          levels = 10)

doParallel::registerDoParallel()

set.seed(65677)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv,
            grid = 5,
            control = control_grid(save_pred = TRUE))

Model evaluation

Identify optimal values for hyperparameters

collect_metrics(xgboost_tune)

## # A tibble: 15 × 8
##    trees tree_depth .metric     .estimator  mean     n std_err .config          
##    <int>      <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>            
##  1  1743          2 accuracy    binary     0.746    10  0.0438 Preprocessor1_Mo…
##  2  1743          2 brier_class binary     0.200    10  0.0396 Preprocessor1_Mo…
##  3  1743          2 roc_auc     binary     0.797    10  0.0688 Preprocessor1_Mo…
##  4   126          5 accuracy    binary     0.745    10  0.0404 Preprocessor1_Mo…
##  5   126          5 brier_class binary     0.176    10  0.0377 Preprocessor1_Mo…
##  6   126          5 roc_auc     binary     0.815    10  0.0634 Preprocessor1_Mo…
##  7  1009          7 accuracy    binary     0.773    10  0.0424 Preprocessor1_Mo…
##  8  1009          7 brier_class binary     0.189    10  0.0407 Preprocessor1_Mo…
##  9  1009          7 roc_auc     binary     0.797    10  0.0716 Preprocessor1_Mo…
## 10  1359         10 accuracy    binary     0.773    10  0.0424 Preprocessor1_Mo…
## 11  1359         10 brier_class binary     0.191    10  0.0407 Preprocessor1_Mo…
## 12  1359         10 roc_auc     binary     0.79     10  0.0772 Preprocessor1_Mo…
## 13   749         13 accuracy    binary     0.773    10  0.0424 Preprocessor1_Mo…
## 14   749         13 brier_class binary     0.186    10  0.0408 Preprocessor1_Mo…
## 15   749         13 roc_auc     binary     0.803    10  0.0662 Preprocessor1_Mo…

collect_predictions(xgboost_tune) %>% 
    group_by(id) %>% 
    roc_curve(yesno, .pred_Spam) %>% 
    autoplot()

Fit the model for the last time

xgboost_last <- xgboost_workflow %>% 
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>% 
    last_fit(data_split)

collect_metrics(xgboost_last)

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.808 Preprocessor1_Model1
## 2 roc_auc     binary         0.861 Preprocessor1_Model1
## 3 brier_class binary         0.172 Preprocessor1_Model1

collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(yesno, .pred_class) %>% 
    autoplot()

Variable importance

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgboost_last %>% 
    workflows::extract_fit_engine() %>%
    vip()

Conclusion

The previous model had accuracy of 0.808 and AUC of 0.861

Feature transformation: normalized numeric data. It resulted in the same accuracy of 0.808 and AUC of 0.861
Feature selection: PCA didn’t make an improvement but went down tremendously with an accuracy of 0.692 and an AUC of 0.776
Adding more hyperparameters resulted in the same accuracy as before of 0.808 and AUC of 0.861

Apply 5

Alden Dimick

2025-03-25