The goal is to predict whether an email is spam from keyword-frequency features.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
spam <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv')
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
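
The import message can be silenced by declaring the column types up front. A sketch of the same read with an explicit specification (comments summarize the usual data-dictionary descriptions for this dataset):

spam <- readr::read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv',
    col_types = readr::cols(
        crl.tot = readr::col_double(),    # total length of capital-letter runs
        dollar  = readr::col_double(),    # frequency of "$"
        bang    = readr::col_double(),    # frequency of "!"
        money   = readr::col_double(),    # frequency of the word "money"
        n000    = readr::col_double(),    # frequency of the string "000"
        make    = readr::col_double(),    # frequency of the word "make"
        yesno   = readr::col_character()  # spam flag: "y" / "n"
    )
)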

Clean data

skimr::skim(spam)
Data summary

  Name                   spam
  Number of rows         4601
  Number of columns      7
  Column type frequency:
    character            1
    numeric              6
  Group variables        None

Variable type: character

  skim_variable  n_missing  complete_rate  min  max  empty  n_unique  whitespace
  yesno                  0              1    1    1      0         2           0

Variable type: numeric

  skim_variable  n_missing  complete_rate    mean      sd  p0  p25  p50     p75      p100  hist
  crl.tot                0              1  283.29  606.35   1   35   95  266.00  15841.00  ▇▁▁▁▁
  dollar                 0              1    0.08    0.25   0    0    0    0.05      6.00  ▇▁▁▁▁
  bang                   0              1    0.27    0.82   0    0    0    0.32     32.48  ▇▁▁▁▁
  money                  0              1    0.09    0.44   0    0    0    0.00     12.50  ▇▁▁▁▁
  n000                   0              1    0.10    0.35   0    0    0    0.00      5.45  ▇▁▁▁▁
  make                   0              1    0.10    0.31   0    0    0    0.00      4.54  ▇▁▁▁▁
factors_vec <- spam %>% select(yesno) %>% names()

data_clean <- spam %>%
    # recode the outcome: "y" becomes "Y" (spam), "n" becomes "Left" (not spam)
    mutate(yesno = if_else(yesno == "y", "Y", "Left")) %>%
    
    # convert the recoded outcome to a factor for modeling; if_else() returns
    # a character vector, so this must come after the recode, not before it
    mutate(across(all_of(factors_vec), as.factor))
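
If the two-step recode reads awkwardly, an equivalent sketch uses case_when() and builds the factor in one mutate(), producing identical labels:

data_clean <- spam %>%
    mutate(
        yesno = case_when(
            yesno == "y" ~ "Y",   # spam
            .default = "Left"     # not spam
        ),
        yesno = factor(yesno, levels = c("Left", "Y"))
    )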

Explore Data

data_clean %>% count(yesno)
## # A tibble: 2 × 2
##   yesno     n
##   <fct> <int>
## 1 Left   2788
## 2 Y      1813
data_clean %>% 
    ggplot(aes(yesno)) +
    geom_bar()
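
The bar chart shows a moderate class imbalance; proportions make it concrete and motivate the SMOTE step used later:

data_clean %>%
    count(yesno) %>%
    mutate(prop = n / sum(n))
# Left ~0.61, Y ~0.39 given the counts above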

Correlation plot

# step one: Binarize
data_binarized <- data_clean %>%
    binarize()

data_binarized %>% glimpse()
## Rows: 4,601
## Columns: 17
## $ `crl.tot__-Inf_35`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0…
## $ crl.tot__35_95       <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ crl.tot__95_266      <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1…
## $ crl.tot__266_Inf     <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `dollar__-Inf_0.052` <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1…
## $ dollar__0.052_Inf    <dbl> 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0…
## $ `bang__-Inf_0.315`   <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0…
## $ bang__0.315_Inf      <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1…
## $ money__0             <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1…
## $ `money__-OTHER`      <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0…
## $ n000__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1…
## $ `n000__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ make__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1…
## $ make__0.1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `make__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ yesno__Left          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ yesno__Y             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
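
binarize() is running with its defaults here; spelling them out shows where the quartile bins and the "-OTHER" bins in the glimpse come from (argument names from correlationfunnel::binarize):

data_binarized <- data_clean %>%
    binarize(
        n_bins        = 4,        # numeric features cut into quartile bins
        thresh_infreq = 0.01,     # categories rarer than 1% are lumped...
        name_infreq   = "-OTHER", # ...into an "-OTHER" bin
        one_hot       = TRUE      # one 0/1 column per bin
    )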
# Step two: Correlation
data_correlation <- data_binarized %>%
    correlate(yesno__Y)

data_correlation
## # A tibble: 17 × 3
##    feature bin        correlation
##    <fct>   <chr>            <dbl>
##  1 yesno   Left           -1     
##  2 yesno   Y               1     
##  3 dollar  -Inf_0.052     -0.566 
##  4 dollar  0.052_Inf       0.566 
##  5 bang    -Inf_0.315     -0.490 
##  6 bang    0.315_Inf       0.490 
##  7 money   -OTHER          0.475 
##  8 money   0              -0.475 
##  9 n000    0              -0.419 
## 10 n000    -OTHER          0.419 
## 11 crl.tot -Inf_35        -0.360 
## 12 crl.tot 266_Inf         0.299 
## 13 make    0              -0.239 
## 14 make    -OTHER          0.223 
## 15 crl.tot 95_266          0.145 
## 16 crl.tot 35_95          -0.0818
## 17 make    0.1             0.0803
# Step three: plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()
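
To read the funnel numerically, the strongest predictors can be pulled out by their largest absolute correlation, dropping the outcome's own rows:

data_correlation %>%
    filter(feature != "yesno") %>%
    group_by(feature) %>%
    summarise(max_abs_corr = max(abs(correlation))) %>%
    arrange(desc(max_abs_corr))
# dollar and bang lead, followed by money, n000, crl.tot, and make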

Model building

Split data

set.seed(1234) 

data_split <- initial_split(data_clean, strata = yesno)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- rsample::vfold_cv(data_train, strata = yesno)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [3104/346]> Fold01
##  2 <split [3105/345]> Fold02
##  3 <split [3105/345]> Fold03
##  4 <split [3105/345]> Fold04
##  5 <split [3105/345]> Fold05
##  6 <split [3105/345]> Fold06
##  7 <split [3105/345]> Fold07
##  8 <split [3105/345]> Fold08
##  9 <split [3105/345]> Fold09
## 10 <split [3106/344]> Fold10

Preprocess data

library(themis)
xgboost_recipe <- recipes::recipe(yesno ~ ., data = data_train) %>%
    # no-ops on this data (all predictors are numeric); kept so the recipe
    # also works on datasets with nominal predictors. step_other() needs a
    # selector and must run before step_dummy() to have any effect
    step_other(all_nominal_predictors(), threshold = 0.025) %>%
    step_dummy(all_nominal_predictors()) %>%
    # balance the classes by synthesizing minority-class rows
    step_smote(yesno) %>%
    # center and scale the numeric predictors
    step_normalize(all_numeric_predictors())


xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 4,182
## Columns: 7
## $ crl.tot <dbl> -0.4656847, -0.4017893, 2.5488659, -0.4509396, -0.2625302, -0.…
## $ dollar  <dbl> -0.3905966, -0.3905966, -0.3905966, -0.3905966, -0.3905966, -0…
## $ bang    <dbl> -0.17950530, -0.04142900, -0.36157051, -0.36157051, -0.3615705…
## $ money   <dbl> -0.2495720, -0.2495720, -0.2495720, -0.2495720, -0.2495720, -0…
## $ n000    <dbl> -0.3352998, -0.3352998, -0.3352998, -0.3352998, -0.3352998, -0…
## $ make    <dbl> -0.3769207, -0.3769207, -0.3769207, -0.3769207, -0.3769207, -0…
## $ yesno   <fct> Left, Left, Left, Left, Left, Left, Left, Left, Left, Left, Le…
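
A quick sanity check that step_smote() did its job: counting the outcome in the juiced data should show equal classes, consistent with the 4,182 rows above (2,091 majority-class rows in the training split matched by synthetic minority rows):

xgboost_recipe %>% prep() %>% juice() %>% count(yesno)
# expect Left and Y at 2,091 rows each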

Specify Model

usemodels::use_xgboost(yesno ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = yesno ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(76274)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_spec <- 
  # tune() marks a hyperparameter for tuning and takes no value; passing a
  # number (e.g. tune(10)) only mangles the parameter's id
  boost_tree(trees = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 
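
Rather than letting tune_grid() choose candidates automatically (grid = 7 below), the grid could be built explicitly with dials; a sketch with illustrative ranges:

xgboost_grid <- grid_regular(
    trees(range = c(500L, 2000L)),
    tree_depth(range = c(3L, 10L)),
    learn_rate(range = c(-3, -1)),  # dials tunes this on the log10 scale
    levels = 3                      # 3 x 3 x 3 = 27 candidates
)
# then: tune_grid(xgboost_workflow, resamples = data_cv, grid = xgboost_grid)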

Tune hyperparameters

library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
doParallel::registerDoParallel()

set.seed(16613)
xgboost_tune <-
  tune_grid(xgboost_workflow, resamples = data_cv , grid = 7, control = control_grid(save_pred = TRUE))
collect_metrics(xgboost_tune)
## # A tibble: 21 × 9
##    trees tree_depth learn_rate .metric  .estimator   mean     n std_err .config
##    <int>      <int>      <dbl> <chr>    <chr>       <dbl> <int>   <dbl> <chr>  
##  1   833          3    0.0887  accuracy binary     0.874     10 0.00436 Prepro…
##  2   833          3    0.0887  brier_c… binary     0.0985    10 0.00347 Prepro…
##  3   833          3    0.0887  roc_auc  binary     0.921     10 0.00322 Prepro…
##  4  1985          5    0.00154 accuracy binary     0.873     10 0.00247 Prepro…
##  5  1985          5    0.00154 brier_c… binary     0.0978    10 0.00322 Prepro…
##  6  1985          5    0.00154 roc_auc  binary     0.922     10 0.00409 Prepro…
##  7   511          6    0.0438  accuracy binary     0.875     10 0.00508 Prepro…
##  8   511          6    0.0438  brier_c… binary     0.0983    10 0.00369 Prepro…
##  9   511          6    0.0438  roc_auc  binary     0.922     10 0.00378 Prepro…
## 10  1459          9    0.00353 accuracy binary     0.874     10 0.00369 Prepro…
## # ℹ 11 more rows
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(yesno, .pred_Left) %>%
    autoplot()
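
Before finalizing, show_best() gives a quick look at the leading candidates for whichever metric matters most:

show_best(xgboost_tune, metric = "roc_auc", n = 3)
show_best(xgboost_tune, metric = "accuracy", n = 3)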

xgboost_last <- xgboost_workflow %>% 
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary        0.878  Preprocessor1_Model1
## 2 roc_auc     binary        0.916  Preprocessor1_Model1
## 3 brier_class binary        0.0980 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(yesno, .pred_class) %>%
    autoplot()
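
Accuracy alone can flatter a model on imbalanced data; a yardstick metric set over the held-out predictions adds sensitivity, specificity, and F1 (the first factor level, "Left", is treated as the event by default):

class_metrics <- yardstick::metric_set(sens, spec, f_meas)

collect_predictions(xgboost_last) %>%
    class_metrics(truth = yesno, estimate = .pred_class)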

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()
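
To score new emails with the finalized model, the fitted workflow can be extracted from the last_fit object; new_emails below is a hypothetical data frame with the same six predictor columns:

final_wf <- tune::extract_workflow(xgboost_last)

# hypothetical new data with columns crl.tot, dollar, bang, money, n000, make
# predict(final_wf, new_data = new_emails)                  # class labels
# predict(final_wf, new_data = new_emails, type = "prob")   # probabilities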

Conclusion

The previous model had an accuracy of 0.855 and an AUC of 0.906.

Normalizing the data produced no improvement in accuracy or AUC.

Raising the learning rate and expanding the tuning grid to 7 candidates lifted test-set accuracy to 0.878 and AUC to 0.916.

Changing the step_other threshold showed no improvement, which is expected: all predictors here are numeric, so the step has nothing to collapse.