The goal is to predict whether an email is spam from keyword-frequency features.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
spam <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv')
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
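
The import message can be silenced by declaring the column types up front. A sketch of the same read with an explicit specification (comments summarize the usual data-dictionary descriptions for this dataset):

spam <- readr::read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv',
    col_types = readr::cols(
        crl.tot = readr::col_double(),    # total length of capital-letter runs
        dollar  = readr::col_double(),    # frequency of "$"
        bang    = readr::col_double(),    # frequency of "!"
        money   = readr::col_double(),    # frequency of the word "money"
        n000    = readr::col_double(),    # frequency of the string "000"
        make    = readr::col_double(),    # frequency of the word "make"
        yesno   = readr::col_character()  # spam flag: "y" / "n"
    )
)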

Clean data

skimr::skim(spam)
Data summary

  Name                   spam
  Number of rows         4601
  Number of columns      7
  Column type frequency:
    character            1
    numeric              6
  Group variables        None

Variable type: character

  skim_variable  n_missing  complete_rate  min  max  empty  n_unique  whitespace
  yesno                  0              1    1    1      0         2           0

Variable type: numeric

  skim_variable  n_missing  complete_rate    mean      sd  p0  p25  p50     p75      p100  hist
  crl.tot                0              1  283.29  606.35   1   35   95  266.00  15841.00  ▇▁▁▁▁
  dollar                 0              1    0.08    0.25   0    0    0    0.05      6.00  ▇▁▁▁▁
  bang                   0              1    0.27    0.82   0    0    0    0.32     32.48  ▇▁▁▁▁
  money                  0              1    0.09    0.44   0    0    0    0.00     12.50  ▇▁▁▁▁
  n000                   0              1    0.10    0.35   0    0    0    0.00      5.45  ▇▁▁▁▁
  make                   0              1    0.10    0.31   0    0    0    0.00      4.54  ▇▁▁▁▁
factors_vec <- spam %>% select(yesno) %>% names()

data_clean <- spam %>%
    # recode the outcome: "y" becomes "Y" (spam), "n" becomes "Left" (not spam)
    mutate(yesno = if_else(yesno == "y", "Y", "Left")) %>%
    
    # convert the recoded outcome to a factor for modeling; if_else() returns
    # a character vector, so this must come after the recode, not before it
    mutate(across(all_of(factors_vec), as.factor))
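
If the two-step recode reads awkwardly, an equivalent sketch uses case_when() and builds the factor in one mutate(), producing identical labels:

data_clean <- spam %>%
    mutate(
        yesno = case_when(
            yesno == "y" ~ "Y",   # spam
            .default = "Left"     # not spam
        ),
        yesno = factor(yesno, levels = c("Left", "Y"))
    )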

Explore Data

data_clean %>% count(yesno)
## # A tibble: 2 × 2
##   yesno     n
##   <fct> <int>
## 1 Left   2788
## 2 Y      1813
data_clean %>% 
    ggplot(aes(yesno)) +
    geom_bar()
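
The bar chart shows a moderate class imbalance; proportions make it concrete and motivate the SMOTE step used later:

data_clean %>%
    count(yesno) %>%
    mutate(prop = n / sum(n))
# Left ~0.61, Y ~0.39 given the counts above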

Correlation plot

# step one: Binarize
data_binarized <- data_clean %>%
    binarize()

data_binarized %>% glimpse()
## Rows: 4,601
## Columns: 17
## $ `crl.tot__-Inf_35`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0…
## $ crl.tot__35_95       <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ crl.tot__95_266      <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1…
## $ crl.tot__266_Inf     <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `dollar__-Inf_0.052` <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1…
## $ dollar__0.052_Inf    <dbl> 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0…
## $ `bang__-Inf_0.315`   <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0…
## $ bang__0.315_Inf      <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1…
## $ money__0             <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1…
## $ `money__-OTHER`      <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0…
## $ n000__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1…
## $ `n000__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ make__0              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1…
## $ make__0.1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `make__-OTHER`       <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ yesno__Left          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ yesno__Y             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
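
binarize() is running with its defaults here; spelling them out shows where the quartile bins and the "-OTHER" bins in the glimpse come from (argument names from correlationfunnel::binarize):

data_binarized <- data_clean %>%
    binarize(
        n_bins        = 4,        # numeric features cut into quartile bins
        thresh_infreq = 0.01,     # categories rarer than 1% are lumped...
        name_infreq   = "-OTHER", # ...into an "-OTHER" bin
        one_hot       = TRUE      # one 0/1 column per bin
    )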
# Step two: Correlation
data_correlation <- data_binarized %>%
    correlate(yesno__Y)

data_correlation
## # A tibble: 17 × 3
##    feature bin        correlation
##    <fct>   <chr>            <dbl>
##  1 yesno   Left           -1     
##  2 yesno   Y               1     
##  3 dollar  -Inf_0.052     -0.566 
##  4 dollar  0.052_Inf       0.566 
##  5 bang    -Inf_0.315     -0.490 
##  6 bang    0.315_Inf       0.490 
##  7 money   -OTHER          0.475 
##  8 money   0              -0.475 
##  9 n000    0              -0.419 
## 10 n000    -OTHER          0.419 
## 11 crl.tot -Inf_35        -0.360 
## 12 crl.tot 266_Inf         0.299 
## 13 make    0              -0.239 
## 14 make    -OTHER          0.223 
## 15 crl.tot 95_266          0.145 
## 16 crl.tot 35_95          -0.0818
## 17 make    0.1             0.0803
# Step three: plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()
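
To read the funnel numerically, the strongest predictors can be pulled out by their largest absolute correlation, dropping the outcome's own rows:

data_correlation %>%
    filter(feature != "yesno") %>%
    group_by(feature) %>%
    summarise(max_abs_corr = max(abs(correlation))) %>%
    arrange(desc(max_abs_corr))
# dollar and bang lead, followed by money, n000, crl.tot, and make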

Model building

Split data

set.seed(1234) 

data_split <- initial_split(data_clean, strata = yesno)
data_train <- training(data_split)
data_test <- testing(data_split)
data_cv <- rsample::vfold_cv(data_train, strata = yesno)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [3104/346]> Fold01
##  2 <split [3105/345]> Fold02
##  3 <split [3105/345]> Fold03
##  4 <split [3105/345]> Fold04
##  5 <split [3105/345]> Fold05
##  6 <split [3105/345]> Fold06
##  7 <split [3105/345]> Fold07
##  8 <split [3105/345]> Fold08
##  9 <split [3105/345]> Fold09
## 10 <split [3106/344]> Fold10

Preprocess data

library(themis)
xgboost_recipe <- recipes::recipe(yesno ~ ., data = data_train) %>%
    # no-ops on this data (all predictors are numeric); kept so the recipe
    # also works on datasets with nominal predictors. step_other() needs a
    # selector and must run before step_dummy() to have any effect
    step_other(all_nominal_predictors(), threshold = 0.025) %>%
    step_dummy(all_nominal_predictors()) %>%
    # balance the classes by synthesizing minority-class rows
    step_smote(yesno) %>%
    # center and scale the numeric predictors
    step_normalize(all_numeric_predictors())


xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 4,182
## Columns: 7
## $ crl.tot <dbl> -0.4656847, -0.4017893, 2.5488659, -0.4509396, -0.2625302, -0.…
## $ dollar  <dbl> -0.3905966, -0.3905966, -0.3905966, -0.3905966, -0.3905966, -0…
## $ bang    <dbl> -0.17950530, -0.04142900, -0.36157051, -0.36157051, -0.3615705…
## $ money   <dbl> -0.2495720, -0.2495720, -0.2495720, -0.2495720, -0.2495720, -0…
## $ n000    <dbl> -0.3352998, -0.3352998, -0.3352998, -0.3352998, -0.3352998, -0…
## $ make    <dbl> -0.3769207, -0.3769207, -0.3769207, -0.3769207, -0.3769207, -0…
## $ yesno   <fct> Left, Left, Left, Left, Left, Left, Left, Left, Left, Left, Le…
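
A quick sanity check that step_smote() did its job: counting the outcome in the juiced data should show equal classes, consistent with the 4,182 rows above (2,091 majority-class rows in the training split matched by synthetic minority rows):

xgboost_recipe %>% prep() %>% juice() %>% count(yesno)
# expect Left and Y at 2,091 rows each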

Specify Model

usemodels::use_xgboost(yesno ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = yesno ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(76274)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_spec <- 
  # tune() marks a hyperparameter for tuning and takes no value; passing a
  # number (e.g. tune(10)) only mangles the parameter's id
  boost_tree(trees = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 
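
Rather than letting tune_grid() choose candidates automatically (grid = 7 below), the grid could be built explicitly with dials; a sketch with illustrative ranges:

xgboost_grid <- grid_regular(
    trees(range = c(500L, 2000L)),
    tree_depth(range = c(3L, 10L)),
    learn_rate(range = c(-3, -1)),  # dials tunes this on the log10 scale
    levels = 3                      # 3 x 3 x 3 = 27 candidates
)
# then: tune_grid(xgboost_workflow, resamples = data_cv, grid = xgboost_grid)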

Tune hyperparameters

library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
doParallel::registerDoParallel()

set.seed(16613)
xgboost_tune <-
  tune_grid(xgboost_workflow, resamples = data_cv , grid = 7, control = control_grid(save_pred = TRUE))
collect_metrics(xgboost_tune)
## # A tibble: 21 × 9
##    trees tree_depth learn_rate .metric  .estimator   mean     n std_err .config
##    <int>      <int>      <dbl> <chr>    <chr>       <dbl> <int>   <dbl> <chr>  
##  1   833          3    0.0887  accuracy binary     0.874     10 0.00436 Prepro…
##  2   833          3    0.0887  brier_c… binary     0.0985    10 0.00347 Prepro…
##  3   833          3    0.0887  roc_auc  binary     0.921     10 0.00322 Prepro…
##  4  1985          5    0.00154 accuracy binary     0.873     10 0.00247 Prepro…
##  5  1985          5    0.00154 brier_c… binary     0.0978    10 0.00322 Prepro…
##  6  1985          5    0.00154 roc_auc  binary     0.922     10 0.00409 Prepro…
##  7   511          6    0.0438  accuracy binary     0.875     10 0.00508 Prepro…
##  8   511          6    0.0438  brier_c… binary     0.0983    10 0.00369 Prepro…
##  9   511          6    0.0438  roc_auc  binary     0.922     10 0.00378 Prepro…
## 10  1459          9    0.00353 accuracy binary     0.874     10 0.00369 Prepro…
## # ℹ 11 more rows
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(yesno, .pred_Left) %>%
    autoplot()
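
Before finalizing, show_best() gives a quick look at the leading candidates for whichever metric matters most:

show_best(xgboost_tune, metric = "roc_auc", n = 3)
show_best(xgboost_tune, metric = "accuracy", n = 3)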

xgboost_last <- xgboost_workflow %>% 
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary        0.878  Preprocessor1_Model1
## 2 roc_auc     binary        0.916  Preprocessor1_Model1
## 3 brier_class binary        0.0980 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(yesno, .pred_class) %>%
    autoplot()
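
Accuracy alone can flatter a model on imbalanced data; a yardstick metric set over the held-out predictions adds sensitivity, specificity, and F1 (the first factor level, "Left", is treated as the event by default):

class_metrics <- yardstick::metric_set(sens, spec, f_meas)

collect_predictions(xgboost_last) %>%
    class_metrics(truth = yesno, estimate = .pred_class)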

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()
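
To score new emails with the finalized model, the fitted workflow can be extracted from the last_fit object; new_emails below is a hypothetical data frame with the same six predictor columns:

final_wf <- tune::extract_workflow(xgboost_last)

# hypothetical new data with columns crl.tot, dollar, bang, money, n000, make
# predict(final_wf, new_data = new_emails)                  # class labels
# predict(final_wf, new_data = new_emails, type = "prob")   # probabilities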

Conclusion

The previous model had an accuracy of 0.855 and an AUC of 0.906.

Normalizing the data produced no improvement in accuracy or AUC.

Raising the learning rate and expanding the tuning grid to 7 candidates lifted test-set accuracy to 0.878 and AUC to 0.916.

Changing the step_other threshold showed no improvement, which is expected: all predictors here are numeric, so the step has nothing to collapse.