library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
spam <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv')
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
spam %>% skimr::skim()
| Data summary           |            |
|:-----------------------|:-----------|
| Name                   | Piped data |
| Number of rows         | 4601       |
| Number of columns      | 7          |
| Column type frequency: |            |
| character              | 1          |
| numeric                | 6          |
| Group variables        | None       |

**Variable type: character**

| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|:--------------|----------:|--------------:|----:|----:|------:|---------:|-----------:|
| yesno         | 0         | 1             | 1   | 1   | 0     | 2        | 0          |

**Variable type: numeric**

| skim_variable | n_missing | complete_rate | mean   | sd     | p0 | p25 | p50 | p75    | p100     | hist  |
|:--------------|----------:|--------------:|-------:|-------:|---:|----:|----:|-------:|---------:|:------|
| crl.tot       | 0         | 1             | 283.29 | 606.35 | 1  | 35  | 95  | 266.00 | 15841.00 | ▇▁▁▁▁ |
| dollar        | 0         | 1             | 0.08   | 0.25   | 0  | 0   | 0   | 0.05   | 6.00     | ▇▁▁▁▁ |
| bang          | 0         | 1             | 0.27   | 0.82   | 0  | 0   | 0   | 0.32   | 32.48    | ▇▁▁▁▁ |
| money         | 0         | 1             | 0.09   | 0.44   | 0  | 0   | 0   | 0.00   | 12.50    | ▇▁▁▁▁ |
| n000          | 0         | 1             | 0.10   | 0.35   | 0  | 0   | 0   | 0.00   | 5.45     | ▇▁▁▁▁ |
| make          | 0         | 1             | 0.10   | 0.31   | 0  | 0   | 0   | 0.00   | 4.54     | ▇▁▁▁▁ |
colSums(is.na(spam))
## crl.tot dollar bang money n000 make yesno
## 0 0 0 0 0 0 0
# No missing data
# Remove extreme outliers in bang and crl.tot, then drop duplicate rows
spam <- spam %>%
  filter(bang < 2500, crl.tot < 2500) %>%
  distinct()
# Convert the character outcome to a factor and print the result
spam <- spam %>%
  mutate(across(where(is.character), as.factor))
spam
## # A tibble: 2,982 × 7
## crl.tot dollar bang money n000 make yesno
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 278 0 0.778 0 0 0 y
## 2 1028 0.18 0.372 0.43 0.43 0.21 y
## 3 2259 0.184 0.276 0.06 1.16 0.06 y
## 4 191 0 0.137 0 0 0 y
## 5 191 0 0.135 0 0 0 y
## 6 54 0 0 0 0 0 y
## 7 112 0.054 0.164 0 0 0 y
## 8 49 0 0 0 0 0 y
## 9 1257 0.203 0.181 0.15 0 0.15 y
## 10 749 0.081 0.244 0 0.19 0.06 y
## # ℹ 2,972 more rows
spam %>%
filter(money > 0, bang > 0) %>%
ggplot(aes(money, bang, color = yesno)) +
geom_point(alpha = 0.2, size = 1.1)
spam %>%
filter(dollar < 2) %>%
  ggplot(aes(dollar, y = after_stat(density), fill = yesno)) +
geom_histogram(position = "identity", alpha = 0.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
spam_split <- initial_split(spam, strata = yesno)
spam_train <- training(spam_split)
spam_test <- testing(spam_split)
set.seed(234)
spam_folds <- vfold_cv(spam_train, strata = yesno)
spam_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [2012/224]> Fold01
## 2 <split [2012/224]> Fold02
## 3 <split [2012/224]> Fold03
## 4 <split [2012/224]> Fold04
## 5 <split [2012/224]> Fold05
## 6 <split [2012/224]> Fold06
## 7 <split [2013/223]> Fold07
## 8 <split [2013/223]> Fold08
## 9 <split [2013/223]> Fold09
## 10 <split [2013/223]> Fold10
# An alternative recipe considered: effect (likelihood) encoding of a
# predictor against the outcome yesno, via the embed package
# library(embed)
# spam_rec <-
#   recipe(yesno ~ ., data = spam_train) %>%
#   step_lencode_glm(bang, outcome = vars(yesno)) %>%
#   step_dummy(all_nominal_predictors())
# spam_rec
library(themis)
spam_rec <-
  recipe(yesno ~ ., data = spam_train) %>%
  # Assign a catch-all level to factor values unseen at training time
  step_unknown(all_nominal_predictors()) %>%
  # Pool factor levels with frequency below 3% into an "other" level
  step_other(all_nominal_predictors(), threshold = 0.03) %>%
  # Impute missing values of dollar with a linear model
  step_impute_linear(dollar)
# Random forest baseline: 1000 trees, default mtry and min_n
spam_spec <-
  rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")
spam_workflow <-
workflow() %>%
add_recipe(spam_rec) %>%
add_model(spam_spec)
doParallel::registerDoParallel()
set.seed(74403)
# Resample the random forest workflow, saving out-of-fold predictions
ranger_rs <-
  fit_resamples(spam_workflow,
    resamples = spam_folds,
    control = control_resamples(save_pred = TRUE)
  )
# prep(spam_rec) %>%
#   tidy(number = 1) %>%
#   filter(level == "..new")
# Boosted trees, with the main hyperparameters marked for tuning
xgb_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
xgb_wf <- workflow(spam_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()
set.seed(345)
# Tune with ANOVA racing: candidates that perform poorly on early
# resamples are eliminated instead of being fit to every fold
xgb_rs <-
  tune_race_anova(
    xgb_wf,
    resamples = spam_folds,
    grid = 15,
    control = control_race(verbose_elim = TRUE)
  )
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will maximize the roc_auc metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 4 eliminated; 11 candidates remain.
##
## ℹ Fold07: 1 eliminated; 10 candidates remain.
##
## ℹ Fold03: 3 eliminated; 7 candidates remain.
##
## ℹ Fold05: 1 eliminated; 6 candidates remain.
##
## ℹ Fold09: 3 eliminated; 3 candidates remain.
##
## ℹ Fold04: 0 eliminated; 3 candidates remain.
##
## ℹ Fold06: 0 eliminated; 3 candidates remain.
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .order .metrics .notes
## <list> <chr> <int> <list> <list>
## 1 <split [2012/224]> Fold01 2 <tibble [30 × 8]> <tibble [0 × 3]>
## 2 <split [2012/224]> Fold02 3 <tibble [30 × 8]> <tibble [0 × 3]>
## 3 <split [2013/223]> Fold10 1 <tibble [30 × 8]> <tibble [0 × 3]>
## 4 <split [2013/223]> Fold07 4 <tibble [22 × 8]> <tibble [0 × 3]>
## 5 <split [2012/224]> Fold03 5 <tibble [20 × 8]> <tibble [0 × 3]>
## 6 <split [2012/224]> Fold05 6 <tibble [14 × 8]> <tibble [0 × 3]>
## 7 <split [2013/223]> Fold09 7 <tibble [12 × 8]> <tibble [0 × 3]>
## 8 <split [2012/224]> Fold04 8 <tibble [6 × 8]> <tibble [0 × 3]>
## 9 <split [2012/224]> Fold06 9 <tibble [6 × 8]> <tibble [0 × 3]>
## 10 <split [2013/223]> Fold08 10 <tibble [6 × 8]> <tibble [0 × 3]>
plot_race(xgb_rs)
collect_metrics(xgb_rs)
## # A tibble: 6 × 10
## mtry trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 869 29 0.0263 accuracy binary 0.825 10 0.00535 Preproce…
## 2 1 869 29 0.0263 roc_auc binary 0.894 10 0.00539 Preproce…
## 3 3 594 24 0.0139 accuracy binary 0.820 10 0.00639 Preproce…
## 4 3 594 24 0.0139 roc_auc binary 0.896 10 0.00522 Preproce…
## 5 4 935 8 0.00826 accuracy binary 0.827 10 0.00758 Preproce…
## 6 4 935 8 0.00826 roc_auc binary 0.899 10 0.00494 Preproce…
xgb_last <-
xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "accuracy")) %>%
last_fit(spam_split)
collect_metrics(xgb_last)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.834 Preprocessor1_Model1
## 2 roc_auc binary 0.914 Preprocessor1_Model1
collect_predictions(xgb_last) %>%
conf_mat(yesno, .pred_class)
## Truth
## Prediction n y
## n 310 85
## y 39 312
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
xgb_last %>%
extract_fit_engine() %>%
vip()
After cleaning, the dataset consists of 2,982 emails and 7 variables. The outcome, yesno, is categorical and indicates whether an email is spam, with 'y' marking spam. The six numeric predictors measure attributes of each email: crl.tot is the total length of capital-letter sequences, dollar and bang measure the frequency of the '$' and '!' characters, and money, n000, and make measure the frequency of those words and strings. There are no missing values, and duplicates and extreme outliers have been removed, so the data are ready for classifying emails based on these attributes.
All of the numeric predictors are strongly right-skewed: most emails contain few or none of these characters and strings, while a small number contain a great many, which is why the plots above filter or zoom in before visualizing.
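One way to see this skewness directly is to put a heavily skewed predictor such as bang on a log scale. The following is an illustrative sketch, not part of the original analysis; it drops the zero values that a log axis cannot show.
# Illustrative sketch (assumption): histogram of bang on a log10 x axis.
# Zero values are removed first because log10(0) is undefined.
spam %>%
  filter(bang > 0) %>%
  ggplot(aes(bang, fill = yesno)) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 30) +
  scale_x_log10()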
The raw data contained 4,601 emails, including duplicates and a few extreme values. Before modeling, duplicate rows were dropped and outlying values of bang and crl.tot were filtered out, leaving 2,982 emails. These transformations improved data quality for modeling.
On top of that cleaning, the recipe above handles any factor levels unseen at training time, pools rare levels into an "other" category, and imputes dollar with a linear model, so the data that reach the models are structured consistently across resamples; a quick check of its output is sketched below.
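To inspect what the recipe actually produces, one can estimate it on the training set and bake it. This is a minimal sketch using the spam_rec defined above.
# Minimal sketch: estimate the recipe on the training data and return
# the processed training set that the models receive.
spam_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()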
Two machine learning models were used in the analysis: a random forest (via ranger) and gradient-boosted trees (via xgboost).
Both were evaluated with accuracy and roc_auc, the area under the receiver operating characteristic curve; the sketch below shows how these values come from the saved predictions.
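These metrics can be recomputed from the saved test-set predictions with yardstick. This is a sketch; roc_auc() uses the probability of the first factor level, here "n", by default.
# Sketch: recompute the test-set metrics from collected predictions
preds <- collect_predictions(xgb_last)
accuracy(preds, truth = yesno, estimate = .pred_class)
roc_auc(preds, truth = yesno, .pred_n)

# The full ROC curve gives more detail than the single roc_auc number
roc_curve(preds, truth = yesno, .pred_n) %>%
  autoplot()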
Tuned with ANOVA racing and finalized on the training set, the XGBoost model reached an accuracy of about 83.4% and a roc_auc of about 0.914 on the held-out test set, indicating a strong ability to distinguish spam from non-spam emails.
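To score new emails with the finalized model, the fitted workflow can be extracted from the last_fit() result. This is a sketch; new_email is a hypothetical tibble with the same predictor columns, not data from the original post.
# Sketch: extract the fitted workflow and predict a hypothetical email
fitted_wf <- extract_workflow(xgb_last)
new_email <- tibble(
  crl.tot = 120, dollar = 0.05, bang = 0.4,
  money = 0.1, n000 = 0, make = 0
)
predict(fitted_wf, new_email, type = "prob")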