Import data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Download the TidyTuesday (2023-08-15) spam data set directly from GitHub.
spam <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv"
)
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Clean Data
# Print a per-column overview (type, missing count, mean/sd, quantiles, mini
# histogram) to get a first impression of the raw data.
spam %>% skimr::skim()
Data summary
| Name |
Piped data |
| Number of rows |
4601 |
| Number of columns |
7 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| numeric |
6 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
Variable type: numeric
| crl.tot |
0 |
1 |
283.29 |
606.35 |
1 |
35 |
95 |
266.00 |
15841.00 |
▇▁▁▁▁ |
| dollar |
0 |
1 |
0.08 |
0.25 |
0 |
0 |
0 |
0.05 |
6.00 |
▇▁▁▁▁ |
| bang |
0 |
1 |
0.27 |
0.82 |
0 |
0 |
0 |
0.32 |
32.48 |
▇▁▁▁▁ |
| money |
0 |
1 |
0.09 |
0.44 |
0 |
0 |
0 |
0.00 |
12.50 |
▇▁▁▁▁ |
| n000 |
0 |
1 |
0.10 |
0.35 |
0 |
0 |
0 |
0.00 |
5.45 |
▇▁▁▁▁ |
| make |
0 |
1 |
0.10 |
0.31 |
0 |
0 |
0 |
0.00 |
4.54 |
▇▁▁▁▁ |
# Count the missing values in each column.
colSums(is.na(spam))
## crl.tot dollar bang money n000 make yesno
## 0 0 0 0 0 0 0
# Keep only the first occurrence of each fully duplicated row.
cleaned_spam <- spam[!duplicated(spam), ]
# Drop rows with extreme values.
# NOTE(review): the skim summary of `spam` reports a maximum of 32.48 for
# `bang`, so `bang < 2250` cannot remove any row; only `crl.tot < 2250`
# (maximum 15841) trims the data. Confirm whether a tighter cutoff for
# `bang` was intended.
cleaned_spam <- cleaned_spam %>%
filter(bang < 2250, crl.tot < 2250)
Explore data
# Scatter plot of `bang` against `money`, coloured by class, using only the
# emails in which both features are strictly positive.
ggplot(
  data = filter(cleaned_spam, money > 0 & bang > 0),
  mapping = aes(x = money, y = bang, colour = yesno)
) +
  geom_point(size = 1.8, alpha = 0.5)

# Density-scaled histogram of `dollar` per class, restricted to dollar < 2.
cleaned_spam %>%
  filter(dollar < 2) %>%
  # after_stat(density) replaces the `..density..` dot-dot notation, which
  # has been deprecated since ggplot2 3.4.0.
  ggplot(aes(dollar, y = after_stat(density), fill = yesno)) +
  # position = "identity" overlays the two classes instead of stacking them.
  # bins = 30 spells out the stat_bin() default so that no "Pick better
  # value" message is emitted; the plot itself is unchanged.
  geom_histogram(position = "identity", alpha = 0.8, bins = 30)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of `bang` (bin width 0.5) with the bar outlines coloured by class.
cleaned_spam %>%
  filter(bang < 2250) %>%
  ggplot(mapping = aes(x = bang, colour = yesno)) +
  geom_histogram(binwidth = 0.5) +
  labs(title = "Distribution of 'bang' (Values < 2250)")

# Histogram of `crl.tot` with the bar outlines coloured by class.
# NOTE(review): binwidth = 0.5 over a range that extends to 2250 produces
# thousands of very thin bins; consider a wider bin width.
cleaned_spam %>%
  filter(crl.tot < 2250) %>%
  ggplot(aes(x = crl.tot, color = yesno)) +
  geom_histogram(binwidth = 0.5) +
  # The title now states the cutoff that the filter actually applies (2250);
  # it previously said 2500.
  ggtitle("Distribution of 'crl.tot' (Values < 2250)")

# Box plot comparing the distribution of `bang` between the two classes.
cleaned_spam %>%
  ggplot(aes(yesno, bang, colour = yesno)) +
  geom_boxplot() +
  labs(title = "'bang' by 'yesno'")

# Box plot comparing the distribution of `crl.tot` between the two classes.
cleaned_spam %>%
  ggplot(aes(yesno, crl.tot, colour = yesno)) +
  geom_boxplot() +
  labs(title = "'crl.tot' by 'yesno'")

Building a model
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
# Train/test split, stratified on the outcome so both parts keep the same
# class balance. prop = 3/4 is the initial_split() default, written out.
set.seed(123)
spam_split <- cleaned_spam %>%
  initial_split(prop = 3 / 4, strata = yesno)
spam_train <- spam_split %>% training()
spam_test <- spam_split %>% testing()

# Stratified cross-validation folds built from the training set only.
# v = 10 is the vfold_cv() default, written out.
set.seed(234)
spam_folds <- spam_train %>%
  vfold_cv(v = 10, strata = yesno)
spam_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [2000/223]> Fold01
## 2 <split [2000/223]> Fold02
## 3 <split [2000/223]> Fold03
## 4 <split [2000/223]> Fold04
## 5 <split [2001/222]> Fold05
## 6 <split [2001/222]> Fold06
## 7 <split [2001/222]> Fold07
## 8 <split [2001/222]> Fold08
## 9 <split [2001/222]> Fold09
## 10 <split [2002/221]> Fold10
# Print template code (recipe, model spec, workflow, tuning call) for a ranger
# random forest; nothing is assigned here, the output is only a starting point.
usemodels::use_ranger(yesno ~ ., data = spam_train)
## ranger_recipe <-
## recipe(formula = yesno ~ ., data = spam_train)
##
## ranger_spec <-
## rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
## set_mode("classification") %>%
## set_engine("ranger")
##
## ranger_workflow <-
## workflow() %>%
## add_recipe(ranger_recipe) %>%
## add_model(ranger_spec)
##
## set.seed(36881)
## ranger_tune <-
## tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(themis)
# Recipe: predict `yesno` from every other column of the training data.
# The former step_unknown() / step_other() / step_impute_linear() steps were
# removed because they could not do anything on this data: every predictor is
# numeric (so all_nominal_predictors() selects nothing) and no column contains
# missing values (so there is nothing to impute).
ranger_recipe <-
  recipe(formula = yesno ~ ., data = spam_train)

# Random forest classifier with 1000 trees, fitted by the ranger engine.
ranger_spec <-
  rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger")

# Bundle preprocessing and model so they are always fitted together.
ranger_workflow <-
  workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(ranger_spec)

# Register a parallel backend before resampling.
doParallel::registerDoParallel()

# Fit the workflow on every cross-validation fold; the held-out predictions
# are saved because the ROC curves and confusion matrix need them later.
set.seed(74403)
ranger_rs <-
  fit_resamples(
    ranger_workflow,
    resamples = spam_folds,
    control = control_resamples(save_pred = TRUE)
  )
Explore the results
# Average the performance metrics (accuracy, ROC AUC) over the resamples.
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.826 10 0.00975 Preprocessor1_Model1
## 2 roc_auc binary 0.901 10 0.00963 Preprocessor1_Model1
# One ROC curve per cross-validation fold, built from the saved held-out
# predictions; `.pred_n` is the predicted probability of class "n".
ranger_rs %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(truth = yesno, .pred_n) %>%
  autoplot()

# Confusion matrix averaged over the resamples, drawn with its default plot.
autoplot(conf_mat_resampled(ranger_rs, tidy = FALSE))

# Fit the workflow once on the full training set and evaluate it on the test
# set. A random forest is stochastic, so the seed is fixed to make the reported
# test metrics reproducible from run to run.
set.seed(345)
final_fitted <- last_fit(ranger_workflow, spam_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.841 Preprocessor1_Model1
## 2 roc_auc binary 0.911 Preprocessor1_Model1
library(tidymodels)
library(ggplot2)
# Confusion matrix of the test-set predictions, drawn as a heatmap.
# The extra scale_fill_gradient() replaces the fill scale that autoplot()
# already adds, which is why ggplot2 reports "Scale for fill is already present".
final_fitted %>%
  collect_predictions() %>%
  conf_mat(truth = yesno, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  scale_fill_gradient(high = "darkblue", low = "lightblue") +
  theme_minimal()
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(tidymodels)
library(ggplot2)
# Refit a random forest with permutation importance switched on so that vip()
# can rank the predictors.
# trees = 1000 matches the model that was resampled and evaluated above; the
# bare rand_forest() call used before fell back to the engine's default number
# of trees, so the importance plot described a different model.
ranger_spec <- rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "permutation")

# Apply the recipe to the training data it was defined on.
imp_data <- ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL)

# Both the forest and the permutation importance are random, so fix the seed
# to make the ranking reproducible.
set.seed(456)
ranger_fit <- ranger_spec %>%
  fit(yesno ~ ., data = imp_data)

# Point plot of the importance scores, with blue points drawn on top.
spam_plot <- vip(ranger_fit, geom = "point") +
  geom_point(color = "blue") +
  theme_minimal()
print(spam_plot)

- Question and Data:
- What is the research question? Clearly state the research question
you aim to address using the new dataset. Can I predict whether an email
is spam depending on how many special characters appear in the
email?
- Describe the data briefly: Provide an overview of the new dataset,
highlighting its key characteristics and dimensions. The original spam
data set contains 4,601 entries with 7 columns; about 2,966 entries
remain after duplicates and outliers are removed. The six numeric columns
describe features of each email, such as how often certain special
characters or words appear, and the remaining column records whether the
email is spam. The
target variable is yesno, which indicates if an email is spam or not.
Yesno is character data. Some of the other columns are dollar, bang,
money, and crl.tot.
- What are the characteristics of the key variables used in the
analysis? Describe the primary variables of interest in the dataset and
their characteristics. The primary variables of interest are crl.tot,
dollar, and bang. All three are numerical data, and they track the number
of times a special character or phrase appears in an email. The data set
was cleaned for duplicates as well as outliers.
- Data Exploration and Transformation:
- Describe the differences between the original data and the data
transformed for modeling. Why? Explain any preprocessing or
transformations performed on the new dataset compared to the original
data. Discuss why these changes were necessary or beneficial. The
original spam data set was transformed in a few different ways. First, I
used “colSums(is.na(spam))” to check for any NA values in the data set.
Next, I used “cleaned_spam <- spam[!duplicated(spam), ]” to remove
duplicate rows. This way every data entry was unique. After that, I used
“cleaned_spam <- cleaned_spam %>% filter(bang < 2250, crl.tot
< 2250)” to remove outliers in the bang and crl.tot variables. The
outliers can skew the model training which can affect the prediction
accuracy. These transformations were important to enhance the model’s
accuracy. By removing outliers and duplicates I helped to prevent any
skewed data from creating bias in my data set.
- Data Preparation and Modeling:
- What are the names of data preparation steps mentioned in the video?
List and describe any data preparation steps or techniques mentioned in
the CA video that you applied to the new data set. The mutate and
case_when function, mutate_if, step_impute_linear, and step_downsample
were all used in the video. I did not use any of these preparation steps
because I did not feel that they applied to my data set and I got errors
when trying to use them. Instead, I used different data preparation steps
to prepare my data set, which I described in the question above.
- What is the name of the machine learning model(s) used in the
analysis? Specify the machine learning model(s) you employed for your
analysis and briefly explain their relevance to the research question.
The machine learning model used in the analysis is Random Forest. Random
forest is used here for a classification task, which is what the research
question requires for this data set. The ranger engine was used
to predict if an email is spam based on the number of times special
characters appear in the email. Random forests can handle linear and
nonlinear relationships in data. They also can find complex patterns in a
large data set, which is helpful in this code by finding nonlinear
relationships between the special characters in connection with if it is
spam or not.
- Model Evaluation:
- What metrics are used in the model evaluation? Detail the evaluation
metrics you used to assess the performance of your machine learning
model(s) on the new dataset. Discuss the significance of these metrics
in the context of your research question. The evaluation metrics used to
assess the performance of the machine learning model on the spam data
set are accuracy and ROC AUC, together with the ROC curve. The ROC curve
shows how well the model distinguishes between yes and no, which
helps understand the model’s discriminatory power. I also used the
confusion matrix to examine the model’s performance, specifically in
terms of its ability to correctly predict positive and negative cases.
This was helpful for understanding the accuracy of the model’s
predictions on whether an email is spam or not based on special
characters and keyword appearances in an email.
- Conclusion:
- What are the major findings? Summarize the key findings and insights
obtained from your analysis of the new dataset. Relate these findings
back to the research question and any similarities or differences
compared to the CA assignment. The major findings are that the model has
a rather strong accuracy of predicting spam email. The Random Forest
model had an accuracy of about 84.1% on the held-out test set, indicating
that it correctly identified emails as spam or not spam in the majority
of cases. The ROC AUC value was approximately 0.911, which shows a high
ability in the model to distinguish between spam and non-spam emails. By
using the VIP, I can see that certain key words and special characters
within emails are significant predictors for determining if an email is
spam. By using scatterplots and histograms I was able to see
relationships between the frequency of terms like ‘money’ and ‘bang’, as
well as the distribution of terms like ‘dollar’ and ‘bang’ in relation
to spam classification. In summary, the Random Forest model is able to
classify emails using the features in the data set, with specific terms
and characters in the emails being especially likely to mean it is
spam.