Import data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the TidyTuesday (2023-08-15) spam data set directly from GitHub.
spam_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv"
spam <- readr::read_csv(file = spam_url)
## Rows: 4601 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): yesno
## dbl (6): crl.tot, dollar, bang, money, n000, make
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean Data

# Print a skimr summary of every column (type, missingness, distribution).
spam %>% skimr::skim()
Data summary
Name Piped data
Number of rows 4601
Number of columns 7
_______________________
Column type frequency:
character 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
yesno 0 1 1 1 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
crl.tot 0 1 283.29 606.35 1 35 95 266.00 15841.00 ▇▁▁▁▁
dollar 0 1 0.08 0.25 0 0 0 0.05 6.00 ▇▁▁▁▁
bang 0 1 0.27 0.82 0 0 0 0.32 32.48 ▇▁▁▁▁
money 0 1 0.09 0.44 0 0 0 0.00 12.50 ▇▁▁▁▁
n000 0 1 0.10 0.35 0 0 0 0.00 5.45 ▇▁▁▁▁
make 0 1 0.10 0.31 0 0 0 0.00 4.54 ▇▁▁▁▁
# Count the missing values in each column.
colSums(is.na(spam))
## crl.tot  dollar    bang   money    n000    make   yesno 
##       0       0       0       0       0       0       0
# Drop exact duplicate rows so that every observation is unique.
cleaned_spam <- spam[!duplicated(spam), ]

# Trim extreme values before modelling.
# NOTE(review): the skim summary above reports a maximum `bang` of 32.48, so
# `bang < 2250` removes nothing; only the `crl.tot` cut-off drops rows.
# Confirm the intended `bang` threshold.
cleaned_spam <- cleaned_spam %>%
  filter(bang < 2250, crl.tot < 2250)

Explore data

# Scatter plot of `money` against `bang`, restricted to emails where both are
# non-zero, with points coloured by class.
money_bang <- filter(cleaned_spam, money > 0, bang > 0)
ggplot(money_bang, aes(x = money, y = bang, colour = yesno)) +
  geom_point(size = 1.8, alpha = 0.5)

# Overlaid density histograms of `dollar` (values below 2), one per class.
# `after_stat(density)` replaces the `..density..` notation, which was
# deprecated in ggplot2 3.4.0.
cleaned_spam %>%
  filter(dollar < 2) %>%
  ggplot(aes(dollar, y = after_stat(density), fill = yesno)) +
  geom_histogram(position = "identity", alpha = 0.8)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of `bang`, with bar outlines coloured by class.
bang_subset <- filter(cleaned_spam, bang < 2250)
ggplot(bang_subset, aes(x = bang, colour = yesno)) +
  geom_histogram(binwidth = 0.5) +
  labs(title = "Distribution of 'bang' (Values < 2250)")

# Histogram of `crl.tot`, with bar outlines coloured by class.
# The title states the same cut-off (2250) that the filter applies; it
# previously said 2500.
cleaned_spam %>%
  filter(crl.tot < 2250) %>%
  ggplot(aes(x = crl.tot, color = yesno)) +
  geom_histogram(binwidth = 0.5) +
  ggtitle("Distribution of 'crl.tot' (Values < 2250)")

# Box plot comparing `bang` between the two classes.
cleaned_spam %>%
  ggplot(aes(x = yesno, y = bang, colour = yesno)) +
  geom_boxplot() +
  labs(title = "'bang' by 'yesno'")

# Box plot comparing `crl.tot` between the two classes.
cleaned_spam %>%
  ggplot(aes(x = yesno, y = crl.tot, colour = yesno)) +
  geom_boxplot() +
  labs(title = "'crl.tot' by 'yesno'")

Building a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
# Hold out a test set, stratified on the outcome so both partitions keep the
# same spam / non-spam balance.
set.seed(123)
spam_split <- cleaned_spam %>% initial_split(strata = yesno)
spam_test <- testing(spam_split)
spam_train <- training(spam_split)

# Stratified cross-validation folds, built from the training data only.
set.seed(234)
spam_folds <- spam_train %>% vfold_cv(strata = yesno)
spam_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [2000/223]> Fold01
##  2 <split [2000/223]> Fold02
##  3 <split [2000/223]> Fold03
##  4 <split [2000/223]> Fold04
##  5 <split [2001/222]> Fold05
##  6 <split [2001/222]> Fold06
##  7 <split [2001/222]> Fold07
##  8 <split [2001/222]> Fold08
##  9 <split [2001/222]> Fold09
## 10 <split [2002/221]> Fold10
# Print a ranger (random forest) code template to adapt for this data set.
usemodels::use_ranger(formula = yesno ~ ., data = spam_train)
## ranger_recipe <- 
##   recipe(formula = yesno ~ ., data = spam_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("classification") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(36881)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(themis)
# Preprocessing recipe: predict `yesno` from every other column.
# NOTE(review): the import summary lists all six predictors as numeric and the
# NA check found no missing values, so these three steps appear to leave the
# data unchanged. Confirm whether they are still wanted.
ranger_recipe <- recipe(yesno ~ ., data = spam_train)
ranger_recipe <- step_unknown(ranger_recipe, all_nominal_predictors())
ranger_recipe <- step_other(
  ranger_recipe,
  all_nominal_predictors(),
  threshold = 0.03
)
ranger_recipe <- step_impute_linear(ranger_recipe, dollar)

# Random forest classifier with 1000 trees, fitted through the ranger engine.
ranger_spec <- rand_forest(
  mode = "classification",
  engine = "ranger",
  trees = 1000
)

# Bundle the preprocessing recipe and the model specification into one workflow.
ranger_workflow <- workflow()
ranger_workflow <- add_recipe(ranger_workflow, ranger_recipe)
ranger_workflow <- add_model(ranger_workflow, ranger_spec)

# Register a parallel backend, then fit the workflow on every fold, keeping the
# out-of-fold predictions for later inspection.
doParallel::registerDoParallel()
resample_ctrl <- control_resamples(save_pred = TRUE)
set.seed(74403)
ranger_rs <- ranger_workflow %>%
  fit_resamples(resamples = spam_folds, control = resample_ctrl)

Explore the results

# Cross-validated performance, averaged over the folds.
ranger_rs %>% collect_metrics()
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.826    10 0.00975 Preprocessor1_Model1
## 2 roc_auc  binary     0.901    10 0.00963 Preprocessor1_Model1
# One ROC curve per cross-validation fold, built from the saved predictions.
cv_predictions <- collect_predictions(ranger_rs)
fold_roc <- roc_curve(group_by(cv_predictions, id), yesno, .pred_n)
autoplot(fold_roc)

# Confusion matrix computed across the resamples.
cv_conf_mat <- conf_mat_resampled(ranger_rs, tidy = FALSE)
autoplot(cv_conf_mat)

# Fit once on the full training set and evaluate on the held-out test set.
final_fitted <- ranger_workflow %>% last_fit(spam_split)
final_fitted %>% collect_metrics()
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.841 Preprocessor1_Model1
## 2 roc_auc  binary         0.911 Preprocessor1_Model1
library(tidymodels)
library(ggplot2)

# Test-set confusion matrix drawn as a heat map with a blue gradient.
test_conf_mat <- final_fitted %>%
  collect_predictions() %>%
  conf_mat(truth = yesno, estimate = .pred_class)
autoplot(test_conf_mat, type = "heatmap") +
  scale_fill_gradient(high = "darkblue", low = "lightblue") +
  theme_minimal()
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(tidymodels)
library(ggplot2)

# Variable importance: refit the forest on the preprocessed training data with
# permutation importance switched on in the ranger engine.
# `trees = 1000` keeps this fit consistent with the specification that was
# resampled and evaluated earlier in the script.
ranger_spec <- rand_forest(trees = 1000) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "permutation")

# Apply the recipe to the training data it was defined on.
imp_data <- ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL)

# Seed the fit so the importance ranking is reproducible, as is done for the
# other random steps in this script.
set.seed(345)
ranger_fit <- ranger_spec %>%
  fit(yesno ~ ., data = imp_data)

# Colour the points through `aesthetics` rather than stacking a second
# geom_point() layer on top of the one vip() already draws.
spam_plot <- vip(
  ranger_fit,
  geom = "point",
  aesthetics = list(color = "blue")
) +
  theme_minimal()

print(spam_plot)

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. Can I predict whether an email is spam depending on how many special characters appear in the email?
    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The raw spam data set contains 4,601 entries with 7 columns; after removing duplicates and outliers, 2,966 entries remain. Six numeric columns record how often certain special characters or words appear in an email, and the seventh column records whether the email is spam. The target variable is yesno, which indicates whether an email is spam or not; it is character data. Some of the other columns are dollar, bang, money, and crl.tot.
    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The primary variables of interest are crl.tot, dollar, and bang, all of which are numerical data. These three variables track how often a special character or phrase appears in an email. The data set was cleaned for duplicates as well as outliers.
  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The original spam data set was transformed in a few different ways. First, I used “colSums(is.na(spam))” to check for any NA values in the data set; there were none. Next, I used “cleaned_spam <- spam[!duplicated(spam), ]” to remove duplicate rows, so that every data entry was unique. After that, I used “cleaned_spam <- cleaned_spam %>% filter(bang < 2250, crl.tot < 2250)” to remove outliers in the bang and crl.tot variables. Outliers can skew model training, which can affect prediction accuracy. These transformations were important for enhancing the model’s accuracy: by removing outliers and duplicates, I helped prevent skewed data from creating bias in my data set.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new data set. The mutate and case_when functions, mutate_if, step_impute_linear, and step_downsample were all used in the video. Of these, only step_impute_linear remains in my recipe, and it has no effect here because the data set has no missing values. I did not use the others because they did not apply to my data set and I got errors when trying to use them, so I used different data preparation steps to prepare my data set, which I mentioned in the question above.
    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model used in the analysis is Random Forest. Random forests can be used for both classification and regression tasks; the research question for this data set is a classification task (spam or not spam), so the model was run in classification mode. The ranger engine was used to predict whether an email is spam based on the number of times special characters appear in the email. Random forests can handle linear and nonlinear relationships in data. They can also find complex patterns in a large data set, which is helpful here for finding nonlinear relationships between the special characters and whether an email is spam or not.
  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. The evaluation metrics used to assess the performance of the machine learning model on the spam data set are accuracy and ROC AUC, supported by ROC curves and a confusion matrix. The ROC curve shows how well the model distinguishes between yes and no, which helps in understanding the model’s discriminatory power. I also used the confusion matrix to examine performance, specifically the model’s ability to correctly predict positive and negative cases. This was insightful for understanding the accuracy of the model’s predictions on whether an email is spam or not based on special characters and key word appearances in an email.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. Relate these findings back to the research question and any similarities or differences compared to the CA assignment. The major findings are that the model has rather strong accuracy in predicting spam email. The Random Forest model had an accuracy of about 84.1% on the held-out test set (about 82.6% across the cross-validation folds), indicating that it correctly identified emails as spam or not spam in the majority of cases. The ROC_AUC value was approximately 0.911 on the test set, which shows a high ability of the model to distinguish between spam and non-spam emails. By using the VIP, I can see that certain key words and special characters within emails are significant predictors for determining if an email is spam. By using scatterplots and histograms I was able to see relationships between the frequency of terms like ‘money’ and ‘bang’, as well as the distribution of terms like ‘dollar’ and ‘bang’ in relation to spam classification. In summary, the Random Forest model is able to classify emails using the features in the data set, with specific terms and characters in the emails being especially likely to mean it is spam.