Explore the data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
bigfoot_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): observed, location_details, county, state, season, title, classif...
## dbl  (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bigfoot_raw %>%
  count(classification)
## # A tibble: 3 × 2
##   classification     n
##   <chr>          <int>
## 1 Class A         2481
## 2 Class B         2510
## 3 Class C           30
bigfoot <- 
  bigfoot_raw %>%
  filter(classification != "Class C", !is.na(observed)) %>%
  mutate(classification = case_when(
    classification == "Class A" ~ "sighting",
    classification == "Class B" ~ " posible"
  ))
library(tidytext)
library(tidylo)

bigfoot %>%
  unnest_tokens(word, observed) %>%
  count(classification, word) %>%
  filter(n > 100) %>%
  bind_log_odds(classification, word, n) %>%
  arrange(-log_odds_weighted)
## # A tibble: 1,747 × 4
##    classification word           n log_odds_weighted
##    <chr>          <chr>      <int>             <dbl>
##  1 " posible"     howl         455              14.7
##  2 "sighting"     fur          362              13.3
##  3 " posible"     heard       5397              12.7
##  4 " posible"     screams      327              12.5
##  5 "sighting"     ape          300              12.1
##  6 " posible"     knocks       301              12.0
##  7 "sighting"     hands        285              11.8
##  8 "sighting"     headlights   283              11.7
##  9 " posible"     listened     266              11.2
## 10 "sighting"     witness      249              11.0
## # ℹ 1,737 more rows

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
bigfoot_split <- 
  bigfoot %>%
  select(observed, classification) %>%
  initial_split(strata = classification)

bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)

set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, v = 5, strata = classification)
bigfoot_folds
## #  5-fold cross-validation using stratification 
## # A tibble: 5 × 2
##   splits             id   
##   <list>             <chr>
## 1 <split [2971/743]> Fold1
## 2 <split [2971/743]> Fold2
## 3 <split [2971/743]> Fold3
## 4 <split [2971/743]> Fold4
## 5 <split [2972/742]> Fold5
library(textrecipes)

bigfoot_rec <-
  recipe(classification ~ observed, data = bigfoot_train) %>%
  step_tokenize(observed) %>%
  step_tokenfilter(observed, max_tokens = 2e3) %>%
  step_tfidf(observed)

bigfoot_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 1
## 
## ── Operations
## • Tokenization for: observed
## • Text filtering for: observed
## • Term frequency-inverse document frequency with: observed
glmnet_spec <- 
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

bigfoot_wf <- workflow(bigfoot_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)

bigfoot_res <- 
  tune_grid(
    bigfoot_wf, 
    bigfoot_folds, 
    grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
    )

autoplot(bigfoot_res)

show_best(bigfoot_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
##   penalty .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 0.0158  roc_auc binary     0.855     5 0.00497 Preprocessor1_Model05
## 2 0.00794 roc_auc binary     0.849     5 0.00561 Preprocessor1_Model04
## 3 0.0316  roc_auc binary     0.847     5 0.00585 Preprocessor1_Model06
## 4 0.0631  roc_auc binary     0.826     5 0.00611 Preprocessor1_Model07
## 5 0.00398 roc_auc binary     0.824     5 0.00701 Preprocessor1_Model03
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config             .best .loss
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               <dbl> <dbl>
## 1  0.0316 roc_auc binary     0.847     5 0.00585 Preprocessor1_Mode… 0.855  1.00
bigfoot_final <-
  bigfoot_wf %>%
  finalize_workflow(
    select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
  ) %>%
  last_fit(bigfoot_split)

bigfoot_final
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [3714/1239]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(bigfoot_final)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.764 Preprocessor1_Model1
## 2 roc_auc  binary         0.836 Preprocessor1_Model1
collect_predictions(bigfoot_final) %>%
  conf_mat(classification, .pred_class)
##           Truth
## Prediction  posible sighting
##    posible      491      158
##   sighting      134      456
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
bigfoot_final %>%
  extract_fit_engine() %>%
  vi()
## # A tibble: 2,000 × 3
##    Variable                  Importance Sign 
##    <chr>                          <dbl> <chr>
##  1 tfidf_observed_muscular         860. POS  
##  2 tfidf_observed_hunched          810. POS  
##  3 tfidf_observed_nose             804. POS  
##  4 tfidf_observed_shaggy           779. POS  
##  5 tfidf_observed_guessing         768. POS  
##  6 tfidf_observed_whooping         716. NEG  
##  7 tfidf_observed_especially       682. NEG  
##  8 tfidf_observed_putting          673. POS  
##  9 tfidf_observed_literally        664. NEG  
## 10 tfidf_observed_admit            621. POS  
## # ℹ 1,990 more rows
  1. The research question we are trying to answer is wether we can use the description from a bigfoot sighting to predicts the classification of that sighting. This means if we can lookk at the description and then predict if it was classified as a sighting or a possible sighting.

The original data was made up of 5021 observations and 28 variable with its key characteristics being the “observed” variable which was used to predict the classification.

In the analysis we only used the observed variable and the text in it. This was because a lot of the other variables had a lot of missing data which would have made for a worse outcome if they were taken away or imputed.

  1. The bigfoot_raw dataset contained 28 columns of Bigfoot encounter details. In the transformation, less credible Class C reports were removed, and Class A and B were relabeled as “sighting” and “possible”. Encounter descriptions were broken down into individual words, with their frequencies computed. Using term frequency-inverse document frequency (TF-IDF), the most informative words were highlighted. This streamlined data was designed for clarity and optimized for predictive modeling.

  2. The data underwent several preparation steps: First, it was filtered to exclude Class C reports and relabeled Class A and B encounters. Then, encounter descriptions were tokenized, breaking them down into individual words. Finally, term frequency-inverse document frequency (TF-IDF) was applied to identify and highlight the most informative words for predictive modeling.

The machine learning model used in the analysis is the logistic regression model, specifically implemented with the glmnet engine.

  1. The model evaluation employed two metrics: accuracy, which measures the proportion of correct predictions, and roc_auc, which assesses the model’s ability to distinguish between classes based on the area under the receiver operating characteristic curve.

  2. The analysis of the Bigfoot encounter data revealed a preference in the terms used for different types of encounters. Words like “howl”, “heard”, and “screams” were associated with “possible” encounters, while terms such as “fur”, “ape”, and “hands” were linked to confirmed “sightings”. After building a logistic regression model, it achieved an accuracy of 76.4% and an ROC_AUC of 0.836 on the test data. Important terms influencing predictions included “muscular”, “hunched”, and “nose” for sightings, while words like “whooping” and “especially” had negative associations.