Explore data

library(tidyverse)
bigfoot_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')

bigfoot_raw %>%
  count(classification)
## # A tibble: 3 × 2
##   classification     n
##   <chr>          <int>
## 1 Class A         2481
## 2 Class B         2510
## 3 Class C           30
bigfoot <-
  bigfoot_raw %>%
  filter(classification != "Class C", !is.na(observed)) %>%
  mutate(classification = case_when(
    classification == "Class A" ~ "sighting",
    classification == "Class B" ~ "possible"
  ))
library(tidytext)
library(tidylo)

bigfoot %>%
  unnest_tokens(word, observed) %>%
  count(classification, word) %>%
  filter(n > 100) %>%
  bind_log_odds(classification, word, n) %>%
  arrange(-log_odds_weighted)
## # A tibble: 1,747 × 4
##    classification word           n log_odds_weighted
##    <chr>          <chr>      <int>             <dbl>
##  1 possible       howl         455              14.7
##  2 sighting       fur          362              13.3
##  3 possible       heard       5397              12.7
##  4 possible       screams      327              12.5
##  5 sighting       ape          300              12.1
##  6 possible       knocks       301              12.0
##  7 sighting       hands        285              11.8
##  8 sighting       headlights   283              11.7
##  9 possible       listened     266              11.2
## 10 sighting       witness      249              11.0
## # ℹ 1,737 more rows

Build a model

library(tidymodels)

set.seed(123)
bigfoot_split <-
  bigfoot %>%
  select(observed, classification) %>%
  initial_split(strata = classification)

bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)

set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, v = 5, strata = classification)
bigfoot_folds
## #  5-fold cross-validation using stratification 
## # A tibble: 5 × 2
##   splits             id   
##   <list>             <chr>
## 1 <split [2971/743]> Fold1
## 2 <split [2971/743]> Fold2
## 3 <split [2971/743]> Fold3
## 4 <split [2971/743]> Fold4
## 5 <split [2972/742]> Fold5
library(textrecipes)

bigfoot_rec <-
  recipe(classification ~ observed, data = bigfoot_train) %>%
  step_tokenize(observed) %>%
  step_tokenfilter(observed, max_tokens = 2e3) %>%
  step_tfidf(observed)

bigfoot_rec
glmnet_spec <-
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

bigfoot_wf <- workflow(bigfoot_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)

bigfoot_res <-
  tune_grid(
    bigfoot_wf,
    bigfoot_folds,
    grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
  )

autoplot(bigfoot_res)

show_best(bigfoot_res)
## # A tibble: 5 × 7
##   penalty .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 0.0158  roc_auc binary     0.855     5 0.00497 Preprocessor1_Model05
## 2 0.00794 roc_auc binary     0.849     5 0.00561 Preprocessor1_Model04
## 3 0.0316  roc_auc binary     0.847     5 0.00585 Preprocessor1_Model06
## 4 0.0631  roc_auc binary     0.826     5 0.00611 Preprocessor1_Model07
## 5 0.00398 roc_auc binary     0.824     5 0.00701 Preprocessor1_Model03
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config             .best .loss
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               <dbl> <dbl>
## 1  0.0316 roc_auc binary     0.847     5 0.00585 Preprocessor1_Mode… 0.855  1.00
bigfoot_final <-
  bigfoot_wf %>%
  finalize_workflow(
    select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
  ) %>%
  last_fit(bigfoot_split)

bigfoot_final
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [3714/1239]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(bigfoot_final)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.764 Preprocessor1_Model1
## 2 roc_auc  binary         0.836 Preprocessor1_Model1
collect_predictions(bigfoot_final) %>%
  conf_mat(classification, .pred_class)
##           Truth
## Prediction possible sighting
##   possible      491      158
##   sighting      134      456
library(vip)

bigfoot_final %>%
  extract_fit_engine() %>%
  vi()
## # A tibble: 2,000 × 3
##    Variable                  Importance Sign 
##    <chr>                          <dbl> <chr>
##  1 tfidf_observed_muscular         860. POS  
##  2 tfidf_observed_hunched          810. POS  
##  3 tfidf_observed_nose             804. POS  
##  4 tfidf_observed_shaggy           779. POS  
##  5 tfidf_observed_guessing         768. POS  
##  6 tfidf_observed_whooping         716. NEG  
##  7 tfidf_observed_especially       682. NEG  
##  8 tfidf_observed_putting          673. POS  
##  9 tfidf_observed_literally        664. NEG  
## 10 tfidf_observed_admit            621. POS  
## # ℹ 1,990 more rows

Deploy the model

library(vetiver)

v <-
  bigfoot_final %>%
  extract_workflow() %>%
  vetiver_model("bigfoot")

v
## 
## ── bigfoot ─ <bundled_workflow> model for deployment 
## A glmnet classification modeling workflow using 1 feature
augment(v, slice_sample(bigfoot_test, n = 10))
## # A tibble: 10 × 5
##    observed             classification .pred_class .pred_possible .pred_sighting
##    <chr>                <chr>          <fct>                <dbl>          <dbl>
##  1 "My husband, myself… possible       possible            0.776           0.224
##  2 "We live about a mi… sighting       sighting            0.211           0.789
##  3 "Hello my name  is … sighting       sighting            0.198           0.802
##  4 "On November 7th an… possible       possible            0.592           0.408
##  5 "I am a tree trimme… possible       possible            0.518           0.482
##  6 "My wife and I were… sighting       sighting            0.0785          0.922
##  7 "Bigfoot sighting. … sighting       sighting            0.407           0.593
##  8 "Preface:  The foll… possible       possible            0.551           0.449
##  9 "Found a fresh foot… possible       possible            0.631           0.369
## 10 "A neighbor's chick… possible       sighting            0.382           0.618