Explore data
library(tidyverse)
bigfoot_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')
bigfoot_raw %>%
count(classification)
## # A tibble: 3 × 2
## classification n
## <chr> <int>
## 1 Class A 2481
## 2 Class B 2510
## 3 Class C 30
bigfoot <-
bigfoot_raw %>%
filter(classification != "Class C", !is.na(observed)) %>%
mutate(classification = case_when(
classification == "Class A" ~ "sighting",
classification == "Class B" ~ "possible"
))
library(tidytext)
library(tidylo)
bigfoot %>%
unnest_tokens(word, observed) %>%
count(classification, word) %>%
filter(n > 100) %>%
bind_log_odds(classification, word, n) %>%
arrange(-log_odds_weighted)
## # A tibble: 1,747 × 4
## classification word n log_odds_weighted
## <chr> <chr> <int> <dbl>
## 1 possible howl 455 14.7
## 2 sighting fur 362 13.3
## 3 possible heard 5397 12.7
## 4 possible screams 327 12.5
## 5 sighting ape 300 12.1
## 6 possible knocks 301 12.0
## 7 sighting hands 285 11.8
## 8 sighting headlights 283 11.7
## 9 possible listened 266 11.2
## 10 sighting witness 249 11.0
## # ℹ 1,737 more rows
Build a model
library(tidymodels)
set.seed(123)
bigfoot_split <-
bigfoot %>%
select(observed, classification) %>%
initial_split(strata = classification)
bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)
set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, v = 5, strata = classification)
bigfoot_folds
## # 5-fold cross-validation using stratification
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [2971/743]> Fold1
## 2 <split [2971/743]> Fold2
## 3 <split [2971/743]> Fold3
## 4 <split [2971/743]> Fold4
## 5 <split [2972/742]> Fold5
library(textrecipes)
bigfoot_rec <-
recipe(classification ~ observed, data = bigfoot_train) %>%
step_tokenize(observed) %>%
step_tokenfilter(observed, max_tokens = 2e3) %>%
step_tfidf(observed)
bigfoot_rec
glmnet_spec <-
logistic_reg(mixture = 1, penalty = tune()) %>%
set_engine("glmnet")
bigfoot_wf <- workflow(bigfoot_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
bigfoot_res <-
tune_grid(
bigfoot_wf,
bigfoot_folds,
grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
)
autoplot(bigfoot_res)

show_best(bigfoot_res)
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0158 roc_auc binary 0.855 5 0.00497 Preprocessor1_Model05
## 2 0.00794 roc_auc binary 0.849 5 0.00561 Preprocessor1_Model04
## 3 0.0316 roc_auc binary 0.847 5 0.00585 Preprocessor1_Model06
## 4 0.0631 roc_auc binary 0.826 5 0.00611 Preprocessor1_Model07
## 5 0.00398 roc_auc binary 0.824 5 0.00701 Preprocessor1_Model03
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .loss
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.0316 roc_auc binary 0.847 5 0.00585 Preprocessor1_Mode… 0.855 1.00
bigfoot_final <-
bigfoot_wf %>%
finalize_workflow(
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
) %>%
last_fit(bigfoot_split)
bigfoot_final
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [3714/1239]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(bigfoot_final)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.764 Preprocessor1_Model1
## 2 roc_auc binary 0.836 Preprocessor1_Model1
collect_predictions(bigfoot_final) %>%
conf_mat(classification, .pred_class)
## Truth
## Prediction possible sighting
## possible 491 158
## sighting 134 456
library(vip)
bigfoot_final %>%
extract_fit_engine() %>%
vi()
## # A tibble: 2,000 × 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 tfidf_observed_muscular 860. POS
## 2 tfidf_observed_hunched 810. POS
## 3 tfidf_observed_nose 804. POS
## 4 tfidf_observed_shaggy 779. POS
## 5 tfidf_observed_guessing 768. POS
## 6 tfidf_observed_whooping 716. NEG
## 7 tfidf_observed_especially 682. NEG
## 8 tfidf_observed_putting 673. POS
## 9 tfidf_observed_literally 664. NEG
## 10 tfidf_observed_admit 621. POS
## # ℹ 1,990 more rows
Deploy the model
library(vetiver)
v <-
bigfoot_final %>%
extract_workflow() %>%
vetiver_model("bigfoot")
v
##
## ── bigfoot ─ <bundled_workflow> model for deployment
## A glmnet classification modeling workflow using 1 feature
augment(v, slice_sample(bigfoot_test, n = 10))
## # A tibble: 10 × 5
## observed classification .pred_class .pred_possible .pred_sighting
## <chr> <chr> <fct> <dbl> <dbl>
## 1 "My husband, myself… possible possible 0.776 0.224
## 2 "We live about a mi… sighting sighting 0.211 0.789
## 3 "Hello my name is … sighting sighting 0.198 0.802
## 4 "On November 7th an… possible possible 0.592 0.408
## 5 "I am a tree trimme… possible possible 0.518 0.482
## 6 "My wife and I were… sighting sighting 0.0785 0.922
## 7 "Bigfoot sighting. … sighting sighting 0.407 0.593
## 8 "Preface: The foll… possible possible 0.551 0.449
## 9 "Found a fresh foot… possible possible 0.631 0.369
## 10 "A neighbor's chick… possible sighting 0.382 0.618