library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
bigfoot_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): observed, location_details, county, state, season, title, classif...
## dbl  (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bigfoot_raw %>%
  count(classification)
## # A tibble: 3 × 2
##   classification     n
##   <chr>          <int>
## 1 Class A         2481
## 2 Class B         2510
## 3 Class C           30
bigfoot <-
  bigfoot_raw %>%
  filter(classification != "Class C", !is.na(observed)) %>%
  mutate(
    classification = case_when(
      classification == "Class A" ~ "sighting",
      classification == "Class B" ~ "possible"
    )
  )

bigfoot
## # A tibble: 4,953 × 28
##    observed        location_details county state season title latitude longitude
##    <chr>           <chr>            <chr>  <chr> <chr>  <chr>    <dbl>     <dbl>
##  1 "I was canoein…  <NA>            Winst… Alab… Summer <NA>      NA        NA  
##  2 "Ed L. was sal… "East side of P… Valde… Alas… Fall   <NA>      NA        NA  
##  3 "While attendi… "Great swamp ar… Washi… Rhod… Fall   Repo…     41.4     -71.5
##  4 "Hello, My nam… "I would rather… York … Penn… Summer <NA>      NA        NA  
##  5 "It was May 19… "Logging roads … Yamhi… Oreg… Spring <NA>      NA        NA  
##  6 "My two childr… "The creature c… Washi… Okla… Fall   Repo…     35.3     -99.2
##  7 "I was staying… "Vincent, Ohio … Washi… Ohio  Summer Repo…     39.4     -81.7
##  8 "Well last yea… "Both sightings… Westc… New … Fall   Repo…     41.3     -73.7
##  9 "I grew up in … "The Western fa… Washo… Neva… Fall   Repo…     39.6    -120. 
## 10 "heh i kinda f… "the road is of… Warre… New … Fall   <NA>      NA        NA  
## # ℹ 4,943 more rows
## # ℹ 20 more variables: date <date>, number <dbl>, classification <chr>,
## #   geohash <chr>, temperature_high <dbl>, temperature_mid <dbl>,
## #   temperature_low <dbl>, dew_point <dbl>, humidity <dbl>, cloud_cover <dbl>,
## #   moon_phase <dbl>, precip_intensity <dbl>, precip_probability <dbl>,
## #   precip_type <chr>, pressure <dbl>, summary <chr>, uv_index <dbl>,
## #   visibility <dbl>, wind_bearing <dbl>, wind_speed <dbl>
library(tidytext)
library(tidylo)

bigfoot %>%
  unnest_tokens(word, observed) %>%
  count(classification, word) %>%
  filter(n > 100) %>%
  bind_log_odds(classification, word, n) %>%
  arrange(-log_odds_weighted)
## # A tibble: 1,747 × 4
##    classification word           n log_odds_weighted
##    <chr>          <chr>      <int>             <dbl>
##  1 possible       howl         455              14.7
##  2 sighting       fur          362              13.3
##  3 possible       heard       5397              12.7
##  4 possible       screams      327              12.5
##  5 sighting       ape          300              12.1
##  6 possible       knocks       301              12.0
##  7 sighting       hands        285              11.8
##  8 sighting       headlights   283              11.7
##  9 possible       listened     266              11.2
## 10 sighting       witness      249              11.0
## # ℹ 1,737 more rows

Build a Model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
bigfoot_split <-
  bigfoot %>%
  select(observed, classification) %>%
  initial_split(strata = classification)

bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)

set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, strata = classification)
bigfoot_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [3342/372]> Fold01
##  2 <split [3342/372]> Fold02
##  3 <split [3342/372]> Fold03
##  4 <split [3342/372]> Fold04
##  5 <split [3343/371]> Fold05
##  6 <split [3343/371]> Fold06
##  7 <split [3343/371]> Fold07
##  8 <split [3343/371]> Fold08
##  9 <split [3343/371]> Fold09
## 10 <split [3343/371]> Fold10
library(textrecipes)

bigfoot_rec <-
  recipe(classification ~ observed, data = bigfoot_train) %>%
  step_tokenize(observed) %>%
  step_tokenfilter(observed, max_tokens = 2e3) %>%
  step_tfidf(observed)

bigfoot_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 1
## 
## ── Operations
## • Tokenization for: observed
## • Text filtering for: observed
## • Term frequency-inverse document frequency with: observed
glmnet_spec <- 
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

bigfoot_wf <- workflow(bigfoot_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
bigfoot_res <- 
  tune_grid(
    bigfoot_wf, 
    bigfoot_folds, 
    grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
  )

autoplot(bigfoot_res)

show_best(bigfoot_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
##   penalty .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 0.0158  roc_auc binary     0.857    10 0.00463 Preprocessor1_Model05
## 2 0.00794 roc_auc binary     0.853    10 0.00545 Preprocessor1_Model04
## 3 0.0316  roc_auc binary     0.847    10 0.00487 Preprocessor1_Model06
## 4 0.00398 roc_auc binary     0.837    10 0.00675 Preprocessor1_Model03
## 5 0.0631  roc_auc binary     0.827    10 0.00506 Preprocessor1_Model07
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config             .best .loss
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               <dbl> <dbl>
## 1  0.0316 roc_auc binary     0.847    10 0.00487 Preprocessor1_Mode… 0.857  1.13
bigfoot_final <-
  bigfoot_wf %>%
  finalize_workflow(
    select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
  ) %>%
  last_fit(bigfoot_split)

bigfoot_final
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [3714/1239]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(bigfoot_final)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.764 Preprocessor1_Model1
## 2 roc_auc  binary         0.836 Preprocessor1_Model1
collect_predictions(bigfoot_final) %>%
  conf_mat(classification, .pred_class)
##           Truth
## Prediction possible sighting
##   possible      491      158
##   sighting      134      456
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
bigfoot_final %>%
  extract_fit_engine() %>%
  vi() 
## # A tibble: 2,000 × 3
##    Variable                  Importance Sign 
##    <chr>                          <dbl> <chr>
##  1 tfidf_observed_muscular         860. POS  
##  2 tfidf_observed_hunched          810. POS  
##  3 tfidf_observed_nose             804. POS  
##  4 tfidf_observed_shaggy           779. POS  
##  5 tfidf_observed_guessing         768. POS  
##  6 tfidf_observed_whooping         716. NEG  
##  7 tfidf_observed_especially       682. NEG  
##  8 tfidf_observed_putting          673. POS  
##  9 tfidf_observed_literally        664. NEG  
## 10 tfidf_observed_admit            621. POS  
## # ℹ 1,990 more rows