library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the TidyTuesday (2022-09-13) bigfoot CSV straight from GitHub.
bigfoot_raw <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv"
)
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): observed, location_details, county, state, season, title, classif...
## dbl  (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Tally the reports in each classification level.
count(bigfoot_raw, classification)
## # A tibble: 3 × 2
##   classification     n
##   <chr>          <int>
## 1 Class A         2481
## 2 Class B         2510
## 3 Class C           30
# Drop Class C reports and reports without narrative text, then relabel the
# remaining classes: "Class A" -> "sighting", "Class B" -> "possible".
# Any other value (including NA) maps to NA.
bigfoot <-
  bigfoot_raw %>%
  filter(!is.na(observed), classification != "Class C") %>%
  mutate(
    classification = case_match(
      classification,
      "Class A" ~ "sighting",
      "Class B" ~ "possible"
    )
  )

bigfoot
## # A tibble: 4,953 × 28
##    observed        location_details county state season title latitude longitude
##    <chr>           <chr>            <chr>  <chr> <chr>  <chr>    <dbl>     <dbl>
##  1 "I was canoein…  <NA>            Winst… Alab… Summer <NA>      NA        NA  
##  2 "Ed L. was sal… "East side of P… Valde… Alas… Fall   <NA>      NA        NA  
##  3 "While attendi… "Great swamp ar… Washi… Rhod… Fall   Repo…     41.4     -71.5
##  4 "Hello, My nam… "I would rather… York … Penn… Summer <NA>      NA        NA  
##  5 "It was May 19… "Logging roads … Yamhi… Oreg… Spring <NA>      NA        NA  
##  6 "My two childr… "The creature c… Washi… Okla… Fall   Repo…     35.3     -99.2
##  7 "I was staying… "Vincent, Ohio … Washi… Ohio  Summer Repo…     39.4     -81.7
##  8 "Well last yea… "Both sightings… Westc… New … Fall   Repo…     41.3     -73.7
##  9 "I grew up in … "The Western fa… Washo… Neva… Fall   Repo…     39.6    -120. 
## 10 "heh i kinda f… "the road is of… Warre… New … Fall   <NA>      NA        NA  
## # ℹ 4,943 more rows
## # ℹ 20 more variables: date <date>, number <dbl>, classification <chr>,
## #   geohash <chr>, temperature_high <dbl>, temperature_mid <dbl>,
## #   temperature_low <dbl>, dew_point <dbl>, humidity <dbl>, cloud_cover <dbl>,
## #   moon_phase <dbl>, precip_intensity <dbl>, precip_probability <dbl>,
## #   precip_type <chr>, pressure <dbl>, summary <chr>, uv_index <dbl>,
## #   visibility <dbl>, wind_bearing <dbl>, wind_speed <dbl>
# Classification labels ordered by report count. The plot below does not
# reference this vector.
top_sightings <- bigfoot %>%
  count(classification) %>%
  slice_max(n, n = 6) %>%
  pull(classification)

# Seasonal report counts, one panel per classification, each with its own
# y-axis scale. `bigfoot` already had Class C rows and missing narratives
# removed when it was built (and its labels were recoded), so repeating that
# filter here would be a no-op.
bigfoot %>%
  count(classification, season) %>%
  ggplot(aes(season, n, fill = season)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(classification), scales = "free_y") +
  labs(x = NULL, y = "Sightings in Seasons")

# Number of reports for every state/season combination.
count(bigfoot, state, season)
## # A tibble: 220 × 3
##    state   season      n
##    <chr>   <chr>   <int>
##  1 Alabama Fall       21
##  2 Alabama Spring     14
##  3 Alabama Summer     31
##  4 Alabama Unknown     1
##  5 Alabama Winter     24
##  6 Alaska  Fall        8
##  7 Alaska  Spring      4
##  8 Alaska  Summer      5
##  9 Alaska  Winter      3
## 10 Arizona Fall       20
## # ℹ 210 more rows
# The four states with the most reports (slice_max keeps ties by default).
top_state <-
  bigfoot %>%
  count(state) %>%
  slice_max(order_by = n, n = 4) %>%
  pull(state)

# Seasonal report counts restricted to those states, one panel per state with
# an independent y-axis scale.
bigfoot %>%
  filter(state %in% top_state) %>%
  count(state, season) %>%
  ggplot(mapping = aes(x = season, y = n, fill = season)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = vars(state), scales = "free_y") +
  labs(x = NULL, y = "States with Sightings")

library(tidytext)
library(tidylo)

# Split each narrative into words, keep classification/word pairs counted more
# than 100 times, and rank them by weighted log odds (largest first).
bigfoot %>%
  unnest_tokens(output = word, input = observed) %>%
  count(classification, word) %>%
  filter(n > 100) %>%
  bind_log_odds(set = classification, feature = word, n = n) %>%
  arrange(desc(log_odds_weighted))
## # A tibble: 1,747 × 4
##    classification word           n log_odds_weighted
##    <chr>          <chr>      <int>             <dbl>
##  1 possible       howl         455              14.7
##  2 sighting       fur          362              13.3
##  3 possible       heard       5397              12.7
##  4 possible       screams      327              12.5
##  5 sighting       ape          300              12.1
##  6 possible       knocks       301              12.0
##  7 sighting       hands        285              11.8
##  8 sighting       headlights   283              11.7
##  9 possible       listened     266              11.2
## 10 sighting       witness      249              11.0
## # ℹ 1,737 more rows
# Modelling table: keep the label, the narrative and a handful of context
# columns, drop rows with any missing value, and turn every character column
# except `season` into a factor.
bigfoot_parsed <- bigfoot %>%
  select(
    classification, observed, season, state, county, temperature_mid,
    temperature_high, temperature_low, date, moon_phase
  ) %>%
  # `date` ends up holding the report year as a number. Take it directly from
  # the Date column instead of converting to a string and running
  # parse_number() on it, which only worked because the year comes first.
  mutate(date = lubridate::year(date)) %>%
  na.omit() %>%
  # `season` is deliberately left as character.
  mutate(across(where(is.character) & !season, as.factor))

Feature engineering

# Work on a random subset of 100 rows. The seed is fixed so the same rows are
# drawn on every run; without it the split, folds and metrics that follow
# change each time the script is executed. slice_sample() replaces the
# superseded sample_n().
set.seed(101)
bigfoot_parsed <- slice_sample(bigfoot_parsed, n = 100)

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.4     ✔ rsample      1.1.1
## ✔ dials        1.1.0     ✔ tune         1.0.1
## ✔ infer        1.0.4     ✔ workflows    1.1.2
## ✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
## ✔ parsnip      1.1.0     ✔ yardstick    1.1.0
## ✔ recipes      1.0.3
## Warning: package 'broom' was built under R version 4.2.3
## Warning: package 'parsnip' was built under R version 4.2.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
# Stratified train/test split using only the narrative text and its label.
set.seed(123)
bigfoot_split <- initial_split(
  select(bigfoot_parsed, observed, classification),
  strata = classification
)

bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)

# Stratified cross-validation folds built from the training portion.
set.seed(234)
bigfoot_folds <- bigfoot_train %>% vfold_cv(strata = classification)
bigfoot_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [66/8]> Fold01
##  2 <split [66/8]> Fold02
##  3 <split [66/8]> Fold03
##  4 <split [66/8]> Fold04
##  5 <split [66/8]> Fold05
##  6 <split [66/8]> Fold06
##  7 <split [66/8]> Fold07
##  8 <split [68/6]> Fold08
##  9 <split [68/6]> Fold09
## 10 <split [68/6]> Fold10
library(embed)
library(textrecipes)

# Text recipe: tokenize the narrative, remove stop words, cap the vocabulary
# at 200 tokens and weight the remaining tokens by tf-idf.
bigfoot_rec <-
  recipe(classification ~ ., data = bigfoot_train) %>%
  step_tokenize(observed) %>%
  step_stopwords(observed) %>%
  step_tokenfilter(observed, max_tokens = 200) %>%
  step_tfidf(observed) %>%
  # Dummy-encodes whatever nominal predictors remain after the text steps.
  step_dummy(all_nominal_predictors())

# Inspect the processed training set. bake(new_data = NULL) is the current
# replacement for the superseded juice().
bigfoot_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 74
## Columns: 201
## $ classification            <fct> possible, possible, possible, possible, poss…
## $ tfidf_observed_1          <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_10         <dbl> 0.000000000, 0.006522484, 0.037533928, 0.000…
## $ tfidf_observed_15         <dbl> 0.000000000, 0.009333842, 0.017904006, 0.000…
## $ tfidf_observed_2          <dbl> 0.08039507, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_20         <dbl> 0.00000000, 0.00000000, 0.01858869, 0.000000…
## $ tfidf_observed_3          <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_30         <dbl> 0.000000000, 0.000000000, 0.024492248, 0.000…
## $ tfidf_observed_4          <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_5          <dbl> 0.00000000, 0.00000000, 0.03237429, 0.000000…
## $ tfidf_observed_6          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_7          <dbl> 0.05087389, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_across     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_almost     <dbl> 0.00000000, 0.01866768, 0.00000000, 0.000000…
## $ tfidf_observed_along      <dbl> 0.000000000, 0.000000000, 0.000000000, 0.025…
## $ tfidf_observed_also       <dbl> 0.000000000, 0.014668839, 0.000000000, 0.022…
## $ tfidf_observed_animal     <dbl> 0.000000000, 0.000000000, 0.029662124, 0.000…
## $ tfidf_observed_animals    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_another    <dbl> 0.000000000, 0.055656508, 0.030502657, 0.000…
## $ tfidf_observed_anything   <dbl> 0.00000000, 0.01546367, 0.02966212, 0.011908…
## $ tfidf_observed_appeared   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.028750…
## $ tfidf_observed_area       <dbl> 0.091185101, 0.020167321, 0.019342294, 0.023…
## $ tfidf_observed_arms       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_around     <dbl> 0.060790067, 0.005041830, 0.009671147, 0.023…
## $ tfidf_observed_asked      <dbl> 0.000000000, 0.009690787, 0.000000000, 0.000…
## $ tfidf_observed_away       <dbl> 0.03693624, 0.01838059, 0.00000000, 0.018872…
## $ tfidf_observed_back       <dbl> 0.027018370, 0.013445160, 0.060177278, 0.027…
## $ tfidf_observed_bear       <dbl> 0.000000000, 0.000000000, 0.017281443, 0.027…
## $ tfidf_observed_bed        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_behind     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_believe    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_big        <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_bigfoot    <dbl> 0.00000000, 0.05268779, 0.00000000, 0.000000…
## $ tfidf_observed_black      <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_body       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_brother    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_observed_brown      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_call       <dbl> 0.00000000, 0.00000000, 0.02019651, 0.000000…
## $ tfidf_observed_came       <dbl> 0.034220091, 0.017028955, 0.010888211, 0.000…
## $ tfidf_observed_camp       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.016987…
## $ tfidf_observed_can        <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_car        <dbl> 0.00000000, 0.00000000, 0.01443782, 0.000000…
## $ tfidf_observed_clear      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.015534…
## $ tfidf_observed_come       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.013418…
## $ tfidf_observed_coming     <dbl> 0.000000000, 0.007153339, 0.000000000, 0.000…
## $ tfidf_observed_continued  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.028750…
## $ tfidf_observed_county     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.014925…
## $ tfidf_observed_couple     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_coyote     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_creature   <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_creek      <dbl> 0.12161324, 0.01008641, 0.03869512, 0.000000…
## $ tfidf_observed_dark       <dbl> 0.00000000, 0.02728373, 0.00000000, 0.000000…
## $ tfidf_observed_day        <dbl> 0.00000000, 0.00000000, 0.01224612, 0.000000…
## $ tfidf_observed_decided    <dbl> 0.00000000, 0.00000000, 0.02966212, 0.011908…
## $ tfidf_observed_deep       <dbl> 0.000000000, 0.000000000, 0.017281443, 0.000…
## $ tfidf_observed_deer       <dbl> 0.04793275, 0.07950930, 0.03050266, 0.012245…
## $ tfidf_observed_direction  <dbl> 0.00000000, 0.01938157, 0.01858869, 0.000000…
## $ tfidf_observed_distance   <dbl> 0.000000000, 0.000000000, 0.000000000, 0.013…
## $ tfidf_observed_dog        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.017872…
## $ tfidf_observed_dogs       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_observed_door       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_even       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.012607…
## $ tfidf_observed_ever       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_face       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_observed_fast       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_feet       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_felt       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.015…
## $ tfidf_observed_figure     <dbl> 0.05431311, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_first      <dbl> 0.000000000, 0.030107308, 0.014437823, 0.000…
## $ tfidf_observed_foot       <dbl> 0.00000000, 0.00000000, 0.01790401, 0.028750…
## $ tfidf_observed_forest     <dbl> 0.000000000, 0.009333842, 0.035808012, 0.000…
## $ tfidf_observed_found      <dbl> 0.000000000, 0.007731833, 0.000000000, 0.000…
## $ tfidf_observed_friend     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.011592…
## $ tfidf_observed_front      <dbl> 0.000000000, 0.007153339, 0.000000000, 0.000…
## $ tfidf_observed_full       <dbl> 0.000000000, 0.010086406, 0.019347561, 0.000…
## $ tfidf_observed_get        <dbl> 0.00000000, 0.01304497, 0.00000000, 0.000000…
## $ tfidf_observed_girlfriend <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_go         <dbl> 0.000000000, 0.007153339, 0.000000000, 0.011…
## $ tfidf_observed_going      <dbl> 0.000000000, 0.000000000, 0.017904006, 0.014…
## $ tfidf_observed_got        <dbl> 0.031913184, 0.015880968, 0.010154195, 0.024…
## $ tfidf_observed_ground     <dbl> 0.09322382, 0.00000000, 0.00000000, 0.023816…
## $ tfidf_observed_hair       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_half       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_happened   <dbl> 0.00000000, 0.01687759, 0.01618715, 0.000000…
## $ tfidf_observed_head       <dbl> 0.00000000, 0.00000000, 0.02887565, 0.000000…
## $ tfidf_observed_hear       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_heard      <dbl> 0.087078906, 0.067407052, 0.055413849, 0.014…
## $ tfidf_observed_heavy      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.015534…
## $ tfidf_observed_hill       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.028750…
## $ tfidf_observed_home       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_house      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_howl       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_human      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_hunting    <dbl> 0.000000000, 0.008438797, 0.000000000, 0.025…
## $ tfidf_observed_incident   <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_jumped     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_just       <dbl> 0.029467430, 0.014663887, 0.009376001, 0.015…
## $ tfidf_observed_kind       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_knocks     <dbl> 0.14801527, 0.03682844, 0.00000000, 0.000000…
## $ tfidf_observed_know       <dbl> 0.00000000, 0.01396498, 0.00000000, 0.010754…
## $ tfidf_observed_lake       <dbl> 0.00000000, 0.07365689, 0.00000000, 0.000000…
## $ tfidf_observed_large      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.028889…
## $ tfidf_observed_later      <dbl> 0.000000000, 0.008438797, 0.000000000, 0.000…
## $ tfidf_observed_left       <dbl> 0.078642516, 0.006522484, 0.012511309, 0.010…
## $ tfidf_observed_legs       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_light      <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_like       <dbl> 0.000000000, 0.000000000, 0.015889003, 0.031…
## $ tfidf_observed_line       <dbl> 0.000000000, 0.036037133, 0.000000000, 0.000…
## $ tfidf_observed_little     <dbl> 0.000000000, 0.000000000, 0.000000000, 0.012…
## $ tfidf_observed_long       <dbl> 0.000000000, 0.006252529, 0.000000000, 0.000…
## $ tfidf_observed_look       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.012…
## $ tfidf_observed_looked     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_looking    <dbl> 0.04793275, 0.00000000, 0.00000000, 0.012245…
## $ tfidf_observed_loud       <dbl> 0.000000000, 0.006820933, 0.013083789, 0.000…
## $ tfidf_observed_made       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.010…
## $ tfidf_observed_make       <dbl> 0.000000000, 0.016877594, 0.000000000, 0.000…
## $ tfidf_observed_man        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_observed_many       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.015534…
## $ tfidf_observed_maybe      <dbl> 0.000000000, 0.008185881, 0.000000000, 0.012…
## $ tfidf_observed_mile       <dbl> 0.000000000, 0.009690787, 0.000000000, 0.000…
## $ tfidf_observed_miles      <dbl> 0.000000000, 0.009333842, 0.000000000, 0.000…
## $ tfidf_observed_minutes    <dbl> 0.00000000, 0.00000000, 0.06280803, 0.000000…
## $ tfidf_observed_mountain   <dbl> 0.000000000, 0.009333842, 0.000000000, 0.071…
## $ tfidf_observed_much       <dbl> 0.000000000, 0.017424450, 0.000000000, 0.000…
## $ tfidf_observed_near       <dbl> 0.00000000, 0.03180372, 0.01525133, 0.012245…
## $ tfidf_observed_never      <dbl> 0.00000000, 0.00000000, 0.01279012, 0.000000…
## $ tfidf_observed_next       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_night      <dbl> 0.00000000, 0.01250506, 0.00000000, 0.000000…
## $ tfidf_observed_noise      <dbl> 0.04793275, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_north      <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_nothing    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_now        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_occurred   <dbl> 0.000000000, 0.011604450, 0.044518891, 0.000…
## $ tfidf_observed_old        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0402543, …
## $ tfidf_observed_one        <dbl> 0.000000000, 0.016151656, 0.020654542, 0.008…
## $ tfidf_observed_outside    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_pm         <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_pond       <dbl> 0.00000000, 0.15485573, 0.00000000, 0.000000…
## $ tfidf_observed_prints     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.033974…
## $ tfidf_observed_property   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.016216…
## $ tfidf_observed_ran        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_right      <dbl> 0.038487819, 0.000000000, 0.012246124, 0.029…
## $ tfidf_observed_road       <dbl> 0.00000000, 0.03973423, 0.00000000, 0.034969…
## $ tfidf_observed_rock       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.015…
## $ tfidf_observed_running    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_said       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_saw        <dbl> 0.000000000, 0.004963597, 0.019042164, 0.000…
## $ tfidf_observed_say        <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_scared     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_seconds    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_see        <dbl> 0.031388923, 0.005206693, 0.029962153, 0.000…
## $ tfidf_observed_seemed     <dbl> 0.000000000, 0.000000000, 0.000000000, 0.028…
## $ tfidf_observed_seen       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.009…
## $ tfidf_observed_several    <dbl> 0.000000000, 0.029072360, 0.000000000, 0.000…
## $ tfidf_observed_side       <dbl> 0.036936237, 0.018380592, 0.011752439, 0.009…
## $ tfidf_observed_sitting    <dbl> 0.058421600, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_small      <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_snow       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_someone    <dbl> 0.000000000, 0.009009283, 0.000000000, 0.000…
## $ tfidf_observed_something  <dbl> 0.000000000, 0.005574982, 0.021387660, 0.034…
## $ tfidf_observed_sound      <dbl> 0.000000000, 0.013964983, 0.026787377, 0.000…
## $ tfidf_observed_sounded    <dbl> 0.000000000, 0.000000000, 0.028875646, 0.023…
## $ tfidf_observed_sounds     <dbl> 0.00000000, 0.01687759, 0.00000000, 0.000000…
## $ tfidf_observed_south      <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_spot       <dbl> 0.13298730, 0.00000000, 0.04231414, 0.000000…
## $ tfidf_observed_standing   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_started    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_still      <dbl> 0.000000000, 0.015053654, 0.000000000, 0.011…
## $ tfidf_observed_stood      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_stopped    <dbl> 0.000000000, 0.016371763, 0.000000000, 0.000…
## $ tfidf_observed_strange    <dbl> 0.000000000, 0.000000000, 0.000000000, 0.015…
## $ tfidf_observed_sure       <dbl> 0.000000000, 0.016877594, 0.016187147, 0.000…
## $ tfidf_observed_swamp      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_tall       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_tell       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_tent       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_thing      <dbl> 0.00000000, 0.00000000, 0.01199349, 0.019259…
## $ tfidf_observed_think      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_thought    <dbl> 0.000000000, 0.021910345, 0.021014013, 0.008…
## $ tfidf_observed_three      <dbl> 0.000000000, 0.009009283, 0.000000000, 0.013…
## $ tfidf_observed_time       <dbl> 0.00000000, 0.02356765, 0.01130176, 0.000000…
## $ tfidf_observed_told       <dbl> 0.000000000, 0.007153339, 0.000000000, 0.033…
## $ tfidf_observed_took       <dbl> 0.000000000, 0.000000000, 0.000000000, 0.021…
## $ tfidf_observed_towards    <dbl> 0.000000000, 0.000000000, 0.037177382, 0.000…
## $ tfidf_observed_tracks     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.016987…
## $ tfidf_observed_trail      <dbl> 0.00000000, 0.00000000, 0.12097010, 0.069378…
## $ tfidf_observed_tree       <dbl> 0.00000000, 0.04516096, 0.00000000, 0.000000…
## $ tfidf_observed_trees      <dbl> 0.000000000, 0.008185881, 0.000000000, 0.000…
## $ tfidf_observed_truck      <dbl> 0.00000000, 0.04411901, 0.00000000, 0.000000…
## $ tfidf_observed_turned     <dbl> 0.00000000, 0.00000000, 0.01251131, 0.000000…
## $ tfidf_observed_two        <dbl> 0.00000000, 0.03344989, 0.02138766, 0.017172…
## $ tfidf_observed_us         <dbl> 0.00000000, 0.01095517, 0.00000000, 0.025308…
## $ tfidf_observed_walk       <dbl> 0.00000000, 0.00000000, 0.04039302, 0.032432…
## $ tfidf_observed_walked     <dbl> 0.000000000, 0.000000000, 0.043313469, 0.023…
## $ tfidf_observed_walking    <dbl> 0.000000000, 0.006820933, 0.013083789, 0.010…
## $ tfidf_observed_way        <dbl> 0.000000000, 0.017675737, 0.000000000, 0.045…
## $ tfidf_observed_well       <dbl> 0.000000000, 0.009690787, 0.018588691, 0.029…
## $ tfidf_observed_went       <dbl> 0.000000000, 0.021535542, 0.000000000, 0.000…
## $ tfidf_observed_woods      <dbl> 0.00000000, 0.02728373, 0.02616758, 0.000000…
## $ tfidf_observed_yards      <dbl> 0.00000000, 0.02146002, 0.00000000, 0.022034…
## $ tfidf_observed_years      <dbl> 0.04421607, 0.01466884, 0.00000000, 0.022592…

Build a model

# Boosted-tree classifier on the xgboost engine. `mtry`, `trees` and `min_n`
# are left as tuning placeholders; the learning rate is fixed at 0.01.
xgb_spec <-
  boost_tree(
    mtry = tune(),
    trees = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the preprocessing recipe with the model specification.
xgb_wf <- workflow(preprocessor = bigfoot_rec, spec = xgb_spec)
library(finetune)
# Register a doParallel backend before tuning.
doParallel::registerDoParallel()

# Evaluate a grid of 5 candidate parameter sets on the cross-validation folds,
# logging progress and keeping the held-out predictions.
set.seed(345)
xgb_rs <-
  xgb_wf %>%
  tune_grid(
    resamples = bigfoot_folds,
    grid = 5,
    control = control_grid(save_pred = TRUE, verbose = TRUE)
  )
## i Creating pre-processing data to finalize unknown parameter: mtry
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits         id     .metrics          .notes           .predictions     
##    <list>         <chr>  <list>            <list>           <list>           
##  1 <split [66/8]> Fold01 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [40 × 9]>
##  2 <split [66/8]> Fold02 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [40 × 9]>
##  3 <split [66/8]> Fold03 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [40 × 9]>
##  4 <split [66/8]> Fold04 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [40 × 9]>
##  5 <split [66/8]> Fold05 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [40 × 9]>
##  6 <split [66/8]> Fold06 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [40 × 9]>
##  7 <split [66/8]> Fold07 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [40 × 9]>
##  8 <split [68/6]> Fold08 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [30 × 9]>
##  9 <split [68/6]> Fold09 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [30 × 9]>
## 10 <split [68/6]> Fold10 <tibble [10 × 7]> <tibble [0 × 3]> <tibble [30 × 9]>

Evaluate and finalize model:

# Resampled performance of every tuning candidate.
xgb_rs %>% collect_metrics()
## # A tibble: 10 × 9
##     mtry trees min_n .metric  .estimator  mean     n std_err .config            
##    <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>              
##  1    41   606    28 accuracy binary     0.5      10  0      Preprocessor1_Mode…
##  2    41   606    28 roc_auc  binary     0.5      10  0      Preprocessor1_Mode…
##  3    46  1196    24 accuracy binary     0.5      10  0      Preprocessor1_Mode…
##  4    46  1196    24 roc_auc  binary     0.5      10  0      Preprocessor1_Mode…
##  5    81  1331    37 accuracy binary     0.5      10  0      Preprocessor1_Mode…
##  6    81  1331    37 roc_auc  binary     0.5      10  0      Preprocessor1_Mode…
##  7   131  1641     7 accuracy binary     0.812    10  0.0404 Preprocessor1_Mode…
##  8   131  1641     7 roc_auc  binary     0.915    10  0.0335 Preprocessor1_Mode…
##  9   185   194    13 accuracy binary     0.5      10  0      Preprocessor1_Mode…
## 10   185   194    13 roc_auc  binary     0.5      10  0      Preprocessor1_Mode…
# plot_race(xgb_rs)
# Plug the most accurate tuning candidate into the workflow, fit it on the
# training set and evaluate it once on the held-out test set.
# `metric` is passed by name: newer tune releases expect it as a named
# argument, and naming it also works with the version used here.
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "accuracy")) %>%
  last_fit(bigfoot_split)
## Warning: package 'stopwords' was built under R version 4.2.3
xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits          id               .metrics .notes   .predictions .workflow 
##   <list>          <chr>            <list>   <list>   <list>       <list>    
## 1 <split [74/26]> train/test split <tibble> <tibble> <tibble>     <workflow>
# Test-set metrics of the final fit.
xgb_last %>% collect_metrics()
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.692 Preprocessor1_Model1
## 2 roc_auc  binary         0.731 Preprocessor1_Model1
# Confusion matrix of the test-set predictions against the true labels.
xgb_last %>%
  collect_predictions() %>%
  conf_mat(truth = classification, estimate = .pred_class)
##           Truth
## Prediction possible sighting
##   possible       10        5
##   sighting        3        8
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Variable-importance plot for the engine-level fit inside the final workflow.
vip(extract_fit_engine(xgb_last))