Feature engineering
# Shrink the data to a random subset of at most 100 rows (presumably to keep
# the feature-engineering steps below fast -- confirm this is intended).
# Seed the RNG *before* drawing so the same rows are selected on every run;
# the seeds set further down only cover the split and the resampling folds.
set.seed(1234)
# slice_sample() supersedes sample_n(); if fewer than 100 rows are available
# it returns all of them instead of raising an error.
bigfoot_parsed <- slice_sample(bigfoot_parsed, n = 100)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.4 ✔ rsample 1.1.1
## ✔ dials 1.1.0 ✔ tune 1.0.1
## ✔ infer 1.0.4 ✔ workflows 1.1.2
## ✔ modeldata 1.0.1 ✔ workflowsets 1.0.0
## ✔ parsnip 1.1.0 ✔ yardstick 1.1.0
## ✔ recipes 1.0.3
## Warning: package 'broom' was built under R version 4.2.3
## Warning: package 'parsnip' was built under R version 4.2.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
# Train/test split ----
# Only the report text (`observed`) and the outcome (`classification`) are
# carried forward. The split is stratified on the outcome so both partitions
# keep a similar class balance.
set.seed(123)
bigfoot_split <- initial_split(
  select(bigfoot_parsed, observed, classification),
  strata = classification
)
bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)

# Resampling folds ----
# Stratified cross-validation folds built from the training partition only;
# the test partition is not touched here.
set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, strata = classification)
bigfoot_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [66/8]> Fold01
## 2 <split [66/8]> Fold02
## 3 <split [66/8]> Fold03
## 4 <split [66/8]> Fold04
## 5 <split [66/8]> Fold05
## 6 <split [66/8]> Fold06
## 7 <split [66/8]> Fold07
## 8 <split [68/6]> Fold08
## 9 <split [68/6]> Fold09
## 10 <split [68/6]> Fold10
library(embed)
library(textrecipes)
# Text preprocessing recipe ----
# Turns the free-text `observed` column into numeric tf-idf features for
# predicting `classification`:
#   1. split each report into tokens,
#   2. drop stop words,
#   3. keep at most 200 tokens,
#   4. weight the remaining tokens by tf-idf.
bigfoot_rec <-
  recipe(classification ~ ., data = bigfoot_train) %>%
  step_tokenize(observed) %>%
  step_stopwords(observed) %>%
  step_tokenfilter(observed, max_tokens = 200) %>%
  step_tfidf(observed) %>%
  # With `observed` as the only predictor, nothing nominal is left at this
  # point, so this step selects no columns; it only matters if categorical
  # predictors are added to the training data later.
  step_dummy(all_nominal_predictors())

# Preview the processed training set. `bake(new_data = NULL)` returns the
# preprocessed training data and replaces the superseded `juice()`.
bigfoot_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 74
## Columns: 201
## $ classification <fct> possible, possible, possible, possible, poss…
## $ tfidf_observed_1 <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_10 <dbl> 0.000000000, 0.006522484, 0.037533928, 0.000…
## $ tfidf_observed_15 <dbl> 0.000000000, 0.009333842, 0.017904006, 0.000…
## $ tfidf_observed_2 <dbl> 0.08039507, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_20 <dbl> 0.00000000, 0.00000000, 0.01858869, 0.000000…
## $ tfidf_observed_3 <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_30 <dbl> 0.000000000, 0.000000000, 0.024492248, 0.000…
## $ tfidf_observed_4 <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_5 <dbl> 0.00000000, 0.00000000, 0.03237429, 0.000000…
## $ tfidf_observed_6 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_7 <dbl> 0.05087389, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_across <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_almost <dbl> 0.00000000, 0.01866768, 0.00000000, 0.000000…
## $ tfidf_observed_along <dbl> 0.000000000, 0.000000000, 0.000000000, 0.025…
## $ tfidf_observed_also <dbl> 0.000000000, 0.014668839, 0.000000000, 0.022…
## $ tfidf_observed_animal <dbl> 0.000000000, 0.000000000, 0.029662124, 0.000…
## $ tfidf_observed_animals <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_another <dbl> 0.000000000, 0.055656508, 0.030502657, 0.000…
## $ tfidf_observed_anything <dbl> 0.00000000, 0.01546367, 0.02966212, 0.011908…
## $ tfidf_observed_appeared <dbl> 0.00000000, 0.00000000, 0.00000000, 0.028750…
## $ tfidf_observed_area <dbl> 0.091185101, 0.020167321, 0.019342294, 0.023…
## $ tfidf_observed_arms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_around <dbl> 0.060790067, 0.005041830, 0.009671147, 0.023…
## $ tfidf_observed_asked <dbl> 0.000000000, 0.009690787, 0.000000000, 0.000…
## $ tfidf_observed_away <dbl> 0.03693624, 0.01838059, 0.00000000, 0.018872…
## $ tfidf_observed_back <dbl> 0.027018370, 0.013445160, 0.060177278, 0.027…
## $ tfidf_observed_bear <dbl> 0.000000000, 0.000000000, 0.017281443, 0.027…
## $ tfidf_observed_bed <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_behind <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_believe <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_big <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_bigfoot <dbl> 0.00000000, 0.05268779, 0.00000000, 0.000000…
## $ tfidf_observed_black <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_body <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_brother <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_observed_brown <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_call <dbl> 0.00000000, 0.00000000, 0.02019651, 0.000000…
## $ tfidf_observed_came <dbl> 0.034220091, 0.017028955, 0.010888211, 0.000…
## $ tfidf_observed_camp <dbl> 0.00000000, 0.00000000, 0.00000000, 0.016987…
## $ tfidf_observed_can <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_car <dbl> 0.00000000, 0.00000000, 0.01443782, 0.000000…
## $ tfidf_observed_clear <dbl> 0.00000000, 0.00000000, 0.00000000, 0.015534…
## $ tfidf_observed_come <dbl> 0.00000000, 0.00000000, 0.00000000, 0.013418…
## $ tfidf_observed_coming <dbl> 0.000000000, 0.007153339, 0.000000000, 0.000…
## $ tfidf_observed_continued <dbl> 0.00000000, 0.00000000, 0.00000000, 0.028750…
## $ tfidf_observed_county <dbl> 0.00000000, 0.00000000, 0.00000000, 0.014925…
## $ tfidf_observed_couple <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_coyote <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_creature <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_creek <dbl> 0.12161324, 0.01008641, 0.03869512, 0.000000…
## $ tfidf_observed_dark <dbl> 0.00000000, 0.02728373, 0.00000000, 0.000000…
## $ tfidf_observed_day <dbl> 0.00000000, 0.00000000, 0.01224612, 0.000000…
## $ tfidf_observed_decided <dbl> 0.00000000, 0.00000000, 0.02966212, 0.011908…
## $ tfidf_observed_deep <dbl> 0.000000000, 0.000000000, 0.017281443, 0.000…
## $ tfidf_observed_deer <dbl> 0.04793275, 0.07950930, 0.03050266, 0.012245…
## $ tfidf_observed_direction <dbl> 0.00000000, 0.01938157, 0.01858869, 0.000000…
## $ tfidf_observed_distance <dbl> 0.000000000, 0.000000000, 0.000000000, 0.013…
## $ tfidf_observed_dog <dbl> 0.00000000, 0.00000000, 0.00000000, 0.017872…
## $ tfidf_observed_dogs <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_observed_door <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_even <dbl> 0.00000000, 0.00000000, 0.00000000, 0.012607…
## $ tfidf_observed_ever <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_face <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_observed_fast <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_feet <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_felt <dbl> 0.000000000, 0.000000000, 0.000000000, 0.015…
## $ tfidf_observed_figure <dbl> 0.05431311, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_first <dbl> 0.000000000, 0.030107308, 0.014437823, 0.000…
## $ tfidf_observed_foot <dbl> 0.00000000, 0.00000000, 0.01790401, 0.028750…
## $ tfidf_observed_forest <dbl> 0.000000000, 0.009333842, 0.035808012, 0.000…
## $ tfidf_observed_found <dbl> 0.000000000, 0.007731833, 0.000000000, 0.000…
## $ tfidf_observed_friend <dbl> 0.00000000, 0.00000000, 0.00000000, 0.011592…
## $ tfidf_observed_front <dbl> 0.000000000, 0.007153339, 0.000000000, 0.000…
## $ tfidf_observed_full <dbl> 0.000000000, 0.010086406, 0.019347561, 0.000…
## $ tfidf_observed_get <dbl> 0.00000000, 0.01304497, 0.00000000, 0.000000…
## $ tfidf_observed_girlfriend <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_go <dbl> 0.000000000, 0.007153339, 0.000000000, 0.011…
## $ tfidf_observed_going <dbl> 0.000000000, 0.000000000, 0.017904006, 0.014…
## $ tfidf_observed_got <dbl> 0.031913184, 0.015880968, 0.010154195, 0.024…
## $ tfidf_observed_ground <dbl> 0.09322382, 0.00000000, 0.00000000, 0.023816…
## $ tfidf_observed_hair <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_half <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_happened <dbl> 0.00000000, 0.01687759, 0.01618715, 0.000000…
## $ tfidf_observed_head <dbl> 0.00000000, 0.00000000, 0.02887565, 0.000000…
## $ tfidf_observed_hear <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_heard <dbl> 0.087078906, 0.067407052, 0.055413849, 0.014…
## $ tfidf_observed_heavy <dbl> 0.00000000, 0.00000000, 0.00000000, 0.015534…
## $ tfidf_observed_hill <dbl> 0.00000000, 0.00000000, 0.00000000, 0.028750…
## $ tfidf_observed_home <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_house <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_howl <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_human <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_hunting <dbl> 0.000000000, 0.008438797, 0.000000000, 0.025…
## $ tfidf_observed_incident <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_jumped <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_just <dbl> 0.029467430, 0.014663887, 0.009376001, 0.015…
## $ tfidf_observed_kind <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_knocks <dbl> 0.14801527, 0.03682844, 0.00000000, 0.000000…
## $ tfidf_observed_know <dbl> 0.00000000, 0.01396498, 0.00000000, 0.010754…
## $ tfidf_observed_lake <dbl> 0.00000000, 0.07365689, 0.00000000, 0.000000…
## $ tfidf_observed_large <dbl> 0.00000000, 0.00000000, 0.00000000, 0.028889…
## $ tfidf_observed_later <dbl> 0.000000000, 0.008438797, 0.000000000, 0.000…
## $ tfidf_observed_left <dbl> 0.078642516, 0.006522484, 0.012511309, 0.010…
## $ tfidf_observed_legs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_light <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_like <dbl> 0.000000000, 0.000000000, 0.015889003, 0.031…
## $ tfidf_observed_line <dbl> 0.000000000, 0.036037133, 0.000000000, 0.000…
## $ tfidf_observed_little <dbl> 0.000000000, 0.000000000, 0.000000000, 0.012…
## $ tfidf_observed_long <dbl> 0.000000000, 0.006252529, 0.000000000, 0.000…
## $ tfidf_observed_look <dbl> 0.000000000, 0.000000000, 0.000000000, 0.012…
## $ tfidf_observed_looked <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_looking <dbl> 0.04793275, 0.00000000, 0.00000000, 0.012245…
## $ tfidf_observed_loud <dbl> 0.000000000, 0.006820933, 0.013083789, 0.000…
## $ tfidf_observed_made <dbl> 0.000000000, 0.000000000, 0.000000000, 0.010…
## $ tfidf_observed_make <dbl> 0.000000000, 0.016877594, 0.000000000, 0.000…
## $ tfidf_observed_man <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_observed_many <dbl> 0.00000000, 0.00000000, 0.00000000, 0.015534…
## $ tfidf_observed_maybe <dbl> 0.000000000, 0.008185881, 0.000000000, 0.012…
## $ tfidf_observed_mile <dbl> 0.000000000, 0.009690787, 0.000000000, 0.000…
## $ tfidf_observed_miles <dbl> 0.000000000, 0.009333842, 0.000000000, 0.000…
## $ tfidf_observed_minutes <dbl> 0.00000000, 0.00000000, 0.06280803, 0.000000…
## $ tfidf_observed_mountain <dbl> 0.000000000, 0.009333842, 0.000000000, 0.071…
## $ tfidf_observed_much <dbl> 0.000000000, 0.017424450, 0.000000000, 0.000…
## $ tfidf_observed_near <dbl> 0.00000000, 0.03180372, 0.01525133, 0.012245…
## $ tfidf_observed_never <dbl> 0.00000000, 0.00000000, 0.01279012, 0.000000…
## $ tfidf_observed_next <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_night <dbl> 0.00000000, 0.01250506, 0.00000000, 0.000000…
## $ tfidf_observed_noise <dbl> 0.04793275, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_north <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_nothing <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_now <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_occurred <dbl> 0.000000000, 0.011604450, 0.044518891, 0.000…
## $ tfidf_observed_old <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0402543, …
## $ tfidf_observed_one <dbl> 0.000000000, 0.016151656, 0.020654542, 0.008…
## $ tfidf_observed_outside <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_pm <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_pond <dbl> 0.00000000, 0.15485573, 0.00000000, 0.000000…
## $ tfidf_observed_prints <dbl> 0.00000000, 0.00000000, 0.00000000, 0.033974…
## $ tfidf_observed_property <dbl> 0.00000000, 0.00000000, 0.00000000, 0.016216…
## $ tfidf_observed_ran <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_right <dbl> 0.038487819, 0.000000000, 0.012246124, 0.029…
## $ tfidf_observed_road <dbl> 0.00000000, 0.03973423, 0.00000000, 0.034969…
## $ tfidf_observed_rock <dbl> 0.000000000, 0.000000000, 0.000000000, 0.015…
## $ tfidf_observed_running <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_said <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_saw <dbl> 0.000000000, 0.004963597, 0.019042164, 0.000…
## $ tfidf_observed_say <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_scared <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_seconds <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_see <dbl> 0.031388923, 0.005206693, 0.029962153, 0.000…
## $ tfidf_observed_seemed <dbl> 0.000000000, 0.000000000, 0.000000000, 0.028…
## $ tfidf_observed_seen <dbl> 0.000000000, 0.000000000, 0.000000000, 0.009…
## $ tfidf_observed_several <dbl> 0.000000000, 0.029072360, 0.000000000, 0.000…
## $ tfidf_observed_side <dbl> 0.036936237, 0.018380592, 0.011752439, 0.009…
## $ tfidf_observed_sitting <dbl> 0.058421600, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_small <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_snow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_observed_someone <dbl> 0.000000000, 0.009009283, 0.000000000, 0.000…
## $ tfidf_observed_something <dbl> 0.000000000, 0.005574982, 0.021387660, 0.034…
## $ tfidf_observed_sound <dbl> 0.000000000, 0.013964983, 0.026787377, 0.000…
## $ tfidf_observed_sounded <dbl> 0.000000000, 0.000000000, 0.028875646, 0.023…
## $ tfidf_observed_sounds <dbl> 0.00000000, 0.01687759, 0.00000000, 0.000000…
## $ tfidf_observed_south <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_spot <dbl> 0.13298730, 0.00000000, 0.04231414, 0.000000…
## $ tfidf_observed_standing <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_started <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_still <dbl> 0.000000000, 0.015053654, 0.000000000, 0.011…
## $ tfidf_observed_stood <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_stopped <dbl> 0.000000000, 0.016371763, 0.000000000, 0.000…
## $ tfidf_observed_strange <dbl> 0.000000000, 0.000000000, 0.000000000, 0.015…
## $ tfidf_observed_sure <dbl> 0.000000000, 0.016877594, 0.016187147, 0.000…
## $ tfidf_observed_swamp <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_tall <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_tell <dbl> 0.000000000, 0.000000000, 0.000000000, 0.000…
## $ tfidf_observed_tent <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_thing <dbl> 0.00000000, 0.00000000, 0.01199349, 0.019259…
## $ tfidf_observed_think <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_observed_thought <dbl> 0.000000000, 0.021910345, 0.021014013, 0.008…
## $ tfidf_observed_three <dbl> 0.000000000, 0.009009283, 0.000000000, 0.013…
## $ tfidf_observed_time <dbl> 0.00000000, 0.02356765, 0.01130176, 0.000000…
## $ tfidf_observed_told <dbl> 0.000000000, 0.007153339, 0.000000000, 0.033…
## $ tfidf_observed_took <dbl> 0.000000000, 0.000000000, 0.000000000, 0.021…
## $ tfidf_observed_towards <dbl> 0.000000000, 0.000000000, 0.037177382, 0.000…
## $ tfidf_observed_tracks <dbl> 0.00000000, 0.00000000, 0.00000000, 0.016987…
## $ tfidf_observed_trail <dbl> 0.00000000, 0.00000000, 0.12097010, 0.069378…
## $ tfidf_observed_tree <dbl> 0.00000000, 0.04516096, 0.00000000, 0.000000…
## $ tfidf_observed_trees <dbl> 0.000000000, 0.008185881, 0.000000000, 0.000…
## $ tfidf_observed_truck <dbl> 0.00000000, 0.04411901, 0.00000000, 0.000000…
## $ tfidf_observed_turned <dbl> 0.00000000, 0.00000000, 0.01251131, 0.000000…
## $ tfidf_observed_two <dbl> 0.00000000, 0.03344989, 0.02138766, 0.017172…
## $ tfidf_observed_us <dbl> 0.00000000, 0.01095517, 0.00000000, 0.025308…
## $ tfidf_observed_walk <dbl> 0.00000000, 0.00000000, 0.04039302, 0.032432…
## $ tfidf_observed_walked <dbl> 0.000000000, 0.000000000, 0.043313469, 0.023…
## $ tfidf_observed_walking <dbl> 0.000000000, 0.006820933, 0.013083789, 0.010…
## $ tfidf_observed_way <dbl> 0.000000000, 0.017675737, 0.000000000, 0.045…
## $ tfidf_observed_well <dbl> 0.000000000, 0.009690787, 0.018588691, 0.029…
## $ tfidf_observed_went <dbl> 0.000000000, 0.021535542, 0.000000000, 0.000…
## $ tfidf_observed_woods <dbl> 0.00000000, 0.02728373, 0.02616758, 0.000000…
## $ tfidf_observed_yards <dbl> 0.00000000, 0.02146002, 0.00000000, 0.022034…
## $ tfidf_observed_years <dbl> 0.04421607, 0.01466884, 0.00000000, 0.022592…