library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Download the Bigfoot report data from the TidyTuesday repository
# (2022-09-13 release) and read it into a tibble.
bigfoot_raw <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv"
)
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): observed, location_details, county, state, season, title, classif...
## dbl  (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Number of reports in each raw classification level.
count(bigfoot_raw, classification)
## # A tibble: 3 × 2
##   classification     n
##   <chr>          <int>
## 1 Class A         2481
## 2 Class B         2510
## 3 Class C           30
# Keep only reports that have a description and are not "Class C", then give
# the two remaining classes descriptive labels.
bigfoot <- bigfoot_raw %>%
  filter(!is.na(observed), classification != "Class C") %>%
  mutate(
    classification = case_when(
      classification == "Class B" ~ "possible",
      classification == "Class A" ~ "sighting"
    )
  )

bigfoot
## # A tibble: 4,953 × 28
##    observed        location_details county state season title latitude longitude
##    <chr>           <chr>            <chr>  <chr> <chr>  <chr>    <dbl>     <dbl>
##  1 "I was canoein…  <NA>            Winst… Alab… Summer <NA>      NA        NA  
##  2 "Ed L. was sal… "East side of P… Valde… Alas… Fall   <NA>      NA        NA  
##  3 "While attendi… "Great swamp ar… Washi… Rhod… Fall   Repo…     41.4     -71.5
##  4 "Hello, My nam… "I would rather… York … Penn… Summer <NA>      NA        NA  
##  5 "It was May 19… "Logging roads … Yamhi… Oreg… Spring <NA>      NA        NA  
##  6 "My two childr… "The creature c… Washi… Okla… Fall   Repo…     35.3     -99.2
##  7 "I was staying… "Vincent, Ohio … Washi… Ohio  Summer Repo…     39.4     -81.7
##  8 "Well last yea… "Both sightings… Westc… New … Fall   Repo…     41.3     -73.7
##  9 "I grew up in … "The Western fa… Washo… Neva… Fall   Repo…     39.6    -120. 
## 10 "heh i kinda f… "the road is of… Warre… New … Fall   <NA>      NA        NA  
## # ℹ 4,943 more rows
## # ℹ 20 more variables: date <date>, number <dbl>, classification <chr>,
## #   geohash <chr>, temperature_high <dbl>, temperature_mid <dbl>,
## #   temperature_low <dbl>, dew_point <dbl>, humidity <dbl>, cloud_cover <dbl>,
## #   moon_phase <dbl>, precip_intensity <dbl>, precip_probability <dbl>,
## #   precip_type <chr>, pressure <dbl>, summary <chr>, uv_index <dbl>,
## #   visibility <dbl>, wind_bearing <dbl>, wind_speed <dbl>
# Classification labels with the highest report counts (slice_max keeps the
# top six by count).
top_sightings <- bigfoot %>%
  count(classification) %>%
  slice_max(n, n = 6) %>%
  pull(classification)

# Reports per season, one panel per classification.
# `bigfoot` was already filtered to exclude "Class C" and missing descriptions
# when it was created, and its labels are now "sighting"/"possible", so the
# former repeat of that filter here could never remove a row and was dropped.
bigfoot %>%
  count(classification, season) %>%
  ggplot(aes(season, n, fill = season)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(classification), scales = "free_y") +
  labs(x = NULL, y = "Sightings in Seasons")

# Report counts for every state/season combination.
count(bigfoot, state, season)
## # A tibble: 220 × 3
##    state   season      n
##    <chr>   <chr>   <int>
##  1 Alabama Fall       21
##  2 Alabama Spring     14
##  3 Alabama Summer     31
##  4 Alabama Unknown     1
##  5 Alabama Winter     24
##  6 Alaska  Fall        8
##  7 Alaska  Spring      4
##  8 Alaska  Summer      5
##  9 Alaska  Winter      3
## 10 Arizona Fall       20
## # ℹ 210 more rows
# States with the most reports (slice_max keeps the top four by count).
top_state <- bigfoot %>%
  count(state) %>%
  slice_max(n, n = 4) %>%
  pull(state)

# Seasonal report counts for those states, one panel per state.
bigfoot %>%
  filter(state %in% top_state) %>%
  count(state, season) %>%
  ggplot(aes(x = season, y = n, fill = season)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(state), scales = "free_y") +
  labs(x = NULL, y = "States with Sightings")

library(tidytext)
library(tidylo)

# Weighted log odds of each word by classification, restricted to words that
# occur more than 100 times within a classification, strongest first.
bigfoot %>%
  unnest_tokens(word, observed) %>%
  count(classification, word) %>%
  filter(n > 100) %>%
  bind_log_odds(classification, word, n) %>%
  arrange(desc(log_odds_weighted))
## # A tibble: 1,747 × 4
##    classification word           n log_odds_weighted
##    <chr>          <chr>      <int>             <dbl>
##  1 possible       howl         455              14.7
##  2 sighting       fur          362              13.3
##  3 possible       heard       5397              12.7
##  4 possible       screams      327              12.5
##  5 sighting       ape          300              12.1
##  6 possible       knocks       301              12.0
##  7 sighting       hands        285              11.8
##  8 sighting       headlights   283              11.7
##  9 possible       listened     266              11.2
## 10 sighting       witness      249              11.0
## # ℹ 1,737 more rows
# Modelling table: keep the candidate columns, reduce `date` to its year, drop
# incomplete rows and encode the categorical columns as factors.
bigfoot_parsed <- bigfoot %>%
  select(
    classification, observed, season, state, county, temperature_mid,
    temperature_high, temperature_low, date, moon_phase
  ) %>%
  # The date used to be round-tripped through as.character() and
  # parse_number(), which only ever picked up the leading year of the
  # "YYYY-MM-DD" string. Extract the year directly instead.
  mutate(date = lubridate::year(date)) %>%
  # NOTE(review): this removes every row with an NA in ANY selected column
  # (temperatures, moon phase, date, ...), although the split below only uses
  # `observed` and `classification` -- confirm the data loss is intended.
  drop_na() %>%
  # `observed` is free text and `season` was converted straight back to
  # character, so only the genuinely categorical columns become factors.
  mutate(across(c(classification, state, county), as.factor))

Feature Engineering

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
# Hold out a stratified test set; only the report text and the label are
# carried forward into modelling.
set.seed(123)
bigfoot_split <- initial_split(
  select(bigfoot_parsed, observed, classification),
  strata = classification
)

bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)

# Stratified cross-validation folds built from the training portion.
set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, strata = classification)
bigfoot_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [2127/237]> Fold01
##  2 <split [2127/237]> Fold02
##  3 <split [2127/237]> Fold03
##  4 <split [2127/237]> Fold04
##  5 <split [2127/237]> Fold05
##  6 <split [2127/237]> Fold06
##  7 <split [2127/237]> Fold07
##  8 <split [2129/235]> Fold08
##  9 <split [2129/235]> Fold09
## 10 <split [2129/235]> Fold10
library(embed)
library(textrecipes)

# Text preprocessing recipe: tokenize the report text, remove stop words, keep
# at most 200 tokens and turn them into tf-idf features.
# step_dummy() encodes any nominal predictors that remain; with a training set
# holding only `observed` and `classification` there are none left after
# step_tfidf().
bigfoot_rec <-
  recipe(classification ~ ., data = bigfoot_train) %>%
  step_tokenize(observed) %>%
  step_stopwords(observed) %>%
  step_tokenfilter(observed, max_tokens = 200) %>%
  step_tfidf(observed) %>%
  step_dummy(all_nominal_predictors())

# Preview the processed training data. bake(new_data = NULL) is the current
# replacement for the superseded juice().
bigfoot_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 2,364
## Columns: 201
## $ classification           <fct> possible, possible, possible, possible, possi…
## $ tfidf_observed_1         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_10        <dbl> 0.01611678, 0.00000000, 0.06148253, 0.0415007…
## $ tfidf_observed_15        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_2         <dbl> 0.00000000, 0.01717567, 0.00000000, 0.0373570…
## $ tfidf_observed_20        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_3         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_30        <dbl> 0.00000000, 0.00000000, 0.06247454, 0.0421703…
## $ tfidf_observed_4         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_5         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_6         <dbl> 0.00000000, 0.02139852, 0.00000000, 0.0000000…
## $ tfidf_observed_7         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_8         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_across    <dbl> 0.00000000, 0.00000000, 0.05877095, 0.0000000…
## $ tfidf_observed_almost    <dbl> 0.00000000, 0.02135401, 0.00000000, 0.0000000…
## $ tfidf_observed_along     <dbl> 0.03732727, 0.02209603, 0.00000000, 0.0000000…
## $ tfidf_observed_also      <dbl> 0.01639157, 0.00000000, 0.00000000, 0.0422083…
## $ tfidf_observed_animal    <dbl> 0.01613096, 0.05729273, 0.00000000, 0.0000000…
## $ tfidf_observed_another   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_anything  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_area      <dbl> 0.032948453, 0.000000000, 0.041897415, 0.0282…
## $ tfidf_observed_arms      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_around    <dbl> 0.010467666, 0.037178262, 0.000000000, 0.0000…
## $ tfidf_observed_asked     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_away      <dbl> 0.01269058, 0.00000000, 0.04841223, 0.0000000…
## $ tfidf_observed_back      <dbl> 0.037641063, 0.022281779, 0.000000000, 0.0484…
## $ tfidf_observed_bear      <dbl> 0.00000000, 0.01930192, 0.06219507, 0.0000000…
## $ tfidf_observed_behind    <dbl> 0.03106853, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_believe   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_big       <dbl> 0.00000000, 0.01878458, 0.00000000, 0.0000000…
## $ tfidf_observed_bigfoot   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_black     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_brown     <dbl> 0.03680474, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_brush     <dbl> 0.04347373, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_came      <dbl> 0.01374316, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_camp      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_can       <dbl> 0.01844195, 0.02183357, 0.00000000, 0.0000000…
## $ tfidf_observed_car       <dbl> 0.00000000, 0.00000000, 0.14193016, 0.0479014…
## $ tfidf_observed_close     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_closer    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_come      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_coming    <dbl> 0.00000000, 0.00000000, 0.06519937, 0.0000000…
## $ tfidf_observed_county    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_covered   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_creature  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_creek     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_dark      <dbl> 0.00000000, 0.01640980, 0.00000000, 0.0356913…
## $ tfidf_observed_day       <dbl> 0.02830939, 0.03351572, 0.00000000, 0.0000000…
## $ tfidf_observed_decided   <dbl> 0.03481885, 0.00000000, 0.00000000, 0.0448292…
## $ tfidf_observed_deer      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_direction <dbl> 0.00000000, 0.02357171, 0.00000000, 0.0000000…
## $ tfidf_observed_distance  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_dog       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_dogs      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_driving   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_even      <dbl> 0.00000000, 0.02063148, 0.00000000, 0.0000000…
## $ tfidf_observed_ever      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_eyes      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_face      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_fast      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_feet      <dbl> 0.03591763, 0.02834878, 0.00000000, 0.0308293…
## $ tfidf_observed_field     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_figure    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_first     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_foot      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_forest    <dbl> 0.02094772, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_found     <dbl> 0.00000000, 0.02043047, 0.00000000, 0.0000000…
## $ tfidf_observed_friend    <dbl> 0.00000000, 0.02057066, 0.00000000, 0.0000000…
## $ tfidf_observed_friends   <dbl> 0.00000000, 0.02493273, 0.00000000, 0.0000000…
## $ tfidf_observed_front     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_get       <dbl> 0.01337659, 0.01583666, 0.00000000, 0.0688894…
## $ tfidf_observed_go        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_going     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_gone      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_good      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_got       <dbl> 0.000000000, 0.014191172, 0.000000000, 0.0308…
## $ tfidf_observed_ground    <dbl> 0.00000000, 0.02236678, 0.00000000, 0.0000000…
## $ tfidf_observed_hair      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_happened  <dbl> 0.01767149, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_head      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_hear      <dbl> 0.01634742, 0.00000000, 0.00000000, 0.0420946…
## $ tfidf_observed_heard     <dbl> 0.000000000, 0.000000000, 0.000000000, 0.0000…
## $ tfidf_observed_high      <dbl> 0.01976604, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_hill      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_home      <dbl> 0.00000000, 0.00000000, 0.06164519, 0.0416105…
## $ tfidf_observed_house     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_human     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_hunting   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_husband   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_just      <dbl> 0.03090724, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_knew      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_know      <dbl> 0.01411328, 0.01670883, 0.05383955, 0.0000000…
## $ tfidf_observed_lake      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_large     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_last      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_later     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_left      <dbl> 0.01365651, 0.04850417, 0.00000000, 0.0000000…
## $ tfidf_observed_legs      <dbl> 0.00000000, 0.00000000, 0.07272292, 0.0000000…
## $ tfidf_observed_light     <dbl> 0.03732727, 0.02209603, 0.00000000, 0.0000000…
## $ tfidf_observed_like      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0471493…
## $ tfidf_observed_line      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_little    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_long      <dbl> 0.01376259, 0.03258729, 0.00000000, 0.0000000…
## $ tfidf_observed_look      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_looked    <dbl> 0.01151843, 0.01363677, 0.04394069, 0.0000000…
## $ tfidf_observed_looking   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_loud      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0448292…
## $ tfidf_observed_made      <dbl> 0.01504788, 0.05344590, 0.00000000, 0.0000000…
## $ tfidf_observed_make      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_man       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_many      <dbl> 0.02059473, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_maybe     <dbl> 0.01868413, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_mile      <dbl> 0.00000000, 0.04468363, 0.00000000, 0.0485934…
## $ tfidf_observed_miles     <dbl> 0.01889233, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_minutes   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_morning   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_moved     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_moving    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_much      <dbl> 0.01725681, 0.02043047, 0.00000000, 0.0000000…
## $ tfidf_observed_near      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0436754…
## $ tfidf_observed_never     <dbl> 0.01298908, 0.01537788, 0.00000000, 0.0000000…
## $ tfidf_observed_next      <dbl> 0.00000000, 0.01804625, 0.00000000, 0.0000000…
## $ tfidf_observed_night     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_noise     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_north     <dbl> 0.00000000, 0.02395283, 0.07718133, 0.0000000…
## $ tfidf_observed_nothing   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_noticed   <dbl> 0.01760064, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_now       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_old       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0915625…
## $ tfidf_observed_one       <dbl> 0.02202421, 0.02607464, 0.00000000, 0.0000000…
## $ tfidf_observed_outside   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_people    <dbl> 0.01960142, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_point     <dbl> 0.02091999, 0.02476734, 0.00000000, 0.0000000…
## $ tfidf_observed_ran       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_really    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_remember  <dbl> 0.00000000, 0.02599292, 0.00000000, 0.0000000…
## $ tfidf_observed_right     <dbl> 0.05413426, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_river     <dbl> 0.04269379, 0.02527276, 0.00000000, 0.0549682…
## $ tfidf_observed_road      <dbl> 0.000000000, 0.137924268, 0.133326793, 0.0599…
## $ tfidf_observed_run       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_running   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_said      <dbl> 0.04300183, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_saw       <dbl> 0.050348778, 0.000000000, 0.000000000, 0.0000…
## $ tfidf_observed_say       <dbl> 0.01887124, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_scared    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_seconds   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_see       <dbl> 0.042511833, 0.025165051, 0.040543693, 0.0000…
## $ tfidf_observed_seemed    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0494250…
## $ tfidf_observed_seen      <dbl> 0.05451165, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_several   <dbl> 0.00000000, 0.02214463, 0.00000000, 0.0000000…
## $ tfidf_observed_side      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0638579…
## $ tfidf_observed_since     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_small     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_smell     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_someone   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0517743…
## $ tfidf_observed_something <dbl> 0.00000000, 0.00000000, 0.04207650, 0.0284016…
## $ tfidf_observed_son       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_sound     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_sounded   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0464933…
## $ tfidf_observed_sounds    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_south     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_spot      <dbl> 0.02114473, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_standing  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_started   <dbl> 0.04454104, 0.00000000, 0.00000000, 0.0764621…
## $ tfidf_observed_still     <dbl> 0.01544419, 0.01828451, 0.00000000, 0.0000000…
## $ tfidf_observed_stood     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_stopped   <dbl> 0.00000000, 0.00000000, 0.05911250, 0.0000000…
## $ tfidf_observed_sure      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_tall      <dbl> 0.01299761, 0.00000000, 0.04958346, 0.0000000…
## $ tfidf_observed_tell      <dbl> 0.00000000, 0.02216903, 0.00000000, 0.0000000…
## $ tfidf_observed_tent      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_thing     <dbl> 0.00000000, 0.00000000, 0.05633009, 0.0000000…
## $ tfidf_observed_think     <dbl> 0.01709110, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_thought   <dbl> 0.000000000, 0.015149946, 0.000000000, 0.0000…
## $ tfidf_observed_three     <dbl> 0.01964808, 0.02326152, 0.00000000, 0.0000000…
## $ tfidf_observed_time      <dbl> 0.010982818, 0.000000000, 0.000000000, 0.0000…
## $ tfidf_observed_times     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_told      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_took      <dbl> 0.01534278, 0.03632888, 0.00000000, 0.0000000…
## $ tfidf_observed_top       <dbl> 0.04228947, 0.07510026, 0.00000000, 0.0000000…
## $ tfidf_observed_towards   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_tracks    <dbl> 0.00000000, 0.07231156, 0.00000000, 0.0000000…
## $ tfidf_observed_trail     <dbl> 0.01962472, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_tree      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_trees     <dbl> 0.03275364, 0.01938865, 0.00000000, 0.0000000…
## $ tfidf_observed_truck     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_turned    <dbl> 0.00000000, 0.01718871, 0.05538585, 0.0000000…
## $ tfidf_observed_two       <dbl> 0.02549479, 0.06036698, 0.04862896, 0.0000000…
## $ tfidf_observed_us        <dbl> 0.06460685, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_walk      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0976232…
## $ tfidf_observed_walked    <dbl> 0.00000000, 0.01817935, 0.00000000, 0.0000000…
## $ tfidf_observed_walking   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0362357…
## $ tfidf_observed_water     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_way       <dbl> 0.02723682, 0.01612294, 0.00000000, 0.0000000…
## $ tfidf_observed_well      <dbl> 0.00000000, 0.02109177, 0.00000000, 0.0000000…
## $ tfidf_observed_went      <dbl> 0.00000000, 0.01382404, 0.00000000, 0.0300672…
## $ tfidf_observed_wife      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_window    <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_woods     <dbl> 0.01239959, 0.01467998, 0.09460431, 0.0638579…
## $ tfidf_observed_yards     <dbl> 0.01548264, 0.01833002, 0.05906341, 0.0000000…
## $ tfidf_observed_years     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…

Build a Model

# Boosted-tree classifier on the xgboost engine. `mtry`, `trees` and `min_n`
# are marked for tuning; the learning rate is fixed at 0.01.
xgb_spec <- boost_tree(
  mtry = tune(),
  trees = tune(),
  min_n = tune(),
  learn_rate = 0.01
) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

# Bundle the preprocessing recipe and the model specification into a workflow.
xgb_wf <- workflow() %>%
  add_recipe(bigfoot_rec) %>%
  add_model(xgb_spec)
library(finetune)
# Register a parallel backend before tuning.
doParallel::registerDoParallel()

# Tune the workflow over five candidate parameter sets on the
# cross-validation folds, keeping the out-of-sample predictions.
set.seed(345)
xgb_rs <- tune_grid(
  object = xgb_wf,
  resamples = bigfoot_folds,
  control = control_grid(save_pred = TRUE, verbose = TRUE),
  grid = 5
)
## i Creating pre-processing data to finalize unknown parameter: mtry
# Show the tuning results object.
print(xgb_rs)
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits             id     .metrics          .notes           .predictions
##    <list>             <chr>  <list>            <list>           <list>      
##  1 <split [2127/237]> Fold01 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  2 <split [2127/237]> Fold02 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  3 <split [2127/237]> Fold03 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  4 <split [2127/237]> Fold04 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  5 <split [2127/237]> Fold05 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  6 <split [2127/237]> Fold06 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  7 <split [2127/237]> Fold07 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  8 <split [2129/235]> Fold08 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
##  9 <split [2129/235]> Fold09 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>    
## 10 <split [2129/235]> Fold10 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>

Evaluate and Finalize Model

# Resampled accuracy and ROC AUC for every tuning candidate.
xgb_rs %>%
  collect_metrics()
## # A tibble: 10 × 9
##     mtry trees min_n .metric  .estimator  mean     n std_err .config            
##    <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>              
##  1    41   606    28 accuracy binary     0.781    10 0.00516 Preprocessor1_Mode…
##  2    41   606    28 roc_auc  binary     0.866    10 0.00698 Preprocessor1_Mode…
##  3    46  1196    24 accuracy binary     0.781    10 0.00723 Preprocessor1_Mode…
##  4    46  1196    24 roc_auc  binary     0.866    10 0.00787 Preprocessor1_Mode…
##  5    81  1331    37 accuracy binary     0.787    10 0.00466 Preprocessor1_Mode…
##  6    81  1331    37 roc_auc  binary     0.866    10 0.00723 Preprocessor1_Mode…
##  7   131  1641     7 accuracy binary     0.784    10 0.00711 Preprocessor1_Mode…
##  8   131  1641     7 roc_auc  binary     0.863    10 0.00801 Preprocessor1_Mode…
##  9   185   194    13 accuracy binary     0.772    10 0.00789 Preprocessor1_Mode…
## 10   185   194    13 roc_auc  binary     0.856    10 0.00624 Preprocessor1_Mode…
# Finalize the workflow with the tuning candidate that has the best accuracy,
# then run last_fit() on the initial train/test split.
# `metric` is passed by name: newer tune releases no longer accept it as a
# positional argument to select_best().
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "accuracy")) %>%
  last_fit(bigfoot_split)
xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits             id               .metrics .notes   .predictions .workflow 
##   <list>             <chr>            <list>   <list>   <list>       <list>    
## 1 <split [2364/789]> train/test split <tibble> <tibble> <tibble>     <workflow>
# Test-set accuracy and ROC AUC of the finalized model.
xgb_last %>%
  collect_metrics()
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.795 Preprocessor1_Model1
## 2 roc_auc  binary         0.878 Preprocessor1_Model1
# Confusion matrix of the test-set predictions against the true labels.
xgb_last %>%
  collect_predictions() %>%
  conf_mat(truth = classification, estimate = .pred_class)
##           Truth
## Prediction possible sighting
##   possible      318       70
##   sighting       92      309
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Variable-importance plot for the fitted engine object behind the final
# workflow.
vip(extract_fit_engine(xgb_last))

Questions

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. “Can we predict bigfoot sightings (possible or sighting) based on the words used in the description of the possible sighting?”

    • Describe the data briefly: Provide an overview of the new data set, highlighting its key characteristics and dimensions. The original data set is 5,021 observations of 28 variables. The data set contains data about different variables about conditions during the possible big foot sighting. Some variables are county, state, season, title, moon_phase, date, temperature_mid, temperature_low, temperature_high, and more.

    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the data set and their characteristics. The characteristics of the primary variables are character data, numerical data, and a few factor data that we converted. Temperature_mid, temperature_low, moon_phase and temperature_high are numerical data. Classification and observed were broken down into factor data, but were originally character data. Title, county, state, season are character data. Date is date data.

  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The original data had the classification variable as either Class A, Class B, or Class C. I used the mutate function so that Class A was sighting and Class B was possible. I used the filter function to filter out Class C entirely. I also unnested and tokenized the observed variable: for exploration I kept the words that appear more than 100 times, and for the model the recipe keeps the 200 most frequent tokens (after removing stop words) as tf-idf features, so that the model could use this variable to predict the sightings.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. I used filter(classification != “Class C”, !is.na(observed)) %>% mutate(classification = case_when( classification == “Class A” ~ “sighting”, classification == “Class B” ~ “possible”)) to filter out Class C in the classification variable. I also used the mutate function to change the name of Class A and Class B to sighting and possible. I used unnest_tokens(word, observed) %>% count(classification, word) %>% filter(n > 100) %>% bind_log_odds(classification, word, n) %>% arrange(-log_odds_weighted) to break the observed variable into individual words (keeping only words that appear more than 100 times within a classification) instead of full sentences, to see which words distinguish the two classes; the recipe then tokenizes observed in the same way so that the model can use the words to predict the sightings.
    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model we used was XGBoost. XGBoost was used to predict the classification of the bigfoot sighting, either sighting or possible, based on the text of the observed variable; classification, observed, season, state, county, temperature_mid, temperature_high, temperature_low, date, and moon_phase were selected when parsing the data, but only observed and classification were passed to the model. XGBoost is great at handling classification tasks. The model performed well overall, so I would say XGBoost was a good choice for this model.
  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. I used “collect_metrics(xgb_rs)”, “collect_metrics(xgb_last)”, and “collect_predictions(xgb_last) %>% conf_mat(classification, .pred_class)” for the metrics used in this evaluation. A confusion matrix describes the performance of a classification algorithm on a set of test data. These metrics provide a rate of predictions which is necessary for validating the reliability of the model’s predictive capabilities. The confusion matrix shows where the model may go wrong, which is helpful in understanding the limitations of the model as well as how it could be improved. These are important in the context of my research question as they may help me to determine which variable will give me the best prediction. I may want to change the variable I am using to predict if the confusion matrix proves that it may not be the best option.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. Relate these findings back to the research question and any similarities or differences compared to the CA assignment. The result of the collect_metrics(xgb_last) was accuracy = 79.5% and roc_auc = 87.8%. These are not too far off from what most of the accuracies are for the code along assignments, so I would say the model performs rather well. In the code along, the XGBoost model, after tuning, achieves an accuracy of approximately 81.6% and an ROC_AUC of approximately 88.9% on the test data. My model is very similar to this, so I will stick with these variables for the machine learning model.