library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Bigfoot report data published for TidyTuesday 2022-09-13.
bigfoot_raw <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv"
)
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): observed, location_details, county, state, season, title, classif...
## dbl (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# How many reports fall into each classification?
count(bigfoot_raw, classification)
## # A tibble: 3 × 2
## classification n
## <chr> <int>
## 1 Class A 2481
## 2 Class B 2510
## 3 Class C 30
# Drop reports without a written description and the Class C reports, then
# relabel the two remaining classes with descriptive names.
bigfoot <- bigfoot_raw %>%
  filter(!is.na(observed)) %>%
  filter(classification != "Class C") %>%
  mutate(
    classification = case_when(
      classification == "Class B" ~ "possible",
      classification == "Class A" ~ "sighting"
    )
  )

bigfoot
## # A tibble: 4,953 × 28
## observed location_details county state season title latitude longitude
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 "I was canoein… <NA> Winst… Alab… Summer <NA> NA NA
## 2 "Ed L. was sal… "East side of P… Valde… Alas… Fall <NA> NA NA
## 3 "While attendi… "Great swamp ar… Washi… Rhod… Fall Repo… 41.4 -71.5
## 4 "Hello, My nam… "I would rather… York … Penn… Summer <NA> NA NA
## 5 "It was May 19… "Logging roads … Yamhi… Oreg… Spring <NA> NA NA
## 6 "My two childr… "The creature c… Washi… Okla… Fall Repo… 35.3 -99.2
## 7 "I was staying… "Vincent, Ohio … Washi… Ohio Summer Repo… 39.4 -81.7
## 8 "Well last yea… "Both sightings… Westc… New … Fall Repo… 41.3 -73.7
## 9 "I grew up in … "The Western fa… Washo… Neva… Fall Repo… 39.6 -120.
## 10 "heh i kinda f… "the road is of… Warre… New … Fall <NA> NA NA
## # ℹ 4,943 more rows
## # ℹ 20 more variables: date <date>, number <dbl>, classification <chr>,
## # geohash <chr>, temperature_high <dbl>, temperature_mid <dbl>,
## # temperature_low <dbl>, dew_point <dbl>, humidity <dbl>, cloud_cover <dbl>,
## # moon_phase <dbl>, precip_intensity <dbl>, precip_probability <dbl>,
## # precip_type <chr>, pressure <dbl>, summary <chr>, uv_index <dbl>,
## # visibility <dbl>, wind_bearing <dbl>, wind_speed <dbl>
# Class labels ordered by report count. Only two labels remain after the
# recoding above, so `n = 6` simply keeps both of them.
# NOTE(review): `top_sightings` is not referenced anywhere in the visible code.
top_sightings <- bigfoot %>%
  count(classification) %>%
  slice_max(n, n = 6) %>%
  pull(classification)

# Reports per season, one facet per class. `bigfoot` was already restricted to
# non-"Class C" rows with a non-missing `observed` when it was built, and its
# labels are now "sighting"/"possible", so no further filtering is needed.
bigfoot %>%
  count(classification, season) %>%
  ggplot(aes(season, n, fill = season)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(classification), scales = "free_y") +
  labs(x = NULL, y = "Sightings in Seasons")

# Report counts for every state/season combination.
bigfoot %>%
  count(state, season)
## # A tibble: 220 × 3
## state season n
## <chr> <chr> <int>
## 1 Alabama Fall 21
## 2 Alabama Spring 14
## 3 Alabama Summer 31
## 4 Alabama Unknown 1
## 5 Alabama Winter 24
## 6 Alaska Fall 8
## 7 Alaska Spring 4
## 8 Alaska Summer 5
## 9 Alaska Winter 3
## 10 Arizona Fall 20
## # ℹ 210 more rows
# States with the four highest report counts.
top_state <- bigfoot %>%
  count(state) %>%
  slice_max(order_by = n, n = 4) %>%
  pull(state)

# Seasonal report counts for those states, one facet per state.
bigfoot %>%
  filter(state %in% top_state) %>%
  count(state, season) %>%
  ggplot(aes(x = season, y = n, fill = season)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(state), scales = "free_y") +
  labs(x = NULL, y = "States with Sightings")
library(tidytext)
library(tidylo)
# Weighted log odds of each word per class, computed on words that occur more
# than 100 times within a class, sorted from the largest value down.
bigfoot %>%
  unnest_tokens(output = word, input = observed) %>%
  count(classification, word) %>%
  filter(n > 100) %>%
  bind_log_odds(set = classification, feature = word, n = n) %>%
  arrange(desc(log_odds_weighted))
## # A tibble: 1,747 × 4
## classification word n log_odds_weighted
## <chr> <chr> <int> <dbl>
## 1 possible howl 455 14.7
## 2 sighting fur 362 13.3
## 3 possible heard 5397 12.7
## 4 possible screams 327 12.5
## 5 sighting ape 300 12.1
## 6 possible knocks 301 12.0
## 7 sighting hands 285 11.8
## 8 sighting headlights 283 11.7
## 9 possible listened 266 11.2
## 10 sighting witness 249 11.0
## # ℹ 1,737 more rows
# Modelling table: keep the candidate columns, reduce `date` to its year, drop
# incomplete rows, and turn every character column except `season` into a
# factor.
# NOTE(review): na.omit() removes rows with a missing value in ANY selected
# column (temperatures, moon phase, ...), although the split below keeps only
# `observed` and `classification` -- confirm this row loss is intended.
# NOTE(review): the free-text `observed` column is converted to a factor here
# as well -- confirm that is intended for the text recipe.
bigfoot_parsed <- bigfoot %>%
  select(
    classification, observed, season, state, county,
    temperature_mid, temperature_high, temperature_low,
    date, moon_phase
  ) %>%
  mutate(date = year(date)) %>%
  na.omit() %>%
  mutate(across(where(is.character) & !season, as.factor))
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
# Train/test split stratified by class; only the report text and its label are
# carried into modelling.
set.seed(123)
bigfoot_split <- initial_split(
  select(bigfoot_parsed, observed, classification),
  strata = classification
)
bigfoot_test <- testing(bigfoot_split)
bigfoot_train <- training(bigfoot_split)

# Stratified cross-validation folds built from the training set.
set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, strata = classification)
bigfoot_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [2127/237]> Fold01
## 2 <split [2127/237]> Fold02
## 3 <split [2127/237]> Fold03
## 4 <split [2127/237]> Fold04
## 5 <split [2127/237]> Fold05
## 6 <split [2127/237]> Fold06
## 7 <split [2127/237]> Fold07
## 8 <split [2129/235]> Fold08
## 9 <split [2129/235]> Fold09
## 10 <split [2129/235]> Fold10
library(embed)
library(textrecipes)
# Text preprocessing: tokenize `observed`, drop stop words, keep at most 200
# tokens, and weight them by tf-idf. Any remaining nominal predictors are
# dummy-encoded.
bigfoot_rec <- recipe(classification ~ ., data = bigfoot_train) %>%
  step_tokenize(observed) %>%
  step_stopwords(observed) %>%
  step_tokenfilter(observed, max_tokens = 200) %>%
  step_tfidf(observed) %>%
  step_dummy(all_nominal_predictors())

# Inspect the processed training data.
bigfoot_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 2,364
## Columns: 201
## $ classification <fct> possible, possible, possible, possible, possi…
## $ tfidf_observed_1 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_10 <dbl> 0.01611678, 0.00000000, 0.06148253, 0.0415007…
## $ tfidf_observed_15 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_2 <dbl> 0.00000000, 0.01717567, 0.00000000, 0.0373570…
## $ tfidf_observed_20 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_3 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_30 <dbl> 0.00000000, 0.00000000, 0.06247454, 0.0421703…
## $ tfidf_observed_4 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_5 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_6 <dbl> 0.00000000, 0.02139852, 0.00000000, 0.0000000…
## $ tfidf_observed_7 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_8 <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_across <dbl> 0.00000000, 0.00000000, 0.05877095, 0.0000000…
## $ tfidf_observed_almost <dbl> 0.00000000, 0.02135401, 0.00000000, 0.0000000…
## $ tfidf_observed_along <dbl> 0.03732727, 0.02209603, 0.00000000, 0.0000000…
## $ tfidf_observed_also <dbl> 0.01639157, 0.00000000, 0.00000000, 0.0422083…
## $ tfidf_observed_animal <dbl> 0.01613096, 0.05729273, 0.00000000, 0.0000000…
## $ tfidf_observed_another <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_anything <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_area <dbl> 0.032948453, 0.000000000, 0.041897415, 0.0282…
## $ tfidf_observed_arms <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_around <dbl> 0.010467666, 0.037178262, 0.000000000, 0.0000…
## $ tfidf_observed_asked <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_away <dbl> 0.01269058, 0.00000000, 0.04841223, 0.0000000…
## $ tfidf_observed_back <dbl> 0.037641063, 0.022281779, 0.000000000, 0.0484…
## $ tfidf_observed_bear <dbl> 0.00000000, 0.01930192, 0.06219507, 0.0000000…
## $ tfidf_observed_behind <dbl> 0.03106853, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_believe <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_big <dbl> 0.00000000, 0.01878458, 0.00000000, 0.0000000…
## $ tfidf_observed_bigfoot <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_black <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_brown <dbl> 0.03680474, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_brush <dbl> 0.04347373, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_came <dbl> 0.01374316, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_camp <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_can <dbl> 0.01844195, 0.02183357, 0.00000000, 0.0000000…
## $ tfidf_observed_car <dbl> 0.00000000, 0.00000000, 0.14193016, 0.0479014…
## $ tfidf_observed_close <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_closer <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_come <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_coming <dbl> 0.00000000, 0.00000000, 0.06519937, 0.0000000…
## $ tfidf_observed_county <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_covered <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_creature <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_creek <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_dark <dbl> 0.00000000, 0.01640980, 0.00000000, 0.0356913…
## $ tfidf_observed_day <dbl> 0.02830939, 0.03351572, 0.00000000, 0.0000000…
## $ tfidf_observed_decided <dbl> 0.03481885, 0.00000000, 0.00000000, 0.0448292…
## $ tfidf_observed_deer <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_direction <dbl> 0.00000000, 0.02357171, 0.00000000, 0.0000000…
## $ tfidf_observed_distance <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_dog <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_dogs <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_driving <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_even <dbl> 0.00000000, 0.02063148, 0.00000000, 0.0000000…
## $ tfidf_observed_ever <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_eyes <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_face <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_fast <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_feet <dbl> 0.03591763, 0.02834878, 0.00000000, 0.0308293…
## $ tfidf_observed_field <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_figure <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_first <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_foot <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_forest <dbl> 0.02094772, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_found <dbl> 0.00000000, 0.02043047, 0.00000000, 0.0000000…
## $ tfidf_observed_friend <dbl> 0.00000000, 0.02057066, 0.00000000, 0.0000000…
## $ tfidf_observed_friends <dbl> 0.00000000, 0.02493273, 0.00000000, 0.0000000…
## $ tfidf_observed_front <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_get <dbl> 0.01337659, 0.01583666, 0.00000000, 0.0688894…
## $ tfidf_observed_go <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_going <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_gone <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_good <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_got <dbl> 0.000000000, 0.014191172, 0.000000000, 0.0308…
## $ tfidf_observed_ground <dbl> 0.00000000, 0.02236678, 0.00000000, 0.0000000…
## $ tfidf_observed_hair <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_happened <dbl> 0.01767149, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_head <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_hear <dbl> 0.01634742, 0.00000000, 0.00000000, 0.0420946…
## $ tfidf_observed_heard <dbl> 0.000000000, 0.000000000, 0.000000000, 0.0000…
## $ tfidf_observed_high <dbl> 0.01976604, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_hill <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_home <dbl> 0.00000000, 0.00000000, 0.06164519, 0.0416105…
## $ tfidf_observed_house <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_human <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_hunting <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_husband <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_just <dbl> 0.03090724, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_knew <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_know <dbl> 0.01411328, 0.01670883, 0.05383955, 0.0000000…
## $ tfidf_observed_lake <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_large <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_last <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_later <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_left <dbl> 0.01365651, 0.04850417, 0.00000000, 0.0000000…
## $ tfidf_observed_legs <dbl> 0.00000000, 0.00000000, 0.07272292, 0.0000000…
## $ tfidf_observed_light <dbl> 0.03732727, 0.02209603, 0.00000000, 0.0000000…
## $ tfidf_observed_like <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0471493…
## $ tfidf_observed_line <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_little <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_long <dbl> 0.01376259, 0.03258729, 0.00000000, 0.0000000…
## $ tfidf_observed_look <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_looked <dbl> 0.01151843, 0.01363677, 0.04394069, 0.0000000…
## $ tfidf_observed_looking <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_loud <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0448292…
## $ tfidf_observed_made <dbl> 0.01504788, 0.05344590, 0.00000000, 0.0000000…
## $ tfidf_observed_make <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_man <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_many <dbl> 0.02059473, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_maybe <dbl> 0.01868413, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_mile <dbl> 0.00000000, 0.04468363, 0.00000000, 0.0485934…
## $ tfidf_observed_miles <dbl> 0.01889233, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_minutes <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_morning <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_moved <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_moving <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_much <dbl> 0.01725681, 0.02043047, 0.00000000, 0.0000000…
## $ tfidf_observed_near <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0436754…
## $ tfidf_observed_never <dbl> 0.01298908, 0.01537788, 0.00000000, 0.0000000…
## $ tfidf_observed_next <dbl> 0.00000000, 0.01804625, 0.00000000, 0.0000000…
## $ tfidf_observed_night <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_noise <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_north <dbl> 0.00000000, 0.02395283, 0.07718133, 0.0000000…
## $ tfidf_observed_nothing <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_noticed <dbl> 0.01760064, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_now <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_old <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0915625…
## $ tfidf_observed_one <dbl> 0.02202421, 0.02607464, 0.00000000, 0.0000000…
## $ tfidf_observed_outside <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_people <dbl> 0.01960142, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_point <dbl> 0.02091999, 0.02476734, 0.00000000, 0.0000000…
## $ tfidf_observed_ran <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_really <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_remember <dbl> 0.00000000, 0.02599292, 0.00000000, 0.0000000…
## $ tfidf_observed_right <dbl> 0.05413426, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_river <dbl> 0.04269379, 0.02527276, 0.00000000, 0.0549682…
## $ tfidf_observed_road <dbl> 0.000000000, 0.137924268, 0.133326793, 0.0599…
## $ tfidf_observed_run <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_running <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_said <dbl> 0.04300183, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_saw <dbl> 0.050348778, 0.000000000, 0.000000000, 0.0000…
## $ tfidf_observed_say <dbl> 0.01887124, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_scared <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_seconds <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_see <dbl> 0.042511833, 0.025165051, 0.040543693, 0.0000…
## $ tfidf_observed_seemed <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0494250…
## $ tfidf_observed_seen <dbl> 0.05451165, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_several <dbl> 0.00000000, 0.02214463, 0.00000000, 0.0000000…
## $ tfidf_observed_side <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0638579…
## $ tfidf_observed_since <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_small <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_smell <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_someone <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0517743…
## $ tfidf_observed_something <dbl> 0.00000000, 0.00000000, 0.04207650, 0.0284016…
## $ tfidf_observed_son <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_sound <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_sounded <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0464933…
## $ tfidf_observed_sounds <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_south <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_spot <dbl> 0.02114473, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_standing <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_started <dbl> 0.04454104, 0.00000000, 0.00000000, 0.0764621…
## $ tfidf_observed_still <dbl> 0.01544419, 0.01828451, 0.00000000, 0.0000000…
## $ tfidf_observed_stood <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_stopped <dbl> 0.00000000, 0.00000000, 0.05911250, 0.0000000…
## $ tfidf_observed_sure <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_tall <dbl> 0.01299761, 0.00000000, 0.04958346, 0.0000000…
## $ tfidf_observed_tell <dbl> 0.00000000, 0.02216903, 0.00000000, 0.0000000…
## $ tfidf_observed_tent <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_thing <dbl> 0.00000000, 0.00000000, 0.05633009, 0.0000000…
## $ tfidf_observed_think <dbl> 0.01709110, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_thought <dbl> 0.000000000, 0.015149946, 0.000000000, 0.0000…
## $ tfidf_observed_three <dbl> 0.01964808, 0.02326152, 0.00000000, 0.0000000…
## $ tfidf_observed_time <dbl> 0.010982818, 0.000000000, 0.000000000, 0.0000…
## $ tfidf_observed_times <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_told <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_took <dbl> 0.01534278, 0.03632888, 0.00000000, 0.0000000…
## $ tfidf_observed_top <dbl> 0.04228947, 0.07510026, 0.00000000, 0.0000000…
## $ tfidf_observed_towards <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_observed_tracks <dbl> 0.00000000, 0.07231156, 0.00000000, 0.0000000…
## $ tfidf_observed_trail <dbl> 0.01962472, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_tree <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_trees <dbl> 0.03275364, 0.01938865, 0.00000000, 0.0000000…
## $ tfidf_observed_truck <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_turned <dbl> 0.00000000, 0.01718871, 0.05538585, 0.0000000…
## $ tfidf_observed_two <dbl> 0.02549479, 0.06036698, 0.04862896, 0.0000000…
## $ tfidf_observed_us <dbl> 0.06460685, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_walk <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0976232…
## $ tfidf_observed_walked <dbl> 0.00000000, 0.01817935, 0.00000000, 0.0000000…
## $ tfidf_observed_walking <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0362357…
## $ tfidf_observed_water <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_way <dbl> 0.02723682, 0.01612294, 0.00000000, 0.0000000…
## $ tfidf_observed_well <dbl> 0.00000000, 0.02109177, 0.00000000, 0.0000000…
## $ tfidf_observed_went <dbl> 0.00000000, 0.01382404, 0.00000000, 0.0300672…
## $ tfidf_observed_wife <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_observed_window <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
## $ tfidf_observed_woods <dbl> 0.01239959, 0.01467998, 0.09460431, 0.0638579…
## $ tfidf_observed_yards <dbl> 0.01548264, 0.01833002, 0.05906341, 0.0000000…
## $ tfidf_observed_years <dbl> 0.00000000, 0.00000000, 0.00000000, 0.0000000…
# Boosted-tree classifier on the xgboost engine with a fixed learning rate;
# `mtry`, `trees`, and `min_n` are left open for tuning.
xgb_spec <- boost_tree(
  mtry = tune(),
  trees = tune(),
  min_n = tune(),
  learn_rate = 0.01
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# Bundle the recipe and the model specification into one workflow.
xgb_wf <- workflow(preprocessor = bigfoot_rec, spec = xgb_spec)
library(finetune)
# Tune the workflow over a grid of 5 candidate parameter sets on the
# cross-validation folds, keeping the out-of-fold predictions.
doParallel::registerDoParallel()
set.seed(345)
xgb_rs <- tune_grid(
  object = xgb_wf,
  resamples = bigfoot_folds,
  grid = 5,
  control = control_grid(save_pred = TRUE, verbose = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [2127/237]> Fold01 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 2 <split [2127/237]> Fold02 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 3 <split [2127/237]> Fold03 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 4 <split [2127/237]> Fold04 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 5 <split [2127/237]> Fold05 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 6 <split [2127/237]> Fold06 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 7 <split [2127/237]> Fold07 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 8 <split [2129/235]> Fold08 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 9 <split [2129/235]> Fold09 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
## 10 <split [2129/235]> Fold10 <tibble [10 × 7]> <tibble [0 × 3]> <tibble>
# Resampled performance of every tuning candidate.
xgb_rs %>%
  collect_metrics()
## # A tibble: 10 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 41 606 28 accuracy binary 0.781 10 0.00516 Preprocessor1_Mode…
## 2 41 606 28 roc_auc binary 0.866 10 0.00698 Preprocessor1_Mode…
## 3 46 1196 24 accuracy binary 0.781 10 0.00723 Preprocessor1_Mode…
## 4 46 1196 24 roc_auc binary 0.866 10 0.00787 Preprocessor1_Mode…
## 5 81 1331 37 accuracy binary 0.787 10 0.00466 Preprocessor1_Mode…
## 6 81 1331 37 roc_auc binary 0.866 10 0.00723 Preprocessor1_Mode…
## 7 131 1641 7 accuracy binary 0.784 10 0.00711 Preprocessor1_Mode…
## 8 131 1641 7 roc_auc binary 0.863 10 0.00801 Preprocessor1_Mode…
## 9 185 194 13 accuracy binary 0.772 10 0.00789 Preprocessor1_Mode…
## 10 185 194 13 roc_auc binary 0.856 10 0.00624 Preprocessor1_Mode…
# Finalize the workflow with the tuning candidate that achieved the best
# resampled accuracy, then run the final fit on `bigfoot_split`.
# `metric` is passed by name: newer tune releases place `...` before `metric`
# in select_best(), so a positional metric is no longer matched to it.
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "accuracy")) %>%
  last_fit(bigfoot_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [2364/789]> train/test split <tibble> <tibble> <tibble> <workflow>
# Metrics from the final fit.
xgb_last %>%
  collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.795 Preprocessor1_Model1
## 2 roc_auc binary 0.878 Preprocessor1_Model1
# Confusion matrix of the final fit's predictions against the true class.
xgb_last %>%
  collect_predictions() %>%
  conf_mat(truth = classification, estimate = .pred_class)
## Truth
## Prediction possible sighting
## possible 318 70
## sighting 92 309
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# Variable-importance plot for the underlying fitted engine model.
vip(extract_fit_engine(xgb_last))
What is the research question? Clearly state the research question you aim to address using the new dataset. “Can we predict the classification of a Bigfoot report (possible or sighting) based on the words used in the description of the reported encounter?”
Describe the data briefly: Provide an overview of the new data set, highlighting its key characteristics and dimensions. The original data set has 5,021 observations of 28 variables. It describes the conditions surrounding each reported Bigfoot sighting. The variables include county, state, season, title, moon_phase, date, temperature_mid, temperature_low, temperature_high, and more.
What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the data set and their characteristics. The primary variables are a mix of character data, numerical data, and a few factor variables that we converted. Temperature_mid, temperature_low, temperature_high, and moon_phase are numerical data. Classification and observed were originally character data, but we converted them to factors. Title, county, state, and season are character data. Date is date data.