library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
bigfoot_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): observed, location_details, county, state, season, title, classif...
## dbl (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bigfoot_raw %>%
count(classification)
## # A tibble: 3 × 2
## classification n
## <chr> <int>
## 1 Class A 2481
## 2 Class B 2510
## 3 Class C 30
bigfoot <-
bigfoot_raw %>%
filter(classification != "Class C", !is.na(observed)) %>%
mutate(
classification = case_when(
classification == "Class A" ~ "sighting",
classification == "Class B" ~ "possible"
)
)
bigfoot
## # A tibble: 4,953 × 28
## observed location_details county state season title latitude longitude
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 "I was canoein… <NA> Winst… Alab… Summer <NA> NA NA
## 2 "Ed L. was sal… "East side of P… Valde… Alas… Fall <NA> NA NA
## 3 "While attendi… "Great swamp ar… Washi… Rhod… Fall Repo… 41.4 -71.5
## 4 "Hello, My nam… "I would rather… York … Penn… Summer <NA> NA NA
## 5 "It was May 19… "Logging roads … Yamhi… Oreg… Spring <NA> NA NA
## 6 "My two childr… "The creature c… Washi… Okla… Fall Repo… 35.3 -99.2
## 7 "I was staying… "Vincent, Ohio … Washi… Ohio Summer Repo… 39.4 -81.7
## 8 "Well last yea… "Both sightings… Westc… New … Fall Repo… 41.3 -73.7
## 9 "I grew up in … "The Western fa… Washo… Neva… Fall Repo… 39.6 -120.
## 10 "heh i kinda f… "the road is of… Warre… New … Fall <NA> NA NA
## # ℹ 4,943 more rows
## # ℹ 20 more variables: date <date>, number <dbl>, classification <chr>,
## # geohash <chr>, temperature_high <dbl>, temperature_mid <dbl>,
## # temperature_low <dbl>, dew_point <dbl>, humidity <dbl>, cloud_cover <dbl>,
## # moon_phase <dbl>, precip_intensity <dbl>, precip_probability <dbl>,
## # precip_type <chr>, pressure <dbl>, summary <chr>, uv_index <dbl>,
## # visibility <dbl>, wind_bearing <dbl>, wind_speed <dbl>
library(tidytext)
library(tidylo)
bigfoot %>%
unnest_tokens(word, observed) %>%
count(classification, word) %>%
filter(n > 100) %>%
bind_log_odds(classification, word, n) %>%
arrange(-log_odds_weighted)
## # A tibble: 1,747 × 4
## classification word n log_odds_weighted
## <chr> <chr> <int> <dbl>
## 1 possible howl 455 14.7
## 2 sighting fur 362 13.3
## 3 possible heard 5397 12.7
## 4 possible screams 327 12.5
## 5 sighting ape 300 12.1
## 6 possible knocks 301 12.0
## 7 sighting hands 285 11.8
## 8 sighting headlights 283 11.7
## 9 possible listened 266 11.2
## 10 sighting witness 249 11.0
## # ℹ 1,737 more rows
Build a Model
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
bigfoot_split <-
bigfoot %>%
select(observed, classification) %>%
initial_split(strata = classification)
bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)
set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, strata = classification)
bigfoot_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [3342/372]> Fold01
## 2 <split [3342/372]> Fold02
## 3 <split [3342/372]> Fold03
## 4 <split [3342/372]> Fold04
## 5 <split [3343/371]> Fold05
## 6 <split [3343/371]> Fold06
## 7 <split [3343/371]> Fold07
## 8 <split [3343/371]> Fold08
## 9 <split [3343/371]> Fold09
## 10 <split [3343/371]> Fold10
library(textrecipes)
bigfoot_rec <-
recipe(classification ~ observed, data = bigfoot_train) %>%
step_tokenize(observed) %>%
step_tokenfilter(observed, max_tokens = 2e3) %>%
step_tfidf(observed)
bigfoot_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 1
##
## ── Operations
## • Tokenization for: observed
## • Text filtering for: observed
## • Term frequency-inverse document frequency with: observed
glmnet_spec <-
logistic_reg(mixture = 1, penalty = tune()) %>%
set_engine("glmnet")
bigfoot_wf <- workflow(bigfoot_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
bigfoot_res <-
tune_grid(
bigfoot_wf,
bigfoot_folds,
grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
)
autoplot(bigfoot_res)

show_best(bigfoot_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0158 roc_auc binary 0.857 10 0.00463 Preprocessor1_Model05
## 2 0.00794 roc_auc binary 0.853 10 0.00545 Preprocessor1_Model04
## 3 0.0316 roc_auc binary 0.847 10 0.00487 Preprocessor1_Model06
## 4 0.00398 roc_auc binary 0.837 10 0.00675 Preprocessor1_Model03
## 5 0.0631 roc_auc binary 0.827 10 0.00506 Preprocessor1_Model07
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .loss
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.0316 roc_auc binary 0.847 10 0.00487 Preprocessor1_Mode… 0.857 1.13
bigfoot_final <-
bigfoot_wf %>%
finalize_workflow(
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
) %>%
last_fit(bigfoot_split)
bigfoot_final
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [3714/1239]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(bigfoot_final)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.764 Preprocessor1_Model1
## 2 roc_auc binary 0.836 Preprocessor1_Model1
collect_predictions(bigfoot_final) %>%
conf_mat(classification, .pred_class)
## Truth
## Prediction possible sighting
## possible 491 158
## sighting 134 456
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
bigfoot_final %>%
extract_fit_engine() %>%
vi()
## # A tibble: 2,000 × 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 tfidf_observed_muscular 860. POS
## 2 tfidf_observed_hunched 810. POS
## 3 tfidf_observed_nose 804. POS
## 4 tfidf_observed_shaggy 779. POS
## 5 tfidf_observed_guessing 768. POS
## 6 tfidf_observed_whooping 716. NEG
## 7 tfidf_observed_especially 682. NEG
## 8 tfidf_observed_putting 673. POS
## 9 tfidf_observed_literally 664. NEG
## 10 tfidf_observed_admit 621. POS
## # ℹ 1,990 more rows