library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
bigfoot_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): observed, location_details, county, state, season, title, classif...
## dbl (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bigfoot_raw %>%
count(classification)
## # A tibble: 3 × 2
## classification n
## <chr> <int>
## 1 Class A 2481
## 2 Class B 2510
## 3 Class C 30
bigfoot <-
bigfoot_raw %>%
filter(classification != "Class C", !is.na(observed)) %>%
mutate(classification = case_when(
classification == "Class A" ~ "sighting",
classification == "Class B" ~ "possible"
))
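As a quick sanity check (a sketch; its output is not reproduced here), the relabeled classes can be counted to confirm that only "sighting" and "possible" remain after filtering:
bigfoot %>%
# confirm only the two relabeled classes are left
count(classification)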
library(tidytext)
library(tidylo)
bigfoot %>%
unnest_tokens(word, observed) %>%
count(classification, word) %>%
filter(n > 100) %>%
bind_log_odds(classification, word, n) %>%
arrange(-log_odds_weighted)
## # A tibble: 1,747 × 4
## classification word n log_odds_weighted
## <chr> <chr> <int> <dbl>
## 1 possible howl 455 14.7
## 2 sighting fur 362 13.3
## 3 possible heard 5397 12.7
## 4 possible screams 327 12.5
## 5 sighting ape 300 12.1
## 6 possible knocks 301 12.0
## 7 sighting hands 285 11.8
## 8 sighting headlights 283 11.7
## 9 possible listened 266 11.2
## 10 sighting witness 249 11.0
## # ℹ 1,737 more rows
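As a visual follow-up (a sketch, not part of the original output), the same weighted log odds can be plotted for the top words in each class; `slice_max()`, `reorder_within()`, and `scale_y_reordered()` come from dplyr and tidytext, which are already loaded.
# sketch: top 10 words per class by weighted log odds
bigfoot %>%
unnest_tokens(word, observed) %>%
count(classification, word) %>%
filter(n > 100) %>%
bind_log_odds(classification, word, n) %>%
group_by(classification) %>%
slice_max(log_odds_weighted, n = 10) %>%
ungroup() %>%
ggplot(aes(log_odds_weighted,
reorder_within(word, log_odds_weighted, classification),
fill = classification)) +
geom_col(show.legend = FALSE) +
scale_y_reordered() +
facet_wrap(vars(classification), scales = "free_y") +
labs(x = "Weighted log odds", y = NULL)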
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
bigfoot_split <-
bigfoot %>%
select(observed, classification) %>%
initial_split(strata = classification)
bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)
set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, v = 5, strata = classification)
bigfoot_folds
## # 5-fold cross-validation using stratification
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [2971/743]> Fold1
## 2 <split [2971/743]> Fold2
## 3 <split [2971/743]> Fold3
## 4 <split [2971/743]> Fold4
## 5 <split [2972/742]> Fold5
library(textrecipes)
bigfoot_rec <-
recipe(classification ~ observed, data = bigfoot_train) %>%
step_tokenize(observed) %>%
step_tokenfilter(observed, max_tokens = 2e3) %>%
step_tfidf(observed)
bigfoot_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 1
##
## ── Operations
## • Tokenization for: observed
## • Text filtering for: observed
## • Term frequency-inverse document frequency with: observed
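As a quick check that the recipe produces the expected feature space (a sketch, not in the original output), it can be prepped on the training data and baked; the result should have one row per training report and roughly 2,000 TF-IDF columns plus the outcome.
# sketch: estimate the recipe and check the dimensions of the feature matrix
bigfoot_prep <- prep(bigfoot_rec)
bake(bigfoot_prep, new_data = NULL) %>%
dim()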
glmnet_spec <-
logistic_reg(mixture = 1, penalty = tune()) %>%
set_engine("glmnet")
bigfoot_wf <- workflow(bigfoot_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
bigfoot_res <-
tune_grid(
bigfoot_wf,
bigfoot_folds,
grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
)
autoplot(bigfoot_res)
show_best(bigfoot_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0158 roc_auc binary 0.855 5 0.00497 Preprocessor1_Model05
## 2 0.00794 roc_auc binary 0.849 5 0.00561 Preprocessor1_Model04
## 3 0.0316 roc_auc binary 0.847 5 0.00585 Preprocessor1_Model06
## 4 0.0631 roc_auc binary 0.826 5 0.00611 Preprocessor1_Model07
## 5 0.00398 roc_auc binary 0.824 5 0.00701 Preprocessor1_Model03
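For comparison (a sketch, not part of the original output), `select_best()` would simply take the penalty with the highest ROC AUC, whereas `select_by_pct_loss()` below deliberately picks the largest penalty, and therefore the sparsest model, whose performance is within the default 2% of the best.
# sketch: the single best penalty by ROC AUC, for comparison
select_best(bigfoot_res, metric = "roc_auc")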
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .loss
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.0316 roc_auc binary 0.847 5 0.00585 Preprocessor1_Mode… 0.855 1.00
bigfoot_final <-
bigfoot_wf %>%
finalize_workflow(
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
) %>%
last_fit(bigfoot_split)
bigfoot_final
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [3714/1239]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(bigfoot_final)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.764 Preprocessor1_Model1
## 2 roc_auc binary 0.836 Preprocessor1_Model1
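A ROC curve for the test set can also be drawn (a sketch, not in the original output); with the relabeled classes, the probability column for the first factor level, "possible", is `.pred_possible`.
# sketch: ROC curve on the held-out test set
collect_predictions(bigfoot_final) %>%
roc_curve(classification, .pred_possible) %>%
autoplot()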
collect_predictions(bigfoot_final) %>%
conf_mat(classification, .pred_class)
## Truth
## Prediction possible sighting
## possible 491 158
## sighting 134 456
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
bigfoot_final %>%
extract_fit_engine() %>%
vi()
## # A tibble: 2,000 × 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 tfidf_observed_muscular 860. POS
## 2 tfidf_observed_hunched 810. POS
## 3 tfidf_observed_nose 804. POS
## 4 tfidf_observed_shaggy 779. POS
## 5 tfidf_observed_guessing 768. POS
## 6 tfidf_observed_whooping 716. NEG
## 7 tfidf_observed_especially 682. NEG
## 8 tfidf_observed_putting 673. POS
## 9 tfidf_observed_literally 664. NEG
## 10 tfidf_observed_admit 621. POS
## # ℹ 1,990 more rows
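To make these importance scores easier to read (a sketch, not in the original output), the `vi()` tibble above can be plotted for the top terms in each coefficient direction; `str_remove()` and `fct_reorder()` come from stringr and forcats, loaded with the tidyverse.
# sketch: top terms by importance, split by coefficient sign (POS vs NEG)
bigfoot_final %>%
extract_fit_engine() %>%
vi() %>%
group_by(Sign) %>%
slice_max(Importance, n = 15) %>%
ungroup() %>%
mutate(Variable = str_remove(Variable, "tfidf_observed_")) %>%
ggplot(aes(Importance, fct_reorder(Variable, Importance), fill = Sign)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(Sign), scales = "free_y") +
labs(y = NULL)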
The original data consisted of 5,021 observations and 28 variables, the key one being the “observed” variable, the free-text description of each encounter, which was used to predict the classification.
In the analysis we used only the text in the observed variable. Many of the other variables had substantial missing data, and either dropping those records or imputing the missing values would likely have hurt the results.
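A quick look at missingness supports that choice (a sketch; the exact output is not reproduced here):
# sketch: proportion of missing values per column in the raw data
bigfoot_raw %>%
summarise(across(everything(), ~ mean(is.na(.x)))) %>%
pivot_longer(everything(), names_to = "column", values_to = "prop_missing") %>%
arrange(desc(prop_missing))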
The bigfoot_raw dataset contained 28 columns of Bigfoot encounter details. To prepare it for modeling, the less credible Class C reports were removed and the Class A and Class B reports were relabeled as “sighting” and “possible”. The encounter descriptions were then tokenized into individual words and their frequencies counted; weighted log odds highlighted the words most characteristic of each class, and term frequency-inverse document frequency (TF-IDF) was used to weight the word features for the model. This streamlined representation keeps the analysis clear and is what the predictive model was trained on.
The machine learning model used in the analysis is a lasso-regularized logistic regression (mixture = 1, with the penalty tuned), implemented with the glmnet engine.
The model evaluation employed two metrics: accuracy, the proportion of correct predictions, and ROC AUC, which assesses the model’s ability to distinguish between the two classes via the area under the receiver operating characteristic curve.
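The reported accuracy can be checked directly against the confusion matrix above: 491 + 456 reports were classified correctly out of the 1,239 in the test set.
(491 + 456) / (491 + 158 + 134 + 456)
## [1] 0.7643261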
The analysis of the Bigfoot encounter data revealed clear differences in the language used for the two types of encounters. Words like “howl”, “heard”, and “screams” were associated with “possible” encounters, while terms such as “fur”, “ape”, and “hands” were linked to visual “sightings”. The tuned lasso logistic regression model achieved an accuracy of 76.4% and a ROC AUC of 0.836 on the test data. Important terms pushing predictions toward sightings included “muscular”, “hunched”, and “nose”, while words like “whooping” and “especially” pushed predictions the other way.
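Finally, a minimal sketch of how the fitted workflow could be used on a new report (the example sentence is made up; `extract_workflow()` pulls the trained workflow out of the `last_fit()` result):
# sketch: classify a new, made-up encounter description
bigfoot_trained <- extract_workflow(bigfoot_final)
predict(
bigfoot_trained,
new_data = tibble(observed = "We heard a long howl and several wood knocks across the valley")
)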