library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
bigfoot_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): observed, location_details, county, state, season, title, classif...
## dbl (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bigfoot_raw %>%
count(classification)
## # A tibble: 3 × 2
## classification n
## <chr> <int>
## 1 Class A 2481
## 2 Class B 2510
## 3 Class C 30
bigfoot <-
bigfoot_raw %>%
filter(classification != "Class C", !is.na(observed)) %>%
mutate(
classification = case_when(
classification == "Class A" ~ "sighting",
classification == "Class B" ~ "possible"
))
bigfoot
## # A tibble: 4,953 × 28
## observed location_details county state season title latitude longitude
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 "I was canoein… <NA> Winst… Alab… Summer <NA> NA NA
## 2 "Ed L. was sal… "East side of P… Valde… Alas… Fall <NA> NA NA
## 3 "While attendi… "Great swamp ar… Washi… Rhod… Fall Repo… 41.4 -71.5
## 4 "Hello, My nam… "I would rather… York … Penn… Summer <NA> NA NA
## 5 "It was May 19… "Logging roads … Yamhi… Oreg… Spring <NA> NA NA
## 6 "My two childr… "The creature c… Washi… Okla… Fall Repo… 35.3 -99.2
## 7 "I was staying… "Vincent, Ohio … Washi… Ohio Summer Repo… 39.4 -81.7
## 8 "Well last yea… "Both sightings… Westc… New … Fall Repo… 41.3 -73.7
## 9 "I grew up in … "The Western fa… Washo… Neva… Fall Repo… 39.6 -120.
## 10 "heh i kinda f… "the road is of… Warre… New … Fall <NA> NA NA
## # ℹ 4,943 more rows
## # ℹ 20 more variables: date <date>, number <dbl>, classification <chr>,
## # geohash <chr>, temperature_high <dbl>, temperature_mid <dbl>,
## # temperature_low <dbl>, dew_point <dbl>, humidity <dbl>, cloud_cover <dbl>,
## # moon_phase <dbl>, precip_intensity <dbl>, precip_probability <dbl>,
## # precip_type <chr>, pressure <dbl>, summary <chr>, uv_index <dbl>,
## # visibility <dbl>, wind_bearing <dbl>, wind_speed <dbl>
library(tidytext)
library(tidylo)
bigfoot %>%
unnest_tokens(word, observed) %>%
count(classification, word) %>%
filter(n > 100) %>%
bind_log_odds(classification, word, n) %>%
arrange(-log_odds_weighted)
## # A tibble: 1,747 × 4
## classification word n log_odds_weighted
## <chr> <chr> <int> <dbl>
## 1 possible howl 455 14.7
## 2 sighting fur 362 13.3
## 3 possible heard 5397 12.7
## 4 possible screams 327 12.5
## 5 sighting ape 300 12.1
## 6 possible knocks 301 12.0
## 7 sighting hands 285 11.8
## 8 sighting headlights 283 11.7
## 9 possible listened 266 11.2
## 10 sighting witness 249 11.0
## # ℹ 1,737 more rows
Build a model
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
set.seed(123)
bigfoot_split <-
bigfoot %>%
select(observed, classification) %>%
initial_split(strata = classification)
bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)
set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, strata = classification)
bigfoot_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [3342/372]> Fold01
## 2 <split [3342/372]> Fold02
## 3 <split [3342/372]> Fold03
## 4 <split [3342/372]> Fold04
## 5 <split [3343/371]> Fold05
## 6 <split [3343/371]> Fold06
## 7 <split [3343/371]> Fold07
## 8 <split [3343/371]> Fold08
## 9 <split [3343/371]> Fold09
## 10 <split [3343/371]> Fold10
library(textrecipes)
bigfoot_rec <-
recipe(classification ~ observed, data = bigfoot_train) %>%
step_tokenize(observed) %>%
step_tokenfilter(observed, max_tokens = 2e3) %>%
step_tfidf(observed)
bigfoot_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 1
##
## ── Operations
## • Tokenization for: observed
## • Text filtering for: observed
## • Term frequency-inverse document frequency with: observed
glmnet_spec <-
logistic_reg(mixture = 1, penalty = tune()) %>%
set_engine("glmnet")
bigfoot_wf <- workflow(bigfoot_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
bigfoot_res <-
tune_grid(
bigfoot_wf,
bigfoot_folds,
grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
)
autoplot(bigfoot_res)

show_best(bigfoot_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0158 roc_auc binary 0.857 10 0.00463 Preprocessor1_Model05
## 2 0.00794 roc_auc binary 0.853 10 0.00545 Preprocessor1_Model04
## 3 0.0316 roc_auc binary 0.847 10 0.00487 Preprocessor1_Model06
## 4 0.00398 roc_auc binary 0.837 10 0.00675 Preprocessor1_Model03
## 5 0.0631 roc_auc binary 0.827 10 0.00506 Preprocessor1_Model07
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .loss
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.0316 roc_auc binary 0.847 10 0.00487 Preprocessor1_Mode… 0.857 1.13
bigfoot_final <-
bigfoot_wf %>%
finalize_workflow(
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
) %>%
last_fit(bigfoot_split)
bigfoot_final
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [3714/1239]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(bigfoot_final)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.764 Preprocessor1_Model1
## 2 roc_auc binary 0.836 Preprocessor1_Model1
collect_predictions(bigfoot_final) %>%
conf_mat(classification, .pred_class)
## Truth
## Prediction possible sighting
## possible 491 158
## sighting 134 456
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
bigfoot_final %>%
extract_fit_engine() %>%
vi()
## # A tibble: 2,000 × 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 tfidf_observed_muscular 860. POS
## 2 tfidf_observed_hunched 810. POS
## 3 tfidf_observed_nose 804. POS
## 4 tfidf_observed_shaggy 779. POS
## 5 tfidf_observed_guessing 768. POS
## 6 tfidf_observed_whooping 716. NEG
## 7 tfidf_observed_especially 682. NEG
## 8 tfidf_observed_putting 673. POS
## 9 tfidf_observed_literally 664. NEG
## 10 tfidf_observed_admit 621. POS
## # ℹ 1,990 more rows
Questions
- Question and Data:
- What is the research question? Clearly state the research question
you aim to address using the new dataset. Can we predict if there was a
sighting or a possible sighting of reported Bigfoot encounters based on
the textual descriptions of the encounters?
- Describe the data briefly: Provide an overview of the new dataset,
highlighting its key characteristics and dimensions. The data set
contains information about Bigfoot encounters, including textual
descriptions, geographical details, weather-related data, and other
important information about the sighting. A few key characteristics are
the classification column, which describes if the sighting was for sure
or if it was just possible. Another key characteristic is the title
column, which contains the description of the sighting. The new data set
has 4,953 observations of 28 variables after cleaning.
- What are the characteristics of the key variables used in the
analysis? Describe the primary variables of interest in the dataset and
their characteristics. The primary variables of interest are
classification and title. The classification data is character data and
classifies what type of sighting it was. The title data is also
character data and provides a description of the sighting from the
sighter.
- Data Exploration and Transformation:
- Describe the differences between the original data and the data
transformed for modeling. Why? Explain any preprocessing or
transformations performed on the new dataset compared to the original
data. Discuss why these changes were necessary or beneficial. The
original data set had 5,021 observations of 28 variables. A few of the
variables were observed, location_details, county, state, season, date,
classification and more. These varaiables are details that tell us about
the possible Bigfoot sighting. Things like county and state are
important to know, as well as the date and season of the possible
sighting. The original data set had class a, class b, and class c as the
data in the classification column. We filtered this data and removed
Class C as well as omitted NA values. We also changed the classification
column data from class a and b to either sighting or possible. We also
used unnest_tokens on the title column to create a word column that was
used to help predict the model. We also filtered it by 100.
- Data Preparation and Modeling:
- What are the names of data preparation steps mentioned in the video?
List and describe any data preparation steps or techniques mentioned in
the CA video that you applied to the new dataset. The preperation steps
are as follows: First we used “filter(classification !=”Class C”,
!is.na(observed)) %>% mutate( classification = case_when(
classification == “Class A” ~ “sighting”, classification == “Class B” ~
“possible”” to filter out the Class C observation type in the
classification column. We also filtered out NA values. We also changed
the classification types from Class A and B to sighting or possible. The
next step was: “unnest_tokens(word, observed) %>%
count(classification, word) %>% filter(n > 100) %>%
bind_log_odds(classification, word, n) %>%
arrange(-log_odds_weighted)” We tokenized the title column by the top
100 words so that we could use them to help predict the sightings.
- What is the name of the machine learning model(s) used in the
analysis? Specify the machine learning model(s) you employed for your
analysis and briefly explain their relevance to the research question.
The machine learning models used were the logistic regression model,
specifically glmnet. Logistic regression is well-suited for
classification problems when the goal is to assign observations to one
of two classes (possible or sighting). Logistic regression is also easy
to interpret the impact of the predictor variables on the probability of
belonging to one of two classes. Logistic regression is also good with
handling text data.
- Model Evaluation:
- What metrics are used in the model evaluation? Detail the evaluation
metrics you used to assess the performance of your machine learning
model(s) on the new dataset. Discuss the significance of these metrics
in the context of your research question. The metrics used was the
receiver operating characteristic area under the curve (roc_auc). We
used this to assess the performance of the machine learning model. This
metric is good at classification, and we want to classify whether the
Bigfoot encounter was either a sighting or possible. Roc_auc measures
the models ability to differentiate between two classes and a higher
roc_auc indicates a better distinction between the two classes. This is
important as it shows us if the model is able to differentiate between
possible or sightings.
- Conclusion:
- What are the major findings? Summarize the key findings and insights
obtained from your analysis of the new dataset. Our model was able to
pick up on the patterns in the language that people used in their
description of the sighting. The model could differentiate between how
people spoke about their experience when it was classified as possible
vs how they spoke when it was classified as sighting. Using the
different ways of speaking, we tokenized the descriptions in the title
column so that the model could see the top 100 words. The model then
used those words to classify if it was possible or sighting.