library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
bigfoot_raw <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): observed, location_details, county, state, season, title, classif...
## dbl  (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bigfoot_raw %>%
  count(classification)
## # A tibble: 3 × 2
##   classification     n
##   <chr>          <int>
## 1 Class A         2481
## 2 Class B         2510
## 3 Class C           30
bigfoot <-
  bigfoot_raw %>%
  filter(classification != "Class C", !is.na(observed)) %>%
  mutate(
    classification = case_when(
      classification == "Class A" ~ "sighting",
      classification == "Class B" ~ "possible"
    ))

bigfoot
## # A tibble: 4,953 × 28
##    observed        location_details county state season title latitude longitude
##    <chr>           <chr>            <chr>  <chr> <chr>  <chr>    <dbl>     <dbl>
##  1 "I was canoein…  <NA>            Winst… Alab… Summer <NA>      NA        NA  
##  2 "Ed L. was sal… "East side of P… Valde… Alas… Fall   <NA>      NA        NA  
##  3 "While attendi… "Great swamp ar… Washi… Rhod… Fall   Repo…     41.4     -71.5
##  4 "Hello, My nam… "I would rather… York … Penn… Summer <NA>      NA        NA  
##  5 "It was May 19… "Logging roads … Yamhi… Oreg… Spring <NA>      NA        NA  
##  6 "My two childr… "The creature c… Washi… Okla… Fall   Repo…     35.3     -99.2
##  7 "I was staying… "Vincent, Ohio … Washi… Ohio  Summer Repo…     39.4     -81.7
##  8 "Well last yea… "Both sightings… Westc… New … Fall   Repo…     41.3     -73.7
##  9 "I grew up in … "The Western fa… Washo… Neva… Fall   Repo…     39.6    -120. 
## 10 "heh i kinda f… "the road is of… Warre… New … Fall   <NA>      NA        NA  
## # ℹ 4,943 more rows
## # ℹ 20 more variables: date <date>, number <dbl>, classification <chr>,
## #   geohash <chr>, temperature_high <dbl>, temperature_mid <dbl>,
## #   temperature_low <dbl>, dew_point <dbl>, humidity <dbl>, cloud_cover <dbl>,
## #   moon_phase <dbl>, precip_intensity <dbl>, precip_probability <dbl>,
## #   precip_type <chr>, pressure <dbl>, summary <chr>, uv_index <dbl>,
## #   visibility <dbl>, wind_bearing <dbl>, wind_speed <dbl>
library(tidytext)
library(tidylo)

bigfoot %>%
    unnest_tokens(word, observed) %>%
    count(classification, word) %>%
    filter(n > 100) %>%
    bind_log_odds(classification, word, n) %>%
    arrange(-log_odds_weighted)
## # A tibble: 1,747 × 4
##    classification word           n log_odds_weighted
##    <chr>          <chr>      <int>             <dbl>
##  1 possible       howl         455              14.7
##  2 sighting       fur          362              13.3
##  3 possible       heard       5397              12.7
##  4 possible       screams      327              12.5
##  5 sighting       ape          300              12.1
##  6 possible       knocks       301              12.0
##  7 sighting       hands        285              11.8
##  8 sighting       headlights   283              11.7
##  9 possible       listened     266              11.2
## 10 sighting       witness      249              11.0
## # ℹ 1,737 more rows

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
set.seed(123)
bigfoot_split <-
  bigfoot %>%
  select(observed, classification) %>%
  initial_split(strata = classification)

bigfoot_train <- training(bigfoot_split)
bigfoot_test <- testing(bigfoot_split)

set.seed(234)
bigfoot_folds <- vfold_cv(bigfoot_train, strata = classification)
bigfoot_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [3342/372]> Fold01
##  2 <split [3342/372]> Fold02
##  3 <split [3342/372]> Fold03
##  4 <split [3342/372]> Fold04
##  5 <split [3343/371]> Fold05
##  6 <split [3343/371]> Fold06
##  7 <split [3343/371]> Fold07
##  8 <split [3343/371]> Fold08
##  9 <split [3343/371]> Fold09
## 10 <split [3343/371]> Fold10
library(textrecipes)

bigfoot_rec <-
  recipe(classification ~ observed, data = bigfoot_train) %>%
  step_tokenize(observed) %>%
  step_tokenfilter(observed, max_tokens = 2e3) %>%
  step_tfidf(observed)

bigfoot_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 1
## 
## ── Operations
## • Tokenization for: observed
## • Text filtering for: observed
## • Term frequency-inverse document frequency with: observed
glmnet_spec <- 
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

bigfoot_wf <- workflow(bigfoot_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
bigfoot_res <- 
  tune_grid(
    bigfoot_wf, 
    bigfoot_folds, 
    grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
  )

autoplot(bigfoot_res)

show_best(bigfoot_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
##   penalty .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 0.0158  roc_auc binary     0.857    10 0.00463 Preprocessor1_Model05
## 2 0.00794 roc_auc binary     0.853    10 0.00545 Preprocessor1_Model04
## 3 0.0316  roc_auc binary     0.847    10 0.00487 Preprocessor1_Model06
## 4 0.00398 roc_auc binary     0.837    10 0.00675 Preprocessor1_Model03
## 5 0.0631  roc_auc binary     0.827    10 0.00506 Preprocessor1_Model07
select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config             .best .loss
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               <dbl> <dbl>
## 1  0.0316 roc_auc binary     0.847    10 0.00487 Preprocessor1_Mode… 0.857  1.13
bigfoot_final <-
  bigfoot_wf %>%
  finalize_workflow(
    select_by_pct_loss(bigfoot_res, desc(penalty), metric = "roc_auc")
  ) %>%
  last_fit(bigfoot_split)

bigfoot_final
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [3714/1239]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(bigfoot_final)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.764 Preprocessor1_Model1
## 2 roc_auc  binary         0.836 Preprocessor1_Model1
collect_predictions(bigfoot_final) %>%
  conf_mat(classification, .pred_class)
##           Truth
## Prediction possible sighting
##   possible      491      158
##   sighting      134      456
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
bigfoot_final %>%
  extract_fit_engine() %>%
  vi() 
## # A tibble: 2,000 × 3
##    Variable                  Importance Sign 
##    <chr>                          <dbl> <chr>
##  1 tfidf_observed_muscular         860. POS  
##  2 tfidf_observed_hunched          810. POS  
##  3 tfidf_observed_nose             804. POS  
##  4 tfidf_observed_shaggy           779. POS  
##  5 tfidf_observed_guessing         768. POS  
##  6 tfidf_observed_whooping         716. NEG  
##  7 tfidf_observed_especially       682. NEG  
##  8 tfidf_observed_putting          673. POS  
##  9 tfidf_observed_literally        664. NEG  
## 10 tfidf_observed_admit            621. POS  
## # ℹ 1,990 more rows

Questions

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. Can we predict if there was a sighting or a possible sighting of reported Bigfoot encounters based on the textual descriptions of the encounters?
    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The data set contains information about Bigfoot encounters, including textual descriptions, geographical details, weather-related data, and other important information about the sighting. A few key characteristics are the classification column, which describes if the sighting was for sure or if it was just possible. Another key characteristic is the title column, which contains the description of the sighting. The new data set has 4,953 observations of 28 variables after cleaning.
    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The primary variables of interest are classification and title. The classification data is character data and classifies what type of sighting it was. The title data is also character data and provides a description of the sighting from the sighter.
  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The original data set had 5,021 observations of 28 variables. A few of the variables were observed, location_details, county, state, season, date, classification and more. These varaiables are details that tell us about the possible Bigfoot sighting. Things like county and state are important to know, as well as the date and season of the possible sighting. The original data set had class a, class b, and class c as the data in the classification column. We filtered this data and removed Class C as well as omitted NA values. We also changed the classification column data from class a and b to either sighting or possible. We also used unnest_tokens on the title column to create a word column that was used to help predict the model. We also filtered it by 100.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. The preperation steps are as follows: First we used “filter(classification !=”Class C”, !is.na(observed)) %>% mutate( classification = case_when( classification == “Class A” ~ “sighting”, classification == “Class B” ~ “possible”” to filter out the Class C observation type in the classification column. We also filtered out NA values. We also changed the classification types from Class A and B to sighting or possible. The next step was: “unnest_tokens(word, observed) %>% count(classification, word) %>% filter(n > 100) %>% bind_log_odds(classification, word, n) %>% arrange(-log_odds_weighted)” We tokenized the title column by the top 100 words so that we could use them to help predict the sightings.
    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning models used were the logistic regression model, specifically glmnet. Logistic regression is well-suited for classification problems when the goal is to assign observations to one of two classes (possible or sighting). Logistic regression is also easy to interpret the impact of the predictor variables on the probability of belonging to one of two classes. Logistic regression is also good with handling text data.
  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. The metrics used was the receiver operating characteristic area under the curve (roc_auc). We used this to assess the performance of the machine learning model. This metric is good at classification, and we want to classify whether the Bigfoot encounter was either a sighting or possible. Roc_auc measures the models ability to differentiate between two classes and a higher roc_auc indicates a better distinction between the two classes. This is important as it shows us if the model is able to differentiate between possible or sightings.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. Our model was able to pick up on the patterns in the language that people used in their description of the sighting. The model could differentiate between how people spoke about their experience when it was classified as possible vs how they spoke when it was classified as sighting. Using the different ways of speaking, we tokenized the descriptions in the title column so that the model could see the top 100 words. The model then used those words to classify if it was possible or sighting.