library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members_raw %>% count(died)
## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 75413
## 2 TRUE   1106
library(tidytext)
library(tidylo)

members_raw %>%
  unnest_tokens(word, season) %>%
  count(died, word) %>%
  filter(n > 100) %>%
  bind_log_odds(died, word, n) %>%
  arrange(-log_odds_weighted)
## # A tibble: 6 × 4
##   died  word       n log_odds_weighted
##   <lgl> <chr>  <int>             <dbl>
## 1 FALSE winter  2054             18.5 
## 2 FALSE summer   729             10.9 
## 3 TRUE  spring   555              8.69
## 4 TRUE  autumn   493              7.75
## 5 FALSE autumn 35402             -4.80
## 6 FALSE spring 37227             -5.37
member <- members_raw %>% 
    mutate(across(where(is.logical), factor))
 library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
member_split <-
  member %>%
  select(season, died) %>%
  initial_split(strata = died)

member_train <- training(member_split)
member_test <- testing(member_split)

set.seed(234)
member_folds <- vfold_cv(member_train, strata = died)
member_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [51650/5739]> Fold01
##  2 <split [51650/5739]> Fold02
##  3 <split [51650/5739]> Fold03
##  4 <split [51650/5739]> Fold04
##  5 <split [51650/5739]> Fold05
##  6 <split [51650/5739]> Fold06
##  7 <split [51650/5739]> Fold07
##  8 <split [51650/5739]> Fold08
##  9 <split [51650/5739]> Fold09
## 10 <split [51651/5738]> Fold10
library(textrecipes)

member_rec <-
  recipe(died ~ season, data = member_train) %>%
  step_tokenize(season) %>%
  step_tokenfilter(season, max_tokens = 5) %>%
  step_tfidf(season)

member_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 1
## 
## ── Operations
## • Tokenization for: season
## • Text filtering for: season
## • Term frequency-inverse document frequency with: season
glmnet_spec <- 
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

member_wf <- workflow(member_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
member_res <- 
  tune_grid(
    member_wf, 
    member_folds, 
    grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
  )

autoplot(member_res)

show_best(member_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
##   penalty .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 0.001   roc_auc binary     0.503    10 0.00283 Preprocessor1_Model01
## 2 0.00200 roc_auc binary     0.5      10 0       Preprocessor1_Model02
## 3 0.00398 roc_auc binary     0.5      10 0       Preprocessor1_Model03
## 4 0.00794 roc_auc binary     0.5      10 0       Preprocessor1_Model04
## 5 0.0158  roc_auc binary     0.5      10 0       Preprocessor1_Model05
select_by_pct_loss(member_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config             .best .loss
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               <dbl> <dbl>
## 1       1 roc_auc binary       0.5    10       0 Preprocessor1_Mode… 0.503 0.579
member_final <-
  member_wf %>%
  finalize_workflow(
    select_by_pct_loss(member_res, desc(penalty), metric = "roc_auc")
  ) %>%
  last_fit(member_split)

member_final
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits                id             .metrics .notes   .predictions .workflow 
##   <list>                <chr>          <list>   <list>   <list>       <list>    
## 1 <split [57389/19130]> train/test sp… <tibble> <tibble> <tibble>     <workflow>
collect_metrics(member_final)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.5   Preprocessor1_Model1
collect_predictions(member_final) %>%
  conf_mat(died, .pred_class)
##           Truth
## Prediction FALSE  TRUE
##      FALSE 18859   271
##      TRUE      0     0
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
member_final %>%
  extract_fit_engine() %>%
  vi() 
## # A tibble: 5 × 3
##   Variable             Importance Sign 
##   <chr>                     <dbl> <chr>
## 1 tfidf_season_winter      0.107  POS  
## 2 tfidf_season_autumn      0.0261 NEG  
## 3 tfidf_season_spring      0      NEG  
## 4 tfidf_season_summer      0      NEG  
## 5 tfidf_season_unknown     0      NEG

1. What is the research question?

#2. Data Exploration and Transformation: - The newly transformed data has logical data changed to factor.

3. Data Preparation and Modeling:

- There were a few steps made in this data prep and modeling section that include: step_tokenize(creates a specification of a recipe step that will convert a character to a token variable), step_tokenfilter(creates a specification of a recipe step that will convert a token variable to be filtered by frequency), and step_tf(converts token variable into multiple variables).

#4. Model Evaluation: - Looking at the confusion matrix we can see that the model did a good job of predicting the outcome.

5. Conclusion: