library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

members_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

members_raw %>% count(died)

## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 75413
## 2 TRUE   1106

library(tidytext)
library(tidylo)

members_raw %>%
  unnest_tokens(word, season) %>%
  count(died, word) %>%
  filter(n > 100) %>%
  bind_log_odds(died, word, n) %>%
  arrange(-log_odds_weighted)

## # A tibble: 6 × 4
##   died  word       n log_odds_weighted
##   <lgl> <chr>  <int>             <dbl>
## 1 FALSE winter  2054             18.5 
## 2 FALSE summer   729             10.9 
## 3 TRUE  spring   555              8.69
## 4 TRUE  autumn   493              7.75
## 5 FALSE autumn 35402             -4.80
## 6 FALSE spring 37227             -5.37

member <- members_raw %>% 
    mutate(across(where(is.logical), factor))

 library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

set.seed(123)
member_split <-
  member %>%
  select(season, died) %>%
  initial_split(strata = died)

member_train <- training(member_split)
member_test <- testing(member_split)

set.seed(234)
member_folds <- vfold_cv(member_train, strata = died)
member_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [51650/5739]> Fold01
##  2 <split [51650/5739]> Fold02
##  3 <split [51650/5739]> Fold03
##  4 <split [51650/5739]> Fold04
##  5 <split [51650/5739]> Fold05
##  6 <split [51650/5739]> Fold06
##  7 <split [51650/5739]> Fold07
##  8 <split [51650/5739]> Fold08
##  9 <split [51650/5739]> Fold09
## 10 <split [51651/5738]> Fold10

library(textrecipes)

member_rec <-
  recipe(died ~ season, data = member_train) %>%
  step_tokenize(season) %>%
  step_tokenfilter(season, max_tokens = 5) %>%
  step_tfidf(season)

member_rec

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:   1
## predictor: 1

##

## ── Operations

## • Tokenization for: season

## • Text filtering for: season

## • Term frequency-inverse document frequency with: season

glmnet_spec <- 
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

member_wf <- workflow(member_rec, glmnet_spec)

doParallel::registerDoParallel()
set.seed(123)
member_res <- 
  tune_grid(
    member_wf, 
    member_folds, 
    grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
  )

autoplot(member_res)

show_best(member_res)

## Warning: No value of `metric` was given; metric 'roc_auc' will be used.

## # A tibble: 5 × 7
##   penalty .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 0.001   roc_auc binary     0.503    10 0.00283 Preprocessor1_Model01
## 2 0.00200 roc_auc binary     0.5      10 0       Preprocessor1_Model02
## 3 0.00398 roc_auc binary     0.5      10 0       Preprocessor1_Model03
## 4 0.00794 roc_auc binary     0.5      10 0       Preprocessor1_Model04
## 5 0.0158  roc_auc binary     0.5      10 0       Preprocessor1_Model05

select_by_pct_loss(member_res, desc(penalty), metric = "roc_auc")

## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config             .best .loss
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               <dbl> <dbl>
## 1       1 roc_auc binary       0.5    10       0 Preprocessor1_Mode… 0.503 0.579

member_final <-
  member_wf %>%
  finalize_workflow(
    select_by_pct_loss(member_res, desc(penalty), metric = "roc_auc")
  ) %>%
  last_fit(member_split)

member_final

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits                id             .metrics .notes   .predictions .workflow 
##   <list>                <chr>          <list>   <list>   <list>       <list>    
## 1 <split [57389/19130]> train/test sp… <tibble> <tibble> <tibble>     <workflow>

collect_metrics(member_final)

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.5   Preprocessor1_Model1

collect_predictions(member_final) %>%
  conf_mat(died, .pred_class)

##           Truth
## Prediction FALSE  TRUE
##      FALSE 18859   271
##      TRUE      0     0

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

member_final %>%
  extract_fit_engine() %>%
  vi()

## # A tibble: 5 × 3
##   Variable             Importance Sign 
##   <chr>                     <dbl> <chr>
## 1 tfidf_season_winter      0.107  POS  
## 2 tfidf_season_autumn      0.0261 NEG  
## 3 tfidf_season_spring      0      NEG  
## 4 tfidf_season_summer      0      NEG  
## 5 tfidf_season_unknown     0      NEG

1. What is the research question?

The question we are trying to answer is can we determine if someone died based on keywords. # Describe the data briefly
The data consists of different mountains, seasons and years they were climbed, ages and citizenship of all involved, etc. # What are the characteristics of the key variables used in the analysis?
The primary variables we are using in this analysis consist of did they die and what season it was.

#2. Data Exploration and Transformation: - The newly transformed data has logical data changed to factor.

3. Data Preparation and Modeling:

- There were a few steps made in this data prep and modeling section that include: step_tokenize(creates a specification of a recipe step that will convert a character to a token variable), step_tokenfilter(creates a specification of a recipe step that will convert a token variable to be filtered by frequency), and step_tf(converts token variable into multiple variables).

#4. Model Evaluation: - Looking at the confusion matrix we can see that the model did a good job of predicting the outcome.

5. Conclusion:

The key findings of this model are that winter had the highest importance in the data.

Apply 7

Colin Tracy

2023-10-25

1. What is the research question?

3. Data Preparation and Modeling:

5. Conclusion: