library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members %>%
  count(died)
## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 75413
## 2 TRUE   1106
members <-
  members %>%
  mutate(died = case_when(
    died == "FALSE" ~ "did not die",
    died == "TRUE" ~ " died"))
library(tidytext)
library(tidylo)

members %>%
  unnest_tokens(word, expedition_role) %>%
  count(died, word) %>%
  bind_log_odds(died, word, n) %>%
  arrange(-log_odds_weighted)
## # A tibble: 326 × 4
##    died          word          n log_odds_weighted
##    <chr>         <chr>     <int>             <dbl>
##  1 "did not die" guide       161              5.18
##  2 "did not die" support     126              4.58
##  3 " died"       porter       29              3.86
##  4 "did not die" rope         62              3.21
##  5 "did not die" scientist    60              3.16
##  6 "did not die" torch        49              2.86
##  7 " died"       kitchen      15              2.66
##  8 "did not die" abc          42              2.65
##  9 "did not die" ldr          40              2.58
## 10 "did not die" radio        38              2.52
## # ℹ 316 more rows

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
members_split <- 
  members %>%
  select(expedition_role, died) %>%
  initial_split(strata = died)

members_train <- training(members_split)
members_test <- testing(members_split)

set.seed(234)
members_folds <- vfold_cv(members_train, v = 5, strata = died)
members_folds
## #  5-fold cross-validation using stratification 
## # A tibble: 5 × 2
##   splits                id   
##   <list>                <chr>
## 1 <split [45911/11478]> Fold1
## 2 <split [45911/11478]> Fold2
## 3 <split [45911/11478]> Fold3
## 4 <split [45911/11478]> Fold4
## 5 <split [45912/11477]> Fold5
library(textrecipes)

members_rec <-
  recipe(died ~ expedition_role, data = members_train) %>%
  step_tokenize(expedition_role) %>%
  step_tokenfilter(expedition_role) %>%
  step_tfidf(expedition_role)

members_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 1
## 
## ── Operations
## • Tokenization for: expedition_role
## • Text filtering for: expedition_role
## • Term frequency-inverse document frequency with: expedition_role
glmnet_spec <- 
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

members_wf <- workflow(members_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)

members_res <- 
  tune_grid(
    members_wf, 
    members_folds, 
    grid = 10)

autoplot(members_res)

show_best(members_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
##    penalty .metric .estimator  mean     n std_err .config              
##      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 6.57e-10 roc_auc binary     0.557     5 0.00933 Preprocessor1_Model01
## 2 5.55e- 9 roc_auc binary     0.557     5 0.00933 Preprocessor1_Model02
## 3 1.05e- 8 roc_auc binary     0.557     5 0.00933 Preprocessor1_Model03
## 4 3.46e- 7 roc_auc binary     0.557     5 0.00933 Preprocessor1_Model04
## 5 4.09e- 6 roc_auc binary     0.557     5 0.00933 Preprocessor1_Model05
select_by_pct_loss(members_res, desc(penalty), metric = "roc_auc")
## Warning in 1:best_index: numerical expression has 4 elements: only the first
## used
## # A tibble: 1 × 9
##     penalty .metric .estimator  mean     n std_err .config         .best   .loss
##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           <dbl>   <dbl>
## 1 0.0000127 roc_auc binary     0.557     5 0.00932 Preprocessor1_… 0.557 0.00562
members_final <-
  members_wf %>%
  finalize_workflow(
    select_by_pct_loss(members_res, desc(penalty), metric = "roc_auc")
  ) %>%
  last_fit(members_split)
## Warning in 1:best_index: numerical expression has 4 elements: only the first
## used
members_final
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits                id             .metrics .notes   .predictions .workflow 
##   <list>                <chr>          <list>   <list>   <list>       <list>    
## 1 <split [57389/19130]> train/test sp… <tibble> <tibble> <tibble>     <workflow>
collect_metrics(members_final)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.557 Preprocessor1_Model1
collect_predictions(members_final) %>%
  conf_mat(died, .pred_class)
##              Truth
## Prediction     died did not die
##    died          12           1
##   did not die   259       18858
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
members_final %>%
  extract_fit_engine() %>%
  vi()
## # A tibble: 99 × 3
##    Variable                      Importance Sign 
##    <chr>                              <dbl> <chr>
##  1 tfidf_expedition_role_ne            2.87 POS  
##  2 tfidf_expedition_role_kitchen       2.32 NEG  
##  3 tfidf_expedition_role_guide         1.83 POS  
##  4 tfidf_expedition_role_dep           1.81 POS  
##  5 tfidf_expedition_role_ldr           1.79 POS  
##  6 tfidf_expedition_role_mgr           1.68 POS  
##  7 tfidf_expedition_role_support       1.67 POS  
##  8 tfidf_expedition_role_group         1.62 POS  
##  9 tfidf_expedition_role_only          1.60 POS  
## 10 tfidf_expedition_role_ridge         1.56 NEG  
## # ℹ 89 more rows
  1. Is there a relationship between the different types of variables from a climbing dataset and their likelihood of death during expeditions?

The original data was made up of 76519 observations and 21 variable with its key characteristics used for my prediction being the ““observed”expedition_role” variable which was used to predict if a person died or not.

The dataset focuses on two key variables: ‘expedition_role’ and ‘died’. ‘expedition_role’ describes roles held by expedition members, such as “guide” or “scientist”, and is processed using term frequency-inverse document frequency (TF-IDF). The ‘died’ variable is categorical, indicating if an expedition member died, with categories “died” and “did not die”. It acts as the predictive model’s target, exploring the link between roles and fatality outcomes.

  1. The original data underwent significant transformations for modeling purposes. The ‘died’ variable, originally boolean, was recategorized into “died” and “did not die” for clarity. The ‘expedition_role’ variable was tokenized, filtered, and subjected to term frequency-inverse document frequency (TF-IDF) processing. These transformations made the data more suitable for machine learning, aiding in discerning patterns related to expedition roles and fatality outcomes. Converting raw data to this structured form improves model performance and interpretability.

  2. The data preparation involved several key steps: recategorizing the ‘died’ variable, tokenizing the ‘expedition_role’ variable, filtering tokens, and applying term frequency-inverse document frequency (TF-IDF) processing on ‘expedition_role’. These steps transformed the data into a more structured format suitable for machine learning modeling.

The machine learning model used in the analysis is the logistic regression model, specifically implemented with the glmnet engine.

  1. The model evaluation employed two metrics: accuracy, which measures the proportion of correct predictions, and roc_auc, which assesses the model’s ability to distinguish between classes based on the area under the receiver operating characteristic curve.

  2. The major findings from the analysis revealed a correlation between expedition roles and mortality rates. Notably, the ‘guide’ role exhibited a high likelihood of survival, while the ‘porter’ role had an increased risk of death. The logistic regression model demonstrated a 55.7% ROC_AUC, suggesting limited predictive power.