library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members %>%
count(died)
## # A tibble: 2 × 2
## died n
## <lgl> <int>
## 1 FALSE 75413
## 2 TRUE 1106
members <-
members %>%
mutate(died = case_when(
died == "FALSE" ~ "did not die",
died == "TRUE" ~ " died"))
library(tidytext)
library(tidylo)
members %>%
unnest_tokens(word, expedition_role) %>%
count(died, word) %>%
bind_log_odds(died, word, n) %>%
arrange(-log_odds_weighted)
## # A tibble: 326 × 4
## died word n log_odds_weighted
## <chr> <chr> <int> <dbl>
## 1 "did not die" guide 161 5.18
## 2 "did not die" support 126 4.58
## 3 " died" porter 29 3.86
## 4 "did not die" rope 62 3.21
## 5 "did not die" scientist 60 3.16
## 6 "did not die" torch 49 2.86
## 7 " died" kitchen 15 2.66
## 8 "did not die" abc 42 2.65
## 9 "did not die" ldr 40 2.58
## 10 "did not die" radio 38 2.52
## # ℹ 316 more rows
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
members_split <-
members %>%
select(expedition_role, died) %>%
initial_split(strata = died)
members_train <- training(members_split)
members_test <- testing(members_split)
set.seed(234)
members_folds <- vfold_cv(members_train, v = 5, strata = died)
members_folds
## # 5-fold cross-validation using stratification
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [45911/11478]> Fold1
## 2 <split [45911/11478]> Fold2
## 3 <split [45911/11478]> Fold3
## 4 <split [45911/11478]> Fold4
## 5 <split [45912/11477]> Fold5
library(textrecipes)
members_rec <-
recipe(died ~ expedition_role, data = members_train) %>%
step_tokenize(expedition_role) %>%
step_tokenfilter(expedition_role) %>%
step_tfidf(expedition_role)
members_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 1
##
## ── Operations
## • Tokenization for: expedition_role
## • Text filtering for: expedition_role
## • Term frequency-inverse document frequency with: expedition_role
glmnet_spec <-
logistic_reg(mixture = 1, penalty = tune()) %>%
set_engine("glmnet")
members_wf <- workflow(members_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
members_res <-
tune_grid(
members_wf,
members_folds,
grid = 10)
autoplot(members_res)
show_best(members_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 6.57e-10 roc_auc binary 0.557 5 0.00933 Preprocessor1_Model01
## 2 5.55e- 9 roc_auc binary 0.557 5 0.00933 Preprocessor1_Model02
## 3 1.05e- 8 roc_auc binary 0.557 5 0.00933 Preprocessor1_Model03
## 4 3.46e- 7 roc_auc binary 0.557 5 0.00933 Preprocessor1_Model04
## 5 4.09e- 6 roc_auc binary 0.557 5 0.00933 Preprocessor1_Model05
select_by_pct_loss(members_res, desc(penalty), metric = "roc_auc")
## Warning in 1:best_index: numerical expression has 4 elements: only the first
## used
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .loss
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.0000127 roc_auc binary 0.557 5 0.00932 Preprocessor1_… 0.557 0.00562
members_final <-
members_wf %>%
finalize_workflow(
select_by_pct_loss(members_res, desc(penalty), metric = "roc_auc")
) %>%
last_fit(members_split)
## Warning in 1:best_index: numerical expression has 4 elements: only the first
## used
members_final
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [57389/19130]> train/test sp… <tibble> <tibble> <tibble> <workflow>
collect_metrics(members_final)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.986 Preprocessor1_Model1
## 2 roc_auc binary 0.557 Preprocessor1_Model1
collect_predictions(members_final) %>%
conf_mat(died, .pred_class)
## Truth
## Prediction died did not die
## died 12 1
## did not die 259 18858
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
members_final %>%
extract_fit_engine() %>%
vi()
## # A tibble: 99 × 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 tfidf_expedition_role_ne 2.87 POS
## 2 tfidf_expedition_role_kitchen 2.32 NEG
## 3 tfidf_expedition_role_guide 1.83 POS
## 4 tfidf_expedition_role_dep 1.81 POS
## 5 tfidf_expedition_role_ldr 1.79 POS
## 6 tfidf_expedition_role_mgr 1.68 POS
## 7 tfidf_expedition_role_support 1.67 POS
## 8 tfidf_expedition_role_group 1.62 POS
## 9 tfidf_expedition_role_only 1.60 POS
## 10 tfidf_expedition_role_ridge 1.56 NEG
## # ℹ 89 more rows
The original data was made up of 76519 observations and 21 variable with its key characteristics used for my prediction being the ““observed”expedition_role” variable which was used to predict if a person died or not.
The dataset focuses on two key variables: ‘expedition_role’ and ‘died’. ‘expedition_role’ describes roles held by expedition members, such as “guide” or “scientist”, and is processed using term frequency-inverse document frequency (TF-IDF). The ‘died’ variable is categorical, indicating if an expedition member died, with categories “died” and “did not die”. It acts as the predictive model’s target, exploring the link between roles and fatality outcomes.
The original data underwent significant transformations for modeling purposes. The ‘died’ variable, originally boolean, was recategorized into “died” and “did not die” for clarity. The ‘expedition_role’ variable was tokenized, filtered, and subjected to term frequency-inverse document frequency (TF-IDF) processing. These transformations made the data more suitable for machine learning, aiding in discerning patterns related to expedition roles and fatality outcomes. Converting raw data to this structured form improves model performance and interpretability.
The data preparation involved several key steps: recategorizing the ‘died’ variable, tokenizing the ‘expedition_role’ variable, filtering tokens, and applying term frequency-inverse document frequency (TF-IDF) processing on ‘expedition_role’. These steps transformed the data into a more structured format suitable for machine learning modeling.
The machine learning model used in the analysis is the logistic regression model, specifically implemented with the glmnet engine.
The model evaluation employed two metrics: accuracy, which measures the proportion of correct predictions, and roc_auc, which assesses the model’s ability to distinguish between classes based on the area under the receiver operating characteristic curve.
The major findings from the analysis revealed a correlation between expedition roles and mortality rates. Notably, the ‘guide’ role exhibited a high likelihood of survival, while the ‘porter’ role had an increased risk of death. The logistic regression model demonstrated a 55.7% ROC_AUC, suggesting limited predictive power.