library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members %>%
  count(died)
## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 75413
## 2 TRUE   1106
data <- members %>%
    select(-death_cause, -injury_type, -highpoint_metres, -death_height_metres, -injury_height_metres) %>%
    
    drop_na() %>%
    select(-peak_id, -expedition_id) %>%
    distinct(member_id, .keep_all = TRUE) %>%
    mutate(died = case_when(died == "TRUE" ~ "died", died == "FALSE" ~ "no")) %>%
    mutate(across(where(is.character), factor)) %>%
    mutate(across(where(is.logical), factor)) %>%
    mutate(member_id = as.character(member_id))


skimr::skim(data)    
Data summary
Name data
Number of rows 72984
Number of columns 14
_______________________
Column type frequency:
character 1
factor 11
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
member_id 0 1 12 12 0 72984 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
peak_name 0 1 FALSE 390 Eve: 20994, Cho: 8608, Ama: 8235, Man: 4510
season 0 1 FALSE 4 Spr: 36150, Aut: 34186, Win: 2011, Sum: 637
sex 0 1 FALSE 2 M: 66150, F: 6834
citizenship 0 1 FALSE 207 Nep: 14367, USA: 6318, Jap: 6188, UK: 5071
expedition_role 0 1 FALSE 483 Cli: 43315, H-A: 13033, Lea: 9884, Exp: 1411
hired 0 1 FALSE 2 FAL: 59006, TRU: 13978
success 0 1 FALSE 2 FAL: 44913, TRU: 28071
solo 0 1 FALSE 2 FAL: 72868, TRU: 116
oxygen_used 0 1 FALSE 2 FAL: 55215, TRU: 17769
died 0 1 FALSE 2 no: 72055, die: 929
injured 0 1 FALSE 2 FAL: 71333, TRU: 1651

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2001.00 14.12 1905 1992 2004 2012 2019 ▁▁▁▃▇
age 0 1 37.34 10.39 7 29 36 44 85 ▁▇▅▁▁
data %>%
    select(hired:injured) %>%
    pivot_longer(-died) %>%
    filter(value == "TRUE") %>%
    group_by(died, name) %>%
    summarise(n_true = n()) %>%
    ungroup() %>%
    pivot_wider(names_from = died, values_from = n_true) %>%
    mutate(pct_died = died/no * 100) %>%
    ggplot(aes(pct_died, fct_reorder(name, pct_died))) +
    geom_col() +
    
    labs(y = NULL, x = "Percent Died",
         caption = "Solo seem to be relevant - 'Dangerous'")
## `summarise()` has grouped output by 'died'. You can override using the
## `.groups` argument.

top10_peaks <- data %>% count(peak_name, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(peak_name)

data %>% 
    
    filter(peak_name %in% top10_peaks) %>%
    count(died, peak_name) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(pct_died, fct_reorder(peak_name, pct_died))) +
    geom_col() +
    
    labs(y = "Peak Names", x = "Percent Died")

data %>% 
    
    ggplot(aes(died, year)) +
    geom_boxplot()

data %>%
    
    mutate(year_decade = year %/% 10 * 10) %>%
    count(died, year_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(year_decade, pct_died))  +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")

data %>%
    
    count(died, season) %>%
    pivot_wider(names_from = died, values_from = n) %>%
    mutate(pct_died = (died / (died + no) * 100)) %>%
    
    ggplot(aes(pct_died, fct_reorder(season, pct_died))) +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")

plot_pct_died <- function(data, cat_var) {
    
    data %>%

    count(died, {{cat_var}}) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (died + no) * 100)) %>%
    
    ggplot(aes(pct_died, fct_reorder({{cat_var}}, pct_died))) +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")
    
}  
data %>% plot_pct_died(cat_var = sex)

data %>% 
    
    ggplot(aes(died, age)) +
    geom_boxplot()

data %>%
    
    mutate(age_decade = age %/% 10 * 10) %>%
    count(died, age_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(age_decade, pct_died)) +
    geom_col() %>%
    
    labs(y = "Percent Died", x = "Age by Decade")

data %>%
    
    mutate(age_decade = age %/% 10 * 10) %>%
    count(died, age_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%
    filter(age_decade != 80) %>%
    
    ggplot(aes(age_decade, pct_died)) +
    geom_col() %>%
    
    labs(y = "Percent Died", x = "Age by Decade")

top10_citizen <- data %>% count(citizenship, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(citizenship)

data %>%
    filter(citizenship %in% top10_citizen)  %>%
    plot_pct_died(cat_var = citizenship)

top10_role <- data %>% count(expedition_role, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(expedition_role)

data %>%
    filter(expedition_role %in% top10_role)  %>%
    plot_pct_died(cat_var = expedition_role)

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
data_split <- initial_split(data, strata = died)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- vfold_cv(data_train, strata = died)
data_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [49264/5474]> Fold01
##  2 <split [49264/5474]> Fold02
##  3 <split [49264/5474]> Fold03
##  4 <split [49264/5474]> Fold04
##  5 <split [49264/5474]> Fold05
##  6 <split [49264/5474]> Fold06
##  7 <split [49264/5474]> Fold07
##  8 <split [49264/5474]> Fold08
##  9 <split [49265/5473]> Fold09
## 10 <split [49265/5473]> Fold10
library(themis)
library(embed)  

xgboost_recipe <-
  recipe(formula = died ~ ., data = data_train) %>%
    update_role(member_id, new_role = "id") %>%
    step_other(peak_name, citizenship, expedition_role) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_zv(all_predictors()) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_smote(died)


xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 108,068
## Columns: 35
## $ member_id                  <fct> AMAD78301-02, AMAD78301-05, AMAD78301-06, A…
## $ year                       <dbl> -1.635974, -1.635974, -1.635974, -1.635974,…
## $ age                        <dbl> 0.35136828, -0.32114464, -1.18580412, 0.351…
## $ died                       <fct> no, no, no, no, no, no, no, no, no, no, no,…
## $ peak_name_Ama.Dablam       <dbl> 2.812559, 2.812559, 2.812559, 2.812559, 2.8…
## $ peak_name_Cho.Oyu          <dbl> -0.363673, -0.363673, -0.363673, -0.363673,…
## $ peak_name_Everest          <dbl> -0.6366008, -0.6366008, -0.6366008, -0.6366…
## $ peak_name_Manaslu          <dbl> -0.2574662, -0.2574662, -0.2574662, -0.2574…
## $ peak_name_other            <dbl> -0.8516586, -0.8516586, -0.8516586, -0.8516…
## $ season_Autumn              <dbl> 1.066582, 1.066582, 1.066582, 1.066582, -0.…
## $ season_Spring              <dbl> -0.9927462, -0.9927462, -0.9927462, -0.9927…
## $ season_Summer              <dbl> -0.09336126, -0.09336126, -0.09336126, -0.0…
## $ season_Winter              <dbl> -0.1671621, -0.1671621, -0.1671621, -0.1671…
## $ sex_F                      <dbl> -0.3221255, -0.3221255, -0.3221255, -0.3221…
## $ sex_M                      <dbl> 0.3221255, 0.3221255, 0.3221255, 0.3221255,…
## $ citizenship_France         <dbl> 3.9004595, 3.9004595, 3.9004595, 3.9004595,…
## $ citizenship_Japan          <dbl> -0.3062252, -0.3062252, -0.3062252, -0.3062…
## $ citizenship_Nepal          <dbl> -0.493404, -0.493404, -0.493404, -0.493404,…
## $ citizenship_UK             <dbl> -0.2720068, -0.2720068, -0.2720068, -0.2720…
## $ citizenship_USA            <dbl> -0.3077572, -0.3077572, -0.3077572, -0.3077…
## $ citizenship_other          <dbl> -1.0027716, -1.0027716, -1.0027716, -1.0027…
## $ expedition_role_Climber    <dbl> -1.2099521, 0.8264639, 0.8264639, 0.8264639…
## $ expedition_role_H.A.Worker <dbl> -0.4644574, -0.4644574, -0.4644574, -0.4644…
## $ expedition_role_Leader     <dbl> -0.396114, -0.396114, -0.396114, -0.396114,…
## $ expedition_role_other      <dbl> 3.127181, -0.319771, -0.319771, -0.319771, …
## $ hired_FALSE.               <dbl> 0.4848862, 0.4848862, 0.4848862, 0.4848862,…
## $ hired_TRUE.                <dbl> -0.4848862, -0.4848862, -0.4848862, -0.4848…
## $ success_FALSE.             <dbl> 0.7921472, 0.7921472, 0.7921472, 0.7921472,…
## $ success_TRUE.              <dbl> -0.7921472, -0.7921472, -0.7921472, -0.7921…
## $ solo_FALSE.                <dbl> 0.0398985, 0.0398985, 0.0398985, 0.0398985,…
## $ solo_TRUE.                 <dbl> -0.0398985, -0.0398985, -0.0398985, -0.0398…
## $ oxygen_used_FALSE.         <dbl> 0.5681049, 0.5681049, 0.5681049, 0.5681049,…
## $ oxygen_used_TRUE.          <dbl> -0.5681049, -0.5681049, -0.5681049, -0.5681…
## $ injured_FALSE.             <dbl> 0.1515509, 0.1515509, 0.1515509, 0.1515509,…
## $ injured_TRUE.              <dbl> -0.1515509, -0.1515509, -0.1515509, -0.1515…
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

doParallel::registerDoParallel()
set.seed(27358)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_folds,
            control = control_resamples(save_pred = TRUE))
collect_metrics(xgboost_tune)
## # A tibble: 20 × 10
##    trees min_n tree_depth learn_rate .metric  .estimator  mean     n  std_err
##    <int> <int>      <int>      <dbl> <chr>    <chr>      <dbl> <int>    <dbl>
##  1     6     2          5    0.00648 accuracy binary     0.613    10 0.0191  
##  2     6     2          5    0.00648 roc_auc  binary     0.666    10 0.0128  
##  3  1765     8          1    0.252   accuracy binary     0.805    10 0.00260 
##  4  1765     8          1    0.252   roc_auc  binary     0.675    10 0.0140  
##  5   257    10          7    0.0840  accuracy binary     0.985    10 0.000627
##  6   257    10          7    0.0840  roc_auc  binary     0.735    10 0.0126  
##  7  1867    15         14    0.00279 accuracy binary     0.963    10 0.00154 
##  8  1867    15         14    0.00279 roc_auc  binary     0.736    10 0.0138  
##  9  1057    20          3    0.00427 accuracy binary     0.692    10 0.00690 
## 10  1057    20          3    0.00427 roc_auc  binary     0.703    10 0.0125  
## 11   771    23         10    0.00102 accuracy binary     0.794    10 0.00474 
## 12   771    23         10    0.00102 roc_auc  binary     0.723    10 0.0140  
## 13  1531    26         13    0.171   accuracy binary     0.986    10 0.000557
## 14  1531    26         13    0.171   roc_auc  binary     0.730    10 0.00918 
## 15   966    29          7    0.0419  accuracy binary     0.986    10 0.000545
## 16   966    29          7    0.0419  roc_auc  binary     0.742    10 0.0127  
## 17  1233    33          9    0.0140  accuracy binary     0.985    10 0.000531
## 18  1233    33          9    0.0140  roc_auc  binary     0.738    10 0.0129  
## 19   582    39         11    0.0186  accuracy binary     0.982    10 0.000572
## 20   582    39         11    0.0186  roc_auc  binary     0.735    10 0.0128  
## # ℹ 1 more variable: .config <chr>
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(died, .pred_died) %>%
    autoplot()

final_fitted <- xgboost_workflow %>%
    
    finalize_workflow(select_best(xgboost_tune, "accuracy")) %>%
    last_fit(data_split)

collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.733 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
    conf_mat(died, .pred_class) %>%
    autoplot()

collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.733 Preprocessor1_Model1
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
final_fitted %>%
  extract_fit_engine() %>%
  vip()

  1. Is there a relationship between the different types of variables from a climbing dataset and their likelihood of death during expeditions?

The original data was made up of 76519 observations and 21 variable with its key characteristics used for my prediction being the ““observed”expedition_role” variable which was used to predict if a person died or not.

The dataset focuses on two key variables: ‘expedition_role’ and ‘died’. ‘expedition_role’ describes roles held by expedition members, such as “guide” or “scientist”, and is processed using term frequency-inverse document frequency (TF-IDF). The ‘died’ variable is categorical, indicating if an expedition member died, with categories “died” and “did not die”. It acts as the predictive model’s target, exploring the link between roles and fatality outcomes.

  1. The original data underwent significant transformations for modeling purposes. The ‘died’ variable, originally boolean, was recategorized into “died” and “did not die” for clarity. The ‘expedition_role’ variable was tokenized, filtered, and subjected to term frequency-inverse document frequency (TF-IDF) processing. These transformations made the data more suitable for machine learning, aiding in discerning patterns related to expedition roles and fatality outcomes. Converting raw data to this structured form improves model performance and interpretability.

  2. The data preparation involved several key steps: recategorizing the ‘died’ variable, tokenizing the ‘expedition_role’ variable, filtering tokens, and applying term frequency-inverse document frequency (TF-IDF) processing on ‘expedition_role’. These steps transformed the data into a more structured format suitable for machine learning modeling.

The machine learning model used in the analysis is the logistic regression model, specifically implemented with the glmnet engine.

  1. The model evaluation employed two metrics: accuracy, which measures the proportion of correct predictions, and roc_auc, which assesses the model’s ability to distinguish between classes based on the area under the receiver operating characteristic curve.

  2. The major findings from the analysis revealed a correlation between expedition roles and mortality rates. Notably, the ‘guide’ role exhibited a high likelihood of survival, while the ‘porter’ role had an increased risk of death. The logistic regression model demonstrated a 55.7% ROC_AUC, suggesting limited predictive power.