Himalayan Climbing Expeditions: The dataset compiles records for all expeditions that have climbed in the Nepal Himalaya. Build a classification model to predict whether a climber died (the died variable), using the members dataset.

library(tidyverse)
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

members %>% skimr::skim()
Data summary
Name Piped data
Number of rows 76519
Number of columns 21
_______________________
Column type frequency:
character 10
logical 6
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
expedition_id 0 1.00 9 9 0 10350 0
member_id 0 1.00 12 12 0 76518 0
peak_id 0 1.00 4 4 0 391 0
peak_name 15 1.00 4 25 0 390 0
season 0 1.00 6 7 0 5 0
sex 2 1.00 1 1 0 2 0
citizenship 10 1.00 2 23 0 212 0
expedition_role 21 1.00 4 25 0 524 0
death_cause 75413 0.01 3 27 0 12 0
injury_type 74807 0.02 3 27 0 11 0

Variable type: logical

skim_variable n_missing complete_rate mean count
hired 0 1 0.21 FAL: 60788, TRU: 15731
success 0 1 0.38 FAL: 47320, TRU: 29199
solo 0 1 0.00 FAL: 76398, TRU: 121
oxygen_used 0 1 0.24 FAL: 58286, TRU: 18233
died 0 1 0.01 FAL: 75413, TRU: 1106
injured 0 1 0.02 FAL: 74806, TRU: 1713

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2000.36 14.78 1905 1991 2004 2012 2019 ▁▁▁▃▇
age 3497 0.95 37.33 10.40 7 29 36 44 85 ▁▇▅▁▁
highpoint_metres 21833 0.71 7470.68 1040.06 3800 6700 7400 8400 8850 ▁▁▆▃▇
death_height_metres 75451 0.01 6592.85 1308.19 400 5800 6600 7550 8830 ▁▁▂▇▆
injury_height_metres 75510 0.01 7049.91 1214.24 400 6200 7100 8000 8880 ▁▁▂▇▇

Notes about the dataset
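
One member_id appears twice (skim reports 76,518 unique ids across 76,519 rows), and several columns (death_cause, injury_type, and the height variables) are mostly missing. A quick look at the duplicated id, assuming members is loaded as above:

members %>%
    count(member_id, sort = TRUE) %>%
    filter(n > 1)

The cleaning below drops the sparse columns, removes rows with missing values, and keeps one row per member_id.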

data <- members %>%
    
    # Drop variables with too many missing values
    select(-death_cause, -injury_type, -highpoint_metres, -death_height_metres, -injury_height_metres) %>%
    
    # Drop observations with missing values
    drop_na() %>%
    
    # Remove redundant identifier variables
    select(-peak_id, -expedition_id) %>%
    
    # Drop one of the two rows that share the same member_id
    distinct(member_id, .keep_all = TRUE) %>%
    
    # Recode died
    mutate(died = if_else(died, "died", "no")) %>%
    
    # Convert character variables to factors
    mutate(across(where(is.character), factor)) %>%
    
    # Convert logical variables to factors
    mutate(across(where(is.logical), factor)) %>%
    
    # id should be character
    mutate(member_id = as.character(member_id))
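
A quick sanity check on how many rows the cleaning removed (76,519 minus 72,984):

nrow(members) - nrow(data)
## [1] 3535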


skimr::skim(data)    
Data summary
Name data
Number of rows 72984
Number of columns 14
_______________________
Column type frequency:
character 1
factor 11
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
member_id 0 1 12 12 0 72984 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
peak_name 0 1 FALSE 390 Eve: 20994, Cho: 8608, Ama: 8235, Man: 4510
season 0 1 FALSE 4 Spr: 36150, Aut: 34186, Win: 2011, Sum: 637
sex 0 1 FALSE 2 M: 66150, F: 6834
citizenship 0 1 FALSE 207 Nep: 14367, USA: 6318, Jap: 6188, UK: 5071
expedition_role 0 1 FALSE 483 Cli: 43315, H-A: 13033, Lea: 9884, Exp: 1411
hired 0 1 FALSE 2 FAL: 59006, TRU: 13978
success 0 1 FALSE 2 FAL: 44913, TRU: 28071
solo 0 1 FALSE 2 FAL: 72868, TRU: 116
oxygen_used 0 1 FALSE 2 FAL: 55215, TRU: 17769
died 0 1 FALSE 2 no: 72055, die: 929
injured 0 1 FALSE 2 FAL: 71333, TRU: 1651

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2001.00 14.12 1905 1992 2004 2012 2019 ▁▁▁▃▇
age 0 1 37.34 10.39 7 29 36 44 85 ▁▇▅▁▁

Explore data

Solo, Hired, Oxygen Used, Success, Injured

Examine binary variables together.

Solo seems to be important.

data %>%
    select(hired:injured) %>%
    
    pivot_longer(-died) %>%

    filter(value == "TRUE") %>%
    
    group_by(died, name) %>%
    summarise(n_true = n()) %>%
    ungroup() %>%
    
    pivot_wider(names_from = died, values_from = n_true) %>%
    
    mutate(pct_died = died / (died + no) * 100) %>%
    
    ggplot(aes(pct_died, fct_reorder(name, pct_died))) +
    geom_col() +
    
    labs(y = NULL, x = "Percent Died",
         caption = "Solo climbing stands out as especially dangerous")

Peak Name

Are some peaks more dangerous than others?

top10_peaks <- data %>% count(peak_name, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(peak_name)

data %>% 
    
    filter(peak_name %in% top10_peaks) %>%
    count(died, peak_name) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(pct_died, fct_reorder(peak_name, pct_died))) +
    geom_col() +
    
    labs(y = "Peak Names", x = "Percent Died")

Year

Has it become less dangerous to climb as the equipment has modernized?

data %>% 
    
    ggplot(aes(died, year)) +
    geom_boxplot()

Year by Decade

Try year by decade. Binning the years into decades gives a clearer view of the death rate over time than the boxplot above.

data %>%
    
    mutate(year_decade = year %/% 10 * 10) %>%
    count(died, year_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(year_decade, pct_died))  +
    geom_col(fill = "midnightblue") +
    
    labs(x = "Year by Decade", y = "Percent Died")

Season

Is winter more dangerous than summer?

data %>%
    
    count(died, season) %>%
    pivot_wider(names_from = died, values_from = n) %>%
    mutate(pct_died = (died / (died + no) * 100)) %>%
    
    ggplot(aes(pct_died, fct_reorder(season, pct_died))) +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")

Function to plot pct died

There are several more categorical variables to compare against died, and the same plotting code would repeat for each one, so wrap it in a function.

plot_pct_died <- function(data, cat_var) {
    
    data %>%

    count(died, {{cat_var}}) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (died + no) * 100)) %>%
    
    ggplot(aes(pct_died, fct_reorder({{cat_var}}, pct_died))) +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")
    
}  
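
As a quick check, the function reproduces the season plot above:

data %>% plot_pct_died(cat_var = season)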

Sex

Do death rates differ between men and women?

data %>% plot_pct_died(cat_var = sex)

Age

Age doesn’t matter?

data %>% 
    
    ggplot(aes(died, age)) +
    geom_boxplot()

Age by Decade

Try age by decade.

  • Climbers in their 80s have a much higher chance of dying.
  • Outside of the 80s group, age seems to bring experience and reduce the likelihood of dying.

Based on this, we may remove climbers in their 80s (only five climbers are 80 or older) and use age_decade as numeric data.
data %>%
    
    mutate(age_decade = age %/% 10 * 10) %>%
    count(died, age_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(age_decade, pct_died)) +
    geom_col() +
    
    labs(y = "Percent Died", x = "Age by Decade")

data %>%
    
    mutate(age_decade = age %/% 10 * 10) %>%
    count(died, age_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%
    filter(age_decade != 80) %>%
    
    ggplot(aes(age_decade, pct_died)) +
    geom_col() +
    
    labs(y = "Percent Died", x = "Age by Decade")

Citizenship

top10_citizen <- data %>% count(citizenship, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(citizenship)

data %>%
    filter(citizenship %in% top10_citizen)  %>%
    plot_pct_died(cat_var = citizenship)

Expedition Role

top10_role <- data %>% count(expedition_role, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(expedition_role)

data %>%
    filter(expedition_role %in% top10_role)  %>%
    plot_pct_died(cat_var = expedition_role)

Build Model

# Optional: work on a small stratified sample for quick test runs
# data <- data %>% group_by(died) %>% sample_n(50) %>% ungroup()

library(tidymodels)

set.seed(123)
data_split <- initial_split(data, strata = died)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- vfold_cv(data_train, strata = died)
data_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [49264/5474]> Fold01
##  2 <split [49264/5474]> Fold02
##  3 <split [49264/5474]> Fold03
##  4 <split [49264/5474]> Fold04
##  5 <split [49264/5474]> Fold05
##  6 <split [49264/5474]> Fold06
##  7 <split [49264/5474]> Fold07
##  8 <split [49264/5474]> Fold08
##  9 <split [49265/5473]> Fold09
## 10 <split [49265/5473]> Fold10
library(themis) # for step_smote
library(embed)  # for step_lencode_glm

xgboost_recipe <-
  recipe(formula = died ~ ., data = data_train) %>%
    update_role(member_id, new_role = "id") %>%
    # step_lencode_glm(c(expedition_id, peak_name, citizenship, expedition_role), outcome = vars(died)) %>%  # for factors with many values
    step_other(peak_name, citizenship, expedition_role) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_zv(all_predictors()) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_smote(died) # oversample the minority class to balance died

# xgboost_recipe <- 
#   recipe(formula = died ~ ., data = data_train) %>%
#     update_role(member_id, new_role = "id") %>%
#     # step_lencode_glm(c(expedition_id, peak_name, citizenship, expedition_role), outcome = vars(died)) %>%  # for factors with many values
#     step_other(peak_name, citizenship, expedition_role) %>%
#     step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
#     step_zv(all_predictors()) %>%
#     step_normalize(all_numeric_predictors()) 

xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 108,068
## Columns: 35
## $ member_id                  <fct> AMAD78301-02, AMAD78301-05, AMAD78301-06, A…
## $ year                       <dbl> -1.635974, -1.635974, -1.635974, -1.635974,…
## $ age                        <dbl> 0.35136828, -0.32114464, -1.18580412, 0.351…
## $ died                       <fct> no, no, no, no, no, no, no, no, no, no, no,…
## $ peak_name_Ama.Dablam       <dbl> 2.812559, 2.812559, 2.812559, 2.812559, 2.8…
## $ peak_name_Cho.Oyu          <dbl> -0.363673, -0.363673, -0.363673, -0.363673,…
## $ peak_name_Everest          <dbl> -0.6366008, -0.6366008, -0.6366008, -0.6366…
## $ peak_name_Manaslu          <dbl> -0.2574662, -0.2574662, -0.2574662, -0.2574…
## $ peak_name_other            <dbl> -0.8516586, -0.8516586, -0.8516586, -0.8516…
## $ season_Autumn              <dbl> 1.066582, 1.066582, 1.066582, 1.066582, -0.…
## $ season_Spring              <dbl> -0.9927462, -0.9927462, -0.9927462, -0.9927…
## $ season_Summer              <dbl> -0.09336126, -0.09336126, -0.09336126, -0.0…
## $ season_Winter              <dbl> -0.1671621, -0.1671621, -0.1671621, -0.1671…
## $ sex_F                      <dbl> -0.3221255, -0.3221255, -0.3221255, -0.3221…
## $ sex_M                      <dbl> 0.3221255, 0.3221255, 0.3221255, 0.3221255,…
## $ citizenship_France         <dbl> 3.9004595, 3.9004595, 3.9004595, 3.9004595,…
## $ citizenship_Japan          <dbl> -0.3062252, -0.3062252, -0.3062252, -0.3062…
## $ citizenship_Nepal          <dbl> -0.493404, -0.493404, -0.493404, -0.493404,…
## $ citizenship_UK             <dbl> -0.2720068, -0.2720068, -0.2720068, -0.2720…
## $ citizenship_USA            <dbl> -0.3077572, -0.3077572, -0.3077572, -0.3077…
## $ citizenship_other          <dbl> -1.0027716, -1.0027716, -1.0027716, -1.0027…
## $ expedition_role_Climber    <dbl> -1.2099521, 0.8264639, 0.8264639, 0.8264639…
## $ expedition_role_H.A.Worker <dbl> -0.4644574, -0.4644574, -0.4644574, -0.4644…
## $ expedition_role_Leader     <dbl> -0.396114, -0.396114, -0.396114, -0.396114,…
## $ expedition_role_other      <dbl> 3.127181, -0.319771, -0.319771, -0.319771, …
## $ hired_FALSE.               <dbl> 0.4848862, 0.4848862, 0.4848862, 0.4848862,…
## $ hired_TRUE.                <dbl> -0.4848862, -0.4848862, -0.4848862, -0.4848…
## $ success_FALSE.             <dbl> 0.7921472, 0.7921472, 0.7921472, 0.7921472,…
## $ success_TRUE.              <dbl> -0.7921472, -0.7921472, -0.7921472, -0.7921…
## $ solo_FALSE.                <dbl> 0.0398985, 0.0398985, 0.0398985, 0.0398985,…
## $ solo_TRUE.                 <dbl> -0.0398985, -0.0398985, -0.0398985, -0.0398…
## $ oxygen_used_FALSE.         <dbl> 0.5681049, 0.5681049, 0.5681049, 0.5681049,…
## $ oxygen_used_TRUE.          <dbl> -0.5681049, -0.5681049, -0.5681049, -0.5681…
## $ injured_FALSE.             <dbl> 0.1515509, 0.1515509, 0.1515509, 0.1515509,…
## $ injured_TRUE.              <dbl> -0.1515509, -0.1515509, -0.1515509, -0.1515…
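step_smote() synthesizes new minority-class rows until the classes are balanced, which is why juice() returns 108,068 rows rather than the ~54,738 rows of data_train. A quick check of the class counts (a sketch; bake(new_data = NULL) is the non-deprecated equivalent of juice()):

xgboost_recipe %>%
    prep() %>%
    bake(new_data = NULL) %>%
    count(died)

Both classes should come out equal, at 54,034 rows each.
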
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

doParallel::registerDoParallel()
set.seed(27358)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_folds,
            control = control_grid(save_pred = TRUE))
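
Without a grid argument, tune_grid() picks 10 candidate parameter sets with a space-filling design (hence the 20 rows below: 10 candidates × 2 metrics). An explicit grid could be supplied instead; a minimal sketch over the same four parameters:

set.seed(456)
xgboost_grid <- grid_latin_hypercube(
    trees(),
    min_n(),
    tree_depth(),
    learn_rate(),
    size = 20
)
# then: tune_grid(xgboost_workflow, resamples = data_folds, grid = xgboost_grid, ...)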

Model evaluation

Show performance

collect_metrics(xgboost_tune)
## # A tibble: 20 × 10
##    trees min_n tree_depth learn_rate .metric  .estimator  mean     n  std_err
##    <int> <int>      <int>      <dbl> <chr>    <chr>      <dbl> <int>    <dbl>
##  1     6     2          5    0.00648 accuracy binary     0.613    10 0.0191  
##  2     6     2          5    0.00648 roc_auc  binary     0.666    10 0.0128  
##  3  1765     8          1    0.252   accuracy binary     0.805    10 0.00252 
##  4  1765     8          1    0.252   roc_auc  binary     0.676    10 0.0140  
##  5   257    10          7    0.0840  accuracy binary     0.985    10 0.000654
##  6   257    10          7    0.0840  roc_auc  binary     0.740    10 0.0126  
##  7  1867    15         14    0.00279 accuracy binary     0.963    10 0.00144 
##  8  1867    15         14    0.00279 roc_auc  binary     0.736    10 0.0140  
##  9  1057    20          3    0.00427 accuracy binary     0.692    10 0.00702 
## 10  1057    20          3    0.00427 roc_auc  binary     0.703    10 0.0124  
## 11   771    23         10    0.00102 accuracy binary     0.793    10 0.00490 
## 12   771    23         10    0.00102 roc_auc  binary     0.723    10 0.0139  
## 13  1531    26         13    0.171   accuracy binary     0.986    10 0.000568
## 14  1531    26         13    0.171   roc_auc  binary     0.732    10 0.00894 
## 15   966    29          7    0.0419  accuracy binary     0.986    10 0.000481
## 16   966    29          7    0.0419  roc_auc  binary     0.744    10 0.0123  
## 17  1233    33          9    0.0140  accuracy binary     0.985    10 0.000512
## 18  1233    33          9    0.0140  roc_auc  binary     0.738    10 0.0126  
## 19   582    39         11    0.0186  accuracy binary     0.983    10 0.000483
## 20   582    39         11    0.0186  roc_auc  binary     0.733    10 0.0124  
## # ℹ 1 more variable: .config <chr>
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(died, .pred_died) %>%
    autoplot()

# conf_mat_resampled(xgboost_tune, tidy = FALSE) %>%
#     autoplot()
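With only ~1% of climbers dying, accuracy is dominated by the majority class (predicting "no" for everyone already scores about 0.99), so roc_auc is the more informative metric here. The best candidates by roc_auc can be listed with:

show_best(xgboost_tune, metric = "roc_auc")
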

Finalize workflow (last_fit)

final_fitted <- xgboost_workflow %>%
    
    # Fit the best model to the train data
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    
    # Evaluate on the test data
    last_fit(data_split)

collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.728 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
    conf_mat(died, .pred_class) %>%
    autoplot()
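
Because of the imbalance, the 0.986 accuracy mostly reflects the majority class; sensitivity and specificity for the "died" class are more telling. A sketch (with factor levels died/no, the default event_level = "first" treats "died" as the event):

preds <- collect_predictions(final_fitted)
preds %>% sensitivity(died, .pred_class)
preds %>% specificity(died, .pred_class)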

Variable importance

library(vip)

final_fitted %>%
  extract_fit_engine() %>%
  vip()
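
If the importance scores are needed as a table rather than a plot, vi() from the same package returns them as a tibble:

final_fitted %>%
  extract_fit_engine() %>%
  vi()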