Himalayan Climbing Expeditions: The dataset is a compilation of records for all expeditions that have climbed in the Nepal Himalaya. Build a classification model to predict whether the person died (died). Use the members dataset.

library(tidyverse)
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

members %>% skimr::skim()

Data summary
Name	Piped data
Number of rows	76519
Number of columns	21
_______________________
Column type frequency:
character	10
logical	6
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
expedition_id	0	1.00	9	9	10350
member_id	0	1.00	12	12	76518
peak_id	0	1.00	4	4	391
peak_name	15	1.00	4	25	390
season	0	1.00	6	7	5
sex	2	1.00	1	1	2
citizenship	10	1.00	2	23	212
expedition_role	21	1.00	4	25	524
death_cause	75413	0.01	3	27	12
injury_type	74807	0.02	3	27	11

Variable type: logical

skim_variable	complete_rate	mean	count
hired	1	0.21	FAL: 60788, TRU: 15731
success	1	0.38	FAL: 47320, TRU: 29199
solo	1	0.00	FAL: 76398, TRU: 121
oxygen_used	1	0.24	FAL: 58286, TRU: 18233
died	1	0.01	FAL: 75413, TRU: 1106
injured	1	0.02	FAL: 74806, TRU: 1713

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2000.36	14.78	1905	1991	2004	2012	2019	▁▁▁▃▇
age	3497	0.95	37.33	10.40	7	29	36	44	85	▁▇▅▁▁
highpoint_metres	21833	0.71	7470.68	1040.06	3800	6700	7400	8400	8850	▁▁▆▃▇
death_height_metres	75451	0.01	6592.85	1308.19	400	5800	6600	7550	8830	▁▁▂▇▆
injury_height_metres	75510	0.01	7049.91	1214.24	400	6200	7100	8000	8880	▁▁▂▇▇

Notes about the dataset

Some variables have too many missing values - death_casue, injury_type, highpoint_metres, death_height_metres, injury_height_metres: drop the variables
Some others have a relatively few missing values: filter the observations out. The data is still large after dropping NAs
The target variable is unbalanced (only a small portion die): themis::step_smote(died)
member_id is the id variable but with one member showing up twice: remove one of the duplicates. Not sure which is legit.
Many categorical variables with cadinality issue - expedition_id, peak_id, peak_name, citizenship, expedition_role: step_other or step_lencode_glm
redundant variables - peak_id and peak_name: keep peak_name
Character variables need to be converted to factor

data <- members %>%
    
    # Drop variables with too many missing values
    select(-death_cause, -injury_type, -highpoint_metres, -death_height_metres, -injury_height_metres) %>%
    
    # Drop observations with missing values
    drop_na() %>%
    
    # Remove a redundant variable
    select(-peak_id, -expedition_id) %>%
    
    # Remove one of the two members with the identical id
    distinct(member_id, .keep_all = TRUE) %>%
    
    # Recode died
    mutate(died = case_when(died == "TRUE" ~ "died", died == "FALSE" ~ "no")) %>%
    
    # Convert character variables to factors
    mutate(across(where(is.character), factor)) %>%
    
    # Convert logical variables to factors
    mutate(across(where(is.logical), factor)) %>%
    
    # id should be character
    mutate(member_id = as.character(member_id))


skimr::skim(data)

Data summary
Name	data
Number of rows	72984
Number of columns	14
_______________________
Column type frequency:
character	1
factor	11
numeric	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
member_id	0	1	12	12	0	72984	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
peak_name	1	FALSE	390	Eve: 20994, Cho: 8608, Ama: 8235, Man: 4510
season	1	FALSE	4	Spr: 36150, Aut: 34186, Win: 2011, Sum: 637
sex	1	FALSE	2	M: 66150, F: 6834
citizenship	1	FALSE	207	Nep: 14367, USA: 6318, Jap: 6188, UK: 5071
expedition_role	1	FALSE	483	Cli: 43315, H-A: 13033, Lea: 9884, Exp: 1411
hired	1	FALSE	2	FAL: 59006, TRU: 13978
success	1	FALSE	2	FAL: 44913, TRU: 28071
solo	1	FALSE	2	FAL: 72868, TRU: 116
oxygen_used	1	FALSE	2	FAL: 55215, TRU: 17769
died	1	FALSE	2	no: 72055, die: 929
injured	1	FALSE	2	FAL: 71333, TRU: 1651

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1	2001.00	14.12	1905	1992	2004	2012	2019	▁▁▁▃▇
age	0	1	37.34	10.39	7	29	36	44	85	▁▇▅▁▁

Explore data

Solo, Hired, Oxygen Used, Success, Injured

Examine binary variables together.

Solo seems to be important.

data %>%
    select(hired:injured) %>%
    
    pivot_longer(-died) %>%

    filter(value == "TRUE") %>%
    
    group_by(died, name) %>%
    summarise(n_true = n()) %>%
    ungroup() %>%
    
    pivot_wider(names_from = died, values_from = n_true) %>%
    
    mutate(pct_died = died/no * 100) %>%
    
    ggplot(aes(pct_died, fct_reorder(name, pct_died))) +
    geom_col() +
    
    labs(y = NULL, x = "Percent Died",
         caption = "Solo seem to be relevant - 'Dangerous'")

Peak Name

Are some peaks more dangerous than others?

top10_peaks <- data %>% count(peak_name, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(peak_name)

data %>% 
    
    filter(peak_name %in% top10_peaks) %>%
    count(died, peak_name) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(pct_died, fct_reorder(peak_name, pct_died))) +
    geom_col() +
    
    labs(y = "Peak Names", x = "Percent Died")

Year

Has it become less dangerous to climb as the equipment has modernized?

data %>% 
    
    ggplot(aes(died, year)) +
    geom_boxplot()

Year by Decade

Try age by decade.

Climbers in their 80s have a much higher chance of dying.
Outside of the age group of 80s, the age seems to give more experience and reduces the likelihood of dying.

Based on this, we may remove climbers in their 80s (only five climbers are 80 or older) and use age_decade as a numeric data.

data %>%
    
    mutate(year_decade = year %/% 10 * 10) %>%
    count(died, year_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(year_decade, pct_died))  +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")

Season

Is winter more dangerous than summer?

data %>%
    
    count(died, season) %>%
    pivot_wider(names_from = died, values_from = n) %>%
    mutate(pct_died = (died / (died + no) * 100)) %>%
    
    ggplot(aes(pct_died, fct_reorder(season, pct_died))) +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")

***************************

Function to plot pct died

****************************

There are many categorical variables to find correlation with died. The same code repeats. Create a function.

plot_pct_died <- function(data, cat_var) {
    
    data %>%

    count(died, {{cat_var}}) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (died + no) * 100)) %>%
    
    ggplot(aes(pct_died, fct_reorder({{cat_var}}, pct_died))) +
    geom_col(fill = "midnightblue") +
    
    labs(y = NULL, x = "Percent Died")
    
}

Sex

Is winter more dangerous than summer?

data %>% plot_pct_died(cat_var = sex)

Age

Age doesn’t matter?

data %>% 
    
    ggplot(aes(died, age)) +
    geom_boxplot()

Age by Decade

Try age by decade.

Climbers in their 80s have a much higher chance of dying.
Outside of the age group of 80s, the age seems to give more experience and reduces the likelihood of dying.

data %>%
    
    mutate(age_decade = age %/% 10 * 10) %>%
    count(died, age_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%

    ggplot(aes(age_decade, pct_died)) +
    geom_col() %>%
    
    labs(y = "Percent Died", x = "Age by Decade")

data %>%
    
    mutate(age_decade = age %/% 10 * 10) %>%
    count(died, age_decade) %>%
    pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
    mutate(pct_died = (died / (no + died)) * 100) %>%
    filter(age_decade != 80) %>%
    
    ggplot(aes(age_decade, pct_died)) +
    geom_col() %>%
    
    labs(y = "Percent Died", x = "Age by Decade")

Citizenship

top10_citizen <- data %>% count(citizenship, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(citizenship)

data %>%
    filter(citizenship %in% top10_citizen)  %>%
    plot_pct_died(cat_var = citizenship)

Expedition Role

top10_role <- data %>% count(expedition_role, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(expedition_role)

data %>%
    filter(expedition_role %in% top10_role)  %>%
    plot_pct_died(cat_var = expedition_role)

Build Model

# data <- data %>% group_by(died) %>% sample_n(50) %>% ungroup()

library(tidymodels)

set.seed(123)
data_split <- initial_split(data, strata = died)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- vfold_cv(data_train, strata = died)
data_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [49264/5474]> Fold01
##  2 <split [49264/5474]> Fold02
##  3 <split [49264/5474]> Fold03
##  4 <split [49264/5474]> Fold04
##  5 <split [49264/5474]> Fold05
##  6 <split [49264/5474]> Fold06
##  7 <split [49264/5474]> Fold07
##  8 <split [49264/5474]> Fold08
##  9 <split [49265/5473]> Fold09
## 10 <split [49265/5473]> Fold10

library(themis) # for step_smote
library(embed)  # for step_lencode_glm

xgboost_recipe <-
  recipe(formula = died ~ ., data = data_train) %>%
    update_role(member_id, new_role = "id") %>%
    # step_lencode_glm(c(expedition_id, peak_name, citizenship, expedition_role), outcome = vars(died)) %>%  # for factors with many values
    step_other(peak_name, citizenship, expedition_role) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_zv(all_predictors()) %>%
    step_normalize(all_numeric_predictors()) %>%
    step_smote(died) # for unbalanced variable, died

# xgboost_recipe <- 
#   recipe(formula = died ~ ., data = data_train) %>%
#     update_role(member_id, new_role = "id") %>%
#     # step_lencode_glm(c(expedition_id, peak_name, citizenship, expedition_role), outcome = vars(died)) %>%  # for factors with many values
#     step_other(peak_name, citizenship, expedition_role) %>%
#     step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
#     step_zv(all_predictors()) %>%
#     step_normalize(all_numeric_predictors()) 

xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 108,068
## Columns: 35
## $ member_id                  <fct> AMAD78301-02, AMAD78301-05, AMAD78301-06, A…
## $ year                       <dbl> -1.635974, -1.635974, -1.635974, -1.635974,…
## $ age                        <dbl> 0.35136828, -0.32114464, -1.18580412, 0.351…
## $ died                       <fct> no, no, no, no, no, no, no, no, no, no, no,…
## $ peak_name_Ama.Dablam       <dbl> 2.812559, 2.812559, 2.812559, 2.812559, 2.8…
## $ peak_name_Cho.Oyu          <dbl> -0.363673, -0.363673, -0.363673, -0.363673,…
## $ peak_name_Everest          <dbl> -0.6366008, -0.6366008, -0.6366008, -0.6366…
## $ peak_name_Manaslu          <dbl> -0.2574662, -0.2574662, -0.2574662, -0.2574…
## $ peak_name_other            <dbl> -0.8516586, -0.8516586, -0.8516586, -0.8516…
## $ season_Autumn              <dbl> 1.066582, 1.066582, 1.066582, 1.066582, -0.…
## $ season_Spring              <dbl> -0.9927462, -0.9927462, -0.9927462, -0.9927…
## $ season_Summer              <dbl> -0.09336126, -0.09336126, -0.09336126, -0.0…
## $ season_Winter              <dbl> -0.1671621, -0.1671621, -0.1671621, -0.1671…
## $ sex_F                      <dbl> -0.3221255, -0.3221255, -0.3221255, -0.3221…
## $ sex_M                      <dbl> 0.3221255, 0.3221255, 0.3221255, 0.3221255,…
## $ citizenship_France         <dbl> 3.9004595, 3.9004595, 3.9004595, 3.9004595,…
## $ citizenship_Japan          <dbl> -0.3062252, -0.3062252, -0.3062252, -0.3062…
## $ citizenship_Nepal          <dbl> -0.493404, -0.493404, -0.493404, -0.493404,…
## $ citizenship_UK             <dbl> -0.2720068, -0.2720068, -0.2720068, -0.2720…
## $ citizenship_USA            <dbl> -0.3077572, -0.3077572, -0.3077572, -0.3077…
## $ citizenship_other          <dbl> -1.0027716, -1.0027716, -1.0027716, -1.0027…
## $ expedition_role_Climber    <dbl> -1.2099521, 0.8264639, 0.8264639, 0.8264639…
## $ expedition_role_H.A.Worker <dbl> -0.4644574, -0.4644574, -0.4644574, -0.4644…
## $ expedition_role_Leader     <dbl> -0.396114, -0.396114, -0.396114, -0.396114,…
## $ expedition_role_other      <dbl> 3.127181, -0.319771, -0.319771, -0.319771, …
## $ hired_FALSE.               <dbl> 0.4848862, 0.4848862, 0.4848862, 0.4848862,…
## $ hired_TRUE.                <dbl> -0.4848862, -0.4848862, -0.4848862, -0.4848…
## $ success_FALSE.             <dbl> 0.7921472, 0.7921472, 0.7921472, 0.7921472,…
## $ success_TRUE.              <dbl> -0.7921472, -0.7921472, -0.7921472, -0.7921…
## $ solo_FALSE.                <dbl> 0.0398985, 0.0398985, 0.0398985, 0.0398985,…
## $ solo_TRUE.                 <dbl> -0.0398985, -0.0398985, -0.0398985, -0.0398…
## $ oxygen_used_FALSE.         <dbl> 0.5681049, 0.5681049, 0.5681049, 0.5681049,…
## $ oxygen_used_TRUE.          <dbl> -0.5681049, -0.5681049, -0.5681049, -0.5681…
## $ injured_FALSE.             <dbl> 0.1515509, 0.1515509, 0.1515509, 0.1515509,…
## $ injured_TRUE.              <dbl> -0.1515509, -0.1515509, -0.1515509, -0.1515…

xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

doParallel::registerDoParallel()
set.seed(27358)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_folds,
            control = control_resamples(save_pred = TRUE))

Model evaluation

Show performance

collect_metrics(xgboost_tune)

## # A tibble: 20 × 10
##    trees min_n tree_depth learn_rate .metric  .estimator  mean     n  std_err
##    <int> <int>      <int>      <dbl> <chr>    <chr>      <dbl> <int>    <dbl>
##  1     6     2          5    0.00648 accuracy binary     0.613    10 0.0191  
##  2     6     2          5    0.00648 roc_auc  binary     0.666    10 0.0128  
##  3  1765     8          1    0.252   accuracy binary     0.805    10 0.00252 
##  4  1765     8          1    0.252   roc_auc  binary     0.676    10 0.0140  
##  5   257    10          7    0.0840  accuracy binary     0.985    10 0.000654
##  6   257    10          7    0.0840  roc_auc  binary     0.740    10 0.0126  
##  7  1867    15         14    0.00279 accuracy binary     0.963    10 0.00144 
##  8  1867    15         14    0.00279 roc_auc  binary     0.736    10 0.0140  
##  9  1057    20          3    0.00427 accuracy binary     0.692    10 0.00702 
## 10  1057    20          3    0.00427 roc_auc  binary     0.703    10 0.0124  
## 11   771    23         10    0.00102 accuracy binary     0.793    10 0.00490 
## 12   771    23         10    0.00102 roc_auc  binary     0.723    10 0.0139  
## 13  1531    26         13    0.171   accuracy binary     0.986    10 0.000568
## 14  1531    26         13    0.171   roc_auc  binary     0.732    10 0.00894 
## 15   966    29          7    0.0419  accuracy binary     0.986    10 0.000481
## 16   966    29          7    0.0419  roc_auc  binary     0.744    10 0.0123  
## 17  1233    33          9    0.0140  accuracy binary     0.985    10 0.000512
## 18  1233    33          9    0.0140  roc_auc  binary     0.738    10 0.0126  
## 19   582    39         11    0.0186  accuracy binary     0.983    10 0.000483
## 20   582    39         11    0.0186  roc_auc  binary     0.733    10 0.0124  
## # ℹ 1 more variable: .config <chr>

collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(died, .pred_died) %>%
    autoplot()

# conf_mat_resampled(xgboost_tune, tidy = FALSE) %>%
#     autoplot()

Finalize workflow (last_fit)

final_fitted <- xgboost_workflow %>%
    
    # Fit the best model to the train data
    finalize_workflow(select_best(xgboost_tune, "accuracy")) %>%
    
    # Evaluate on the test data
    last_fit(data_split)

collect_metrics(final_fitted)

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.986 Preprocessor1_Model1
## 2 roc_auc  binary         0.728 Preprocessor1_Model1

collect_predictions(final_fitted) %>%
    conf_mat(died, .pred_class) %>%
    autoplot()

Variable importance

library(vip)

final_fitted %>%
  extract_fit_engine() %>%
  vip()

Apply 5