Himalayan Climbing Expeditions: The dataset is a compilation of records for all expeditions that have climbed in the Nepal Himalaya. Build a classification model to predict whether the person died (died). Use the members dataset.
library(tidyverse)
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
members %>% skimr::skim()
Name | Piped data |
Number of rows | 76519 |
Number of columns | 21 |
_______________________ | |
Column type frequency: | |
character | 10 |
logical | 6 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
expedition_id | 0 | 1.00 | 9 | 9 | 0 | 10350 | 0 |
member_id | 0 | 1.00 | 12 | 12 | 0 | 76518 | 0 |
peak_id | 0 | 1.00 | 4 | 4 | 0 | 391 | 0 |
peak_name | 15 | 1.00 | 4 | 25 | 0 | 390 | 0 |
season | 0 | 1.00 | 6 | 7 | 0 | 5 | 0 |
sex | 2 | 1.00 | 1 | 1 | 0 | 2 | 0 |
citizenship | 10 | 1.00 | 2 | 23 | 0 | 212 | 0 |
expedition_role | 21 | 1.00 | 4 | 25 | 0 | 524 | 0 |
death_cause | 75413 | 0.01 | 3 | 27 | 0 | 12 | 0 |
injury_type | 74807 | 0.02 | 3 | 27 | 0 | 11 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
hired | 0 | 1 | 0.21 | FAL: 60788, TRU: 15731 |
success | 0 | 1 | 0.38 | FAL: 47320, TRU: 29199 |
solo | 0 | 1 | 0.00 | FAL: 76398, TRU: 121 |
oxygen_used | 0 | 1 | 0.24 | FAL: 58286, TRU: 18233 |
died | 0 | 1 | 0.01 | FAL: 75413, TRU: 1106 |
injured | 0 | 1 | 0.02 | FAL: 74806, TRU: 1713 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1.00 | 2000.36 | 14.78 | 1905 | 1991 | 2004 | 2012 | 2019 | ▁▁▁▃▇ |
age | 3497 | 0.95 | 37.33 | 10.40 | 7 | 29 | 36 | 44 | 85 | ▁▇▅▁▁ |
highpoint_metres | 21833 | 0.71 | 7470.68 | 1040.06 | 3800 | 6700 | 7400 | 8400 | 8850 | ▁▁▆▃▇ |
death_height_metres | 75451 | 0.01 | 6592.85 | 1308.19 | 400 | 5800 | 6600 | 7550 | 8830 | ▁▁▂▇▆ |
injury_height_metres | 75510 | 0.01 | 7049.91 | 1214.24 | 400 | 6200 | 7100 | 8000 | 8880 | ▁▁▂▇▇ |
Notes about the dataset
data <- members %>%
# Drop variables with too many missing values
select(-death_cause, -injury_type, -highpoint_metres, -death_height_metres, -injury_height_metres) %>%
# Drop observations with missing values
drop_na() %>%
# Remove a redundant variable
select(-peak_id, -expedition_id) %>%
# Remove one of the two members with the identical id
distinct(member_id, .keep_all = TRUE) %>%
# Recode died
mutate(died = case_when(died == "TRUE" ~ "died", died == "FALSE" ~ "no")) %>%
# Convert character variables to factors
mutate(across(where(is.character), factor)) %>%
# Convert logical variables to factors
mutate(across(where(is.logical), factor)) %>%
# id should be character
mutate(member_id = as.character(member_id))
skimr::skim(data)
Name | data |
Number of rows | 72984 |
Number of columns | 14 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 11 |
numeric | 2 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
member_id | 0 | 1 | 12 | 12 | 0 | 72984 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
peak_name | 0 | 1 | FALSE | 390 | Eve: 20994, Cho: 8608, Ama: 8235, Man: 4510 |
season | 0 | 1 | FALSE | 4 | Spr: 36150, Aut: 34186, Win: 2011, Sum: 637 |
sex | 0 | 1 | FALSE | 2 | M: 66150, F: 6834 |
citizenship | 0 | 1 | FALSE | 207 | Nep: 14367, USA: 6318, Jap: 6188, UK: 5071 |
expedition_role | 0 | 1 | FALSE | 483 | Cli: 43315, H-A: 13033, Lea: 9884, Exp: 1411 |
hired | 0 | 1 | FALSE | 2 | FAL: 59006, TRU: 13978 |
success | 0 | 1 | FALSE | 2 | FAL: 44913, TRU: 28071 |
solo | 0 | 1 | FALSE | 2 | FAL: 72868, TRU: 116 |
oxygen_used | 0 | 1 | FALSE | 2 | FAL: 55215, TRU: 17769 |
died | 0 | 1 | FALSE | 2 | no: 72055, die: 929 |
injured | 0 | 1 | FALSE | 2 | FAL: 71333, TRU: 1651 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1 | 2001.00 | 14.12 | 1905 | 1992 | 2004 | 2012 | 2019 | ▁▁▁▃▇ |
age | 0 | 1 | 37.34 | 10.39 | 7 | 29 | 36 | 44 | 85 | ▁▇▅▁▁ |
Examine binary variables together.
Solo seems to be important.
data %>%
select(hired:injured) %>%
pivot_longer(-died) %>%
filter(value == "TRUE") %>%
group_by(died, name) %>%
summarise(n_true = n()) %>%
ungroup() %>%
pivot_wider(names_from = died, values_from = n_true) %>%
mutate(pct_died = died/no * 100) %>%
ggplot(aes(pct_died, fct_reorder(name, pct_died))) +
geom_col() +
labs(y = NULL, x = "Percent Died",
caption = "Solo seem to be relevant - 'Dangerous'")
Are some peaks more dangerous than others?
top10_peaks <- data %>% count(peak_name, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(peak_name)
data %>%
filter(peak_name %in% top10_peaks) %>%
count(died, peak_name) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
ggplot(aes(pct_died, fct_reorder(peak_name, pct_died))) +
geom_col() +
labs(y = "Peak Names", x = "Percent Died")
Has it become less dangerous to climb as the equipment has modernized?
data %>%
ggplot(aes(died, year)) +
geom_boxplot()
Try age by decade.
Based on this, we may remove climbers in their 80s (only five climbers are 80 or older) and use age_decade as a numeric data.
data %>%
mutate(year_decade = year %/% 10 * 10) %>%
count(died, year_decade) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
ggplot(aes(year_decade, pct_died)) +
geom_col(fill = "midnightblue") +
labs(y = NULL, x = "Percent Died")
Is winter more dangerous than summer?
data %>%
count(died, season) %>%
pivot_wider(names_from = died, values_from = n) %>%
mutate(pct_died = (died / (died + no) * 100)) %>%
ggplot(aes(pct_died, fct_reorder(season, pct_died))) +
geom_col(fill = "midnightblue") +
labs(y = NULL, x = "Percent Died")
There are many categorical variables to find correlation with died. The same code repeats. Create a function.
plot_pct_died <- function(data, cat_var) {
data %>%
count(died, {{cat_var}}) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (died + no) * 100)) %>%
ggplot(aes(pct_died, fct_reorder({{cat_var}}, pct_died))) +
geom_col(fill = "midnightblue") +
labs(y = NULL, x = "Percent Died")
}
Is winter more dangerous than summer?
data %>% plot_pct_died(cat_var = sex)
Age doesn’t matter?
data %>%
ggplot(aes(died, age)) +
geom_boxplot()
Try age by decade.
data %>%
mutate(age_decade = age %/% 10 * 10) %>%
count(died, age_decade) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
ggplot(aes(age_decade, pct_died)) +
geom_col() %>%
labs(y = "Percent Died", x = "Age by Decade")
data %>%
mutate(age_decade = age %/% 10 * 10) %>%
count(died, age_decade) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
filter(age_decade != 80) %>%
ggplot(aes(age_decade, pct_died)) +
geom_col() %>%
labs(y = "Percent Died", x = "Age by Decade")
top10_citizen <- data %>% count(citizenship, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(citizenship)
data %>%
filter(citizenship %in% top10_citizen) %>%
plot_pct_died(cat_var = citizenship)
top10_role <- data %>% count(expedition_role, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(expedition_role)
data %>%
filter(expedition_role %in% top10_role) %>%
plot_pct_died(cat_var = expedition_role)
# data <- data %>% group_by(died) %>% sample_n(50) %>% ungroup()
library(tidymodels)
set.seed(123)
data_split <- initial_split(data, strata = died)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(234)
data_folds <- vfold_cv(data_train, strata = died)
data_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [49264/5474]> Fold01
## 2 <split [49264/5474]> Fold02
## 3 <split [49264/5474]> Fold03
## 4 <split [49264/5474]> Fold04
## 5 <split [49264/5474]> Fold05
## 6 <split [49264/5474]> Fold06
## 7 <split [49264/5474]> Fold07
## 8 <split [49264/5474]> Fold08
## 9 <split [49265/5473]> Fold09
## 10 <split [49265/5473]> Fold10
library(themis) # for step_smote
library(embed) # for step_lencode_glm
xgboost_recipe <-
recipe(formula = died ~ ., data = data_train) %>%
update_role(member_id, new_role = "id") %>%
# step_lencode_glm(c(expedition_id, peak_name, citizenship, expedition_role), outcome = vars(died)) %>% # for factors with many values
step_other(peak_name, citizenship, expedition_role) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_smote(died) # for unbalanced variable, died
# xgboost_recipe <-
# recipe(formula = died ~ ., data = data_train) %>%
# update_role(member_id, new_role = "id") %>%
# # step_lencode_glm(c(expedition_id, peak_name, citizenship, expedition_role), outcome = vars(died)) %>% # for factors with many values
# step_other(peak_name, citizenship, expedition_role) %>%
# step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
# step_zv(all_predictors()) %>%
# step_normalize(all_numeric_predictors())
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 108,068
## Columns: 35
## $ member_id <fct> AMAD78301-02, AMAD78301-05, AMAD78301-06, A…
## $ year <dbl> -1.635974, -1.635974, -1.635974, -1.635974,…
## $ age <dbl> 0.35136828, -0.32114464, -1.18580412, 0.351…
## $ died <fct> no, no, no, no, no, no, no, no, no, no, no,…
## $ peak_name_Ama.Dablam <dbl> 2.812559, 2.812559, 2.812559, 2.812559, 2.8…
## $ peak_name_Cho.Oyu <dbl> -0.363673, -0.363673, -0.363673, -0.363673,…
## $ peak_name_Everest <dbl> -0.6366008, -0.6366008, -0.6366008, -0.6366…
## $ peak_name_Manaslu <dbl> -0.2574662, -0.2574662, -0.2574662, -0.2574…
## $ peak_name_other <dbl> -0.8516586, -0.8516586, -0.8516586, -0.8516…
## $ season_Autumn <dbl> 1.066582, 1.066582, 1.066582, 1.066582, -0.…
## $ season_Spring <dbl> -0.9927462, -0.9927462, -0.9927462, -0.9927…
## $ season_Summer <dbl> -0.09336126, -0.09336126, -0.09336126, -0.0…
## $ season_Winter <dbl> -0.1671621, -0.1671621, -0.1671621, -0.1671…
## $ sex_F <dbl> -0.3221255, -0.3221255, -0.3221255, -0.3221…
## $ sex_M <dbl> 0.3221255, 0.3221255, 0.3221255, 0.3221255,…
## $ citizenship_France <dbl> 3.9004595, 3.9004595, 3.9004595, 3.9004595,…
## $ citizenship_Japan <dbl> -0.3062252, -0.3062252, -0.3062252, -0.3062…
## $ citizenship_Nepal <dbl> -0.493404, -0.493404, -0.493404, -0.493404,…
## $ citizenship_UK <dbl> -0.2720068, -0.2720068, -0.2720068, -0.2720…
## $ citizenship_USA <dbl> -0.3077572, -0.3077572, -0.3077572, -0.3077…
## $ citizenship_other <dbl> -1.0027716, -1.0027716, -1.0027716, -1.0027…
## $ expedition_role_Climber <dbl> -1.2099521, 0.8264639, 0.8264639, 0.8264639…
## $ expedition_role_H.A.Worker <dbl> -0.4644574, -0.4644574, -0.4644574, -0.4644…
## $ expedition_role_Leader <dbl> -0.396114, -0.396114, -0.396114, -0.396114,…
## $ expedition_role_other <dbl> 3.127181, -0.319771, -0.319771, -0.319771, …
## $ hired_FALSE. <dbl> 0.4848862, 0.4848862, 0.4848862, 0.4848862,…
## $ hired_TRUE. <dbl> -0.4848862, -0.4848862, -0.4848862, -0.4848…
## $ success_FALSE. <dbl> 0.7921472, 0.7921472, 0.7921472, 0.7921472,…
## $ success_TRUE. <dbl> -0.7921472, -0.7921472, -0.7921472, -0.7921…
## $ solo_FALSE. <dbl> 0.0398985, 0.0398985, 0.0398985, 0.0398985,…
## $ solo_TRUE. <dbl> -0.0398985, -0.0398985, -0.0398985, -0.0398…
## $ oxygen_used_FALSE. <dbl> 0.5681049, 0.5681049, 0.5681049, 0.5681049,…
## $ oxygen_used_TRUE. <dbl> -0.5681049, -0.5681049, -0.5681049, -0.5681…
## $ injured_FALSE. <dbl> 0.1515509, 0.1515509, 0.1515509, 0.1515509,…
## $ injured_TRUE. <dbl> -0.1515509, -0.1515509, -0.1515509, -0.1515…
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
doParallel::registerDoParallel()
set.seed(27358)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_folds,
control = control_resamples(save_pred = TRUE))
collect_metrics(xgboost_tune)
## # A tibble: 20 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 6 2 5 0.00648 accuracy binary 0.613 10 0.0191
## 2 6 2 5 0.00648 roc_auc binary 0.666 10 0.0128
## 3 1765 8 1 0.252 accuracy binary 0.805 10 0.00252
## 4 1765 8 1 0.252 roc_auc binary 0.676 10 0.0140
## 5 257 10 7 0.0840 accuracy binary 0.985 10 0.000654
## 6 257 10 7 0.0840 roc_auc binary 0.740 10 0.0126
## 7 1867 15 14 0.00279 accuracy binary 0.963 10 0.00144
## 8 1867 15 14 0.00279 roc_auc binary 0.736 10 0.0140
## 9 1057 20 3 0.00427 accuracy binary 0.692 10 0.00702
## 10 1057 20 3 0.00427 roc_auc binary 0.703 10 0.0124
## 11 771 23 10 0.00102 accuracy binary 0.793 10 0.00490
## 12 771 23 10 0.00102 roc_auc binary 0.723 10 0.0139
## 13 1531 26 13 0.171 accuracy binary 0.986 10 0.000568
## 14 1531 26 13 0.171 roc_auc binary 0.732 10 0.00894
## 15 966 29 7 0.0419 accuracy binary 0.986 10 0.000481
## 16 966 29 7 0.0419 roc_auc binary 0.744 10 0.0123
## 17 1233 33 9 0.0140 accuracy binary 0.985 10 0.000512
## 18 1233 33 9 0.0140 roc_auc binary 0.738 10 0.0126
## 19 582 39 11 0.0186 accuracy binary 0.983 10 0.000483
## 20 582 39 11 0.0186 roc_auc binary 0.733 10 0.0124
## # ℹ 1 more variable: .config <chr>
collect_predictions(xgboost_tune) %>%
group_by(id) %>%
roc_curve(died, .pred_died) %>%
autoplot()
# conf_mat_resampled(xgboost_tune, tidy = FALSE) %>%
# autoplot()
final_fitted <- xgboost_workflow %>%
# Fit the best model to the train data
finalize_workflow(select_best(xgboost_tune, "accuracy")) %>%
# Evaluate on the test data
last_fit(data_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.986 Preprocessor1_Model1
## 2 roc_auc binary 0.728 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
conf_mat(died, .pred_class) %>%
autoplot()
library(vip)
final_fitted %>%
extract_fit_engine() %>%
vip()