library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members %>%
count(died)
## # A tibble: 2 × 2
## died n
## <lgl> <int>
## 1 FALSE 75413
## 2 TRUE 1106
data <- members %>%
select(-death_cause, -injury_type, -highpoint_metres, -death_height_metres, -injury_height_metres) %>%
drop_na() %>%
select(-peak_id, -expedition_id) %>%
distinct(member_id, .keep_all = TRUE) %>%
mutate(died = case_when(died == "TRUE" ~ "died", died == "FALSE" ~ "no")) %>%
mutate(across(where(is.character), factor)) %>%
mutate(across(where(is.logical), factor)) %>%
mutate(member_id = as.character(member_id))
skimr::skim(data)
Name | data |
Number of rows | 72984 |
Number of columns | 14 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 11 |
numeric | 2 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
member_id | 0 | 1 | 12 | 12 | 0 | 72984 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
peak_name | 0 | 1 | FALSE | 390 | Eve: 20994, Cho: 8608, Ama: 8235, Man: 4510 |
season | 0 | 1 | FALSE | 4 | Spr: 36150, Aut: 34186, Win: 2011, Sum: 637 |
sex | 0 | 1 | FALSE | 2 | M: 66150, F: 6834 |
citizenship | 0 | 1 | FALSE | 207 | Nep: 14367, USA: 6318, Jap: 6188, UK: 5071 |
expedition_role | 0 | 1 | FALSE | 483 | Cli: 43315, H-A: 13033, Lea: 9884, Exp: 1411 |
hired | 0 | 1 | FALSE | 2 | FAL: 59006, TRU: 13978 |
success | 0 | 1 | FALSE | 2 | FAL: 44913, TRU: 28071 |
solo | 0 | 1 | FALSE | 2 | FAL: 72868, TRU: 116 |
oxygen_used | 0 | 1 | FALSE | 2 | FAL: 55215, TRU: 17769 |
died | 0 | 1 | FALSE | 2 | no: 72055, die: 929 |
injured | 0 | 1 | FALSE | 2 | FAL: 71333, TRU: 1651 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1 | 2001.00 | 14.12 | 1905 | 1992 | 2004 | 2012 | 2019 | ▁▁▁▃▇ |
age | 0 | 1 | 37.34 | 10.39 | 7 | 29 | 36 | 44 | 85 | ▁▇▅▁▁ |
data %>%
select(hired:injured) %>%
pivot_longer(-died) %>%
filter(value == "TRUE") %>%
group_by(died, name) %>%
summarise(n_true = n()) %>%
ungroup() %>%
pivot_wider(names_from = died, values_from = n_true) %>%
mutate(pct_died = died/no * 100) %>%
ggplot(aes(pct_died, fct_reorder(name, pct_died))) +
geom_col() +
labs(y = NULL, x = "Percent Died",
caption = "Solo seem to be relevant - 'Dangerous'")
## `summarise()` has grouped output by 'died'. You can override using the
## `.groups` argument.
top10_peaks <- data %>% count(peak_name, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(peak_name)
data %>%
filter(peak_name %in% top10_peaks) %>%
count(died, peak_name) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
ggplot(aes(pct_died, fct_reorder(peak_name, pct_died))) +
geom_col() +
labs(y = "Peak Names", x = "Percent Died")
data %>%
ggplot(aes(died, year)) +
geom_boxplot()
data %>%
mutate(year_decade = year %/% 10 * 10) %>%
count(died, year_decade) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
ggplot(aes(year_decade, pct_died)) +
geom_col(fill = "midnightblue") +
labs(y = NULL, x = "Percent Died")
data %>%
count(died, season) %>%
pivot_wider(names_from = died, values_from = n) %>%
mutate(pct_died = (died / (died + no) * 100)) %>%
ggplot(aes(pct_died, fct_reorder(season, pct_died))) +
geom_col(fill = "midnightblue") +
labs(y = NULL, x = "Percent Died")
plot_pct_died <- function(data, cat_var) {
data %>%
count(died, {{cat_var}}) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (died + no) * 100)) %>%
ggplot(aes(pct_died, fct_reorder({{cat_var}}, pct_died))) +
geom_col(fill = "midnightblue") +
labs(y = NULL, x = "Percent Died")
}
data %>% plot_pct_died(cat_var = sex)
data %>%
ggplot(aes(died, age)) +
geom_boxplot()
data %>%
mutate(age_decade = age %/% 10 * 10) %>%
count(died, age_decade) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
ggplot(aes(age_decade, pct_died)) +
geom_col() %>%
labs(y = "Percent Died", x = "Age by Decade")
data %>%
mutate(age_decade = age %/% 10 * 10) %>%
count(died, age_decade) %>%
pivot_wider(names_from = died, values_from = n, values_fill = 0) %>%
mutate(pct_died = (died / (no + died)) * 100) %>%
filter(age_decade != 80) %>%
ggplot(aes(age_decade, pct_died)) +
geom_col() %>%
labs(y = "Percent Died", x = "Age by Decade")
top10_citizen <- data %>% count(citizenship, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(citizenship)
data %>%
filter(citizenship %in% top10_citizen) %>%
plot_pct_died(cat_var = citizenship)
top10_role <- data %>% count(expedition_role, sort = TRUE) %>% slice_max(order_by = n, n = 10) %>% pull(expedition_role)
data %>%
filter(expedition_role %in% top10_role) %>%
plot_pct_died(cat_var = expedition_role)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
data_split <- initial_split(data, strata = died)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(234)
data_folds <- vfold_cv(data_train, strata = died)
data_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [49264/5474]> Fold01
## 2 <split [49264/5474]> Fold02
## 3 <split [49264/5474]> Fold03
## 4 <split [49264/5474]> Fold04
## 5 <split [49264/5474]> Fold05
## 6 <split [49264/5474]> Fold06
## 7 <split [49264/5474]> Fold07
## 8 <split [49264/5474]> Fold08
## 9 <split [49265/5473]> Fold09
## 10 <split [49265/5473]> Fold10
library(themis)
library(embed)
xgboost_recipe <-
recipe(formula = died ~ ., data = data_train) %>%
update_role(member_id, new_role = "id") %>%
step_other(peak_name, citizenship, expedition_role) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_smote(died)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 108,068
## Columns: 35
## $ member_id <fct> AMAD78301-02, AMAD78301-05, AMAD78301-06, A…
## $ year <dbl> -1.635974, -1.635974, -1.635974, -1.635974,…
## $ age <dbl> 0.35136828, -0.32114464, -1.18580412, 0.351…
## $ died <fct> no, no, no, no, no, no, no, no, no, no, no,…
## $ peak_name_Ama.Dablam <dbl> 2.812559, 2.812559, 2.812559, 2.812559, 2.8…
## $ peak_name_Cho.Oyu <dbl> -0.363673, -0.363673, -0.363673, -0.363673,…
## $ peak_name_Everest <dbl> -0.6366008, -0.6366008, -0.6366008, -0.6366…
## $ peak_name_Manaslu <dbl> -0.2574662, -0.2574662, -0.2574662, -0.2574…
## $ peak_name_other <dbl> -0.8516586, -0.8516586, -0.8516586, -0.8516…
## $ season_Autumn <dbl> 1.066582, 1.066582, 1.066582, 1.066582, -0.…
## $ season_Spring <dbl> -0.9927462, -0.9927462, -0.9927462, -0.9927…
## $ season_Summer <dbl> -0.09336126, -0.09336126, -0.09336126, -0.0…
## $ season_Winter <dbl> -0.1671621, -0.1671621, -0.1671621, -0.1671…
## $ sex_F <dbl> -0.3221255, -0.3221255, -0.3221255, -0.3221…
## $ sex_M <dbl> 0.3221255, 0.3221255, 0.3221255, 0.3221255,…
## $ citizenship_France <dbl> 3.9004595, 3.9004595, 3.9004595, 3.9004595,…
## $ citizenship_Japan <dbl> -0.3062252, -0.3062252, -0.3062252, -0.3062…
## $ citizenship_Nepal <dbl> -0.493404, -0.493404, -0.493404, -0.493404,…
## $ citizenship_UK <dbl> -0.2720068, -0.2720068, -0.2720068, -0.2720…
## $ citizenship_USA <dbl> -0.3077572, -0.3077572, -0.3077572, -0.3077…
## $ citizenship_other <dbl> -1.0027716, -1.0027716, -1.0027716, -1.0027…
## $ expedition_role_Climber <dbl> -1.2099521, 0.8264639, 0.8264639, 0.8264639…
## $ expedition_role_H.A.Worker <dbl> -0.4644574, -0.4644574, -0.4644574, -0.4644…
## $ expedition_role_Leader <dbl> -0.396114, -0.396114, -0.396114, -0.396114,…
## $ expedition_role_other <dbl> 3.127181, -0.319771, -0.319771, -0.319771, …
## $ hired_FALSE. <dbl> 0.4848862, 0.4848862, 0.4848862, 0.4848862,…
## $ hired_TRUE. <dbl> -0.4848862, -0.4848862, -0.4848862, -0.4848…
## $ success_FALSE. <dbl> 0.7921472, 0.7921472, 0.7921472, 0.7921472,…
## $ success_TRUE. <dbl> -0.7921472, -0.7921472, -0.7921472, -0.7921…
## $ solo_FALSE. <dbl> 0.0398985, 0.0398985, 0.0398985, 0.0398985,…
## $ solo_TRUE. <dbl> -0.0398985, -0.0398985, -0.0398985, -0.0398…
## $ oxygen_used_FALSE. <dbl> 0.5681049, 0.5681049, 0.5681049, 0.5681049,…
## $ oxygen_used_TRUE. <dbl> -0.5681049, -0.5681049, -0.5681049, -0.5681…
## $ injured_FALSE. <dbl> 0.1515509, 0.1515509, 0.1515509, 0.1515509,…
## $ injured_TRUE. <dbl> -0.1515509, -0.1515509, -0.1515509, -0.1515…
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
doParallel::registerDoParallel()
set.seed(27358)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_folds,
control = control_resamples(save_pred = TRUE))
collect_metrics(xgboost_tune)
## # A tibble: 20 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 6 2 5 0.00648 accuracy binary 0.613 10 0.0191
## 2 6 2 5 0.00648 roc_auc binary 0.666 10 0.0128
## 3 1765 8 1 0.252 accuracy binary 0.805 10 0.00260
## 4 1765 8 1 0.252 roc_auc binary 0.675 10 0.0140
## 5 257 10 7 0.0840 accuracy binary 0.985 10 0.000627
## 6 257 10 7 0.0840 roc_auc binary 0.735 10 0.0126
## 7 1867 15 14 0.00279 accuracy binary 0.963 10 0.00154
## 8 1867 15 14 0.00279 roc_auc binary 0.736 10 0.0138
## 9 1057 20 3 0.00427 accuracy binary 0.692 10 0.00690
## 10 1057 20 3 0.00427 roc_auc binary 0.703 10 0.0125
## 11 771 23 10 0.00102 accuracy binary 0.794 10 0.00474
## 12 771 23 10 0.00102 roc_auc binary 0.723 10 0.0140
## 13 1531 26 13 0.171 accuracy binary 0.986 10 0.000557
## 14 1531 26 13 0.171 roc_auc binary 0.730 10 0.00918
## 15 966 29 7 0.0419 accuracy binary 0.986 10 0.000545
## 16 966 29 7 0.0419 roc_auc binary 0.742 10 0.0127
## 17 1233 33 9 0.0140 accuracy binary 0.985 10 0.000531
## 18 1233 33 9 0.0140 roc_auc binary 0.738 10 0.0129
## 19 582 39 11 0.0186 accuracy binary 0.982 10 0.000572
## 20 582 39 11 0.0186 roc_auc binary 0.735 10 0.0128
## # ℹ 1 more variable: .config <chr>
collect_predictions(xgboost_tune) %>%
group_by(id) %>%
roc_curve(died, .pred_died) %>%
autoplot()
final_fitted <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, "accuracy")) %>%
last_fit(data_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.986 Preprocessor1_Model1
## 2 roc_auc binary 0.733 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
conf_mat(died, .pred_class) %>%
autoplot()
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.986 Preprocessor1_Model1
## 2 roc_auc binary 0.733 Preprocessor1_Model1
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
final_fitted %>%
extract_fit_engine() %>%
vip()
The original data was made up of 76519 observations and 21 variable with its key characteristics used for my prediction being the ““observed”expedition_role” variable which was used to predict if a person died or not.
The dataset focuses on two key variables: ‘expedition_role’ and ‘died’. ‘expedition_role’ describes roles held by expedition members, such as “guide” or “scientist”, and is processed using term frequency-inverse document frequency (TF-IDF). The ‘died’ variable is categorical, indicating if an expedition member died, with categories “died” and “did not die”. It acts as the predictive model’s target, exploring the link between roles and fatality outcomes.
The original data underwent significant transformations for modeling purposes. The ‘died’ variable, originally boolean, was recategorized into “died” and “did not die” for clarity. The ‘expedition_role’ variable was tokenized, filtered, and subjected to term frequency-inverse document frequency (TF-IDF) processing. These transformations made the data more suitable for machine learning, aiding in discerning patterns related to expedition roles and fatality outcomes. Converting raw data to this structured form improves model performance and interpretability.
The data preparation involved several key steps: recategorizing the ‘died’ variable, tokenizing the ‘expedition_role’ variable, filtering tokens, and applying term frequency-inverse document frequency (TF-IDF) processing on ‘expedition_role’. These steps transformed the data into a more structured format suitable for machine learning modeling.
The machine learning model used in the analysis is the logistic regression model, specifically implemented with the glmnet engine.
The model evaluation employed two metrics: accuracy, which measures the proportion of correct predictions, and roc_auc, which assesses the model’s ability to distinguish between classes based on the area under the receiver operating characteristic curve.
The major findings from the analysis revealed a correlation between expedition roles and mortality rates. Notably, the ‘guide’ role exhibited a high likelihood of survival, while the ‘porter’ role had an increased risk of death. The logistic regression model demonstrated a 55.7% ROC_AUC, suggesting limited predictive power.