library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top_peaks <- members_raw %>% 
    count(peak_name, sort = TRUE) %>% 
    slice_max(n, n = 6) %>% 
    pull(peak_name)

members_raw %>% 
    filter(peak_name %in% top_peaks) %>% 
    count(peak_name, died) %>% 
    ggplot(aes(died, n, fill = died)) + 
    geom_col(show.legend = FALSE) + 
    facet_wrap(vars(peak_name), scales = "free")

members_raw %>% 
    count(died, citizenship)
## # A tibble: 267 × 3
##    died  citizenship               n
##    <lgl> <chr>                 <int>
##  1 FALSE Albania                   6
##  2 FALSE Algeria                   2
##  3 FALSE Andorra                  31
##  4 FALSE Argentina               232
##  5 FALSE Argentina/Canada          2
##  6 FALSE Armenia                   3
##  7 FALSE Australia              1395
##  8 FALSE Australia/Greece          1
##  9 FALSE Australia/Ireland         2
## 10 FALSE Australia/New Zealand    17
## # ℹ 257 more rows
top_role <- members_raw %>% 
    count(expedition_role, sort = TRUE) %>% 
    slice_max(n, n = 6) %>% 
    pull(expedition_role)

members_raw %>% 
    filter(expedition_role %in% top_role) %>% 
    count(expedition_role, died) %>% 
    ggplot(aes(died, n, fill = died)) + 
    geom_col(show.legend = FALSE) + 
    facet_wrap(vars(expedition_role), scales = "free_y")

members <- members_raw %>% 
    select(expedition_id, peak_name, year, age, died, citizenship, expedition_role, season, age) %>% 
    na.omit() %>% 
    mutate_if(is.character, as.factor) %>% 
    mutate(expedition_id = as.character(expedition_id)) %>%
    mutate(across(where(is.logical), factor))

# members <- members %>% sample_n(5000)

Feature Engineering

members_small_true <- members %>% filter(died == "TRUE") %>% sample_n(200)
members_small_false <- members %>% filter(died == "FALSE") %>% sample_n(200)

members <- bind_rows(members_small_true, members_small_false)

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
set.seed(123)
member_split <- initial_split(members, strata = died)
member_train <- training(member_split)
member_test <- testing(member_split)

set.seed(234)
member_folds <- vfold_cv(member_train, strata = died)
member_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [270/30]> Fold01
##  2 <split [270/30]> Fold02
##  3 <split [270/30]> Fold03
##  4 <split [270/30]> Fold04
##  5 <split [270/30]> Fold05
##  6 <split [270/30]> Fold06
##  7 <split [270/30]> Fold07
##  8 <split [270/30]> Fold08
##  9 <split [270/30]> Fold09
## 10 <split [270/30]> Fold10
library(embed)

member_rec <- 
    recipe(died ~ ., data = member_train) %>% 
    update_role(expedition_id, new_role = "id") %>% 
    step_lencode_glm(peak_name, outcome = vars(died)) %>%    
    step_dummy(all_nominal_predictors())

member_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:   1
## predictor: 6
## id:        1
## 
## ── Operations
## • Linear embedding for factors via GLM for: peak_name
## • Dummy variables from: all_nominal_predictors()
prep(member_rec) %>%
  tidy(number = 1) %>%
  filter(level == "..new")
## New names:
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...345`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...346`
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...361`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...370`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...571`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...587`
## # A tibble: 1 × 4
##   level   value terms     id               
##   <chr>   <dbl> <chr>     <chr>            
## 1 ..new -0.0186 peak_name lencode_glm_9EDsX

Build a Model

xgb_spec <- 
    boost_tree(
        trees = tune(),
        min_n = tune(), 
        mtry = tune(), 
        learn_rate = 0.01) %>% 
    set_engine("xgboost") %>% 
    set_mode("classification")

xgb_wf <- workflow(member_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()

set.seed(345)
xgb_rs <- tune_race_anova(
  xgb_wf,
  resamples = member_folds,
  grid = 15,
  control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## New names:
## ℹ Racing will maximize the roc_auc metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 2 eliminated; 13 candidates remain.
## ℹ Fold07: 0 eliminated; 13 candidates remain.
## ℹ Fold03: 0 eliminated; 13 candidates remain.
## ℹ Fold05: 0 eliminated; 13 candidates remain.
## ℹ Fold09: 0 eliminated; 13 candidates remain.
## ℹ Fold04: 0 eliminated; 13 candidates remain.
## ℹ Fold06: 1 eliminated; 12 candidates remain.
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...345`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...346`
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...361`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...370`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...571`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...587`
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits           id     .order .metrics          .notes          
##    <list>           <chr>   <int> <list>            <list>          
##  1 <split [270/30]> Fold01      2 <tibble [30 × 7]> <tibble [0 × 3]>
##  2 <split [270/30]> Fold02      3 <tibble [30 × 7]> <tibble [0 × 3]>
##  3 <split [270/30]> Fold10      1 <tibble [30 × 7]> <tibble [0 × 3]>
##  4 <split [270/30]> Fold07      4 <tibble [26 × 7]> <tibble [0 × 3]>
##  5 <split [270/30]> Fold03      5 <tibble [26 × 7]> <tibble [0 × 3]>
##  6 <split [270/30]> Fold05      6 <tibble [26 × 7]> <tibble [0 × 3]>
##  7 <split [270/30]> Fold09      7 <tibble [26 × 7]> <tibble [0 × 3]>
##  8 <split [270/30]> Fold04      8 <tibble [26 × 7]> <tibble [0 × 3]>
##  9 <split [270/30]> Fold06      9 <tibble [26 × 7]> <tibble [0 × 3]>
## 10 <split [270/30]> Fold08     10 <tibble [24 × 7]> <tibble [0 × 3]>
plot_race(xgb_rs)

collect_metrics(xgb_rs)
## # A tibble: 24 × 9
##     mtry trees min_n .metric  .estimator  mean     n std_err .config            
##    <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>              
##  1    41   599     8 accuracy binary     0.603    10  0.0324 Preprocessor1_Mode…
##  2    41   599     8 roc_auc  binary     0.645    10  0.0329 Preprocessor1_Mode…
##  3    88  1805    31 accuracy binary     0.617    10  0.0229 Preprocessor1_Mode…
##  4    88  1805    31 roc_auc  binary     0.650    10  0.0295 Preprocessor1_Mode…
##  5   120   136     3 accuracy binary     0.603    10  0.0270 Preprocessor1_Mode…
##  6   120   136     3 roc_auc  binary     0.637    10  0.0302 Preprocessor1_Mode…
##  7   171   122    21 accuracy binary     0.603    10  0.0393 Preprocessor1_Mode…
##  8   171   122    21 roc_auc  binary     0.662    10  0.0340 Preprocessor1_Mode…
##  9   208  1536    11 accuracy binary     0.64     10  0.0321 Preprocessor1_Mode…
## 10   208  1536    11 roc_auc  binary     0.673    10  0.0385 Preprocessor1_Mode…
## # ℹ 14 more rows
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
  last_fit(member_split)
## New names:
## New names:
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...345`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...346`
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...361`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...370`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...571`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...587`
xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits            id               .metrics .notes   .predictions .workflow 
##   <list>            <chr>            <list>   <list>   <list>       <list>    
## 1 <split [300/100]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(xgb_last)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.56  Preprocessor1_Model1
## 2 roc_auc  binary         0.604 Preprocessor1_Model1
collect_predictions(xgb_last) %>% 
    conf_mat(died, .pred_class)
##           Truth
## Prediction FALSE TRUE
##      FALSE    27   21
##      TRUE     23   29
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgb_last %>%
  extract_fit_engine() %>%
  vip()

1. What is the research question?

#2. Data Exploration and Transformation: - The newly transformed data has logical and character data changed to factor, as well as omited NA’s.

3. Data Preparation and Modeling:

- There were a few steps made in this data prep and modeling section that include: step_dummy(creates a specification of a recipe step that will convert nominal data (e.g. characters or factors) into one or more numeric binary models) and step_lencode_glm(creates a specification of a recipe step that will convert a nominal predictor into a single set of scores derived from a generalized linear model). 

#4. Model Evaluation: - Looking at the confusion matrix we can see that the model did an ok job of predicting the outcome.

5. Conclusion: