library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

members_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

top_peaks <- members_raw %>% 
    count(peak_name, sort = TRUE) %>% 
    slice_max(n, n = 6) %>% 
    pull(peak_name)

members_raw %>% 
    filter(peak_name %in% top_peaks) %>% 
    count(peak_name, died) %>% 
    ggplot(aes(died, n, fill = died)) + 
    geom_col(show.legend = FALSE) + 
    facet_wrap(vars(peak_name), scales = "free")

members_raw %>% 
    count(died, citizenship)

## # A tibble: 267 × 3
##    died  citizenship               n
##    <lgl> <chr>                 <int>
##  1 FALSE Albania                   6
##  2 FALSE Algeria                   2
##  3 FALSE Andorra                  31
##  4 FALSE Argentina               232
##  5 FALSE Argentina/Canada          2
##  6 FALSE Armenia                   3
##  7 FALSE Australia              1395
##  8 FALSE Australia/Greece          1
##  9 FALSE Australia/Ireland         2
## 10 FALSE Australia/New Zealand    17
## # ℹ 257 more rows

top_role <- members_raw %>% 
    count(expedition_role, sort = TRUE) %>% 
    slice_max(n, n = 6) %>% 
    pull(expedition_role)

members_raw %>% 
    filter(expedition_role %in% top_role) %>% 
    count(expedition_role, died) %>% 
    ggplot(aes(died, n, fill = died)) + 
    geom_col(show.legend = FALSE) + 
    facet_wrap(vars(expedition_role), scales = "free_y")

members <- members_raw %>% 
    select(expedition_id, peak_name, year, age, died, citizenship, expedition_role, season, age) %>% 
    na.omit() %>% 
    mutate_if(is.character, as.factor) %>% 
    mutate(expedition_id = as.character(expedition_id)) %>%
    mutate(across(where(is.logical), factor))

# members <- members %>% sample_n(5000)

Feature Engineering

members_small_true <- members %>% filter(died == "TRUE") %>% sample_n(200)
members_small_false <- members %>% filter(died == "FALSE") %>% sample_n(200)

members <- bind_rows(members_small_true, members_small_false)

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

set.seed(123)
member_split <- initial_split(members, strata = died)
member_train <- training(member_split)
member_test <- testing(member_split)

set.seed(234)
member_folds <- vfold_cv(member_train, strata = died)
member_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [270/30]> Fold01
##  2 <split [270/30]> Fold02
##  3 <split [270/30]> Fold03
##  4 <split [270/30]> Fold04
##  5 <split [270/30]> Fold05
##  6 <split [270/30]> Fold06
##  7 <split [270/30]> Fold07
##  8 <split [270/30]> Fold08
##  9 <split [270/30]> Fold09
## 10 <split [270/30]> Fold10

library(embed)

member_rec <- 
    recipe(died ~ ., data = member_train) %>% 
    update_role(expedition_id, new_role = "id") %>% 
    step_lencode_glm(peak_name, outcome = vars(died)) %>%    
    step_dummy(all_nominal_predictors())

member_rec

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:   1
## predictor: 6
## id:        1

##

## ── Operations

## • Linear embedding for factors via GLM for: peak_name

## • Dummy variables from: all_nominal_predictors()

prep(member_rec) %>%
  tidy(number = 1) %>%
  filter(level == "..new")

## New names:
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...345`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...346`
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...361`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...370`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...571`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...587`

## # A tibble: 1 × 4
##   level   value terms     id               
##   <chr>   <dbl> <chr>     <chr>            
## 1 ..new -0.0186 peak_name lencode_glm_9EDsX

Build a Model

xgb_spec <- 
    boost_tree(
        trees = tune(),
        min_n = tune(), 
        mtry = tune(), 
        learn_rate = 0.01) %>% 
    set_engine("xgboost") %>% 
    set_mode("classification")

xgb_wf <- workflow(member_rec, xgb_spec)

library(finetune)
doParallel::registerDoParallel()

set.seed(345)
xgb_rs <- tune_race_anova(
  xgb_wf,
  resamples = member_folds,
  grid = 15,
  control = control_race(verbose_elim = TRUE)
)

## i Creating pre-processing data to finalize unknown parameter: mtry

## New names:
## ℹ Racing will maximize the roc_auc metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 2 eliminated; 13 candidates remain.
## ℹ Fold07: 0 eliminated; 13 candidates remain.
## ℹ Fold03: 0 eliminated; 13 candidates remain.
## ℹ Fold05: 0 eliminated; 13 candidates remain.
## ℹ Fold09: 0 eliminated; 13 candidates remain.
## ℹ Fold04: 0 eliminated; 13 candidates remain.
## ℹ Fold06: 1 eliminated; 12 candidates remain.
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...345`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...346`
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...361`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...370`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...571`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...587`

xgb_rs

## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits           id     .order .metrics          .notes          
##    <list>           <chr>   <int> <list>            <list>          
##  1 <split [270/30]> Fold01      2 <tibble [30 × 7]> <tibble [0 × 3]>
##  2 <split [270/30]> Fold02      3 <tibble [30 × 7]> <tibble [0 × 3]>
##  3 <split [270/30]> Fold10      1 <tibble [30 × 7]> <tibble [0 × 3]>
##  4 <split [270/30]> Fold07      4 <tibble [26 × 7]> <tibble [0 × 3]>
##  5 <split [270/30]> Fold03      5 <tibble [26 × 7]> <tibble [0 × 3]>
##  6 <split [270/30]> Fold05      6 <tibble [26 × 7]> <tibble [0 × 3]>
##  7 <split [270/30]> Fold09      7 <tibble [26 × 7]> <tibble [0 × 3]>
##  8 <split [270/30]> Fold04      8 <tibble [26 × 7]> <tibble [0 × 3]>
##  9 <split [270/30]> Fold06      9 <tibble [26 × 7]> <tibble [0 × 3]>
## 10 <split [270/30]> Fold08     10 <tibble [24 × 7]> <tibble [0 × 3]>

plot_race(xgb_rs)

collect_metrics(xgb_rs)

## # A tibble: 24 × 9
##     mtry trees min_n .metric  .estimator  mean     n std_err .config            
##    <int> <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>              
##  1    41   599     8 accuracy binary     0.603    10  0.0324 Preprocessor1_Mode…
##  2    41   599     8 roc_auc  binary     0.645    10  0.0329 Preprocessor1_Mode…
##  3    88  1805    31 accuracy binary     0.617    10  0.0229 Preprocessor1_Mode…
##  4    88  1805    31 roc_auc  binary     0.650    10  0.0295 Preprocessor1_Mode…
##  5   120   136     3 accuracy binary     0.603    10  0.0270 Preprocessor1_Mode…
##  6   120   136     3 roc_auc  binary     0.637    10  0.0302 Preprocessor1_Mode…
##  7   171   122    21 accuracy binary     0.603    10  0.0393 Preprocessor1_Mode…
##  8   171   122    21 roc_auc  binary     0.662    10  0.0340 Preprocessor1_Mode…
##  9   208  1536    11 accuracy binary     0.64     10  0.0321 Preprocessor1_Mode…
## 10   208  1536    11 roc_auc  binary     0.673    10  0.0385 Preprocessor1_Mode…
## # ℹ 14 more rows

xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, "accuracy")) %>%
  last_fit(member_split)

## New names:
## New names:
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...345`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...346`
## • `expedition_role_Climber.Guide` -> `expedition_role_Climber.Guide...361`
## • `expedition_role_Climber.Sirdar` -> `expedition_role_Climber.Sirdar...370`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...571`
## • `expedition_role_Leader.Scientist` ->
##   `expedition_role_Leader.Scientist...587`

xgb_last

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits            id               .metrics .notes   .predictions .workflow 
##   <list>            <chr>            <list>   <list>   <list>       <list>    
## 1 <split [300/100]> train/test split <tibble> <tibble> <tibble>     <workflow>

collect_metrics(xgb_last)

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.56  Preprocessor1_Model1
## 2 roc_auc  binary         0.604 Preprocessor1_Model1

collect_predictions(xgb_last) %>% 
    conf_mat(died, .pred_class)

##           Truth
## Prediction FALSE TRUE
##      FALSE    27   21
##      TRUE     23   29

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

xgb_last %>%
  extract_fit_engine() %>%
  vip()

1. What is the research question?

The question we are trying to answer is can we determine if someone died based on different categories. # Describe the data briefly
The data consists of different mountains, seasons and years they were climbed, ages and citizenship of all involved, etc. # What are the characteristics of the key variables used in the analysis?
The primary variables we are using in this analysis consist of did they die, what peak name, age, year and more.

#2. Data Exploration and Transformation: - The newly transformed data has logical and character data changed to factor, as well as omited NA’s.

3. Data Preparation and Modeling:

- There were a few steps made in this data prep and modeling section that include: step_dummy(creates a specification of a recipe step that will convert nominal data (e.g. characters or factors) into one or more numeric binary models) and step_lencode_glm(creates a specification of a recipe step that will convert a nominal predictor into a single set of scores derived from a generalized linear model).

#4. Model Evaluation: - Looking at the confusion matrix we can see that the model did an ok job of predicting the outcome.

5. Conclusion:

The key findings of this model are that peak_name, age and year had the highest importance in the data.

Apply 6

Colin Tracy

2023-10-18

Feature Engineering

Build a Model

1. What is the research question?

3. Data Preparation and Modeling:

5. Conclusion: