library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members_raw %>% 
    filter(citizenship == "USA", 
           died %in% c("TRUE", "FALSE")) %>% 
    ggplot(aes(age, year, color = died)) + geom_point(alpha = 0.2) +
    coord_fixed()
## Warning: Removed 128 rows containing missing values (`geom_point()`).

members <- members_raw %>% 
    filter(citizenship == "USA", 
           died %in% c("TRUE", "FALSE")) %>% 
    mutate(oxygen_used = 
               case_when(str_detect(oxygen_used, "^FALSE") ~ "false",
                         str_detect(oxygen_used, "^TRUE") ~ "true",)) %>%
    select(-highpoint_metres, -peak_id, -peak_id, -citizenship, -death_height_metres, -injury_height_metres) %>%
    mutate(across(where(is.character), factor)) %>%
    mutate(across(where(is.logical), factor)) %>%
    filter(!is.na(age))
members %>% ggplot(aes(year, y = ..density.., fill = died)) + 
    geom_histogram(position = "identity", alpha = 0.5) +
                   labs(fill = "survive?")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

members %>% 
    ggplot(aes(y = oxygen_used, fill = died)) +
    geom_bar(position = "fill") +
    labs(fill = "survive?")

members %>% 
    ggplot(aes(y = solo, fill = died)) +
    geom_bar(position = "fill") +
    labs(fill = "die?")

Build Model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
member_split <- initial_split(members, strata = died)
member_train <- training(member_split)
member_test <- testing(member_split)

set.seed(234)
member_folds <- vfold_cv(member_train, strata = died)
member_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [4266/474]> Fold01
##  2 <split [4266/474]> Fold02
##  3 <split [4266/474]> Fold03
##  4 <split [4266/474]> Fold04
##  5 <split [4266/474]> Fold05
##  6 <split [4266/474]> Fold06
##  7 <split [4266/474]> Fold07
##  8 <split [4266/474]> Fold08
##  9 <split [4266/474]> Fold09
## 10 <split [4266/474]> Fold10
library(themis)
ranger_recipe <- 
  recipe(formula = died ~ ., data = member_train) %>%
    #update_role(member_id, new_role = "id") %>% 
    step_unknown(all_nominal_predictors()) %>% 
    step_other(all_nominal_predictors(), threshold = 0.03) %>% 
    step_dummy(all_nominal_predictors()) %>% 
    step_smote(died)

ranger_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 9,416
## Columns: 27
## $ year                   <dbl> 1979, 1979, 1979, 1979, 1979, 1979, 1981, 1981,…
## $ age                    <dbl> 35, 23, 28, 32, 33, 29, 34, 35, 32, 19, 28, 36,…
## $ expedition_id_other    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ member_id_other        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ peak_name_Cho.Oyu      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Everest      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Lhotse       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Makalu       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Pumori       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_other        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ season_Spring          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,…
## $ season_other           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sex_M                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,…
## $ sex_other              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ expedition_role_Leader <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ expedition_role_other  <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ hired_other            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ success_TRUE.          <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,…
## $ success_other          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ solo_other             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ oxygen_used_true       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ oxygen_used_other      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ death_cause_other      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injured_TRUE.          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injured_other          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injury_type_other      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ died                   <fct> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
ranger_spec <- 
  rand_forest(trees = 1000) %>% 
  set_mode("classification") %>% 
  set_engine("ranger") 

ranger_workflow <- 
  workflow() %>% 
  add_recipe(ranger_recipe) %>% 
  add_model(ranger_spec) 

doParallel::registerDoParallel()
set.seed(27358)
ranger_tune <-
  tune_grid(ranger_workflow, 
            resamples = member_folds,
            control = control_resamples(save_pred = TRUE))
## Warning: No tuning parameters have been detected, performance will be evaluated
## using the resamples with no tuning. Did you want to [tune()] parameters?
collect_metrics(ranger_tune)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary         1    10       0 Preprocessor1_Model1
## 2 roc_auc  binary         1    10       0 Preprocessor1_Model1
collect_predictions(ranger_tune) %>%
    group_by(id) %>%
    roc_curve(died, .pred_TRUE) %>% 
    autoplot()

collect_predictions(ranger_tune) %>%
    group_by(id) %>%
    roc_curve(died, .pred_FALSE) %>% 
    autoplot()

conf_mat_resampled(ranger_tune, tidy = FALSE) %>%
    autoplot()

final_fitted <- last_fit(ranger_workflow, member_split) 
collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary             1 Preprocessor1_Model1
## 2 roc_auc  binary             1 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
    conf_mat(died, .pred_class) %>% 
    autoplot()

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_data <- ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL) 

ranger_spec %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(died ~ ., data = imp_data) %>%
  vip(geom = "point")

# imp_data %>%
 # select(died, age, season_other, season_Spring, year) %>% 
 # ggplot(aes(y = age, fill = died)) +
  #geom_bar(position = "fill") +
  #facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
  #theme(legend.position = "top") +
  #scale_fill_brewer(type = "qual", palette = 7) +
  #scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent) 

1. What is the research question?

#2. Data Exploration and Transformation: - The newly transformed data has turned all logical and character variables into factors and omitted any NA’s in age.

3. Data Preparation and Modeling:

- There were a few steps made in this data prep and modeling section that include: update_role which turns the id into an identifier instead of data used in the modeling, step_unknown which will assign a missing value in a factor level to unknown, step_other which will pool uncommon values together, step_dummy which converts nominal data into binary, and step_smote which will generate new examples of the minority class using nearest neighbors of these cases. 

#4. Model Evaluation: - Using ROC curves and confusion matrices we can see that the model performed ok.

5. Conclusion: