library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

members_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

members_raw %>% 
    filter(citizenship == "USA", 
           died %in% c("TRUE", "FALSE")) %>% 
    ggplot(aes(age, year, color = died)) + geom_point(alpha = 0.2) +
    coord_fixed()

## Warning: Removed 128 rows containing missing values (`geom_point()`).

members <- members_raw %>% 
    filter(citizenship == "USA", 
           died %in% c("TRUE", "FALSE")) %>% 
    mutate(oxygen_used = 
               case_when(str_detect(oxygen_used, "^FALSE") ~ "false",
                         str_detect(oxygen_used, "^TRUE") ~ "true",)) %>%
    select(-highpoint_metres, -peak_id, -peak_id, -citizenship, -death_height_metres, -injury_height_metres) %>%
    mutate(across(where(is.character), factor)) %>%
    mutate(across(where(is.logical), factor)) %>%
    filter(!is.na(age))

members %>% ggplot(aes(year, y = ..density.., fill = died)) + 
    geom_histogram(position = "identity", alpha = 0.5) +
                   labs(fill = "survive?")

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

members %>% 
    ggplot(aes(y = oxygen_used, fill = died)) +
    geom_bar(position = "fill") +
    labs(fill = "survive?")

members %>% 
    ggplot(aes(y = solo, fill = died)) +
    geom_bar(position = "fill") +
    labs(fill = "die?")

Build Model

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

set.seed(123)
member_split <- initial_split(members, strata = died)
member_train <- training(member_split)
member_test <- testing(member_split)

set.seed(234)
member_folds <- vfold_cv(member_train, strata = died)
member_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [4266/474]> Fold01
##  2 <split [4266/474]> Fold02
##  3 <split [4266/474]> Fold03
##  4 <split [4266/474]> Fold04
##  5 <split [4266/474]> Fold05
##  6 <split [4266/474]> Fold06
##  7 <split [4266/474]> Fold07
##  8 <split [4266/474]> Fold08
##  9 <split [4266/474]> Fold09
## 10 <split [4266/474]> Fold10

library(themis)
ranger_recipe <- 
  recipe(formula = died ~ ., data = member_train) %>%
    #update_role(member_id, new_role = "id") %>% 
    step_unknown(all_nominal_predictors()) %>% 
    step_other(all_nominal_predictors(), threshold = 0.03) %>% 
    step_dummy(all_nominal_predictors()) %>% 
    step_smote(died)

ranger_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 9,416
## Columns: 27
## $ year                   <dbl> 1979, 1979, 1979, 1979, 1979, 1979, 1981, 1981,…
## $ age                    <dbl> 35, 23, 28, 32, 33, 29, 34, 35, 32, 19, 28, 36,…
## $ expedition_id_other    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ member_id_other        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ peak_name_Cho.Oyu      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Everest      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Lhotse       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Makalu       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Pumori       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_other        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ season_Spring          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,…
## $ season_other           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sex_M                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,…
## $ sex_other              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ expedition_role_Leader <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ expedition_role_other  <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ hired_other            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ success_TRUE.          <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,…
## $ success_other          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ solo_other             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ oxygen_used_true       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ oxygen_used_other      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ death_cause_other      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injured_TRUE.          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injured_other          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injury_type_other      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ died                   <fct> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…

ranger_spec <- 
  rand_forest(trees = 1000) %>% 
  set_mode("classification") %>% 
  set_engine("ranger") 

ranger_workflow <- 
  workflow() %>% 
  add_recipe(ranger_recipe) %>% 
  add_model(ranger_spec) 

doParallel::registerDoParallel()
set.seed(27358)
ranger_tune <-
  tune_grid(ranger_workflow, 
            resamples = member_folds,
            control = control_resamples(save_pred = TRUE))

## Warning: No tuning parameters have been detected, performance will be evaluated
## using the resamples with no tuning. Did you want to [tune()] parameters?

collect_metrics(ranger_tune)

## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary         1    10       0 Preprocessor1_Model1
## 2 roc_auc  binary         1    10       0 Preprocessor1_Model1

collect_predictions(ranger_tune) %>%
    group_by(id) %>%
    roc_curve(died, .pred_TRUE) %>% 
    autoplot()

collect_predictions(ranger_tune) %>%
    group_by(id) %>%
    roc_curve(died, .pred_FALSE) %>% 
    autoplot()

conf_mat_resampled(ranger_tune, tidy = FALSE) %>%
    autoplot()

final_fitted <- last_fit(ranger_workflow, member_split) 
collect_metrics(final_fitted)

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary             1 Preprocessor1_Model1
## 2 roc_auc  binary             1 Preprocessor1_Model1

collect_predictions(final_fitted) %>%
    conf_mat(died, .pred_class) %>% 
    autoplot()

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

imp_data <- ranger_recipe %>%
  prep() %>%
  bake(new_data = NULL) 

ranger_spec %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(died ~ ., data = imp_data) %>%
  vip(geom = "point")

# imp_data %>%
 # select(died, age, season_other, season_Spring, year) %>% 
 # ggplot(aes(y = age, fill = died)) +
  #geom_bar(position = "fill") +
  #facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
  #theme(legend.position = "top") +
  #scale_fill_brewer(type = "qual", palette = 7) +
  #scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent)

1. What is the research question?

The question we are trying to answer is whether or not based on the data, we can predict if a hiker died or not on the mountain. # Describe the data briefly
The data consists of years, mountains, ages of hikers, whether they died or not, what season it was and more. # What are the characteristics of the key variables used in the analysis?
The primary variables we are using in this analysis consist of whether the hiker died, what age were they and what season it was.

#2. Data Exploration and Transformation: - The newly transformed data has turned all logical and character variables into factors and omitted any NA’s in age.

3. Data Preparation and Modeling:

- There were a few steps made in this data prep and modeling section that include: update_role which turns the id into an identifier instead of data used in the modeling, step_unknown which will assign a missing value in a factor level to unknown, step_other which will pool uncommon values together, step_dummy which converts nominal data into binary, and step_smote which will generate new examples of the minority class using nearest neighbors of these cases.

#4. Model Evaluation: - Using ROC curves and confusion matrices we can see that the model performed ok.

5. Conclusion:

The key findings of this model is that year, peak, age and season had the most importance.

Apply 5

Colin Tracy

2023-10-11

Build Model

1. What is the research question?

3. Data Preparation and Modeling:

5. Conclusion: