library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

members_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

members_raw %>% count(died)

## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 75413
## 2 TRUE   1106

library(tidytext)
library(tidylo)

members_raw %>%
  unnest_tokens(word, season) %>%
  count(died, word) %>%
  filter(n > 100) %>%
  bind_log_odds(died, word, n) %>%
  arrange(-log_odds_weighted)

## # A tibble: 6 × 4
##   died  word       n log_odds_weighted
##   <lgl> <chr>  <int>             <dbl>
## 1 FALSE winter  2054             18.5 
## 2 FALSE summer   729             10.9 
## 3 TRUE  spring   555              8.69
## 4 TRUE  autumn   493              7.75
## 5 FALSE autumn 35402             -4.80
## 6 FALSE spring 37227             -5.37

member <- members_raw %>% 
    mutate(across(where(is.logical), factor)) %>% 
     mutate(died = case_when(died == "TRUE" ~ "died", died == "FALSE" ~ "no")) %>% 
    drop_na(expedition_role, peak_name, citizenship, age, sex)

 library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

set.seed(123)
member_split <-
  member %>%
  select(season, died, expedition_role, peak_name, citizenship, age, sex) %>%
  initial_split(strata = died)

member_train <- training(member_split)
member_test <- testing(member_split)

set.seed(234)
member_folds <- vfold_cv(member_train, strata = died)
member_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [49264/5474]> Fold01
##  2 <split [49264/5474]> Fold02
##  3 <split [49264/5474]> Fold03
##  4 <split [49264/5474]> Fold04
##  5 <split [49264/5474]> Fold05
##  6 <split [49264/5474]> Fold06
##  7 <split [49264/5474]> Fold07
##  8 <split [49264/5474]> Fold08
##  9 <split [49265/5473]> Fold09
## 10 <split [49265/5473]> Fold10

library(textrecipes)
library(themis)

member_rec <-
  recipe(formula = died ~ ., data = member_train) %>%
    step_other(expedition_role, peak_name, citizenship) %>% 
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% 
    step_smote(died)
  

member_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 108,082
## Columns: 23
## $ age                        <dbl> 41, 34, 25, 41, 35, 37, 23, 44, 25, 32, 42,…
## $ season_Autumn              <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season_Spring              <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ season_Summer              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season_Winter              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expedition_role_Climber    <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1…
## $ expedition_role_H.A.Worker <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expedition_role_Leader     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expedition_role_other      <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0…
## $ peak_name_Ama.Dablam       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ peak_name_Cho.Oyu          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_name_Everest          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_name_Manaslu          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_name_other            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_France         <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_Japan          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_Nepal          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_UK             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_USA            <dbl> 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ citizenship_other          <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sex_F                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sex_M                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ died                       <fct> no, no, no, no, no, no, no, no, no, no, no,…

glmnet_spec <- 
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

member_wf <- workflow(member_rec, glmnet_spec)

doParallel::registerDoParallel()
set.seed(123)
member_res <- 
  tune_grid(
    member_wf, 
    member_folds, 
    grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
  )

autoplot(member_res)

show_best(member_res)

## Warning: No value of `metric` was given; metric 'roc_auc' will be used.

## # A tibble: 5 × 7
##   penalty .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 0.001   roc_auc binary     0.636    10 0.00818 Preprocessor1_Model01
## 2 0.00200 roc_auc binary     0.636    10 0.00826 Preprocessor1_Model02
## 3 0.00398 roc_auc binary     0.633    10 0.00848 Preprocessor1_Model03
## 4 0.00794 roc_auc binary     0.631    10 0.00842 Preprocessor1_Model04
## 5 0.0158  roc_auc binary     0.624    10 0.00799 Preprocessor1_Model05

select_by_pct_loss(member_res, desc(penalty), metric = "roc_auc")

## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config             .best .loss
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               <dbl> <dbl>
## 1  0.0158 roc_auc binary     0.624    10 0.00799 Preprocessor1_Mode… 0.636  1.94

member_final <-
  member_wf %>%
  finalize_workflow(
    select_by_pct_loss(member_res, desc(penalty), metric = "roc_auc")
  ) %>%
  last_fit(member_split)

member_final

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits                id             .metrics .notes   .predictions .workflow 
##   <list>                <chr>          <list>   <list>   <list>       <list>    
## 1 <split [54738/18247]> train/test sp… <tibble> <tibble> <tibble>     <workflow>

collect_metrics(member_final)

## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.426 Preprocessor1_Model1
## 2 roc_auc  binary         0.628 Preprocessor1_Model1

collect_predictions(member_final) %>%
  conf_mat(died, .pred_class)

##           Truth
## Prediction  died    no
##       died   179 10412
##       no      53  7603

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

member_final %>%
  extract_fit_engine() %>%
  vi()

## # A tibble: 22 × 3
##    Variable                   Importance Sign 
##    <chr>                           <dbl> <chr>
##  1 season_Summer                   2.45  POS  
##  2 peak_name_Ama.Dablam            1.57  POS  
##  3 sex_F                           1.22  POS  
##  4 peak_name_Cho.Oyu               1.22  POS  
##  5 expedition_role_H.A.Worker      1.15  POS  
##  6 citizenship_USA                 0.928 POS  
##  7 citizenship_UK                  0.897 POS  
##  8 citizenship_Nepal               0.801 NEG  
##  9 citizenship_France              0.626 POS  
## 10 expedition_role_other           0.291 POS  
## # ℹ 12 more rows

1. What is the research question?

The question we are trying to answer is can we determine if someone died based on keywords. # Describe the data briefly
The data consists of different mountains, seasons and years they were climbed, ages and citizenship of all involved, etc. # What are the characteristics of the key variables used in the analysis?
The primary variables we are using in this analysis consist of did they die and what season it was.

#2. Data Exploration and Transformation: - The newly transformed data has logical data changed to factor.

3. Data Preparation and Modeling:

- There were a few steps made in this data prep and modeling section that include: step_dummy(creates a specification of a recipe step that will convert nominal data into one or more numeric binary mode), step_other(creates a specification of a recipe step that will potentially pool infrequently occurring values into an other category.), and step_smote(creates a specification of a recipe step that generate new examples of the minority class using nearest neighbors of these cases).

4 Model Evaluation:

Looking at the confusion matrix we can see that the model did a good job of predicting the outcome.

5. Conclusion:

The key findings of this model are that season, peak name, and sex had the highest importance in the data.

Apply 8

Colin Tracy

2023-10-25

1. What is the research question?

3. Data Preparation and Modeling:

4 Model Evaluation:

5. Conclusion: