library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members_raw %>% count(died)
## # A tibble: 2 × 2
## died n
## <lgl> <int>
## 1 FALSE 75413
## 2 TRUE 1106
library(tidytext)
library(tidylo)
members_raw %>%
unnest_tokens(word, season) %>%
count(died, word) %>%
filter(n > 100) %>%
bind_log_odds(died, word, n) %>%
arrange(-log_odds_weighted)
## # A tibble: 6 × 4
## died word n log_odds_weighted
## <lgl> <chr> <int> <dbl>
## 1 FALSE winter 2054 18.5
## 2 FALSE summer 729 10.9
## 3 TRUE spring 555 8.69
## 4 TRUE autumn 493 7.75
## 5 FALSE autumn 35402 -4.80
## 6 FALSE spring 37227 -5.37
member <- members_raw %>%
mutate(across(where(is.logical), factor)) %>%
mutate(died = case_when(died == "TRUE" ~ "died", died == "FALSE" ~ "no")) %>%
drop_na(expedition_role, peak_name, citizenship, age, sex)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
member_split <-
member %>%
select(season, died, expedition_role, peak_name, citizenship, age, sex) %>%
initial_split(strata = died)
member_train <- training(member_split)
member_test <- testing(member_split)
set.seed(234)
member_folds <- vfold_cv(member_train, strata = died)
member_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [49264/5474]> Fold01
## 2 <split [49264/5474]> Fold02
## 3 <split [49264/5474]> Fold03
## 4 <split [49264/5474]> Fold04
## 5 <split [49264/5474]> Fold05
## 6 <split [49264/5474]> Fold06
## 7 <split [49264/5474]> Fold07
## 8 <split [49264/5474]> Fold08
## 9 <split [49265/5473]> Fold09
## 10 <split [49265/5473]> Fold10
library(textrecipes)
library(themis)
member_rec <-
recipe(formula = died ~ ., data = member_train) %>%
step_other(expedition_role, peak_name, citizenship) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_smote(died)
member_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 108,082
## Columns: 23
## $ age <dbl> 41, 34, 25, 41, 35, 37, 23, 44, 25, 32, 42,…
## $ season_Autumn <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season_Spring <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ season_Summer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season_Winter <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expedition_role_Climber <dbl> 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1…
## $ expedition_role_H.A.Worker <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expedition_role_Leader <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expedition_role_other <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0…
## $ peak_name_Ama.Dablam <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ peak_name_Cho.Oyu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_name_Everest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_name_Manaslu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_name_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_France <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_Japan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_Nepal <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_UK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_USA <dbl> 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ citizenship_other <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sex_F <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sex_M <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ died <fct> no, no, no, no, no, no, no, no, no, no, no,…
glmnet_spec <-
logistic_reg(mixture = 1, penalty = tune()) %>%
set_engine("glmnet")
member_wf <- workflow(member_rec, glmnet_spec)
doParallel::registerDoParallel()
set.seed(123)
member_res <-
tune_grid(
member_wf,
member_folds,
grid = tibble(penalty = 10 ^ seq(-3, 0, by = 0.3))
)
autoplot(member_res)
show_best(member_res)
## Warning: No value of `metric` was given; metric 'roc_auc' will be used.
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.001 roc_auc binary 0.636 10 0.00818 Preprocessor1_Model01
## 2 0.00200 roc_auc binary 0.636 10 0.00826 Preprocessor1_Model02
## 3 0.00398 roc_auc binary 0.633 10 0.00848 Preprocessor1_Model03
## 4 0.00794 roc_auc binary 0.631 10 0.00842 Preprocessor1_Model04
## 5 0.0158 roc_auc binary 0.624 10 0.00799 Preprocessor1_Model05
select_by_pct_loss(member_res, desc(penalty), metric = "roc_auc")
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .loss
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.0158 roc_auc binary 0.624 10 0.00799 Preprocessor1_Mode… 0.636 1.94
member_final <-
member_wf %>%
finalize_workflow(
select_by_pct_loss(member_res, desc(penalty), metric = "roc_auc")
) %>%
last_fit(member_split)
member_final
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [54738/18247]> train/test sp… <tibble> <tibble> <tibble> <workflow>
collect_metrics(member_final)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.426 Preprocessor1_Model1
## 2 roc_auc binary 0.628 Preprocessor1_Model1
collect_predictions(member_final) %>%
conf_mat(died, .pred_class)
## Truth
## Prediction died no
## died 179 10412
## no 53 7603
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
member_final %>%
extract_fit_engine() %>%
vi()
## # A tibble: 22 × 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 season_Summer 2.45 POS
## 2 peak_name_Ama.Dablam 1.57 POS
## 3 sex_F 1.22 POS
## 4 peak_name_Cho.Oyu 1.22 POS
## 5 expedition_role_H.A.Worker 1.15 POS
## 6 citizenship_USA 0.928 POS
## 7 citizenship_UK 0.897 POS
## 8 citizenship_Nepal 0.801 NEG
## 9 citizenship_France 0.626 POS
## 10 expedition_role_other 0.291 POS
## # ℹ 12 more rows
#2. Data Exploration and Transformation: - The newly transformed data has logical data changed to factor.
- There were a few steps made in this data prep and modeling section that include: step_dummy(creates a specification of a recipe step that will convert nominal data into one or more numeric binary mode), step_other(creates a specification of a recipe step that will potentially pool infrequently occurring values into an other category.), and step_smote(creates a specification of a recipe step that generate new examples of the minority class using nearest neighbors of these cases).