library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
members_raw %>%
filter(citizenship == "USA",
died %in% c("TRUE", "FALSE")) %>%
ggplot(aes(age, year, color = died)) + geom_point(alpha = 0.2) +
coord_fixed()
## Warning: Removed 128 rows containing missing values (`geom_point()`).
members <- members_raw %>%
filter(citizenship == "USA",
died %in% c("TRUE", "FALSE")) %>%
mutate(oxygen_used =
case_when(str_detect(oxygen_used, "^FALSE") ~ "false",
str_detect(oxygen_used, "^TRUE") ~ "true",)) %>%
select(-highpoint_metres, -peak_id, -peak_id, -citizenship, -death_height_metres, -injury_height_metres) %>%
mutate(across(where(is.character), factor)) %>%
mutate(across(where(is.logical), factor)) %>%
filter(!is.na(age))
members %>% ggplot(aes(year, y = ..density.., fill = died)) +
geom_histogram(position = "identity", alpha = 0.5) +
labs(fill = "survive?")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
members %>%
ggplot(aes(y = oxygen_used, fill = died)) +
geom_bar(position = "fill") +
labs(fill = "survive?")
members %>%
ggplot(aes(y = solo, fill = died)) +
geom_bar(position = "fill") +
labs(fill = "die?")
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
member_split <- initial_split(members, strata = died)
member_train <- training(member_split)
member_test <- testing(member_split)
set.seed(234)
member_folds <- vfold_cv(member_train, strata = died)
member_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [4266/474]> Fold01
## 2 <split [4266/474]> Fold02
## 3 <split [4266/474]> Fold03
## 4 <split [4266/474]> Fold04
## 5 <split [4266/474]> Fold05
## 6 <split [4266/474]> Fold06
## 7 <split [4266/474]> Fold07
## 8 <split [4266/474]> Fold08
## 9 <split [4266/474]> Fold09
## 10 <split [4266/474]> Fold10
library(themis)
ranger_recipe <-
recipe(formula = died ~ ., data = member_train) %>%
#update_role(member_id, new_role = "id") %>%
step_unknown(all_nominal_predictors()) %>%
step_other(all_nominal_predictors(), threshold = 0.03) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(died)
ranger_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 9,416
## Columns: 27
## $ year <dbl> 1979, 1979, 1979, 1979, 1979, 1979, 1981, 1981,…
## $ age <dbl> 35, 23, 28, 32, 33, 29, 34, 35, 32, 19, 28, 36,…
## $ expedition_id_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ member_id_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ peak_name_Cho.Oyu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Everest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Lhotse <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Makalu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_Pumori <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_name_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ season_Spring <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,…
## $ season_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sex_M <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,…
## $ sex_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ expedition_role_Leader <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ expedition_role_other <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ hired_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ success_TRUE. <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,…
## $ success_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ solo_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ oxygen_used_true <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ oxygen_used_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ death_cause_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injured_TRUE. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injured_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injury_type_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ died <fct> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
ranger_spec <-
rand_forest(trees = 1000) %>%
set_mode("classification") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
doParallel::registerDoParallel()
set.seed(27358)
ranger_tune <-
tune_grid(ranger_workflow,
resamples = member_folds,
control = control_resamples(save_pred = TRUE))
## Warning: No tuning parameters have been detected, performance will be evaluated
## using the resamples with no tuning. Did you want to [tune()] parameters?
collect_metrics(ranger_tune)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 1 10 0 Preprocessor1_Model1
## 2 roc_auc binary 1 10 0 Preprocessor1_Model1
collect_predictions(ranger_tune) %>%
group_by(id) %>%
roc_curve(died, .pred_TRUE) %>%
autoplot()
collect_predictions(ranger_tune) %>%
group_by(id) %>%
roc_curve(died, .pred_FALSE) %>%
autoplot()
conf_mat_resampled(ranger_tune, tidy = FALSE) %>%
autoplot()
final_fitted <- last_fit(ranger_workflow, member_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 1 Preprocessor1_Model1
## 2 roc_auc binary 1 Preprocessor1_Model1
collect_predictions(final_fitted) %>%
conf_mat(died, .pred_class) %>%
autoplot()
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
imp_data <- ranger_recipe %>%
prep() %>%
bake(new_data = NULL)
ranger_spec %>%
set_engine("ranger", importance = "permutation") %>%
fit(died ~ ., data = imp_data) %>%
vip(geom = "point")
# imp_data %>%
# select(died, age, season_other, season_Spring, year) %>%
# ggplot(aes(y = age, fill = died)) +
#geom_bar(position = "fill") +
#facet_grid(rows = vars(feature), scales = "free_y", space = "free_y") +
#theme(legend.position = "top") +
#scale_fill_brewer(type = "qual", palette = 7) +
#scale_x_continuous(expand = expansion(mult = c(0, .01)), labels = scales::percent)
#2. Data Exploration and Transformation: - The newly transformed data has turned all logical and character variables into factors and omitted any NA’s in age.
- There were a few steps made in this data prep and modeling section that include: update_role which turns the id into an identifier instead of data used in the modeling, step_unknown which will assign a missing value in a factor level to unknown, step_other which will pool uncommon values together, step_dummy which converts nominal data into binary, and step_smote which will generate new examples of the minority class using nearest neighbors of these cases.
#4. Model Evaluation: - Using ROC curves and confusion matrices we can see that the model performed ok.