Import data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

library(tidymodels) #for building models

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10

## Warning: package 'modeldata' was built under R version 4.3.3

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/

library(textrecipes) # For processing string variable
library(tidytext)
library(ggrepel)

## Warning: package 'ggrepel' was built under R version 4.3.3

# Download the TidyTuesday (2020-09-22) expedition members table; column
# types are guessed by readr.
members <- readr::read_csv(
    file = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv'
)

## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean data

# Per-column summary (missingness, cardinality, distribution) used to decide
# what needs cleaning.
skimr::skim(data = members)

Data summary
Name	members
Number of rows	76519
Number of columns	21
_______________________
Column type frequency:
character	10
logical	6
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
expedition_id	0	1.00	9	9	10350
member_id	0	1.00	12	12	76518
peak_id	0	1.00	4	4	391
peak_name	15	1.00	4	25	390
season	0	1.00	6	7	5
sex	2	1.00	1	1	2
citizenship	10	1.00	2	23	212
expedition_role	21	1.00	4	25	524
death_cause	75413	0.01	3	27	12
injury_type	74807	0.02	3	27	11

Variable type: logical

skim_variable	complete_rate	mean	count
hired	1	0.21	FAL: 60788, TRU: 15731
success	1	0.38	FAL: 47320, TRU: 29199
solo	1	0.00	FAL: 76398, TRU: 121
oxygen_used	1	0.24	FAL: 58286, TRU: 18233
died	1	0.01	FAL: 75413, TRU: 1106
injured	1	0.02	FAL: 74806, TRU: 1713

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2000.36	14.78	1905	1991	2004	2012	2019	▁▁▁▃▇
age	3497	0.95	37.33	10.40	7	29	36	44	85	▁▇▅▁▁
highpoint_metres	21833	0.71	7470.68	1040.06	3800	6700	7400	8400	8850	▁▁▆▃▇
death_height_metres	75451	0.01	6592.85	1308.19	400	5800	6600	7550	8830	▁▁▂▇▆
injury_height_metres	75510	0.01	7049.91	1214.24	400	6200	7100	8000	8880	▁▁▂▇▇

Explore data

# Build the analysis table:
#   * drop the death/injury detail columns and peak_id;
#   * keep only rows where both age and highpoint_metres are known;
#   * keep the first row for each member_id.
members1 <- members %>%
    select(
        -c(death_height_metres, injury_height_metres, death_cause,
           injury_type, peak_id)
    ) %>%
    filter(!is.na(age), !is.na(highpoint_metres)) %>%
    distinct(member_id, .keep_all = TRUE)

# Sanity check: list any member_id that still occurs more than once.
filter(members1, duplicated(member_id))

## # A tibble: 0 × 16
## # ℹ 16 variables: expedition_id <chr>, member_id <chr>, peak_name <chr>,
## #   year <dbl>, season <chr>, sex <chr>, age <dbl>, citizenship <chr>,
## #   expedition_role <chr>, hired <lgl>, highpoint_metres <dbl>, success <lgl>,
## #   solo <lgl>, oxygen_used <lgl>, died <lgl>, injured <lgl>

# Names of the logical flag columns that are turned into factors below.
factors_vec1 <- members1 %>%
    select(hired, success, solo, oxygen_used, died, injured) %>%
    names()

members1_clean <- members1 %>%
    # Label the outcome while it is still logical, so if_else() receives two
    # character branches instead of mixing a string with a factor.
    # The labels ("Died" / "FALSE") are the ones later steps refer to
    # (died__Died, .pred_Died).
    mutate(died = if_else(died, "Died", "FALSE")) %>%

    # Convert the remaining logical flags to factors
    mutate(across(all_of(setdiff(factors_vec1, "died")), as.factor)) %>%

    # Convert character columns (now including died) to factors
    mutate(across(where(is.character), factor))

# Class balance of the outcome.
count(members1, died)

## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 51662
## 2 TRUE    744

# Bar chart of the outcome counts.
ggplot(data = members1, mapping = aes(x = died)) +
    geom_bar()

Died vs. age

# Age distribution for each value of died.
ggplot(data = members1, mapping = aes(x = died, y = age)) +
    geom_boxplot()

Correlation plot

# Step 1: binarize
# Drop the identifier columns and any incomplete rows, then turn every
# remaining column into 0/1 indicator columns.
data_binarized3 <- members1_clean %>%
    select(-c(expedition_id, member_id)) %>%
    na.omit() %>%
    binarize()

# Inspect the indicator columns that were created.
glimpse(data_binarized3)

## Rows: 52,383
## Columns: 67
## $ peak_name__Ama_Dablam          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ peak_name__Annapurna_I         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Baruntse            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Cho_Oyu             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Dhaulagiri_I        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Everest             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Himlung_Himal       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Kangchenjunga       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Lhotse              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Makalu              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Manaslu             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Pumori              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `peak_name__-OTHER`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_1997`              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ year__1997_2007                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2007_2012                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2012_Inf                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ season__Autumn                 <dbl> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ season__Spring                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ season__Winter                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `season__-OTHER`               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ sex__F                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ sex__M                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `age__-Inf_29`                 <dbl> 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, …
## $ age__29_36                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, …
## $ age__36_43                     <dbl> 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ age__43_Inf                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Australia         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Austria           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Canada            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__China             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__France            <dbl> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Germany           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__India             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Italy             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Japan             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Nepal             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__New_Zealand       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Poland            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Russia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__S_Korea           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Spain             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Switzerland       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__UK                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__USA               <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, …
## $ `citizenship__-OTHER`          <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Climber       <dbl> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, …
## $ expedition_role__Deputy_Leader <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `expedition_role__H-A_Worker`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Leader        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `expedition_role__-OTHER`      <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, …
## $ hired__FALSE                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ hired__TRUE                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `highpoint_metres__-Inf_6750`  <dbl> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ highpoint_metres__6750_7400    <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ highpoint_metres__7400_8450    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ highpoint_metres__8450_Inf     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ success__FALSE                 <dbl> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ success__TRUE                  <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ solo__FALSE                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `solo__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ oxygen_used__FALSE             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ oxygen_used__TRUE              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ died__Died                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ died__FALSE                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ injured__FALSE                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ injured__TRUE                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Step 2: correlation
# Correlate every indicator column with the died__Died indicator.
data_correlation2 <- correlate(data_binarized3, target = died__Died)

## Warning: correlate(): [Data Imbalance Detected] Consider sampling to balance the classes more than 5%
##   Column with imbalance: died__Died

# Print the feature / bin / correlation table.
data_correlation2

## # A tibble: 67 × 3
##    feature         bin          correlation
##    <fct>           <chr>              <dbl>
##  1 died            Died              1     
##  2 died            FALSE            -1     
##  3 year            -Inf_1997         0.0843
##  4 success         FALSE             0.0562
##  5 success         TRUE             -0.0562
##  6 peak_name       Annapurna_I       0.0431
##  7 year            2012_Inf         -0.0330
##  8 peak_name       Ama_Dablam       -0.0323
##  9 peak_name       Dhaulagiri_I      0.0315
## 10 expedition_role H-A_Worker       -0.0309
## # ℹ 57 more rows

# Step 3: plot
# Funnel plot of the correlations computed in step 2.
correlationfunnel::plot_correlation_funnel(data_correlation2)

## Warning: ggrepel: 32 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Model building

Split data

library(tidymodels)

# Fix the RNG so the split below is reproducible.
set.seed(1234)

# Train/test split stratified on the outcome, using rsample's default
# proportion.
members_split <- members1_clean %>%
    initial_split(strata = died)
members_train <- members_split %>% training()
members_test <- members_split %>% testing()

# Cross-validation folds built from the training data only, stratified on
# the outcome.
members_cv <- members_train %>%
    rsample::vfold_cv(strata = died)
members_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [35373/3931]> Fold01
##  2 <split [35373/3931]> Fold02
##  3 <split [35373/3931]> Fold03
##  4 <split [35373/3931]> Fold04
##  5 <split [35374/3930]> Fold05
##  6 <split [35374/3930]> Fold06
##  7 <split [35374/3930]> Fold07
##  8 <split [35374/3930]> Fold08
##  9 <split [35374/3930]> Fold09
## 10 <split [35374/3930]> Fold10

Preprocess data

library(themis)

# Preprocessing recipe for the xgboost model.
#
# member_id and expedition_id are row identifiers, not predictors (the
# correlation analysis drops both as well). Leaving expedition_id as a
# predictor only produced one arbitrary expedition dummy plus "other" after
# step_other(), so both columns get the "ID" role and stay out of the model.
xgboost_rec1 <- recipes::recipe(died ~ ., data = members_train) %>%
    update_role(member_id, expedition_id, new_role = "ID") %>%
    # Fill missing predictor values from the nearest neighbours
    step_impute_knn(all_predictors()) %>%
    # Pool levels below the 10% frequency threshold into "other"
    step_other(citizenship, peak_name, expedition_role, threshold = 0.1) %>%
    # One indicator column per remaining factor level
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    # Oversample the minority class of died with SMOTE
    step_smote(died)

# Inspect the preprocessed training set; bake(new_data = NULL) replaces the
# superseded juice().
xgboost_rec1 %>% prep() %>% bake(new_data = NULL) %>% glimpse()

## Rows: 77,492
## Columns: 33
## $ member_id                  <fct> AMAD78301-02, AMAD78301-04, AMAD78301-08, A…
## $ year                       <dbl> 1978, 1978, 1978, 1979, 1979, 1979, 1979, 1…
## $ age                        <dbl> 41, 40, 29, 37, 23, 42, 30, 28, 33, 29, 26,…
## $ highpoint_metres           <dbl> 6000, 6000, 6000, 6814, 6814, 6814, 6814, 6…
## $ died                       <fct> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
## $ expedition_id_HIML13308    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expedition_id_other        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ peak_name_Ama.Dablam       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ peak_name_Cho.Oyu          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_name_Everest          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_name_other            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season_Autumn              <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1…
## $ season_Spring              <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0…
## $ season_Summer              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season_Winter              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sex_F                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ sex_M                      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ citizenship_Nepal          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ citizenship_other          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ expedition_role_Climber    <dbl> 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1…
## $ expedition_role_H.A.Worker <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expedition_role_Leader     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expedition_role_other      <dbl> 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0…
## $ hired_FALSE.               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ hired_TRUE.                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ success_FALSE.             <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ success_TRUE.              <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ solo_FALSE.                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ solo_TRUE.                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ oxygen_used_FALSE.         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ oxygen_used_TRUE.          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ injured_FALSE.             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ injured_TRUE.              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Specify model

library(usemodels)
# Print boilerplate xgboost code to use as a template for the spec below.
usemodels::use_xgboost(formula = died ~ ., data = members_train)

## xgboost_recipe <- 
##   recipe(formula = died ~ ., data = members_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(33141)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Boosted-tree classifier on the xgboost engine; every hyperparameter listed
# here is marked for tuning.
xgboost_spec1 <- boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    sample_size = tune()
) %>%
    set_engine("xgboost") %>%
    set_mode("classification")

# Bundle the recipe and the model specification into one workflow.
xgboost_workflow1 <- workflow(
    preprocessor = xgboost_rec1,
    spec = xgboost_spec1
)

Tune hyperparameter

# Register a parallel backend before tuning.
doParallel::registerDoParallel()

# Evaluate 5 candidate hyperparameter sets on the CV folds and keep the
# predictions so they can be plotted afterwards.
set.seed(20020)
xgboost_tune <- xgboost_workflow1 %>%
    tune_grid(
        resamples = members_cv,
        grid = 5,
        control = control_grid(save_pred = TRUE)
    )

## Warning: package 'xgboost' was built under R version 4.3.3

Model evaluation

Identify optimal values for hyperparameters

# Resampled performance for each candidate hyperparameter set.
xgboost_tune %>% collect_metrics()

## # A tibble: 15 × 12
##    trees min_n tree_depth learn_rate loss_reduction sample_size .metric    
##    <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>      
##  1   598     4          3    0.0109        1.66e-10       0.126 accuracy   
##  2   598     4          3    0.0109        1.66e-10       0.126 brier_class
##  3   598     4          3    0.0109        1.66e-10       0.126 roc_auc    
##  4   927    17         13    0.00662       7.86e- 7       0.497 accuracy   
##  5   927    17         13    0.00662       7.86e- 7       0.497 brier_class
##  6   927    17         13    0.00662       7.86e- 7       0.497 roc_auc    
##  7  1388    20          6    0.00235       3.17e- 4       0.849 accuracy   
##  8  1388    20          6    0.00235       3.17e- 4       0.849 brier_class
##  9  1388    20          6    0.00235       3.17e- 4       0.849 roc_auc    
## 10   213    28          9    0.0464        2.23e- 3       0.655 accuracy   
## 11   213    28          9    0.0464        2.23e- 3       0.655 brier_class
## 12   213    28          9    0.0464        2.23e- 3       0.655 roc_auc    
## 13  1951    35          7    0.235         2.23e- 1       0.385 accuracy   
## 14  1951    35          7    0.235         2.23e- 1       0.385 brier_class
## 15  1951    35          7    0.235         2.23e- 1       0.385 roc_auc    
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>

# ROC curves, one per fold id, from the predictions saved during tuning.
xgboost_tune %>%
    collect_predictions() %>%
    group_by(id) %>%
    roc_curve(truth = died, .pred_Died) %>%
    autoplot()

Fit the models for the last time

# Finalize the workflow with the best candidate and fit it once on the full
# training set, evaluating on the held-out test set.
#
# The candidate is chosen by ROC AUC rather than accuracy: deaths are a very
# small share of the rows, so accuracy is dominated by the majority class
# and barely distinguishes the candidates.
xgboost_last <- xgboost_workflow1 %>%
    finalize_workflow(select_best(xgboost_tune, metric = "roc_auc")) %>%
    last_fit(members_split)

# Test-set metrics of the final fit.
xgboost_last %>% collect_metrics()

## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary        0.985  Preprocessor1_Model1
## 2 roc_auc     binary        0.776  Preprocessor1_Model1
## 3 brier_class binary        0.0145 Preprocessor1_Model1

# Confusion matrix of the test-set class predictions.
xgboost_last %>%
    collect_predictions() %>%
    yardstick::conf_mat(truth = died, estimate = .pred_class) %>%
    autoplot()

Variable importance

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Pull the fitted engine object out of the final fit and plot its variable
# importance scores.
vip(workflows::extract_fit_engine(xgboost_last))

Apply7

Tindra Bergstrand

2024-10-24

Import data

Clean data

Explore data

Died vs. age

Correlation plot

Model building

Split data

Preprocess data

Specify model

Tune hyperparameter

Model evaluation

Identify optimal values for hyperparameters

Fit the models for the last time

Variable importance