library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members_raw_tbl <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# If data is not sensitive:
members_raw_tbl %>% glimpse()
## Rows: 76,519
## Columns: 21
## $ expedition_id        <chr> "AMAD78301", "AMAD78301", "AMAD78301", "AMAD78301…
## $ member_id            <chr> "AMAD78301-01", "AMAD78301-02", "AMAD78301-03", "…
## $ peak_id              <chr> "AMAD", "AMAD", "AMAD", "AMAD", "AMAD", "AMAD", "…
## $ peak_name            <chr> "Ama Dablam", "Ama Dablam", "Ama Dablam", "Ama Da…
## $ year                 <dbl> 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1…
## $ season               <chr> "Autumn", "Autumn", "Autumn", "Autumn", "Autumn",…
## $ sex                  <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M",…
## $ age                  <dbl> 40, 41, 27, 40, 34, 25, 41, 29, 35, 37, 23, 44, 2…
## $ citizenship          <chr> "France", "France", "France", "France", "France",…
## $ expedition_role      <chr> "Leader", "Deputy Leader", "Climber", "Exp Doctor…
## $ hired                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ highpoint_metres     <dbl> NA, 6000, NA, 6000, NA, 6000, 6000, 6000, NA, 681…
## $ success              <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ solo                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ oxygen_used          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ died                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ death_cause          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ death_height_metres  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ injured              <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ injury_type          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ injury_height_metres <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

Prompt 1

The goal is to help predict climber deaths.

Please write R code to create a predictive model that predicts the probability of climber deaths, use only 100 climbers.

The goal is to help predict climber deaths.

Please write R code to create a predictive model that predicts the probability of climber deaths, use only 100 climbers.

# Load libraries
library(tidyverse)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7     ✔ rsample      1.2.1
## ✔ dials        1.4.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.3.0     ✔ yardstick    1.3.2
## ✔ recipes      1.1.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
# Load data
members_raw_tbl <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean and prepare data
members_tbl <- members_raw_tbl %>%
  select(died, age, sex, citizenship, expedition_role, oxygen_used) %>%
  filter(!is.na(died), !is.na(age), !is.na(sex), !is.na(citizenship), !is.na(expedition_role), !is.na(oxygen_used)) %>%
  mutate(died = as.factor(died)) %>%
  slice_sample(n = 100)

# Split into training and test sets
set.seed(123)
data_split <- initial_split(members_tbl, prop = 0.8, strata = died)
train_data <- training(data_split)
test_data  <- testing(data_split)

# Recipe for preprocessing
recipe_death <- recipe(died ~ ., data = train_data) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())

# Logistic regression model
log_model <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

# Workflow
death_workflow <- workflow() %>%
  add_model(log_model) %>%
  add_recipe(recipe_death)

# Fit model
fit_model <- fit(death_workflow, data = train_data)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predict probabilities on the test set
predictions <- predict(fit_model, test_data, type = "prob") %>%
  bind_cols(test_data)
## Warning: ! There are new levels in `citizenship`: "Armenia".
## ℹ Consider using step_novel() (`?recipes::step_novel()`) before `step_dummy()`
##   to handle unseen values.
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# View the prediction column names
print(names(predictions))
## [1] ".pred_FALSE"     ".pred_TRUE"      "died"            "age"            
## [5] "sex"             "citizenship"     "expedition_role" "oxygen_used"
# Dynamically select predicted probabilities along with key info
predictions %>%
  select(starts_with(".pred"), died, age, sex, citizenship, expedition_role, oxygen_used) %>%
  head()
## # A tibble: 6 × 8
##   .pred_FALSE .pred_TRUE died    age sex   citizenship expedition_role
##         <dbl>      <dbl> <fct> <dbl> <chr> <chr>       <chr>          
## 1        1      2.22e-16 FALSE    41 M     UK          Leader         
## 2        1      2.22e-16 FALSE    37 F     France      Leader         
## 3        1.00   2.38e- 4 FALSE    44 M     UK          Climber        
## 4        1.00   1.77e- 5 FALSE    45 M     Poland      Climber        
## 5        1.00   6.66e-10 FALSE    21 M     Nepal       H-A Worker     
## 6        1.00   1.26e- 4 FALSE    36 M     India       Climber        
## # ℹ 1 more variable: oxygen_used <lgl>

Please update the code to use tidymodels instead of caret and to use the h2o model instead of glmnet.

# Load libraries
library(tidyverse)
library(tidymodels)
library(h2o)
## Warning: package 'h2o' was built under R version 4.4.3
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(agua)      # <- Required for H2O models with tidymodels
## Warning: package 'agua' was built under R version 4.4.3
## 
## Attaching package: 'agua'
## The following object is masked from 'package:workflowsets':
## 
##     rank_results
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
# Start H2O
h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\sheac\AppData\Local\Temp\RtmpgZ7isP\file2e4c480a6b31/h2o_sheac_started_from_r.out
##     C:\Users\sheac\AppData\Local\Temp\RtmpgZ7isP\file2e4c629e479d/h2o_sheac_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 seconds 692 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 4 months and 14 days 
##     H2O cluster name:           H2O_started_from_R_sheac_ehs919 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.92 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 4 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Load data
members_raw_tbl <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Prepare data
members_tbl <- members_raw_tbl %>%
  select(died, age, sex, citizenship, expedition_role, oxygen_used) %>%
  filter(!is.na(died), !is.na(age), !is.na(sex), !is.na(citizenship), 
         !is.na(expedition_role), !is.na(oxygen_used)) %>%
  mutate(died = as.factor(died)) %>%
  slice_sample(n = 100)

# Split into train/test
set.seed(123)
data_split <- initial_split(members_tbl, prop = 0.8, strata = died)
train_data <- training(data_split)
test_data  <- testing(data_split)

# Preprocessing
recipe_death <- recipe(died ~ ., data = train_data) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())

# H2O model spec
h2o_spec <- logistic_reg(mode = "classification") %>%
  set_engine("h2o")

# Workflow
workflow_h2o <- workflow() %>%
  add_model(h2o_spec) %>%
  add_recipe(recipe_death)

# Fit model
fit_h2o <- fit(workflow_h2o, data = train_data)

# Predict probabilities
predictions <- predict(fit_h2o, test_data, type = "prob") %>%
  bind_cols(test_data)
## Warning: ! There are new levels in `citizenship`: "Finland".
## ℹ Consider using step_novel() (`?recipes::step_novel()`) before `step_dummy()`
##   to handle unseen values.
## Warning: ! There are new levels in `expedition_role`: "Film Team".
## ℹ Consider using step_novel() (`?recipes::step_novel()`) before `step_dummy()`
##   to handle unseen values.
# View results
predictions %>%
  select(starts_with(".pred"), died, age, sex, citizenship, expedition_role, oxygen_used)
## # A tibble: 20 × 8
##    .pred_FALSE. .pred_TRUE. died    age sex   citizenship expedition_role
##           <dbl>       <dbl> <fct> <dbl> <chr> <chr>       <chr>          
##  1        0.992     0.00813 FALSE    46 F     Canada      Climber        
##  2        0.992     0.00813 FALSE    34 M     India       Climber        
##  3        0.992     0.00813 FALSE    47 M     Austria     Climber        
##  4        0.992     0.00813 FALSE    32 M     France      Climber        
##  5        0.992     0.00813 FALSE    37 M     Japan       Leader         
##  6        0.992     0.00813 FALSE    23 M     Nepal       H-A Worker     
##  7        0.992     0.00813 FALSE    28 M     USA         Climber        
##  8        0.992     0.00813 FALSE    38 M     Germany     Climber        
##  9        0.992     0.00813 FALSE    57 M     USA         Climber        
## 10        0.992     0.00813 FALSE    66 M     Japan       Climber        
## 11        0.992     0.00813 FALSE    42 M     Spain       Leader         
## 12        0.992     0.00813 FALSE    47 M     USA         Climber        
## 13        0.992     0.00813 FALSE    35 M     Japan       Film Team      
## 14        0.992     0.00813 FALSE    28 M     Japan       Climber        
## 15        0.992     0.00813 FALSE    24 M     India       Climber        
## 16        0.992     0.00813 FALSE    25 F     Spain       Climber        
## 17        0.992     0.00813 FALSE    50 M     Nepal       H-A Worker     
## 18        0.992     0.00813 FALSE    33 M     UK          Climber        
## 19        0.989     0.0108  FALSE    32 M     Finland     Climber        
## 20        0.992     0.00813 FALSE    56 M     Japan       Climber        
## # ℹ 1 more variable: oxygen_used <lgl>
# Shutdown H2O
h2o.shutdown(prompt = FALSE)