library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
members_raw_tbl <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# If data is not sensitive:
members_raw_tbl %>% glimpse()
## Rows: 76,519
## Columns: 21
## $ expedition_id <chr> "AMAD78301", "AMAD78301", "AMAD78301", "AMAD78301…
## $ member_id <chr> "AMAD78301-01", "AMAD78301-02", "AMAD78301-03", "…
## $ peak_id <chr> "AMAD", "AMAD", "AMAD", "AMAD", "AMAD", "AMAD", "…
## $ peak_name <chr> "Ama Dablam", "Ama Dablam", "Ama Dablam", "Ama Da…
## $ year <dbl> 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1…
## $ season <chr> "Autumn", "Autumn", "Autumn", "Autumn", "Autumn",…
## $ sex <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M",…
## $ age <dbl> 40, 41, 27, 40, 34, 25, 41, 29, 35, 37, 23, 44, 2…
## $ citizenship <chr> "France", "France", "France", "France", "France",…
## $ expedition_role <chr> "Leader", "Deputy Leader", "Climber", "Exp Doctor…
## $ hired <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ highpoint_metres <dbl> NA, 6000, NA, 6000, NA, 6000, 6000, 6000, NA, 681…
## $ success <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ solo <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ oxygen_used <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ died <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ death_cause <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ death_height_metres <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ injured <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ injury_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ injury_height_metres <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
The goal is to help predict climber deaths.
Please write R code to create a predictive model that predicts the probability of climber deaths, use only 100 climbers.
The goal is to help predict climber deaths.
Please write R code to create a predictive model that predicts the probability of climber deaths, use only 100 climbers.
# Load libraries
library(tidyverse)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ rsample 1.2.1
## ✔ dials 1.4.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.3.0 ✔ yardstick 1.3.2
## ✔ recipes 1.1.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
# Load data
members_raw_tbl <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean and prepare data
members_tbl <- members_raw_tbl %>%
select(died, age, sex, citizenship, expedition_role, oxygen_used) %>%
filter(!is.na(died), !is.na(age), !is.na(sex), !is.na(citizenship), !is.na(expedition_role), !is.na(oxygen_used)) %>%
mutate(died = as.factor(died)) %>%
slice_sample(n = 100)
# Split into training and test sets
set.seed(123)
data_split <- initial_split(members_tbl, prop = 0.8, strata = died)
train_data <- training(data_split)
test_data <- testing(data_split)
# Recipe for preprocessing
recipe_death <- recipe(died ~ ., data = train_data) %>%
step_dummy(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors())
# Logistic regression model
log_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
# Workflow
death_workflow <- workflow() %>%
add_model(log_model) %>%
add_recipe(recipe_death)
# Fit model
fit_model <- fit(death_workflow, data = train_data)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predict probabilities on the test set
predictions <- predict(fit_model, test_data, type = "prob") %>%
bind_cols(test_data)
## Warning: ! There are new levels in `citizenship`: "Armenia".
## ℹ Consider using step_novel() (`?recipes::step_novel()`) before `step_dummy()`
## to handle unseen values.
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# View the prediction column names
print(names(predictions))
## [1] ".pred_FALSE" ".pred_TRUE" "died" "age"
## [5] "sex" "citizenship" "expedition_role" "oxygen_used"
# Dynamically select predicted probabilities along with key info
predictions %>%
select(starts_with(".pred"), died, age, sex, citizenship, expedition_role, oxygen_used) %>%
head()
## # A tibble: 6 × 8
## .pred_FALSE .pred_TRUE died age sex citizenship expedition_role
## <dbl> <dbl> <fct> <dbl> <chr> <chr> <chr>
## 1 1 2.22e-16 FALSE 41 M UK Leader
## 2 1 2.22e-16 FALSE 37 F France Leader
## 3 1.00 2.38e- 4 FALSE 44 M UK Climber
## 4 1.00 1.77e- 5 FALSE 45 M Poland Climber
## 5 1.00 6.66e-10 FALSE 21 M Nepal H-A Worker
## 6 1.00 1.26e- 4 FALSE 36 M India Climber
## # ℹ 1 more variable: oxygen_used <lgl>
Please update the code to use tidymodels instead of caret and to use the h2o model instead of glmnet.
# Load libraries
library(tidyverse)
library(tidymodels)
library(h2o)
## Warning: package 'h2o' was built under R version 4.4.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(agua) # <- Required for H2O models with tidymodels
## Warning: package 'agua' was built under R version 4.4.3
##
## Attaching package: 'agua'
## The following object is masked from 'package:workflowsets':
##
## rank_results
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
# Start H2O
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\sheac\AppData\Local\Temp\RtmpgZ7isP\file2e4c480a6b31/h2o_sheac_started_from_r.out
## C:\Users\sheac\AppData\Local\Temp\RtmpgZ7isP\file2e4c629e479d/h2o_sheac_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 692 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 14 days
## H2O cluster name: H2O_started_from_R_sheac_ehs919
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.92 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 14 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Load data
members_raw_tbl <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Prepare data
members_tbl <- members_raw_tbl %>%
select(died, age, sex, citizenship, expedition_role, oxygen_used) %>%
filter(!is.na(died), !is.na(age), !is.na(sex), !is.na(citizenship),
!is.na(expedition_role), !is.na(oxygen_used)) %>%
mutate(died = as.factor(died)) %>%
slice_sample(n = 100)
# Split into train/test
set.seed(123)
data_split <- initial_split(members_tbl, prop = 0.8, strata = died)
train_data <- training(data_split)
test_data <- testing(data_split)
# Preprocessing
recipe_death <- recipe(died ~ ., data = train_data) %>%
step_dummy(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors())
# H2O model spec
h2o_spec <- logistic_reg(mode = "classification") %>%
set_engine("h2o")
# Workflow
workflow_h2o <- workflow() %>%
add_model(h2o_spec) %>%
add_recipe(recipe_death)
# Fit model
fit_h2o <- fit(workflow_h2o, data = train_data)
# Predict probabilities
predictions <- predict(fit_h2o, test_data, type = "prob") %>%
bind_cols(test_data)
## Warning: ! There are new levels in `citizenship`: "Finland".
## ℹ Consider using step_novel() (`?recipes::step_novel()`) before `step_dummy()`
## to handle unseen values.
## Warning: ! There are new levels in `expedition_role`: "Film Team".
## ℹ Consider using step_novel() (`?recipes::step_novel()`) before `step_dummy()`
## to handle unseen values.
# View results
predictions %>%
select(starts_with(".pred"), died, age, sex, citizenship, expedition_role, oxygen_used)
## # A tibble: 20 × 8
## .pred_FALSE. .pred_TRUE. died age sex citizenship expedition_role
## <dbl> <dbl> <fct> <dbl> <chr> <chr> <chr>
## 1 0.992 0.00813 FALSE 46 F Canada Climber
## 2 0.992 0.00813 FALSE 34 M India Climber
## 3 0.992 0.00813 FALSE 47 M Austria Climber
## 4 0.992 0.00813 FALSE 32 M France Climber
## 5 0.992 0.00813 FALSE 37 M Japan Leader
## 6 0.992 0.00813 FALSE 23 M Nepal H-A Worker
## 7 0.992 0.00813 FALSE 28 M USA Climber
## 8 0.992 0.00813 FALSE 38 M Germany Climber
## 9 0.992 0.00813 FALSE 57 M USA Climber
## 10 0.992 0.00813 FALSE 66 M Japan Climber
## 11 0.992 0.00813 FALSE 42 M Spain Leader
## 12 0.992 0.00813 FALSE 47 M USA Climber
## 13 0.992 0.00813 FALSE 35 M Japan Film Team
## 14 0.992 0.00813 FALSE 28 M Japan Climber
## 15 0.992 0.00813 FALSE 24 M India Climber
## 16 0.992 0.00813 FALSE 25 F Spain Climber
## 17 0.992 0.00813 FALSE 50 M Nepal H-A Worker
## 18 0.992 0.00813 FALSE 33 M UK Climber
## 19 0.989 0.0108 FALSE 32 M Finland Climber
## 20 0.992 0.00813 FALSE 56 M Japan Climber
## # ℹ 1 more variable: oxygen_used <lgl>
# Shutdown H2O
h2o.shutdown(prompt = FALSE)