library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
mountain_data <-readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mountain_data %>% glimpse()
## Rows: 76,519
## Columns: 21
## $ expedition_id        <chr> "AMAD78301", "AMAD78301", "AMAD78301", "AMAD78301…
## $ member_id            <chr> "AMAD78301-01", "AMAD78301-02", "AMAD78301-03", "…
## $ peak_id              <chr> "AMAD", "AMAD", "AMAD", "AMAD", "AMAD", "AMAD", "…
## $ peak_name            <chr> "Ama Dablam", "Ama Dablam", "Ama Dablam", "Ama Da…
## $ year                 <dbl> 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1…
## $ season               <chr> "Autumn", "Autumn", "Autumn", "Autumn", "Autumn",…
## $ sex                  <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M",…
## $ age                  <dbl> 40, 41, 27, 40, 34, 25, 41, 29, 35, 37, 23, 44, 2…
## $ citizenship          <chr> "France", "France", "France", "France", "France",…
## $ expedition_role      <chr> "Leader", "Deputy Leader", "Climber", "Exp Doctor…
## $ hired                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ highpoint_metres     <dbl> NA, 6000, NA, 6000, NA, 6000, 6000, 6000, NA, 681…
## $ success              <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ solo                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ oxygen_used          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ died                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ death_cause          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ death_height_metres  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ injured              <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ injury_type          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ injury_height_metres <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

Prompt 1:

I have a dataset called mountain_data that looks like this:

mountain_data %>% glimpse() Rows: 76,519 Columns: 21 $ expedition_id “AMAD78301”, “AMAD78301”, “AMAD78301”, “AMAD78301”, “AMAD78301… $ member_id ”AMAD78301-01”, “AMAD78301-02”, “AMAD78301-03”, “AMAD78301-04”… $ peak_id “AMAD”, “AMAD”, “AMAD”, “AMAD”, “AMAD”, “AMAD”, “AMAD”, “AMAD”… $ peak_name “Ama Dablam”, “Ama Dablam”, “Ama Dablam”, “Ama Dablam”, “Ama D… $ year 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1979, 1979, 19… $ season ”Autumn”, “Autumn”, “Autumn”, “Autumn”, “Autumn”, “Autumn”, “A… $ sex ”M”, “M”, “M”, “M”, “M”, “M”, “M”, “M”, “M”, “M”, “M”, “M”, “M… $ age 40, 41, 27, 40, 34, 25, 41, 29, 35, 37, 23, 44, 25, 28, 32, 42… $ citizenship ”France”, “France”, “France”, “France”, “France”, “France”, “F… $ expedition_role ”Leader”, “Deputy Leader”, “Climber”, “Exp Doctor”, “Climber”,… $ hired FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,… $ highpoint_metres NA, 6000, NA, 6000, NA, 6000, 6000, 6000, NA, 6814, 6814, NA, … $ success FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,… $ solo FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,… $ oxygen_used FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,… $ died FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,… $ death_cause NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA… $ death_height_metres NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA… $ injured FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,… $ injury_type NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA… $ injury_height_metres NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

The goal is to help predict death for climbers

Please write R code to create a predictive model that predicts the probability of died.

Prompt 2:

Please update the code to use tidymodels instead of caret and to use the h2o model instead of glmnet.

# Load necessary libraries
library(tidyverse)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7      ✔ rsample      1.2.1 
## ✔ dials        1.3.0      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Start H2O cluster
h2o.init(nthreads = -1, max_mem_size = "2G")
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\bella\AppData\Local\Temp\RtmpG2JHBC\file5728163545c8/h2o_bella_started_from_r.out
##     C:\Users\bella\AppData\Local\Temp\RtmpG2JHBC\file572878a734b4/h2o_bella_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         4 seconds 400 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 13 days 
##     H2O cluster name:           H2O_started_from_R_bella_gxu699 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.98 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.3 (2024-02-29 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 13 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Clean and preprocess the data
mountain_data_clean <- mountain_data %>%
  filter(!is.na(died)) %>%
  mutate(across(c(sex, season, citizenship, expedition_role), as.factor)) %>%
  select(-c(expedition_id, member_id, peak_id, peak_name, death_cause, injury_type))

# Split data into training and test sets
set.seed(123)
train_test_split <- initial_split(mountain_data_clean, prop = 0.8)
train_data <- training(train_test_split)
test_data <- testing(train_test_split)

# Convert to H2O frames for model building
train_h2o <- as.h2o(train_data)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
test_h2o <- as.h2o(test_data)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Define the response and predictor variables
response <- "died"
predictors <- c("age", "sex", "expedition_role", "season", "highpoint_metres", "oxygen_used")

# Build the H2O logistic regression model
model_h2o <- h2o.glm(
  x = predictors,
  y = response,
  training_frame = train_h2o,
  family = "binomial",
  lambda = 0.01,  # Regularization parameter
  alpha = 0.5,    # Elastic net mixing parameter
  nfolds = 5,     # Cross-validation for model tuning
  seed = 123
)
##   |                                                                              |                                                                      |   0%  |                                                                              |====                                                                  |   5%  |                                                                              |======================================================================| 100%
# View model summary
summary(model_h2o)
## Model Details:
## ==============
## 
## H2OBinomialModel: glm
## Model Key:  GLM_model_R_1733276571791_1 
## GLM Model: summary
##     family  link                            regularization
## 1 binomial logit Elastic Net (alpha = 0.5, lambda = 0.01 )
##   number_of_predictors_total number_of_active_predictors number_of_iterations
## 1                        472                           0                    1
##          training_frame
## 1 train_data_sid_9f9d_1
## 
## H2OBinomialMetrics: glm
## ** Reported on training data. **
## 
## MSE:  0.01447027
## RMSE:  0.1202924
## LogLoss:  0.07656495
## Mean Per-Class Error:  0.5
## AUC:  0.5
## AUCPR:  0.01468594
## Gini:  0
## R^2:  2.806644e-13
## Residual Deviance:  9373.847
## AIC:  9375.847
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE  TRUE    Error          Rate
## FALSE      0 60316 1.000000  =60316/60316
## TRUE       0   899 0.000000        =0/899
## Totals     0 61215 0.985314  =60316/61215
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.014686     0.028947   0
## 2                       max f2  0.014686     0.069356   0
## 3                 max f0point5  0.014686     0.018290   0
## 4                 max accuracy  0.014686     0.014686   0
## 5                max precision  0.014686     0.014686   0
## 6                   max recall  0.014686     1.000000   0
## 7              max specificity  0.014686     0.000000   0
## 8             max absolute_mcc  0.014686     0.000000   0
## 9   max min_per_class_accuracy  0.014686     0.000000   0
## 10 max mean_per_class_accuracy  0.014686     0.500000   0
## 11                     max tns  0.014686     0.000000   0
## 12                     max fns  0.014686     0.000000   0
## 13                     max fps  0.014686 60316.000000   0
## 14                     max tps  0.014686   899.000000   0
## 15                     max tnr  0.014686     0.000000   0
## 16                     max fnr  0.014686     0.000000   0
## 17                     max fpr  0.014686     1.000000   0
## 18                     max tpr  0.014686     1.000000   0
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## 
## H2OBinomialMetrics: glm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.01447075
## RMSE:  0.1202945
## LogLoss:  0.07658189
## Mean Per-Class Error:  0.5
## AUC:  0.4822629
## AUCPR:  0.01399675
## Gini:  -0.03547413
## R^2:  -3.376918e-05
## Residual Deviance:  9375.921
## AIC:  9377.921
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE  TRUE    Error          Rate
## FALSE      0 60316 1.000000  =60316/60316
## TRUE       0   899 0.000000        =0/899
## Totals     0 61215 0.985314  =60316/61215
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.014325     0.028947   4
## 2                       max f2  0.014325     0.069356   4
## 3                 max f0point5  0.014325     0.018290   4
## 4                 max accuracy  0.014972     0.790640   0
## 5                max precision  0.014325     0.014686   4
## 6                   max recall  0.014325     1.000000   4
## 7              max specificity  0.014972     0.799672   0
## 8             max absolute_mcc  0.014791     0.007193   2
## 9   max min_per_class_accuracy  0.014791     0.402265   2
## 10 max mean_per_class_accuracy  0.014325     0.500000   4
## 11                     max tns  0.014972 48233.000000   0
## 12                     max fns  0.014972   733.000000   0
## 13                     max fps  0.014325 60316.000000   4
## 14                     max tps  0.014325   899.000000   4
## 15                     max tnr  0.014972     0.799672   0
## 16                     max fnr  0.014972     0.815350   0
## 17                     max fpr  0.014325     1.000000   4
## 18                     max tpr  0.014325     1.000000   4
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                   mean        sd   cv_1_valid   cv_2_valid   cv_3_valid
## accuracy      0.014683  0.001042     0.013552     0.016145     0.015339
## auc           0.500000  0.000000     0.500000     0.500000     0.500000
## err           0.985317  0.001042     0.986448     0.983855     0.984661
## err_count 12063.200000 89.429860 12083.000000 12066.000000 12197.000000
## f0point5      0.018286  0.001293     0.016883     0.020100     0.019100
##             cv_4_valid   cv_5_valid
## accuracy      0.014261     0.014117
## auc           0.500000     0.500000
## err           0.985739     0.985883
## err_count 11958.000000 12012.000000
## f0point5      0.017763     0.017584
## 
## ---
##                          mean         sd  cv_1_valid  cv_2_valid  cv_3_valid
## precision            0.014683   0.001042    0.013552    0.016145    0.015339
## r2                  -0.000092   0.000083   -0.000151   -0.000209   -0.000044
## recall               1.000000   0.000000    1.000000    1.000000    1.000000
## residual_deviance 1875.184100 115.709530 1759.472300 2029.507800 1965.042500
## rmse                 0.120223   0.004182    0.115631    0.126046    0.122899
## specificity          0.000000   0.000000    0.000000    0.000000    0.000000
##                    cv_4_valid  cv_5_valid
## precision            0.014261    0.014117
## r2                  -0.000020   -0.000036
## recall               1.000000    1.000000
## residual_deviance 1814.336800 1807.561300
## rmse                 0.118566    0.117975
## specificity          0.000000    0.000000
## 
## Scoring History: 
##             timestamp   duration iterations negative_log_likelihood objective
## 1 2024-12-03 20:43:11  0.000 sec          0              4686.92330   0.07656
## 2 2024-12-03 20:43:11  0.049 sec          1              4686.92330   0.07656
##   training_rmse training_logloss training_r2 training_auc training_pr_auc
## 1            NA               NA          NA           NA              NA
## 2       0.12029          0.07656     0.00000      0.50000         0.01469
##   training_lift training_classification_error
## 1            NA                            NA
## 2       1.00000                       0.98531
## 
## Variable Importances: (Extract with `h2o.varimp`) 
## =================================================
## 
## Variable Importances: 
##                            variable relative_importance scaled_importance
## 1    expedition_role.2nd BC Manager            0.000000                NA
## 2 expedition_role.2nd Deputy Leader            0.000000                NA
## 3    expedition_role.2nd Exp Doctor            0.000000                NA
## 4          expedition_role.ABC Cook            0.000000                NA
## 5       expedition_role.ABC Manager            0.000000                NA
##   percentage
## 1         NA
## 2         NA
## 3         NA
## 4         NA
## 5         NA
## 
## ---
##              variable relative_importance scaled_importance percentage
## 467             sex.F            0.000000                NA         NA
## 468             sex.M            0.000000                NA         NA
## 469 oxygen_used.FALSE            0.000000                NA         NA
## 470  oxygen_used.TRUE            0.000000                NA         NA
## 471               age            0.000000                NA         NA
## 472  highpoint_metres            0.000000                NA         NA
# Make predictions on the test data
predictions <- h2o.predict(model_h2o, test_h2o)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset column 'expedition_role' has levels not trained on: ["2nd Sirdar", "ABC
## Staff", "Asst BC Manager", "BC Manager & Cook", "BC Manager (C1 only)", "C2
## Cook", "Camp Manager", "Chief Coach", "Climb Ldr (torchbearer 2)", "Climb Ldr
## (torchbearer 3)", ...43 not listed..., "Sirder", "Staff", "Staff Member",
## "Staff Writer", "Storekeeper", "Survey Party Leader", "TV Producer", "Team
## Historian", "Transport Manager", "Treasurer"]
# Extract predicted probabilities
pred_prob <- as.vector(predictions$predict)

# Convert predictions to binary class based on threshold of 0.5
pred_class <- ifelse(pred_prob > 0.5, 1, 0)

# Evaluate the model using ROC and AUC
perf <- h2o.performance(model_h2o, newdata = test_h2o)
auc_value <- h2o.auc(perf)
print(paste("AUC value: ", auc_value))  # AUC value
## [1] "AUC value:  0.5"
# Create a confusion matrix
conf_matrix <- as.data.frame(h2o.confusionMatrix(model_h2o, test_h2o))
print(conf_matrix)
##        FALSE  TRUE     Error          Rate
## FALSE      0 15097 1.0000000  =15097/15097
## TRUE       0   207 0.0000000        =0/207
## Totals     0 15304 0.9864741  =15097/15304
# Shutdown H2O after usage
h2o.shutdown(prompt = FALSE)