# Load required libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0
## Warning: package 'dials' was built under R version 4.3.3
## Warning: package 'infer' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
## Warning: package 'parsnip' was built under R version 4.3.3
## Warning: package 'recipes' was built under R version 4.3.3
## Warning: package 'rsample' was built under R version 4.3.3
## Warning: package 'tune' was built under R version 4.3.3
## Warning: package 'workflows' was built under R version 4.3.3
## Warning: package 'workflowsets' was built under R version 4.3.3
## Warning: package 'yardstick' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()        masks purrr::discard()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ recipes::fixed()         masks stringr::fixed()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ caret::lift()            masks purrr::lift()
## ✖ yardstick::precision()   masks caret::precision()
## ✖ yardstick::recall()      masks caret::recall()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ yardstick::spec()        masks readr::spec()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step()          masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(h2o)
## Warning: package 'h2o' was built under R version 4.3.3
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Step 1: Read the expedition-members table
# The CSV is fetched over HTTP each time this runs, so network access is needed.
url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv"
members <- read.csv(file = url)

# Peek at the leading rows to sanity-check the parse
head(x = members)
##   expedition_id    member_id peak_id  peak_name year season sex age citizenship
## 1     AMAD78301 AMAD78301-01    AMAD Ama Dablam 1978 Autumn   M  40      France
## 2     AMAD78301 AMAD78301-02    AMAD Ama Dablam 1978 Autumn   M  41      France
## 3     AMAD78301 AMAD78301-03    AMAD Ama Dablam 1978 Autumn   M  27      France
## 4     AMAD78301 AMAD78301-04    AMAD Ama Dablam 1978 Autumn   M  40      France
## 5     AMAD78301 AMAD78301-05    AMAD Ama Dablam 1978 Autumn   M  34      France
## 6     AMAD78301 AMAD78301-06    AMAD Ama Dablam 1978 Autumn   M  25      France
##   expedition_role hired highpoint_metres success  solo oxygen_used  died
## 1          Leader FALSE               NA   FALSE FALSE       FALSE FALSE
## 2   Deputy Leader FALSE             6000   FALSE FALSE       FALSE FALSE
## 3         Climber FALSE               NA   FALSE FALSE       FALSE FALSE
## 4      Exp Doctor FALSE             6000   FALSE FALSE       FALSE FALSE
## 5         Climber FALSE               NA   FALSE FALSE       FALSE FALSE
## 6         Climber FALSE             6000   FALSE FALSE       FALSE FALSE
##   death_cause death_height_metres injured injury_type injury_height_metres
## 1        <NA>                  NA   FALSE        <NA>                   NA
## 2        <NA>                  NA   FALSE        <NA>                   NA
## 3        <NA>                  NA   FALSE        <NA>                   NA
## 4        <NA>                  NA   FALSE        <NA>                   NA
## 5        <NA>                  NA   FALSE        <NA>                   NA
## 6        <NA>                  NA   FALSE        <NA>                   NA
# Step 2: Data preprocessing
# Check structure and missing values
# str() prints one line per column: its name, storage type and first values.
str(members)
## 'data.frame':    76519 obs. of  21 variables:
##  $ expedition_id       : chr  "AMAD78301" "AMAD78301" "AMAD78301" "AMAD78301" ...
##  $ member_id           : chr  "AMAD78301-01" "AMAD78301-02" "AMAD78301-03" "AMAD78301-04" ...
##  $ peak_id             : chr  "AMAD" "AMAD" "AMAD" "AMAD" ...
##  $ peak_name           : chr  "Ama Dablam" "Ama Dablam" "Ama Dablam" "Ama Dablam" ...
##  $ year                : int  1978 1978 1978 1978 1978 1978 1978 1978 1979 1979 ...
##  $ season              : chr  "Autumn" "Autumn" "Autumn" "Autumn" ...
##  $ sex                 : chr  "M" "M" "M" "M" ...
##  $ age                 : int  40 41 27 40 34 25 41 29 35 37 ...
##  $ citizenship         : chr  "France" "France" "France" "France" ...
##  $ expedition_role     : chr  "Leader" "Deputy Leader" "Climber" "Exp Doctor" ...
##  $ hired               : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ highpoint_metres    : int  NA 6000 NA 6000 NA 6000 6000 6000 NA 6814 ...
##  $ success             : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ solo                : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ oxygen_used         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ died                : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ death_cause         : chr  NA NA NA NA ...
##  $ death_height_metres : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ injured             : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ injury_type         : chr  NA NA NA NA ...
##  $ injury_height_metres: int  NA NA NA NA NA NA NA NA NA NA ...
# Total number of NA cells across all columns (a single count, not per column).
# NOTE(review): the death_* and injury_* columns are NA in every row previewed
# above, so they presumably account for most of this total -- confirm with
# colSums(is.na(members)).
sum(is.na(members))
## [1] 326559
# Drop irrelevant columns or columns with too many missing values (if any).
# Template only: replace `column_name_to_exclude` with real column names before
# uncommenting, otherwise select() stops with "Can't subset columns that don't
# exist".
# members <- members %>% select(-c(column_name_to_exclude))

# Convert the target variable to a binary factor with an explicit level order
# ("FALSE" first, "TRUE" second) so later steps never depend on which values
# happen to be present.
members$died <- factor(members$died, levels = c("FALSE", "TRUE"))

# Step 3: Split data into training and testing sets
# The split is stratified on `died` so both partitions keep the same share of
# deaths.
set.seed(123) # For reproducibility
data_split <- initial_split(members, prop = 0.8, strata = died)
train <- training(data_split)
test <- testing(data_split)

# Impute missing numeric values with the column mean.
# The mean is taken from the training rows only and then used to fill BOTH
# partitions. Computing it on the full data before splitting (as this script
# originally did) lets test-set information leak into the training features.
# `members` itself is intentionally left un-imputed.
numeric_cols <- names(train)[vapply(train, is.numeric, logical(1))]
for (col in numeric_cols) {
  # Compute the fill value before touching `train` so it reflects observed
  # values only.
  fill_value <- mean(train[[col]], na.rm = TRUE)
  train[[col]][is.na(train[[col]])] <- fill_value
  test[[col]][is.na(test[[col]])] <- fill_value
}

# Step 4: Initialize H2O and build a predictive model
# Start (or connect to) a local H2O cluster. The `##` lines that follow are the
# connection report recorded from the original run.
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         16 minutes 26 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 15 days 
##     H2O cluster name:           H2O_started_from_R_eliza_osh070 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.08 GB 
##     H2O cluster total cores:    16 
##     H2O cluster allowed cores:  16 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 15 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Character columns uploaded as-is were all discarded by h2o.glm(): the
# original run warned "Dropping bad and constant columns" for sex, season,
# citizenship, expedition_role and every other chr column, so the model never
# saw a categorical predictor. Converting them to factors first makes them
# arrive in H2O as categorical columns the GLM can use.
chr_to_factor <- function(df) {
  df %>% mutate(across(where(is.character), ~ factor(.x)))
}

h2o_train <- as.h2o(chr_to_factor(train))
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
h2o_test <- as.h2o(chr_to_factor(test))
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Define the model
y <- "died"

# Columns that must not be used as predictors:
# - death_cause / death_height_metres describe the death itself; they are NA in
#   every non-fatal row previewed above, so they leak the target.
# - expedition_id / member_id are identifiers, and peak_name appears to repeat
#   peak_id (TODO confirm); as factors they would only add thousands of
#   levels.
leakage_cols <- c("death_cause", "death_height_metres")
id_cols <- c("expedition_id", "member_id", "peak_name")
x <- setdiff(names(train), c(y, leakage_cols, id_cols))

# Logistic regression (binomial GLM) on the training frame.
model <- h2o.glm(
  x = x,
  y = y,
  training_frame = h2o_train,
  family = "binomial"
)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Test-set metrics. NOTE: the metrics recorded after this call were produced
# before the predictor fixes above; re-run to refresh them.
h2o.performance(model, h2o_test)
## H2OBinomialMetrics: glm
## 
## MSE:  0.01305523
## RMSE:  0.1142595
## LogLoss:  0.06812887
## Mean Per-Class Error:  0.4492559
## AUC:  0.6660854
## AUCPR:  0.03617563
## Gini:  0.3321708
## R^2:  0.007370384
## Residual Deviance:  2085.288
## AIC:  2113.288
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        FALSE TRUE    Error        Rate
## FALSE  14856  244 0.016159  =244/15100
## TRUE     180   24 0.882353    =180/204
## Totals 15036  268 0.027705  =424/15304
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.049090     0.101695  94
## 2                       max f2  0.026537     0.147343 170
## 3                 max f0point5  0.052030     0.097584  86
## 4                 max accuracy  0.221189     0.986670   1
## 5                max precision  0.221189     0.500000   1
## 6                   max recall  0.002013     1.000000 392
## 7              max specificity  0.223419     0.999934   0
## 8             max absolute_mcc  0.026537     0.091995 170
## 9   max min_per_class_accuracy  0.014555     0.632353 257
## 10 max mean_per_class_accuracy  0.015839     0.636237 246
## 11                     max tns  0.223419 15099.000000   0
## 12                     max fns  0.223419   204.000000   0
## 13                     max fps  0.000572 15100.000000 399
## 14                     max tps  0.002013   204.000000 392
## 15                     max tnr  0.223419     0.999934   0
## 16                     max fnr  0.223419     1.000000   0
## 17                     max fpr  0.000572     1.000000 399
## 18                     max tpr  0.002013     1.000000 392
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Step 5: Evaluate the model
# Score the held-out test frame. For a binomial model the returned frame has
# three columns: `predict` (the class label H2O picked) followed by one
# probability column per class in level order ("FALSE", then "TRUE").
predictions <- h2o.predict(model, h2o_test)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Extract P(died = TRUE) as a plain numeric vector (third column).
# The `predict` column must NOT be used here: it holds the labels "FALSE" /
# "TRUE", and comparing those strings with 0.5 is TRUE for every row -- which is
# why the original run labeled all 15304 test rows "TRUE".
predictions <- as.vector(predictions[, 3])

# Convert probabilities to binary class labels.
# Both factors get the same explicit levels so caret never has to re-level
# them, even when one class is never predicted.
# NOTE(review): H2O's own `predict` column uses its max-F1 threshold rather than
# 0.5; in the recorded run no probability exceeded ~0.22, so at 0.5 few or no
# deaths will be predicted. Lower `classification_threshold` if recall matters.
class_levels <- c("FALSE", "TRUE")
classification_threshold <- 0.5
predicted_classes <- factor(
  ifelse(predictions > classification_threshold, "TRUE", "FALSE"),
  levels = class_levels
)

# Pair each test row's true outcome with its predicted class.
evaluation <- data.frame(
  truth = factor(as.character(test$died), levels = class_levels),
  prediction = predicted_classes
)

# Confusion matrix using caret. Death is the event of interest, so "TRUE" is
# declared the positive class (caret would otherwise use the first level).
confusion_matrix <- confusionMatrix(
  data = evaluation$prediction,
  reference = evaluation$truth,
  positive = "TRUE"
)
# NOTE: the matrix recorded after this call came from the pre-fix code (every
# row labeled TRUE, accuracy 0.0133); re-run to refresh it.
print(confusion_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE     0     0
##      TRUE  15100   204
##                                           
##                Accuracy : 0.0133          
##                  95% CI : (0.0116, 0.0153)
##     No Information Rate : 0.9867          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.01333         
##              Prevalence : 0.98667         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : FALSE           
## 
# Step 6: ROC Curve and AUC
# Build the test-set metrics object, then read the area under the ROC curve
# from it.
perf <- h2o.performance(model = model, newdata = h2o_test)
h2o.auc(object = perf)
## [1] 0.6660854
# Step 7: Save the model (optional)
# Persist the fitted GLM under the "h2o_glm_model" directory; force = TRUE
# permits overwriting an existing saved model with the same id.
h2o.saveModel(object = model, path = "h2o_glm_model", force = TRUE)
## [1] "C:\\Users\\eliza\\Desktop\\PSU_DAT3100\\12_module14\\h2o_glm_model\\GLM_model_R_1733436504270_24"

Prompts:

I have a dataset called members located at https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv

The goal is to predict the death of members. Please write R code to create a predictive model that predicts whether the member died or not.

- Update the code to change the name of the target variable to `died`.
- Use tidymodels instead of caret and h2o instead of glmnet.
- Error in `select()`: ! Can’t subset columns that don’t exist. ✖ Column `column_name_to_exclude` doesn’t exist. Backtrace: 1. `members %>% select(-c(column_name_to_exclude))` 3. `dplyr:::select.data.frame(., -c(column_name_to_exclude))`
- Error in `confusionMatrix(predicted_classes, as.factor(test$died))` : could not find function “confusionMatrix”
- Error in `tibble()`: ! All columns in a tibble must be vectors. ✖ Column `prediction` is a H2OFrame object.
- Error in `conf_mat()`: ✖ `truth` and `estimate` levels must be equivalent. • `truth`: FALSE and TRUE. • `estimate`: 1.
- Error in `conf_mat()`: ✖ `truth` and `estimate` levels must be equivalent. • `truth`: FALSE and TRUE. • `estimate`: TRUE.
- Load the caret package and replace the `conf_mat()` function with the `confusionMatrix()` function.
- Works fine, thanks for the help.