Apply it to your data 12

library(readr)
library(skimr)

# Now, let's read the CSV file
members <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Now, let's use skimr to get a summary of the data
skim(members)

Data summary
Name	members
Number of rows	76519
Number of columns	21
_______________________
Column type frequency:
character	10
logical	6
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
expedition_id	0	1.00	9	9	10350
member_id	0	1.00	12	12	76518
peak_id	0	1.00	4	4	391
peak_name	15	1.00	4	25	390
season	0	1.00	6	7	5
sex	2	1.00	1	1	2
citizenship	10	1.00	2	23	212
expedition_role	21	1.00	4	25	524
death_cause	75413	0.01	3	27	12
injury_type	74807	0.02	3	27	11

Variable type: logical

skim_variable	complete_rate	mean	count
hired	1	0.21	FAL: 60788, TRU: 15731
success	1	0.38	FAL: 47320, TRU: 29199
solo	1	0.00	FAL: 76398, TRU: 121
oxygen_used	1	0.24	FAL: 58286, TRU: 18233
died	1	0.01	FAL: 75413, TRU: 1106
injured	1	0.02	FAL: 74806, TRU: 1713

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2000.36	14.78	1905	1991	2004	2012	2019	▁▁▁▃▇
age	3497	0.95	37.33	10.40	7	29	36	44	85	▁▇▅▁▁
highpoint_metres	21833	0.71	7470.68	1040.06	3800	6700	7400	8400	8850	▁▁▆▃▇
death_height_metres	75451	0.01	6592.85	1308.19	400	5800	6600	7550	8830	▁▁▂▇▆
injury_height_metres	75510	0.01	7049.91	1214.24	400	6200	7100	8000	8880	▁▁▂▇▇

# Check the column names
colnames(members)

##  [1] "expedition_id"        "member_id"            "peak_id"             
##  [4] "peak_name"            "year"                 "season"              
##  [7] "sex"                  "age"                  "citizenship"         
## [10] "expedition_role"      "hired"                "highpoint_metres"    
## [13] "success"              "solo"                 "oxygen_used"         
## [16] "died"                 "death_cause"          "death_height_metres" 
## [19] "injured"              "injury_type"          "injury_height_metres"

# List column names and their data types
str(members)

## spc_tbl_ [76,519 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ expedition_id       : chr [1:76519] "AMAD78301" "AMAD78301" "AMAD78301" "AMAD78301" ...
##  $ member_id           : chr [1:76519] "AMAD78301-01" "AMAD78301-02" "AMAD78301-03" "AMAD78301-04" ...
##  $ peak_id             : chr [1:76519] "AMAD" "AMAD" "AMAD" "AMAD" ...
##  $ peak_name           : chr [1:76519] "Ama Dablam" "Ama Dablam" "Ama Dablam" "Ama Dablam" ...
##  $ year                : num [1:76519] 1978 1978 1978 1978 1978 ...
##  $ season              : chr [1:76519] "Autumn" "Autumn" "Autumn" "Autumn" ...
##  $ sex                 : chr [1:76519] "M" "M" "M" "M" ...
##  $ age                 : num [1:76519] 40 41 27 40 34 25 41 29 35 37 ...
##  $ citizenship         : chr [1:76519] "France" "France" "France" "France" ...
##  $ expedition_role     : chr [1:76519] "Leader" "Deputy Leader" "Climber" "Exp Doctor" ...
##  $ hired               : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ highpoint_metres    : num [1:76519] NA 6000 NA 6000 NA ...
##  $ success             : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ solo                : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ oxygen_used         : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ died                : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ death_cause         : chr [1:76519] NA NA NA NA ...
##  $ death_height_metres : num [1:76519] NA NA NA NA NA NA NA NA NA NA ...
##  $ injured             : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ injury_type         : chr [1:76519] NA NA NA NA ...
##  $ injury_height_metres: num [1:76519] NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   expedition_id = col_character(),
##   ..   member_id = col_character(),
##   ..   peak_id = col_character(),
##   ..   peak_name = col_character(),
##   ..   year = col_double(),
##   ..   season = col_character(),
##   ..   sex = col_character(),
##   ..   age = col_double(),
##   ..   citizenship = col_character(),
##   ..   expedition_role = col_character(),
##   ..   hired = col_logical(),
##   ..   highpoint_metres = col_double(),
##   ..   success = col_logical(),
##   ..   solo = col_logical(),
##   ..   oxygen_used = col_logical(),
##   ..   died = col_logical(),
##   ..   death_cause = col_character(),
##   ..   death_height_metres = col_double(),
##   ..   injured = col_logical(),
##   ..   injury_type = col_character(),
##   ..   injury_height_metres = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ recipes      1.0.9
## ✔ dials        1.2.0     ✔ rsample      1.2.0
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.4.4     ✔ tidyr        1.3.1
## ✔ infer        1.0.6     ✔ tune         1.1.2
## ✔ modeldata    1.3.0     ✔ workflows    1.1.3
## ✔ parsnip      1.1.1     ✔ workflowsets 1.0.1
## ✔ purrr        1.0.2     ✔ yardstick    1.3.0

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()  masks scales::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

# Preprocess the data (if necessary)
# For example, if there are missing values or categorical variables, you might need to handle them.

# Split the data into training and testing sets
set.seed(123) # For reproducibility
split <- initial_split(members, prop = 0.8)
train_data <- training(split)
test_data <- testing(split)

# Initialize and start an H2O cluster
h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         8 days 15 hours 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    4 months and 11 days 
##     H2O cluster name:           H2O_started_from_R_jasonzink_qxv383 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.20 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.1 (2022-06-23)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (4 months and 11 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

# Convert the training and testing data to H2O frames
train_h2o <- as.h2o(train_data)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

test_h2o <- as.h2o(test_data)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Define the target variable name (replace "target_variable_name" with the actual column name)
target_variable_name <- "death_height_metres"

# Define the classification model using h2o
model <- h2o.deeplearning(
  x = names(train_data)[!names(train_data) %in% target_variable_name], # Features
  y = target_variable_name, # Target variable
  training_frame = train_h2o
)

## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type].

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |======================================================================| 100%

# Make predictions on the test set
predictions <- h2o.predict(model, test_h2o)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Evaluate model performance
performance <- h2o.performance(model, test_h2o)

# Print the performance metrics
print(performance)

## H2ORegressionMetrics: deeplearning
## 
## MSE:  638569.4
## RMSE:  799.1054
## MAE:  535.8058
## RMSLE:  0.1343759
## Mean Residual Deviance :  638569.4

# Shut down the H2O cluster
h2o.shutdown()

## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?

Apply it to your data 12

Jason Zink

2024-05-02