library(readr)
library(skimr)
# Load the TidyTuesday (2020-09-22) `members` table straight from GitHub.
# readr guesses the column types and reports them in a message.
members <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv"
)
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print skimr's per-column overview of the table, grouped by column type.
members |> skim()
Data summary
| Name |
members |
| Number of rows |
76519 |
| Number of columns |
21 |
| _______________________ |
|
| Column type frequency: |
|
| character |
10 |
| logical |
6 |
| numeric |
5 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| expedition_id |
0 |
1.00 |
9 |
9 |
0 |
10350 |
0 |
| member_id |
0 |
1.00 |
12 |
12 |
0 |
76518 |
0 |
| peak_id |
0 |
1.00 |
4 |
4 |
0 |
391 |
0 |
| peak_name |
15 |
1.00 |
4 |
25 |
0 |
390 |
0 |
| season |
0 |
1.00 |
6 |
7 |
0 |
5 |
0 |
| sex |
2 |
1.00 |
1 |
1 |
0 |
2 |
0 |
| citizenship |
10 |
1.00 |
2 |
23 |
0 |
212 |
0 |
| expedition_role |
21 |
1.00 |
4 |
25 |
0 |
524 |
0 |
| death_cause |
75413 |
0.01 |
3 |
27 |
0 |
12 |
0 |
| injury_type |
74807 |
0.02 |
3 |
27 |
0 |
11 |
0 |
Variable type: logical
| hired |
0 |
1 |
0.21 |
FAL: 60788, TRU: 15731 |
| success |
0 |
1 |
0.38 |
FAL: 47320, TRU: 29199 |
| solo |
0 |
1 |
0.00 |
FAL: 76398, TRU: 121 |
| oxygen_used |
0 |
1 |
0.24 |
FAL: 58286, TRU: 18233 |
| died |
0 |
1 |
0.01 |
FAL: 75413, TRU: 1106 |
| injured |
0 |
1 |
0.02 |
FAL: 74806, TRU: 1713 |
Variable type: numeric
| year |
0 |
1.00 |
2000.36 |
14.78 |
1905 |
1991 |
2004 |
2012 |
2019 |
▁▁▁▃▇ |
| age |
3497 |
0.95 |
37.33 |
10.40 |
7 |
29 |
36 |
44 |
85 |
▁▇▅▁▁ |
| highpoint_metres |
21833 |
0.71 |
7470.68 |
1040.06 |
3800 |
6700 |
7400 |
8400 |
8850 |
▁▁▆▃▇ |
| death_height_metres |
75451 |
0.01 |
6592.85 |
1308.19 |
400 |
5800 |
6600 |
7550 |
8830 |
▁▁▂▇▆ |
| injury_height_metres |
75510 |
0.01 |
7049.91 |
1214.24 |
400 |
6200 |
7100 |
8000 |
8880 |
▁▁▂▇▇ |
# Show the column names of the table
names(members)
## [1] "expedition_id" "member_id" "peak_id"
## [4] "peak_name" "year" "season"
## [7] "sex" "age" "citizenship"
## [10] "expedition_role" "hired" "highpoint_metres"
## [13] "success" "solo" "oxygen_used"
## [16] "died" "death_cause" "death_height_metres"
## [19] "injured" "injury_type" "injury_height_metres"
# Compact structure dump: one line per column with its type and first values,
# followed by the attributes readr attached to the tibble.
utils::str(members)
## spc_tbl_ [76,519 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ expedition_id : chr [1:76519] "AMAD78301" "AMAD78301" "AMAD78301" "AMAD78301" ...
## $ member_id : chr [1:76519] "AMAD78301-01" "AMAD78301-02" "AMAD78301-03" "AMAD78301-04" ...
## $ peak_id : chr [1:76519] "AMAD" "AMAD" "AMAD" "AMAD" ...
## $ peak_name : chr [1:76519] "Ama Dablam" "Ama Dablam" "Ama Dablam" "Ama Dablam" ...
## $ year : num [1:76519] 1978 1978 1978 1978 1978 ...
## $ season : chr [1:76519] "Autumn" "Autumn" "Autumn" "Autumn" ...
## $ sex : chr [1:76519] "M" "M" "M" "M" ...
## $ age : num [1:76519] 40 41 27 40 34 25 41 29 35 37 ...
## $ citizenship : chr [1:76519] "France" "France" "France" "France" ...
## $ expedition_role : chr [1:76519] "Leader" "Deputy Leader" "Climber" "Exp Doctor" ...
## $ hired : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ highpoint_metres : num [1:76519] NA 6000 NA 6000 NA ...
## $ success : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ solo : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ oxygen_used : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ died : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ death_cause : chr [1:76519] NA NA NA NA ...
## $ death_height_metres : num [1:76519] NA NA NA NA NA NA NA NA NA NA ...
## $ injured : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ injury_type : chr [1:76519] NA NA NA NA ...
## $ injury_height_metres: num [1:76519] NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. expedition_id = col_character(),
## .. member_id = col_character(),
## .. peak_id = col_character(),
## .. peak_name = col_character(),
## .. year = col_double(),
## .. season = col_character(),
## .. sex = col_character(),
## .. age = col_double(),
## .. citizenship = col_character(),
## .. expedition_role = col_character(),
## .. hired = col_logical(),
## .. highpoint_metres = col_double(),
## .. success = col_logical(),
## .. solo = col_logical(),
## .. oxygen_used = col_logical(),
## .. died = col_logical(),
## .. death_cause = col_character(),
## .. death_height_metres = col_double(),
## .. injured = col_logical(),
## .. injury_type = col_character(),
## .. injury_height_metres = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ recipes 1.0.9
## ✔ dials 1.2.0 ✔ rsample 1.2.0
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.4.4 ✔ tidyr 1.3.1
## ✔ infer 1.0.6 ✔ tune 1.1.2
## ✔ modeldata 1.3.0 ✔ workflows 1.1.3
## ✔ parsnip 1.1.1 ✔ workflowsets 1.0.1
## ✔ purrr 1.0.2 ✔ yardstick 1.3.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Preprocess the data.
# When the frames were uploaded with character columns, h2o.deeplearning()
# dropped all ten of them as "bad" columns (see the training warning), so no
# categorical predictor reached the model. Encoding them as factors makes
# as.h2o() upload them as categorical (enum) columns instead.
# Identifier-like columns are deliberately left as character: expedition_id
# and member_id have thousands of distinct values, and peak_name appears to be
# a label for peak_id, so turning them into factors would only add noise.
id_cols <- c("expedition_id", "member_id", "peak_name")
members_model <- members |>
  mutate(
    # base:: is explicit because library(h2o) masks is.character()
    across(where(base::is.character) & !all_of(id_cols), factor)
  )

# Split the data into training (80%) and testing (20%) sets.
# Factors are created before the split so both sets share the same levels.
set.seed(123) # Makes the random split reproducible
split <- initial_split(members_model, prop = 0.8)
train_data <- training(split)
test_data <- testing(split)

# Initialize and start an H2O cluster (or connect to one already running)
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 8 days 15 hours
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 4 months and 11 days
## H2O cluster name: H2O_started_from_R_jasonzink_qxv383
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.20 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.2.1 (2022-06-23)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (4 months and 11 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the training split to the H2O cluster as an H2O frame
train_h2o <- as.h2o(x = train_data)
##
|
| | 0%
|
|======================================================================| 100%
# Upload the held-out testing split to the H2O cluster as an H2O frame
test_h2o <- as.h2o(x = test_data)
##
|
| | 0%
|
|======================================================================| 100%
# Response column for the model.
# NOTE(review): per the skim() summary above this column is missing for 75451
# of 76519 rows (complete_rate 0.01), so only about 1% of the data carries a
# response. Presumably H2O skips rows with a missing response -- confirm this
# is the intended target before relying on the metrics.
target_variable_name <- "death_height_metres"

# Every column except the response is offered as a predictor
predictor_names <- setdiff(names(train_data), target_variable_name)

# Fit a deep learning model. The response is numeric, so H2O fits a
# regression (not a classification) model.
model <- h2o.deeplearning(
  x = predictor_names, # Features
  y = target_variable_name, # Target variable
  training_frame = train_h2o,
  # set.seed() only affects R; H2O needs its own seed. Multi-threaded deep
  # learning is not deterministic even with a seed, so reproducible = TRUE
  # (single-threaded, slower) is also required for repeatable results.
  seed = 123,
  reproducible = TRUE
)
## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [member_id, peak_name, death_cause, peak_id, sex, citizenship, expedition_role, season, expedition_id, injury_type].
##
|
| | 0%
|
|=============================================================== | 90%
|
|======================================================================| 100%
# Score the held-out frame with the fitted model
predictions <- h2o.predict(object = model, newdata = test_h2o)
##
|
| | 0%
|
|======================================================================| 100%
# Compute the model's metrics on the held-out frame
performance <- h2o.performance(model = model, newdata = test_h2o)
# Display the metrics
print(performance)
## H2ORegressionMetrics: deeplearning
##
## MSE: 638569.4
## RMSE: 799.1054
## MAE: 535.8058
## RMSLE: 0.1343759
## Mean Residual Deviance : 638569.4
# Shut down the H2O cluster. prompt = FALSE skips the interactive Y/N
# confirmation so the script can finish when run unattended.
h2o.shutdown(prompt = FALSE)
## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?