Goal: Build a classification model to predict whether the person died
in an expedition
Import Data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 4.4.1
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.4.1
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.1
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.4.1
## Loading required package: recipes
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Explore Data
skimr::skim(members)
Data summary
| Name |
members |
| Number of rows |
76519 |
| Number of columns |
21 |
| _______________________ |
|
| Column type frequency: |
|
| character |
10 |
| logical |
6 |
| numeric |
5 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| expedition_id |
0 |
1.00 |
9 |
9 |
0 |
10350 |
0 |
| member_id |
0 |
1.00 |
12 |
12 |
0 |
76518 |
0 |
| peak_id |
0 |
1.00 |
4 |
4 |
0 |
391 |
0 |
| peak_name |
15 |
1.00 |
4 |
25 |
0 |
390 |
0 |
| season |
0 |
1.00 |
6 |
7 |
0 |
5 |
0 |
| sex |
2 |
1.00 |
1 |
1 |
0 |
2 |
0 |
| citizenship |
10 |
1.00 |
2 |
23 |
0 |
212 |
0 |
| expedition_role |
21 |
1.00 |
4 |
25 |
0 |
524 |
0 |
| death_cause |
75413 |
0.01 |
3 |
27 |
0 |
12 |
0 |
| injury_type |
74807 |
0.02 |
3 |
27 |
0 |
11 |
0 |
Variable type: logical
| hired |
0 |
1 |
0.21 |
FAL: 60788, TRU: 15731 |
| success |
0 |
1 |
0.38 |
FAL: 47320, TRU: 29199 |
| solo |
0 |
1 |
0.00 |
FAL: 76398, TRU: 121 |
| oxygen_used |
0 |
1 |
0.24 |
FAL: 58286, TRU: 18233 |
| died |
0 |
1 |
0.01 |
FAL: 75413, TRU: 1106 |
| injured |
0 |
1 |
0.02 |
FAL: 74806, TRU: 1713 |
Variable type: numeric
| year |
0 |
1.00 |
2000.36 |
14.78 |
1905 |
1991 |
2004 |
2012 |
2019 |
▁▁▁▃▇ |
| age |
3497 |
0.95 |
37.33 |
10.40 |
7 |
29 |
36 |
44 |
85 |
▁▇▅▁▁ |
| highpoint_metres |
21833 |
0.71 |
7470.68 |
1040.06 |
3800 |
6700 |
7400 |
8400 |
8850 |
▁▁▆▃▇ |
| death_height_metres |
75451 |
0.01 |
6592.85 |
1308.19 |
400 |
5800 |
6600 |
7550 |
8830 |
▁▁▂▇▆ |
| injury_height_metres |
75510 |
0.01 |
7049.91 |
1214.24 |
400 |
6200 |
7100 |
8000 |
8880 |
▁▁▂▇▇ |
str(members)
## spc_tbl_ [76,519 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ expedition_id : chr [1:76519] "AMAD78301" "AMAD78301" "AMAD78301" "AMAD78301" ...
## $ member_id : chr [1:76519] "AMAD78301-01" "AMAD78301-02" "AMAD78301-03" "AMAD78301-04" ...
## $ peak_id : chr [1:76519] "AMAD" "AMAD" "AMAD" "AMAD" ...
## $ peak_name : chr [1:76519] "Ama Dablam" "Ama Dablam" "Ama Dablam" "Ama Dablam" ...
## $ year : num [1:76519] 1978 1978 1978 1978 1978 ...
## $ season : chr [1:76519] "Autumn" "Autumn" "Autumn" "Autumn" ...
## $ sex : chr [1:76519] "M" "M" "M" "M" ...
## $ age : num [1:76519] 40 41 27 40 34 25 41 29 35 37 ...
## $ citizenship : chr [1:76519] "France" "France" "France" "France" ...
## $ expedition_role : chr [1:76519] "Leader" "Deputy Leader" "Climber" "Exp Doctor" ...
## $ hired : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ highpoint_metres : num [1:76519] NA 6000 NA 6000 NA ...
## $ success : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ solo : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ oxygen_used : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ died : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ death_cause : chr [1:76519] NA NA NA NA ...
## $ death_height_metres : num [1:76519] NA NA NA NA NA NA NA NA NA NA ...
## $ injured : logi [1:76519] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ injury_type : chr [1:76519] NA NA NA NA ...
## $ injury_height_metres: num [1:76519] NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. expedition_id = col_character(),
## .. member_id = col_character(),
## .. peak_id = col_character(),
## .. peak_name = col_character(),
## .. year = col_double(),
## .. season = col_character(),
## .. sex = col_character(),
## .. age = col_double(),
## .. citizenship = col_character(),
## .. expedition_role = col_character(),
## .. hired = col_logical(),
## .. highpoint_metres = col_double(),
## .. success = col_logical(),
## .. solo = col_logical(),
## .. oxygen_used = col_logical(),
## .. died = col_logical(),
## .. death_cause = col_character(),
## .. death_height_metres = col_double(),
## .. injured = col_logical(),
## .. injury_type = col_character(),
## .. injury_height_metres = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
variances <- apply(members, 2, var, na.rm = TRUE)
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
zero_variance_columns <- names(variances[variances == 0])
print(zero_variance_columns)
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Clean Data
factors_vec <- c("death_cause", "injury_type", "highpoint_metres", "death_height_metres")
members_clean <- members %>%
select(-expedition_id, -member_id, -peak_id, -peak_name, -year, -season, -sex, -citizenship, -expedition_role, -hired, -success, -solo, -oxygen_used, -injured)
# Adressing Missing Values
members_clean <- members_clean %>%
# Impute missing numeric columns with the median
mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .))) %>%
# Impute missing categorical columns with "Unknown"
mutate(across(where(is.character), ~ ifelse(is.na(.), "Unknown", .)))
skimr::skim(members_clean)
Data summary
| Name |
members_clean |
| Number of rows |
76519 |
| Number of columns |
7 |
| _______________________ |
|
| Column type frequency: |
|
| character |
2 |
| logical |
1 |
| numeric |
4 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| death_cause |
0 |
1 |
3 |
27 |
0 |
12 |
0 |
| injury_type |
0 |
1 |
3 |
27 |
0 |
12 |
0 |
Variable type: logical
| died |
0 |
1 |
0.01 |
FAL: 75413, TRU: 1106 |
Variable type: numeric
| age |
0 |
1 |
37.27 |
10.16 |
7 |
30 |
36 |
43 |
85 |
▁▇▅▁▁ |
| highpoint_metres |
0 |
1 |
7450.51 |
879.83 |
3800 |
6814 |
7400 |
8167 |
8850 |
▁▁▃▇▆ |
| death_height_metres |
0 |
1 |
6599.90 |
154.48 |
400 |
6600 |
6600 |
6600 |
8830 |
▁▁▁▇▁ |
| injury_height_metres |
0 |
1 |
7099.34 |
139.48 |
400 |
7100 |
7100 |
7100 |
8880 |
▁▁▁▇▁ |
Continue Exploring Data
members_clean %>%
ggplot(aes(x = age, fill = as.factor(died))) +
geom_histogram(binwidth = 5, alpha = 0.7, position = "dodge") +
facet_wrap(~ as.factor(died), scales = "free_y") +
labs(
title = "Age Distribution by Survival Status",
x = "Age",
y = "count",
fill = "Died"
) +
theme_minimal()

Prepare DAta
data_binarized_tbl <- members_clean %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 76,519
## Columns: 18
## $ `age__-Inf_30` <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, …
## $ age__30_36 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ age__36_43 <dbl> 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ age__43_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ `highpoint_metres__-Inf_6814` <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, …
## $ highpoint_metres__6814_7400 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ highpoint_metres__7400_8167 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ highpoint_metres__8167_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ died__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ died__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ death_cause__Unknown <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `death_cause__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `death_height_metres__-Inf_6600` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ death_height_metres__6600_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ injury_type__Unknown <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `injury_type__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `injury_height_metres__-Inf_7100` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ injury_height_metres__7100_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate( died__0 )
data_corr_tbl
## # A tibble: 18 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 died 0 1
## 2 died 1 -1
## 3 death_cause Unknown 0.995
## 4 death_cause -OTHER -0.995
## 5 death_height_metres -Inf_6600 0.686
## 6 death_height_metres 6600_Inf -0.686
## 7 age 30_36 -0.0328
## 8 age 43_Inf 0.0181
## 9 age 36_43 0.0175
## 10 injury_type -OTHER 0.0161
## 11 injury_type Unknown -0.0161
## 12 highpoint_metres 7400_8167 -0.0144
## 13 highpoint_metres 8167_Inf 0.0133
## 14 injury_height_metres -Inf_7100 -0.00972
## 15 injury_height_metres 7100_Inf 0.00972
## 16 highpoint_metres -Inf_6814 -0.00679
## 17 highpoint_metres 6814_7400 0.00343
## 18 age -Inf_30 -0.000782
Plot
data_corr_tbl %>%
plot_correlation_funnel()
