library(tidyverse)
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
expedition <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl (6): hired, success, solo, oxygen_used, died, injured
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Goal: Predict whether Himalaya climbers died
Issues with data:
Missing values
Factors or numeric variables:
season, success, sex, injured, hired
Character variables: Convert to numbers in the recipies step
Unbalanced target variable: died
ID variable: member_id
# Treating missing values
data_clean <- expedition %>%
select( -injury_type, -death_height_metres, -injury_height_metres) %>%
na.omit()
# Adressing unnbalanced target variable
data_clean %>% count(died)
## # A tibble: 1 × 2
## died n
## <lgl> <int>
## 1 TRUE 744
data_clean %>%
ggplot(aes(died)) +
geom_bar()
#
top_10_death_cause_vec <- data_clean %>%
count(death_cause, sort = TRUE) %>%
head(10) %>%
pull(death_cause)
# Relationship between pay and attrition
data_clean %>%
filter(death_cause %in% top_10_death_cause_vec) %>%
count(died, death_cause) %>%
ggplot(aes(died, death_cause, fill = n)) +
geom_tile()
# Step 1: Binarize
data_binarized <- data_clean %>%
select(-member_id) %>% # ID variable
binarize()
data_binarized %>% glimpse()
## Rows: 744
## Columns: 107
## $ expedition_id__MANA72101 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_id__PISA94301 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `expedition_id__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ peak_id__AMAD <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ peak_id__ANN1 <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ peak_id__ANN3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__CHOY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__DHA1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__DHA4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__EVER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__HIME <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__KANG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__LANG <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__LHOT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__LSHR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__MAKA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__MANA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__PISA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__PUMO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__YALU <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `peak_id__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Ama_Dablam <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ peak_name__Annapurna_I <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ peak_name__Annapurna_III <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Cho_Oyu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Dhaulagiri_I <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Dhaulagiri_IV <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Everest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Himalchuli_East <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Kangchenjunga <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Langtang_Lirung <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Lhotse <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Lhotse_Shar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Makalu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Manaslu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Pisang <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Pumori <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Yalung_Kang <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `peak_name__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_1985` <dbl> 1, 1, 0, 0, 1, 1, 1, 1, 1, …
## $ year__1985_1995 <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ year__1995_2007.25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2007.25_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ season__Autumn <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ season__Spring <dbl> 0, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ season__Winter <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `season__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ sex__F <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ sex__M <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `age__-Inf_28.75` <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ age__28.75_35 <dbl> 0, 1, 1, 0, 1, 1, 1, 0, 0, …
## $ age__35_42 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ age__42_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Australia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Austria <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ citizenship__Bulgaria <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Canada <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ citizenship__China <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Czech_Republic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Czechoslovakia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__France <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Germany <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Hungary <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__India <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Italy <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ citizenship__Japan <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ citizenship__Nepal <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Poland <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Russia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__S_Korea <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Slovenia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Spain <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ citizenship__Switzerland <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__UK <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ citizenship__USA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__W_Germany <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `citizenship__-OTHER` <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Climber <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, …
## $ expedition_role__Deputy_Leader <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Exp_Doctor <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `expedition_role__H-A_Worker` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Leader <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ `expedition_role__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hired__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ hired__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `highpoint_metres__-Inf_6500` <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ highpoint_metres__6500_7400 <dbl> 0, 1, 0, 1, 1, 0, 1, 1, 0, …
## $ highpoint_metres__7400_8300 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ highpoint_metres__8300_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ success__0 <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, …
## $ success__1 <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ solo__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `solo__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ oxygen_used__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ oxygen_used__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ death_cause__AMS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ death_cause__Avalanche <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ death_cause__Crevasse <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `death_cause__Disappearance_(unexplained)` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ death_cause__Exhaustion <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `death_cause__Exposure_/_frostbite` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ death_cause__Fall <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ `death_cause__Falling_rock_/_ice` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `death_cause__Illness_(non-AMS)` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `death_cause__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ injured__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `injured__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlation
data_correlation <- data_binarized %>%
correlate(death_cause__Avalanche)
data_correlation
## # A tibble: 107 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 death_cause Avalanche 1
## 2 death_cause Fall -0.472
## 3 highpoint_metres -Inf_6500 0.397
## 4 success 0 0.339
## 5 success 1 -0.339
## 6 highpoint_metres 8300_Inf -0.313
## 7 oxygen_used 0 0.275
## 8 oxygen_used 1 -0.275
## 9 season Autumn 0.271
## 10 season Spring -0.257
## # ℹ 97 more rows
# Step 3: Plot
data_correlation %>%
correlationfunnel::plot_correlation_funnel()
## Warning: ggrepel: 69 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps