library(tidyverse)
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
expedition <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Goal: Predict whether Himalaya climbers died

Issues with data:

Missing values

Factors or numeric variables:

season, success, sex, injured, hired

Character variables: Convert to numbers in the recipies step

Unbalanced target variable: died

ID variable: member_id

Clean Data

# Treating missing values
data_clean <- expedition %>% 
    select( -injury_type, -death_height_metres, -injury_height_metres) %>%
    na.omit()

Explore Data

# Adressing unnbalanced target variable
data_clean %>% count(died)
## # A tibble: 1 × 2
##   died      n
##   <lgl> <int>
## 1 TRUE    744
data_clean %>%
    ggplot(aes(died)) +
    geom_bar()

#

top_10_death_cause_vec <- data_clean %>% 
    count(death_cause, sort = TRUE) %>% 
    head(10) %>% 
    pull(death_cause)

# Relationship between pay and attrition
data_clean %>%
    filter(death_cause %in% top_10_death_cause_vec) %>%
    count(died, death_cause) %>%
    ggplot(aes(died, death_cause, fill = n)) +
    geom_tile()

Relationship in all variables with Correlation Plot

 # Step 1: Binarize
data_binarized <- data_clean %>%
    select(-member_id) %>% # ID variable
    binarize()

data_binarized %>% glimpse()
## Rows: 744
## Columns: 107
## $ expedition_id__MANA72101                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_id__PISA94301                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `expedition_id__-OTHER`                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ peak_id__AMAD                              <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ peak_id__ANN1                              <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ peak_id__ANN3                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__CHOY                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__DHA1                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__DHA4                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__EVER                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__HIME                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__KANG                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__LANG                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__LHOT                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__LSHR                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__MAKA                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__MANA                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__PISA                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__PUMO                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__YALU                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `peak_id__-OTHER`                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Ama_Dablam                      <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ peak_name__Annapurna_I                     <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ peak_name__Annapurna_III                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Cho_Oyu                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Dhaulagiri_I                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Dhaulagiri_IV                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Everest                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Himalchuli_East                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Kangchenjunga                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Langtang_Lirung                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Lhotse                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Lhotse_Shar                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Makalu                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Manaslu                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Pisang                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Pumori                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_name__Yalung_Kang                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `peak_name__-OTHER`                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_1985`                          <dbl> 1, 1, 0, 0, 1, 1, 1, 1, 1, …
## $ year__1985_1995                            <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, …
## $ year__1995_2007.25                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2007.25_Inf                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ season__Autumn                             <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ season__Spring                             <dbl> 0, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ season__Winter                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `season__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ sex__F                                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ sex__M                                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `age__-Inf_28.75`                          <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ age__28.75_35                              <dbl> 0, 1, 1, 0, 1, 1, 1, 0, 0, …
## $ age__35_42                                 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ age__42_Inf                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Australia                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Austria                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ citizenship__Bulgaria                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Canada                        <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ citizenship__China                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Czech_Republic                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Czechoslovakia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__France                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Germany                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Hungary                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__India                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Italy                         <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ citizenship__Japan                         <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ citizenship__Nepal                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Poland                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Russia                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__S_Korea                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Slovenia                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Spain                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ citizenship__Switzerland                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__UK                            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ citizenship__USA                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__W_Germany                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `citizenship__-OTHER`                      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Climber                   <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, …
## $ expedition_role__Deputy_Leader             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Exp_Doctor                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `expedition_role__H-A_Worker`              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Leader                    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ `expedition_role__-OTHER`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hired__0                                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ hired__1                                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `highpoint_metres__-Inf_6500`              <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ highpoint_metres__6500_7400                <dbl> 0, 1, 0, 1, 1, 0, 1, 1, 0, …
## $ highpoint_metres__7400_8300                <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ highpoint_metres__8300_Inf                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ success__0                                 <dbl> 1, 0, 1, 0, 1, 1, 1, 1, 1, …
## $ success__1                                 <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ solo__0                                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `solo__-OTHER`                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ oxygen_used__0                             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ oxygen_used__1                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ death_cause__AMS                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ death_cause__Avalanche                     <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, …
## $ death_cause__Crevasse                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `death_cause__Disappearance_(unexplained)` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ death_cause__Exhaustion                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `death_cause__Exposure_/_frostbite`        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ death_cause__Fall                          <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ `death_cause__Falling_rock_/_ice`          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `death_cause__Illness_(non-AMS)`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `death_cause__-OTHER`                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ injured__0                                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `injured__-OTHER`                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlation
data_correlation <- data_binarized %>%
    correlate(death_cause__Avalanche)

data_correlation
## # A tibble: 107 × 3
##    feature          bin       correlation
##    <fct>            <chr>           <dbl>
##  1 death_cause      Avalanche       1    
##  2 death_cause      Fall           -0.472
##  3 highpoint_metres -Inf_6500       0.397
##  4 success          0               0.339
##  5 success          1              -0.339
##  6 highpoint_metres 8300_Inf       -0.313
##  7 oxygen_used      0               0.275
##  8 oxygen_used      1              -0.275
##  9 season           Autumn          0.271
## 10 season           Spring         -0.257
## # ℹ 97 more rows
# Step 3: Plot
data_correlation %>%
    correlationfunnel::plot_correlation_funnel() 
## Warning: ggrepel: 69 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps