Import Data

members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean Data

skimr::skim(members)
Data summary
Name members
Number of rows 76519
Number of columns 21
_______________________
Column type frequency:
character 10
logical 6
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
expedition_id 0 1.00 9 9 0 10350 0
member_id 0 1.00 12 12 0 76518 0
peak_id 0 1.00 4 4 0 391 0
peak_name 15 1.00 4 25 0 390 0
season 0 1.00 6 7 0 5 0
sex 2 1.00 1 1 0 2 0
citizenship 10 1.00 2 23 0 212 0
expedition_role 21 1.00 4 25 0 524 0
death_cause 75413 0.01 3 27 0 12 0
injury_type 74807 0.02 3 27 0 11 0

Variable type: logical

skim_variable n_missing complete_rate mean count
hired 0 1 0.21 FAL: 60788, TRU: 15731
success 0 1 0.38 FAL: 47320, TRU: 29199
solo 0 1 0.00 FAL: 76398, TRU: 121
oxygen_used 0 1 0.24 FAL: 58286, TRU: 18233
died 0 1 0.01 FAL: 75413, TRU: 1106
injured 0 1 0.02 FAL: 74806, TRU: 1713

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2000.36 14.78 1905 1991 2004 2012 2019 ▁▁▁▃▇
age 3497 0.95 37.33 10.40 7 29 36 44 85 ▁▇▅▁▁
highpoint_metres 21833 0.71 7470.68 1040.06 3800 6700 7400 8400 8850 ▁▁▆▃▇
death_height_metres 75451 0.01 6592.85 1308.19 400 5800 6600 7550 8830 ▁▁▂▇▆
injury_height_metres 75510 0.01 7049.91 1214.24 400 6200 7100 8000 8880 ▁▁▂▇▇

Issues with data: - numeric variables - year, age, highpoint_metres, death_height_metres, injury_height_metres - zero variance variables - character variables - convert to numbers in recipes step - ID variable - expedition_id

factors_vec <- members %>% select(year, age, highpoint_metres, death_height_metres, injury_height_metres) %>% names()

data_clean <- members %>% 
    
    # Drop Variables
    select(-c(death_height_metres, injury_height_metres, death_cause, injury_type))

Explore Data

data_clean %>% count(died)
## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 75413
## 2 TRUE   1106
data_clean %>% 
    ggplot(aes(died)) + 
    geom_bar()

died vs. expedition_id

data_clean %>% 
    ggplot(aes(died, season)) + 
    geom_boxplot()

Correlation Plot

# Step 1: Binarize
data_binarized <- data_clean %>% 
    select(-expedition_id, -highpoint_metres, -age, -expedition_role, -peak_name, -citizenship, -sex) %>% 
    binarize()

data_binarized %>% glimpse()
## Rows: 76,519
## Columns: 36
## $ `member_id__KANG10101-01` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `member_id__-OTHER`       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ peak_id__AMAD             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ peak_id__ANN1             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__ANN4             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__BARU             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__CHOY             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__DHA1             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__EVER             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__HIML             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__KANG             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__LHOT             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__MAKA             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__MANA             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ peak_id__PUMO             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `peak_id__-OTHER`         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `year__-Inf_1991`         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ year__1991_2004           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year__2004_2012           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year__2012_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ season__Autumn            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ season__Spring            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,…
## $ season__Winter            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `season__-OTHER`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ hired__0                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hired__1                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ success__0                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,…
## $ success__1                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,…
## $ solo__0                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `solo__-OTHER`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ oxygen_used__0            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ oxygen_used__1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ died__0                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ died__1                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ injured__0                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ injured__1                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Step 2: Correlation
data_correlation <- data_binarized %>% 
    correlate(died__1)
## Warning: correlate(): [Data Imbalance Detected] Consider sampling to balance the classes more than 5%
##   Column with imbalance: died__1
data_correlation
## # A tibble: 36 × 3
##    feature bin       correlation
##    <fct>   <chr>           <dbl>
##  1 died    0             -1     
##  2 died    1              1     
##  3 year    -Inf_1991      0.0616
##  4 success 0              0.0415
##  5 success 1             -0.0415
##  6 peak_id ANN1           0.0359
##  7 peak_id AMAD          -0.0313
##  8 peak_id DHA1           0.0288
##  9 peak_id CHOY          -0.0261
## 10 year    2012_Inf      -0.0260
## # ℹ 26 more rows
# Step 3: Plot
data_correlation %>% 
    correlationfunnel::plot_correlation_funnel()
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps