Import Data

# Download the TidyTuesday 2020-09-22 `members.csv` file into a tibble.
members <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv"
)
## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column of `members` by type: missing counts, completeness,
# and basic distribution statistics. Used below to decide which columns to drop.
skimr::skim(members)
Data summary
Name members
Number of rows 76519
Number of columns 21
_______________________
Column type frequency:
character 10
logical 6
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
expedition_id 0 1.00 9 9 0 10350 0
member_id 0 1.00 12 12 0 76518 0
peak_id 0 1.00 4 4 0 391 0
peak_name 15 1.00 4 25 0 390 0
season 0 1.00 6 7 0 5 0
sex 2 1.00 1 1 0 2 0
citizenship 10 1.00 2 23 0 212 0
expedition_role 21 1.00 4 25 0 524 0
death_cause 75413 0.01 3 27 0 12 0
injury_type 74807 0.02 3 27 0 11 0

Variable type: logical

skim_variable n_missing complete_rate mean count
hired 0 1 0.21 FAL: 60788, TRU: 15731
success 0 1 0.38 FAL: 47320, TRU: 29199
solo 0 1 0.00 FAL: 76398, TRU: 121
oxygen_used 0 1 0.24 FAL: 58286, TRU: 18233
died 0 1 0.01 FAL: 75413, TRU: 1106
injured 0 1 0.02 FAL: 74806, TRU: 1713

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2000.36 14.78 1905 1991 2004 2012 2019 ▁▁▁▃▇
age 3497 0.95 37.33 10.40 7 29 36 44 85 ▁▇▅▁▁
highpoint_metres 21833 0.71 7470.68 1040.06 3800 6700 7400 8400 8850 ▁▁▆▃▇
death_height_metres 75451 0.01 6592.85 1308.19 400 5800 6600 7550 8830 ▁▁▂▇▆
injury_height_metres 75510 0.01 7049.91 1214.24 400 6200 7100 8000 8880 ▁▁▂▇▇
# Build the modelling table from `members`.
data <- members %>%

  # Treat missing values: remove the sparsely populated columns
  # (complete_rate of 0.71 or lower in the skim summary), then drop every
  # row that still contains an NA.
  select(-c(
    death_cause,
    injury_type,
    highpoint_metres,
    death_height_metres,
    injury_height_metres
  )) %>%
  na.omit() %>%

  # Recode the logical flags (hired, success, solo, ...) as factors;
  # presumably so the later binarize() step treats them as categories.
  mutate(across(where(is.logical), as.factor))
# Step 1: Prepare data
# Turn every column into 0/1 indicator columns. The arguments below are the
# binarize() defaults written out: numeric columns are cut into 4 bins and
# categories rarer than 1% are pooled into "-OTHER".
# NOTE(review): expedition_id and member_id are identifiers and still get
# binarized here; consider dropping them alongside peak_name.
data_binarized_tbl <- binarize(
  select(data, -peak_name),
  n_bins        = 4,
  thresh_infreq = 0.01,
  name_infreq   = "-OTHER",
  one_hot       = TRUE
)

glimpse(data_binarized_tbl)
## Rows: 72,985
## Columns: 71
## $ expedition_id__EVER88101       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `expedition_id__-OTHER`        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `member_id__KANG10101-01`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `member_id__-OTHER`            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ peak_id__AMAD                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ peak_id__ANN1                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__ANN4                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__BARU                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__CHOY                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__DHA1                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__EVER                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__HIML                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__KANG                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__LHOT                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__MAKA                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__MANA                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ peak_id__PUMO                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `peak_id__-OTHER`              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `year__-Inf_1992`              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ year__1992_2004                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2004_2012                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2012_Inf                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ season__Autumn                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ season__Spring                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, …
## $ season__Winter                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `season__-OTHER`               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ sex__F                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ sex__M                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `age__-Inf_29`                 <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, …
## $ age__29_36                     <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ age__36_44                     <dbl> 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
## $ age__44_Inf                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Australia         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Austria           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Canada            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__China             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__France            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ citizenship__Germany           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__India             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Italy             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Japan             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Nepal             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Netherlands       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__New_Zealand       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Poland            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Russia            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__S_Korea           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Spain             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__Switzerland       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__UK                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ citizenship__USA               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, …
## $ citizenship__W_Germany         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `citizenship__-OTHER`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Climber       <dbl> 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, …
## $ expedition_role__Deputy_Leader <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Exp_Doctor    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `expedition_role__H-A_Worker`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ expedition_role__Leader        <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `expedition_role__-OTHER`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ hired__FALSE                   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ hired__TRUE                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ success__FALSE                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, …
## $ success__TRUE                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ solo__FALSE                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ `solo__-OTHER`                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ oxygen_used__FALSE             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ oxygen_used__TRUE              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ died__FALSE                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ died__TRUE                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ injured__FALSE                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ injured__TRUE                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
# Correlate each binarized feature with the `died__TRUE` indicator.
data_corr_tbl <- correlate(data_binarized_tbl, target = died__TRUE)
## Warning: correlate(): [Data Imbalance Detected] Consider sampling to balance the classes more than 5%
##   Column with imbalance: died__TRUE
# Print the feature / bin / correlation table computed in Step 2.
data_corr_tbl
## # A tibble: 71 × 3
##    feature bin       correlation
##    <fct>   <chr>           <dbl>
##  1 died    FALSE         -1     
##  2 died    TRUE           1     
##  3 year    -Inf_1992      0.0519
##  4 peak_id ANN1           0.0336
##  5 success FALSE          0.0332
##  6 success TRUE          -0.0332
##  7 peak_id DHA1           0.0290
##  8 peak_id AMAD          -0.0281
##  9 peak_id CHOY          -0.0241
## 10 year    2004_2012     -0.0211
## # ℹ 61 more rows
# Step 3: Plot
# Draw the correlation funnel for the table computed in Step 2.
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 41 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Conclusion

After correlating all of my variables with the outcome (whether a member died), the results were weak. The strongest predictor was the earliest year bin (-Inf_1992), with a correlation of only about 0.05, which is far too low to be useful and not what I was looking for. I chose this data set because I was curious about it and found it the most interesting to work with, but the results did not work out as I had hoped.