Goal: Predict whether Himalayan climbers died

Import Data

members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-09-22/members.csv')

## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(members)

Data summary
Name	members
Number of rows	76519
Number of columns	21
_______________________
Column type frequency:
character	10
logical	6
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
expedition_id	0	1.00	9	9	10350
member_id	0	1.00	12	12	76518
peak_id	0	1.00	4	4	391
peak_name	15	1.00	4	25	390
season	0	1.00	6	7	5
sex	2	1.00	1	1	2
citizenship	10	1.00	2	23	212
expedition_role	21	1.00	4	25	524
death_cause	75413	0.01	3	27	12
injury_type	74807	0.02	3	27	11

Variable type: logical

skim_variable	complete_rate	mean	count
hired	1	0.21	FAL: 60788, TRU: 15731
success	1	0.38	FAL: 47320, TRU: 29199
solo	1	0.00	FAL: 76398, TRU: 121
oxygen_used	1	0.24	FAL: 58286, TRU: 18233
died	1	0.01	FAL: 75413, TRU: 1106
injured	1	0.02	FAL: 74806, TRU: 1713

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2000.36	14.78	1905	1991	2004	2012	2019	▁▁▁▃▇
age	3497	0.95	37.33	10.40	7	29	36	44	85	▁▇▅▁▁
highpoint_metres	21833	0.71	7470.68	1040.06	3800	6700	7400	8400	8850	▁▁▆▃▇
death_height_metres	75451	0.01	6592.85	1308.19	400	5800	6600	7550	8830	▁▁▂▇▆
injury_height_metres	75510	0.01	7049.91	1214.24	400	6200	7100	8000	8880	▁▁▂▇▇

# Remove variables with too many missing values
members_clean <- members %>%
  select(-c( highpoint_metres, death_height_metres, death_cause, injured, injury_type, injury_height_metres, age)) %>%
    
# Remove Irrelevant Variables
    select(-oxygen_used, -solo, -hired) %>%
    
# Remove Redundant Variables
    select(-c(peak_id)) %>%

# Remove Duplicates in Member_id
    distinct(member_id, .keep_all = TRUE) %>%
    
    select(-expedition_role, -peak_name, -citizenship, -sex) %>%
    na.omit() %>%

    mutate(across(where(is.logical), as.factor)) %>%
    mutate(across(where(is.character), as.factor))

Explore Data

members_clean %>% count(died)

## # A tibble: 2 × 2
##   died      n
##   <fct> <int>
## 1 FALSE 75412
## 2 TRUE   1106

members_clean %>%
  ggplot(aes(died)) +
  geom_bar()

Death vs Success Rate

ggplot(members_clean, aes(x = died, fill = as.factor(success))) +
  geom_bar(position = "fill") +
  labs(title = "Proportion of Success by Died Status",
       x = "Died",
       y = "Proportion",
       fill = "Success")

Correlation Plot

# Step 1: binarize
member_binarized <- members_clean %>%
  binarize()

member_binarized %>% glimpse()

## Rows: 76,518
## Columns: 16
## $ expedition_id__EVER88101  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `expedition_id__-OTHER`   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `member_id__ACHN15301-01` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `member_id__-OTHER`       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ `year__-Inf_1991`         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ year__1991_2004           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year__2004_2012           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year__2012_Inf            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ season__Autumn            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ season__Spring            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,…
## $ season__Winter            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `season__-OTHER`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ success__FALSE            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,…
## $ success__TRUE             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,…
## $ died__FALSE               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ died__TRUE                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

# Step 2: correlation
member_correlation <- member_binarized %>%
  correlate(success__FALSE)

member_correlation

## # A tibble: 16 × 3
##    feature       bin          correlation
##    <fct>         <chr>              <dbl>
##  1 success       FALSE            1      
##  2 success       TRUE            -1      
##  3 year          -Inf_1991        0.194  
##  4 year          2012_Inf        -0.132  
##  5 year          2004_2012       -0.101  
##  6 season        Spring          -0.0604 
##  7 season        Winter           0.0497 
##  8 died          TRUE             0.0415 
##  9 died          FALSE           -0.0415 
## 10 season        Autumn           0.0396 
## 11 year          1991_2004        0.0338 
## 12 season        -OTHER           0.0233 
## 13 expedition_id -OTHER          -0.0178 
## 14 expedition_id EVER88101        0.0178 
## 15 member_id     -OTHER           0.00460
## 16 member_id     ACHN15301-01    -0.00460

# Step 3: Plot
member_correlation %>%
  correlationfunnel::plot_correlation_funnel()

Apply 5: CEO Departures

Shea Clark

2025-03-08

Import Data

Explore Data