To be honest I was very confused this time and think I nned some more time to understand this. I will talk to you next week

Import data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(correlationfunnel)

## ══ correlationfunnel Tip #2 ════════════════════════════════════════════════════
## Clean your NA's prior to using `binarize()`.
## Missing values and cleaning data are critical to getting great correlations. :)

library(tidymodels) #for building models

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10

## Warning: package 'modeldata' was built under R version 4.3.3

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

library(textrecipes) # For processing string variable
library(tidytext)

members <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv')

## Rows: 76519 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): expedition_id, member_id, peak_id, peak_name, season, sex, citizen...
## dbl  (5): year, age, highpoint_metres, death_height_metres, injury_height_me...
## lgl  (6): hired, success, solo, oxygen_used, died, injured
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

departures <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv')

## Rows: 9423 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): coname, exec_fullname, interim_coceo, still_there, notes, sources...
## dbl  (10): dismissal_dataset_id, gvkey, fyear, co_per_rol, departure_code, c...
## dttm  (1): leftofc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean data

skimr::skim(members)

Data summary
Name	members
Number of rows	76519
Number of columns	21
_______________________
Column type frequency:
character	10
logical	6
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
expedition_id	0	1.00	9	9	10350
member_id	0	1.00	12	12	76518
peak_id	0	1.00	4	4	391
peak_name	15	1.00	4	25	390
season	0	1.00	6	7	5
sex	2	1.00	1	1	2
citizenship	10	1.00	2	23	212
expedition_role	21	1.00	4	25	524
death_cause	75413	0.01	3	27	12
injury_type	74807	0.02	3	27	11

Variable type: logical

skim_variable	complete_rate	mean	count
hired	1	0.21	FAL: 60788, TRU: 15731
success	1	0.38	FAL: 47320, TRU: 29199
solo	1	0.00	FAL: 76398, TRU: 121
oxygen_used	1	0.24	FAL: 58286, TRU: 18233
died	1	0.01	FAL: 75413, TRU: 1106
injured	1	0.02	FAL: 74806, TRU: 1713

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2000.36	14.78	1905	1991	2004	2012	2019	▁▁▁▃▇
age	3497	0.95	37.33	10.40	7	29	36	44	85	▁▇▅▁▁
highpoint_metres	21833	0.71	7470.68	1040.06	3800	6700	7400	8400	8850	▁▁▆▃▇
death_height_metres	75451	0.01	6592.85	1308.19	400	5800	6600	7550	8830	▁▁▂▇▆
injury_height_metres	75510	0.01	7049.91	1214.24	400	6200	7100	8000	8880	▁▁▂▇▇

skimr::skim(departures)

Data summary
Name	departures
Number of rows	9423
Number of columns	19
_______________________
Column type frequency:
character	8
numeric	10
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
coname	0	1.00	2	30	3860
exec_fullname	0	1.00	5	790	8701
interim_coceo	9105	0.03	6	7	6
still_there	7311	0.22	3	10	77
notes	1644	0.83	5	3117	7755
sources	1475	0.84	18	1843	7915
eight_ks	4499	0.52	69	3884	4914
_merge	0	1.00	11	11	1

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
dismissal_dataset_id	0	1.00	5684.10	25005.46	1	2305.5	4593	6812.5	559044	▇▁▁▁▁
gvkey	0	1.00	40132.48	53921.34	1004	7337.0	14385	60900.5	328795	▇▁▁▁▁
fyear	0	1.00	2007.74	8.19	1987	2000.0	2008	2016.0	2020	▁▆▅▅▇
co_per_rol	0	1.00	25580.22	18202.38	-1	8555.5	22980	39275.5	64602	▇▆▅▃▃
departure_code	1667	0.82	5.20	1.53	1	5.0	5	7.0	9	▁▃▇▅▁
ceo_dismissal	1813	0.81	0.20	0.40	0	0.0	0	0.0	1	▇▁▁▁▂
tenure_no_ceodb	0	1.00	1.03	0.17	0	1.0	1	1.0	3	▁▇▁▁▁
max_tenure_ceodb	0	1.00	1.05	0.24	1	1.0	1	1.0	4	▇▁▁▁▁
fyear_gone	1802	0.81	2006.64	13.63	1980	2000.0	2007	2013.0	2997	▇▁▁▁▁
cik	245	0.97	741469.17	486551.43	1750	106413.0	857323	1050375.8	1808065	▆▁▇▂▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
leftofc	1802	0.81	1981-01-01	2998-04-27	2006-12-31	3627

factors_vec <- departures %>% select(gvkey, fyear, departure_code, ceo_dismissal, fyear_gone) %>% names()

depatures_clean2 <- departures %>%
    # Address factors imported as numeric
    mutate(across(all_of(factors_vec), as.factor)) %>%
    # Drop zero-variance variables
    select(-c(`_merge`, max_tenure_ceodb, leftofc, interim_coceo, notes, sources, eight_ks))

Explore data

depatures_clean2 %>% count(ceo_dismissal)

## # A tibble: 3 × 2
##   ceo_dismissal     n
##   <fct>         <int>
## 1 0              6121
## 2 1              1489
## 3 <NA>           1813

depatures_clean2 %>%
    ggplot(aes(ceo_dismissal)) +
    geom_bar()

Correlation plot

# Step 1: binarze 
data_binarized <- depatures_clean2 %>%
      select(-c(still_there, ceo_dismissal, fyear_gone, departure_code, cik)) %>%
    binarize()

data_binarized %>% glimpse()

## Rows: 9,423
## Columns: 45
## $ `dismissal_dataset_id__-Inf_2305.5` <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dismissal_dataset_id__2305.5_4593   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dismissal_dataset_id__4593_6812.5   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dismissal_dataset_id__6812.5_Inf    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ coname__SEARS_HOLDINGS_CORP         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `coname__-OTHER`                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ gvkey__6307                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `gvkey__-OTHER`                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ fyear__1993                         <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0…
## $ fyear__1994                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__1995                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__1996                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__1997                         <dbl> 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
## $ fyear__1998                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__1999                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2000                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ fyear__2001                         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2002                         <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2003                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2004                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2005                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2006                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2007                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ fyear__2008                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2009                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2010                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2011                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2012                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2013                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2014                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2015                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2016                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2017                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2018                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ fyear__2019                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `fyear__-OTHER`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `co_per_rol__-Inf_8555.5`           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ co_per_rol__8555.5_22980            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ co_per_rol__22980_39275.5           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ co_per_rol__39275.5_Inf             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ exec_fullname__Amin_J._Khoury       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `exec_fullname__-OTHER`             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ tenure_no_ceodb__1                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ tenure_no_ceodb__2                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tenure_no_ceodb__-OTHER`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

# Step 2: correlation
data_correlation <- data_binarized %>%
    correlate(dismissal_dataset_id__6812.5_Inf)

data_correlation

## # A tibble: 45 × 3
##    feature              bin           correlation
##    <fct>                <chr>               <dbl>
##  1 dismissal_dataset_id 6812.5_Inf         1     
##  2 dismissal_dataset_id -Inf_2305.5       -0.333 
##  3 dismissal_dataset_id 2305.5_4593       -0.333 
##  4 dismissal_dataset_id 4593_6812.5       -0.333 
##  5 co_per_rol           -Inf_8555.5       -0.330 
##  6 co_per_rol           39275.5_Inf        0.251 
##  7 co_per_rol           22980_39275.5      0.124 
##  8 fyear                2017               0.101 
##  9 fyear                1996              -0.0941
## 10 fyear                1994              -0.0933
## # ℹ 35 more rows

# Step 3: plot
data_correlation %>% 
    correlationfunnel::plot_correlation_funnel()

## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Second data set

members1 <- members %>%
    # Treat missing values
    select(-expedition_id, -member_id)



factors_vec1 <- members1 %>% select(year, age, highpoint_metres, death_height_metres, injury_height_metres) %>% names()

members1_clean <- members1 %>%
    # Address factors imported as numeric
    mutate(across(all_of(factors_vec1), as.factor))
    # Drop zero-variance variables

members %>% count(died)

## # A tibble: 2 × 2
##   died      n
##   <lgl> <int>
## 1 FALSE 75413
## 2 TRUE   1106

members %>%
    ggplot(aes(died)) +
    geom_bar()

Died vs. age

members %>%
    ggplot(aes(died, age)) +
    geom_boxplot()

## Warning: Removed 3497 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Correlation plot

# Step 1: binarze 
data_binarized1 <- members1_clean %>%
    select(-c(injury_height_metres, death_height_metres, death_cause, injury_type, highpoint_metres, age, expedition_role, peak_name, citizenship, sex)) %>%
    binarize()

data_binarized1 %>% glimpse()

## Rows: 76,519
## Columns: 71
## $ peak_id__AMAD     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ peak_id__ANN1     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__ANN4     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__BARU     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__CHOY     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__DHA1     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__EVER     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__HIML     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__KANG     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__LHOT     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__MAKA     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__MANA     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ peak_id__PUMO     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `peak_id__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1980        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1981        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1982        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1983        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1984        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1985        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1986        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1987        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1988        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1989        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1990        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1991        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1992        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1993        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1994        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1995        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1996        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1997        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1998        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1999        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2000        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2001        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2002        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2003        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2004        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2005        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2006        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2007        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2008        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2009        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2010        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2011        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2012        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2013        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2014        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2015        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2016        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2017        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2018        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__2019        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `year__-OTHER`    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ season__Autumn    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season__Spring    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ season__Winter    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `season__-OTHER`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hired__0          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ hired__1          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ success__0        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0…
## $ success__1        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1…
## $ solo__0           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ `solo__-OTHER`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ oxygen_used__0    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ oxygen_used__1    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ died__0           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ died__1           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ injured__0        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ injured__1        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

# Step 2: correlation
data_correlation2 <- data_binarized1 %>%
    correlate(died__1)

## Warning: correlate(): [Data Imbalance Detected] Consider sampling to balance the classes more than 5%
##   Column with imbalance: died__1

data_correlation2

## # A tibble: 71 × 3
##    feature bin    correlation
##    <fct>   <chr>        <dbl>
##  1 died    0          -1     
##  2 died    1           1     
##  3 year    -OTHER      0.0587
##  4 success 0           0.0415
##  5 success 1          -0.0415
##  6 peak_id ANN1        0.0359
##  7 peak_id AMAD       -0.0313
##  8 peak_id DHA1        0.0288
##  9 peak_id CHOY       -0.0261
## 10 peak_id KANG        0.0254
## # ℹ 61 more rows

# Step 3: plot
data_correlation2 %>% 
    correlationfunnel::plot_correlation_funnel()

## Warning: ggrepel: 55 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Apply5

Tindra Bergstrand

2024-10-10

To be honest I was very confused this time and think I nned some more time to understand this. I will talk to you next week

Import data

Clean data

Explore data

Second data set