Goal: Predict the number of days a contestant lasted on the show Alone before tapping out or winning (days_lasted). The data come from the TidyTuesday 2023-01-24 release and are read from the URL below.
# Load the raw survivalists table from the TidyTuesday 2023-01-24 release.
survivalists <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-01-24/survivalists.csv"
)
## Rows: 94 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, gender, city, state, country, reason_tapped_out, reason_cate...
## dbl (5): season, age, result, days_lasted, day_linked_up
## lgl (1): medically_evacuated
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column of the raw data (type, missing count, distribution).
skimr::skim(survivalists)
| Name | survivalists |
| Number of rows | 94 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| logical | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| name | 0 | 1.00 | 8 | 23 | 0 | 84 | 0 |
| gender | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| city | 0 | 1.00 | 3 | 15 | 0 | 77 | 0 |
| state | 1 | 0.99 | 4 | 16 | 0 | 40 | 0 |
| country | 0 | 1.00 | 6 | 19 | 0 | 4 | 0 |
| reason_tapped_out | 10 | 0.89 | 6 | 61 | 0 | 61 | 0 |
| reason_category | 10 | 0.89 | 16 | 17 | 0 | 3 | 0 |
| team | 80 | 0.15 | 16 | 23 | 0 | 7 | 0 |
| profession | 0 | 1.00 | 6 | 73 | 0 | 80 | 0 |
| url | 0 | 1.00 | 8 | 75 | 0 | 87 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| medically_evacuated | 0 | 1 | 0.27 | FAL: 69, TRU: 25 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| season | 0 | 1.00 | 4.96 | 2.55 | 1 | 3.00 | 5.0 | 7.00 | 9 | ▇▇▃▇▇ |
| age | 0 | 1.00 | 37.94 | 8.84 | 19 | 31.00 | 38.5 | 44.00 | 61 | ▂▆▇▅▁ |
| result | 0 | 1.00 | 5.28 | 2.83 | 1 | 3.00 | 5.0 | 7.75 | 10 | ▇▇▇▇▆ |
| days_lasted | 0 | 1.00 | 39.04 | 27.85 | 0 | 10.50 | 39.5 | 63.75 | 100 | ▇▅▆▆▁ |
| day_linked_up | 86 | 0.09 | 9.00 | 0.76 | 8 | 8.75 | 9.0 | 9.25 | 10 | ▃▁▇▁▃ |
# Treat missing values: remove four columns (day_linked_up and team are mostly
# missing according to the skim output), then drop every row that still
# contains an NA.
# NOTE(review): na.omit() also discards the 10 rows whose reason_tapped_out /
# reason_category are missing. After this step `result` starts at 2 and the
# maximum days_lasted falls from 100 to 89, so the winners appear to be
# excluded -- confirm this is intended given the stated goal.
data <- survivalists %>%
  select(-c(season, day_linked_up, team, url)) %>%
  na.omit()

# Re-check the cleaned table.
skimr::skim(data)
| Name | data |
| Number of rows | 83 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| logical | 1 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| name | 0 | 1 | 9 | 23 | 0 | 74 | 0 |
| gender | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| city | 0 | 1 | 3 | 15 | 0 | 69 | 0 |
| state | 0 | 1 | 4 | 16 | 0 | 39 | 0 |
| country | 0 | 1 | 6 | 14 | 0 | 3 | 0 |
| reason_tapped_out | 0 | 1 | 6 | 61 | 0 | 60 | 0 |
| reason_category | 0 | 1 | 16 | 17 | 0 | 3 | 0 |
| profession | 0 | 1 | 6 | 73 | 0 | 72 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| medically_evacuated | 0 | 1 | 0.3 | FAL: 58, TRU: 25 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 38.01 | 9.05 | 19 | 31 | 39 | 44.5 | 61 | ▃▆▇▅▂ |
| result | 0 | 1 | 5.77 | 2.56 | 2 | 4 | 6 | 8.0 | 10 | ▇▇▃▇▆ |
| days_lasted | 0 | 1 | 34.94 | 26.18 | 0 | 8 | 35 | 55.5 | 89 | ▇▅▅▃▃ |
Identify good predictors.
age
# Scatter plot of age (y) against days lasted (x).
ggplot(data, aes(x = days_lasted, y = age)) +
  geom_point()
result
# Box plot of days lasted (x) for each distinct value of `result` (y).
ggplot(data, aes(x = days_lasted, y = as.factor(result))) +
  geom_boxplot()
reason_tapped_out
# Average days lasted for the words used in the stated tap-out reason.
data %>%
  # Split reason_tapped_out into one row per word
  unnest_tokens(output = word, input = reason_tapped_out) %>%
  # Average days lasted and number of occurrences for each word
  group_by(word) %>%
  summarise(
    days_lasted = mean(days_lasted),
    n = n()
  ) %>%
  ungroup() %>%
  # Keep words that occur more than once and contain no digits
  filter(n > 1 & !str_detect(word, "\\d")) %>%
  # Keep the words with the highest average days lasted
  # NOTE(review): n = 2 leaves only the top two words -- confirm a larger
  # cut-off was not intended.
  slice_max(order_by = days_lasted, n = 2) %>%
  # Plot, with words ordered by their average
  ggplot(aes(x = days_lasted, y = fct_reorder(word, days_lasted))) +
  geom_point() +
  labs(y = "Reason Tapped Out")
EDA shortcut
# Step 1: Prepare data
# Drop reason_tapped_out, profession and name, then let binarize() turn the
# remaining columns into 0/1 indicator columns.
# NOTE(review): city and state are kept and account for most of the 130
# resulting columns (83 rows), many set for a single row only -- consider
# dropping or lumping them.
data_binarized_tbl <- data %>%
  select(-c(reason_tapped_out, profession, name)) %>%
  binarize()

# Inspect the binarized columns.
glimpse(data_binarized_tbl)
## Rows: 83
## Columns: 130
## $ `age__-Inf_31` <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ age__31_39 <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ age__39_44.5 <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
## $ age__44.5_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ gender__Female <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gender__Male <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ city__Aiken <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Albemarle <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ city__Anchorage <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Augusta <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Bellevue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Bellingham <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Boulder <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Caledon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Canal_Flats <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Cherryfield <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Coolidge <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Earlysville <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__East_Jordan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Echo_Bay <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Edna_Bay <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Espanola <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Exeter <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Fayetteville <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Flathead_Valley <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Fort_Collins <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Fox <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Fox_Lake <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Grass_Valley <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hattiesburg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Henry <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Homer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Indianapolis <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Inian_Islands <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Jackson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ city__Juneau <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Kentwood <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Langhorne <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Laramie <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Lewis <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Liberty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Lincoln <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Lopez_Island <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Lubbock <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mahanoy_City <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mantua <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Montreal <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Montville <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Monument <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mullingar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Oak_Ridge <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Otis <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Pagosa_Springs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Pittsburgh <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Plattsmouth <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Poolesville <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Port_McNeill <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Portland <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Quasqueton <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Raymond <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Redding <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Rush_City <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ city__Saint_John <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ city__Salt_Lake_City <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__San_Antonio <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sandpoint <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Santa_Pola <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ city__Sisters <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Skowhegan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sturgis <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Umatilla <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ city__Vancouver <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Wellsboro <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Windsor <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ city__Wrangell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Alaska <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Arizona <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Arkansas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__British_Columbia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__California <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Colorado <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__England <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Florida <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ state__Georgia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Idaho <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Illinois <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Indiana <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Iowa <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Kentucky <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Louisiana <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Maine <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Maryland <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Massachusetts <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Michigan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Minnesota <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ state__Mississippi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Montana <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Nebraska <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__New_Brunswick <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ state__North_Carolina <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ state__Ohio <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ state__Ontario <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ state__Oregon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Pennsylvania <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Quebec <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Saskatchewan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__South_Carolina <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Tennessee <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Texas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Utah <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Valencia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ state__Virginia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Washington <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Wyoming <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__Canada <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, …
## $ country__United_Kingdom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__United_States <dbl> 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, …
## $ `result__-Inf_4` <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ result__4_6 <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ result__6_8 <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ result__8_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ `days_lasted__-Inf_8` <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, …
## $ days_lasted__8_35 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ days_lasted__35_55.5 <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ days_lasted__55.5_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ medically_evacuated__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ medically_evacuated__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `reason_category__Family_/_personal` <dbl> 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ reason_category__Loss_of_inventory <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `reason_category__Medical_/_health` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, …
# Step 2: Correlate
# Correlate every indicator column with the chosen target column.
# NOTE(review): the target is a `result` bin, whereas the stated goal is to
# predict days_lasted -- confirm a days_lasted bin was not intended.
data_corr_tbl <- correlate(data_binarized_tbl, target = `result__-Inf_4`)

data_corr_tbl
## # A tibble: 130 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 result -Inf_4 1
## 2 days_lasted 55.5_Inf 0.716
## 3 days_lasted -Inf_8 -0.452
## 4 result 4_6 -0.424
## 5 result 6_8 -0.382
## 6 result 8_Inf -0.368
## 7 days_lasted 8_35 -0.322
## 8 city Poolesville 0.209
## 9 state Maryland 0.209
## 10 city Rush_City 0.209
## # ℹ 120 more rows
# Step 3: Plot
# Draw the correlation funnel from the correlation table built in Step 2.
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 108 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps