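Goal: Predict the number of days each contestant lasted on the survival show Alone before tapping out or winning (days_lasted). The data is the TidyTuesday 2023-01-24 survivalists dataset, read from the URL shown in the import step below.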

Import Data

# Read the survivalists table (TidyTuesday 2023-01-24) directly from GitHub.
survivalists <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-01-24/survivalists.csv"
)
## Rows: 94 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, gender, city, state, country, reason_tapped_out, reason_cate...
## dbl  (5): season, age, result, days_lasted, day_linked_up
## lgl  (1): medically_evacuated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-column overview of the raw data (types, missing counts, summary stats),
# used below to decide which columns to drop.
skimr::skim(survivalists)
Data summary
Name survivalists
Number of rows 94
Number of columns 16
_______________________
Column type frequency:
character 10
logical 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1.00 8 23 0 84 0
gender 0 1.00 4 6 0 2 0
city 0 1.00 3 15 0 77 0
state 1 0.99 4 16 0 40 0
country 0 1.00 6 19 0 4 0
reason_tapped_out 10 0.89 6 61 0 61 0
reason_category 10 0.89 16 17 0 3 0
team 80 0.15 16 23 0 7 0
profession 0 1.00 6 73 0 80 0
url 0 1.00 8 75 0 87 0

Variable type: logical

skim_variable n_missing complete_rate mean count
medically_evacuated 0 1 0.27 FAL: 69, TRU: 25

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
season 0 1.00 4.96 2.55 1 3.00 5.0 7.00 9 ▇▇▃▇▇
age 0 1.00 37.94 8.84 19 31.00 38.5 44.00 61 ▂▆▇▅▁
result 0 1.00 5.28 2.83 1 3.00 5.0 7.75 10 ▇▇▇▇▆
days_lasted 0 1.00 39.04 27.85 0 10.50 39.5 63.75 100 ▇▅▆▆▁
day_linked_up 86 0.09 9.00 0.76 8 8.75 9.0 9.25 10 ▃▁▇▁▃
# Drop season and url together with the two sparsely populated team columns
# (team and day_linked_up are missing for most rows), then keep only the rows
# that have no missing value in any remaining column.
# NOTE(review): reason_tapped_out / reason_category are missing for 10 rows,
# and after na.omit() the minimum of `result` moves from 1 to 2 and the maximum
# of days_lasted from 100 to 89 -- this looks like every winner being removed.
# Confirm that is intended, given the goal covers "tapping out or winning".
data <- na.omit(
    select(survivalists, -c(season, day_linked_up, team, url))
)

skimr::skim(data)
Data summary
Name data
Number of rows 83
Number of columns 12
_______________________
Column type frequency:
character 8
logical 1
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
name 0 1 9 23 0 74 0
gender 0 1 4 6 0 2 0
city 0 1 3 15 0 69 0
state 0 1 4 16 0 39 0
country 0 1 6 14 0 3 0
reason_tapped_out 0 1 6 61 0 60 0
reason_category 0 1 16 17 0 3 0
profession 0 1 6 73 0 72 0

Variable type: logical

skim_variable n_missing complete_rate mean count
medically_evacuated 0 1 0.3 FAL: 58, TRU: 25

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 38.01 9.05 19 31 39 44.5 61 ▃▆▇▅▂
result 0 1 5.77 2.56 2 4 6 8.0 10 ▇▇▃▇▆
days_lasted 0 1 34.94 26.18 0 8 35 55.5 89 ▇▅▅▃▃

Explore Data

Identify good predictors.

age

# Scatter plot: days lasted (x) against contestant age (y).
ggplot(data, aes(x = days_lasted, y = age)) +
    geom_point()

result

# Box plots of days lasted, one box per distinct value of `result`
# (result is converted to a factor so each value gets its own row).
ggplot(data, aes(x = days_lasted, y = as.factor(result))) +
    geom_boxplot()

reason_tapped_out

data %>%

    # Split reason_tapped_out into one row per word
    unnest_tokens(word, reason_tapped_out) %>%

    # Average days lasted and number of occurrences for each word
    group_by(word) %>%
    summarise(
        days_lasted = mean(days_lasted),
        n           = n(),
        .groups     = "drop"
    ) %>%

    # Keep words that occur more than once and contain no digit, then the
    # two words with the highest average days lasted
    filter(n > 1 & !str_detect(word, "\\d")) %>%
    slice_max(days_lasted, n = 2) %>%

    # Dot plot with the words ordered by their average days lasted
    ggplot(aes(x = days_lasted, y = fct_reorder(word, days_lasted))) +
    geom_point() +
    labs(y = "Reason Tapped Out")

EDA shortcut

# Step 1: Prepare data
# Remove the free-text, near-unique columns (reason_tapped_out, profession,
# name) and binarize the rest: as the glimpse output shows, numeric columns
# are cut into bins and every categorical level becomes a 0/1 column.
data_binarized_tbl <- binarize(
    select(data, -c(reason_tapped_out, profession, name))
)

glimpse(data_binarized_tbl)
## Rows: 83
## Columns: 130
## $ `age__-Inf_31`                       <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, …
## $ age__31_39                           <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ age__39_44.5                         <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
## $ age__44.5_Inf                        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ gender__Female                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ gender__Male                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ city__Aiken                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Albemarle                      <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ city__Anchorage                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Augusta                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Bellevue                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Bellingham                     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Boulder                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Caledon                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Canal_Flats                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Cherryfield                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Coolidge                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Earlysville                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__East_Jordan                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Echo_Bay                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Edna_Bay                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Espanola                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Exeter                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Fayetteville                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Flathead_Valley                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Fort_Collins                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Fox                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Fox_Lake                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Grass_Valley                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Hattiesburg                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Henry                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Homer                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Indianapolis                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Inian_Islands                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Jackson                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ city__Juneau                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Kentwood                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Langhorne                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Laramie                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Lewis                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Liberty                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Lincoln                        <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Lopez_Island                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Lubbock                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mahanoy_City                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mantua                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Montreal                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Montville                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Monument                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Mullingar                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Oak_Ridge                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Otis                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Pagosa_Springs                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Pittsburgh                     <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Plattsmouth                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Poolesville                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Port_McNeill                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Portland                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Quasqueton                     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Raymond                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Redding                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Rush_City                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ city__Saint_John                     <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ city__Salt_Lake_City                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__San_Antonio                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sandpoint                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Santa_Pola                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ city__Sisters                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Skowhegan                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Sturgis                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Umatilla                       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ city__Vancouver                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Wellsboro                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ city__Windsor                        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ city__Wrangell                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Alaska                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Arizona                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Arkansas                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__British_Columbia              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__California                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Colorado                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__England                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Florida                       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ state__Georgia                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Idaho                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Illinois                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Indiana                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Iowa                          <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Kentucky                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Louisiana                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Maine                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Maryland                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Massachusetts                 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Michigan                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Minnesota                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ state__Mississippi                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Montana                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Nebraska                      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__New_Brunswick                 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ state__North_Carolina                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ state__Ohio                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ state__Ontario                       <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ state__Oregon                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Pennsylvania                  <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Quebec                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Saskatchewan                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__South_Carolina                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Tennessee                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Texas                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Utah                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Valencia                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ state__Virginia                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Washington                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ state__Wyoming                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__Canada                      <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, …
## $ country__United_Kingdom              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ country__United_States               <dbl> 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, …
## $ `result__-Inf_4`                     <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ result__4_6                          <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ result__6_8                          <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ result__8_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ `days_lasted__-Inf_8`                <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, …
## $ days_lasted__8_35                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ days_lasted__35_55.5                 <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ days_lasted__55.5_Inf                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ medically_evacuated__0               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ medically_evacuated__1               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `reason_category__Family_/_personal` <dbl> 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, …
## $ reason_category__Loss_of_inventory   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `reason_category__Medical_/_health`  <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, …
# Step 2: Correlate
# Correlation of every binarized column with the `result__-Inf_4` column.
# NOTE(review): the stated goal is to predict days_lasted, yet the target used
# here is a `result` bin -- confirm whether a days_lasted bin was intended.
data_corr_tbl <- correlate(data_binarized_tbl, target = `result__-Inf_4`)

data_corr_tbl
## # A tibble: 130 × 3
##    feature     bin         correlation
##    <fct>       <chr>             <dbl>
##  1 result      -Inf_4            1    
##  2 days_lasted 55.5_Inf          0.716
##  3 days_lasted -Inf_8           -0.452
##  4 result      4_6              -0.424
##  5 result      6_8              -0.382
##  6 result      8_Inf            -0.368
##  7 days_lasted 8_35             -0.322
##  8 city        Poolesville       0.209
##  9 state       Maryland          0.209
## 10 city        Rush_City         0.209
## # ℹ 120 more rows
# Step 3: Plot
# Draw the correlation funnel for the correlations computed in step 2.
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 108 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps