goal: to predict how long a book will last on the NY Times Bestsellers list

Click here for the data

Import data

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
Data summary
Name nyt_titles
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂
my_data <- nyt_titles %>%
    select(-first_week, -debut_rank) %>%
    na.omit() %>%
    mutate(total_weeks = log(total_weeks))

skimr::skim(my_data)
Data summary
Name my_data
Number of rows 7427
Number of columns 6
_______________________
Column type frequency:
character 2
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 55 0 7168 0
author 0 1 4 73 0 2205 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3716.06 2145.23 0 1858.50 3717.00 5573.5 7430.00 ▇▇▇▇▇
year 0 1 1989.63 26.21 1931 1968.00 2000.00 2011.0 2020.00 ▂▂▂▃▇
total_weeks 0 1 1.48 1.11 0 0.69 1.39 2.3 5.18 ▇▇▆▂▁
best_rank 0 1 6.92 4.57 1 3.00 6.00 10.5 17.00 ▇▅▃▃▂

Explore data

data_binarized <- my_data %>%
    select(-title, -id, -author) %>%
    binarize()

data_binarized %>% glimpse
## Rows: 7,427
## Columns: 12
## $ `year__-Inf_1968`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5                               <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf                             <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
mydata_correlate <- data_binarized %>%
    correlate(total_weeks__2.30258509299405_Inf)
mydata_correlate %>%
    plot_correlation_funnel()