Goal: predict the total number of weeks a title spends on the New York Times bestseller list. The data come from the 2022-05-10 TidyTuesday release (URL in the import step below).

Import Data

# Packages used throughout this analysis
library(tidyverse)         # dplyr, tidyr, ggplot2, forcats, readr
library(tidytext)          # unnest_tokens()
library(correlationfunnel) # binarize(), correlate(), plot_correlation_funnel()
library(tidymodels)        # rsample, recipes, workflows, tune
library(textrecipes)       # step_tokenize(), step_tokenfilter(), step_tfidf()

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
## ── Data Summary ────────────────────────
##                            Values    
## Name                       nyt_titles
## Number of rows             7431      
## Number of columns          8         
## _______________________             
## Column type frequency:              
##   character                2        
##   Date                     1        
##   numeric                  5        
## ________________________            
## Group variables            None     
## 
## ── Variable type: character ───────────────────────────────────────────────────
##   skim_variable n_missing complete_rate   min   max empty n_unique whitespace
## 1 title                 0             1     1    74     0     7172          0
## 2 author                4             1     4    73     0     2205          0
## 
## ── Variable type: Date ────────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate min        max        median     n_unique
## 1 first_week            0             1 1931-10-12 2020-12-06 2000-06-25     3348
## 
## ── Variable type: numeric ─────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate    mean      sd   p0    p25  p50    p75 p100 hist 
## 1 id                    0             1 3715.00 2145.29    0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
## 2 year                  0             1 1989.61   26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
## 3 total_weeks           0             1    8.13   11.21    1    2.0    4   10.0  178 ▇▁▁▁▁
## 4 debut_rank            0             1    7.90    4.57    1    4.0    8   12.0   17 ▇▆▅▅▅
## 5 best_rank             0             1    6.91    4.57    1    3.0    6   10.0   17 ▇▅▃▃▂
data <- nyt_titles %>%
    # drop the four rows with a missing author
    na.omit() %>%
    # total_weeks is heavily right-skewed, so model it on the log scale
    mutate(total_weeks = log(total_weeks)) %>%
    # give co-authored titles one row per author
    separate_rows(author, sep = " and | with ")
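
A quick sanity check of the author split on two made-up rows (hypothetical examples, not taken from the data):

tibble(author = c("James Patterson and Maxine Paetro",
                  "Bill Clinton with James Patterson")) %>%
    separate_rows(author, sep = " and | with ")
# yields four rows: "James Patterson", "Maxine Paetro",
# "Bill Clinton", "James Patterson"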

Explore Data

Debut Rank

data %>% 
    ggplot(aes(total_weeks, as.factor(debut_rank))) +
    geom_boxplot()

Best Rank

data %>% 
    ggplot(aes(total_weeks, as.factor(best_rank))) +
    geom_boxplot()

data %>%
    ggplot(aes(year, total_weeks)) +
    geom_col()

data %>% 
    # split author names into individual word tokens
    unnest_tokens(output = word, input = author) %>%
    # mean (log) weeks per name token
    group_by(word) %>%
    summarise(total_weeks = mean(total_weeks),
              n           = n()) %>%
    ungroup() %>%
    # keep tokens appearing more than 20 times, show the top 20
    filter(n > 20) %>%
    slice_max(order_by = total_weeks, n = 20) %>%
    ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
    geom_point() +
    labs(y = "Author name (token)")
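
For comparison, the same ranking can be computed on whole author names rather than name tokens (a sketch using the same thresholds as above):

data %>%
    group_by(author) %>%
    summarise(total_weeks = mean(total_weeks),
              n           = n()) %>%
    ungroup() %>%
    filter(n > 20) %>%
    slice_max(order_by = total_weeks, n = 20) %>%
    ggplot(aes(total_weeks, fct_reorder(author, total_weeks))) +
    geom_point() +
    labs(y = "Author")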

EDA Shortcut

# drop first_week; we won't use the raw date as a predictor
data <- data %>% select(-first_week)
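
The correlationfunnel package gives a quick read on which features move with the outcome: binarize() one-hot encodes the character columns (lumping rare levels into "-OTHER") and cuts the numeric columns into quantile-based dummy bins, and correlate() then ranks every resulting bin's correlation against a chosen target bin.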

data_binarized_tbl <- data %>% 
    select(-id, -title) %>%
    binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,835
## Columns: 19
## $ author__Danielle_Steel                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ author__James_Patterson                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `year__-Inf_1971`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1971_2001                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2001_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4`                            <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10                                 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10_Inf                               <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
data_corr_tbl <- data_binarized_tbl %>%
    # target: the top total_weeks bin (log weeks > 2.30, i.e. roughly 10+ weeks)
    correlate(total_weeks__2.30258509299405_Inf)
data_corr_tbl
## # A tibble: 19 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 2.30258509299405_Inf                  1       
##  2 total_weeks -Inf_0.693147180559945               -0.389   
##  3 best_rank   -Inf_3                                0.335   
##  4 total_weeks 1.38629436111989_2.30258509299405    -0.320   
##  5 best_rank   10_Inf                               -0.306   
##  6 total_weeks 0.693147180559945_1.38629436111989   -0.256   
##  7 year        -Inf_1971                             0.245   
##  8 year        2011_Inf                             -0.241   
##  9 year        2001_2011                            -0.219   
## 10 year        1971_2001                             0.208   
## 11 best_rank   6_10                                 -0.128   
## 12 best_rank   3_6                                   0.0752  
## 13 author      James_Patterson                      -0.0417  
## 14 debut_rank  4_8                                  -0.0321  
## 15 author      -OTHER                                0.0240  
## 16 debut_rank  8_12                                  0.0180  
## 17 debut_rank  -Inf_4                                0.0152  
## 18 author      Danielle_Steel                        0.00567 
## 19 debut_rank  12_Inf                               -0.000531
data_corr_tbl %>% 
    plot_correlation_funnel()

Split Data

# Uncomment to downsample for a quick test run:
# data <- sample_n(data, 100)

set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test  <- testing(data_split)

set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5288/588]> Fold01
##  2 <split [5288/588]> Fold02
##  3 <split [5288/588]> Fold03
##  4 <split [5288/588]> Fold04
##  5 <split [5288/588]> Fold05
##  6 <split [5288/588]> Fold06
##  7 <split [5289/587]> Fold07
##  8 <split [5289/587]> Fold08
##  9 <split [5289/587]> Fold09
## 10 <split [5289/587]> Fold10
Tune Model

library(usemodels)
## Warning: package 'usemodels' was built under R version 4.4.2
usemodels::use_xgboost(total_weeks ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = total_weeks ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(49873)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <- 
    recipe(formula = total_weeks ~ ., data = data_train) %>% 
    # keep id around for bookkeeping, but not as a predictor
    recipes::update_role(id, new_role = "id variable") %>%
    # lump infrequent titles/authors into an "other" level; with so many
    # unique titles this collapses almost everything, which is why the
    # tokenfilter warnings below find only a handful of tokens
    step_other(title, author) %>%
    step_tokenize(title) %>% 
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_dummy(author, one_hot = TRUE) %>%
    # reduce skew in the numeric predictors
    step_YeoJohnson(year, debut_rank)

xgboost_recipe %>% prep() %>% juice() %>% glimpse() 
## Warning: max_tokens was set to 100, but only 2 was available and selected.
## Rows: 5,876
## Columns: 9
## $ id                    <dbl> 7361, 7110, 7203, 1849, 1528, 7017, 3309, 1792, …
## $ year                  <dbl> 1960, 2005, 2013, 2004, 2006, 2017, 2011, 2018, …
## $ debut_rank            <dbl> 7.5904361, 4.6228006, 5.0384401, 0.8626939, 0.86…
## $ best_rank             <dbl> 15, 4, 6, 9, 16, 4, 1, 15, 1, 2, 14, 2, 1, 10, 1…
## $ total_weeks           <dbl> 0.0000000, 1.7917595, 0.6931472, 0.0000000, 0.00…
## $ tfidf_title_burn      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_other     <dbl> 0.6934877, 0.6934877, 0.6934877, 0.6934877, 0.69…
## $ author_Danielle.Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_other          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(344)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv,
            grid = 5)
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: max_tokens was set to 100, but only 2 was available and selected.
## Warning: package 'xgboost' was built under R version 4.4.2
## → A | warning: max_tokens was set to 100, but only 3 was available and selected.
## → B | warning: max_tokens was set to 100, but only 2 was available and selected.
## → C | warning: max_tokens was set to 100, but only 4 was available and selected.
## There were issues with some computations   A: x2   B: x7   C: x1

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
##    mtry trees min_n learn_rate .metric .estimator  mean     n std_err .config   
##   <int> <int> <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>     
## 1     3  1104    28    0.00484 rmse    standard   0.539    10 0.00990 Preproces…
## 2     1  1613    36    0.0290  rmse    standard   0.550    10 0.0102  Preproces…
## 3     6  1524    23    0.0836  rmse    standard   0.561    10 0.00908 Preproces…
## 4     5   768    12    0.112   rmse    standard   0.561    10 0.00929 Preproces…
## 5     6   162     7    0.00108 rmse    standard   1.26     10 0.00815 Preproces…
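
Beyond the top-five table, a quick way to see how RMSE moves across the sampled hyperparameter values (autoplot() has a method for tuning results):

autoplot(xgboost_tune, metric = "rmse")
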
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                        tune::select_best(xgboost_tune, metric = "rmse"))

data_fit <- tune::last_fit(xgboost_fw, data_split)
## → A | warning: max_tokens was set to 100, but only 2 was available and selected.
## There were issues with some computations   A: x1
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.546 Preprocessor1_Model1
## 2 rsq     standard       0.760 Preprocessor1_Model1
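
To see which predictors the final model leans on, variable importance can be pulled from the fitted engine (a minimal sketch, assuming the vip package is installed):

library(vip)

data_fit %>%
    extract_fit_engine() %>%   # the underlying xgb.Booster
    vip(num_features = 15)
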
tune::collect_predictions(data_fit) %>%
    ggplot(aes(total_weeks, .pred)) +
    geom_point(alpha = 0.3, color = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") +
    coord_fixed()
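
Since total_weeks was modeled on the log scale, both axes above are in log weeks; a minimal sketch to inspect predictions back on the original scale:

tune::collect_predictions(data_fit) %>%
    mutate(weeks_actual = exp(total_weeks),
           weeks_pred   = exp(.pred)) %>%
    select(weeks_actual, weeks_pred)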