Goal: to predict total weeks on bestseller list Click here for the data

Import Data

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
Data summary
Name nyt_titles
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂
data <- nyt_titles %>%
    na.omit() %>%
    
    mutate(total_weeks = log(total_weeks)) %>%
    separate_rows(author, sep = "and | with")

Explore Data

Debut Rank

data %>% 
    ggplot(aes(total_weeks, as.factor(debut_rank))) +
    geom_boxplot()

Best Rank

data %>% 
    ggplot(aes(total_weeks, as.factor(best_rank))) +
    geom_boxplot()

data %>%
    ggplot(aes(year, total_weeks)) +
    geom_col()

data %>% 
    
    unnest_tokens(output = word, input = author) %>%
    
    group_by(word) %>%
    summarise(total_weeks = mean(total_weeks),
              n     = n ()) %>%
    ungroup () %>%
    
    filter(n > 20) %>%
    slice_max(order_by = total_weeks, n = 20) %>%
    
    ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
    geom_point() +
    
    labs(y = "Author's Name")

EDA Shortcut

cols.dont.want <- "first_week"

data <- data[, ! names(data) %in% cols.dont.want, drop = F]

data_binarized_tbl <- data %>% 
    select(-id, -title) %>%
    binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,835
## Columns: 19
## $ author__Danielle_Steel                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ author__James_Patterson                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `year__-Inf_1971`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1971_2001                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2001_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4`                            <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10                                 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10_Inf                               <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
data_corr_tbl <- data_binarized_tbl %>%
    correlate(total_weeks__2.30258509299405_Inf)
data_corr_tbl
## # A tibble: 19 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 2.30258509299405_Inf                  1       
##  2 total_weeks -Inf_0.693147180559945               -0.389   
##  3 best_rank   -Inf_3                                0.335   
##  4 total_weeks 1.38629436111989_2.30258509299405    -0.320   
##  5 best_rank   10_Inf                               -0.306   
##  6 total_weeks 0.693147180559945_1.38629436111989   -0.256   
##  7 year        -Inf_1971                             0.245   
##  8 year        2011_Inf                             -0.241   
##  9 year        2001_2011                            -0.219   
## 10 year        1971_2001                             0.208   
## 11 best_rank   6_10                                 -0.128   
## 12 best_rank   3_6                                   0.0752  
## 13 author      James_Patterson                      -0.0417  
## 14 debut_rank  4_8                                  -0.0321  
## 15 author      -OTHER                                0.0240  
## 16 debut_rank  8_12                                  0.0180  
## 17 debut_rank  -Inf_4                                0.0152  
## 18 author      Danielle_Steel                        0.00567 
## 19 debut_rank  12_Inf                               -0.000531
data_corr_tbl %>% 
    plot_correlation_funnel()

Split Data

data <- sample_n(data, 100)

set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test  <- testing(data_split)

set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.4.2
usemodels::use_xgboost(total_weeks ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = total_weeks ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(6804)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <- 
    recipe(formula = total_weeks ~ ., data = data_train) %>% 
    recipes::update_role(id, new_role = "id variable") %>%
    step_other(title, author) %>%
    step_tokenize(title) %>% 
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_dummy(author, one_hot = TRUE) %>%
    step_YeoJohnson(year, debut_rank)

xgboost_recipe %>% prep() %>% juice() %>% glimpse() 
## Warning: max_tokens was set to 100, but only 3 was available and selected.
## Rows: 75
## Columns: 10
## $ id                     <dbl> 6538, 5706, 34, 4068, 3180, 4775, 2005, 6826, 4…
## $ year                   <dbl> 1957, 2019, 2002, 2007, 1951, 1994, 1970, 2018,…
## $ debut_rank             <dbl> 4.8586631, 4.0652209, 4.8586631, 2.3721381, 0.8…
## $ best_rank              <dbl> 9, 13, 1, 1, 16, 6, 4, 1, 8, 6, 1, 2, 2, 12, 13…
## $ total_weeks            <dbl> 2.1972246, 0.0000000, 2.0794415, 1.7917595, 1.0…
## $ tfidf_title_2nd        <dbl> 0.000000, 0.000000, 2.165367, 0.000000, 0.00000…
## $ tfidf_title_chance     <dbl> 0.000000, 0.000000, 2.165367, 0.000000, 0.00000…
## $ tfidf_title_other      <dbl> 0.6998812, 0.6998812, 0.0000000, 0.6998812, 0.6…
## $ author_Janet.Evanovich <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ author_other           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(344)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv,
            grid = 5)
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: max_tokens was set to 100, but only 3 was available and selected.
## Warning: package 'xgboost' was built under R version 4.4.2
## → A | warning: max_tokens was set to 100, but only 3 was available and selected.
## There were issues with some computations   A: x1                                                 → B | warning: A correlation computation is required, but `estimate` is constant and has 0
##                standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations   A: x1There were issues with some computations   A: x1   B: x1There were issues with some computations   A: x2   B: x1There were issues with some computations   A: x2   B: x2There were issues with some computations   A: x3   B: x3There were issues with some computations   A: x4   B: x3There were issues with some computations   A: x4   B: x4There were issues with some computations   A: x5   B: x4There were issues with some computations   A: x5   B: x5There were issues with some computations   A: x6   B: x5There were issues with some computations   A: x6   B: x6There were issues with some computations   A: x7   B: x6There were issues with some computations   A: x7   B: x7There were issues with some computations   A: x8   B: x7There were issues with some computations   A: x8   B: x8There were issues with some computations   A: x9   B: x9There were issues with some computations   A: x10   B: x10There were issues with some computations   A: x10   B: x10