goal: to predict how long a book will last on the NY Times Bestsellers list

Import data

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')

## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(nyt_titles)

Data summary
Name	nyt_titles
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

my_data <- nyt_titles %>%
    select(-debut_rank, -first_week) %>%
    na.omit() %>%
    mutate(total_weeks = log(total_weeks))

skimr::skim(my_data)

Data summary
Name	my_data
Number of rows	7427
Number of columns	6
_______________________
Column type frequency:
character	2
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	55	0	7168	0
author	0	1	4	73	0	2205	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3716.06	2145.23	0	1858.50	3717.00	5573.5	7430.00	▇▇▇▇▇
year	1	1989.63	26.21	1931	1968.00	2000.00	2011.0	2020.00	▂▂▂▃▇
total_weeks	1	1.48	1.11	0	0.69	1.39	2.3	5.18	▇▇▆▂▁
best_rank	1	6.92	4.57	1	3.00	6.00	10.5	17.00	▇▅▃▃▂

Explore data

data_binarized <- my_data %>%
    select(-title, -id) %>%
               binarize()

data_binarized %>% glimpse

## Rows: 7,427
## Columns: 14
## $ author__Danielle_Steel                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `year__-Inf_1968`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5                               <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf                             <dbl> 0, 0, 0, 1, 1, 0, 0, 0…

mydata_correlate <- data_binarized %>%
    correlate(total_weeks__2.30258509299405_Inf)
mydata_correlate %>%
    plot_correlation_funnel()

Build Data

my_data <- sample_n(my_data, 100)

# split data into training and testing

set.seed(1122)
data_split <- rsample::initial_split(my_data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training data
set.seed(2211)
my_data_cv <- rsample::vfold_cv(data_train)
my_data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10

library(usemodels)
usemodels::use_xgboost(total_weeks ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = total_weeks ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(38194)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify recipe
xgboost_recipe <- 
  recipe(formula = total_weeks ~ ., data = data_train) %>%
    recipes::update_role(id, new_role = "id variable") %>%
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_other(author) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_log(year, best_rank)
    
    
xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 75
## Columns: 106
## $ id                       <dbl> 873, 6655, 3639, 794, 1942, 4940, 646, 64, 19…
## $ year                     <dbl> 7.604396, 7.605890, 7.569412, 7.605392, 7.598…
## $ best_rank                <dbl> 1.0986123, 1.9459101, 1.3862944, 1.0986123, 2…
## $ total_weeks              <dbl> 1.7917595, 0.6931472, 0.6931472, 1.3862944, 0…
## $ tfidf_title_2            <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_a            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_all          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_alternate    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_amberley     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_and          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_any          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_are          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_at           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_back         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_battlefield  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_being        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_big          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_blessing     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blood        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_blows        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_bone         <dbl> 0.000000, 0.000000, 0.000000, 2.165367, 0.000…
## $ tfidf_title_bridges      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bungalow     <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_burning      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_business     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_captain      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_celia        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_certain      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_chesapeake   <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_christmas    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_chronicle    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_clear        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cold         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_collector    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_come         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_commissioner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_could        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_counsel      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_country      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_crossed      <dbl> 0.000000, 0.000000, 0.000000, 2.165367, 0.000…
## $ tfidf_title_dark         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_day          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_dead         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_deception    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_deed         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_die          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_double       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_dreamfever   <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_earth        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_elizabeth    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_enemy        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_exit         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_fame         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_fifth        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_fire         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_first        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_for          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_from         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.082…
## $ tfidf_title_galileans    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_game         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_ghost        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_good         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_heartless    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_here         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_hiding       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_hope         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_husband      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_if           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_image        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_images       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_in           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_incredible   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_institute    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_is           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_jedi         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_jolly        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_journey      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_justice      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_kill         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tfidf_title_king's`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_kings        <dbl> 0.000000, 1.082683, 0.000000, 0.000000, 0.000…
## $ tfidf_title_last         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_let          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_light        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_lincoln      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_list         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_little       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_long         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_looking      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_looks        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_lost         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_love         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_mandingo     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_march        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_master       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_me           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_memory       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_mitford      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_moon         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_of           <dbl> 0.0000000, 0.5350165, 0.0000000, 0.0000000, 0…
## $ tfidf_title_the          <dbl> 0.0000000, 0.3017732, 0.0000000, 0.0000000, 0…
## $ tfidf_title_time         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.825…
## $ tfidf_title_to           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_title_way          <dbl> 0.0000000, 0.9126646, 0.0000000, 0.0000000, 0…
## $ author_Irving.Wallace    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_other             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …

# Specify model
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

# Combine recipe and model using workflow
xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

# tune hyperparameters
set.seed(34266)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = my_data_cv, 
            grid = 5)

## i Creating pre-processing data to finalize unknown parameter: mtry

## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
##                standard deviation, resulting in a divide by 0 error. `NA` will be returned.

## 
There were issues with some computations   A: x1

There were issues with some computations   A: x2

There were issues with some computations   A: x3

There were issues with some computations   A: x4

There were issues with some computations   A: x5

There were issues with some computations   A: x6

There were issues with some computations   A: x7

There were issues with some computations   A: x8

There were issues with some computations   A: x9

There were issues with some computations   A: x10

There were issues with some computations   A: x10

Apply2

Jordan Lanowy

2024-09-19

Import data

Explore data

Build Data