Goal: To predict the total weeks on Bestsellers list (total_weeks). Click here for the data.

Import Data

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')

## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(nyt_titles)

Data summary
Name	nyt_titles
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

Clean Data

data <- nyt_titles %>%
  
  # Treat Missing Values
    na.omit() %>%
  
  # Log Transform Variables with Pos-skewed Distribution
  mutate(total_weeks = log(total_weeks))
  

skimr::skim(data)

Data summary
Name	data
Number of rows	7427
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	55	0	7168	0
author	0	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-07-02	3346

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3716.06	2145.23	0	1858.50	3717.00	5573.5	7430.00	▇▇▇▇▇
year	1	1989.63	26.21	1931	1968.00	2000.00	2011.0	2020.00	▂▂▂▃▇
total_weeks	1	1.48	1.11	0	0.69	1.39	2.3	5.18	▇▇▆▂▁
debut_rank	1	7.90	4.57	1	4.00	8.00	12.0	17.00	▇▆▅▅▅
best_rank	1	6.92	4.57	1	3.00	6.00	10.5	17.00	▇▅▃▃▂

Explore Data

Identify good predictors.

Year

data %>%
  ggplot(aes(total_weeks, year)) +
  scale_y_log10() +
  geom_point()

First Week on Bestseller List

data %>%
  
   # Extract date features from first_week
    mutate(year = lubridate::year(first_week),
           month = lubridate::month(first_week, label = TRUE),
           weekday = lubridate::wday(first_week, label = TRUE)) %>%
    select(-first_week) %>%
  
  
  ggplot(aes(total_weeks, month)) +
    geom_point()

data %>%
  
   # Extract date features from first_week
    mutate(year = lubridate::year(first_week),
           month = lubridate::month(first_week, label = TRUE),
           weekday = lubridate::wday(first_week, label = TRUE)) %>%
    #select(-first_week) %>%
  
  
  ggplot(aes(weekday, total_weeks)) +
    geom_point()

Debut Rank

data %>%
  ggplot(aes(total_weeks, debut_rank)) +
  scale_y_log10()+
  geom_point()

data %>%
  ggplot(aes(total_weeks, as.factor(debut_rank))) +
  geom_boxplot()

data %>%
  ggplot(aes(debut_rank, as.factor(total_weeks))) +
  geom_point()

Best Rank

data %>%
  ggplot(aes(total_weeks, best_rank)) +
  scale_y_log10()+
  geom_point()

Author

data %>%
  
  # Tokenize Author
  unnest_tokens(output = Author, input = author) %>%
  
  # Calculate avg number of weeks by author
  group_by(Author) %>%
  summarise(total_weeks = mean(total_weeks),
                  n     = n()) %>%
  ungroup() %>%
  
  filter(n > 10) %>%
  slice_max(order_by = Author, n = 20) %>%
  
  # Plot
  ggplot(aes(total_weeks, fct_reorder(Author, total_weeks))) +
  geom_point() +
  
  labs(y = "")

# Step 1: Prepare data
data_binarized <- data %>%
  select(-title, -year, -id) %>%
  
   # Extract date features from first_week
    mutate(year = lubridate::year(first_week),
           month = lubridate::month(first_week, label = TRUE),
           weekday = lubridate::wday(first_week, label = TRUE)) %>%
    select(-first_week) %>%
    binarize()

data_binarized %>% glimpse()

## Rows: 7,427
## Columns: 32
## $ author__Danielle_Steel                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4`                            <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5                               <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf                             <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `year__-Inf_1968`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ month__01                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__02                                       <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ month__03                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__04                                       <dbl> 0, 1, 0, 0, 0, 0, 1, 0…
## $ month__05                                       <dbl> 1, 0, 1, 1, 0, 0, 0, 1…
## $ month__06                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__07                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__08                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__09                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__10                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__11                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__12                                       <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ weekday__Sun                                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ weekday__Mon                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0…

# Step 2: Correlate
data_corr <- data_binarized %>%
  correlate(total_weeks__2.30258509299405_Inf)

data_corr

## # A tibble: 32 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 2.30258509299405_Inf                     1    
##  2 total_weeks -Inf_0.693147180559945                  -0.397
##  3 best_rank   -Inf_3                                   0.344
##  4 total_weeks 1.38629436111989_2.30258509299405       -0.323
##  5 best_rank   10.5_Inf                                -0.314
##  6 total_weeks 0.693147180559945_1.38629436111989      -0.256
##  7 year        1968_2000                                0.242
##  8 year        2011_Inf                                -0.235
##  9 year        2000_2011                               -0.230
## 10 year        -Inf_1968                                0.217
## # ℹ 22 more rows

# Step 3: Plot
data_corr %>%
  plot_correlation_funnel()

## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

data_corr %>% glimpse

## Rows: 32
## Columns: 3
## $ feature     <fct> total_weeks, total_weeks, best_rank, total_weeks, best_ran…
## $ bin         <chr> "2.30258509299405_Inf", "-Inf_0.693147180559945", "-Inf_3"…
## $ correlation <dbl> 1.00000000, -0.39682290, 0.34356819, -0.32304526, -0.31361…

Build Models

Split data

data <- sample_n(data, 200)

# Split into training and testing data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training data set for cross-validation 
# Use rsample::bootstraps(data_train) instead of vfold_cv(data_train) for data sets under 1000
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [135/15]> Fold01
##  2 <split [135/15]> Fold02
##  3 <split [135/15]> Fold03
##  4 <split [135/15]> Fold04
##  5 <split [135/15]> Fold05
##  6 <split [135/15]> Fold06
##  7 <split [135/15]> Fold07
##  8 <split [135/15]> Fold08
##  9 <split [135/15]> Fold09
## 10 <split [135/15]> Fold10

usemodels::use_xgboost(total_weeks ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = total_weeks ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(57769)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify recipe
xgboost_recipe <- 
  recipe(formula = total_weeks ~ ., data = data_train) %>%
    recipes::update_role(id, new_role = "id") %>%
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_date(first_week, keep_original_cols = FALSE) %>%
    step_other(author) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_log(debut_rank, best_rank)

xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 150
## Columns: 127
## $ id                       <dbl> 5745, 5270, 3661, 359, 4488, 3398, 2463, 2361…
## $ year                     <dbl> 1964, 2020, 1958, 2018, 1988, 1942, 2001, 198…
## $ debut_rank               <dbl> 2.0794415, 2.1972246, 2.4849066, 1.9459101, 2…
## $ best_rank                <dbl> 0.6931472, 1.3862944, 2.3978953, 2.7080502, 2…
## $ total_weeks              <dbl> 3.6375862, 1.6094379, 1.3862944, 0.0000000, 0…
## $ tfidf_title_1225         <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_79           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_8th          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_a            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_academic     <dbl> 0.00000, 0.00000, 5.01728, 0.00000, 0.00000, …
## $ tfidf_title_advances     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_adventures   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_advocate     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_affair       <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_again        <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_air          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_alexandria   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_alice        <dbl> 0.00000, 0.00000, 0.00000, 2.50864, 0.00000, …
## $ tfidf_title_alive        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_aloft        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_alone        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_and          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_angels       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_are          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_around       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_associate    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_at           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_auntie       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_back         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_balliois     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_beast        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_beauty       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bedtime      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_beet         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_belgravia    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_belong       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_better       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_birthday     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blane        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bloodline    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blowout      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blue         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bolitho      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bone         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_book         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bookshop     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_breakdown    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_brought      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_by           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cade         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_camerons     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_captive      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_caroline     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_castle       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cat          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_caught       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_century      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_ceremony     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cezanne      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_changes      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_chasing      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cheever      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cherie       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tfidf_title_children's` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_christmas    <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_christy      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_city         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_clancy       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_clock        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_come         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_comes        <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_confession   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_contact      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_correct      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tfidf_title_couldn't`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_crash        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_crimson      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cross        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_dark         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_day          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dead         <dbl> 0.000000, 0.000000, 0.000000, 2.165367, 0.000…
## $ tfidf_title_death        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_evil         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_force        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_happy        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_heart        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_high         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_house        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_in           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_innocent     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_jewels       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_king         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_man          <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_my           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_not          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_of           <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_on           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_one          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_or           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_stories      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_street       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_the          <dbl> 0.6263815, 1.2527630, 0.0000000, 0.0000000, 0…
## $ tfidf_title_to           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_we           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_with         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_year          <int> 1964, 2020, 1958, 2018, 1988, 1942, 2001, 198…
## $ author_Danielle.Steel    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_other             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ first_week_dow_Sun       <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ first_week_dow_Mon       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Tue       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Wed       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Thu       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Fri       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Sat       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Jan     <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, …
## $ first_week_month_Feb     <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Mar     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Apr     <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_May     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ first_week_month_Jun     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ first_week_month_Jul     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Aug     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Sep     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Oct     <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ first_week_month_Nov     <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ first_week_month_Dec     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …

# Specify model
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

# Combine recipe and model using workflow
xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

# Tune hyperparameters
set.seed(100)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv, 
            grid = 5)

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")

## # A tibble: 5 × 10
##   trees min_n tree_depth learn_rate .metric .estimator  mean     n std_err
##   <int> <int>      <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1  1605     7          3    0.00214 rmse    standard   0.556    10  0.0287
## 2  1072    15          7    0.00750 rmse    standard   0.681    10  0.0192
## 3   676    30         12    0.0160  rmse    standard   0.730    10  0.0226
## 4   306    34          6    0.159   rmse    standard   0.733    10  0.0301
## 5  1253    19         14    0.0697  rmse    standard   0.854    10  0.0405
## # ℹ 1 more variable: .config <chr>

Improve Model for Apply 4

set.seed(123)
bestsellers_split <-
  nyt_titles %>%
  transmute(
    author,
    total_weeks = if_else(total_weeks > 4, "long", "short")
  ) %>%
  na.omit() %>%
  initial_split(strata = total_weeks)
bestsellers_train <- training(bestsellers_split)
bestsellers_test <- testing(bestsellers_split)

set.seed(234)
bestsellers_folds <- vfold_cv(bestsellers_train, strata = total_weeks)
bestsellers_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5012/558]> Fold01
##  2 <split [5013/557]> Fold02
##  3 <split [5013/557]> Fold03
##  4 <split [5013/557]> Fold04
##  5 <split [5013/557]> Fold05
##  6 <split [5013/557]> Fold06
##  7 <split [5013/557]> Fold07
##  8 <split [5013/557]> Fold08
##  9 <split [5013/557]> Fold09
## 10 <split [5014/556]> Fold10

bestsellers_train %>% count(total_weeks)

## # A tibble: 2 × 2
##   total_weeks     n
##   <chr>       <int>
## 1 long         2721
## 2 short        2849

library(wordpiece)

## Warning: package 'wordpiece' was built under R version 4.4.1

svm_spec <- svm_linear(mode = "classification")

bestsellers_rec <-
  recipe(total_weeks ~ author, data = bestsellers_train) %>%
  step_tokenize_wordpiece(author, max_chars = 10) %>%
  step_tokenfilter(author, max_tokens = 100) %>%
  step_tf(author) %>%
  step_normalize(all_numeric_predictors())

prep(bestsellers_rec) %>% bake(new_data = NULL) %>% glimpse()

## Rows: 5,570
## Columns: 101
## $ total_weeks         <fct> long, long, long, long, long, long, long, long, lo…
## $ `tf_author_'`       <dbl> -0.08715687, -0.08715687, -0.08715687, -0.08715687…
## $ `tf_author_##a`     <dbl> -0.09419984, -0.09419984, -0.09419984, -0.09419984…
## $ `tf_author_##ac`    <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ `tf_author_##ci`    <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ `tf_author_##e`     <dbl> -0.1406038, -0.1406038, -0.1406038, -0.1406038, -0…
## $ `tf_author_##er`    <dbl> -0.1458252, -0.1458252, -0.1458252, -0.1458252, -0…
## $ `tf_author_##es`    <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ `tf_author_##ford`  <dbl> -0.08396368, -0.08396368, -0.08396368, -0.08396368…
## $ `tf_author_##in`    <dbl> -0.07951265, -0.07951265, -0.07951265, -0.07951265…
## $ `tf_author_##l`     <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ `tf_author_##m`     <dbl> -0.09024042, -0.09024042, -0.09024042, -0.09024042…
## $ `tf_author_##man`   <dbl> -0.1193075, -0.1193075, -0.1193075, -0.1193075, -0…
## $ `tf_author_##n`     <dbl> -0.1199358, -0.1199358, -0.1199358, -0.1199358, -0…
## $ `tf_author_##ne`    <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ `tf_author_##ont`   <dbl> -0.08819633, -0.08819633, -0.08819633, -0.08819633…
## $ `tf_author_##ovich` <dbl> -0.07614065, -0.07614065, -0.07614065, -0.07614065…
## $ `tf_author_##s`     <dbl> -0.1310066, -0.1310066, -0.1310066, -0.1310066, -0…
## $ `tf_author_##sen`   <dbl> -0.07409856, -0.07409856, -0.07409856, -0.07409856…
## $ `tf_author_##ssler` <dbl> -0.09652724, -0.09652724, -0.09652724, -0.09652724…
## $ `tf_author_##well`  <dbl> -0.07951265, -0.07951265, -0.07951265, -0.07951265…
## $ `tf_author_##y`     <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ `tf_author_##z`     <dbl> -0.1111617, -0.1111617, -0.1111617, -0.1111617, -0…
## $ tf_author_.         <dbl> -0.3243985, -0.3243985, -0.3243985, -0.3243985, -0…
## $ `tf_author_[UNK]`   <dbl> -0.1419488, -0.1419488, -0.1419488, -0.1419488, -0…
## $ tf_author_a         <dbl> -0.1207399, -0.1207399, -0.1207399, -0.1207399, -0…
## $ tf_author_alice     <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ tf_author_and       <dbl> -0.227056, -0.227056, -0.227056, -0.227056, -0.227…
## $ tf_author_ann       <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ tf_author_anne      <dbl> -0.1111617, -0.1111617, -0.1111617, -0.1111617, -0…
## $ tf_author_b         <dbl> -0.1406038, -0.1406038, -0.1406038, -0.1406038, -0…
## $ tf_author_bald      <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ tf_author_barbara   <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ tf_author_brown     <dbl> -0.1019979, -0.1019979, -0.1019979, 8.1317469, -0.…
## $ tf_author_by        <dbl> -0.08310031, -0.08310031, -0.08310031, -0.08310031…
## $ tf_author_c         <dbl> -0.09893404, -0.09893404, -0.09893404, -0.09893404…
## $ tf_author_child     <dbl> -0.07732059, -0.07732059, -0.07732059, -0.07732059…
## $ tf_author_clark     <dbl> -0.09274912, -0.09274912, -0.09274912, -0.09274912…
## $ tf_author_clive     <dbl> -0.1034598, -0.1034598, -0.1034598, -0.1034598, -0…
## $ tf_author_co        <dbl> -0.08504102, -0.08504102, -0.08504102, -0.08504102…
## $ tf_author_cr        <dbl> -0.08610525, -0.08610525, -0.08610525, -0.08610525…
## $ tf_author_cu        <dbl> -0.09652724, -0.09652724, -0.09652724, -0.09652724…
## $ tf_author_d         <dbl> -0.1034598, -0.1034598, -0.1034598, -0.1034598, -0…
## $ tf_author_danielle  <dbl> -0.1266872, -0.1266872, -0.1266872, -0.1266872, -0…
## $ tf_author_david     <dbl> -0.1222252, -0.1222252, -0.1222252, -0.1222252, -0…
## $ tf_author_dean      <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_e         <dbl> -0.1185213, -0.1185213, -0.1185213, -0.1185213, -0…
## $ tf_author_elizabeth <dbl> -0.1136183, -0.1136183, -0.1136183, -0.1136183, -0…
## $ tf_author_evan      <dbl> -0.0796298, -0.0796298, -0.0796298, -0.0796298, -0…
## $ tf_author_f         <dbl> -0.09985495, -0.09985495, -0.09985495, -0.09985495…
## $ tf_author_frank     <dbl> -0.08819633, -0.08819633, 11.33630484, -0.08819633…
## $ tf_author_gr        <dbl> -0.09224066, -0.09224066, -0.09224066, -0.09224066…
## $ tf_author_griffin   <dbl> -0.08610525, -0.08610525, -0.08610525, -0.08610525…
## $ tf_author_higgins   <dbl> -0.1120983, -0.1120983, -0.1120983, -0.1120983, -0…
## $ tf_author_howard    <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_j         <dbl> -0.1646265, -0.1646265, -0.1646265, -0.1646265, -0…
## $ tf_author_james     <dbl> -0.196587, -0.196587, -0.196587, -0.196587, -0.196…
## $ tf_author_jan       <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_janet     <dbl> -0.08610525, -0.08610525, -0.08610525, -0.08610525…
## $ tf_author_jeff      <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_john      <dbl> -0.2093925, -0.2093925, -0.2093925, -0.2093925, -0…
## $ tf_author_jonathan  <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ tf_author_judith    <dbl> -0.07951265, -0.07951265, -0.07951265, -0.07951265…
## $ tf_author_k         <dbl> -0.1191634, -0.1191634, -0.1191634, -0.1191634, -0…
## $ tf_author_keller    <dbl> -0.0851383, -0.0851383, -0.0851383, -0.0851383, -0…
## $ tf_author_ken       <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ tf_author_king      <dbl> -0.09991709, -0.09991709, -0.09991709, -0.09991709…
## $ tf_author_ko        <dbl> -0.09322522, -0.09322522, -0.09322522, -0.09322522…
## $ tf_author_l         <dbl> -0.08715687, -0.08715687, -0.08715687, -0.08715687…
## $ tf_author_la        <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ tf_author_lee       <dbl> -0.09124584, -0.09124584, -0.09124584, -0.09124584…
## $ tf_author_lisa      <dbl> -0.09124584, -0.09124584, -0.09124584, -0.09124584…
## $ tf_author_louis     <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ tf_author_ma        <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_mac       <dbl> -0.08396368, -0.08396368, -0.08396368, -0.08396368…
## $ tf_author_mary      <dbl> -0.1358838, -0.1358838, -0.1358838, -0.1358838, -0…
## $ tf_author_mc        <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_michael   <dbl> -0.1237294, -0.1237294, -0.1237294, -0.1237294, -0…
## $ tf_author_nora      <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_o         <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_parker    <dbl> -0.08287274, -0.08287274, -0.08287274, -0.08287274…
## $ tf_author_patterson <dbl> -0.1392703, -0.1392703, -0.1392703, -0.1392703, -0…
## $ tf_author_paul      <dbl> -0.09706705, -0.09706705, -0.09706705, -0.09706705…
## $ tf_author_r         <dbl> -0.1132612, -0.1132612, -0.1132612, -0.1132612, -0…
## $ tf_author_richard   <dbl> -0.1160255, -0.1160255, -0.1160255, -0.1160255, -0…
## $ tf_author_robert    <dbl> -0.1605555, -0.1605555, -0.1605555, -0.1605555, -0…
## $ tf_author_roberts   <dbl> -0.08287274, -0.08287274, -0.08287274, -0.08287274…
## $ tf_author_s         <dbl> -0.09087668, -0.09087668, -0.09087668, -0.09087668…
## $ tf_author_sand      <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_scott     <dbl> -0.08715687, -0.08715687, -0.08715687, -0.08715687…
## $ tf_author_smith     <dbl> -0.09124584, -0.09124584, -0.09124584, -0.09124584…
## $ tf_author_steel     <dbl> -0.1259539, -0.1259539, -0.1259539, -0.1259539, -0…
## $ tf_author_stephen   <dbl> -0.1199358, -0.1199358, -0.1199358, -0.1199358, -0…
## $ tf_author_stuart    <dbl> -0.1007678, -0.1007678, -0.1007678, -0.1007678, -0…
## $ tf_author_taylor    <dbl> -0.08396368, -0.08396368, -0.08396368, -0.08396368…
## $ tf_author_terry     <dbl> -0.08504102, -0.08504102, -0.08504102, -0.08504102…
## $ tf_author_thomas    <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_tom       <dbl> -0.07951265, -0.07951265, -0.07951265, -0.07951265…
## $ tf_author_w         <dbl> -0.1078035, -0.1078035, -0.1078035, -0.1078035, -0…
## $ tf_author_william   <dbl> -0.1152284, -0.1152284, -0.1152284, -0.1152284, -0…
## $ tf_author_woods     <dbl> -0.09800482, -0.09800482, -0.09800482, -0.09800482…

Apply 4: NY Times Bestseller List

Sara Donahue

2024-10-10