Goal: To predict the total weeks on Bestsellers list (total_weeks). Click here for the data.

Import Data

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
Data summary
Name nyt_titles
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂

Clean Data

data <- nyt_titles %>%
  
  # Treat Missing Values
   na.omit() %>%
  
  # Log Transform Variables with Pos-skewed Distribution
  mutate(total_weeks = log(total_weeks))
  

skimr::skim(data)
Data summary
Name data
Number of rows 7427
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 55 0 7168 0
author 0 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-07-02 3346

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3716.06 2145.23 0 1858.50 3717.00 5573.5 7430.00 ▇▇▇▇▇
year 0 1 1989.63 26.21 1931 1968.00 2000.00 2011.0 2020.00 ▂▂▂▃▇
total_weeks 0 1 1.48 1.11 0 0.69 1.39 2.3 5.18 ▇▇▆▂▁
debut_rank 0 1 7.90 4.57 1 4.00 8.00 12.0 17.00 ▇▆▅▅▅
best_rank 0 1 6.92 4.57 1 3.00 6.00 10.5 17.00 ▇▅▃▃▂

Explore Data

Identify good predictors.

Year

data %>%
  ggplot(aes(total_weeks, year)) +
  scale_y_log10() +
  geom_point()

First Week on Bestseller List

data %>%
  
   # Extract date features from first_week
    mutate(year = lubridate::year(first_week),
           month = lubridate::month(first_week, label = TRUE),
           weekday = lubridate::wday(first_week, label = TRUE)) %>%
    select(-first_week) %>%
  
  
  ggplot(aes(total_weeks, month)) +
    geom_point()

data %>%
  
   # Extract date features from first_week
    mutate(year = lubridate::year(first_week),
           month = lubridate::month(first_week, label = TRUE),
           weekday = lubridate::wday(first_week, label = TRUE)) %>%
    select(-first_week) %>%
  
  
  ggplot(aes(weekday, total_weeks)) +
    geom_point()

Debut Rank

data %>%
  ggplot(aes(total_weeks, debut_rank)) +
  scale_y_log10()+
  geom_point()

data %>%
  ggplot(aes(total_weeks, as.factor(debut_rank))) +
  geom_boxplot()

data %>%
  ggplot(aes(debut_rank, as.factor(total_weeks))) +
  geom_point()

Best Rank

data %>%
  ggplot(aes(total_weeks, best_rank)) +
  scale_y_log10()+
  geom_point()

Author

data %>%
  
  # Tokenize Author
  unnest_tokens(output = Author, input = author) %>%
  
  # Calculate avg number of weeks by author
  group_by(Author) %>%
  summarise(total_weeks = mean(total_weeks),
                  n     = n()) %>%
  ungroup() %>%
  
  filter(n > 10) %>%
  slice_max(order_by = Author, n = 20) %>%
  
  # Plot
  ggplot(aes(total_weeks, fct_reorder(Author, total_weeks))) +
  geom_point() +
  
  labs(y = "")

# Step 1: Prepare data
data_binarized <- data %>%
  select(-title, -year, -id) %>%
  
   # Extract date features from first_week
    mutate(year = lubridate::year(first_week),
           month = lubridate::month(first_week, label = TRUE),
           weekday = lubridate::wday(first_week, label = TRUE)) %>%
    select(-first_week) %>%
    binarize()

data_binarized %>% glimpse()
## Rows: 7,427
## Columns: 32
## $ author__Danielle_Steel                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4`                            <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5                               <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf                             <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `year__-Inf_1968`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ month__01                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__02                                       <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ month__03                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__04                                       <dbl> 0, 1, 0, 0, 0, 0, 1, 0…
## $ month__05                                       <dbl> 1, 0, 1, 1, 0, 0, 0, 1…
## $ month__06                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__07                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__08                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__09                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__10                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__11                                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__12                                       <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ weekday__Sun                                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ weekday__Mon                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
# Step 2: Correlate
data_corr <- data_binarized %>%
  correlate(total_weeks__2.30258509299405_Inf)

data_corr
## # A tibble: 32 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 2.30258509299405_Inf                     1    
##  2 total_weeks -Inf_0.693147180559945                  -0.397
##  3 best_rank   -Inf_3                                   0.344
##  4 total_weeks 1.38629436111989_2.30258509299405       -0.323
##  5 best_rank   10.5_Inf                                -0.314
##  6 total_weeks 0.693147180559945_1.38629436111989      -0.256
##  7 year        1968_2000                                0.242
##  8 year        2011_Inf                                -0.235
##  9 year        2000_2011                               -0.230
## 10 year        -Inf_1968                                0.217
## # ℹ 22 more rows
# Step 3: Plot
data_corr %>%
  plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

data_corr %>% glimpse
## Rows: 32
## Columns: 3
## $ feature     <fct> total_weeks, total_weeks, best_rank, total_weeks, best_ran…
## $ bin         <chr> "2.30258509299405_Inf", "-Inf_0.693147180559945", "-Inf_3"…
## $ correlation <dbl> 1.00000000, -0.39682290, 0.34356819, -0.32304526, -0.31361…

Build Models

Split data

data <- sample_n(data, 200)

# Split into training and testing data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training data set for cross-validation 
# Use rsample::bootstraps(data_train) instead of vfold_cv(data_train) for data sets under 1000
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [135/15]> Fold01
##  2 <split [135/15]> Fold02
##  3 <split [135/15]> Fold03
##  4 <split [135/15]> Fold04
##  5 <split [135/15]> Fold05
##  6 <split [135/15]> Fold06
##  7 <split [135/15]> Fold07
##  8 <split [135/15]> Fold08
##  9 <split [135/15]> Fold09
## 10 <split [135/15]> Fold10
usemodels::use_xgboost(total_weeks ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = total_weeks ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(57769)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify recipe
xgboost_recipe <- 
  recipe(formula = total_weeks ~ ., data = data_train) %>%
    recipes::update_role(id, new_role = "id") %>%
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_date(first_week, keep_original_cols = FALSE) %>%
    step_other(author) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_log(debut_rank, best_rank)

xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 150
## Columns: 127
## $ id                        <dbl> 4821, 2403, 3368, 7122, 2996, 7165, 6448, 68…
## $ year                      <dbl> 2007, 2015, 2009, 2017, 2016, 1986, 1947, 19…
## $ debut_rank                <dbl> 2.0794415, 2.0794415, 2.7725887, 1.9459101, …
## $ best_rank                 <dbl> 0.0000000, 1.3862944, 2.6390573, 0.6931472, …
## $ total_weeks               <dbl> 2.5649494, 2.0794415, 0.0000000, 1.0986123, …
## $ tfidf_title_a             <dbl> 0.000000, 0.000000, 1.629048, 0.000000, 0.00…
## $ tfidf_title_act           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_after         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_age           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `tfidf_title_ain't`       <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_air           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_all           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_alley         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_always        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_america       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_an            <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_anathem       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_and           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_anger         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_appetites     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_arthas        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_at            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_atlas         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_b             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_baby          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_basket        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_beautifully   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_beauty        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_before        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_bennett       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_between       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_beyond        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_bitterroots   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_blessings     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_blue          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_blues         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_bond          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_book          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_borne         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_brayford      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_brothers      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_burgoyne      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_bye           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_case          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_castle        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_catch         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_centennial    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_challenge     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_chance        <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_chances       <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_chaos         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cheap         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_children      <dbl> 1.672427, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_christmas     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_circle        <dbl> 0.00000, 0.00000, 0.00000, 5.01728, 0.00000,…
## $ tfidf_title_city          <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_clancy        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cleft         <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_cold          <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_companion     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_condominium   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_consul        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cosmopolitans <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_country       <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_coyle         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cradle        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dangerous     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dark          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_darkness      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dating        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_daughter      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `tfidf_title_day's`       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_days          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dead          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_deals         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_deeper        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_descending    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_desire        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_devolution    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_diana         <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_dirt          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_distant       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_doing         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_doll          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dragon        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_edge          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_from          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_game          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_girl          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_hour          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_i             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_in            <dbl> 0.000000, 1.629048, 0.000000, 0.000000, 0.00…
## $ tfidf_title_lord          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_my            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_of            <dbl> 0.6374625, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_once          <dbl> 0.000000, 0.000000, 2.165367, 0.000000, 0.00…
## $ tfidf_title_prey          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_shot          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_sisters       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_so            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_the           <dbl> 0.4831578, 0.7247366, 0.0000000, 0.0000000, …
## $ tfidf_title_to            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_well          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_woman         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_you           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_year           <int> 2007, 2015, 2009, 2017, 2016, 1986, 1947, 19…
## $ author_Danielle.Steel     <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ author_other              <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ first_week_dow_Sun        <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,…
## $ first_week_dow_Mon        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Tue        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Wed        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Thu        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Fri        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Sat        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Jan      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Feb      <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,…
## $ first_week_month_Mar      <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Apr      <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_May      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Jun      <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Jul      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Aug      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Sep      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ first_week_month_Oct      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,…
## $ first_week_month_Nov      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ first_week_month_Dec      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Specify model
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

# Combine recipe and model using workflow
xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

# Tune hyperparameters
set.seed(100)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv, 
            grid = 5)

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
##   trees min_n tree_depth learn_rate .metric .estimator  mean     n std_err
##   <int> <int>      <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1  1072    15          7    0.00750 rmse    standard   0.587    10  0.0500
## 2  1605     7          3    0.00214 rmse    standard   0.605    10  0.0411
## 3   676    30         12    0.0160  rmse    standard   0.637    10  0.0518
## 4   306    34          6    0.159   rmse    standard   0.671    10  0.0574
## 5  1253    19         14    0.0697  rmse    standard   0.700    10  0.0547
## # ℹ 1 more variable: .config <chr>

Make Predictions