Import Data

# Load the NYT bestseller titles table (tab-separated) straight from the
# TidyTuesday repository on GitHub.
nyt_titles <- readr::read_tsv(
    file = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv'
)

## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Summarise every column of the raw data (types, missingness, distributions).
# Called directly rather than through a pipe so the summary keeps the
# data set's own name.
skimr::skim(data = nyt_titles)

Data summary
Name	nyt_titles
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

# Build the modelling table from the raw titles.
data <- nyt_titles %>%

    # Drop the rank and first-week columns
    select(-c(debut_rank, best_rank, first_week)) %>%

    # Treat missing values: discard any row that still contains an NA
    na.omit() %>%

    # Keep only titles with at most 100 weeks on the list
    filter(total_weeks <= 100) %>%

    # log transform variables with pos-skewed distribution
    mutate(total_weeks = log(total_weeks))

# Print the cleaned table
data

## # A tibble: 7,416 × 5
##       id title                   author                year total_weeks
##    <dbl> <chr>                   <chr>                <dbl>       <dbl>
##  1     0 "\"H\" IS FOR HOMICIDE" Sue Grafton           1991        2.71
##  2     1 "\"I\" IS FOR INNOCENT" Sue Grafton           1992        2.40
##  3    10 "''G'' IS FOR GUMSHOE"  Sue Grafton           1990        1.79
##  4   100 "A DOG'S JOURNEY"       W. Bruce Cameron      2012        0   
##  5  1000 "CHANGING FACES"        Kimberla Lawson Roby  2006        0   
##  6  1001 "CHAOS"                 Patricia Cornwell     2016        1.10
##  7  1002 "CHAPTERHOUSE: DUNE"    Frank Herbert         1985        2.77
##  8  1003 "CHARADE"               Sandra Brown          1994        1.61
##  9  1004 "CHARLESTON"            John Jakes            2002        1.39
## 10  1005 "CHARLOTTE GRAY"        Sebastian Faulks      1999        0   
## # ℹ 7,406 more rows

Explore Data

# Scatter plot of (log) total weeks against author
ggplot(data, aes(x = total_weeks, y = author)) +
    geom_point()

# Scatter plot of (log) total weeks against publication year
ggplot(data, aes(x = total_weeks, y = year)) +
    geom_point()

# Prepare
# Convert every remaining column (year excluded) into 0/1 indicator columns.
# NOTE(review): `id` looks like a row identifier rather than a feature;
# consider dropping it here as well -- confirm before changing.
data_binarized_tbl <- binarize(select(data, -year))

# Inspect the binarized columns
glimpse(data_binarized_tbl)

## Rows: 7,416
## Columns: 12
## $ `id__-Inf_1858.75`                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ id__1858.75_3715.5                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__3715.5_5574.25                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__5574.25_Inf                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ title__GONE                                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `title__-OTHER`                                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ author__Danielle_Steel                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…

# Correlate
# Correlate each binarized column with the top total_weeks bin, which is
# used as the target. The target name is the generated bin label and is
# tied to the bin edges computed for this data.
data_corr_tbl <- correlate(
    data_binarized_tbl,
    target = total_weeks__2.30258509299405_Inf
)

# Print the correlation table
data_corr_tbl

## # A tibble: 12 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 2.30258509299405_Inf                   1      
##  2 total_weeks -Inf_0.693147180559945                -0.396  
##  3 total_weeks 1.38629436111989_2.30258509299405     -0.322  
##  4 total_weeks 0.693147180559945_1.38629436111989    -0.256  
##  5 id          -Inf_1858.75                          -0.0279 
##  6 id          3715.5_5574.25                         0.0235 
##  7 title       GONE                                  -0.0129 
##  8 title       -OTHER                                 0.0129 
##  9 id          5574.25_Inf                            0.00880
## 10 id          1858.75_3715.5                        -0.00440
## 11 author      Danielle_Steel                         0.00425
## 12 author      -OTHER                                -0.00425

# Plot
# Draw the correlation funnel for the table computed above
plot_correlation_funnel(data_corr_tbl)

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Build models

# Work on a random subset of 100 rows (presumably to keep tuning fast --
# confirm). The RNG is seeded *before* sampling: without this the subset
# changes on every run, so the seeded split and folds below would not be
# reproducible either.
set.seed(1234)
data <- sample_n(data, 100)

# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- rsample::training(data_split)
data_test  <- rsample::testing(data_split)

# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10

library(usemodels)

## Warning: package 'usemodels' was built under R version 4.5.2

# Print template code (recipe, model spec, workflow, tuning call) for an
# xgboost model of total_weeks on all other columns; it is adapted by hand
# in the chunks that follow.
usemodels::use_xgboost(
    formula = total_weeks ~ .,
    data = data_train
)

## xgboost_recipe <- 
##   recipe(formula = total_weeks ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(6804)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify recipe
# Model total_weeks on all other columns of the training data.
xgboost_recipe <- recipe(total_weeks ~ ., data = data_train) %>%
  # Keep `id` in the data but give it a non-predictor role
  recipes::update_role(id, new_role = "id variable") %>%
  # Title text: tokenize, filter with max_tokens = 100, convert to tf-idf
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  # Author text: same treatment as title
  step_tokenize(author) %>%
  step_tokenfilter(author, max_tokens = 100) %>%
  step_tfidf(author) %>%
  # Remove zero-variance predictors, then one-hot encode any nominal ones
  step_zv(all_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

# Preview the processed training data produced by the recipe
glimpse(juice(prep(xgboost_recipe)))

## Rows: 75
## Columns: 203
## $ id                         <dbl> 6499, 3669, 840, 234, 4209, 4702, 4054, 103…
## $ year                       <dbl> 1959, 2002, 2013, 2015, 1948, 1980, 2003, 1…
## $ total_weeks                <dbl> 1.0986123, 1.7917595, 0.6931472, 2.5649494,…
## $ tfidf_title_47th           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_a              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.9943845,…
## $ tfidf_title_abiding        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_abode          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_about          <dbl> 0.0000000, 0.0000000, 0.7217889, 0.0000000,…
## $ tfidf_title_abstinence     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_account        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_agent          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_american       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_and            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_title_any            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_are            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bare           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_blue           <dbl> 0.000000, 0.000000, 0.000000, 1.443578, 0.0…
## $ tfidf_title_blues          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bones          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bourne         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_boy            <dbl> 0.0000000, 0.0000000, 0.7217889, 0.0000000,…
## $ tfidf_title_bride          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_bridget        <dbl> 0.0000000, 0.0000000, 0.7217889, 0.0000000,…
## $ tfidf_title_career         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_catch          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_chances        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_china          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_choice         <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_coma           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_corner         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_title_countdown      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_court          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_courtship      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_creation       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_darkest        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_darkness       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_date           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_daughter       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_dead           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_title_death          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_demons         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_die            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_dog            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_dreams         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_dwell          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_evening        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_evil           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_eye            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_title_few            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_first          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_fisherman      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_for            <dbl> 0.000000, 1.491577, 0.000000, 0.000000, 0.0…
## $ tfidf_title_four           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_from           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_title_frozen         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_fundamentalist <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_gathering      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_girl           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_glory          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_god            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_goddess        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_golden         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_goodbar        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hard           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_heaven         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_his            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_title_homeland       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hound          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hours          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_identity       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_in             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_title_is             <dbl> 0.000000, 1.825329, 0.000000, 0.000000, 0.0…
## $ tfidf_title_jones          <dbl> 0.0000000, 0.0000000, 0.7217889, 0.0000000,…
## $ tfidf_title_king           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_knows          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_last           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_title_lawyer         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_leaving        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_legacy         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_leia           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_light          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_line           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_looking        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_lord           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_love           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_mad            <dbl> 0.0000000, 0.0000000, 0.7217889, 0.0000000,…
## $ tfidf_title_man            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_matarese       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_me             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_midnight       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_mischief       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_moon           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.2…
## $ tfidf_title_more           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_mountains      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_mr             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_naked          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_title_nature         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_title_new            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_of             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.5972532,…
## $ tfidf_title_shadow         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_the            <dbl> 0.6645680, 0.0000000, 0.2215227, 0.0000000,…
## $ tfidf_title_time           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_to             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_a             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_allison       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_amanda        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_and           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_author_andrews       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_ann           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_anne          <dbl> 0.000000, 0.000000, 0.000000, 4.330733, 0.0…
## $ tfidf_author_aubrey        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_bach          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_berry         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_bradda        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_brandon       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_burcell       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_author_by            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_caldwell      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_carl          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_carré         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_child         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_christopher   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_cleeves       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_clive         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_author_cook          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_crombie       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_cussler       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_author_d             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_danielle      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_dave          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_dean          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_deborah       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_denis         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_dorst         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_doug          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_dustin        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_edward        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_elizabeth     <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_erica         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_evanovich     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_field         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_fielding      <dbl> 0.000000, 0.000000, 2.165367, 0.000000, 0.0…
## $ tfidf_author_fred          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_galbraith     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_gallico       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_garwood       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_george        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.4…
## $ tfidf_author_gipson        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_godden        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_goodkind      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_grafton       <dbl> 0.000000, 1.825329, 0.000000, 0.000000, 0.0…
## $ tfidf_author_grisham       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_hamid         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_harold        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_harrison      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_harry         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_hayden        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_helen         <dbl> 0.000000, 0.000000, 2.165367, 0.000000, 0.0…
## $ tfidf_author_heller        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_hiaasen       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_hitrec        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.4…
## $ tfidf_author_hunter        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_ian           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_author_illustrated   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_j             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_j.d           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_jacqueline    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_jakes         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_jan           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_jance         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_janet         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_janeway       <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_jayne         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_jeff          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_jodi          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_john          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_johnson       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_jong          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_jordan        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_joseph        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.2…
## $ tfidf_author_judith        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_julie         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_karin         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_karon         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_kathy         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_kay           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_kaye          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_kim           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_kinsella      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_koontz        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_krentz        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_l             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_le            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_lee           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_ludlum        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_mary          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_morris        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_richard       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_author_robb          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0…
## $ tfidf_author_robert        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_author_robin         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_author_sue           <dbl> 0.000000, 1.825329, 0.000000, 0.000000, 0.0…
## $ tfidf_author_west          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

# Specify model
# Boosted-tree regression on the xgboost engine; every listed hyperparameter
# is marked with tune() so it is chosen during tuning.
xgboost_spec <- boost_tree(
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune()
) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Combine recipe and models using workflow
# Start from an empty workflow, then attach the recipe and the model spec.
xgboost_workflow <- workflow()
xgboost_workflow <- add_recipe(xgboost_workflow, xgboost_recipe)
xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

# Tune hyperparameters
# Evaluate 5 candidate hyperparameter sets on the cross-validation folds.
# Seeded so that the run is repeatable.
set.seed(6804)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)

## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
##                standard deviation, resulting in a divide by 0 error. `NA` will be returned.

## There were issues with some computations   A: x30

Apply to your data 2

Liam Smith

2026-02-13

Import Data

Explore Data

Build models