goal: to predict how long a book will last on the NY Times Bestsellers list
Click here for the data
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): title, author
## dbl (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
my_data <- nyt_titles %>%
select(-debut_rank, -first_week) %>%
na.omit() %>%
mutate(total_weeks = log(total_weeks))
skimr::skim(my_data)
Name | my_data |
Number of rows | 7427 |
Number of columns | 6 |
_______________________ | |
Column type frequency: | |
character | 2 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 55 | 0 | 7168 | 0 |
author | 0 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3716.06 | 2145.23 | 0 | 1858.50 | 3717.00 | 5573.5 | 7430.00 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.63 | 26.21 | 1931 | 1968.00 | 2000.00 | 2011.0 | 2020.00 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 1.48 | 1.11 | 0 | 0.69 | 1.39 | 2.3 | 5.18 | ▇▇▆▂▁ |
best_rank | 0 | 1 | 6.92 | 4.57 | 1 | 3.00 | 6.00 | 10.5 | 17.00 | ▇▅▃▃▂ |
data_binarized <- my_data %>%
select(-title, -id) %>%
binarize()
data_binarized %>% glimpse
## Rows: 7,427
## Columns: 14
## $ author__Danielle_Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
mydata_correlate <- data_binarized %>%
correlate(total_weeks__2.30258509299405_Inf)
mydata_correlate %>%
plot_correlation_funnel()
my_data <- sample_n(my_data, 100)
# split data into training and testing
set.seed(1122)
data_split <- rsample::initial_split(my_data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training data
set.seed(2211)
my_data_cv <- rsample::vfold_cv(data_train)
my_data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
usemodels::use_xgboost(total_weeks ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = total_weeks ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(38194)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify recipe
xgboost_recipe <-
recipe(formula = total_weeks ~ ., data = data_train) %>%
recipes::update_role(id, new_role = "id variable") %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tfidf(title) %>%
step_other(author) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_log(year, best_rank)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 75
## Columns: 106
## $ id <dbl> 873, 6655, 3639, 794, 1942, 4940, 646, 64, 19…
## $ year <dbl> 7.604396, 7.605890, 7.569412, 7.605392, 7.598…
## $ best_rank <dbl> 1.0986123, 1.9459101, 1.3862944, 1.0986123, 2…
## $ total_weeks <dbl> 1.7917595, 0.6931472, 0.6931472, 1.3862944, 0…
## $ tfidf_title_2 <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_a <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_all <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_alternate <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_amberley <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_and <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_any <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_are <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_at <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_back <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_battlefield <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_being <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_big <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_blessing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blood <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_blows <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_bone <dbl> 0.000000, 0.000000, 0.000000, 2.165367, 0.000…
## $ tfidf_title_bridges <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bungalow <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_burning <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_business <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_captain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_celia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_certain <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_chesapeake <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_christmas <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_chronicle <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_clear <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cold <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_collector <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_come <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_commissioner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_could <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_counsel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_country <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_crossed <dbl> 0.000000, 0.000000, 0.000000, 2.165367, 0.000…
## $ tfidf_title_dark <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_day <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_dead <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_deception <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_deed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_die <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_double <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_dreamfever <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_earth <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_elizabeth <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_enemy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_exit <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_fame <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_fifth <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_fire <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_first <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_for <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_from <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.082…
## $ tfidf_title_galileans <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_game <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_ghost <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_good <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_heartless <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_here <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_hiding <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_hope <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_husband <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_if <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_image <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_images <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_in <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_incredible <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_institute <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_is <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_jedi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_jolly <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_journey <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_justice <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_kill <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tfidf_title_king's` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_kings <dbl> 0.000000, 1.082683, 0.000000, 0.000000, 0.000…
## $ tfidf_title_last <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_let <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_light <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_lincoln <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_list <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_little <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_long <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_looking <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_looks <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_lost <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_love <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_mandingo <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_march <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_master <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_me <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_memory <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_mitford <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_moon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_of <dbl> 0.0000000, 0.5350165, 0.0000000, 0.0000000, 0…
## $ tfidf_title_the <dbl> 0.0000000, 0.3017732, 0.0000000, 0.0000000, 0…
## $ tfidf_title_time <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.825…
## $ tfidf_title_to <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ tfidf_title_way <dbl> 0.0000000, 0.9126646, 0.0000000, 0.0000000, 0…
## $ author_Irving.Wallace <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
# Specify model
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# tune hyperparameters
set.seed(34266)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = my_data_cv,
grid = 5)
## i Creating pre-processing data to finalize unknown parameter: mtry
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
## standard deviation, resulting in a divide by 0 error. `NA` will be returned.
##
There were issues with some computations A: x1
There were issues with some computations A: x2
There were issues with some computations A: x3
There were issues with some computations A: x4
There were issues with some computations A: x5
There were issues with some computations A: x6
There were issues with some computations A: x7
There were issues with some computations A: x8
There were issues with some computations A: x9
There were issues with some computations A: x10
There were issues with some computations A: x10