Goal: To predict the total weeks on Bestsellers list (total_weeks). Click here for the data.
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): title, author
## dbl (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
# Treat Missing Values
na.omit() %>%
# Log Transform Variables with Pos-skewed Distribution
mutate(total_weeks = log(total_weeks))
skimr::skim(data)
Name | data |
Number of rows | 7427 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 55 | 0 | 7168 | 0 |
author | 0 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-07-02 | 3346 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3716.06 | 2145.23 | 0 | 1858.50 | 3717.00 | 5573.5 | 7430.00 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.63 | 26.21 | 1931 | 1968.00 | 2000.00 | 2011.0 | 2020.00 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 1.48 | 1.11 | 0 | 0.69 | 1.39 | 2.3 | 5.18 | ▇▇▆▂▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.00 | 8.00 | 12.0 | 17.00 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.92 | 4.57 | 1 | 3.00 | 6.00 | 10.5 | 17.00 | ▇▅▃▃▂ |
Identify good predictors.
Year
data %>%
ggplot(aes(total_weeks, year)) +
scale_y_log10() +
geom_point()
First Week on Bestseller List
data %>%
# Extract date features from first_week
mutate(year = lubridate::year(first_week),
month = lubridate::month(first_week, label = TRUE),
weekday = lubridate::wday(first_week, label = TRUE)) %>%
select(-first_week) %>%
ggplot(aes(total_weeks, month)) +
geom_point()
data %>%
# Extract date features from first_week
mutate(year = lubridate::year(first_week),
month = lubridate::month(first_week, label = TRUE),
weekday = lubridate::wday(first_week, label = TRUE)) %>%
select(-first_week) %>%
ggplot(aes(weekday, total_weeks)) +
geom_point()
Debut Rank
data %>%
ggplot(aes(total_weeks, debut_rank)) +
scale_y_log10()+
geom_point()
data %>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
data %>%
ggplot(aes(debut_rank, as.factor(total_weeks))) +
geom_point()
Best Rank
data %>%
ggplot(aes(total_weeks, best_rank)) +
scale_y_log10()+
geom_point()
Author
data %>%
# Tokenize Author
unnest_tokens(output = Author, input = author) %>%
# Calculate avg number of weeks by author
group_by(Author) %>%
summarise(total_weeks = mean(total_weeks),
n = n()) %>%
ungroup() %>%
filter(n > 10) %>%
slice_max(order_by = Author, n = 20) %>%
# Plot
ggplot(aes(total_weeks, fct_reorder(Author, total_weeks))) +
geom_point() +
labs(y = "")
# Step 1: Prepare data
data_binarized <- data %>%
select(-title, -year, -id) %>%
# Extract date features from first_week
mutate(year = lubridate::year(first_week),
month = lubridate::month(first_week, label = TRUE),
weekday = lubridate::wday(first_week, label = TRUE)) %>%
select(-first_week) %>%
binarize()
data_binarized %>% glimpse()
## Rows: 7,427
## Columns: 32
## $ author__Danielle_Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `total_weeks__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ month__01 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__02 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ month__03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__04 <dbl> 0, 1, 0, 0, 0, 0, 1, 0…
## $ month__05 <dbl> 1, 0, 1, 1, 0, 0, 0, 1…
## $ month__06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__07 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__08 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__09 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__12 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ weekday__Sun <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ weekday__Mon <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
# Step 2: Correlate
data_corr <- data_binarized %>%
correlate(total_weeks__2.30258509299405_Inf)
data_corr
## # A tibble: 32 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 total_weeks 2.30258509299405_Inf 1
## 2 total_weeks -Inf_0.693147180559945 -0.397
## 3 best_rank -Inf_3 0.344
## 4 total_weeks 1.38629436111989_2.30258509299405 -0.323
## 5 best_rank 10.5_Inf -0.314
## 6 total_weeks 0.693147180559945_1.38629436111989 -0.256
## 7 year 1968_2000 0.242
## 8 year 2011_Inf -0.235
## 9 year 2000_2011 -0.230
## 10 year -Inf_1968 0.217
## # ℹ 22 more rows
# Step 3: Plot
data_corr %>%
plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
data_corr %>% glimpse
## Rows: 32
## Columns: 3
## $ feature <fct> total_weeks, total_weeks, best_rank, total_weeks, best_ran…
## $ bin <chr> "2.30258509299405_Inf", "-Inf_0.693147180559945", "-Inf_3"…
## $ correlation <dbl> 1.00000000, -0.39682290, 0.34356819, -0.32304526, -0.31361…
Split data
data <- sample_n(data, 200)
# Split into training and testing data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training data set for cross-validation
# Use rsample::bootstraps(data_train) instead of vfold_cv(data_train) for data sets under 1000
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [135/15]> Fold01
## 2 <split [135/15]> Fold02
## 3 <split [135/15]> Fold03
## 4 <split [135/15]> Fold04
## 5 <split [135/15]> Fold05
## 6 <split [135/15]> Fold06
## 7 <split [135/15]> Fold07
## 8 <split [135/15]> Fold08
## 9 <split [135/15]> Fold09
## 10 <split [135/15]> Fold10
usemodels::use_xgboost(total_weeks ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = total_weeks ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(57769)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify recipe
xgboost_recipe <-
recipe(formula = total_weeks ~ ., data = data_train) %>%
recipes::update_role(id, new_role = "id") %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tfidf(title) %>%
step_date(first_week, keep_original_cols = FALSE) %>%
step_other(author) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_log(debut_rank, best_rank)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 150
## Columns: 127
## $ id <dbl> 4821, 2403, 3368, 7122, 2996, 7165, 6448, 68…
## $ year <dbl> 2007, 2015, 2009, 2017, 2016, 1986, 1947, 19…
## $ debut_rank <dbl> 2.0794415, 2.0794415, 2.7725887, 1.9459101, …
## $ best_rank <dbl> 0.0000000, 1.3862944, 2.6390573, 0.6931472, …
## $ total_weeks <dbl> 2.5649494, 2.0794415, 0.0000000, 1.0986123, …
## $ tfidf_title_a <dbl> 0.000000, 0.000000, 1.629048, 0.000000, 0.00…
## $ tfidf_title_act <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_after <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_age <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `tfidf_title_ain't` <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_air <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_all <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_alley <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_always <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_america <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_an <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_anathem <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_and <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_anger <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_appetites <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_arthas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_at <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_atlas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_b <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_baby <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_basket <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_beautifully <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_beauty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_before <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_bennett <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_between <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_beyond <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_bitterroots <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_blessings <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_blue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_blues <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_bond <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_book <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_borne <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_brayford <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_brothers <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_burgoyne <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_bye <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_case <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_castle <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_catch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_centennial <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_challenge <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_chance <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_chances <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_chaos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cheap <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_children <dbl> 1.672427, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_christmas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_circle <dbl> 0.00000, 0.00000, 0.00000, 5.01728, 0.00000,…
## $ tfidf_title_city <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_clancy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cleft <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_cold <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_companion <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_condominium <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_consul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cosmopolitans <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_country <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_coyle <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cradle <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dangerous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dark <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_darkness <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dating <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_daughter <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `tfidf_title_day's` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_days <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dead <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_deals <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_deeper <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_descending <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_desire <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_devolution <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_diana <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000,…
## $ tfidf_title_dirt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_distant <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_doing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_doll <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dragon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_edge <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_from <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_game <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_girl <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_hour <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_i <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_in <dbl> 0.000000, 1.629048, 0.000000, 0.000000, 0.00…
## $ tfidf_title_lord <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_my <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_of <dbl> 0.6374625, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_once <dbl> 0.000000, 0.000000, 2.165367, 0.000000, 0.00…
## $ tfidf_title_prey <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_shot <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_sisters <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_so <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_the <dbl> 0.4831578, 0.7247366, 0.0000000, 0.0000000, …
## $ tfidf_title_to <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_well <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_woman <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_you <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_year <int> 2007, 2015, 2009, 2017, 2016, 1986, 1947, 19…
## $ author_Danielle.Steel <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ author_other <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ first_week_dow_Sun <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,…
## $ first_week_dow_Mon <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Tue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Wed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Thu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Fri <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_dow_Sat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Jan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Feb <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,…
## $ first_week_month_Mar <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Apr <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_May <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Jun <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Jul <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Aug <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ first_week_month_Sep <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ first_week_month_Oct <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,…
## $ first_week_month_Nov <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ first_week_month_Dec <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Specify model
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# Tune hyperparameters
set.seed(100)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5)
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 1072 15 7 0.00750 rmse standard 0.587 10 0.0500
## 2 1605 7 3 0.00214 rmse standard 0.605 10 0.0411
## 3 676 30 12 0.0160 rmse standard 0.637 10 0.0518
## 4 306 34 6 0.159 rmse standard 0.671 10 0.0574
## 5 1253 19 14 0.0697 rmse standard 0.700 10 0.0547
## # ℹ 1 more variable: .config <chr>