Goal: To predict the total weeks on Bestsellers list (total_weeks). Click here for the data.
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): title, author
## dbl (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
# Treat Missing Values
na.omit() %>%
# Log Transform Variables with Pos-skewed Distribution
mutate(total_weeks = log(total_weeks))
skimr::skim(data)
Name | data |
Number of rows | 7427 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 55 | 0 | 7168 | 0 |
author | 0 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-07-02 | 3346 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3716.06 | 2145.23 | 0 | 1858.50 | 3717.00 | 5573.5 | 7430.00 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.63 | 26.21 | 1931 | 1968.00 | 2000.00 | 2011.0 | 2020.00 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 1.48 | 1.11 | 0 | 0.69 | 1.39 | 2.3 | 5.18 | ▇▇▆▂▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.00 | 8.00 | 12.0 | 17.00 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.92 | 4.57 | 1 | 3.00 | 6.00 | 10.5 | 17.00 | ▇▅▃▃▂ |
Identify good predictors.
Year
data %>%
ggplot(aes(total_weeks, year)) +
scale_y_log10() +
geom_point()
First Week on Bestseller List
data %>%
# Extract date features from first_week
mutate(year = lubridate::year(first_week),
month = lubridate::month(first_week, label = TRUE),
weekday = lubridate::wday(first_week, label = TRUE)) %>%
select(-first_week) %>%
ggplot(aes(total_weeks, month)) +
geom_point()
data %>%
# Extract date features from first_week
mutate(year = lubridate::year(first_week),
month = lubridate::month(first_week, label = TRUE),
weekday = lubridate::wday(first_week, label = TRUE)) %>%
#select(-first_week) %>%
ggplot(aes(weekday, total_weeks)) +
geom_point()
Debut Rank
data %>%
ggplot(aes(total_weeks, debut_rank)) +
scale_y_log10()+
geom_point()
data %>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
data %>%
ggplot(aes(debut_rank, as.factor(total_weeks))) +
geom_point()
Best Rank
data %>%
ggplot(aes(total_weeks, best_rank)) +
scale_y_log10()+
geom_point()
Author
data %>%
# Tokenize Author
unnest_tokens(output = Author, input = author) %>%
# Calculate avg number of weeks by author
group_by(Author) %>%
summarise(total_weeks = mean(total_weeks),
n = n()) %>%
ungroup() %>%
filter(n > 10) %>%
slice_max(order_by = Author, n = 20) %>%
# Plot
ggplot(aes(total_weeks, fct_reorder(Author, total_weeks))) +
geom_point() +
labs(y = "")
# Step 1: Prepare data
data_binarized <- data %>%
select(-title, -year, -id) %>%
# Extract date features from first_week
mutate(year = lubridate::year(first_week),
month = lubridate::month(first_week, label = TRUE),
weekday = lubridate::wday(first_week, label = TRUE)) %>%
select(-first_week) %>%
binarize()
data_binarized %>% glimpse()
## Rows: 7,427
## Columns: 32
## $ author__Danielle_Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `total_weeks__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ month__01 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__02 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ month__03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__04 <dbl> 0, 1, 0, 0, 0, 0, 1, 0…
## $ month__05 <dbl> 1, 0, 1, 1, 0, 0, 0, 1…
## $ month__06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__07 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__08 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__09 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ month__12 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ weekday__Sun <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ weekday__Mon <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
# Step 2: Correlate
data_corr <- data_binarized %>%
correlate(total_weeks__2.30258509299405_Inf)
data_corr
## # A tibble: 32 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 total_weeks 2.30258509299405_Inf 1
## 2 total_weeks -Inf_0.693147180559945 -0.397
## 3 best_rank -Inf_3 0.344
## 4 total_weeks 1.38629436111989_2.30258509299405 -0.323
## 5 best_rank 10.5_Inf -0.314
## 6 total_weeks 0.693147180559945_1.38629436111989 -0.256
## 7 year 1968_2000 0.242
## 8 year 2011_Inf -0.235
## 9 year 2000_2011 -0.230
## 10 year -Inf_1968 0.217
## # ℹ 22 more rows
# Step 3: Plot
data_corr %>%
plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
data_corr %>% glimpse
## Rows: 32
## Columns: 3
## $ feature <fct> total_weeks, total_weeks, best_rank, total_weeks, best_ran…
## $ bin <chr> "2.30258509299405_Inf", "-Inf_0.693147180559945", "-Inf_3"…
## $ correlation <dbl> 1.00000000, -0.39682290, 0.34356819, -0.32304526, -0.31361…
Split data
data <- sample_n(data, 200)
# Split into training and testing data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training data set for cross-validation
# Use rsample::bootstraps(data_train) instead of vfold_cv(data_train) for data sets under 1000
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [135/15]> Fold01
## 2 <split [135/15]> Fold02
## 3 <split [135/15]> Fold03
## 4 <split [135/15]> Fold04
## 5 <split [135/15]> Fold05
## 6 <split [135/15]> Fold06
## 7 <split [135/15]> Fold07
## 8 <split [135/15]> Fold08
## 9 <split [135/15]> Fold09
## 10 <split [135/15]> Fold10
usemodels::use_xgboost(total_weeks ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = total_weeks ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(57769)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify recipe
xgboost_recipe <-
recipe(formula = total_weeks ~ ., data = data_train) %>%
recipes::update_role(id, new_role = "id") %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tfidf(title) %>%
step_date(first_week, keep_original_cols = FALSE) %>%
step_other(author) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_log(debut_rank, best_rank)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 150
## Columns: 127
## $ id <dbl> 5745, 5270, 3661, 359, 4488, 3398, 2463, 2361…
## $ year <dbl> 1964, 2020, 1958, 2018, 1988, 1942, 2001, 198…
## $ debut_rank <dbl> 2.0794415, 2.1972246, 2.4849066, 1.9459101, 2…
## $ best_rank <dbl> 0.6931472, 1.3862944, 2.3978953, 2.7080502, 2…
## $ total_weeks <dbl> 3.6375862, 1.6094379, 1.3862944, 0.0000000, 0…
## $ tfidf_title_1225 <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_79 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_8th <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_a <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_academic <dbl> 0.00000, 0.00000, 5.01728, 0.00000, 0.00000, …
## $ tfidf_title_advances <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_adventures <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_advocate <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_affair <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_again <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_air <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_alexandria <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_alice <dbl> 0.00000, 0.00000, 0.00000, 2.50864, 0.00000, …
## $ tfidf_title_alive <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_aloft <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_alone <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_and <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_angels <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_are <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_around <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_associate <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_at <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_auntie <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_back <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_balliois <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_beast <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_beauty <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bedtime <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_beet <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_belgravia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_belong <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_better <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_birthday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blane <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bloodline <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blowout <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bolitho <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bone <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_book <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bookshop <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_breakdown <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_brought <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_by <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cade <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_camerons <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_captive <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_caroline <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_castle <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_caught <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_century <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_ceremony <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cezanne <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_changes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_chasing <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cheever <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cherie <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tfidf_title_children's` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_christmas <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_christy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_city <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_clancy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_clock <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_come <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_comes <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_confession <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_contact <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_correct <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tfidf_title_couldn't` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_crash <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_crimson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cross <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_dark <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_day <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dead <dbl> 0.000000, 0.000000, 0.000000, 2.165367, 0.000…
## $ tfidf_title_death <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_evil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_force <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_happy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_heart <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_high <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_house <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_in <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_innocent <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_jewels <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_king <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_man <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_my <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_not <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_of <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_title_on <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_one <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_or <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_stories <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_street <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_the <dbl> 0.6263815, 1.2527630, 0.0000000, 0.0000000, 0…
## $ tfidf_title_to <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_we <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_with <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_year <int> 1964, 2020, 1958, 2018, 1988, 1942, 2001, 198…
## $ author_Danielle.Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ first_week_dow_Sun <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ first_week_dow_Mon <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Tue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Wed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Thu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Fri <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_dow_Sat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Jan <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, …
## $ first_week_month_Feb <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Mar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Apr <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_May <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ first_week_month_Jun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ first_week_month_Jul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Aug <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Sep <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_week_month_Oct <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ first_week_month_Nov <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ first_week_month_Dec <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
# Specify model
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# Tune hyperparameters
set.seed(100)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5)
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 1605 7 3 0.00214 rmse standard 0.556 10 0.0287
## 2 1072 15 7 0.00750 rmse standard 0.681 10 0.0192
## 3 676 30 12 0.0160 rmse standard 0.730 10 0.0226
## 4 306 34 6 0.159 rmse standard 0.733 10 0.0301
## 5 1253 19 14 0.0697 rmse standard 0.854 10 0.0405
## # ℹ 1 more variable: .config <chr>
set.seed(123)
bestsellers_split <-
nyt_titles %>%
transmute(
author,
total_weeks = if_else(total_weeks > 4, "long", "short")
) %>%
na.omit() %>%
initial_split(strata = total_weeks)
bestsellers_train <- training(bestsellers_split)
bestsellers_test <- testing(bestsellers_split)
set.seed(234)
bestsellers_folds <- vfold_cv(bestsellers_train, strata = total_weeks)
bestsellers_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [5012/558]> Fold01
## 2 <split [5013/557]> Fold02
## 3 <split [5013/557]> Fold03
## 4 <split [5013/557]> Fold04
## 5 <split [5013/557]> Fold05
## 6 <split [5013/557]> Fold06
## 7 <split [5013/557]> Fold07
## 8 <split [5013/557]> Fold08
## 9 <split [5013/557]> Fold09
## 10 <split [5014/556]> Fold10
bestsellers_train %>% count(total_weeks)
## # A tibble: 2 × 2
## total_weeks n
## <chr> <int>
## 1 long 2721
## 2 short 2849
library(wordpiece)
## Warning: package 'wordpiece' was built under R version 4.4.1
svm_spec <- svm_linear(mode = "classification")
bestsellers_rec <-
recipe(total_weeks ~ author, data = bestsellers_train) %>%
step_tokenize_wordpiece(author, max_chars = 10) %>%
step_tokenfilter(author, max_tokens = 100) %>%
step_tf(author) %>%
step_normalize(all_numeric_predictors())
prep(bestsellers_rec) %>% bake(new_data = NULL) %>% glimpse()
## Rows: 5,570
## Columns: 101
## $ total_weeks <fct> long, long, long, long, long, long, long, long, lo…
## $ `tf_author_'` <dbl> -0.08715687, -0.08715687, -0.08715687, -0.08715687…
## $ `tf_author_##a` <dbl> -0.09419984, -0.09419984, -0.09419984, -0.09419984…
## $ `tf_author_##ac` <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ `tf_author_##ci` <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ `tf_author_##e` <dbl> -0.1406038, -0.1406038, -0.1406038, -0.1406038, -0…
## $ `tf_author_##er` <dbl> -0.1458252, -0.1458252, -0.1458252, -0.1458252, -0…
## $ `tf_author_##es` <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ `tf_author_##ford` <dbl> -0.08396368, -0.08396368, -0.08396368, -0.08396368…
## $ `tf_author_##in` <dbl> -0.07951265, -0.07951265, -0.07951265, -0.07951265…
## $ `tf_author_##l` <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ `tf_author_##m` <dbl> -0.09024042, -0.09024042, -0.09024042, -0.09024042…
## $ `tf_author_##man` <dbl> -0.1193075, -0.1193075, -0.1193075, -0.1193075, -0…
## $ `tf_author_##n` <dbl> -0.1199358, -0.1199358, -0.1199358, -0.1199358, -0…
## $ `tf_author_##ne` <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ `tf_author_##ont` <dbl> -0.08819633, -0.08819633, -0.08819633, -0.08819633…
## $ `tf_author_##ovich` <dbl> -0.07614065, -0.07614065, -0.07614065, -0.07614065…
## $ `tf_author_##s` <dbl> -0.1310066, -0.1310066, -0.1310066, -0.1310066, -0…
## $ `tf_author_##sen` <dbl> -0.07409856, -0.07409856, -0.07409856, -0.07409856…
## $ `tf_author_##ssler` <dbl> -0.09652724, -0.09652724, -0.09652724, -0.09652724…
## $ `tf_author_##well` <dbl> -0.07951265, -0.07951265, -0.07951265, -0.07951265…
## $ `tf_author_##y` <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ `tf_author_##z` <dbl> -0.1111617, -0.1111617, -0.1111617, -0.1111617, -0…
## $ tf_author_. <dbl> -0.3243985, -0.3243985, -0.3243985, -0.3243985, -0…
## $ `tf_author_[UNK]` <dbl> -0.1419488, -0.1419488, -0.1419488, -0.1419488, -0…
## $ tf_author_a <dbl> -0.1207399, -0.1207399, -0.1207399, -0.1207399, -0…
## $ tf_author_alice <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ tf_author_and <dbl> -0.227056, -0.227056, -0.227056, -0.227056, -0.227…
## $ tf_author_ann <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ tf_author_anne <dbl> -0.1111617, -0.1111617, -0.1111617, -0.1111617, -0…
## $ tf_author_b <dbl> -0.1406038, -0.1406038, -0.1406038, -0.1406038, -0…
## $ tf_author_bald <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ tf_author_barbara <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ tf_author_brown <dbl> -0.1019979, -0.1019979, -0.1019979, 8.1317469, -0.…
## $ tf_author_by <dbl> -0.08310031, -0.08310031, -0.08310031, -0.08310031…
## $ tf_author_c <dbl> -0.09893404, -0.09893404, -0.09893404, -0.09893404…
## $ tf_author_child <dbl> -0.07732059, -0.07732059, -0.07732059, -0.07732059…
## $ tf_author_clark <dbl> -0.09274912, -0.09274912, -0.09274912, -0.09274912…
## $ tf_author_clive <dbl> -0.1034598, -0.1034598, -0.1034598, -0.1034598, -0…
## $ tf_author_co <dbl> -0.08504102, -0.08504102, -0.08504102, -0.08504102…
## $ tf_author_cr <dbl> -0.08610525, -0.08610525, -0.08610525, -0.08610525…
## $ tf_author_cu <dbl> -0.09652724, -0.09652724, -0.09652724, -0.09652724…
## $ tf_author_d <dbl> -0.1034598, -0.1034598, -0.1034598, -0.1034598, -0…
## $ tf_author_danielle <dbl> -0.1266872, -0.1266872, -0.1266872, -0.1266872, -0…
## $ tf_author_david <dbl> -0.1222252, -0.1222252, -0.1222252, -0.1222252, -0…
## $ tf_author_dean <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_e <dbl> -0.1185213, -0.1185213, -0.1185213, -0.1185213, -0…
## $ tf_author_elizabeth <dbl> -0.1136183, -0.1136183, -0.1136183, -0.1136183, -0…
## $ tf_author_evan <dbl> -0.0796298, -0.0796298, -0.0796298, -0.0796298, -0…
## $ tf_author_f <dbl> -0.09985495, -0.09985495, -0.09985495, -0.09985495…
## $ tf_author_frank <dbl> -0.08819633, -0.08819633, 11.33630484, -0.08819633…
## $ tf_author_gr <dbl> -0.09224066, -0.09224066, -0.09224066, -0.09224066…
## $ tf_author_griffin <dbl> -0.08610525, -0.08610525, -0.08610525, -0.08610525…
## $ tf_author_higgins <dbl> -0.1120983, -0.1120983, -0.1120983, -0.1120983, -0…
## $ tf_author_howard <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_j <dbl> -0.1646265, -0.1646265, -0.1646265, -0.1646265, -0…
## $ tf_author_james <dbl> -0.196587, -0.196587, -0.196587, -0.196587, -0.196…
## $ tf_author_jan <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_janet <dbl> -0.08610525, -0.08610525, -0.08610525, -0.08610525…
## $ tf_author_jeff <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_john <dbl> -0.2093925, -0.2093925, -0.2093925, -0.2093925, -0…
## $ tf_author_jonathan <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ tf_author_judith <dbl> -0.07951265, -0.07951265, -0.07951265, -0.07951265…
## $ tf_author_k <dbl> -0.1191634, -0.1191634, -0.1191634, -0.1191634, -0…
## $ tf_author_keller <dbl> -0.0851383, -0.0851383, -0.0851383, -0.0851383, -0…
## $ tf_author_ken <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ tf_author_king <dbl> -0.09991709, -0.09991709, -0.09991709, -0.09991709…
## $ tf_author_ko <dbl> -0.09322522, -0.09322522, -0.09322522, -0.09322522…
## $ tf_author_l <dbl> -0.08715687, -0.08715687, -0.08715687, -0.08715687…
## $ tf_author_la <dbl> -0.08064783, -0.08064783, -0.08064783, -0.08064783…
## $ tf_author_lee <dbl> -0.09124584, -0.09124584, -0.09124584, -0.09124584…
## $ tf_author_lisa <dbl> -0.09124584, -0.09124584, -0.09124584, -0.09124584…
## $ tf_author_louis <dbl> -0.0771935, -0.0771935, -0.0771935, -0.0771935, -0…
## $ tf_author_ma <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_mac <dbl> -0.08396368, -0.08396368, -0.08396368, -0.08396368…
## $ tf_author_mary <dbl> -0.1358838, -0.1358838, -0.1358838, -0.1358838, -0…
## $ tf_author_mc <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_michael <dbl> -0.1237294, -0.1237294, -0.1237294, -0.1237294, -0…
## $ tf_author_nora <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_o <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_parker <dbl> -0.08287274, -0.08287274, -0.08287274, -0.08287274…
## $ tf_author_patterson <dbl> -0.1392703, -0.1392703, -0.1392703, -0.1392703, -0…
## $ tf_author_paul <dbl> -0.09706705, -0.09706705, -0.09706705, -0.09706705…
## $ tf_author_r <dbl> -0.1132612, -0.1132612, -0.1132612, -0.1132612, -0…
## $ tf_author_richard <dbl> -0.1160255, -0.1160255, -0.1160255, -0.1160255, -0…
## $ tf_author_robert <dbl> -0.1605555, -0.1605555, -0.1605555, -0.1605555, -0…
## $ tf_author_roberts <dbl> -0.08287274, -0.08287274, -0.08287274, -0.08287274…
## $ tf_author_s <dbl> -0.09087668, -0.09087668, -0.09087668, -0.09087668…
## $ tf_author_sand <dbl> -0.07836144, -0.07836144, -0.07836144, -0.07836144…
## $ tf_author_scott <dbl> -0.08715687, -0.08715687, -0.08715687, -0.08715687…
## $ tf_author_smith <dbl> -0.09124584, -0.09124584, -0.09124584, -0.09124584…
## $ tf_author_steel <dbl> -0.1259539, -0.1259539, -0.1259539, -0.1259539, -0…
## $ tf_author_stephen <dbl> -0.1199358, -0.1199358, -0.1199358, -0.1199358, -0…
## $ tf_author_stuart <dbl> -0.1007678, -0.1007678, -0.1007678, -0.1007678, -0…
## $ tf_author_taylor <dbl> -0.08396368, -0.08396368, -0.08396368, -0.08396368…
## $ tf_author_terry <dbl> -0.08504102, -0.08504102, -0.08504102, -0.08504102…
## $ tf_author_thomas <dbl> -0.08176765, -0.08176765, -0.08176765, -0.08176765…
## $ tf_author_tom <dbl> -0.07951265, -0.07951265, -0.07951265, -0.07951265…
## $ tf_author_w <dbl> -0.1078035, -0.1078035, -0.1078035, -0.1078035, -0…
## $ tf_author_william <dbl> -0.1152284, -0.1152284, -0.1152284, -0.1152284, -0…
## $ tf_author_woods <dbl> -0.09800482, -0.09800482, -0.09800482, -0.09800482…