Goal: predict how many total weeks a book stays on the New York Times bestseller list, using the TidyTuesday 2022-05-10 NYT bestsellers data.
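The packages are loaded as needed below; if you want to run everything top to bottom, a setup chunk along these lines (assuming the usual tidyverse/tidymodels stack) covers the functions used in this post:

library(tidyverse)         # dplyr, tidyr, ggplot2, forcats, readr
library(tidytext)          # unnest_tokens()
library(correlationfunnel) # binarize(), correlate(), plot_correlation_funnel()
library(tidymodels)        # rsample, recipes, tune, workflows, yardstick
library(textrecipes)       # step_tokenize(), step_tokenfilter(), step_tfidf()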
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): title, author
## dbl (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
| Name | nyt_titles |
|---|---|
| Number of rows | 7431 |
| Number of columns | 8 |
| Column type frequency: | |
| character | 2 |
| Date | 1 |
| numeric | 5 |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
| author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
| year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
| total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
| debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
| best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
  na.omit() %>%                                # drop the few titles with a missing author
  mutate(total_weeks = log(total_weeks)) %>%   # log-transform the right-skewed outcome
  separate_rows(author, sep = "and | with")    # split co-authored titles into one row per author
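Since separate_rows() expands co-authored titles into one row per author, a quick sanity check on the row counts (the 7431 comes from the skim above; the 7835 reappears in the binarize() output below):

nrow(nyt_titles)   # 7431 titles
nrow(data)         # 7835 rows after dropping missing authors and splitting co-authors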
Debut Rank
data %>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
Best Rank
data %>%
ggplot(aes(total_weeks, as.factor(best_rank))) +
geom_boxplot()
data %>%
ggplot(aes(year, total_weeks)) +
geom_col()
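Note that geom_col() stacks the log-weeks within each year, so bar height partly reflects how many titles debuted that year. A hedged alternative that plots the per-year mean instead:

data %>%
  group_by(year) %>%
  summarise(mean_log_weeks = mean(total_weeks)) %>%
  ggplot(aes(year, mean_log_weeks)) +
  geom_line()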
data %>%
  unnest_tokens(output = word, input = author) %>%
  group_by(word) %>%
  summarise(total_weeks = mean(total_weeks),
            n = n()) %>%
  ungroup() %>%
  filter(n > 20) %>%
  slice_max(order_by = total_weeks, n = 20) %>%
  ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
  geom_point() +
  labs(y = "Author's Name")
EDA Shortcut
data <- data %>%
  select(-first_week)   # drop the date column before binarizing
data_binarized_tbl <- data %>%
select(-id, -title) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,835
## Columns: 19
## $ author__Danielle_Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ author__James_Patterson <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `year__-Inf_1971` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1971_2001 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2001_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
data_corr_tbl <- data_binarized_tbl %>%
correlate(total_weeks__2.30258509299405_Inf)
data_corr_tbl
## # A tibble: 19 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 total_weeks 2.30258509299405_Inf 1
## 2 total_weeks -Inf_0.693147180559945 -0.389
## 3 best_rank -Inf_3 0.335
## 4 total_weeks 1.38629436111989_2.30258509299405 -0.320
## 5 best_rank 10_Inf -0.306
## 6 total_weeks 0.693147180559945_1.38629436111989 -0.256
## 7 year -Inf_1971 0.245
## 8 year 2011_Inf -0.241
## 9 year 2001_2011 -0.219
## 10 year 1971_2001 0.208
## 11 best_rank 6_10 -0.128
## 12 best_rank 3_6 0.0752
## 13 author James_Patterson -0.0417
## 14 debut_rank 4_8 -0.0321
## 15 author -OTHER 0.0240
## 16 debut_rank 8_12 0.0180
## 17 debut_rank -Inf_4 0.0152
## 18 author Danielle_Steel 0.00567
## 19 debut_rank 12_Inf -0.000531
data_corr_tbl %>%
plot_correlation_funnel()
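By construction, the outcome's own bins dominate the top of the funnel; one optional tweak (not in the original) is to drop them before plotting so the predictors get the vertical space:

data_corr_tbl %>%
  filter(feature != "total_weeks") %>%
  plot_correlation_funnel()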
# data <- sample_n(data, 100)   # optional: downsample for a quick test run
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [5288/588]> Fold01
## 2 <split [5288/588]> Fold02
## 3 <split [5288/588]> Fold03
## 4 <split [5288/588]> Fold04
## 5 <split [5288/588]> Fold05
## 6 <split [5288/588]> Fold06
## 7 <split [5289/587]> Fold07
## 8 <split [5289/587]> Fold08
## 9 <split [5289/587]> Fold09
## 10 <split [5289/587]> Fold10
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.4.2
usemodels::use_xgboost(total_weeks ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = total_weeks ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(49873)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <-
  recipe(formula = total_weeks ~ ., data = data_train) %>%
  recipes::update_role(id, new_role = "id variable") %>%  # keep id around, but out of the model
  step_other(title, author) %>%         # lump rare levels into "other"; since most titles are unique,
                                        # nearly all collapse, hence the max_tokens warnings below
  step_tokenize(title) %>%              # tokenize what remains of title
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%                 # tf-idf features from the title tokens
  step_dummy(author, one_hot = TRUE) %>%
  step_YeoJohnson(year, debut_rank)     # tame skew in the numeric predictors
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Warning: max_tokens was set to 100, but only 2 was available and selected.
## Rows: 5,876
## Columns: 9
## $ id <dbl> 7361, 7110, 7203, 1849, 1528, 7017, 3309, 1792, …
## $ year <dbl> 1960, 2005, 2013, 2004, 2006, 2017, 2011, 2018, …
## $ debut_rank <dbl> 7.5904361, 4.6228006, 5.0384401, 0.8626939, 0.86…
## $ best_rank <dbl> 15, 4, 6, 9, 16, 4, 1, 15, 1, 2, 14, 2, 1, 10, 1…
## $ total_weeks <dbl> 0.0000000, 1.7917595, 0.6931472, 0.0000000, 0.00…
## $ tfidf_title_burn <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_other <dbl> 0.6934877, 0.6934877, 0.6934877, 0.6934877, 0.69…
## $ author_Danielle.Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(344)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5)
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: max_tokens was set to 100, but only 2 was available and selected.
## Warning: package 'xgboost' was built under R version 4.4.2
## → A | warning: max_tokens was set to 100, but only 3 was available and selected.
## → B | warning: max_tokens was set to 100, but only 2 was available and selected.
## → C | warning: max_tokens was set to 100, but only 4 was available and selected.
## There were issues with some computations   A: x2   B: x7   C: x1
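Tuning 5 candidates across 10 folds refits xgboost 50 times, so this is the slow step. tune_grid() picks up a registered parallel backend automatically; a minimal setup, assuming doParallel is installed:

library(doParallel)
registerDoParallel(cores = parallel::detectCores() - 1)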
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
## mtry trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 3 1104 28 0.00484 rmse standard 0.539 10 0.00990 Preproces…
## 2 1 1613 36 0.0290 rmse standard 0.550 10 0.0102 Preproces…
## 3 6 1524 23 0.0836 rmse standard 0.561 10 0.00908 Preproces…
## 4 5 768 12 0.112 rmse standard 0.561 10 0.00929 Preproces…
## 5 6 162 7 0.00108 rmse standard 1.26 10 0.00815 Preproces…
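Before finalizing, it can help to see how RMSE moves with each tuning parameter; the tuning results have an autoplot() method (plot not shown here):

xgboost_tune %>% autoplot(metric = "rmse")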
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse"))
data_fit <- tune::last_fit(xgboost_fw, data_split)
## → A | warning: max_tokens was set to 100, but only 2 was available and selected.
## There were issues with some computations   A: x1
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.546 Preprocessor1_Model1
## 2 rsq standard 0.760 Preprocessor1_Model1
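A natural follow-up, not part of the original post, is to ask which features the boosted trees actually lean on, e.g. via the vip package:

library(vip)
data_fit %>%
  extract_workflow() %>%      # fitted workflow from last_fit()
  extract_fit_parsnip() %>%   # underlying xgboost fit
  vip(num_features = 10)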
tune::collect_predictions(data_fit) %>%
  ggplot(aes(total_weeks, .pred)) +
  geom_point(alpha = 0.3, colour = "midnightblue") +  # colour, not fill, for the default point shape
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()
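Keep in mind total_weeks was log-transformed up front, so this plot and the RMSE above are on the log scale; exp() recovers actual weeks if you want an error in interpretable units, e.g.:

tune::collect_predictions(data_fit) %>%
  mutate(weeks = exp(total_weeks),      # back-transform the truth
         pred_weeks = exp(.pred)) %>%   # and the predictions
  yardstick::rmse(weeks, pred_weeks)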