library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ recipes      1.0.8
## ✔ dials        1.2.0     ✔ rsample      1.2.0
## ✔ dplyr        1.1.3     ✔ tibble       3.2.1
## ✔ ggplot2      3.4.3     ✔ tidyr        1.3.0
## ✔ infer        1.0.5     ✔ tune         1.1.2
## ✔ modeldata    1.2.0     ✔ workflows    1.1.3
## ✔ parsnip      1.1.1     ✔ workflowsets 1.0.1
## ✔ purrr        1.0.2     ✔ yardstick    1.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(usemodels)
library(textrecipes)
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
# Download the TidyTuesday (2022-05-10) NYT titles file, which is
# tab-separated, and parse it with readr.
nyt_titles <- readr::read_tsv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv"
)
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Question and Data

How many weeks is a book predicted to stay on the best-seller list?

The dataset lists best-selling books according to The New York Times. It includes each book's title, its author, the total number of weeks it spent on the best-seller list, its best rank on the list, and the rank at which it debuted. It also includes the year, the first week the book appeared on the list, and a book id.

The key variable in this dataset is the total number of weeks a book spent on the best-seller list (`total_weeks`); other important variables are the book ids and titles. These matter for this question because the ids and titles distinguish the books from one another, and the total weeks on the list is the quantity we want to predict.

Data Exploration and Transformation

# Condensed data set: keep `id` plus the run of columns from `year` through
# `best_rank`.
nyt_titles_cond <- nyt_titles %>%
    select(id, year:best_rank)

# Scatter plot of total weeks on the list against book id, with both axes on
# a log10 scale. The geom layer is required: with only aesthetics and scales,
# ggplot draws an empty panel and no data.
nyt_titles_cond %>%
    ggplot(aes(id, total_weeks)) +
    geom_point(alpha = 0.5) +
    scale_y_log10() +
    scale_x_log10()

Data Preparation and Modeling

# Seed the RNG *before* drawing the subsample. Without this the 100 rows, and
# therefore the split, the resamples and every downstream result, would
# differ on each run.
set.seed(100)

# Work on a random subsample of 100 books. `slice_sample()` is the current
# dplyr replacement for the superseded `sample_n()`.
nyt_titles_cond <- slice_sample(nyt_titles_cond, n = 100)

# Train/test split (default proportion), stratified on the outcome.
set.seed(123)
nyt_titles_split <- initial_split(nyt_titles_cond, strata = total_weeks)
nyt_titles_train <- training(nyt_titles_split)
nyt_titles_test <- testing(nyt_titles_split)

# Bootstrap resamples of the training set (default number of resamples),
# stratified on the outcome; used later for tuning.
set.seed(234)
nyt_titles_folds <- bootstraps(nyt_titles_train, strata = total_weeks)
# Recipe: model `total_weeks` as a function of every other column in the
# training data. No preprocessing steps are added.
ranger_recipe <- recipe(total_weeks ~ ., data = nyt_titles_train)

# Model: random forest for regression fitted with the ranger engine.
# `mtry` and `min_n` are left as tuning placeholders; the forest size is
# fixed at 1000 trees.
ranger_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
    set_mode(mode = "regression") %>%
    set_engine(engine = "ranger")

# Workflow: bundle the recipe and the model specification together.
ranger_workflow <- workflow(
    preprocessor = ranger_recipe,
    spec = ranger_spec
)

# Tune `mtry` and `min_n` over the bootstrap resamples.
# The seed is set first so the generated candidate grid is reproducible; a
# parallel backend is then registered before tuning starts.
set.seed(8577)
doParallel::registerDoParallel()

# `grid = 11` asks tune to generate 11 candidate parameter combinations.
ranger_tune <- ranger_workflow %>%
    tune_grid(resamples = nyt_titles_folds, grid = 11)
## i Creating pre-processing data to finalize unknown parameter: mtry

Model Evaluation

Conclusion