data <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')

data %>% skimr::skim()
Data summary
Name Piped data
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂

Clean Data

Notes

data <- data %>%
     
     # Convert character to factor
     mutate(across(where(is.character), factor)) %>%
    
    # Remove missing values
    filter(!is.na(author))

Explore Data

*** Identify variables with correlation with the target variable.***

* Total weeks

data %>% count(total_weeks)
## # A tibble: 90 × 2
##    total_weeks     n
##          <dbl> <int>
##  1           1  1684
##  2           2   816
##  3           3   700
##  4           4   599
##  5           5   478
##  6           6   406
##  7           7   298
##  8           8   287
##  9           9   198
## 10          10   202
## # ℹ 80 more rows

Debut rank

data %>% 
    ggplot(aes(total_weeks, debut_rank)) +
    geom_point(color= "steelblue") +
  geom_smooth(method = "lm", 
              formula = y ~ poly(x, 2), 
              color = "indianred3")

Best rank

data %>% 
    ggplot(aes(total_weeks, best_rank)) +
    geom_point(color= "steelblue") +
  geom_smooth(color = "tomato")

We will use debut rank as a predictor

Tune on xgboost model

set.seed(123)
data_split <- data %>%
    initial_split(strata = total_weeks)

data_train <- training(data_split)

data_test <- testing(data_split)

set.seed(156)
data_fold <- vfold_cv(data_train, strata = total_weeks)
data_fold
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5010/559]> Fold01
##  2 <split [5011/558]> Fold02
##  3 <split [5011/558]> Fold03
##  4 <split [5011/558]> Fold04
##  5 <split [5012/557]> Fold05
##  6 <split [5013/556]> Fold06
##  7 <split [5013/556]> Fold07
##  8 <split [5013/556]> Fold08
##  9 <split [5013/556]> Fold09
## 10 <split [5014/555]> Fold10
split_category <- function(x) {
    x %>%
        str_split(", ") %>%
        map(str_remove_all, "[:punct:]") %>%
        map(str_squish) %>%
        map(str_to_lower) %>%
        map(str_replace_all, " ", "_")
}

data_rec <-
    recipe(total_weeks ~ ., data = data_train) %>%
    step_tokenize(debut_rank, custom_token = split_category) %>%
    step_tokenfilter(debut_rank, max_tokens = 30) %>%
    step_tf(debut_rank)
xgb_spec <-
  boost_tree(
    trees = tune(),
    mtry = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

xgb_wf <- workflows::workflow(data_rec, xgb_spec)
xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
##   learn_rate = 0.01
## 
## Computational engine: xgboost