Apply 1: NY Times bestsellers

NY Times bestsellers list: Build a regression model to predict total weeks on best sellers list (total_weeks). Use the nyt_titles dataset.

Import Data

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')

## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean data

outliers in the Target variable: remove movies with total weeks > 100
skewed distribution of the target variable: log transformation log(total_weeks)
missing values in author: remove

nyt_titles %>% glimpse()

## Rows: 7,431
## Columns: 8
## $ id          <dbl> 0, 1, 10, 100, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1…
## $ title       <chr> "\"H\" IS FOR HOMICIDE", "\"I\" IS FOR INNOCENT", "''G'' I…
## $ author      <chr> "Sue Grafton", "Sue Grafton", "Sue Grafton", "W. Bruce Cam…
## $ year        <dbl> 1991, 1992, 1990, 2012, 2006, 2016, 1985, 1994, 2002, 1999…
## $ total_weeks <dbl> 15, 11, 6, 1, 1, 3, 16, 5, 4, 1, 3, 2, 11, 6, 9, 8, 1, 1, …
## $ first_week  <date> 1991-05-05, 1992-04-26, 1990-05-06, 2012-05-27, 2006-02-1…
## $ debut_rank  <dbl> 1, 14, 4, 3, 11, 1, 9, 7, 7, 12, 13, 5, 12, 2, 11, 13, 2, …
## $ best_rank   <dbl> 2, 2, 8, 14, 14, 7, 2, 10, 12, 17, 13, 13, 8, 5, 5, 11, 4,…

nyt_titles %>% skimr::skim()

Data summary
Name	Piped data
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

# Remove four rows with missing values in author
data <- 
    nyt_titles %>%
    
    # remove outliers in total weeks
    filter(total_weeks < 100) %>% 
    
    # Remove missing values
    filter(!is.na(author)) %>%
    
    # Log transform the target variable
    mutate(total_weeks = log(total_weeks)) %>%
    
    # Add a decade variable 
    mutate(decade = year %/% 10 * 10)

Explore Data

data %>% select(-id) %>% explore()
data %>% describe_all()
data %>% describe_cat(author)
data %>% select(-id) %>% explore_all(target = total_weeks)

author

data %>%
    
    group_by(author) %>%
    summarise(avg_total_weeks = mean(total_weeks)) %>%
    ungroup() %>%
    
    slice_max(order_by = avg_total_weeks, n = 20) %>%
    
    ggplot(aes(avg_total_weeks, fct_reorder(author, avg_total_weeks))) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "Top Authors by Total Weeks",
         x = "Avg. Total Weeks", y = NULL)

title

data %>% 
    tidytext::unnest_tokens(word, title) %>%
    anti_join(stop_words) %>%
  
    group_by(word) %>%
      summarise(
        n = n(),
        avg_total_weeks = mean(total_weeks)
      ) %>%
      ggplot(aes(n, avg_total_weeks)) +
      geom_hline(
        yintercept = mean(data$total_weeks), lty = 2,
        color = "gray50", size = 1.5
      ) +
      geom_jitter(color = "midnightblue", alpha = 0.7) +
      geom_text(aes(label = word),
        check_overlap = TRUE, family = "IBMPlexSans",
        vjust = "top", hjust = "left"
      ) +
      scale_x_log10() +
      scale_y_log10()

## Joining with `by = join_by(word)`

debut rank

No correlation found.

data %>%
    ggplot(aes(as.factor(debut_rank), total_weeks)) +
    geom_boxplot()

year

There seems to be a non-linear relationship between year and total weeks.

data %>%
    mutate(decade = year %/% 10 * 10) %>%
    ggplot(aes(as.factor(decade), total_weeks)) +
    geom_boxplot()

data_selected <- data %>%
    
    select(id, total_weeks, author, title, decade)

Build a Model

# data_selected <- sample_n(data_selected, 100)


set.seed(123)
data_split <- initial_split(data_selected, strata = total_weeks)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- bootstraps(data_train, strata = total_weeks)
data_folds

## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits              id         
##    <list>              <chr>      
##  1 <split [5560/2003]> Bootstrap01
##  2 <split [5560/2025]> Bootstrap02
##  3 <split [5560/2097]> Bootstrap03
##  4 <split [5560/2054]> Bootstrap04
##  5 <split [5560/2025]> Bootstrap05
##  6 <split [5560/2057]> Bootstrap06
##  7 <split [5560/2041]> Bootstrap07
##  8 <split [5560/2070]> Bootstrap08
##  9 <split [5560/2066]> Bootstrap09
## 10 <split [5560/2013]> Bootstrap10
## # ℹ 15 more rows

library(usemodels)
use_xgboost(total_weeks ~., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = total_weeks ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(24272)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

library(embed) # for step_lencode_glm()

xgboost_recipe <- 
  recipe(formula = total_weeks ~ ., data = data_train) %>%
    recipes::update_role(id, new_role = "id") %>%
    step_tokenize(title) %>%
    step_stopwords(title)%>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_lencode_glm(author, outcome = vars(total_weeks)) %>% # for author with many levels
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_normalize(decade)

xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 5,560
## Columns: 104
## $ id                    <dbl> 100, 1000, 1007, 1011, 1012, 1014, 1018, 1019, 1…
## $ author                <dbl> 3.465736e-01, 0.000000e+00, 6.931472e-01, 6.8252…
## $ decade                <dbl> 0.9579522, 0.5762542, 0.5762542, 0.9579522, 0.95…
## $ total_weeks           <dbl> 0.0000000, 0.0000000, 0.6931472, 0.0000000, 0.00…
## $ tfidf_title_affair    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_back      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_beach     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_black     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blood     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blue      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bones     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_book      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cat       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_christmas <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.0…
## $ tfidf_title_city      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cold      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_country   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cross     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dark      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_daughter  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_day       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_days      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dead      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_death     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_die       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dream     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000…
## $ tfidf_title_earth     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_end       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_eye       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_fall      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_family    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000…
## $ tfidf_title_fire      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_first     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_friends   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_game      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_girl      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_girls     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_god       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_golden    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_gone      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_good      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_great     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_heart     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_heat      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_heaven    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_high      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_home      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000…
## $ tfidf_title_honor     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_house     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_island    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_king      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_kiss      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_lady      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_last      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_life      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_light     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_little    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_long      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_lost      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_love      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_man       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_men       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_midnight  <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000…
## $ tfidf_title_moon      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_mountain  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_mr        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_mrs       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_murder    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000…
## $ tfidf_title_new       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_night     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000…
## $ tfidf_title_old       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_one       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_paradise  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_place     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_point     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_prey      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_private   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_red       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_return    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_river     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_road      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_rose      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_sea       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_second    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_secret    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_shadow    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_sky       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_son       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_song      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_star      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_stories   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_storm     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_story     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_summer    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_three     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_time      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_tree      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_two       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_wars      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_white     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_wife      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_winter    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_woman     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_world     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(15793)
doParallel::registerDoParallel()
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_folds, 
            grid = 5)

Explore Results

show_best(xgboost_tune, metric = "rmse")

## # A tibble: 5 × 8
##   trees min_n .metric .estimator  mean     n std_err .config             
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1   458    31 rmse    standard   0.972    25 0.00300 Preprocessor1_Model4
## 2   923    38 rmse    standard   0.976    25 0.00348 Preprocessor1_Model5
## 3    83    14 rmse    standard   0.977    25 0.00249 Preprocessor1_Model2
## 4  1637    20 rmse    standard   0.991    25 0.00445 Preprocessor1_Model3
## 5  1593     4 rmse    standard   1.04     25 0.00537 Preprocessor1_Model1

# How did all the possible parameter combinations do?
autoplot(xgboost_tune)

We can finalize our random forest workflow with the best performing parameters.

final_rf <- xgboost_workflow %>% 
    finalize_workflow(select_best(xgboost_tune, "rmse"))

The function last_fit() fits this finalized random forest one last time to the training data and evaluates one last time on the testing data.

data_fit <- last_fit(final_rf, data_split)
data_fit

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [5560/1855]> train/test split <tibble> <tibble> <tibble>     <workflow>

Evaluate model

collect_metrics(data_fit)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.940 Preprocessor1_Model1
## 2 rsq     standard       0.308 Preprocessor1_Model1

collect_predictions(data_fit)

## # A tibble: 1,855 × 5
##    id               .pred  .row total_weeks .config             
##    <chr>            <dbl> <int>       <dbl> <chr>               
##  1 train/test split 1.13      9       1.39  Preprocessor1_Model1
##  2 train/test split 1.30     10       0     Preprocessor1_Model1
##  3 train/test split 1.10     11       1.10  Preprocessor1_Model1
##  4 train/test split 1.30     13       2.40  Preprocessor1_Model1
##  5 train/test split 0.819    15       2.20  Preprocessor1_Model1
##  6 train/test split 0.707    19       2.48  Preprocessor1_Model1
##  7 train/test split 1.56     22       1.61  Preprocessor1_Model1
##  8 train/test split 1.31     31       1.61  Preprocessor1_Model1
##  9 train/test split 1.88     32       1.61  Preprocessor1_Model1
## 10 train/test split 0.654    39       0.693 Preprocessor1_Model1
## # ℹ 1,845 more rows

collect_predictions(data_fit) %>%
    ggplot(aes(total_weeks, .pred)) +
    geom_point(alpha = 0.3, fill = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") +
    coord_fixed()