# Echo the source code of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.3.2

library(correlationfunnel)

## Warning: package 'correlationfunnel' was built under R version 4.3.2

## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)

library(textrecipes)

## Warning: package 'textrecipes' was built under R version 4.3.2

## Loading required package: recipes
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.3.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.6     ✔ workflows    1.1.3
## ✔ modeldata    1.3.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.3.0

## Warning: package 'dials' was built under R version 4.3.2

## Warning: package 'scales' was built under R version 4.3.2

## Warning: package 'infer' was built under R version 4.3.2

## Warning: package 'modeldata' was built under R version 4.3.2

## Warning: package 'parsnip' was built under R version 4.3.2

## Warning: package 'tune' was built under R version 4.3.2

## Warning: package 'workflows' was built under R version 4.3.2

## Warning: package 'workflowsets' was built under R version 4.3.2

## Warning: package 'yardstick' was built under R version 4.3.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

library(xgboost)

## Warning: package 'xgboost' was built under R version 4.3.2

## 
## Attaching package: 'xgboost'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice

library(embed)

## Warning: package 'embed' was built under R version 4.3.2

library(ggplot2)

Goal: predict the total number of weeks a book spends on the best-sellers list (`total_weeks`). Click here for the data.

Import Data

# Read the tab-separated NYT best-seller titles table straight from the
# TidyTuesday repository.
nyt <- readr::read_tsv(
    file = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv'
)

## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Per-column overview of the raw table (types, missing values, distributions)
# before any cleaning is applied.
skimr::skim(nyt)

Data summary
Name	nyt
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

# Build the modelling table:
#   * drop the debut date column
#   * keep rows with a known author and fewer than 200 weeks on the list
#   * move the outcome to the natural-log scale
#   * derive the decade from the year
data <- nyt %>%
    select(-first_week) %>%
    filter(!is.na(author), total_weeks < 200) %>%
    mutate(
        total_weeks = log(total_weeks),
        decade      = year %/% 10 * 10
    )

Explore Data

Identify good predictors.

debut_rank

# Spread of (log) total weeks for each debut rank, one horizontal box per rank.
ggplot(data, aes(x = total_weeks, y = as.factor(debut_rank))) +
    geom_boxplot()

best_rank

# Spread of (log) total weeks for each best rank, one horizontal box per rank.
#
# `total_weeks` was already log-transformed when `data` was built, so no log
# scale is applied here. Taking a second log turned every log(1) = 0 into
# -Inf and silently dropped all one-week books from the plot.
data %>%
    ggplot(aes(total_weeks, as.factor(best_rank))) +
    geom_boxplot()

author

# Bar chart of the ten authors with the highest mean (log) total weeks.
data %>%
    group_by(author) %>%
    summarise(total_weeks_avg = mean(total_weeks), .groups = "drop") %>%
    slice_max(order_by = total_weeks_avg, n = 10) %>%
    ggplot(aes(
        x = total_weeks_avg,
        y = fct_reorder(author, total_weeks_avg)
    )) +
    labs(title = "Best Author by Total Weeks", y = NULL) +
    geom_col()

Words in title

# Title words associated with the longest stays on the list.
data %>%

    # Tokenize title: one row per (book, word)
    unnest_tokens(output = word, input = title) %>%

    # Average (log) total weeks per word, plus how often the word occurs
    group_by(word) %>%
    summarise(total_weeks = mean(total_weeks),
              n           = n()) %>%

    ungroup() %>%

    # Keep words seen more than 10 times and drop tokens containing digits.
    # "\\d" is the digit class; the earlier "\\a" pattern matches only the
    # BEL control character, so it filtered nothing out.
    filter(n > 10, !str_detect(word, "\\d")) %>%
    slice_max(order_by = total_weeks, n = 20) %>%

    # Plot
    ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
    geom_point() +

    labs(y = "Words in Title")

EDA shortcut

# Step 1: Prepare data
# Remove the free-text columns, then turn every remaining column into 0/1
# bin indicators.
data_binarized_tbl <- binarize(select(data, -title, -author))

glimpse(data_binarized_tbl)

## Rows: 7,427
## Columns: 24
## $ `id__-Inf_1858.5`                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ id__1858.5_3717                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__3717_5573.5                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__5573.5_Inf                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `year__-Inf_1968`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4`                            <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5                               <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf                             <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `decade__-Inf_1960`                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ decade__1960_2000                               <dbl> 1, 1, 1, 0, 1, 0, 1, 1…
## $ decade__2000_2010                               <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ decade__2010_Inf                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0…

# Step 2: Correlate
# The modelling goal is total_weeks, so every binarized feature is measured
# against the top total_weeks bin (the longest-running books) rather than
# against a best_rank bin.
# NOTE(review): the bin name comes from the quantile cut points that
# binarize() produced for this data; update it if the data changes.
data_corr_tbl <- data_binarized_tbl %>%
    correlate(target = total_weeks__2.30258509299405_Inf)

data_corr_tbl

## # A tibble: 24 × 3
##    feature     bin                               correlation
##    <fct>       <chr>                                   <dbl>
##  1 best_rank   10.5_Inf                               1     
##  2 total_weeks -Inf_0.693147180559945                 0.469 
##  3 best_rank   -Inf_3                                -0.381 
##  4 best_rank   6_10.5                                -0.317 
##  5 total_weeks 2.30258509299405_Inf                  -0.314 
##  6 best_rank   3_6                                   -0.302 
##  7 total_weeks 1.38629436111989_2.30258509299405     -0.197 
##  8 debut_rank  12_Inf                                 0.0900
##  9 debut_rank  4_8                                   -0.0579
## 10 year        1968_2000                             -0.0229
## # ℹ 14 more rows

# Step 3: Plot
# Draw the correlation funnel from the feature/bin correlation table.
plot_correlation_funnel(data_corr_tbl)

Preprocess Data

Build Models

Split Data

# Hold out a test set, stratifying the split on the outcome
set.seed(1234)
data_split <- rsample::initial_split(data = data, strata = total_weeks)
data_train <- rsample::training(data_split)
data_test  <- rsample::testing(data_split)

# Cross-validation folds drawn from the training rows only
set.seed(2345)
data_cv <- rsample::vfold_cv(data = data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [5012/557]> Fold01
##  2 <split [5012/557]> Fold02
##  3 <split [5012/557]> Fold03
##  4 <split [5012/557]> Fold04
##  5 <split [5012/557]> Fold05
##  6 <split [5012/557]> Fold06
##  7 <split [5012/557]> Fold07
##  8 <split [5012/557]> Fold08
##  9 <split [5012/557]> Fold09
## 10 <split [5013/556]> Fold10

library(usemodels)

## Warning: package 'usemodels' was built under R version 4.3.2

# Print boilerplate code for an xgboost recipe/spec/workflow as a starting
# point; nothing is assigned here.
usemodels::use_xgboost(formula = total_weeks ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = total_weeks ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(41811)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Feature engineering for the boosted-tree model:
#   * id is kept as an identifier, not a predictor
#   * title  -> term-frequency columns for at most 100 tokens
#   * author -> infrequent levels pooled, then one-hot encoded
#   * year is normalized and best_rank is log-transformed
xgboost_recipe <-
    recipe(total_weeks ~ ., data = data_train) %>%
    update_role(id, new_role = "id variable") %>%
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tf(title) %>%
    step_other(author) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_normalize(year) %>%
    step_log(best_rank)

# Inspect the processed training set
glimpse(juice(prep(xgboost_recipe)))

## Rows: 5,569
## Columns: 108
## $ id                    <dbl> 100, 1000, 1005, 1007, 1011, 1012, 1014, 1021, 1…
## $ year                  <dbl> 0.852199469, 0.622342321, 0.354175649, 0.6989613…
## $ debut_rank            <dbl> 3, 11, 12, 5, 2, 14, 7, 9, 1, 16, 3, 1, 1, 8, 12…
## $ best_rank             <dbl> 2.639057, 2.639057, 2.833213, 2.564949, 1.386294…
## $ decade                <dbl> 2010, 2000, 1990, 2000, 2010, 2010, 2000, 2010, …
## $ total_weeks           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.6931472, 0.00…
## $ tf_title_a            <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_all          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_an           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_and          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_are          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_at           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_back         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_before       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_black        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_blood        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_blue         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_bones        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_book         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_by           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_cat          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_christmas    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_city         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_come         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_country      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ tf_title_dark         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_darkness     <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_daughter     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_day          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_days         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_dead         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_death        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_deep         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_dream        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_end          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_family       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_fire         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_first        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_for          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_from         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_game         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_girl         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_girls        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_god          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_golden       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_good         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_heart        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_her          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_home         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_honor        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_house        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_i            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_in           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_is           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_island       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_king         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_last         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_life         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_light        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_little       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_long         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_lost         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_love         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_man          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_me           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_men          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_moon         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_mr           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_mrs          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_murder       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_my           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_new          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_night        <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_no           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_not          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_of           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_on           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_one          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_other        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_prey         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_red          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_river        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_road         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_sea          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_second       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_secret       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_shadow       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_son          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_star         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_storm        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_summer       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_the          <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_this         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_time         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_to           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_tree         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_two          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_we           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_white        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_who          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_wife         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_winter       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_with         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_woman        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_world        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_you          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Danielle.Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ author_other          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, …

# Specify Model: xgboost regression with `trees` and `min_n` left to tune
xgboost_spec <-
    boost_tree(trees = tune(), min_n = tune()) %>%
    set_engine("xgboost") %>%
    set_mode("regression")

# Combine recipe and model using workflow
xgboost_workflow <- workflow(xgboost_recipe, xgboost_spec)

# Tune hyperparameters: 5 candidate combinations evaluated on the CV folds
set.seed(344)
xgboost_tune <- tune_grid(
    object    = xgboost_workflow,
    resamples = data_cv,
    grid      = 5
)

Evaluate Models

# List the tuned candidates ranked by cross-validated RMSE.
xgboost_tune %>% show_best(metric = "rmse")

## # A tibble: 5 × 8
##   trees min_n .metric .estimator  mean     n std_err .config             
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1   387    24 rmse    standard   0.562    10 0.0102  Preprocessor1_Model3
## 2   627    15 rmse    standard   0.569    10 0.0110  Preprocessor1_Model2
## 3  1386    35 rmse    standard   0.598    10 0.0101  Preprocessor1_Model5
## 4  1978    31 rmse    standard   0.611    10 0.00941 Preprocessor1_Model4
## 5   914     2 rmse    standard   0.631    10 0.00978 Preprocessor1_Model1

# Update the model by selecting the best hyperparameters (lowest CV RMSE)
xgboost_fw <- xgboost_workflow %>%
    tune::finalize_workflow(tune::select_best(xgboost_tune, metric = "rmse"))

# Fit model on the entire training data and test it on the test data.
data_fit <- xgboost_fw %>% last_fit(data_split)

data_fit %>% collect_metrics()

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.580 Preprocessor1_Model1
## 2 rsq     standard       0.729 Preprocessor1_Model1

# Predicted vs. observed (log) total weeks on the held-out test set.
# The point colour is set with `color`: the default point shape has no fill,
# so `fill = "midnightblue"` was silently ignored and points stayed black.
collect_predictions(data_fit) %>%
    ggplot(aes(total_weeks, .pred)) +
    geom_point(alpha = 0.3, color = "midnightblue") +
    coord_fixed()

svm_spec

# Preprocessing used by the linear-SVM and random-forest workflows:
#   * id is kept as an identifier, not a predictor
#   * title  -> term-frequency columns for at most 100 tokens
#   * author -> infrequent levels pooled, then one-hot encoded
#   * best_rank is log-transformed
#   * every numeric predictor is then normalized
svm_recipe <- recipe(formula = total_weeks ~., data = data_train) %>%

    recipes::update_role(id, new_role = "id variable") %>%
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tf(title) %>%
    step_other(author) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_log(best_rank) %>%
    # A linear SVM is sensitive to predictor scale. Normalizing only `year`
    # left columns such as `decade` (~2000) on a very different scale from
    # the 0/1 term counts, so all numeric predictors are scaled here.
    step_normalize(all_numeric_predictors())

prep(svm_recipe) %>% bake(new_data = NULL)

## # A tibble: 5,569 × 108
##       id  year debut_rank best_rank decade total_weeks tf_title_a tf_title_all
##    <dbl> <dbl>      <dbl>     <dbl>  <dbl>       <dbl>      <int>        <int>
##  1   100 0.852          3      2.64   2010       0              1            0
##  2  1000 0.622         11      2.64   2000       0              0            0
##  3  1005 0.354         12      2.83   1990       0              0            0
##  4  1007 0.699          5      2.56   2000       0.693          0            0
##  5  1011 0.852          2      1.39   2010       0              0            0
##  6  1012 0.852         14      2.77   2010       0              0            0
##  7  1014 0.699          7      1.95   2000       0.693          0            0
##  8  1021 0.776          9      2.56   2010       0              0            0
##  9  1022 1.08           1      2.64   2010       0              0            0
## 10  1028 1.12          16      2.20   2010       0              0            0
## # ℹ 5,559 more rows
## # ℹ 100 more variables: tf_title_an <int>, tf_title_and <int>,
## #   tf_title_are <int>, tf_title_at <int>, tf_title_back <int>,
## #   tf_title_before <int>, tf_title_black <int>, tf_title_blood <int>,
## #   tf_title_blue <int>, tf_title_bones <int>, tf_title_book <int>,
## #   tf_title_by <int>, tf_title_cat <int>, tf_title_christmas <int>,
## #   tf_title_city <int>, tf_title_come <int>, tf_title_country <int>, …

# Random-forest regression spec with 500 trees (engine left at its default).
rf_spec <- set_mode(rand_forest(trees = 500), "regression")

rf_spec

## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 500
## 
## Computational engine: ranger

# Linear support-vector regression spec (engine left at its default).
svm_spec <- set_mode(svm_linear(), "regression")

svm_spec

## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR

# Both models share the same preprocessing recipe.
svm_wf <- workflow() %>%
  add_recipe(svm_recipe) %>%
  add_model(svm_spec)
rf_wf <- workflow() %>%
  add_recipe(svm_recipe) %>%
  add_model(rf_spec)

library(LiblineaR)

## Warning: package 'LiblineaR' was built under R version 4.3.2

library(ranger)

## Warning: package 'ranger' was built under R version 4.3.2

# Register a parallel backend and keep the out-of-fold predictions so they
# can be plotted afterwards.
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

# Fit each workflow on the same cross-validation folds.
svm_rs <- svm_wf %>%
  fit_resamples(resamples = data_cv, control = contrl_preds)

ranger_rs <- rf_wf %>%
  fit_resamples(resamples = data_cv, control = contrl_preds)

svm_rs %>% collect_metrics()

## # A tibble: 2 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1.11      10 0.00804 Preprocessor1_Model1
## 2 rsq     standard   0.0970    10 0.00969 Preprocessor1_Model1

# Cross-validated metrics for the random-forest workflow.
ranger_rs %>% collect_metrics()

## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.582    10 0.00798 Preprocessor1_Model1
## 2 rsq     standard   0.735    10 0.00741 Preprocessor1_Model1

# Out-of-fold predictions vs. observed (log) total weeks, one panel per model,
# coloured by CV fold. The dashed line marks perfect prediction.
#
# x is the outcome column `total_weeks`: `best_rank` is not part of the
# collected predictions, and without a point layer only the reference line
# was drawn. `linewidth` replaces the deprecated `size` for lines.
bind_rows(
  collect_predictions(svm_rs) %>%
    mutate(mod = "SVM"),
  collect_predictions(ranger_rs) %>%
    mutate(mod = "ranger")
) %>%
  ggplot(aes(total_weeks, .pred, color = id)) +
  geom_abline(lty = 2, color = "gray50", linewidth = 1.2) +
  geom_point(alpha = 0.3) +
  facet_wrap(vars(mod)) +
  coord_fixed()

# Final fit of the SVM workflow using the original train/test split.
final_fitted <- svm_wf %>% last_fit(data_split)

final_fitted %>% collect_metrics() ## metrics evaluated on the *testing* data

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      1.12   Preprocessor1_Model1
## 2 rsq     standard      0.0958 Preprocessor1_Model1

# Largest SVM coefficients in each direction (up to 10 negative and 10
# positive), excluding the intercept ("Bias") term.
# Legend labels describe the outcome of this analysis (weeks on the list);
# the earlier "low ratings"/"high ratings" labels came from another dataset.
extract_workflow(final_fitted) %>%
  tidy() %>%
  filter(term != "Bias") %>%
  group_by(estimate > 0) %>%
  slice_max(abs(estimate), n = 10) %>%
  ungroup() %>%
  # Strip the term-frequency prefix so title words display as plain words
  mutate(term = str_remove(term, "tf_title_")) %>%
  ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
  geom_col(alpha = 0.8) +
  # FALSE (negative estimate) comes first, TRUE (positive estimate) second
  scale_fill_discrete(labels = c("fewer weeks", "more weeks")) +
  labs(y = NULL, fill = "Associated with...")

# Make Predictions

Code Apply 4

Joe Stanley

2024-02-11

Import Data

Explore Data

Preprocess Data

Build Models

Evaluate Models