Goal: Click here for the data

Import Data

nyt_titles <- read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')

skimr::skim(nyt_titles)

Data summary
Name	nyt_titles
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

data <- nyt_titles %>%
    
    # Treat missing values
    select(-author) %>%
    na.omit() %>%

    # log transform variables with pos-skewed distribution
    mutate(best = log(total_weeks))

Explore Data

Identify good predictors.

best rank

data %>%
    ggplot(aes(total_weeks, best_rank)) +
    scale_y_log10() +
    geom_point()

data %>%
    ggplot(aes(total_weeks, as.factor(debut_rank))) +
    geom_boxplot()

data %>%
    
    # tokenize title
    unnest_tokens(output = word, input = title) %>%
    
    # calculate avg best rank per week
    group_by(word) %>%
    summarise(weeks = mean(total_weeks),
              n = n()) %>%
    ungroup() %>%

    filter(n > 10, !str_detect(word, "\\d")) %>%
    slice_max(order_by = weeks, n = 20) %>%
        
    # Plot
    ggplot(aes(weeks, fct_reorder(word, weeks))) +
    geom_point() +
    
    labs(y = "Words in Title")

EDA shortcut

# Step 1: Prepare data
data_binarized_tbl <- data %>%
    select(-id, -title) %>%
    # Extract date features from first_week
mutate(year = lubridate::year(first_week),
       month = lubridate::month(first_week, label = TRUE),
       weekday = lubridate::wday(first_week, label = TRUE)) %>%
    select(-first_week) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 7,431
## Columns: 34
## $ `year__-Inf_1968`                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year__1968_2000                          <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,…
## $ year__2000_2011                          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,…
## $ year__2011_Inf                           <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,…
## $ `total_weeks__-Inf_2`                    <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,…
## $ total_weeks__2_4                         <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ total_weeks__4_10                        <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,…
## $ total_weeks__10_Inf                      <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ `debut_rank__-Inf_4`                     <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,…
## $ debut_rank__4_8                          <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,…
## $ debut_rank__8_12                         <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,…
## $ debut_rank__12_Inf                       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `best_rank__-Inf_3`                      <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ best_rank__3_6                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ best_rank__6_10                          <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,…
## $ best_rank__10_Inf                        <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,…
## $ `best__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,…
## $ best__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ best__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,…
## $ best__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ month__01                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__02                                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ month__03                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ month__04                                <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ month__05                                <dbl> 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,…
## $ month__06                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__07                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__08                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ month__09                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__10                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__11                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__12                                <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ weekday__Sun                             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ weekday__Mon                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

# Step 2: Correate
data_corr_tblNY <- data_binarized_tbl %>%
    correlate(best__2.30258509299405_Inf)

data_corr_tblNY

## # A tibble: 34 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 10_Inf                                   1    
##  2 best        2.30258509299405_Inf                     1    
##  3 total_weeks -Inf_2                                  -0.397
##  4 best        -Inf_0.693147180559945                  -0.397
##  5 best_rank   -Inf_3                                   0.343
##  6 total_weeks 4_10                                    -0.323
##  7 best        1.38629436111989_2.30258509299405       -0.323
##  8 best_rank   10_Inf                                  -0.313
##  9 total_weeks 2_4                                     -0.257
## 10 best        0.693147180559945_1.38629436111989      -0.257
## # ℹ 24 more rows

# Step 3: Plot
data_corr_tblNY %>%
    plot_correlation_funnel()

## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

Split data

data <- sample_n(data, 100)

# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split) %>%
    select(-first_week)
data_test <- testing(data_split)

# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10

library(usemodels)
usemodels:: use_xgboost(year ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = year ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(6804)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Create the recipe
xgboost_recipe <- 
  recipe(formula = debut_rank ~ ., data = data_train) %>%
  recipes::update_role(title, new_role = "id variable") %>%
  step_dummy(title) %>%  # Convert title to dummy variables
  step_YeoJohnson(debut_rank, best_rank, total_weeks)

# Prepare the recipe and inspect the data
prepared_data <- xgboost_recipe %>% prep() %>% juice()
glimpse(prepared_data)

## Rows: 75
## Columns: 80
## $ id                                         <dbl> 6665, 6072, 181, 1665, 591,…
## $ year                                       <dbl> 2020, 1934, 1998, 2007, 195…
## $ total_weeks                                <dbl> 1.2227678, 1.6341027, 2.327…
## $ best_rank                                  <dbl> 3.0511898, 1.8986052, 3.369…
## $ best                                       <dbl> 1.0986123, 1.7917595, 2.995…
## $ debut_rank                                 <dbl> 2.8117917, 2.5109843, 1.794…
## $ title_A.CROWN.OF.SWORDS                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_A.NIGHT.WITHOUT.ARMOR                <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ title_A.STRANGER.IS.WATCHING               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_A.TOUCH.OF.DEAD                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_AFTERMATH...EMPIRE.S.END             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BARBARY.SHORE                        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ title_BEARTOWN                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BRIGHT.FEATHER                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BRIMSTONE                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CASTLE.ON.THE.HILL                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CELIA.AMBERLEY                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CHILD.S.PLAY                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DASHING.THROUGH.THE.SNOW             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DECEIVED                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DO.I.WAKE.OR.SLEEP                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DREAMS.OF.JOY                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_EVERLASTING                          <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ title_FRIDAY                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_GUARDIANS.OF.THE.WEST                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HARVEST                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_ISLE.OF.PALMS                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_JUBILEE.TRAIL                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_KAFKA.ON.THE.SHORE                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_KISS.ME..DEADLY                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_KITTY.FOYLE                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_MIDNIGHT.RUNNER                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_MISTRESS.OF.MELLYN                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NEVER.GO.BACK                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NEW.MOON.RISING                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NEXT.TO.VALOUR                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_ONE.PERFECT.LIE                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_PARADISE.LOST                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_REDEMPTION                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_REMEMBRANCE.ROCK                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RETURN.TO.SULLIVANS.ISLAND           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SECOND.CHANCE                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SPY                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_STRANGERS.IN.DEATH                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_STRONG.MEDICINE                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SUMMER.OF..69                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SWEET.TOMORROWS                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ title_TEMPTATION.AND.SURRENDER             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_TEMPTING.FATE                        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ title_THE.BELOVED.RETURNS                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.BREAKDOWN                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.BRIEF.WONDROUS.LIFE.OF.OSCAR.WAO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.CASE.OF.LUCY.BENDING             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.CHAMBER                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.EXILES                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.FIRST.DEADLY.SIN                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.HAJ                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.INNOCENT                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.KLONE.AND.I                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.LEGACY                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.MERMAID.CHAIR                    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ title_THE.NIGHT.GARDENER                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.PROVINCIAL.LADY.IN.AMERICA       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.ROBBER.BRIDE                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.SEARCHER                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.SIGN.OF.JONAH                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.STEEL.KISS                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.STIEHL.ASSASSIN                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.THIRTEENTH.APOSTLE               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.THOUSAND.ORCS                    <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ title_THE.UNDERGROUND.MAN                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WEDDING.DRESS                    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WHITE.RAJAH                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WHITE.WITCH                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WRONG.SIDE.OF.GOODBYE            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THINGS.YOU.SAVE.IN.A.FIRE            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THIS.WAS.A.MAN                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_TWO.LITTLE.GIRLS.IN.BLUE             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_UNSEEN                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_WHISTLE                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Set up the XGBoost model specification
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>%  
  set_engine("xgboost")

# Combine recipe and model using workflow
xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

# Tune hyperparameters
set.seed(344)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv, 
            grid = 5)

## Warning: package 'xgboost' was built under R version 4.3.3

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")

## # A tibble: 5 × 10
##    mtry trees min_n learn_rate .metric .estimator  mean     n std_err .config   
##   <int> <int> <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>     
## 1     5  1613    36    0.0290  rmse    standard    1.25    10  0.0485 Preproces…
## 2    25  1104    28    0.00484 rmse    standard    1.27    10  0.0326 Preproces…
## 3    63  1524    23    0.0836  rmse    standard    1.34    10  0.0606 Preproces…
## 4    47   768    12    0.112   rmse    standard    1.61    10  0.0830 Preproces…
## 5    71   162     7    0.00108 rmse    standard    2.37    10  0.0806 Preproces…

# Update the model by selecting the best
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                         tune::select_best(xgboost_tune, metric = "rmse"))

# Fit the model on the entire training data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)

tune:: collect_metrics(data_fit)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      0.918  Preprocessor1_Model1
## 2 rsq     standard      0.0179 Preprocessor1_Model1

tune:: collect_predictions(data_fit) %>%
    ggplot(aes(debut_rank, .pred)) +
    geom_point(alpha = 1, fill = "pink") +
    geom_abline(lty = 2, color = "purple") +
    coord_fixed()

Apply 3: NY Times Best Sellers

Erin McEvoy

2024-09-26

Import Data

Explore Data

Build Models

Evaluate Models