Goal: explore the TidyTuesday (2022-05-10) dataset of New York Times bestsellers and model total_weeks, the number of weeks a title stays on the list. The data can be read straight from the TidyTuesday repository:
library(tidyverse)

nyt_titles <- read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
skimr::skim(nyt_titles)
Data summary | |
---|---|
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
  # Treat missing values (left commented out here; author has only 4 NAs)
  # select(-author) %>%
  # na.omit() %>%
  # Log-transform total_weeks, which has a strongly right-skewed distribution
  mutate(best = log(total_weeks))
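A quick sanity check on that skew, using only columns already in the data:

# Mean (~8) sits well above the median (4), with a tail out to 178 weeks,
# which is what motivates the log transform above
data %>%
  summarise(mean_weeks = mean(total_weeks),
            median_weeks = median(total_weeks),
            max_weeks = max(total_weeks))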
Identify good predictors
Best rank
data %>%
ggplot(aes(total_weeks, best_rank)) +
scale_y_log10() +
geom_point()
data %>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
library(tidytext)

data %>%
  # Tokenize author names into single words
  unnest_tokens(output = word, input = author) %>%
  # Average weeks on the list per author token
  group_by(word) %>%
  summarise(weeks = mean(total_weeks),
            n = n()) %>%
  ungroup() %>%
  filter(n > 10, !str_detect(word, "\\d")) %>%
  slice_max(order_by = weeks, n = 50) %>%
  # Plot
  ggplot(aes(weeks, fct_reorder(word, weeks))) +
  geom_point() +
  labs(y = "Author")
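The same tokenise-and-average idea can be applied to words in the title. This is only a sketch (not run above); it assumes tidytext's stop_words lexicon and an arbitrary n > 25 cutoff:

data %>%
  unnest_tokens(output = word, input = title) %>%
  anti_join(stop_words, by = "word") %>%   # drop common English stop words
  group_by(word) %>%
  summarise(weeks = mean(total_weeks),
            n = n()) %>%
  ungroup() %>%
  filter(n > 25) %>%
  slice_max(order_by = weeks, n = 30) %>%
  ggplot(aes(weeks, fct_reorder(word, weeks))) +
  geom_point() +
  labs(y = "Title word")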
EDA shortcut
data2 <- nyt_titles %>%
  # Treat missing values: drop author and remove incomplete rows
  select(-author) %>%
  na.omit()

library(correlationfunnel)

# Step 1: Prepare data
data_binarized_tbl <- data2 %>%
  select(-id, -title) %>%
  # Extract date features from first_week
  mutate(year = lubridate::year(first_week),
         month = lubridate::month(first_week, label = TRUE),
         weekday = lubridate::wday(first_week, label = TRUE)) %>%
  select(-first_week) %>%
  binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,431
## Columns: 30
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, …
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `total_weeks__-Inf_2` <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ total_weeks__2_4 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ total_weeks__4_10 <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ total_weeks__10_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ best_rank__6_10 <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ best_rank__10_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, …
## $ month__01 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__02 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ month__03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ month__04 <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ month__05 <dbl> 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ month__06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__07 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ month__08 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ month__09 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ month__12 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ weekday__Sun <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ weekday__Mon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate every binary feature against the bin of interest (best_rank between 6 and 10)
data_corr_tblNY <- data_binarized_tbl %>%
correlate(best_rank__6_10)
data_corr_tblNY
## # A tibble: 30 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 best_rank 6_10 1
## 2 best_rank -Inf_3 -0.362
## 3 best_rank 10_Inf -0.317
## 4 best_rank 3_6 -0.288
## 5 total_weeks 10_Inf -0.130
## 6 weekday Sun 0.0744
## 7 weekday Mon -0.0744
## 8 total_weeks -Inf_2 0.0699
## 9 total_weeks 4_10 0.0464
## 10 debut_rank -Inf_4 -0.0290
## # ℹ 20 more rows
# Step 3: Plot
data_corr_tblNY %>%
plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
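One way to declutter the funnel (a sketch; the 0.05 cutoff is arbitrary) is to drop near-zero correlations before plotting:

data_corr_tblNY %>%
  filter(abs(correlation) > 0.05) %>%
  plot_correlation_funnel(interactive = FALSE)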
Split data
# Work with a random sample of 100 titles to keep the modelling below quick
data <- sample_n(data, 100)
# Split into train and test datasets
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split) %>%
  select(-first_week)   # drop the raw date column
data_test <- testing(data_split)
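The split above uses rsample's defaults. If you rerun this, stratifying on the outcome keeps the small training and test sets more comparable; a hypothetical alternative (not what was run here):

# Stratified split on the (binned) outcome
set.seed(1234)
data_split_strat <- rsample::initial_split(data, prop = 0.75, strata = total_weeks)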
# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
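Each row of data_cv holds one resample; rsample's analysis() and assessment() pull out its two pieces, e.g. for Fold01:

# 67 analysis rows and 8 assessment rows, matching the printout above
dim(analysis(data_cv$splits[[1]]))
dim(assessment(data_cv$splits[[1]]))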
# Generate scaffolding with {usemodels}; the formula here is only a placeholder,
# and the template is adapted below into a regression on total_weeks
library(usemodels)
usemodels::use_xgboost(author ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = author ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(6804)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Create the recipe
xgboost_recipe <-
  recipe(formula = total_weeks ~ ., data = data_train) %>%
  recipes::update_role(total_weeks, new_role = "outcome") %>%
  step_dummy(title, author) %>%                    # convert title and author to dummy variables
  step_YeoJohnson(year, best_rank, total_weeks)    # tame skew in these numeric columns
# Prepare the recipe and inspect the data
prepared_data <- xgboost_recipe %>% prep() %>% juice()
glimpse(prepared_data)
## Rows: 75
## Columns: 152
## $ id <dbl> 1349, 6414, 3817, 7322, …
## $ year <dbl> 2012, 1934, 1999, 1974, …
## $ debut_rank <dbl> 4, 6, 14, 9, 14, 3, 1, 6…
## $ best_rank <dbl> 0.8127585, 3.1043761, 1.…
## $ best <dbl> 1.7917595, 1.0986123, 1.…
## $ total_weeks <dbl> 1.5260825, 1.1633586, 1.…
## $ title_A.DENSITY.OF.SOULS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_A.FABLE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_A.TREE.GROWS.IN.BROOKLYN <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ title_ALL.THAT.REMAINS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_AND.RIDE.A.TIGER <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ title_ANSWERED.PRAYERS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BLACK.LEOPARD..RED.WOLF <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BLOOD.WORK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BONES.OF.THE.LOST <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CASHELMARA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CEREMONY.OF.THE.INNOCENT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CERTAIN.GIRLS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CLAUDIUS.THE.GOD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_COME.WITH.ME.TO.MACEDONIA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_COPPER.BEACH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEADLOCKED <dbl> 1, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEADLY.DECISIONS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEATH.IN.THE.AIR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEATH.OF.KINGS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEVICES.AND.DESIRES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DOTING <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_GALAPAGOS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_GARDEN.OF.BEASTS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_GREEN.DOLPHIN.STREET <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HARM.S.WAY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HAUNTED <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HER.OWN.RULES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HOLLYWOOD.HUSBANDS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_I.DON.T.KNOW.HOW.SHE.DOES.IT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_JUROR..3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_LIGHT.IN.SHADOW <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_LOVES.MUSIC..LOVES.TO.DANCE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_MINI.SHOPAHOLIC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NERILKA.S.STORY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NIGHT.OF.THE.HAWK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_O.SHEPHERD..SPEAK..By...Viking.. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_PRIVATE.ENTERPRISE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_PURELY.ACADEMIC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RACHEL.CADE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RAGTIME <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RESTLESS.ARE.THE.SAILS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RIVER.S.END <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ title_RIVERS.OF.GLORY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_ROBERT.B..PARKER.S.LULLABY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_ROLE.OF.HONOR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SEDUCING.AN.ANGEL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SLOW.WALTZ.IN.CEDAR.BEND <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ title_SOMEDAY..SOMEDAY..MAYBE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.CHASE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.CLONE.WARS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.DEPARTMENT.OF.SENSITIVE.CRIMES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.FOUNTAIN.OVERFLOWS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.GENERAL.S.DAUGHTER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.HIGH.KING.OF.MONTIVAL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.HOURGLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.KILL.SWITCH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.LACUNA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.LONG.LOVE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.MAPPING.OF.LOVE.AND.DEATH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.MIDNIGHT.HOUSE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.PAINTED.QUEEN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.PRINCE.OF.BEVERLY.HILLS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.SCAPEGOAT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.SCARPETTA.FACTOR <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ title_THE.SEARCH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.STRANGE.BOARDERS.AT.PALACE.CRESCENT <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ title_THE.THREE.SIRENS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WITCHES.OF.EASTWICK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THINGS.YOU.SAVE.IN.A.FIRE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_TOUGH.GUYS.DON.T.DANCE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_WAYFARING.STRANGER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_WINTER.KILLS <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ title_WITHIN.THIS.PRESENT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_WIZARD.AND.GLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Agatha.Christie <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Albert.Brooks <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Alex.Berenson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Alexander.McCall.Smith <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Allison.Pearson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Angela.Mackail.Thirkell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Anne.McCaffrey <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Barbara.Kingsolver <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Barbara.Taylor.Bradford <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Bernard.Cornwell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Betty.Smith <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ author_Charlaine.Harris <dbl> 1, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Charles.E..Mercer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Christopher.Rice <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Dale.Brown <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Danielle.Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Daphne.du.Maurier <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_E..L..Doctorow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_E..Phillips.Oppenheim <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ author_Edwin.Gilbert <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Elizabeth.Goudge <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Elizabeth.Peters.and.Joan.Hess <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Evelyn.Eaton <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_F..Van.Wyck.Mason <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Henry.Green <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Iris.Johansen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Irving.Wallace <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jackie.Collins <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jacqueline.Winspear <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.E..Bassett <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.Lee.Burke <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.Patterson.and.James.O..Born <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.Patterson.and.Nancy.Allen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.Rollins.and.Grant.Blackwood <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Janet.Evanovich.and.Lee.Goldberg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jayne.Ann.Krentz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jeffery.Deaver <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jennifer.Weiner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_John.Gardner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_John.Sedges <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_John.Updike <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Karen.Traviss <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Katherine.Center <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Kathy.Reichs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Kurt.Vonnegut <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Lauren.Graham <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Leonard.Drohan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Margaret.Ayer.Barnes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Marlon.James <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Mary.Balogh <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Mary.Higgins.Clark <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Michael.Connelly <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Nelson.DeMille <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Nora.Roberts <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ author_Norman.Mailer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_P..D..James <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Patricia.Cornwell <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ author_Patricia.D..Cornwell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Rebecca.West <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Richard.Condon <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ author_Robert.Graves <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Robert.James.Waller <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ author_Robert.Wilder <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ author_S..M..Stirling <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Sophie.Kinsella <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Stephen.King <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Stringfellow.Barr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Stuart.Woods <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Susan.Howatch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Taylor.Caldwell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Upton.Sinclair <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_William.Faulkner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
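step_dummy() on title and author is what blows the prepared data up to ~150 columns for only 75 training rows. A more compact alternative (a sketch; the 2% threshold is an assumption) is to pool rare levels before dummying:

xgboost_recipe_other <-
  recipe(total_weeks ~ ., data = data_train) %>%
  step_other(title, author, threshold = 0.02) %>%   # collapse infrequent titles/authors into "other"
  step_dummy(all_nominal_predictors()) %>%
  step_YeoJohnson(year, best_rank)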
# Set up the XGBoost model specification
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# Tune hyperparameters
set.seed(344)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5)
## Warning: package 'xgboost' was built under R version 4.3.3
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 9
## trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 114 2 0.0584 rmse standard 0.0131 10 0.00587 Preprocessor1_…
## 2 986 13 0.00657 rmse standard 0.0608 10 0.0143 Preprocessor1_…
## 3 779 23 0.00224 rmse standard 0.258 10 0.0282 Preprocessor1_…
## 4 1986 32 0.0137 rmse standard 0.276 10 0.0349 Preprocessor1_…
## 5 1427 38 0.153 rmse standard 0.553 10 0.0234 Preprocessor1_…
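tune also ships an autoplot() method, which is a quick way to see how RMSE moves across the sampled values of trees, min_n, and learn_rate:

# Visualise performance across the candidate hyperparameters
autoplot(xgboost_tune)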
# Update the model by selecting the best
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the model on the entire training data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.0103 Preprocessor1_Model1
## 2 rsq standard 1.00 Preprocessor1_Model1
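The near-perfect R² says more about leakage than about the model: the predictors still include best, which was created above as log(total_weeks), so the outcome is effectively handed to xgboost. A sketch of a recipe without the leak (best_rank is also debatable, since it is only known after a book's run):

xgboost_recipe_noleak <-
  recipe(total_weeks ~ ., data = data_train) %>%
  step_rm(best, best_rank) %>%   # drop the column derived from the outcome (and its close cousin)
  step_dummy(title, author) %>%
  step_YeoJohnson(year)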
tune::collect_predictions(data_fit) %>%
  ggplot(aes(total_weeks, .pred)) +
  geom_point(alpha = 1, color = "pink") +   # color, not fill, shows for the default point shape
  geom_abline(lty = 2, color = "purple") +
  coord_fixed()
data %>%
ggplot(aes(total_weeks)) +
geom_histogram(bins = 20)
library(tidytext)
tidy_data <-
data %>%
unnest_tokens(name, author)
tidy_data %>%
count(name, sort = TRUE)
## # A tibble: 192 × 2
## name n
## <chr> <int>
## 1 james 9
## 2 and 6
## 3 john 5
## 4 e 4
## 5 elizabeth 4
## 6 cornwell 3
## 7 robert 3
## 8 allen 2
## 9 ann 2
## 10 anne 2
## # ℹ 182 more rows
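Tokens such as "and" or single initials are connectives rather than author names; an admittedly ad-hoc filter before counting:

tidy_data %>%
  filter(!name %in% c("and"), nchar(name) > 1) %>%
  count(name, sort = TRUE)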
tidy_data %>%
  group_by(name) %>%
  summarise(n = n(),
            weeks = mean(total_weeks)) %>%
  ggplot(aes(n, weeks)) +
  geom_hline(yintercept = mean(data$total_weeks),
             lty = 2, color = "gray50", linewidth = 2) +
  geom_point(color = "midnightblue", alpha = 1) +
  # Label points with the author token rather than the numeric mean
  geom_text(aes(label = name), check_overlap = TRUE, vjust = "top", hjust = "left") +
  scale_x_log10()
library(recipes)
library(textrecipes)   # provides step_tokenize() and friends

data_recipe <-
  recipe(formula = total_weeks ~ ., data = data_train) %>%
  recipes::update_role(id, new_role = "id variable") %>%
  step_tokenize(author, title) %>%
  step_normalize(year, total_weeks, debut_rank, best_rank)
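As written, this recipe stops at step_tokenize(), which leaves token list-columns that neither ranger nor the linear SVM can take as inputs; that is the most likely reason every resampled model below fails. A sketch of a recipe they could fit (the 20-token cap is an arbitrary choice for this small sample):

data_recipe_tf <-
  recipe(formula = total_weeks ~ ., data = data_train) %>%
  recipes::update_role(id, new_role = "id variable") %>%
  step_tokenize(author, title) %>%
  step_tokenfilter(author, title, max_tokens = 20) %>%  # keep only the most frequent tokens
  step_tf(author, title) %>%                            # turn tokens into term-frequency columns
  step_normalize(year, debut_rank, best_rank)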
ranger_spec <-
rand_forest(trees = 500) %>%
set_mode("regression")
ranger_spec
## Random Forest Model Specification (regression)
##
## Main Arguments:
## trees = 500
##
## Computational engine: ranger
svm_spec <-
svm_linear() %>%
set_mode("regression")
svm_spec
## Linear Support Vector Machine Model Specification (regression)
##
## Computational engine: LiblineaR
ranger_wf <- workflow(data_recipe, ranger_spec)
svm_wf <- workflow(data_recipe, svm_spec)
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)
svm_rs <- fit_resamples(
svm_wf,
resamples = data_cv,
control = contrl_preds
)
## Warning: package 'LiblineaR' was built under R version 4.3.3
## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.
ranger_rs <- fit_resamples(
ranger_wf,
resamples = data_cv,
control = contrl_preds
)
## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.
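The warnings point at show_notes(); running it prints the underlying error (most likely the untransformed token columns noted above):

show_notes(.Last.tune.result)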
# The resamples above failed, so the comparison below is left commented out
# collect_metrics(svm_rs)
# collect_metrics(ranger_rs)
# bind_rows(
#   collect_predictions(svm_rs) %>%
#     mutate(mod = "SVM"),
#   collect_predictions(ranger_rs) %>%
#     mutate(mod = "ranger")
# ) %>%
#   ggplot(aes(total_weeks, .pred, color = id)) +
#   geom_abline(lty = 2, color = "gray50", linewidth = 1.2) +
#   facet_wrap(vars(mod)) +
#   coord_fixed()
final_fitted <- last_fit(svm_wf, data_split)
## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.
collect_metrics(final_fitted)
## NULL
# final_datawf <- extract_workflow(final_fitted)
# predict(final_datawf, data_test[55, ])
# extract_workflow(final_fitted) %>%
#   tidy() %>%
#   filter(term != "Bias") %>%
#   group_by(estimate > 0) %>%
#   slice_max(abs(estimate), n = 10) %>%
#   ungroup() %>%
#   mutate(term = str_remove(term, "tf_author")) %>%
#   ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
#   geom_col(alpha = 0.8)