Goal: Click here for the data

Import Data

nyt_titles <- read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')

skimr::skim(nyt_titles)
Data summary
Name nyt_titles
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂
data <- nyt_titles %>%

     # Treat missing values
    select(-author) %>%
    na.omit() %>%
    
    #log transform variables with pos-skewed distribution
    mutate(best = log(total_weeks))

Explore Data

Identify good predictors.

best rank

data %>%
    ggplot(aes(total_weeks, best_rank)) +
    scale_y_log10() +
    geom_point()

data %>%
    ggplot(aes(total_weeks, as.factor(debut_rank))) +
    geom_boxplot()

data %>%
    
    # tokenize title
    unnest_tokens(output = word, input = title) %>%
    
    # calculate avg best rank per week
    group_by(word) %>%
    summarise(weeks = mean(total_weeks),
              n = n()) %>%
    ungroup() %>%

    filter(n > 10, !str_detect(word, "\\d")) %>%
    slice_max(order_by = weeks, n = 50) %>%
        
    # Plot
    ggplot(aes(weeks, fct_reorder(word, weeks))) +
    geom_point() +
    
    labs(y = "Title")

EDA shortcut

data2 <- nyt_titles %>%
    
# Treat missing values
   select(-author, -title) %>%
   na.omit() 

# Step 1: Prepare data
data_binarized_tbl <- data2 %>%
    select(-id) %>%
    # Extract date features from first_week
mutate(year = lubridate::year(first_week),
       month = lubridate::month(first_week, label = TRUE),
       weekday = lubridate::wday(first_week, label = TRUE)) %>%
    select(-first_week) %>%
    binarize()

data_binarized_tbl %>% glimpse()
## Rows: 7,431
## Columns: 30
## $ `year__-Inf_1968`     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ year__1968_2000       <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, …
## $ year__2000_2011       <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ year__2011_Inf        <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `total_weeks__-Inf_2` <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ total_weeks__2_4      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ total_weeks__4_10     <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ total_weeks__10_Inf   <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `debut_rank__-Inf_4`  <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ debut_rank__4_8       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ debut_rank__8_12      <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
## $ debut_rank__12_Inf    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ `best_rank__-Inf_3`   <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ best_rank__3_6        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ best_rank__6_10       <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ best_rank__10_Inf     <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, …
## $ month__01             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__02             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ month__03             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ month__04             <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ month__05             <dbl> 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ month__06             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__07             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ month__08             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ month__09             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__10             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__11             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ month__12             <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ weekday__Sun          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ weekday__Mon          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correate
data_corr_tblNY <- data_binarized_tbl %>%
    correlate(best_rank__6_10)

data_corr_tblNY
## # A tibble: 30 × 3
##    feature     bin    correlation
##    <fct>       <chr>        <dbl>
##  1 best_rank   6_10        1     
##  2 best_rank   -Inf_3     -0.362 
##  3 best_rank   10_Inf     -0.317 
##  4 best_rank   3_6        -0.288 
##  5 total_weeks 10_Inf     -0.130 
##  6 weekday     Sun         0.0744
##  7 weekday     Mon        -0.0744
##  8 total_weeks -Inf_2      0.0699
##  9 total_weeks 4_10        0.0464
## 10 debut_rank  -Inf_4     -0.0290
## # ℹ 20 more rows
# Step 3: Plot
data_corr_tblNY %>%
    plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

Split data

data <- sample_n(data, 100)

# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split) %>%
    select(-first_week)
data_test <- testing(data_split)

# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
usemodels:: use_xgboost(debut_rank ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = debut_rank ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(6804)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Create the recipe
xgboost_recipe <- 
  recipe(formula = total_weeks ~ ., data = data_train) %>%
    recipes::update_role(id, new_role = "id") %>%
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_YeoJohnson(year, best_rank)

# Prepare the recipe and inspect the data
prepared_data <- xgboost_recipe %>% prep() %>% juice()
glimpse(prepared_data)
## Rows: 75
## Columns: 106
## $ id                        <dbl> 1133, 3278, 6417, 4658, 651, 1013, 2246, 476…
## $ year                      <dbl> 2005, 1981, 2015, 2012, 2012, 1997, 2008, 19…
## $ debut_rank                <dbl> 4, 2, 4, 10, 13, 10, 13, 9, 13, 4, 3, 15, 11…
## $ best_rank                 <dbl> 4.4958307, 0.7772916, 1.3199481, 3.8204547, …
## $ best                      <dbl> 0.0000000, 3.8501476, 1.7917595, 0.0000000, …
## $ total_weeks               <dbl> 1, 47, 6, 1, 4, 12, 4, 12, 5, 1, 5, 5, 12, 8…
## $ tfidf_title_11th          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_8th           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_a             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_acts          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_african       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_age           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_agreement     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_apeirogon     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_be            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_beautiful     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_believing     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.44…
## $ tfidf_title_bending       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_beryl         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_best          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_blinding      <dbl> 0.000000, 0.000000, 0.000000, 1.443578, 0.00…
## $ tfidf_title_blood         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_blue          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_bones         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_breath        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_cain          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_called        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_candy         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_caroline      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_case          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_cezanne       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_chasing       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_chemist       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cherie        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_circus        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_claiborne     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_company       <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_confession    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_cross         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_darkness      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_day           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dead          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_death         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_destinies     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dirty         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dolores       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_down          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_drop          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `tfidf_title_emperor's`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_every         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_fall          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_feel          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_finishing     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_five          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_flame         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_for           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_forward       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_foundation    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_freedom       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_fun           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_g             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_gave          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `tfidf_title_gentleman's` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_god           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_golden        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_gondolin      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_good          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_green         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_gumshoe       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_hideaway      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_home          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_honor         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_hour          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_house         <dbl> 0.0000000, 3.6506582, 0.0000000, 0.0000000, …
## $ tfidf_title_humans        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_hurricane     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_hurry         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_i             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_in            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_interest      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_involved      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_is            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_island        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_jitterbug     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_judas         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_knife         <dbl> 0.000000, 0.000000, 0.000000, 1.443578, 0.00…
## $ tfidf_title_lacuna        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_lady          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_land          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_last          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_lie           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 1.44…
## $ tfidf_title_london        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_lost          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_love          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_lucky         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_lucy          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_make          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_man           <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.00…
## $ `tfidf_title_markham's`   <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_masked        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_meadow        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_mountain      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_no            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_of            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_the           <dbl> 0.0000000, 0.0000000, 1.1451323, 0.3817108, …
## $ tfidf_title_to            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Set up the XGBoost model specification
xgboost_spec <- boost_tree(trees = tune(), min_n = tune(), learn_rate = tune()) %>%
    set_mode("regression") %>%
    set_engine("xgboost", case_weights = data_train$case_weight)

# Combine recipe and model using workflow
xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

# Tune hyperparameters
set.seed(344)
xgboost_tune <-
  tune_grid(xgboost_workflow,
            resamples = data_cv, 
            grid = 5)
## Warning: package 'xgboost' was built under R version 4.3.3

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 9
##   trees min_n learn_rate .metric .estimator  mean     n std_err .config         
##   <int> <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
## 1   114     2    0.0584  rmse    standard    1.19    10   0.699 Preprocessor1_M…
## 2   986    13    0.00657 rmse    standard    3.74    10   1.03  Preprocessor1_M…
## 3   779    23    0.00224 rmse    standard    4.10    10   1.33  Preprocessor1_M…
## 4  1986    32    0.0137  rmse    standard    5.34    10   1.07  Preprocessor1_M…
## 5  1427    38    0.153   rmse    standard    7.28    10   1.15  Preprocessor1_M…
# Update the model by selecting the best
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                         tune::select_best(xgboost_tune, metric = "rmse"))

# Fit the model on the entire training data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune:: collect_metrics(data_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       2.03  Preprocessor1_Model1
## 2 rsq     standard       0.956 Preprocessor1_Model1
tune:: collect_predictions(data_fit) %>%
    ggplot(aes(total_weeks, .pred)) +
    geom_point(alpha = 1, fill = "pink") +
    geom_abline(lty = 2, color = "purple") +
    coord_fixed()

Make Predictions

data %>%
    ggplot(aes(total_weeks)) +
    geom_histogram(bins = 20)

library(tidytext)

tidy_data <-
    data %>%
    unnest_tokens(name, title)

tidy_data %>%
    count(name, sort = TRUE)
## # A tibble: 218 × 2
##    name         n
##    <chr>    <int>
##  1 the         40
##  2 of           9
##  3 a            5
##  4 to           4
##  5 daughter     3
##  6 house        3
##  7 be           2
##  8 dead         2
##  9 for          2
## 10 in           2
## # ℹ 208 more rows
tidy_data %>%
    group_by(name) %>%
    summarise(n = n(),
              weeks = mean(total_weeks)) %>%
    ggplot(aes(n, weeks)) +
    geom_hline(yintercept = mean(data$total_weeks),
               lty = 2, color = "gray50", linewidth = 2) +
    geom_point(color = "midnightblue", alpha = 0.7) +
    geom_text(aes(label = name), check_overlap = TRUE, vjust = "top", hjust = "left") +
    scale_x_log10()

library(textrecipes)

data_recipe <- 
    recipe(formula = total_weeks ~ ., data = data_train) %>%
    recipes::update_role(id, new_role = "id") %>%
    step_zv(all_predictors()) %>%  # Add step_zv to remove zero-variance predictors
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_normalize(all_numeric_predictors())
ranger_spec <-
    rand_forest(trees = 500) %>%
    set_mode("regression")

ranger_spec
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 500
## 
## Computational engine: ranger
svm_spec <-
    svm_linear() %>%
    set_mode("regression")

svm_spec
## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR
ranger_wf <-  workflow(data_recipe, ranger_spec)
glm_wf <- workflow(data_recipe, svm_spec)
doParallel :: registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

glm_rs <- fit_resamples(
    glm_wf,
    resamples = data_cv,
    control = contrl_preds
)
## Warning: package 'LiblineaR' was built under R version 4.3.3
ranger_rs <- fit_resamples(
   ranger_wf,
   resamples = data_cv,
   control = contrl_preds
)
collect_metrics(glm_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.58     10  0.838  Preprocessor1_Model1
## 2 rsq     standard   0.749    10  0.0392 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.40     10  1.08   Preprocessor1_Model1
## 2 rsq     standard   0.852    10  0.0243 Preprocessor1_Model1
bind_rows(
   collect_predictions(glm_rs) %>%
       mutate(mod= "GLM"),
   collect_predictions(ranger_rs) %>%
       mutate(mod = "ranger")
) %>%
   ggplot(aes(place, .pred, color = id)) +
   geom_abline(lty = 2, color = "gray50", linewidth = 1.2) +
   facet_wrap(vars(mod)) +
   coord_fixed()

# Refitting the model with the updated workflow
final_fitted <- last_fit(glm_wf, data_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       4.34  Preprocessor1_Model1
## 2 rsq     standard       0.600 Preprocessor1_Model1
final_wf <- extract_workflow(final_fitted)
predict(final_wf, data_test[20,])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1 -1.66
library(tidymodels)

# Set up the recipe with step_novel() and step_zv()
#recipe <- recipe(total_weeks ~ ., data = data_train) %>%
    #step_zv(all_predictors()) %>%  # Remove zero variance predictors
    #step_tokenize(title) %>%        # Tokenize titles
    #step_tokenfilter(title, max_tokens = 100) %>%  # Filter tokens
    #step_tfidf(title) %>%           # Apply TF-IDF
   # step_novel(all_nominal_predictors()) %>%  # Handle new levels
    #step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%  # One-hot encoding
   # step_normalize(all_numeric_predictors())  # Normalize numeric predictors

# Set up the model specification (for example, a random forest)
#rf_spec <- rand_forest(trees = 500) %>%
    #set_mode("regression") %>%
    #set_engine("ranger")

# Combine recipe and model into a workflow
#rf_workflow <- workflow() %>%
    #add_recipe(recipe) %>%
    #add_model(rf_spec)

# Fit the model
#set.seed(123)
#rf_fit <- fit(rf_workflow, data = data_train)

# Make predictions on the test data
#rf_predictions <- predict(rf_fit, new_data = data_test)

# View predictions
#print(rf_predictions)
#final_datawf <- extract_workflow(final_fitted)
#predict(final_datawf, data_test[55,])
extract_workflow(final_fitted) %>%
    tidy() %>%
    filter(term != "Bias") %>%
    group_by(estimate > 0) %>%
    slice_max(abs(estimate), n = 10) %>%
    ungroup() %>%
    mutate(term = str_remove(term, "tf_author")) %>%
    ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
    geom_col(alpha = 0.8)

collect_metrics(glm_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.58     10  0.838  Preprocessor1_Model1
## 2 rsq     standard   0.749    10  0.0392 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   4.40     10  1.08   Preprocessor1_Model1
## 2 rsq     standard   0.852    10  0.0243 Preprocessor1_Model1