Goal: To predict total weeks on best sellers list

Click[here for the data] (‘https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv’)

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
Data summary
Name nyt_titles
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂
data <- nyt_titles %>%
  
  # Treat missing values 
    # select(-id) %>%
    filter(!is.na(author)) %>%
    filter(total_weeks < 100) %>%
    mutate(total_weeks = log(total_weeks)) %>%
    mutate(decade = year %/% 10 * 10)

#Explore Data

Identify Good Predictors

best_rank

data%>%
  ggplot(aes(total_weeks, best_rank)) +
  scale_y_log10() +
  geom_point()

debut_rank

data%>%
  ggplot(aes(total_weeks, as.factor(debut_rank))) +
  geom_boxplot()

author

data %>%
    
    group_by(author) %>%
    summarise(total_weeks_avg = mean(total_weeks)) %>% ungroup() %>%
    
    slice_max(order_by = total_weeks_avg, n = 20) %>%

    ggplot(aes(total_weeks_avg, fct_reorder(author, total_weeks_avg))) +
    geom_col() +

labs(title = "Best Author by Total Weeks", y = NULL)

Title

data %>%
    
    #tokenize title
    unnest_tokens(output = word, input = title) %>%
    
    #calculate avg rent per word
    group_by(word) %>%
    summarise(total_weeks = mean(total_weeks),
              n     = n()) %>%
    ungroup() %>%
    
    filter(n > 10, !str_detect(word, "\\a")) %>%
    slice_max(order_by = total_weeks, n = 20) %>%
    
    #plot
    ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
    geom_point() +
    
    labs(y = "Words in Title")

EDA shortcut

# step 1 - prepare data
data_binarized_tbl <- data %>%
  select(-author, -title, -first_week) %>%
  binarize()

data_binarized_tbl %>% glimpse()
## Rows: 7,415
## Columns: 24
## $ `id__-Inf_1858.5`                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ id__1858.5_3715                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__3715_5574.5                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__5574.5_Inf                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `year__-Inf_1968`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4`                            <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_11                                 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__11_Inf                               <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `decade__-Inf_1960`                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ decade__1960_2000                               <dbl> 1, 1, 1, 0, 1, 0, 1, 1…
## $ decade__2000_2010                               <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ decade__2010_Inf                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
# step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
  correlate(total_weeks__2.30258509299405_Inf)

data_corr_tbl
## # A tibble: 24 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 2.30258509299405_Inf                     1    
##  2 total_weeks -Inf_0.693147180559945                  -0.396
##  3 best_rank   -Inf_3                                   0.341
##  4 total_weeks 1.38629436111989_2.30258509299405       -0.322
##  5 best_rank   11_Inf                                  -0.283
##  6 decade      2000_2010                               -0.261
##  7 total_weeks 0.693147180559945_1.38629436111989      -0.256
##  8 year        1968_2000                                0.243
##  9 year        2011_Inf                                -0.235
## 10 year        2000_2011                               -0.231
## # ℹ 14 more rows
# Step 3: Plot
data_corr_tbl %>%
  plot_correlation_funnel()

Build Models

Split Data

 data <- sample_n(data, 100)

#Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further Split training dataset for cross-varification
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(textrecipes)

total_rec <- 
   recipe(formula = total_weeks ~ ., data = data_train) %>% 
   recipes::update_role(id, new_role = "id variable") %>%
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  step_other(author) %>%
  step_date(first_week, keep_original_cols = FALSE) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_log(best_rank)

total_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 75
## Columns: 128
## $ id                            <dbl> 6395, 1113, 6797, 870, 2132, 5842, 839, …
## $ year                          <dbl> 2002, 1973, 1985, 1969, 1932, 2020, 1998…
## $ debut_rank                    <dbl> 15, 9, 3, 15, 3, 14, 3, 2, 15, 15, 13, 9…
## $ best_rank                     <dbl> 1.6094379, 0.6931472, 0.0000000, 2.07944…
## $ decade                        <dbl> 2000, 1970, 1980, 1960, 1930, 2020, 1990…
## $ total_weeks                   <dbl> 1.6094379, 3.2580965, 3.2188758, 2.07944…
## $ tfidf_title_1956              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_a                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_about             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_according         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_all               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_am                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_ambassador        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_and               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_art               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_bad               <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_beach             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_before            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bitter            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_black             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_bourne            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_boys              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_bridget           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_bullet            <dbl> 0.000000, 0.000000, 0.000000, 4.330733, …
## $ tfidf_title_came              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_carnal            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cheerful          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_chocolate         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_christmas         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_city              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_come              <dbl> 0.000000, 4.330733, 0.000000, 0.000000, …
## $ tfidf_title_company           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cordura           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_corner            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_crows             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cupboard          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_curse             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_dark              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_darling           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_death             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_deception         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_desire            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_diary             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_downhill          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_dream             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_edge              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_end               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_english           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_eve               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_eyes              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_fatal             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_feared            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_festive           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_fireman           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_floods            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_floor             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_for               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_fortieth          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_friends           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_from              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_full              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_game              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_garden            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_garp              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_girl              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_glass             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_golden            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_goodbye           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_got               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_guy               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hausfrau          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_head              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_heart             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_heaven            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_her               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_here              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hollywood         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_holy              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_honour            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hope              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hotel             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_i                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tfidf_title_i've`            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_in                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_innocence         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_insight           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tfidf_title_it's`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tfidf_title_jones's`         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_king              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_kiss              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_ladies            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_lake              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_leave             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_life              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_line              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_lovers            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_my                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_not               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_of                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_on                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_son               <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_the               <dbl> 1.2070929, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_tide              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_to                <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_white             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_you               <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ first_week_year               <int> 2002, 1973, 1985, 1969, 1932, 2020, 1998…
## $ author_Alexander.McCall.Smith <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ author_other                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ first_week_dow_Sun            <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ first_week_dow_Mon            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Tue            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Wed            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Thu            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Fri            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Sat            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Jan          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Feb          <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0…
## $ first_week_month_Mar          <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1…
## $ first_week_month_Apr          <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ first_week_month_May          <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Jun          <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Jul          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Aug          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ first_week_month_Sep          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ first_week_month_Oct          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Nov          <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Dec          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

#Let’s Create a Model Specification

ranger_recipe <- 
  recipe(formula = total_weeks ~ ., data = data_train) 

ranger_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 75
## Columns: 9
## $ id          <dbl> 6395, 1113, 6797, 870, 2132, 5842, 839, 2123, 4336, 5880, …
## $ title       <fct> "THE STONE MONKEY", "COME NINEVEH, COME TYRE", "THINNER", …
## $ author      <fct> Jeffery Deaver, Allen Drury, Richard Bachman, John Cheever…
## $ year        <dbl> 2002, 1973, 1985, 1969, 1932, 2020, 1998, 2015, 1959, 2019…
## $ first_week  <date> 2002-03-31, 1973-11-25, 1985-03-03, 1969-05-18, 1932-08-1…
## $ debut_rank  <dbl> 15, 9, 3, 15, 3, 14, 3, 2, 15, 15, 13, 9, 11, 1, 16, 14, 3…
## $ best_rank   <dbl> 5, 2, 1, 8, 3, 5, 3, 16, 14, 3, 3, 14, 10, 3, 1, 7, 6, 15,…
## $ decade      <dbl> 2000, 1970, 1980, 1960, 1930, 2020, 1990, 2010, 1950, 2010…
## $ total_weeks <dbl> 1.6094379, 3.2580965, 3.2188758, 2.0794415, 0.0000000, 0.0…
ranger_spec <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 500) %>% 
  set_mode("regression") %>% 
  set_engine("ranger") 

ranger_workflow <- 
  workflow() %>% 
  add_recipe(ranger_recipe) %>% 
  add_model(ranger_spec) 

svm_spec <-
  svm_linear() %>%
  set_mode("regression")

svm_spec
## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR
ranger_wf <- workflow(total_rec, ranger_spec)
svm_wf <- workflow(total_rec, svm_spec)
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

svm_rs <- tune_grid(
  svm_wf,
  resamples = data_cv,
  grid = 5
)
## Warning: No tuning parameters have been detected, performance will be evaluated
## using the resamples with no tuning. Did you want to [tune()] parameters?
## Warning: package 'LiblineaR' was built under R version 4.2.3
ranger_rs <- tune_grid(
  ranger_wf,
  resamples = data_cv,
  grid = 5
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: package 'ranger' was built under R version 4.2.3

How did these two models compare

collect_metrics(svm_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1.00     10  0.0697 Preprocessor1_Model1
## 2 rsq     standard   0.230    10  0.0945 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 10 × 8
##     mtry min_n .metric .estimator  mean     n std_err .config             
##    <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
##  1    94     8 rmse    standard   0.716    10  0.0669 Preprocessor1_Model1
##  2    94     8 rsq     standard   0.495    10  0.0734 Preprocessor1_Model1
##  3    37    35 rmse    standard   0.813    10  0.0764 Preprocessor1_Model2
##  4    37    35 rsq     standard   0.419    10  0.0843 Preprocessor1_Model2
##  5   109    32 rmse    standard   0.771    10  0.0632 Preprocessor1_Model3
##  6   109    32 rsq     standard   0.427    10  0.0769 Preprocessor1_Model3
##  7    17    18 rmse    standard   0.801    10  0.0804 Preprocessor1_Model4
##  8    17    18 rsq     standard   0.456    10  0.0884 Preprocessor1_Model4
##  9    58    14 rmse    standard   0.720    10  0.0720 Preprocessor1_Model5
## 10    58    14 rsq     standard   0.493    10  0.0803 Preprocessor1_Model5

We can visualize these results by comparing the predicted rating with the true rating:

Evaluate Models

tune::show_best(ranger_rs, metric = "rmse")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config             
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1    94     8 rmse    standard   0.716    10  0.0669 Preprocessor1_Model1
## 2    58    14 rmse    standard   0.720    10  0.0720 Preprocessor1_Model5
## 3   109    32 rmse    standard   0.771    10  0.0632 Preprocessor1_Model3
## 4    17    18 rmse    standard   0.801    10  0.0804 Preprocessor1_Model4
## 5    37    35 rmse    standard   0.813    10  0.0764 Preprocessor1_Model2
# update the model by selecting best hyperparameters.
ranger_fw <- tune::finalize_workflow(ranger_wf,
                        tune::select_best(ranger_rs, metric = "rmse" ))

# fit the model on the entire training data and test it on the test data
data_total <- tune::last_fit(ranger_fw, data_split)
tune::collect_metrics(data_total)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.603 Preprocessor1_Model1
## 2 rsq     standard       0.552 Preprocessor1_Model1
tune::collect_predictions(data_total) %>%
  ggplot(aes(total_weeks, .pred)) +
  geom_point(alpha = 0.3, fill = "midnightblue") +
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()

What changes were made and why?

I decided to replace the xgboost function from Apply 3 and rather incorporate the ranger function from this past weeks Code Along 4. In using ranger, I found a faster more effecient function that handled my large data set easier than xgboost. The evaluation was easier as ranger function created a more simplified model. More specifically the biggest change I have noticed was the ggplot at the completion of my data. This model is easier to articulate compared to the one created with xgboost. I found using a random forest model was an easier predictor than xgboost. These changes affect the rmse and rsq estimates by raising the rmse to 0.671 from 0.584 and lowered the rsq from 0.727 to 0.641.