Goal: To predict total weeks on best sellers list

Click[here for the data] (‘https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv’)

nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')

## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): title, author
## dbl  (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(nyt_titles)

Data summary
Name	nyt_titles
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

data <- nyt_titles %>%
  
  # Treat missing values 
    # select(-id) %>%
    filter(!is.na(author)) %>%
    filter(total_weeks < 100) %>%
    mutate(total_weeks = log(total_weeks)) %>%
    mutate(decade = year %/% 10 * 10)

#Explore Data

Identify Good Predictors

best_rank

data%>%
  ggplot(aes(total_weeks, best_rank)) +
  scale_y_log10() +
  geom_point()

debut_rank

data%>%
  ggplot(aes(total_weeks, as.factor(debut_rank))) +
  geom_boxplot()

author

data %>%
    
    group_by(author) %>%
    summarise(total_weeks_avg = mean(total_weeks)) %>% ungroup() %>%
    
    slice_max(order_by = total_weeks_avg, n = 20) %>%

    ggplot(aes(total_weeks_avg, fct_reorder(author, total_weeks_avg))) +
    geom_col() +

labs(title = "Best Author by Total Weeks", y = NULL)

Title

data %>%
    
    #tokenize title
    unnest_tokens(output = word, input = title) %>%
    
    #calculate avg rent per word
    group_by(word) %>%
    summarise(total_weeks = mean(total_weeks),
              n     = n()) %>%
    ungroup() %>%
    
    filter(n > 10, !str_detect(word, "\\a")) %>%
    slice_max(order_by = total_weeks, n = 20) %>%
    
    #plot
    ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
    geom_point() +
    
    labs(y = "Words in Title")

EDA shortcut

# step 1 - prepare data
data_binarized_tbl <- data %>%
  select(-author, -title, -first_week) %>%
  binarize()

data_binarized_tbl %>% glimpse()

## Rows: 7,415
## Columns: 24
## $ `id__-Inf_1858.5`                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ id__1858.5_3715                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__3715_5574.5                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__5574.5_Inf                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `year__-Inf_1968`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000                                 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf                                  <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945`           <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405  <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf               <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4`                            <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8                                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12                                <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf                              <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3`                             <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_11                                 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__11_Inf                               <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `decade__-Inf_1960`                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ decade__1960_2000                               <dbl> 1, 1, 1, 0, 1, 0, 1, 1…
## $ decade__2000_2010                               <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ decade__2010_Inf                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0…

# step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
  correlate(total_weeks__2.30258509299405_Inf)

data_corr_tbl

## # A tibble: 24 × 3
##    feature     bin                                correlation
##    <fct>       <chr>                                    <dbl>
##  1 total_weeks 2.30258509299405_Inf                     1    
##  2 total_weeks -Inf_0.693147180559945                  -0.396
##  3 best_rank   -Inf_3                                   0.341
##  4 total_weeks 1.38629436111989_2.30258509299405       -0.322
##  5 best_rank   11_Inf                                  -0.283
##  6 decade      2000_2010                               -0.261
##  7 total_weeks 0.693147180559945_1.38629436111989      -0.256
##  8 year        1968_2000                                0.243
##  9 year        2011_Inf                                -0.235
## 10 year        2000_2011                               -0.231
## # ℹ 14 more rows

# Step 3: Plot
data_corr_tbl %>%
  plot_correlation_funnel()

Build Models

Split Data

 data <- sample_n(data, 100)

#Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further Split training dataset for cross-varification
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10

library(textrecipes)

total_rec <- 
   recipe(formula = total_weeks ~ ., data = data_train) %>% 
   recipes::update_role(id, new_role = "id variable") %>%
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  step_other(author) %>%
  step_date(first_week, keep_original_cols = FALSE) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_log(best_rank)

total_rec %>% prep() %>% juice() %>% glimpse()

## Rows: 75
## Columns: 128
## $ id                            <dbl> 6395, 1113, 6797, 870, 2132, 5842, 839, …
## $ year                          <dbl> 2002, 1973, 1985, 1969, 1932, 2020, 1998…
## $ debut_rank                    <dbl> 15, 9, 3, 15, 3, 14, 3, 2, 15, 15, 13, 9…
## $ best_rank                     <dbl> 1.6094379, 0.6931472, 0.0000000, 2.07944…
## $ decade                        <dbl> 2000, 1970, 1980, 1960, 1930, 2020, 1990…
## $ total_weeks                   <dbl> 1.6094379, 3.2580965, 3.2188758, 2.07944…
## $ tfidf_title_1956              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_a                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_about             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_according         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_all               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_am                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_ambassador        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_and               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_art               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_bad               <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_beach             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_before            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bitter            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_black             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_bourne            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_boys              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_bridget           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_bullet            <dbl> 0.000000, 0.000000, 0.000000, 4.330733, …
## $ tfidf_title_came              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_carnal            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cheerful          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_chocolate         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_christmas         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_city              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_come              <dbl> 0.000000, 4.330733, 0.000000, 0.000000, …
## $ tfidf_title_company           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cordura           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_corner            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_crows             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cupboard          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_curse             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_dark              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_darling           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_death             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_deception         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_desire            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_diary             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_downhill          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_dream             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_edge              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_end               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_english           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_eve               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_eyes              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_fatal             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_feared            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_festive           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_fireman           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_floods            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_floor             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_for               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_fortieth          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_friends           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_from              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_full              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_game              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_garden            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_garp              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_girl              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_glass             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_golden            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_goodbye           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_got               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_guy               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hausfrau          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_head              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_heart             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_heaven            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_her               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_here              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hollywood         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_holy              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_honour            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hope              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hotel             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_i                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tfidf_title_i've`            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_in                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_innocence         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_insight           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tfidf_title_it's`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tfidf_title_jones's`         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_king              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_kiss              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_ladies            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_lake              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_leave             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_life              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_line              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_lovers            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_my                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_not               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_of                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_on                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_son               <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_the               <dbl> 1.2070929, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_tide              <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_to                <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_white             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_you               <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ first_week_year               <int> 2002, 1973, 1985, 1969, 1932, 2020, 1998…
## $ author_Alexander.McCall.Smith <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ author_other                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ first_week_dow_Sun            <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ first_week_dow_Mon            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Tue            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Wed            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Thu            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Fri            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Sat            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Jan          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Feb          <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0…
## $ first_week_month_Mar          <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1…
## $ first_week_month_Apr          <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ first_week_month_May          <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Jun          <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Jul          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Aug          <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ first_week_month_Sep          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ first_week_month_Oct          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Nov          <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Dec          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

#Let’s Create a Model Specification

ranger_recipe <- 
  recipe(formula = total_weeks ~ ., data = data_train) 

ranger_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 75
## Columns: 9
## $ id          <dbl> 6395, 1113, 6797, 870, 2132, 5842, 839, 2123, 4336, 5880, …
## $ title       <fct> "THE STONE MONKEY", "COME NINEVEH, COME TYRE", "THINNER", …
## $ author      <fct> Jeffery Deaver, Allen Drury, Richard Bachman, John Cheever…
## $ year        <dbl> 2002, 1973, 1985, 1969, 1932, 2020, 1998, 2015, 1959, 2019…
## $ first_week  <date> 2002-03-31, 1973-11-25, 1985-03-03, 1969-05-18, 1932-08-1…
## $ debut_rank  <dbl> 15, 9, 3, 15, 3, 14, 3, 2, 15, 15, 13, 9, 11, 1, 16, 14, 3…
## $ best_rank   <dbl> 5, 2, 1, 8, 3, 5, 3, 16, 14, 3, 3, 14, 10, 3, 1, 7, 6, 15,…
## $ decade      <dbl> 2000, 1970, 1980, 1960, 1930, 2020, 1990, 2010, 1950, 2010…
## $ total_weeks <dbl> 1.6094379, 3.2580965, 3.2188758, 2.0794415, 0.0000000, 0.0…

ranger_spec <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 500) %>% 
  set_mode("regression") %>% 
  set_engine("ranger") 

ranger_workflow <- 
  workflow() %>% 
  add_recipe(ranger_recipe) %>% 
  add_model(ranger_spec) 

svm_spec <-
  svm_linear() %>%
  set_mode("regression")

svm_spec

## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR

ranger_wf <- workflow(total_rec, ranger_spec)
svm_wf <- workflow(total_rec, svm_spec)

doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

svm_rs <- tune_grid(
  svm_wf,
  resamples = data_cv,
  grid = 5
)

## Warning: No tuning parameters have been detected, performance will be evaluated
## using the resamples with no tuning. Did you want to [tune()] parameters?

## Warning: package 'LiblineaR' was built under R version 4.2.3

ranger_rs <- tune_grid(
  ranger_wf,
  resamples = data_cv,
  grid = 5
)

## i Creating pre-processing data to finalize unknown parameter: mtry

## Warning: package 'ranger' was built under R version 4.2.3

How did these two models compare

collect_metrics(svm_rs)

## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1.00     10  0.0697 Preprocessor1_Model1
## 2 rsq     standard   0.230    10  0.0945 Preprocessor1_Model1

collect_metrics(ranger_rs)

## # A tibble: 10 × 8
##     mtry min_n .metric .estimator  mean     n std_err .config             
##    <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
##  1    94     8 rmse    standard   0.716    10  0.0669 Preprocessor1_Model1
##  2    94     8 rsq     standard   0.495    10  0.0734 Preprocessor1_Model1
##  3    37    35 rmse    standard   0.813    10  0.0764 Preprocessor1_Model2
##  4    37    35 rsq     standard   0.419    10  0.0843 Preprocessor1_Model2
##  5   109    32 rmse    standard   0.771    10  0.0632 Preprocessor1_Model3
##  6   109    32 rsq     standard   0.427    10  0.0769 Preprocessor1_Model3
##  7    17    18 rmse    standard   0.801    10  0.0804 Preprocessor1_Model4
##  8    17    18 rsq     standard   0.456    10  0.0884 Preprocessor1_Model4
##  9    58    14 rmse    standard   0.720    10  0.0720 Preprocessor1_Model5
## 10    58    14 rsq     standard   0.493    10  0.0803 Preprocessor1_Model5

We can visualize these results by comparing the predicted rating with the true rating:

Evaluate Models

tune::show_best(ranger_rs, metric = "rmse")

## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config             
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1    94     8 rmse    standard   0.716    10  0.0669 Preprocessor1_Model1
## 2    58    14 rmse    standard   0.720    10  0.0720 Preprocessor1_Model5
## 3   109    32 rmse    standard   0.771    10  0.0632 Preprocessor1_Model3
## 4    17    18 rmse    standard   0.801    10  0.0804 Preprocessor1_Model4
## 5    37    35 rmse    standard   0.813    10  0.0764 Preprocessor1_Model2

# update the model by selecting best hyperparameters.
ranger_fw <- tune::finalize_workflow(ranger_wf,
                        tune::select_best(ranger_rs, metric = "rmse" ))

# fit the model on the entire training data and test it on the test data
data_total <- tune::last_fit(ranger_fw, data_split)

tune::collect_metrics(data_total)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.603 Preprocessor1_Model1
## 2 rsq     standard       0.552 Preprocessor1_Model1

tune::collect_predictions(data_total) %>%
  ggplot(aes(total_weeks, .pred)) +
  geom_point(alpha = 0.3, fill = "midnightblue") +
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()

What changes were made and why?

I decided to replace the xgboost function from Apply 3 and rather incorporate the ranger function from this past weeks Code Along 4. In using ranger, I found a faster more effecient function that handled my large data set easier than xgboost. The evaluation was easier as ranger function created a more simplified model. More specifically the biggest change I have noticed was the ggplot at the completion of my data. This model is easier to articulate compared to the one created with xgboost. I found using a random forest model was an easier predictor than xgboost. These changes affect the rmse and rsq estimates by raising the rmse to 0.671 from 0.584 and lowered the rsq from 0.727 to 0.641.

Apply 4: nyt_titles

Reed Wilson

2024-02-29

Build Models

Evaluate Models

What changes were made and why?