Goal: To predict total weeks on best sellers list
Click[here for the data] (‘https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv’)
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): title, author
## dbl (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt_titles)
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
# Treat missing values
# select(-id) %>%
filter(!is.na(author)) %>%
filter(total_weeks < 100) %>%
mutate(total_weeks = log(total_weeks)) %>%
mutate(decade = year %/% 10 * 10)
#Explore Data
Identify Good Predictors
best_rank
data%>%
ggplot(aes(total_weeks, best_rank)) +
scale_y_log10() +
geom_point()
debut_rank
data%>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
author
data %>%
group_by(author) %>%
summarise(total_weeks_avg = mean(total_weeks)) %>% ungroup() %>%
slice_max(order_by = total_weeks_avg, n = 20) %>%
ggplot(aes(total_weeks_avg, fct_reorder(author, total_weeks_avg))) +
geom_col() +
labs(title = "Best Author by Total Weeks", y = NULL)
Title
data %>%
#tokenize title
unnest_tokens(output = word, input = title) %>%
#calculate avg rent per word
group_by(word) %>%
summarise(total_weeks = mean(total_weeks),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(word, "\\a")) %>%
slice_max(order_by = total_weeks, n = 20) %>%
#plot
ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
geom_point() +
labs(y = "Words in Title")
EDA shortcut
# step 1 - prepare data
data_binarized_tbl <- data %>%
select(-author, -title, -first_week) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,415
## Columns: 24
## $ `id__-Inf_1858.5` <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ id__1858.5_3715 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__3715_5574.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__5574.5_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_11 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__11_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `decade__-Inf_1960` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ decade__1960_2000 <dbl> 1, 1, 1, 0, 1, 0, 1, 1…
## $ decade__2000_2010 <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ decade__2010_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
# step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(total_weeks__2.30258509299405_Inf)
data_corr_tbl
## # A tibble: 24 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 total_weeks 2.30258509299405_Inf 1
## 2 total_weeks -Inf_0.693147180559945 -0.396
## 3 best_rank -Inf_3 0.341
## 4 total_weeks 1.38629436111989_2.30258509299405 -0.322
## 5 best_rank 11_Inf -0.283
## 6 decade 2000_2010 -0.261
## 7 total_weeks 0.693147180559945_1.38629436111989 -0.256
## 8 year 1968_2000 0.243
## 9 year 2011_Inf -0.235
## 10 year 2000_2011 -0.231
## # ℹ 14 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
Split Data
data <- sample_n(data, 100)
#Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further Split training dataset for cross-varification
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(textrecipes)
total_rec <-
recipe(formula = total_weeks ~ ., data = data_train) %>%
recipes::update_role(id, new_role = "id variable") %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tfidf(title) %>%
step_other(author) %>%
step_date(first_week, keep_original_cols = FALSE) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_log(best_rank)
total_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 75
## Columns: 128
## $ id <dbl> 6395, 1113, 6797, 870, 2132, 5842, 839, …
## $ year <dbl> 2002, 1973, 1985, 1969, 1932, 2020, 1998…
## $ debut_rank <dbl> 15, 9, 3, 15, 3, 14, 3, 2, 15, 15, 13, 9…
## $ best_rank <dbl> 1.6094379, 0.6931472, 0.0000000, 2.07944…
## $ decade <dbl> 2000, 1970, 1980, 1960, 1930, 2020, 1990…
## $ total_weeks <dbl> 1.6094379, 3.2580965, 3.2188758, 2.07944…
## $ tfidf_title_1956 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_a <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_about <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_according <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_all <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_am <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_ambassador <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_and <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_art <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_bad <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_beach <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_before <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bitter <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_black <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_bourne <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_boys <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_bridget <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_bullet <dbl> 0.000000, 0.000000, 0.000000, 4.330733, …
## $ tfidf_title_came <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_carnal <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cheerful <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_chocolate <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_christmas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_city <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_come <dbl> 0.000000, 4.330733, 0.000000, 0.000000, …
## $ tfidf_title_company <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cordura <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_corner <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_crows <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cupboard <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_curse <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_dark <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_darling <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_death <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_deception <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_desire <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_diary <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_downhill <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_dream <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_edge <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_end <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_english <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_eve <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_eyes <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_fatal <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_feared <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_festive <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_fireman <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_floods <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_floor <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_for <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_fortieth <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_friends <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_from <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_full <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_game <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_garden <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_garp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_girl <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_glass <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_golden <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_goodbye <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_got <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_guy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hausfrau <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_head <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_heart <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_heaven <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_her <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_here <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hollywood <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_holy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_honour <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hope <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hotel <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_i <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tfidf_title_i've` <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_in <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_innocence <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_insight <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tfidf_title_it's` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `tfidf_title_jones's` <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_king <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_kiss <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_ladies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_lake <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_leave <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_life <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_line <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_lovers <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_my <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_not <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_of <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_on <dbl> 0.0000000, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_son <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_the <dbl> 1.2070929, 0.0000000, 0.0000000, 0.00000…
## $ tfidf_title_tide <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_to <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_white <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ tfidf_title_you <dbl> 0.000000, 0.000000, 0.000000, 0.000000, …
## $ first_week_year <int> 2002, 1973, 1985, 1969, 1932, 2020, 1998…
## $ author_Alexander.McCall.Smith <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ author_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ first_week_dow_Sun <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ first_week_dow_Mon <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Tue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Wed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Thu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Fri <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_dow_Sat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Jan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Feb <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0…
## $ first_week_month_Mar <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1…
## $ first_week_month_Apr <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ first_week_month_May <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Jun <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Jul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Aug <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ first_week_month_Sep <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ first_week_month_Oct <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Nov <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ first_week_month_Dec <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#Let’s Create a Model Specification
ranger_recipe <-
recipe(formula = total_weeks ~ ., data = data_train)
ranger_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 75
## Columns: 9
## $ id <dbl> 6395, 1113, 6797, 870, 2132, 5842, 839, 2123, 4336, 5880, …
## $ title <fct> "THE STONE MONKEY", "COME NINEVEH, COME TYRE", "THINNER", …
## $ author <fct> Jeffery Deaver, Allen Drury, Richard Bachman, John Cheever…
## $ year <dbl> 2002, 1973, 1985, 1969, 1932, 2020, 1998, 2015, 1959, 2019…
## $ first_week <date> 2002-03-31, 1973-11-25, 1985-03-03, 1969-05-18, 1932-08-1…
## $ debut_rank <dbl> 15, 9, 3, 15, 3, 14, 3, 2, 15, 15, 13, 9, 11, 1, 16, 14, 3…
## $ best_rank <dbl> 5, 2, 1, 8, 3, 5, 3, 16, 14, 3, 3, 14, 10, 3, 1, 7, 6, 15,…
## $ decade <dbl> 2000, 1970, 1980, 1960, 1930, 2020, 1990, 2010, 1950, 2010…
## $ total_weeks <dbl> 1.6094379, 3.2580965, 3.2188758, 2.0794415, 0.0000000, 0.0…
ranger_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 500) %>%
set_mode("regression") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
svm_spec <-
svm_linear() %>%
set_mode("regression")
svm_spec
## Linear Support Vector Machine Model Specification (regression)
##
## Computational engine: LiblineaR
ranger_wf <- workflow(total_rec, ranger_spec)
svm_wf <- workflow(total_rec, svm_spec)
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)
svm_rs <- tune_grid(
svm_wf,
resamples = data_cv,
grid = 5
)
## Warning: No tuning parameters have been detected, performance will be evaluated
## using the resamples with no tuning. Did you want to [tune()] parameters?
## Warning: package 'LiblineaR' was built under R version 4.2.3
ranger_rs <- tune_grid(
ranger_wf,
resamples = data_cv,
grid = 5
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning: package 'ranger' was built under R version 4.2.3
How did these two models compare
collect_metrics(svm_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1.00 10 0.0697 Preprocessor1_Model1
## 2 rsq standard 0.230 10 0.0945 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 10 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 94 8 rmse standard 0.716 10 0.0669 Preprocessor1_Model1
## 2 94 8 rsq standard 0.495 10 0.0734 Preprocessor1_Model1
## 3 37 35 rmse standard 0.813 10 0.0764 Preprocessor1_Model2
## 4 37 35 rsq standard 0.419 10 0.0843 Preprocessor1_Model2
## 5 109 32 rmse standard 0.771 10 0.0632 Preprocessor1_Model3
## 6 109 32 rsq standard 0.427 10 0.0769 Preprocessor1_Model3
## 7 17 18 rmse standard 0.801 10 0.0804 Preprocessor1_Model4
## 8 17 18 rsq standard 0.456 10 0.0884 Preprocessor1_Model4
## 9 58 14 rmse standard 0.720 10 0.0720 Preprocessor1_Model5
## 10 58 14 rsq standard 0.493 10 0.0803 Preprocessor1_Model5
We can visualize these results by comparing the predicted rating with the true rating:
tune::show_best(ranger_rs, metric = "rmse")
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 94 8 rmse standard 0.716 10 0.0669 Preprocessor1_Model1
## 2 58 14 rmse standard 0.720 10 0.0720 Preprocessor1_Model5
## 3 109 32 rmse standard 0.771 10 0.0632 Preprocessor1_Model3
## 4 17 18 rmse standard 0.801 10 0.0804 Preprocessor1_Model4
## 5 37 35 rmse standard 0.813 10 0.0764 Preprocessor1_Model2
# update the model by selecting best hyperparameters.
ranger_fw <- tune::finalize_workflow(ranger_wf,
tune::select_best(ranger_rs, metric = "rmse" ))
# fit the model on the entire training data and test it on the test data
data_total <- tune::last_fit(ranger_fw, data_split)
tune::collect_metrics(data_total)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.603 Preprocessor1_Model1
## 2 rsq standard 0.552 Preprocessor1_Model1
tune::collect_predictions(data_total) %>%
ggplot(aes(total_weeks, .pred)) +
geom_point(alpha = 0.3, fill = "midnightblue") +
geom_abline(lty = 2, color = "gray50") +
coord_fixed()
I decided to replace the xgboost function from Apply 3 and rather incorporate the ranger function from this past weeks Code Along 4. In using ranger, I found a faster more effecient function that handled my large data set easier than xgboost. The evaluation was easier as ranger function created a more simplified model. More specifically the biggest change I have noticed was the ggplot at the completion of my data. This model is easier to articulate compared to the one created with xgboost. I found using a random forest model was an easier predictor than xgboost. These changes affect the rmse and rsq estimates by raising the rmse to 0.671 from 0.584 and lowered the rsq from 0.727 to 0.641.