knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.2
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 4.3.2
## ══ correlationfunnel Tip #1 ════════════════════════════════════════════════════
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)
library(textrecipes)
## Warning: package 'textrecipes' was built under R version 4.3.2
## Loading required package: recipes
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.6 ✔ workflows 1.1.3
## ✔ modeldata 1.3.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.3.0
## Warning: package 'dials' was built under R version 4.3.2
## Warning: package 'scales' was built under R version 4.3.2
## Warning: package 'infer' was built under R version 4.3.2
## Warning: package 'modeldata' was built under R version 4.3.2
## Warning: package 'parsnip' was built under R version 4.3.2
## Warning: package 'tune' was built under R version 4.3.2
## Warning: package 'workflows' was built under R version 4.3.2
## Warning: package 'workflowsets' was built under R version 4.3.2
## Warning: package 'yardstick' was built under R version 4.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.2
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(embed)
## Warning: package 'embed' was built under R version 4.3.2
library(ggplot2)
Goal: to predict total weeks on best sellers list (total_weeks) Click here for the data.
nyt <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
## Rows: 7431 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): title, author
## dbl (5): id, year, total_weeks, debut_rank, best_rank
## date (1): first_week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(nyt)
| Name | nyt |
| Number of rows | 7431 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
| author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
| year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
| total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
| debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
| best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt %>%
# Treat missing values
select(-first_week) %>%
filter(!is.na(author)) %>%
filter(total_weeks < 200) %>%
mutate(total_weeks = log(total_weeks)) %>%
mutate(decade = year %/% 10 * 10)
Identify good predictors.
debut_rank
data %>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
best_rank
data %>%
ggplot(aes(total_weeks, as.factor(best_rank))) +
scale_x_log10() +
geom_boxplot()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 1684 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
author
data %>%
group_by(author) %>%
summarise(total_weeks_avg = mean(total_weeks)) %>% ungroup() %>%
slice_max(order_by = total_weeks_avg, n = 10) %>%
ggplot(aes(total_weeks_avg, fct_reorder(author, total_weeks_avg))) +
geom_col() +
labs(title = "Best Author by Total Weeks", y = NULL)
Words in title
data %>%
#tokenize title
unnest_tokens(output = word, input = title) %>%
#calculate avg rent per word
group_by(word) %>%
summarise(total_weeks = mean(total_weeks),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(word, "\\a")) %>%
slice_max(order_by = total_weeks, n = 20) %>%
#plot
ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
geom_point() +
labs(y = "Words in Title")
EDA shortcut
# Step 1: Prepare data
data_binarized_tbl <- data %>%
select(-title, -author) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,427
## Columns: 24
## $ `id__-Inf_1858.5` <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ id__1858.5_3717 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__3717_5573.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__5573.5_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ best_rank__6_10.5 <dbl> 0, 0, 1, 0, 0, 1, 0, 1…
## $ best_rank__10.5_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `decade__-Inf_1960` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ decade__1960_2000 <dbl> 1, 1, 1, 0, 1, 0, 1, 1…
## $ decade__2000_2010 <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ decade__2010_Inf <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(best_rank__10.5_Inf)
data_corr_tbl
## # A tibble: 24 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 best_rank 10.5_Inf 1
## 2 total_weeks -Inf_0.693147180559945 0.469
## 3 best_rank -Inf_3 -0.381
## 4 best_rank 6_10.5 -0.317
## 5 total_weeks 2.30258509299405_Inf -0.314
## 6 best_rank 3_6 -0.302
## 7 total_weeks 1.38629436111989_2.30258509299405 -0.197
## 8 debut_rank 12_Inf 0.0900
## 9 debut_rank 4_8 -0.0579
## 10 year 1968_2000 -0.0229
## # ℹ 14 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
Split Data
# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data, strata = total_weeks)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [5012/557]> Fold01
## 2 <split [5012/557]> Fold02
## 3 <split [5012/557]> Fold03
## 4 <split [5012/557]> Fold04
## 5 <split [5012/557]> Fold05
## 6 <split [5012/557]> Fold06
## 7 <split [5012/557]> Fold07
## 8 <split [5012/557]> Fold08
## 9 <split [5012/557]> Fold09
## 10 <split [5013/556]> Fold10
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.3.2
usemodels::use_xgboost(total_weeks ~., data = data_train)
## xgboost_recipe <-
## recipe(formula = total_weeks ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(41811)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <- recipe(formula = total_weeks ~., data = data_train) %>%
recipes::update_role(id, new_role = "id variable") %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tf(title) %>%
step_other(author) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_normalize(year) %>%
step_log(best_rank)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 5,569
## Columns: 108
## $ id <dbl> 100, 1000, 1005, 1007, 1011, 1012, 1014, 1021, 1…
## $ year <dbl> 0.852199469, 0.622342321, 0.354175649, 0.6989613…
## $ debut_rank <dbl> 3, 11, 12, 5, 2, 14, 7, 9, 1, 16, 3, 1, 1, 8, 12…
## $ best_rank <dbl> 2.639057, 2.639057, 2.833213, 2.564949, 1.386294…
## $ decade <dbl> 2010, 2000, 1990, 2000, 2010, 2010, 2000, 2010, …
## $ total_weeks <dbl> 0.0000000, 0.0000000, 0.0000000, 0.6931472, 0.00…
## $ tf_title_a <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_all <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_an <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_and <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_are <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_at <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_back <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_before <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_black <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_blood <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_blue <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_bones <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_book <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_by <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_cat <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_christmas <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_city <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_come <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_country <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ tf_title_dark <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_darkness <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_daughter <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_day <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_days <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_dead <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_death <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_deep <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_dream <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_end <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_family <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_fire <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_first <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_for <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_from <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_game <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_girl <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_girls <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_god <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_golden <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_good <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_heart <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_her <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_home <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_honor <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_house <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_i <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_in <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_is <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_island <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_king <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_last <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_life <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_light <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_little <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_long <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_lost <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_love <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_man <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_me <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_men <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_moon <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_mr <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_mrs <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_murder <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_my <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_new <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_night <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_no <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_not <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_of <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_on <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_one <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_other <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_prey <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_red <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_river <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_road <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_sea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_second <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_secret <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_shadow <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_son <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_star <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_storm <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_summer <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_the <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_this <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_time <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_to <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_tree <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_two <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_we <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_white <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_who <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_wife <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_winter <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_with <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_woman <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_world <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_you <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Danielle.Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ author_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, …
# Specify Model
xgboost_spec <- boost_tree(trees = tune(), min_n = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
#Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# Tune hyperparameters
set.seed(344)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5)
show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 8
## trees min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 387 24 rmse standard 0.562 10 0.0102 Preprocessor1_Model3
## 2 627 15 rmse standard 0.569 10 0.0110 Preprocessor1_Model2
## 3 1386 35 rmse standard 0.598 10 0.0101 Preprocessor1_Model5
## 4 1978 31 rmse standard 0.611 10 0.00941 Preprocessor1_Model4
## 5 914 2 rmse standard 0.631 10 0.00978 Preprocessor1_Model1
# Update the model by selecting the best hyperparmaters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse"))
# Fit model on the entire training data and test it on the test data.
data_fit <- last_fit(xgboost_fw, data_split)
collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.580 Preprocessor1_Model1
## 2 rsq standard 0.729 Preprocessor1_Model1
collect_predictions(data_fit) %>%
ggplot(aes(total_weeks, .pred)) +
geom_point(alpha = 0.3, fill = "midnightblue") +
coord_fixed()
svm_spec
svm_recipe <- recipe(formula = total_weeks ~., data = data_train) %>%
recipes::update_role(id, new_role = "id variable") %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tf(title) %>%
step_other(author) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_normalize(year) %>%
step_log(best_rank)
prep(svm_recipe) %>% bake(new_data = NULL)
## # A tibble: 5,569 × 108
## id year debut_rank best_rank decade total_weeks tf_title_a tf_title_all
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 100 0.852 3 2.64 2010 0 1 0
## 2 1000 0.622 11 2.64 2000 0 0 0
## 3 1005 0.354 12 2.83 1990 0 0 0
## 4 1007 0.699 5 2.56 2000 0.693 0 0
## 5 1011 0.852 2 1.39 2010 0 0 0
## 6 1012 0.852 14 2.77 2010 0 0 0
## 7 1014 0.699 7 1.95 2000 0.693 0 0
## 8 1021 0.776 9 2.56 2010 0 0 0
## 9 1022 1.08 1 2.64 2010 0 0 0
## 10 1028 1.12 16 2.20 2010 0 0 0
## # ℹ 5,559 more rows
## # ℹ 100 more variables: tf_title_an <int>, tf_title_and <int>,
## # tf_title_are <int>, tf_title_at <int>, tf_title_back <int>,
## # tf_title_before <int>, tf_title_black <int>, tf_title_blood <int>,
## # tf_title_blue <int>, tf_title_bones <int>, tf_title_book <int>,
## # tf_title_by <int>, tf_title_cat <int>, tf_title_christmas <int>,
## # tf_title_city <int>, tf_title_come <int>, tf_title_country <int>, …
rf_spec <-
rand_forest(trees = 500) %>%
set_mode("regression")
rf_spec
## Random Forest Model Specification (regression)
##
## Main Arguments:
## trees = 500
##
## Computational engine: ranger
svm_spec <-
svm_linear() %>%
set_mode("regression")
svm_spec
## Linear Support Vector Machine Model Specification (regression)
##
## Computational engine: LiblineaR
svm_wf <- workflow(svm_recipe, svm_spec)
rf_wf <- workflow(svm_recipe, rf_spec)
library(LiblineaR)
## Warning: package 'LiblineaR' was built under R version 4.3.2
library(ranger)
## Warning: package 'ranger' was built under R version 4.3.2
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)
svm_rs <- fit_resamples(
svm_wf,
resamples = data_cv,
control = contrl_preds
)
ranger_rs <- fit_resamples(
rf_wf,
resamples = data_cv,
control = contrl_preds
)
collect_metrics(svm_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1.11 10 0.00804 Preprocessor1_Model1
## 2 rsq standard 0.0970 10 0.00969 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 0.582 10 0.00798 Preprocessor1_Model1
## 2 rsq standard 0.735 10 0.00741 Preprocessor1_Model1
bind_rows(
collect_predictions(svm_rs) %>%
mutate(mod = "SVM"),
collect_predictions(ranger_rs) %>%
mutate(mod = "ranger")
) %>%
ggplot(aes(best_rank, .pred, color = id)) +
geom_abline(lty = 2, color = "gray50", size = 1.2) +
facet_wrap(vars(mod)) +
coord_fixed()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
final_fitted <- last_fit(svm_wf, data_split)
collect_metrics(final_fitted) ## metrics evaluated on the *testing* data
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1.12 Preprocessor1_Model1
## 2 rsq standard 0.0958 Preprocessor1_Model1
extract_workflow(final_fitted) %>%
tidy() %>%
filter(term != "Bias") %>%
group_by(estimate > 0) %>%
slice_max(abs(estimate), n = 10) %>%
ungroup() %>%
mutate(term = str_remove(term, "tf_title_")) %>%
ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
geom_col(alpha = 0.8) +
scale_fill_discrete(labels = c("low ratings", "high ratings")) +
labs(y = NULL, fill = "More from...")
# Make Predictions