Goal: Predict Total Weeks on Best Sellers List Click here for the data
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
skimr::skim(nyt_titles)
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
# Address missing values
data <- nyt_titles %>%
na.omit() %>%
# log transform variables
mutate(total_weeks = log(total_weeks)) %>%
filter(total_weeks < 100) %>%
select(-first_week, -best_rank) %>%
# convert character/date variables to factors
mutate(across(where(is.character), factor)) %>%
separate_rows(author, sep = " and | with ")
# address outliers
data %>%
ggplot(aes(y = total_weeks))
#Explore Data
library(readr)
library(correlationfunnel)
data_binarized_tbl <- data %>%
select(-title, -id) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,829
## Columns: 15
## $ author__Danielle_Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ author__James_Patterson <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `author__-OTHER` <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `year__-Inf_1971` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ year__1971_2001 <dbl> 1, 1, 1, 0, 0, 0, 1, 1…
## $ year__2001_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0…
## $ `total_weeks__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ total_weeks__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0…
## $ total_weeks__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1…
## $ total_weeks__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0…
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0…
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1…
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0…
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0…
data_corr_tbl <- data_binarized_tbl %>%
correlate(total_weeks__2.30258509299405_Inf)
data_corr_tbl
## # A tibble: 15 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 total_weeks 2.30258509299405_Inf 1
## 2 total_weeks -Inf_0.693147180559945 -0.389
## 3 total_weeks 1.38629436111989_2.30258509299405 -0.319
## 4 total_weeks 0.693147180559945_1.38629436111989 -0.257
## 5 year -Inf_1971 0.245
## 6 year 2011_Inf -0.241
## 7 year 2001_2011 -0.219
## 8 year 1971_2001 0.208
## 9 debut_rank 4_8 -0.0319
## 10 debut_rank 8_12 0.0179
## 11 debut_rank -Inf_4 0.0149
## 12 author James_Patterson -0.0128
## 13 author Danielle_Steel 0.00566
## 14 author -OTHER 0.00550
## 15 debut_rank 12_Inf -0.000420
data_corr_tbl %>%
plot_correlation_funnel()
Split data
data <- sample_n(data, 100)
# split into test and train data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# further split training data set for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
library(usemodels)
usemodels::use_xgboost(title ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = title ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(6804)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
data_train_clean <- data_train %>%
drop_na(total_weeks, debut_rank)
#specify recipe
xgboost_recipe <-
recipe(formula = year ~ ., data = data_train) %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tfidf(title) %>%
step_other(author, threshold = 0.05) %>%
step_dummy(author, one_hot = TRUE) %>%
step_YeoJohnson(debut_rank)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 75
## Columns: 106
## $ id <dbl> 3209, 3417, 1435, 2710, 5033, 4990, 6541, 514…
## $ total_weeks <dbl> 1.098612, 1.098612, 1.386294, 1.098612, 1.791…
## $ debut_rank <dbl> 3.4792065, 2.0772896, 4.2632382, 3.4792065, 4…
## $ year <dbl> 2014, 2010, 2008, 1934, 1999, 1950, 1959, 201…
## $ tfidf_title_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_a <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_affair <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_alfred <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_amelia <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_and <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_anywhere <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_arctic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_at <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bad <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_beverly <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_black <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_blue <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bones <dbl> 0.000000, 0.000000, 1.825329, 0.000000, 0.000…
## $ tfidf_title_breakdown <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_broken <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_brush <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_business <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_but <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_butterfield <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_byzantium <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_captain <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_cause <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_celestine <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_change <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_claudius <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_claws <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_complaint <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_country <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_deaky <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_death <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_devil <dbl> 0.000000, 0.000000, 2.165367, 0.000000, 0.000…
## $ `tfidf_title_devil's` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_disenchanted <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_does <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tfidf_title_don't` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dress <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_drift <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dry <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dusk <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_earhart <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_east <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_edge <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 2.165…
## $ tfidf_title_effect <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_egg <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_end <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_error <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_evening <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_evil <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_far <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_fatal <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_few <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_firefly <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_first <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_force <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_four <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_freaky <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_gate <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_go <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_golden <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_grand <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_grove <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_here <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_hills <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_house <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_how <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_i <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_in <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_infinite <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_inn <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_ireland <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_is <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_island <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_it <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_jitterbug <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_kill <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_kind <dbl> 0.000000, 2.165367, 0.000000, 0.000000, 0.000…
## $ tfidf_title_kisser <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_know <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_land <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_lane <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_late <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_lightship <dbl> 0.000000, 0.000000, 0.000000, 4.330733, 0.000…
## $ tfidf_title_live <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_luster <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_man <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_martyr <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_midnight <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_mistress <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_moon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_moonspinners <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_my <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_never <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_night <dbl> 2.165367, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_nights <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_noon <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
## $ tfidf_title_of <dbl> 0.0000000, 0.9905007, 0.0000000, 0.0000000, 0…
## $ tfidf_title_off <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_orange <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_the <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0…
## $ author_Stuart.Woods <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
# specify model
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
#combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# tune hyperparameters
set.seed(344)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5)
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
## mtry trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 7 1613 36 0.0290 rmse standard 26.6 10 1.48 Preproce…
## 2 34 1104 28 0.00484 rmse standard 26.7 10 1.23 Preproce…
## 3 83 1524 23 0.0836 rmse standard 27.1 10 2.39 Preproce…
## 4 63 768 12 0.112 rmse standard 32.1 10 2.55 Preproce…
## 5 94 162 7 0.00108 rmse standard 1674. 10 2.14 Preproce…
xgboost_fw <- finalize_workflow(
xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse")
)
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 22.5 Preprocessor1_Model1
## 2 rsq standard 0.491 Preprocessor1_Model1
tune::collect_predictions(data_fit) %>%
ggplot(aes(year, .pred)) +
geom_point(alpha = 0.3, fill = "midnightblue") +
geom_abline(lty = 2, color = "grey50") +
coord_fixed()