Goal: Click here for the data
nyt_titles <- read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
skimr::skim(nyt_titles)
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
# Treat missing values
select(-author) %>%
na.omit() %>%
# log transform variables with pos-skewed distribution
mutate(best = log(total_weeks))
Identify good predictors.
best rank
data %>%
ggplot(aes(total_weeks, best_rank)) +
scale_y_log10() +
geom_point()
data %>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
data %>%
# tokenize title
unnest_tokens(output = word, input = title) %>%
# calculate avg best rank per week
group_by(word) %>%
summarise(weeks = mean(total_weeks),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(word, "\\d")) %>%
slice_max(order_by = weeks, n = 20) %>%
# Plot
ggplot(aes(weeks, fct_reorder(word, weeks))) +
geom_point() +
labs(y = "Words in Title")
EDA shortcut
# Step 1: Prepare data
data_binarized_tbl <- data %>%
select(-id, -title) %>%
# Extract date features from first_week
mutate(year = lubridate::year(first_week),
month = lubridate::month(first_week, label = TRUE),
weekday = lubridate::wday(first_week, label = TRUE)) %>%
select(-first_week) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,431
## Columns: 34
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,…
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,…
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,…
## $ `total_weeks__-Inf_2` <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,…
## $ total_weeks__2_4 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ total_weeks__4_10 <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,…
## $ total_weeks__10_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,…
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,…
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,…
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ best_rank__6_10 <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,…
## $ best_rank__10_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,…
## $ `best__-Inf_0.693147180559945` <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,…
## $ best__0.693147180559945_1.38629436111989 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ best__1.38629436111989_2.30258509299405 <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,…
## $ best__2.30258509299405_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ month__01 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__02 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ month__03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ month__04 <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,…
## $ month__05 <dbl> 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,…
## $ month__06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__07 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__08 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ month__09 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ month__12 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ weekday__Sun <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ weekday__Mon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Step 2: Correate
data_corr_tblNY <- data_binarized_tbl %>%
correlate(best__2.30258509299405_Inf)
data_corr_tblNY
## # A tibble: 34 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 total_weeks 10_Inf 1
## 2 best 2.30258509299405_Inf 1
## 3 total_weeks -Inf_2 -0.397
## 4 best -Inf_0.693147180559945 -0.397
## 5 best_rank -Inf_3 0.343
## 6 total_weeks 4_10 -0.323
## 7 best 1.38629436111989_2.30258509299405 -0.323
## 8 best_rank 10_Inf -0.313
## 9 total_weeks 2_4 -0.257
## 10 best 0.693147180559945_1.38629436111989 -0.257
## # ℹ 24 more rows
# Step 3: Plot
data_corr_tblNY %>%
plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Split data
data <- sample_n(data, 100)
# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split) %>%
select(-first_week)
data_test <- testing(data_split)
# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
usemodels:: use_xgboost(year ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = year ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(6804)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Create the recipe
xgboost_recipe <-
recipe(formula = debut_rank ~ ., data = data_train) %>%
recipes::update_role(title, new_role = "id variable") %>%
step_dummy(title) %>% # Convert title to dummy variables
step_YeoJohnson(debut_rank, best_rank, total_weeks)
# Prepare the recipe and inspect the data
prepared_data <- xgboost_recipe %>% prep() %>% juice()
glimpse(prepared_data)
## Rows: 75
## Columns: 80
## $ id <dbl> 6665, 6072, 181, 1665, 591,…
## $ year <dbl> 2020, 1934, 1998, 2007, 195…
## $ total_weeks <dbl> 1.2227678, 1.6341027, 2.327…
## $ best_rank <dbl> 3.0511898, 1.8986052, 3.369…
## $ best <dbl> 1.0986123, 1.7917595, 2.995…
## $ debut_rank <dbl> 2.8117917, 2.5109843, 1.794…
## $ title_A.CROWN.OF.SWORDS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_A.NIGHT.WITHOUT.ARMOR <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ title_A.STRANGER.IS.WATCHING <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_A.TOUCH.OF.DEAD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_AFTERMATH...EMPIRE.S.END <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BARBARY.SHORE <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ title_BEARTOWN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BRIGHT.FEATHER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BRIMSTONE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CASTLE.ON.THE.HILL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CELIA.AMBERLEY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CHILD.S.PLAY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DASHING.THROUGH.THE.SNOW <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DECEIVED <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DO.I.WAKE.OR.SLEEP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DREAMS.OF.JOY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_EVERLASTING <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ title_FRIDAY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_GUARDIANS.OF.THE.WEST <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HARVEST <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_ISLE.OF.PALMS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_JUBILEE.TRAIL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_KAFKA.ON.THE.SHORE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_KISS.ME..DEADLY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_KITTY.FOYLE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_MIDNIGHT.RUNNER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_MISTRESS.OF.MELLYN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NEVER.GO.BACK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NEW.MOON.RISING <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NEXT.TO.VALOUR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_ONE.PERFECT.LIE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_PARADISE.LOST <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_REDEMPTION <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_REMEMBRANCE.ROCK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RETURN.TO.SULLIVANS.ISLAND <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SECOND.CHANCE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SPY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_STRANGERS.IN.DEATH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_STRONG.MEDICINE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SUMMER.OF..69 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SWEET.TOMORROWS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ title_TEMPTATION.AND.SURRENDER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_TEMPTING.FATE <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ title_THE.BELOVED.RETURNS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.BREAKDOWN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.BRIEF.WONDROUS.LIFE.OF.OSCAR.WAO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.CASE.OF.LUCY.BENDING <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.CHAMBER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.EXILES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.FIRST.DEADLY.SIN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.HAJ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.INNOCENT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.KLONE.AND.I <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.LEGACY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.MERMAID.CHAIR <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ title_THE.NIGHT.GARDENER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.PROVINCIAL.LADY.IN.AMERICA <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.ROBBER.BRIDE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.SEARCHER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.SIGN.OF.JONAH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.STEEL.KISS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.STIEHL.ASSASSIN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.THIRTEENTH.APOSTLE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.THOUSAND.ORCS <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ title_THE.UNDERGROUND.MAN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WEDDING.DRESS <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WHITE.RAJAH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WHITE.WITCH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WRONG.SIDE.OF.GOODBYE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THINGS.YOU.SAVE.IN.A.FIRE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THIS.WAS.A.MAN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_TWO.LITTLE.GIRLS.IN.BLUE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_UNSEEN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_WHISTLE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Set up the XGBoost model specification
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# Tune hyperparameters
set.seed(344)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5)
## Warning: package 'xgboost' was built under R version 4.3.3
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
## mtry trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 5 1613 36 0.0290 rmse standard 1.25 10 0.0485 Preproces…
## 2 25 1104 28 0.00484 rmse standard 1.27 10 0.0326 Preproces…
## 3 63 1524 23 0.0836 rmse standard 1.34 10 0.0606 Preproces…
## 4 47 768 12 0.112 rmse standard 1.61 10 0.0830 Preproces…
## 5 71 162 7 0.00108 rmse standard 2.37 10 0.0806 Preproces…
# Update the model by selecting the best
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the model on the entire training data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune:: collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.918 Preprocessor1_Model1
## 2 rsq standard 0.0179 Preprocessor1_Model1
tune:: collect_predictions(data_fit) %>%
ggplot(aes(debut_rank, .pred)) +
geom_point(alpha = 1, fill = "pink") +
geom_abline(lty = 2, color = "purple") +
coord_fixed()