Goal: explore the TidyTuesday (2022-05-10) dataset of New York Times bestsellers and model total_weeks, the number of weeks a title stays on the list. The data can be read straight from the TidyTuesday repository:
library(tidyverse)

nyt_titles <- read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
skimr::skim(nyt_titles)
Data summary | |
---|---|
Name | nyt_titles |
Number of rows | 7431 |
Number of columns | 8 |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
  # Treat missing values (left commented out here; author has only 4 NAs)
  # select(-author) %>%
  # na.omit() %>%
  # Log-transform total_weeks, which has a strongly right-skewed distribution
  mutate(best = log(total_weeks))
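A quick sanity check on that skew, using only columns already in the data:

# Mean (~8) sits well above the median (4), with a tail out to 178 weeks,
# which is what motivates the log transform above
data %>%
  summarise(mean_weeks = mean(total_weeks),
            median_weeks = median(total_weeks),
            max_weeks = max(total_weeks))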
Identify good predictors
Best rank
data %>%
ggplot(aes(total_weeks, best_rank)) +
scale_y_log10() +
geom_point()
data %>%
ggplot(aes(total_weeks, as.factor(debut_rank))) +
geom_boxplot()
library(tidytext)

data %>%
  # Tokenize author names into single words
  unnest_tokens(output = word, input = author) %>%
  # Average weeks on the list per author token
  group_by(word) %>%
  summarise(weeks = mean(total_weeks),
            n = n()) %>%
  ungroup() %>%
  filter(n > 10, !str_detect(word, "\\d")) %>%
  slice_max(order_by = weeks, n = 50) %>%
  # Plot
  ggplot(aes(weeks, fct_reorder(word, weeks))) +
  geom_point() +
  labs(y = "Author")
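The same tokenise-and-average idea can be applied to words in the title. This is only a sketch (not run above); it assumes tidytext's stop_words lexicon and an arbitrary n > 25 cutoff:

data %>%
  unnest_tokens(output = word, input = title) %>%
  anti_join(stop_words, by = "word") %>%   # drop common English stop words
  group_by(word) %>%
  summarise(weeks = mean(total_weeks),
            n = n()) %>%
  ungroup() %>%
  filter(n > 25) %>%
  slice_max(order_by = weeks, n = 30) %>%
  ggplot(aes(weeks, fct_reorder(word, weeks))) +
  geom_point() +
  labs(y = "Title word")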
EDA shortcut
data2 <- nyt_titles %>%
  # Treat missing values: drop author and remove incomplete rows
  select(-author) %>%
  na.omit()

library(correlationfunnel)

# Step 1: Prepare data
data_binarized_tbl <- data2 %>%
  select(-id, -title) %>%
  # Extract date features from first_week
  mutate(year = lubridate::year(first_week),
         month = lubridate::month(first_week, label = TRUE),
         weekday = lubridate::wday(first_week, label = TRUE)) %>%
  select(-first_week) %>%
  binarize()
data_binarized_tbl %>% glimpse()
## Rows: 7,431
## Columns: 30
## $ `year__-Inf_1968` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ year__1968_2000 <dbl> 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, …
## $ year__2000_2011 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ year__2011_Inf <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `total_weeks__-Inf_2` <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ total_weeks__2_4 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ total_weeks__4_10 <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, …
## $ total_weeks__10_Inf <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `debut_rank__-Inf_4` <dbl> 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ debut_rank__4_8 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, …
## $ debut_rank__8_12 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, …
## $ debut_rank__12_Inf <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ `best_rank__-Inf_3` <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ best_rank__3_6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ best_rank__6_10 <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ best_rank__10_Inf <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, …
## $ month__01 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__02 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ month__03 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ month__04 <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ month__05 <dbl> 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ month__06 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__07 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ month__08 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ month__09 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ month__11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ month__12 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ weekday__Sun <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ weekday__Mon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate every binary feature against the bin of interest (best_rank between 6 and 10)
data_corr_tblNY <- data_binarized_tbl %>%
correlate(best_rank__6_10)
data_corr_tblNY
## # A tibble: 30 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 best_rank 6_10 1
## 2 best_rank -Inf_3 -0.362
## 3 best_rank 10_Inf -0.317
## 4 best_rank 3_6 -0.288
## 5 total_weeks 10_Inf -0.130
## 6 weekday Sun 0.0744
## 7 weekday Mon -0.0744
## 8 total_weeks -Inf_2 0.0699
## 9 total_weeks 4_10 0.0464
## 10 debut_rank -Inf_4 -0.0290
## # ℹ 20 more rows
# Step 3: Plot
data_corr_tblNY %>%
plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
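One way to declutter the funnel (a sketch; the 0.05 cutoff is arbitrary) is to drop near-zero correlations before plotting:

data_corr_tblNY %>%
  filter(abs(correlation) > 0.05) %>%
  plot_correlation_funnel(interactive = FALSE)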
Split data
# Work with a random sample of 100 titles to keep the modelling below quick
data <- sample_n(data, 100)
# Split into train and test datasets
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split) %>%
  select(-first_week)   # drop the raw date column
data_test <- testing(data_split)
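The split above uses rsample's defaults. If you rerun this, stratifying on the outcome keeps the small training and test sets more comparable; a hypothetical alternative (not what was run here):

# Stratified split on the (binned) outcome
set.seed(1234)
data_split_strat <- rsample::initial_split(data, prop = 0.75, strata = total_weeks)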
# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
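Each row of data_cv holds one resample; rsample's analysis() and assessment() pull out its two pieces, e.g. for Fold01:

# 67 analysis rows and 8 assessment rows, matching the printout above
dim(analysis(data_cv$splits[[1]]))
dim(assessment(data_cv$splits[[1]]))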
# Generate scaffolding with {usemodels}; the formula here is only a placeholder,
# and the template is adapted below into a regression on total_weeks
library(usemodels)
usemodels::use_xgboost(author ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = author ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(6804)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Create the recipe
xgboost_recipe <-
  recipe(formula = total_weeks ~ ., data = data_train) %>%
  recipes::update_role(total_weeks, new_role = "outcome") %>%
  step_dummy(title, author) %>%                    # convert title and author to dummy variables
  step_YeoJohnson(year, best_rank, total_weeks)    # tame skew in these numeric columns
# Prepare the recipe and inspect the data
prepared_data <- xgboost_recipe %>% prep() %>% juice()
glimpse(prepared_data)
## Rows: 75
## Columns: 152
## $ id <dbl> 1349, 6414, 3817, 7322, …
## $ year <dbl> 2012, 1934, 1999, 1974, …
## $ debut_rank <dbl> 4, 6, 14, 9, 14, 3, 1, 6…
## $ best_rank <dbl> 0.8127585, 3.1043761, 1.…
## $ best <dbl> 1.7917595, 1.0986123, 1.…
## $ total_weeks <dbl> 1.5260825, 1.1633586, 1.…
## $ title_A.DENSITY.OF.SOULS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_A.FABLE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_A.TREE.GROWS.IN.BROOKLYN <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ title_ALL.THAT.REMAINS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_AND.RIDE.A.TIGER <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ title_ANSWERED.PRAYERS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BLACK.LEOPARD..RED.WOLF <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BLOOD.WORK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_BONES.OF.THE.LOST <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CASHELMARA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CEREMONY.OF.THE.INNOCENT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CERTAIN.GIRLS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_CLAUDIUS.THE.GOD <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_COME.WITH.ME.TO.MACEDONIA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_COPPER.BEACH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEADLOCKED <dbl> 1, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEADLY.DECISIONS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEATH.IN.THE.AIR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEATH.OF.KINGS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DEVICES.AND.DESIRES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_DOTING <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_GALAPAGOS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_GARDEN.OF.BEASTS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_GREEN.DOLPHIN.STREET <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HARM.S.WAY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HAUNTED <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HER.OWN.RULES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_HOLLYWOOD.HUSBANDS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_I.DON.T.KNOW.HOW.SHE.DOES.IT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_JUROR..3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_LIGHT.IN.SHADOW <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_LOVES.MUSIC..LOVES.TO.DANCE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_MINI.SHOPAHOLIC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NERILKA.S.STORY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_NIGHT.OF.THE.HAWK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_O.SHEPHERD..SPEAK..By...Viking.. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_PRIVATE.ENTERPRISE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_PURELY.ACADEMIC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RACHEL.CADE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RAGTIME <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RESTLESS.ARE.THE.SAILS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_RIVER.S.END <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ title_RIVERS.OF.GLORY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_ROBERT.B..PARKER.S.LULLABY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_ROLE.OF.HONOR <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SEDUCING.AN.ANGEL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_SLOW.WALTZ.IN.CEDAR.BEND <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ title_SOMEDAY..SOMEDAY..MAYBE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.CHASE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.CLONE.WARS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.DEPARTMENT.OF.SENSITIVE.CRIMES <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.FOUNTAIN.OVERFLOWS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.GENERAL.S.DAUGHTER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.HIGH.KING.OF.MONTIVAL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.HOURGLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.KILL.SWITCH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.LACUNA <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.LONG.LOVE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.MAPPING.OF.LOVE.AND.DEATH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.MIDNIGHT.HOUSE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.PAINTED.QUEEN <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.PRINCE.OF.BEVERLY.HILLS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.SCAPEGOAT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.SCARPETTA.FACTOR <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ title_THE.SEARCH <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.STRANGE.BOARDERS.AT.PALACE.CRESCENT <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ title_THE.THREE.SIRENS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THE.WITCHES.OF.EASTWICK <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_THINGS.YOU.SAVE.IN.A.FIRE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_TOUGH.GUYS.DON.T.DANCE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_WAYFARING.STRANGER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_WINTER.KILLS <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ title_WITHIN.THIS.PRESENT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ title_WIZARD.AND.GLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Agatha.Christie <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Albert.Brooks <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Alex.Berenson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Alexander.McCall.Smith <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Allison.Pearson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Angela.Mackail.Thirkell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Anne.McCaffrey <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Barbara.Kingsolver <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Barbara.Taylor.Bradford <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Bernard.Cornwell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Betty.Smith <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ author_Charlaine.Harris <dbl> 1, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Charles.E..Mercer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Christopher.Rice <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Dale.Brown <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Danielle.Steel <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Daphne.du.Maurier <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_E..L..Doctorow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_E..Phillips.Oppenheim <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ author_Edwin.Gilbert <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Elizabeth.Goudge <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Elizabeth.Peters.and.Joan.Hess <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Evelyn.Eaton <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_F..Van.Wyck.Mason <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Henry.Green <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Iris.Johansen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Irving.Wallace <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jackie.Collins <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jacqueline.Winspear <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.E..Bassett <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.Lee.Burke <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.Patterson.and.James.O..Born <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.Patterson.and.Nancy.Allen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_James.Rollins.and.Grant.Blackwood <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Janet.Evanovich.and.Lee.Goldberg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jayne.Ann.Krentz <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jeffery.Deaver <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Jennifer.Weiner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_John.Gardner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_John.Sedges <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_John.Updike <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Karen.Traviss <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Katherine.Center <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Kathy.Reichs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Kurt.Vonnegut <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Lauren.Graham <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Leonard.Drohan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Margaret.Ayer.Barnes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Marlon.James <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Mary.Balogh <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Mary.Higgins.Clark <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Michael.Connelly <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Nelson.DeMille <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Nora.Roberts <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ author_Norman.Mailer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_P..D..James <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Patricia.Cornwell <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ author_Patricia.D..Cornwell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Rebecca.West <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Richard.Condon <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ author_Robert.Graves <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Robert.James.Waller <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ author_Robert.Wilder <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ author_S..M..Stirling <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Sophie.Kinsella <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Stephen.King <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Stringfellow.Barr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Stuart.Woods <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Susan.Howatch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Taylor.Caldwell <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_Upton.Sinclair <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ author_William.Faulkner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
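step_dummy() on title and author is what blows the prepared data up to ~150 columns for only 75 training rows. A more compact alternative (a sketch; the 2% threshold is an assumption) is to pool rare levels before dummying:

xgboost_recipe_other <-
  recipe(total_weeks ~ ., data = data_train) %>%
  step_other(title, author, threshold = 0.02) %>%   # collapse infrequent titles/authors into "other"
  step_dummy(all_nominal_predictors()) %>%
  step_YeoJohnson(year, best_rank)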
# Set up the XGBoost model specification
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# Tune hyperparameters
set.seed(344)
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_cv,
grid = 5)
## Warning: package 'xgboost' was built under R version 4.3.3
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 9
## trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 114 2 0.0584 rmse standard 0.0131 10 0.00587 Preprocessor1_…
## 2 986 13 0.00657 rmse standard 0.0608 10 0.0143 Preprocessor1_…
## 3 779 23 0.00224 rmse standard 0.258 10 0.0282 Preprocessor1_…
## 4 1986 32 0.0137 rmse standard 0.276 10 0.0349 Preprocessor1_…
## 5 1427 38 0.153 rmse standard 0.553 10 0.0234 Preprocessor1_…
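tune also ships an autoplot() method, which is a quick way to see how RMSE moves across the sampled values of trees, min_n, and learn_rate:

# Visualise performance across the candidate hyperparameters
autoplot(xgboost_tune)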
# Update the model by selecting the best
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the model on the entire training data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.0103 Preprocessor1_Model1
## 2 rsq standard 1.00 Preprocessor1_Model1
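The near-perfect R² says more about leakage than about the model: the predictors still include best, which was created above as log(total_weeks), so the outcome is effectively handed to xgboost. A sketch of a recipe without the leak (best_rank is also debatable, since it is only known after a book's run):

xgboost_recipe_noleak <-
  recipe(total_weeks ~ ., data = data_train) %>%
  step_rm(best, best_rank) %>%   # drop the column derived from the outcome (and its close cousin)
  step_dummy(title, author) %>%
  step_YeoJohnson(year)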
tune::collect_predictions(data_fit) %>%
  ggplot(aes(total_weeks, .pred)) +
  geom_point(alpha = 1, color = "pink") +   # color, not fill, shows for the default point shape
  geom_abline(lty = 2, color = "purple") +
  coord_fixed()
data %>%
ggplot(aes(total_weeks)) +
geom_histogram(bins = 20)
library(tidytext)
tidy_data <-
data %>%
unnest_tokens(name, author)
tidy_data %>%
count(name, sort = TRUE)
## # A tibble: 192 × 2
## name n
## <chr> <int>
## 1 james 9
## 2 and 6
## 3 john 5
## 4 e 4
## 5 elizabeth 4
## 6 cornwell 3
## 7 robert 3
## 8 allen 2
## 9 ann 2
## 10 anne 2
## # ℹ 182 more rows
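Tokens such as "and" or single initials are connectives rather than author names; an admittedly ad-hoc filter before counting:

tidy_data %>%
  filter(!name %in% c("and"), nchar(name) > 1) %>%
  count(name, sort = TRUE)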
tidy_data %>%
  group_by(name) %>%
  summarise(n = n(),
            weeks = mean(total_weeks)) %>%
  ggplot(aes(n, weeks)) +
  geom_hline(yintercept = mean(data$total_weeks),
             lty = 2, color = "gray50", linewidth = 2) +
  geom_point(color = "midnightblue", alpha = 1) +
  # Label points with the author token rather than the numeric mean
  geom_text(aes(label = name), check_overlap = TRUE, vjust = "top", hjust = "left") +
  scale_x_log10()
library(recipes)
library(textrecipes)   # provides step_tokenize() and friends

data_recipe <-
  recipe(formula = total_weeks ~ ., data = data_train) %>%
  recipes::update_role(id, new_role = "id variable") %>%
  step_tokenize(author, title) %>%
  step_normalize(year, total_weeks, debut_rank, best_rank)
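As written, this recipe stops at step_tokenize(), which leaves token list-columns that neither ranger nor the linear SVM can take as inputs; that is the most likely reason every resampled model below fails. A sketch of a recipe they could fit (the 20-token cap is an arbitrary choice for this small sample):

data_recipe_tf <-
  recipe(formula = total_weeks ~ ., data = data_train) %>%
  recipes::update_role(id, new_role = "id variable") %>%
  step_tokenize(author, title) %>%
  step_tokenfilter(author, title, max_tokens = 20) %>%  # keep only the most frequent tokens
  step_tf(author, title) %>%                            # turn tokens into term-frequency columns
  step_normalize(year, debut_rank, best_rank)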
ranger_spec <-
rand_forest(trees = 500) %>%
set_mode("regression")
ranger_spec
## Random Forest Model Specification (regression)
##
## Main Arguments:
## trees = 500
##
## Computational engine: ranger
svm_spec <-
svm_linear() %>%
set_mode("regression")
svm_spec
## Linear Support Vector Machine Model Specification (regression)
##
## Computational engine: LiblineaR
ranger_wf <- workflow(data_recipe, ranger_spec)
svm_wf <- workflow(data_recipe, svm_spec)
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)
svm_rs <- fit_resamples(
svm_wf,
resamples = data_cv,
control = contrl_preds
)
## Warning: package 'LiblineaR' was built under R version 4.3.3
## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.
ranger_rs <- fit_resamples(
ranger_wf,
resamples = data_cv,
control = contrl_preds
)
## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.
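The warnings point at show_notes(); running it prints the underlying error (most likely the untransformed token columns noted above):

show_notes(.Last.tune.result)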
# The resamples above failed, so the comparison below is left commented out
# collect_metrics(svm_rs)
# collect_metrics(ranger_rs)
# bind_rows(
#   collect_predictions(svm_rs) %>%
#     mutate(mod = "SVM"),
#   collect_predictions(ranger_rs) %>%
#     mutate(mod = "ranger")
# ) %>%
#   ggplot(aes(total_weeks, .pred, color = id)) +
#   geom_abline(lty = 2, color = "gray50", linewidth = 1.2) +
#   facet_wrap(vars(mod)) +
#   coord_fixed()
final_fitted <- last_fit(svm_wf, data_split)
## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.
collect_metrics(final_fitted)
## NULL
# final_datawf <- extract_workflow(final_fitted)
# predict(final_datawf, data_test[55, ])
# extract_workflow(final_fitted) %>%
#   tidy() %>%
#   filter(term != "Bias") %>%
#   group_by(estimate > 0) %>%
#   slice_max(abs(estimate), n = 10) %>%
#   ungroup() %>%
#   mutate(term = str_remove(term, "tf_author")) %>%
#   ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
#   geom_col(alpha = 0.8)