data <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-10/nyt_titles.tsv')
data %>% skimr::skim()
Name | Piped data |
Number of rows | 7431 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- data %>%
# Convert character to factor
mutate(across(where(is.character), factor)) %>%
# Remove missing values
filter(!is.na(author))
*** Identify variables with correlation with the target variable.***
data %>% count(total_weeks)
## # A tibble: 90 × 2
## total_weeks n
## <dbl> <int>
## 1 1 1684
## 2 2 816
## 3 3 700
## 4 4 599
## 5 5 478
## 6 6 406
## 7 7 298
## 8 8 287
## 9 9 198
## 10 10 202
## # ℹ 80 more rows
data %>%
ggplot(aes(total_weeks, debut_rank)) +
geom_point(color= "steelblue") +
geom_smooth(method = "lm",
formula = y ~ poly(x, 2),
color = "indianred3")
data %>%
ggplot(aes(total_weeks, best_rank)) +
geom_point(color= "steelblue") +
geom_smooth(color = "tomato")
set.seed(123)
data_split <- data %>%
initial_split(strata = total_weeks)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(156)
data_fold <- vfold_cv(data_train, strata = total_weeks)
data_fold
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [5010/559]> Fold01
## 2 <split [5011/558]> Fold02
## 3 <split [5011/558]> Fold03
## 4 <split [5011/558]> Fold04
## 5 <split [5012/557]> Fold05
## 6 <split [5013/556]> Fold06
## 7 <split [5013/556]> Fold07
## 8 <split [5013/556]> Fold08
## 9 <split [5013/556]> Fold09
## 10 <split [5014/555]> Fold10
split_category <- function(x) {
x %>%
str_split(", ") %>%
map(str_remove_all, "[:punct:]") %>%
map(str_squish) %>%
map(str_to_lower) %>%
map(str_replace_all, " ", "_")
}
data_rec <-
recipe(total_weeks ~ ., data = data_train) %>%
step_tokenize(debut_rank, custom_token = split_category) %>%
step_tokenfilter(debut_rank, max_tokens = 30) %>%
step_tf(debut_rank)
xgb_spec <-
boost_tree(
trees = tune(),
mtry = tune(),
min_n = tune(),
learn_rate = 0.01
) %>%
set_engine("xgboost") %>%
set_mode("regression")
xgb_wf <- workflows::workflow(data_rec, xgb_spec)
xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## trees = tune()
## min_n = tune()
## learn_rate = 0.01
##
## Computational engine: xgboost