# Load the NYT titles dataset
# Import Data
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
nyt_full <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_full.tsv')
skimr::skim(nyt_titles)
| Name | nyt_titles |
| Number of rows | 7431 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
| author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
| year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
| total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
| debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
| best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
select(-id) %>%
filter(total_weeks > 0) %>%
mutate(total_weeks = log(total_weeks))
Year
data %>%
ggplot(aes(total_weeks, year)) +
scale_y_log10() +
geom_point()
data %>%
group_by(author) %>%
filter(n() > 40) %>%
ungroup() %>%
ggplot(aes(total_weeks, as.factor(author))) +
geom_boxplot() +
coord_flip()
data %>%
# tokenize title
unnest_tokens(output = word, input = title) %>%
# calculate avg weeks per word
group_by(word) %>%
summarise(total_weeks = mean(total_weeks),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(word, "\\d")) %>%
slice_max(order_by = total_weeks, n = 20) %>%
ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
geom_point() +
labs(y = "words in title")
data_binarized_tbl <- data %>%
select(-title, -author, -first_week) %>%
binarize()
target_col_name <- names(data_binarized_tbl) %>%
str_subset("total_weeks") %>%
tail(1)
data_corr_tbl <- data_binarized_tbl %>%
correlate(target = !!sym(target_col_name))
data_corr_tbl %>%
plot_correlation_funnel()
#split data
set.seed(1234)
data_split <- rsample::initial_split(data, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train, v = 5)
#create model
# 3. Experiment with different preprocessing techniques
rf_recipe <-
recipe(formula = total_weeks ~ ., data = data_train) %>%
# FIX 1: Drop first_week immediately
step_rm(first_week) %>%
# FIX 3: Convert character strings to factors immediately so novel/dummy steps don't crash
step_string2factor(all_nominal_predictors()) %>%
# Impute missing values since we kept them earlier
step_impute_mean(all_numeric_predictors()) %>%
step_impute_mode(all_nominal_predictors()) %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tf(title) %>%
# FIX 2: Catch any new authors in data_test before dummying
step_novel(all_nominal_predictors()) %>%
step_other(author, threshold = 0.02) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE)
# 4. Explore various machine learning algorithms
rf_spec <-
rand_forest(trees = tune(), min_n = tune()) %>%
set_mode("regression") %>%
set_engine("ranger")
rf_workflow <-
workflow() %>%
add_recipe(rf_recipe) %>%
add_model(rf_spec)
set.seed(344)
rf_tune <-
tune_grid(rf_workflow,
resamples = data_cv,
grid = 2) # Lowered to 2 just to make it run faster while we test!
best_rmse <- select_best(rf_tune, metric = 'rmse')
final_rf <- finalize_workflow(
rf_workflow,
best_rmse
)
final_fit <- last_fit(final_rf, data_split)
collect_metrics(final_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.590 pre0_mod0_post0
## 2 rsq standard 0.721 pre0_mod0_post0
# Visualize
collect_predictions(final_fit) %>%
ggplot(aes(total_weeks, .pred)) +
geom_point(alpha = 0.5, color = 'midnightblue') +
geom_abline(lty = 2, color = 'gray50') +
coord_obs_pred() +
labs(x = 'Actual Total Weeks (Log)',
y = 'Predicted Total Weeks (Log)',
title = 'Random Forest Model: Actual vs Predicted')