# Load the NYT titles dataset
# Import Data
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
nyt_full <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_full.tsv')
skimr::skim(nyt_titles)
| Name | nyt_titles |
| Number of rows | 7431 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| title | 0 | 1 | 1 | 74 | 0 | 7172 | 0 |
| author | 4 | 1 | 4 | 73 | 0 | 2205 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| first_week | 0 | 1 | 1931-10-12 | 2020-12-06 | 2000-06-25 | 3348 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 3715.00 | 2145.29 | 0 | 1857.5 | 3715 | 5572.5 | 7430 | ▇▇▇▇▇ |
| year | 0 | 1 | 1989.61 | 26.23 | 1931 | 1968.0 | 2000 | 2011.0 | 2020 | ▂▂▂▃▇ |
| total_weeks | 0 | 1 | 8.13 | 11.21 | 1 | 2.0 | 4 | 10.0 | 178 | ▇▁▁▁▁ |
| debut_rank | 0 | 1 | 7.90 | 4.57 | 1 | 4.0 | 8 | 12.0 | 17 | ▇▆▅▅▅ |
| best_rank | 0 | 1 | 6.91 | 4.57 | 1 | 3.0 | 6 | 10.0 | 17 | ▇▅▃▃▂ |
data <- nyt_titles %>%
select(-id) %>%
filter(total_weeks > 0) %>%
mutate(bestseller_category = if_else(total_weeks > 5, 'long_term', 'short_term')) %>%
mutate(bestseller_category = as.factor(bestseller_category)) %>%
select(-total_weeks)
Year
data %>%
ggplot(aes(bestseller_category, year)) +
geom_boxplot()
data %>%
group_by(author) %>%
filter(n() > 40) %>%
ungroup() %>%
ggplot(aes(y = as.factor(author), fill = bestseller_category)) +
geom_bar(position = 'fill') +
labs(x = 'Proportion', y = 'Author')
data %>%
unnest_tokens(output = word, input = title) %>%
count(word, sort = TRUE) %>%
filter(!str_detect(word, '\\d')) %>%
slice_max(n, n = 20) %>%
ggplot(aes(n, fct_reorder(word, n))) +
geom_col() +
labs(y = 'words in title', x = 'count')
data_binarized_tbl <- data %>%
select(-title, -author, -first_week) %>%
binarize()
target_col_name <- names(data_binarized_tbl) %>%
str_subset('bestseller_category') %>%
tail(1)
data_corr_tbl <- data_binarized_tbl %>%
correlate(target = !!sym(target_col_name))
data_corr_tbl %>%
plot_correlation_funnel()
#split data
set.seed(1234)
data_split <- rsample::initial_split(data, prop = 0.75, strata = bestseller_category)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train, v = 5, strata = bestseller_category)
#create model
# Preprocess Data
rf_recipe <-
recipe(formula = bestseller_category ~ ., data = data_train) %>%
step_rm(first_week) %>%
step_string2factor(all_nominal_predictors()) %>%
step_impute_mode(all_nominal_predictors()) %>%
step_impute_median(all_numeric_predictors()) %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tf(title) %>%
step_novel(all_nominal_predictors()) %>%
step_other(author, threshold = 0.02) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors())
rf_spec <-
rand_forest(trees = tune(), min_n = tune()) %>%
set_mode('classification') %>%
set_engine('ranger')
rf_workflow <-
workflow() %>%
add_recipe(rf_recipe) %>%
add_model(rf_spec)
set.seed(344)
rf_tune <-
tune_grid(rf_workflow,
resamples = data_cv,
grid = 2)
best_auc <- select_best(rf_tune, metric = 'roc_auc')
final_rf <- finalize_workflow(
rf_workflow,
best_auc
)
final_fit <- last_fit(final_rf, data_split)
collect_metrics(final_fit)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.861 pre0_mod0_post0
## 2 roc_auc binary 0.928 pre0_mod0_post0
## 3 brier_class binary 0.104 pre0_mod0_post0
# Visualize
collect_predictions(final_fit) %>%
roc_curve(truth = bestseller_category, .pred_long_term) %>%
autoplot() +
labs(title = 'Random Forest Classification: ROC Curve')