Import Data

# Load the NYT titles dataset

# Import Data
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
nyt_full <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_full.tsv')

Clean Data

skimr::skim(nyt_titles)
Data summary
Name nyt_titles
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂
data <- nyt_titles %>%
  select(-id) %>%
  filter(total_weeks > 0) %>%
  mutate(bestseller_category = if_else(total_weeks > 5, 'long_term', 'short_term')) %>%
  mutate(bestseller_category = as.factor(bestseller_category)) %>%
  select(-total_weeks)

Explore data

Year

data %>%
  ggplot(aes(bestseller_category, year)) +
  geom_boxplot()

Authur

data %>%
  group_by(author) %>%
  filter(n() > 40) %>%
  ungroup() %>%
  ggplot(aes(y = as.factor(author), fill = bestseller_category)) +
  geom_bar(position = 'fill') +
  labs(x = 'Proportion', y = 'Author')

title

data %>%
  unnest_tokens(output = word, input = title) %>%
  count(word, sort = TRUE) %>%
  filter(!str_detect(word, '\\d')) %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(n, fct_reorder(word, n))) +
  geom_col() +
  labs(y = 'words in title', x = 'count')

shortcut

data_binarized_tbl <- data %>%
  select(-title, -author, -first_week) %>% 
  binarize()

target_col_name <- names(data_binarized_tbl) %>%
  str_subset('bestseller_category') %>%
  tail(1)

data_corr_tbl <- data_binarized_tbl %>%
  correlate(target = !!sym(target_col_name))

data_corr_tbl %>%
  plot_correlation_funnel()

#split data

set.seed(1234)

data_split <- rsample::initial_split(data, prop = 0.75, strata = bestseller_category)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(2345)
data_cv <- rsample::vfold_cv(data_train, v = 5, strata = bestseller_category)

#create model

# Preprocess Data

rf_recipe <- 
  recipe(formula = bestseller_category ~ ., data = data_train) %>% 
  step_rm(first_week) %>% 
  step_string2factor(all_nominal_predictors()) %>% 
  step_impute_mode(all_nominal_predictors()) %>% 
  step_impute_median(all_numeric_predictors()) %>% 
  step_tokenize(title) %>% 
  step_tokenfilter(title, max_tokens = 100) %>% 
  step_tf(title) %>% 
  step_novel(all_nominal_predictors()) %>% 
  step_other(author, threshold = 0.02) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_numeric_predictors())

rf_spec <- 
  rand_forest(trees = tune(), min_n = tune()) %>% 
  set_mode('classification') %>% 
  set_engine('ranger')

rf_workflow <- 
  workflow() %>% 
  add_recipe(rf_recipe) %>% 
  add_model(rf_spec)

set.seed(344)
rf_tune <- 
  tune_grid(rf_workflow, 
            resamples = data_cv, 
            grid = 2)
best_auc <- select_best(rf_tune, metric = 'roc_auc')

final_rf <- finalize_workflow(
  rf_workflow,
  best_auc
)

final_fit <- last_fit(final_rf, data_split)
collect_metrics(final_fit)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config        
##   <chr>       <chr>          <dbl> <chr>          
## 1 accuracy    binary         0.861 pre0_mod0_post0
## 2 roc_auc     binary         0.928 pre0_mod0_post0
## 3 brier_class binary         0.104 pre0_mod0_post0
# Visualize 
collect_predictions(final_fit) %>%
  roc_curve(truth = bestseller_category, .pred_long_term) %>%
  autoplot() +
  labs(title = 'Random Forest Classification: ROC Curve')