Import Data

# Load the NYT titles dataset

# Import Data
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
nyt_full <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_full.tsv')

Clean Data

skimr::skim(nyt_titles)
Data summary
Name nyt_titles
Number of rows 7431
Number of columns 8
_______________________
Column type frequency:
character 2
Date 1
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 74 0 7172 0
author 4 1 4 73 0 2205 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
first_week 0 1 1931-10-12 2020-12-06 2000-06-25 3348

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 3715.00 2145.29 0 1857.5 3715 5572.5 7430 ▇▇▇▇▇
year 0 1 1989.61 26.23 1931 1968.0 2000 2011.0 2020 ▂▂▂▃▇
total_weeks 0 1 8.13 11.21 1 2.0 4 10.0 178 ▇▁▁▁▁
debut_rank 0 1 7.90 4.57 1 4.0 8 12.0 17 ▇▆▅▅▅
best_rank 0 1 6.91 4.57 1 3.0 6 10.0 17 ▇▅▃▃▂
data <- nyt_titles %>%
  select(-id) %>%
  filter(total_weeks > 0) %>%
  mutate(total_weeks = log(total_weeks))

Explore data

Year

data %>%
    ggplot(aes(total_weeks, year)) +
    scale_y_log10() +
    geom_point()

Authur

data %>%
    group_by(author) %>%
    filter(n() > 40) %>%
    ungroup() %>%
    ggplot(aes(total_weeks, as.factor(author))) +
    geom_boxplot() +
    coord_flip()

title

data %>%
  # tokenize title
  unnest_tokens(output = word, input = title) %>%
  
  # calculate avg weeks per word
  group_by(word) %>%
  summarise(total_weeks = mean(total_weeks),
            n = n()) %>%
  ungroup() %>%
  filter(n > 10, !str_detect(word, '\\d')) %>%
  slice_max(order_by = total_weeks, n = 20) %>%
  
  ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
  geom_point() +
  labs(y = 'words in title')

shortcut

data_binarized_tbl <- data %>%
    # Drop first_week here so the correlation funnel does not break on date formats
    select(-title, -author, -first_week) %>% 
    binarize()

target_col_name <- names(data_binarized_tbl) %>%
    str_subset('total_weeks') %>%
    tail(1)

data_corr_tbl <- data_binarized_tbl %>%
    correlate(target = !!sym(target_col_name))

data_corr_tbl %>%
    plot_correlation_funnel()

#split data

set.seed(1234)

# train and test dataset
data_split <- rsample::initial_split(data, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(2345)
data_cv <- rsample::vfold_cv(data_train, v = 5)

#create model

# 3. Experiment with different preprocessing techniques
rf_recipe <- 
  recipe(formula = total_weeks ~ ., data = data_train) %>% 
  
  # Drop the date column to prevent novel testing errors later
  step_rm(first_week) %>% 
  
  # Convert character strings to factors safely
  step_string2factor(all_nominal_predictors()) %>% 
  
  # Requirement 2: Impute missing values for the newly added columns
  step_impute_mean(all_numeric_predictors()) %>% 
  step_impute_mode(all_nominal_predictors()) %>% 
  
  # Requirement 3: Use step_tf instead of step_tfidf
  step_tokenize(title) %>% 
  step_tokenfilter(title, max_tokens = 100) %>% 
  step_tf(title) %>% 
  
  step_novel(all_nominal_predictors()) %>% 
  step_other(author, threshold = 0.02) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE) 


# 4. Explore various machine learning algorithms (Random Forest)
rf_spec <- 
  rand_forest(trees = tune(), min_n = tune()) %>% 
  set_mode('regression') %>% 
  set_engine('ranger')

rf_workflow <- 
  workflow() %>% 
  add_recipe(rf_recipe) %>% 
  add_model(rf_spec)

# Tune 
set.seed(344)
rf_tune <- 
  tune_grid(rf_workflow, 
            resamples = data_cv, 
            grid = 5)
best_rmse <- select_best(rf_tune, metric = 'rmse')
final_rf <- finalize_workflow(
  rf_workflow,
  best_rmse
)
final_fit <- last_fit(final_rf, data_split)
collect_metrics(final_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config        
##   <chr>   <chr>          <dbl> <chr>          
## 1 rmse    standard       0.590 pre0_mod0_post0
## 2 rsq     standard       0.721 pre0_mod0_post0
# Visualize 
collect_predictions(final_fit) %>%
  ggplot(aes(total_weeks, .pred)) +
  geom_point(alpha = 0.5, color = 'midnightblue') +
  geom_abline(lty = 2, color = 'gray50') +
  coord_obs_pred() +
  labs(x = 'Actual Total Weeks (Log)',
       y = 'Predicted Total Weeks (Log)',
       title = 'Random Forest Model: Actual vs Predicted')