Import Data

# Load the NYT titles dataset

# Import Data
nyt_titles <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_titles.tsv')
nyt_full <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-05-10/nyt_full.tsv')

Clean Data

skimr::skim(nyt_titles)

Data summary
Name	nyt_titles
Number of rows	7431
Number of columns	8
_______________________
Column type frequency:
character	2
Date	1
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	1	74	0	7172	0
author	4	1	4	73	0	2205	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
first_week	0	1	1931-10-12	2020-12-06	2000-06-25	3348

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	3715.00	2145.29	0	1857.5	3715	5572.5	7430	▇▇▇▇▇
year	1	1989.61	26.23	1931	1968.0	2000	2011.0	2020	▂▂▂▃▇
total_weeks	1	8.13	11.21	1	2.0	4	10.0	178	▇▁▁▁▁
debut_rank	1	7.90	4.57	1	4.0	8	12.0	17	▇▆▅▅▅
best_rank	1	6.91	4.57	1	3.0	6	10.0	17	▇▅▃▃▂

data <- nyt_titles %>%
  select(-id) %>%
  filter(total_weeks > 0) %>%
  mutate(total_weeks = log(total_weeks))

Explore data

Year

data %>%
    ggplot(aes(total_weeks, year)) +
    scale_y_log10() +
    geom_point()

Authur

data %>%
    group_by(author) %>%
    filter(n() > 40) %>%
    ungroup() %>%
    ggplot(aes(total_weeks, as.factor(author))) +
    geom_boxplot() +
    coord_flip()

title

data %>%
  # tokenize title
  unnest_tokens(output = word, input = title) %>%
  
  # calculate avg weeks per word
  group_by(word) %>%
  summarise(total_weeks = mean(total_weeks),
            n = n()) %>%
  ungroup() %>%
  filter(n > 10, !str_detect(word, '\\d')) %>%
  slice_max(order_by = total_weeks, n = 20) %>%
  
  ggplot(aes(total_weeks, fct_reorder(word, total_weeks))) +
  geom_point() +
  labs(y = 'words in title')

shortcut

data_binarized_tbl <- data %>%
    # Drop first_week here so the correlation funnel does not break on date formats
    select(-title, -author, -first_week) %>% 
    binarize()

target_col_name <- names(data_binarized_tbl) %>%
    str_subset('total_weeks') %>%
    tail(1)

data_corr_tbl <- data_binarized_tbl %>%
    correlate(target = !!sym(target_col_name))

data_corr_tbl %>%
    plot_correlation_funnel()

#split data

set.seed(1234)

# train and test dataset
data_split <- rsample::initial_split(data, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(2345)
data_cv <- rsample::vfold_cv(data_train, v = 5)

#create model

# 3. Experiment with different preprocessing techniques
rf_recipe <- 
  recipe(formula = total_weeks ~ ., data = data_train) %>% 
  
  # Drop the date column to prevent novel testing errors later
  step_rm(first_week) %>% 
  
  # Convert character strings to factors safely
  step_string2factor(all_nominal_predictors()) %>% 
  
  # Requirement 2: Impute missing values for the newly added columns
  step_impute_mean(all_numeric_predictors()) %>% 
  step_impute_mode(all_nominal_predictors()) %>% 
  
  # Requirement 3: Use step_tf instead of step_tfidf
  step_tokenize(title) %>% 
  step_tokenfilter(title, max_tokens = 100) %>% 
  step_tf(title) %>% 
  
  step_novel(all_nominal_predictors()) %>% 
  step_other(author, threshold = 0.02) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE) 


# 4. Explore various machine learning algorithms (Random Forest)
rf_spec <- 
  rand_forest(trees = tune(), min_n = tune()) %>% 
  set_mode('regression') %>% 
  set_engine('ranger')

rf_workflow <- 
  workflow() %>% 
  add_recipe(rf_recipe) %>% 
  add_model(rf_spec)

# Tune 
set.seed(344)
rf_tune <- 
  tune_grid(rf_workflow, 
            resamples = data_cv, 
            grid = 5)

best_rmse <- select_best(rf_tune, metric = 'rmse')
final_rf <- finalize_workflow(
  rf_workflow,
  best_rmse
)
final_fit <- last_fit(final_rf, data_split)
collect_metrics(final_fit)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config        
##   <chr>   <chr>          <dbl> <chr>          
## 1 rmse    standard       0.590 pre0_mod0_post0
## 2 rsq     standard       0.721 pre0_mod0_post0

# Visualize 
collect_predictions(final_fit) %>%
  ggplot(aes(total_weeks, .pred)) +
  geom_point(alpha = 0.5, color = 'midnightblue') +
  geom_abline(lty = 2, color = 'gray50') +
  coord_obs_pred() +
  labs(x = 'Actual Total Weeks (Log)',
       y = 'Predicted Total Weeks (Log)',
       title = 'Random Forest Model: Actual vs Predicted')

Apply 4

Import Data

Clean Data

Explore data

Authur

title

shortcut