library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(explore)
library(spacyr)
library(textrecipes)
## Loading required package: recipes
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.4     ✔ rsample      1.1.1
## ✔ dials        1.1.0     ✔ tune         1.0.1
## ✔ infer        1.0.4     ✔ workflows    1.1.2
## ✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
## ✔ parsnip      1.1.0     ✔ yardstick    1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()         masks purrr::discard()
## ✖ dplyr::filter()           masks stats::filter()
## ✖ recipes::fixed()          masks stringr::fixed()
## ✖ parsnip::get_dependency() masks spacyr::get_dependency()
## ✖ dplyr::lag()              masks stats::lag()
## ✖ yardstick::spec()         masks readr::spec()
## ✖ recipes::step()           masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(finetune)
library(stopwords)

Import Data

horror_movies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv')
## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- horror_movies %>%

    # Compress the right-skewed rating scale
    mutate(vote_average = log1p(vote_average)) %>%

    # Keep released movies with an overview and at least one vote
    filter(!is.na(overview), vote_count != 0, status == "Released") %>%

    # One row per movie-genre combination
    separate_rows(genre_names, sep = ", ") %>%

    select(id, vote_average, genre_names, overview, runtime, original_language)

# Downsample to 100 rows to keep the walkthrough fast (seeded for reproducibility)
set.seed(42)
data <- data %>% sample_n(100)

Explore Data

# Quick structural summaries and per-variable exploration
data %>% glimpse()
data %>% skimr::skim()
data %>% select(id) %>% explore()
data %>% describe_all()
data %>% describe_cat(genre_names)
data %>% select(-id) %>% explore_all(target = vote_average)
data %>% 
    ggplot(aes(vote_average)) +
    geom_histogram() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
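
Since vote_average was log1p-transformed during import, it is worth a quick sanity check against the raw scale. A minimal sketch, using the unsampled horror_movies tibble loaded above:

# Raw-scale histogram of the same filtered movies, before log1p()
horror_movies %>%
    filter(!is.na(overview), vote_count != 0, status == "Released") %>%
    ggplot(aes(vote_average)) +
    geom_histogram(bins = 30)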

data %>% count(original_language, sort = TRUE)
## # A tibble: 17 × 2
##    original_language     n
##    <chr>             <int>
##  1 en                   72
##  2 ja                    7
##  3 zh                    3
##  4 es                    2
##  5 fr                    2
##  6 it                    2
##  7 ru                    2
##  8 cn                    1
##  9 da                    1
## 10 is                    1
## 11 ko                    1
## 12 lv                    1
## 13 ml                    1
## 14 pt                    1
## 15 sv                    1
## 16 th                    1
## 17 tl                    1
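
English dominates, and most other languages appear only once or twice. As a sketch, forcats (attached with the tidyverse) can collapse the rare languages up front, mirroring what step_other() does inside the recipe later:

# Collapse all but the 3 most common languages into "Other"
data %>%
    mutate(original_language = fct_lump_n(original_language, n = 3)) %>%
    count(original_language, sort = TRUE)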
data %>%

    # Average rating and movie count per language
    group_by(original_language) %>%
    summarise(
        n = n(),
        avg_vote_average = mean(vote_average)
    ) %>%

    ggplot(aes(n, avg_vote_average)) +
    geom_text(aes(label = original_language), check_overlap = TRUE) +
    # Dotted reference line at the overall mean rating
    geom_hline(yintercept = mean(data$vote_average),
               linewidth = 2, linetype = "dotted", color = "darkgray") +

    scale_x_log10()

spacy_initialize(model = "en_core_web_sm")
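
spacy_parse() returns one row per token, with lemma and part-of-speech (pos) columns. A minimal sketch on a single made-up sentence shows the shape before it is applied to every overview:

# One row per token; lemma and pos are the columns used below
spacy_parse("A haunted house terrifies the new owners.") %>%
    select(token, lemma, pos)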

# Parse every overview with spaCy: one row per token in the result
tidy_data <- data %>%

    mutate(overview_parsed = map(overview, spacy_parse)) %>%
    unnest(overview_parsed) %>%

    # Keep only adjectives and nouns
    filter(pos %in% c("ADJ", "NOUN"))

data_filtered <- tidy_data %>%
    # Drop lemmas with no alphabetic characters (numbers, punctuation)
    filter(str_detect(lemma, regex("[a-z]", ignore_case = TRUE))) %>%
    group_by(lemma) %>%
    summarise(
        n = n(),
        avg_vote_average = mean(vote_average)
    ) %>%
    # Keep only frequent lemmas
    filter(n > 150)

data_filtered %>%
    ggplot(aes(n, avg_vote_average)) +
    # geom_point() +
    geom_text(aes(label = lemma), check_overlap = TRUE) +
    geom_hline(yintercept = mean(data_filtered$avg_vote_average),
               linetype = "dotted", linewidth = 2, color = "darkgray") +
    scale_x_log10()

data %>%
    ggplot(aes(runtime, vote_average)) +
    geom_jitter(alpha = 0.3)

Build a Model

set.seed(123)
data_split <- initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- rsample::vfold_cv(data_train)
data_folds
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
use_xgboost(vote_average ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = vote_average ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(56024)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <- 
  recipe(formula = vote_average ~ ., data = data_train) %>% 
    # Keep id for bookkeeping without treating it as a predictor
    recipes::update_role(id, new_role = "id") %>%
    # Tokenize overviews, drop stop words, keep the 100 most frequent tokens
    step_tokenize(overview) %>%
    step_stopwords(overview) %>%
    step_tokenfilter(overview, max_tokens = 100) %>%
    # Weight the remaining tokens by tf-idf
    step_tfidf(overview) %>%
    # Pool infrequent languages, then one-hot encode the nominal predictors
    step_other(original_language) %>% 
    step_dummy(genre_names, original_language, one_hot = TRUE) %>%
    # Reduce skew in runtime
    step_YeoJohnson(runtime)

# Inspect the engineered predictors (bake(), not juice(), takes new_data = NULL)
xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
    loss_reduction = tune(), sample_size = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(56024)
doParallel::registerDoParallel()

xgboost_tune <-
  tune_grid(xgboost_workflow, resamples = data_folds, grid = 10)
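
As an aside, finetune is attached above but never used. A sketch of the same search with ANOVA racing, which drops clearly poor hyperparameter candidates after a few resamples and usually finishes faster:

# Racing alternative to tune_grid(); xgboost_race is a new object name
set.seed(56024)
xgboost_race <-
  tune_race_anova(xgboost_workflow, resamples = data_folds, grid = 10)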

Explore Results

show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1   581    15          2    0.0253        1.98e- 4       0.738 rmse   
## 2  1116    32          6    0.0399        1.10e- 6       0.479 rmse   
## 3  1246     9          7    0.00902       7.48e-10       0.154 rmse   
## 4   357    40          3    0.0803        1.73e- 9       0.882 rmse   
## 5   861    25          9    0.0174        6.69e- 8       0.439 rmse   
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>
autoplot(xgboost_tune)

final_xgb <- xgboost_workflow %>% 
    finalize_workflow(select_best(xgboost_tune, metric = "rmse"))
data_fit <- last_fit(final_xgb, data_split)
## ! train/test split: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: TV Movie, Documentary
data_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits          id               .metrics .notes   .predictions .workflow 
##   <list>          <chr>            <list>   <list>   <list>       <list>    
## 1 <split [75/25]> train/test split <tibble> <tibble> <tibble>     <workflow>
## 
## There were issues with some computations:
## 
##   - Warning(s) x1: There are new levels in a factor: TV Movie, Documentary
## 
## Run `show_notes(.Last.tune.result)` for more information.
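
The warning means the test set contains genre levels (TV Movie, Documentary) that never appeared in training, so step_dummy() had no columns for them. One way to handle this, sketched below, is a variant of the recipe with step_novel() before step_dummy(), which maps unseen levels to a catch-all level:

# Sketch: same recipe as above, plus step_novel() for unseen factor levels
xgboost_recipe_novel <- 
  recipe(formula = vote_average ~ ., data = data_train) %>% 
    recipes::update_role(id, new_role = "id") %>%
    step_tokenize(overview) %>%
    step_stopwords(overview) %>%
    step_tokenfilter(overview, max_tokens = 100) %>%
    step_tfidf(overview) %>%
    step_other(original_language) %>% 
    # Assign levels unseen at training time to a "new" level
    step_novel(genre_names, original_language) %>%
    step_dummy(genre_names, original_language, one_hot = TRUE) %>%
    step_YeoJohnson(runtime)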

Evaluate Model

collect_metrics(data_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      0.232  Preprocessor1_Model1
## 2 rsq     standard      0.0483 Preprocessor1_Model1
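
The R-squared is very low, which is not surprising on a 100-row sample. As a rough sketch, the test RMSE can be compared against a naive baseline that always predicts the training-set mean:

# RMSE of always predicting the training mean, as a point of comparison
collect_predictions(data_fit) %>%
    mutate(.pred_naive = mean(data_train$vote_average)) %>%
    rmse(truth = vote_average, estimate = .pred_naive)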
collect_predictions(data_fit)
## # A tibble: 25 × 5
##    id               .pred  .row vote_average .config             
##    <chr>            <dbl> <int>        <dbl> <chr>               
##  1 train/test split  1.70     1         1.50 Preprocessor1_Model1
##  2 train/test split  1.88     2         1.99 Preprocessor1_Model1
##  3 train/test split  1.53     3         1.92 Preprocessor1_Model1
##  4 train/test split  1.77    10         2.01 Preprocessor1_Model1
##  5 train/test split  1.88    11         1.79 Preprocessor1_Model1
##  6 train/test split  1.97    24         2.20 Preprocessor1_Model1
##  7 train/test split  1.79    28         1.97 Preprocessor1_Model1
##  8 train/test split  1.88    35         1.84 Preprocessor1_Model1
##  9 train/test split  1.69    37         1.69 Preprocessor1_Model1
## 10 train/test split  1.88    44         1.57 Preprocessor1_Model1
## # ℹ 15 more rows
collect_predictions(data_fit) %>%
    ggplot(aes(vote_average, .pred)) +
    geom_point(alpha = 0.5, color = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") +
    coord_fixed()

data_fit %>%
    extract_workflow() %>%
    predict(data_test[1,])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  1.70
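
Note that predictions are still on the log1p scale applied during import. A small sketch of inverting back to the original vote scale:

# expm1() undoes the log1p() applied when `data` was created
data_fit %>%
    extract_workflow() %>%
    predict(data_test[1, ]) %>%
    mutate(.pred_votes = expm1(.pred))
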
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
imp_spec <- xgboost_spec %>%
    tune::finalize_model(tune::select_best(xgboost_tune, metric = "rmse")) %>%
    # Note: ranger's importance = "permutation" argument does not apply to
    # xgboost; vip() uses xgboost's native gain-based importance instead
    parsnip::set_engine("xgboost")
workflows::workflow() %>%
    add_recipe(xgboost_recipe) %>%
    add_model(imp_spec) %>%
    fit(data_train) %>%
    workflows::extract_fit_parsnip() %>%
    vip()