Apply 3: Anton Jellvik

Explore data

library(tidyverse)

# Read the horror movies data (TidyTuesday, 2022-11-01) directly from GitHub.
horror_movies <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv"
)

## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Histogram of the average vote across all movies (default binning).
horror_movies %>%
  ggplot(aes(x = vote_average)) +
  geom_histogram(alpha = 0.8)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Vote average by runtime: movies without a runtime are dropped for this
# plot only, and runtime is cut into 10 intervals so each gets its own box.
horror_movies %>%
  drop_na(runtime) %>%
  mutate(runtime = cut(runtime, breaks = 10)) %>%
  ggplot(aes(x = runtime, y = vote_average, fill = runtime)) +
  geom_boxplot(alpha = 0.7, show.legend = FALSE)

Tune an xgboost model

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

# Modeling data: keep only the identifier, the outcome and the tagline text,
# and drop every row with a missing value in any of them.
horror_model_data <- horror_movies %>%
  select(title, vote_average, tagline) %>%
  na.omit()

# Train/test split (3/4 vs 1/4), stratified on the outcome.
set.seed(123)
horror_split <- initial_split(
  horror_model_data,
  prop = 3 / 4,
  strata = vote_average
)
horror_train <- training(horror_split)
horror_test <- testing(horror_split)

# 10-fold cross-validation on the training set, stratified the same way.
set.seed(234)
horror_folds <- vfold_cv(horror_train, v = 10, strata = vote_average)
horror_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [8572/955]> Fold01
##  2 <split [8572/955]> Fold02
##  3 <split [8574/953]> Fold03
##  4 <split [8574/953]> Fold04
##  5 <split [8574/953]> Fold05
##  6 <split [8575/952]> Fold06
##  7 <split [8575/952]> Fold07
##  8 <split [8575/952]> Fold08
##  9 <split [8576/951]> Fold09
## 10 <split [8576/951]> Fold10

library(textrecipes)

# Custom tokenizer passed to step_tokenize(): turns each tagline into a
# character vector of lowercase, punctuation-free words.
#
# x: character vector of taglines.
# Returns a list with one character vector of tokens per element of `x`.
split_tagline <- function(x) {
  x %>%
    str_split(" ") %>%
    map(str_remove_all, "[:punct:]") %>%
    map(str_squish) %>%
    map(str_to_lower) %>%
    map(str_replace_all, " ", "_") %>%
    # Pieces that were only punctuation, or that came from consecutive
    # spaces, are "" at this point. Drop them so the empty string is not
    # counted as a word (it otherwise becomes the `tf_tagline_` feature).
    map(function(tokens) tokens[nzchar(tokens)])
}
# Preprocessing recipe. `title` is kept as an identifier rather than a
# predictor, so the tagline is the only source of model features.
horror_rec <- update_role(
  recipe(vote_average ~ ., data = horror_train),
  title,
  new_role = "id"
)

# Tokenize the tagline with the custom tokenizer, keep at most 100 tokens,
# and turn those into term-frequency columns.
horror_rec <- horror_rec %>%
  step_tokenize(tagline, custom_token = split_tagline) %>%
  step_tokenfilter(tagline, max_tokens = 100) %>%
  step_tf(tagline)

# Sanity check: estimate the recipe on the training data and inspect the
# processed training set it produces.
horror_movie_prep <- prep(horror_rec)
horror_movie_prep %>%
  bake(new_data = NULL) %>%
  str()

## tibble [9,527 × 102] (S3: tbl_df/tbl/data.frame)
##  $ title               : Factor w/ 9088 levels "'93: Del Soul",..: 4197 1723 6785 8125 7571 6588 3093 7764 1083 1949 ...
##  $ vote_average        : num [1:9527] 2 1.8 2 0 0 0 0 0 0 0 ...
##  $ tf_tagline_         : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_a        : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_about    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_all      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_an       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_and      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_are      : int [1:9527] 0 0 0 0 0 1 0 0 1 0 ...
##  $ tf_tagline_at       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_back     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_be       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_blood    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_but      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_by       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_can      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_cant     : int [1:9527] 0 0 0 0 0 0 0 0 1 0 ...
##  $ tf_tagline_come     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_comes    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_dead     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_death    : int [1:9527] 0 0 0 1 0 0 0 0 0 0 ...
##  $ tf_tagline_die      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_do       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_dont     : int [1:9527] 0 0 1 0 0 0 0 0 0 0 ...
##  $ tf_tagline_every    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_evil     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_fear     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_film     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_for      : int [1:9527] 0 1 0 0 0 0 0 0 0 1 ...
##  $ tf_tagline_from     : int [1:9527] 0 0 0 0 0 0 0 0 1 0 ...
##  $ tf_tagline_get      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_go       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_has      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_have     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_he       : int [1:9527] 0 0 0 1 0 0 0 0 0 0 ...
##  $ tf_tagline_hell     : int [1:9527] 0 0 0 0 0 1 0 0 0 0 ...
##  $ tf_tagline_her      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_hes      : int [1:9527] 0 0 0 0 0 0 0 0 0 1 ...
##  $ tf_tagline_his      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_home     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_horror   : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_i        : int [1:9527] 0 0 1 0 0 0 0 0 0 0 ...
##  $ tf_tagline_if       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_in       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_into     : int [1:9527] 0 0 0 0 0 1 0 0 0 0 ...
##  $ tf_tagline_is       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_it       : int [1:9527] 1 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_its      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_just     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_kill     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_killer   : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_last     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_life     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_like     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_love     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_man      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_more     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_never    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_new      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_night    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_nightmare: int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_no       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_not      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_now      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_of       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_on       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_one      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_only     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_or       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_out      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_real     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_see      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_she      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_some     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_something: int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_story    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_terror   : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_than     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_that     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_the      : int [1:9527] 0 1 0 0 0 0 1 0 0 0 ...
##  $ tf_tagline_their    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_them     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_there    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_theres   : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_they     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_this     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_time     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_to       : int [1:9527] 0 0 1 0 0 1 0 0 0 0 ...
##  $ tf_tagline_up       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_was      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_way      : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_we       : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_what     : int [1:9527] 0 0 0 0 0 0 0 1 0 0 ...
##  $ tf_tagline_when     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_where    : int [1:9527] 0 0 0 1 0 0 0 0 0 0 ...
##  $ tf_tagline_who      : int [1:9527] 0 0 0 0 0 0 0 0 1 0 ...
##  $ tf_tagline_will     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_with     : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##  $ tf_tagline_world    : int [1:9527] 0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]

# Boosted-tree regression spec for the xgboost engine: mtry, trees and min_n
# are marked for tuning, while the learning rate is fixed at 0.01.
xgb_spec <- boost_tree(
  mtry = tune(),
  trees = tune(),
  min_n = tune(),
  learn_rate = 0.01
)
xgb_spec <- xgb_spec %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Bundle the preprocessing recipe and the model spec into one workflow.
xgb_wf <- workflow(preprocessor = horror_rec, spec = xgb_spec)
xgb_wf

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = tune()
##   min_n = tune()
##   learn_rate = 0.01
## 
## Computational engine: xgboost

library(finetune)
# Register a doParallel backend before the tuning run below.
doParallel::registerDoParallel()

# Tune with an ANOVA race over the resamples, starting from a grid of 20
# candidates; verbose_elim = TRUE logs how many candidates are eliminated.
set.seed(234)
xgb_horror_rs <-
  xgb_wf %>%
  tune_race_anova(
    resamples = horror_folds,
    grid = 20,
    control = control_race(verbose_elim = TRUE)
  )

## i Creating pre-processing data to finalize unknown parameter: mtry

## ℹ Racing will minimize the rmse metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 5 eliminated; 15 candidates remain.
## 
## ℹ Fold06: 6 eliminated; 9 candidates remain.
## 
## ℹ Fold08: 3 eliminated; 6 candidates remain.
## 
## ℹ Fold01: 0 eliminated; 6 candidates remain.
## 
## ℹ Fold04: 1 eliminated; 5 candidates remain.
## 
## ℹ Fold02: 0 eliminated; 5 candidates remain.
## 
## ℹ Fold09: 1 eliminated; 4 candidates remain.

xgb_horror_rs

## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits             id     .order .metrics          .notes          
##    <list>             <chr>   <int> <list>            <list>          
##  1 <split [8574/953]> Fold03      1 <tibble [40 × 7]> <tibble [0 × 3]>
##  2 <split [8574/953]> Fold05      2 <tibble [40 × 7]> <tibble [0 × 3]>
##  3 <split [8576/951]> Fold10      3 <tibble [40 × 7]> <tibble [0 × 3]>
##  4 <split [8575/952]> Fold06      4 <tibble [30 × 7]> <tibble [0 × 3]>
##  5 <split [8575/952]> Fold08      5 <tibble [18 × 7]> <tibble [0 × 3]>
##  6 <split [8572/955]> Fold01      6 <tibble [12 × 7]> <tibble [0 × 3]>
##  7 <split [8574/953]> Fold04      7 <tibble [12 × 7]> <tibble [0 × 3]>
##  8 <split [8572/955]> Fold02      8 <tibble [10 × 7]> <tibble [0 × 3]>
##  9 <split [8576/951]> Fold09      9 <tibble [10 × 7]> <tibble [0 × 3]>
## 10 <split [8575/952]> Fold07     10 <tibble [8 × 7]>  <tibble [0 × 3]>

Evaluate the model

# Plot the racing results.
plot_race(xgb_horror_rs)

# Finalize the workflow with the candidate that has the best RMSE, then fit
# it on the training set and evaluate it on the test set via last_fit().
# `metric` is passed by name: newer tune releases require the optional
# arguments of select_best() to be named, so a positional "rmse" fails there.
xgb_last <-
  xgb_wf %>%
  finalize_workflow(select_best(xgb_horror_rs, metric = "rmse")) %>%
  last_fit(horror_split)

xgb_last

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits              id               .metrics .notes   .predictions .workflow 
##   <list>              <chr>            <list>   <list>   <list>       <list>    
## 1 <split [9527/3178]> train/test split <tibble> <tibble> <tibble>     <workflow>

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Pull the fitted workflow out of the last_fit() result and plot variable
# importance for 12 features as points.
xgb_fit <- xgb_last %>%
  extract_workflow()
vip(xgb_fit, num_features = 12, geom = "point")

library(SHAPforxgboost)

# Predictor columns of the processed training set, as a matrix.
shap_predictors <- bake(
  horror_movie_prep,
  new_data = NULL,
  has_role("predictor"),
  composition = "matrix"
)

# Prepare SHAP values from the underlying xgboost fit and those predictors.
horror_shap <- shap.prep(
  xgb_model = extract_fit_engine(xgb_fit),
  X_train = shap_predictors
)

shap.plot.summary(horror_shap)

How do the runtime and tagline of horror movies impact their vote average, and which factors are the most significant predictors in determining this vote average?

The analysis explores horror movies’ vote averages, runtimes, and taglines. Initial visual explorations are followed by the application of an xgboost model to predict ‘vote_average’ using the movie’s tagline. The model is then tuned, evaluated, and interpreted to understand the key influencing factors.

The dataset revolves around horror movies, highlighting attributes like vote_average, runtime, and tagline. vote_average denotes a movie’s rating, while runtime presents its duration. taglines, the memorable movie phrases, are analyzed for patterns that might influence ratings. Through visual tools, the data reveals correlations and trends, serving as a basis for predicting movie ratings using machine learning models.

The dataset’s key variables are vote_average, runtime, and tagline. vote_average indicates a movie’s reception through audience ratings. runtime measures the movie’s duration, exploring its potential influence on ratings. taglines, the catchy movie phrases, are analyzed for words that might sway audience preferences. Together, these variables provide insights into what makes a horror movie resonate with its viewers.

The dataset underwent key transformations before modeling. Taglines were tokenized into standardized, lowercase words, free from punctuation, and at most 100 tokens were kept as term-frequency features. Rows with a missing title, vote_average, or tagline were dropped before modeling; rows with NA in runtime were discarded only for the exploratory runtime boxplot. Additionally, data was stratified by vote_average for balanced training and testing sets. These adjustments were intended to improve model precision and clarity in data interpretation.
The data preparation steps mentioned are:

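- Tokenization of taglines
- Standardization to lowercase
- Removal of punctuation from taglines
- Dropping rows with a missing title, vote_average, or tagline before modeling (rows with NA in runtime were discarded only for the runtime boxplot)
- Stratification of data by vote_average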

The machine learning model used in the analysis is Extreme Gradient Boosting, commonly known as XGBoost. This is specified in the code with set_engine("xgboost") and set_mode("regression"), indicating that it’s being used for a regression problem to predict horror movie ratings (vote_average).

In the analysis, Root Mean Square Error (RMSE) is used as the evaluation metric, as indicated by the select_best() call on xgb_horror_rs with the "rmse" metric. RMSE measures the typical size of the error between predicted and actual movie ratings. A lower RMSE indicates a more accurate model, making it a relevant metric for my research question, which aims to accurately predict horror movie ratings from the words in their taglines.
The horror movie dataset analysis revealed key takeaways. The audience reception, depicted by vote_average, and the relationship between runtime and ratings, which was explored visually, suggest film length’s potential impact on success. Tokenized taglines indicated specific words or phrases might correlate with a film’s reception. The xgboost model’s performance and insights from VIP and SHAP analyses emphasized the importance of certain tagline words, the only predictors given to the model, in determining a movie’s success. This study highlights the nuances affecting a horror movie’s audience acclaim.

Apply 3: Anton Jellvik

2023-09-24

Explore data

Tune an xgboost model

Evaluate the model