Goal: Build a regression model to predict the average movie rating (vote_average). Use the horror_movies dataset. Click here for the data

Import Data

horror_movies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-11-01/horror_movies.csv')


skimr::skim(horror_movies)

Data summary
Name	horror_movies
Number of rows	32540
Number of columns	20
_______________________
Column type frequency:
character	10
Date	1
logical	1
numeric	8
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
original_title	0	1.00	1	191	30296
title	0	1.00	1	191	29563
original_language	0	1.00	2	2	97
overview	1286	0.96	1	1000	31020
tagline	19835	0.39	1	237	12513
poster_path	4474	0.86	30	32	28048
status	0	1.00	7	15	4
backdrop_path	18995	0.42	29	32	13536
genre_names	0	1.00	6	144	772
collection_name	30234	0.07	4	56	815

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
release_date	0	1	1950-01-01	2022-12-31	2012-12-09	10999

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
adult	0	1	0	FAL: 32540

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1.00	445910.83	305744.67	17	146494.8	426521.00	707534.00	1033095.00	▇▆▆▅▅
popularity	0	1.00	4.01	37.51	0	0.6	0.84	2.24	5088.58	▇▁▁▁▁
vote_count	0	1.00	62.69	420.89	0	0.0	2.00	11.00	16900.00	▇▁▁▁▁
vote_average	0	1.00	3.34	2.88	0	0.0	4.00	5.70	10.00	▇▂▆▃▁
budget	0	1.00	543126.59	4542667.81	0	0.0	0.00	0.00	200000000.00	▇▁▁▁▁
revenue	0	1.00	1349746.73	14430479.15	0	0.0	0.00	0.00	701842551.00	▇▁▁▁▁
runtime	0	1.00	62.14	41.00	0	14.0	80.00	91.00	683.00	▇▁▁▁▁
collection	30234	0.07	481534.88	324498.16	656	155421.0	471259.00	759067.25	1033032.00	▇▅▅▅▅

data <- horror_movies %>%
    
    # Filter out Non Released Movies
    filter(status == "Released") %>%

    # Treat missing values
    select(-overview,-tagline,-poster_path,-backdrop_path,-collection_name, -status, -original_title, -adult) %>%
    na.omit() %>%
    
    # Seperating Row Names
    separate_rows(genre_names, sep = ", ") %>%
    
    #log transform variables for voting average
    mutate(vote_average = log(vote_average + 1)) %>%
    
    mutate(across(where(is.character),as.factor)) %>%
    mutate(title = as.character(title)) %>%
    
    mutate(across(where(is.logical),as.factor))

Explore Data

Identify good predictors.

genre_names

data %>%
    ggplot(aes(vote_average, genre_names)) +
    geom_boxplot()

popularity

data %>%
    ggplot(aes(vote_average, popularity)) +
    geom_point()

EDA shortcut

# Step 1: Prepare data
data_binarized_tbl <- data %>%
    select(-id, -release_date) %>%
    binarize()

data_binarized_tbl %>% glimpse()

## Rows: 4,949
## Columns: 47
## $ `title__Scooby-Doo!_Frankencreepy`              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `title__-OTHER`                                 <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ original_language__cn                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__de                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__en                           <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ original_language__es                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__id                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__ja                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__th                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `original_language__-OTHER`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `popularity__-Inf_1.55`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__1.55_4.882                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__4.882_13.017                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__13.017_Inf                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `vote_count__-Inf_5`                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__5_36                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__36_205                              <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ vote_count__205_Inf                             <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
## $ `vote_average__-Inf_1.66770682055808`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.66770682055808_1.85629799036563 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.85629799036563_1.97408102602201 <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ vote_average__1.97408102602201_Inf              <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
## $ `budget__-Inf_7e+05`                            <dbl> 1, 1, 0, 0, 0, 0, 0, 0…
## $ `budget__7e+05_Inf`                             <dbl> 0, 0, 1, 1, 1, 1, 1, 1…
## $ revenue__0                                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `revenue__-OTHER`                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `runtime__-Inf_81`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__81_90                                  <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ runtime__90_99                                  <dbl> 1, 1, 0, 0, 0, 0, 0, 0…
## $ runtime__99_Inf                                 <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ genre_names__Action                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Adventure                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Animation                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Comedy                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Crime                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Drama                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Fantasy                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Horror                             <dbl> 1, 0, 1, 0, 0, 1, 0, 0…
## $ genre_names__Mystery                            <dbl> 0, 0, 0, 1, 0, 0, 1, 0…
## $ genre_names__Science_Fiction                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Thriller                           <dbl> 0, 1, 0, 0, 1, 0, 0, 1…
## $ genre_names__TV_Movie                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `genre_names__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `collection__-Inf_133352`                       <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ collection__133352_459212                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ collection__459212_744915                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ collection__744915_Inf                          <dbl> 1, 1, 0, 0, 0, 1, 1, 1…

# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
    correlate(vote_average__1.97408102602201_Inf)

data_corr_tbl

## # A tibble: 47 × 3
##    feature      bin                               correlation
##    <fct>        <chr>                                   <dbl>
##  1 vote_average 1.97408102602201_Inf                    1    
##  2 vote_average 1.66770682055808_1.85629799036563      -0.335
##  3 vote_average -Inf_1.66770682055808                  -0.330
##  4 vote_average 1.85629799036563_1.97408102602201      -0.324
##  5 popularity   13.017_Inf                              0.243
##  6 revenue      0                                      -0.233
##  7 revenue      -OTHER                                  0.233
##  8 vote_count   205_Inf                                 0.214
##  9 runtime      99_Inf                                  0.196
## 10 vote_count   5_36                                   -0.123
## # ℹ 37 more rows

# Step 3: Plot
data_corr_tbl %>%
    plot_correlation_funnel()

## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

Split data

# data <- sample_n(data, 100)

#Split into train and test dataset
set.seed(1234)
data_split <- rsample:: initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

#Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [3339/372]> Fold01
##  2 <split [3340/371]> Fold02
##  3 <split [3340/371]> Fold03
##  4 <split [3340/371]> Fold04
##  5 <split [3340/371]> Fold05
##  6 <split [3340/371]> Fold06
##  7 <split [3340/371]> Fold07
##  8 <split [3340/371]> Fold08
##  9 <split [3340/371]> Fold09
## 10 <split [3340/371]> Fold10

library(usemodels)
usemodels::use_xgboost(vote_average ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = vote_average ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(10213)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify recipe
xgboost_recipe <-
    recipe(formula = vote_average ~ ., data = data_train) %>%
    recipes::update_role(id, new_role ="id variable") %>%
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tf(title) %>%
    step_date(release_date, keep_original_cols = FALSE) %>%
    step_dummy(all_nominal_predictors())

xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 3,711
## Columns: 170
## $ id                          <dbl> 29426, 520901, 68557, 246403, 147619, 6492…
## $ popularity                  <dbl> 17.627, 28.724, 4.015, 19.296, 0.600, 3.19…
## $ vote_count                  <dbl> 376, 325, 17, 1078, 2, 21, 130, 20, 26, 12…
## $ budget                      <dbl> 4.0e+06, 0.0e+00, 0.0e+00, 3.0e+06, 0.0e+0…
## $ revenue                     <dbl> 0, 0, 0, 1882074, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ runtime                     <dbl> 90, 97, 106, 102, 97, 72, 88, 78, 87, 86, …
## $ collection                  <dbl> 944437, 634390, 856206, 553902, 690968, 74…
## $ vote_average                <dbl> 1.774952, 1.774952, 2.014903, 1.871802, 1.…
## $ tf_title_1                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_3                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ tf_title_4                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_5                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_6                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_a                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_alien              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_all                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_and                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_angel              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_atta               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_attack             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_beast              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_bizarre            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_black              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_blood              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_bong               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_boys               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_children           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_curse              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_dark               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_darkness           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_day                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_dead               <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_death              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_demon              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_devil              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tf_title_don't`            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_doo                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_evil               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_fear               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_final              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ tf_title_from               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_gamera             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_ghost              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_girl               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_halloween          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_haunted            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_hell               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_high               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ tf_title_honto              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_horror             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_house              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_hunter             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tf_title_i'm`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_ii                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_iii                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_in                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_iv                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_kamen              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_killer             <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ tf_title_lake               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_last               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_legend             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_lost               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_man                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_massacre           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_master             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_me                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_movie              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_my                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_new                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_ni                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_night              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_nightmare          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_no                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_noroi              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_of                 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ tf_title_on                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_paranormal         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_part               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_pop                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_psycho             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_rattle             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_resident           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_return             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_revenge            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_rider              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_roll               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_scared             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_scooby             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_shake              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_shark              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_special            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_spirit             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_story              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_tales              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_terror             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_the                <int> 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, …
## $ tf_title_to                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_tokyo              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_tremors            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_troublesome        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_vampire            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_video              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_volume             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_vs                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_world              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_zombie             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_year           <int> 2010, 2019, 2009, 2014, 1976, 1999, 2011, …
## $ original_language_da        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_de        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_el        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_en        <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, …
## $ original_language_es        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_fi        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_fr        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_hi        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_id        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_it        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_ja        <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ original_language_ko        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_ml        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_ms        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_no        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_pl        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_pt        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_ru        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_si        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_ta        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_te        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_th        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_tl        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_tr        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_xx        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_zh        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Adventure       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Animation       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Comedy          <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Crime           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ genre_names_Documentary     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Drama           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Family          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Fantasy         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_History         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Horror          <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, …
## $ genre_names_Music           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Mystery         <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Romance         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Science.Fiction <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Thriller        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ genre_names_TV.Movie        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_War             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Western         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_dow_Mon        <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_dow_Tue        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ release_date_dow_Wed        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_dow_Thu        <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ release_date_dow_Fri        <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ release_date_dow_Sat        <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, …
## $ release_date_month_Feb      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_month_Mar      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ release_date_month_Apr      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_month_May      <dbl> 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ release_date_month_Jun      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_month_Jul      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_month_Aug      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_month_Sep      <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_month_Oct      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ release_date_month_Nov      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ release_date_month_Dec      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, …

# Specify model
xgboost_spec <-
    boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
    set_mode("regression") %>%
    set_engine("xgboost")

# Combinte recipe and model using workflow
xgboost_workflow <-
    workflow() %>%
    add_recipe(xgboost_recipe) %>%
    add_model(xgboost_spec)

# Tune hyperparameters
set.seed(344)
xgboost_tune <-
    tune_grid(xgboost_workflow, 
              resamples = data_cv,
              grid = 5)

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")

## # A tibble: 5 × 10
##    mtry trees min_n learn_rate .metric .estimator  mean     n std_err .config   
##   <int> <int> <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>     
## 1    42  1000    40    0.316   rmse    standard   0.124    10 0.00561 Preproces…
## 2   168  2000    21    0.0178  rmse    standard   0.135    10 0.00355 Preproces…
## 3     1  1500    11    0.00422 rmse    standard   0.454    10 0.0106  Preproces…
## 4   126   500    30    0.001   rmse    standard   0.811    10 0.00392 Preproces…
## 5    84     1     2    0.0750  rmse    standard   1.22     10 0.00483 Preproces…

# Update the model by selecting the best hyperparameters.
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                        tune::select_best(xgboost_tune, metric = "rmse"))

# Fit the model on the entire training data and test it on the test data.
data_fit <- tune::last_fit(xgboost_fw, data_split)

tune::collect_metrics(data_fit)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.108 Preprocessor1_Model1
## 2 rsq     standard       0.968 Preprocessor1_Model1

tune::collect_predictions(data_fit) %>%
    ggplot(aes(vote_average, .pred)) +
    geom_point(alpha = 0.3, fill = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") +
    coord_fixed()

Make Predictions

I adjusted the step_tf so that it would run a little smoother, but aside from that not many changes were made.

Apply Data 2 Horror Movies

Thomas Hall

2025-02-25

Import Data

Explore Data

Build Models

Evaluate Models

Make Predictions