library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
horror_movies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv')
## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
horror_movies %>% 
    select(id, popularity:runtime) %>%
    pivot_longer(budget:runtime, names_to = "dim") %>%
    ggplot(aes(value, vote_average, color = dim)) +
    geom_point(alpha = 0.4, show.legend = FALSE) +
    scale_y_log10() +
    facet_wrap(~ dim, scales = "free_x") +
    labs(x = NULL)
## Warning: Transformation introduced infinite values in continuous y-axis

horror_df <- horror_movies %>% 
    select(title, popularity, vote_average, budget, revenue, runtime) %>%
    mutate(vote_average = log10(vote_average)) %>%
    mutate_if(is.character, factor)

Build a Model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
horror_split <- initial_split(horror_df, strata = vote_average)
horror_train <- training(horror_split)
horror_test <- testing(horror_split)

set.seed(234)
horror_folds <- bootstraps(horror_train, strata = vote_average)
horror_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits               id         
##    <list>               <chr>      
##  1 <split [24404/8966]> Bootstrap01
##  2 <split [24404/8967]> Bootstrap02
##  3 <split [24404/8909]> Bootstrap03
##  4 <split [24404/9005]> Bootstrap04
##  5 <split [24404/9101]> Bootstrap05
##  6 <split [24404/8976]> Bootstrap06
##  7 <split [24404/8961]> Bootstrap07
##  8 <split [24404/8998]> Bootstrap08
##  9 <split [24404/8952]> Bootstrap09
## 10 <split [24404/8981]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)

use_ranger(vote_average ~ ., data = horror_train)
## ranger_recipe <- 
##   recipe(formula = vote_average ~ ., data = horror_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("classification") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(20501)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)
ranger_recipe <- 
  recipe(formula = vote_average ~ ., data = horror_train) 

ranger_spec <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_mode("regression") %>% 
  set_engine("ranger") 

ranger_workflow <- 
  workflow() %>% 
  add_recipe(ranger_recipe) %>% 
  add_model(ranger_spec) 

set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <-
  tune_grid(ranger_workflow, 
            resamples = horror_folds, 
            grid = 11)
## i Creating pre-processing data to finalize unknown parameter: mtry

EXPLORE RESULTS

show_best(ranger_tune, metric ="rmse")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     3    18 rmse    standard     NaN     0      NA Preprocessor1_Model01
## 2     4    28 rmse    standard     NaN     0      NA Preprocessor1_Model02
## 3     4    24 rmse    standard     NaN     0      NA Preprocessor1_Model03
## 4     5    38 rmse    standard     NaN     0      NA Preprocessor1_Model04
## 5     4    10 rmse    standard     NaN     0      NA Preprocessor1_Model05
show_best(ranger_tune, metric ="rsq")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     3    18 rsq     standard     NaN     0      NA Preprocessor1_Model01
## 2     4    28 rsq     standard     NaN     0      NA Preprocessor1_Model02
## 3     4    24 rsq     standard     NaN     0      NA Preprocessor1_Model03
## 4     5    38 rsq     standard     NaN     0      NA Preprocessor1_Model04
## 5     4    10 rsq     standard     NaN     0      NA Preprocessor1_Model05
final_rf <- ranger_workflow %>%
    finalize_workflow(select_best(ranger_tune))
## Warning: No value of `metric` was given; metric 'rmse' will be used.
final_rf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 3
##   trees = 1000
##   min_n = 18
## 
## Computational engine: ranger
horror_fit <- last_fit(final_rf, horror_split)
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x1
horror_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits               id              .metrics .notes   .predictions .workflow 
##   <list>               <chr>           <list>   <list>   <list>       <list>    
## 1 <split [24404/8136]> train/test spl… <tibble> <tibble> <tibble>     <workflow>
## 
## There were issues with some computations:
## 
##   - Warning(s) x1: A correlation computation is required, but `estimate` is constant...
## 
## Run `show_notes(.Last.tune.result)` for more information.
collect_metrics(horror_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard         NaN Preprocessor1_Model1
## 2 rsq     standard          NA Preprocessor1_Model1