library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
horror_movies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv')
## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
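Before plotting, a quick look at the columns can be helpful; this check is an addition, not part of the original analysis.

glimpse(horror_movies)
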
horror_movies %>% 
    select(id, popularity:runtime) %>%
    pivot_longer(budget:runtime, names_to = "dim") %>%
    ggplot(aes(value, vote_average, color = dim)) +
    geom_point(alpha = 0.4, show.legend = FALSE) +
    scale_y_log10() +
    facet_wrap(~ dim, scales = "free_x") +
    labs(x = NULL)
## Warning: Transformation introduced infinite values in continuous y-axis
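
The warning above comes from movies with a vote_average of zero, which becomes -Inf on a log10 scale; a quick count (added here as an illustration) shows how many rows are affected, and it motivates the log10(x + 1) transformation used next.

horror_movies %>% count(vote_average == 0)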

horror_df <- horror_movies %>% 
    select(popularity, vote_average, budget, revenue, runtime) %>%
    mutate(vote_average = log10(vote_average + 1)) %>%
    mutate_if(is.character, factor)
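
Since ranger errors on missing values in the predictors and the recipe below has no imputation steps, a quick NA count on the modeling data is worth running first; this check is an addition to the original analysis.

horror_df %>% summarise(across(everything(), ~ sum(is.na(.x))))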

Build a Model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
# take a small random sample of movies to keep resampling and tuning fast
horror_df <- sample_n(horror_df, 100)

set.seed(123)
horror_split <- initial_split(horror_df, strata = vote_average)
horror_train <- training(horror_split)
horror_test <- testing(horror_split)

set.seed(234)
horror_folds <- bootstraps(horror_train, strata = vote_average)
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
horror_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits          id         
##    <list>          <chr>      
##  1 <split [73/24]> Bootstrap01
##  2 <split [73/28]> Bootstrap02
##  3 <split [73/31]> Bootstrap03
##  4 <split [73/26]> Bootstrap04
##  5 <split [73/26]> Bootstrap05
##  6 <split [73/25]> Bootstrap06
##  7 <split [73/30]> Bootstrap07
##  8 <split [73/27]> Bootstrap08
##  9 <split [73/24]> Bootstrap09
## 10 <split [73/23]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)

use_ranger(vote_average ~ ., data = horror_train)
## ranger_recipe <- 
##   recipe(formula = vote_average ~ ., data = horror_train) 
## 
## ranger_spec <- 
##   rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
##   set_mode("classification") %>% 
##   set_engine("ranger") 
## 
## ranger_workflow <- 
##   workflow() %>% 
##   add_recipe(ranger_recipe) %>% 
##   add_model(ranger_spec) 
## 
## set.seed(30117)
## ranger_tune <-
##   tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)
ranger_recipe <- 
  recipe(formula = vote_average ~ ., data = horror_train)
  

ranger_spec <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_mode("regression") %>% 
  set_engine("ranger") 

ranger_workflow <- 
  workflow() %>% 
  add_recipe(ranger_recipe) %>% 
  add_model(ranger_spec) 

set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <-
  tune_grid(ranger_workflow, 
            resamples = horror_folds, 
            grid = 11)
## i Creating pre-processing data to finalize unknown parameter: mtry

Explore Results

show_best(ranger_tune, metric ="rmse")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     1    34 rmse    standard   0.335    25 0.00501 Preprocessor1_Model11
## 2     2    32 rmse    standard   0.338    25 0.00719 Preprocessor1_Model07
## 3     4    38 rmse    standard   0.340    25 0.00857 Preprocessor1_Model04
## 4     1    13 rmse    standard   0.340    25 0.00577 Preprocessor1_Model09
## 5     3    28 rmse    standard   0.345    25 0.00816 Preprocessor1_Model02
show_best(ranger_tune, metric ="rsq")
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     4    38 rsq     standard   0.220    25  0.0241 Preprocessor1_Model04
## 2     2    32 rsq     standard   0.200    25  0.0238 Preprocessor1_Model07
## 3     3    28 rsq     standard   0.196    25  0.0248 Preprocessor1_Model02
## 4     1    34 rsq     standard   0.193    25  0.0223 Preprocessor1_Model11
## 5     3    24 rsq     standard   0.186    25  0.0250 Preprocessor1_Model03
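
Before finalizing, it can also help to see how performance varies across the candidate values of mtry and min_n; autoplot() on the tuning results (not shown in the original analysis) draws that in one line.

autoplot(ranger_tune)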
final_rf <- ranger_workflow %>%
    finalize_workflow(select_best(ranger_tune, metric = "rmse"))
final_rf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = 1
##   trees = 1000
##   min_n = 34
## 
## Computational engine: ranger
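
If variable importance is also of interest, one option is to refit the finalized model with permutation importance turned on. This sketch is an addition to the original analysis and assumes the vip package is installed.

library(vip)

imp_spec <- ranger_spec %>%
  finalize_model(select_best(ranger_tune, metric = "rmse")) %>%
  # ask ranger to compute permutation importance while fitting
  set_engine("ranger", importance = "permutation")

workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(imp_spec) %>%
  fit(horror_train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")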
horror_fit <- last_fit(final_rf, horror_split)
horror_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits          id               .metrics .notes   .predictions .workflow 
##   <list>          <chr>            <list>   <list>   <list>       <list>    
## 1 <split [73/27]> train/test split <tibble> <tibble> <tibble>     <workflow>
collect_metrics(horror_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.318 Preprocessor1_Model1
## 2 rsq     standard       0.378 Preprocessor1_Model1
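
Finally, collect_predictions() returns the held-out test-set predictions from last_fit(), which makes a quick predicted-versus-observed plot possible; this plot is an addition to the original analysis. Both axes are on the log10(x + 1) scale of the transformed outcome.

collect_predictions(horror_fit) %>%
  ggplot(aes(vote_average, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5) +
  coord_fixed() +
  labs(x = "Observed vote average", y = "Predicted vote average")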