library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
horror_movies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv')
## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): original_title, title, original_language, overview, tagline, post...
## dbl (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl (1): adult
## date (1): release_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
horror_movies %>%
select(id, popularity:runtime) %>%
pivot_longer(budget:runtime, names_to = "dim") %>%
ggplot(aes(value, vote_average, color = dim)) +
geom_point(alpha = 0.4, show.legend = FALSE) +
scale_y_log10() +
facet_wrap(~ dim, scales = "free_x") +
labs(x = NULL)
## Warning: Transformation introduced infinite values in continuous y-axis

horror_df <- horror_movies %>%
select(title, popularity, vote_average, budget, revenue, runtime) %>%
mutate(vote_average = log10(vote_average)) %>%
mutate_if(is.character, factor)
Build a Model
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
set.seed(123)
horror_split <- initial_split(horror_df, strata = vote_average)
horror_train <- training(horror_split)
horror_test <- testing(horror_split)
set.seed(234)
horror_folds <- bootstraps(horror_train, strata = vote_average)
horror_folds
## # Bootstrap sampling using stratification
## # A tibble: 25 × 2
## splits id
## <list> <chr>
## 1 <split [24404/8966]> Bootstrap01
## 2 <split [24404/8967]> Bootstrap02
## 3 <split [24404/8909]> Bootstrap03
## 4 <split [24404/9005]> Bootstrap04
## 5 <split [24404/9101]> Bootstrap05
## 6 <split [24404/8976]> Bootstrap06
## 7 <split [24404/8961]> Bootstrap07
## 8 <split [24404/8998]> Bootstrap08
## 9 <split [24404/8952]> Bootstrap09
## 10 <split [24404/8981]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)
use_ranger(vote_average ~ ., data = horror_train)
## ranger_recipe <-
## recipe(formula = vote_average ~ ., data = horror_train)
##
## ranger_spec <-
## rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
## set_mode("classification") %>%
## set_engine("ranger")
##
## ranger_workflow <-
## workflow() %>%
## add_recipe(ranger_recipe) %>%
## add_model(ranger_spec)
##
## set.seed(20501)
## ranger_tune <-
## tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)
ranger_recipe <-
recipe(formula = vote_average ~ ., data = horror_train)
ranger_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_mode("regression") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
set.seed(8577)
doParallel::registerDoParallel()
ranger_tune <-
tune_grid(ranger_workflow,
resamples = horror_folds,
grid = 11)
## i Creating pre-processing data to finalize unknown parameter: mtry
EXPLORE RESULTS
show_best(ranger_tune, metric ="rmse")
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 3 18 rmse standard NaN 0 NA Preprocessor1_Model01
## 2 4 28 rmse standard NaN 0 NA Preprocessor1_Model02
## 3 4 24 rmse standard NaN 0 NA Preprocessor1_Model03
## 4 5 38 rmse standard NaN 0 NA Preprocessor1_Model04
## 5 4 10 rmse standard NaN 0 NA Preprocessor1_Model05
show_best(ranger_tune, metric ="rsq")
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 3 18 rsq standard NaN 0 NA Preprocessor1_Model01
## 2 4 28 rsq standard NaN 0 NA Preprocessor1_Model02
## 3 4 24 rsq standard NaN 0 NA Preprocessor1_Model03
## 4 5 38 rsq standard NaN 0 NA Preprocessor1_Model04
## 5 4 10 rsq standard NaN 0 NA Preprocessor1_Model05
final_rf <- ranger_workflow %>%
finalize_workflow(select_best(ranger_tune))
## Warning: No value of `metric` was given; metric 'rmse' will be used.
final_rf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 3
## trees = 1000
## min_n = 18
##
## Computational engine: ranger
horror_fit <- last_fit(final_rf, horror_split)
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
##
There were issues with some computations A: x1
There were issues with some computations A: x1
horror_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [24404/8136]> train/test spl… <tibble> <tibble> <tibble> <workflow>
##
## There were issues with some computations:
##
## - Warning(s) x1: A correlation computation is required, but `estimate` is constant...
##
## Run `show_notes(.Last.tune.result)` for more information.
collect_metrics(horror_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard NaN Preprocessor1_Model1
## 2 rsq standard NA Preprocessor1_Model1