library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)
library(explore)
library(spacyr)
library(textrecipes)

## Loading required package: recipes
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()         masks purrr::discard()
## ✖ dplyr::filter()           masks stats::filter()
## ✖ recipes::fixed()          masks stringr::fixed()
## ✖ parsnip::get_dependency() masks spacyr::get_dependency()
## ✖ dplyr::lag()              masks stats::lag()
## ✖ yardstick::spec()         masks readr::spec()
## ✖ recipes::step()           masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org

library(finetune)
library(stopwords)

# Import Data

# Read the TidyTuesday (2022-11-01) horror movies CSV directly from GitHub.
horror_movies <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv"
)

## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Build the modelling table from the raw horror-movie records:
#   * keep released movies that have an overview and at least one vote
#   * put the rating on the log1p scale
#   * expand the comma-separated genre list so each row is one
#     (movie, genre) pair -- a movie with several genres appears several times
#   * keep only the columns used downstream
data <- horror_movies %>%
  filter(!is.na(overview), vote_count != 0, status == "Released") %>%
  mutate(vote_average = log1p(vote_average)) %>%
  separate_rows(genre_names, sep = ", ") %>%
  select(
    id, vote_average, genre_names,
    runtime, original_language, original_title
  )

# Down-sample to 1,000 rows. The RNG is seeded first so the sample -- and
# everything computed from it afterwards -- is reproducible between runs.
# slice_sample() is the current replacement for the superseded sample_n().
set.seed(2022)
data <- data %>% slice_sample(n = 1000)

# Explore Data

# Overview of the sampled data: structure first, then summaries.
# Column names, types and leading values.
data %>% glimpse()
# Per-column summary statistics from skimr.
data %>% skimr::skim()
# Pass only the id column to explore().
data %>% select(id) %>% explore()
# Description of every column, then of the genre_names categories.
data %>% describe_all()
data %>% describe_cat(genre_names)
# Every column except id explored against the target vote_average.
data %>% select(-id) %>% explore_all(target = vote_average)

# Distribution of the log1p-transformed rating. The bin count is stated
# explicitly (30 is the value stat_bin() falls back to), so ggplot2 no longer
# emits its "Pick better value with `binwidth`" message.
data %>%
  ggplot(aes(vote_average)) +
  geom_histogram(bins = 30)

# Rows per title, most frequent first.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
data %>% count(original_title, sort = TRUE)

## # A tibble: 978 × 2
##    original_title                                              n
##    <chr>                                                   <int>
##  1 Behind the Random Denominator                               2
##  2 Bird Of Prey                                                2
##  3 De Kuthoer                                                  2
##  4 Dead Shadows                                                2
##  5 Evilenko                                                    2
##  6 Firestarter                                                 2
##  7 Hi-8 (Horror Independent 8)                                 2
##  8 La Leyenda de la Llorona                                    2
##  9 Secret Santa                                                2
## 10 Sin and Salvation: The Comic Book Origin of Ghost Rider     2
## # ℹ 968 more rows

# Row count per title against the title's mean rating, drawn as text labels.
# The dotted horizontal line is the mean rating over all rows of `data`.
data %>%
  group_by(original_title) %>%
  summarise(
    n = n(),
    avg_vote_average = mean(vote_average),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = n, y = avg_vote_average)) +
  # geom_point() +
  geom_text(aes(label = original_title), check_overlap = TRUE) +
  geom_hline(
    yintercept = mean(data$vote_average),
    color = "darkgray",
    linetype = "dotted",
    linewidth = 2
  ) +
  scale_x_log10()

# Rating by title; points are jittered and translucent.
ggplot(data, aes(x = original_title, y = vote_average)) +
  geom_jitter(alpha = 0.2)

# Rating by runtime; points are jittered and translucent.
ggplot(data, aes(x = runtime, y = vote_average)) +
  geom_jitter(alpha = 0.3)

# Build a Model

# Train/test split followed by cross-validation folds on the training part.
# The proportion and fold count below are rsample's defaults, written out.
# NOTE(review): `data` holds one row per (movie, genre) pair, so rows sharing
# an `id` may fall on both sides of the split -- consider a grouped split.
set.seed(123)
data_split <- initial_split(data, prop = 3 / 4)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- rsample::vfold_cv(data_train, v = 10)
data_folds

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [675/75]> Fold01
##  2 <split [675/75]> Fold02
##  3 <split [675/75]> Fold03
##  4 <split [675/75]> Fold04
##  5 <split [675/75]> Fold05
##  6 <split [675/75]> Fold06
##  7 <split [675/75]> Fold07
##  8 <split [675/75]> Fold08
##  9 <split [675/75]> Fold09
## 10 <split [675/75]> Fold10

library(usemodels)
# Print usemodels' xgboost template for this formula; the template is edited
# by hand below (recipe steps, regression mode, resamples, grid).
use_xgboost(formula = vote_average ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = vote_average ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(12447)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Preprocessing recipe for the boosted-tree model.
xgboost_recipe <-
  recipe(formula = vote_average ~ ., data = data_train) %>%
  # Keep `id` in the data but take it out of the predictor set.
  update_role(id, new_role = "id") %>%
  # Title text: tokenize, drop stop words, keep at most 100 tokens, then tf-idf.
  step_tokenize(original_title) %>%
  step_stopwords(original_title) %>%
  step_tokenfilter(original_title, max_tokens = 100) %>%
  step_tfidf(original_title) %>%
  # Categoricals: pool infrequent languages, then one-hot encode.
  step_other(original_language) %>%
  step_dummy(genre_names, original_language, one_hot = TRUE) %>%
  # Centre and scale runtime.
  step_normalize(runtime)

# Inspect the processed training set. bake(new_data = NULL) returns the
# prepped training data and replaces the superseded juice().
xgboost_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()

## Rows: 750
## Columns: 124
## $ id                                <dbl> 681996, 302241, 872325, 1013137, 667…
## $ runtime                           <dbl> 0.09935469, 0.83941104, 0.60973838, …
## $ vote_average                      <dbl> 1.609438, 2.079442, 1.945910, 2.3978…
## $ tfidf_original_title_1            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_12           <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_13           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_2            <dbl> 1.144662, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_3            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_5            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_8            <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_american     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_angel        <dbl> 1.746366, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_bad          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_behind       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_beneath      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_better       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_beyond       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_bigfoot      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_bird         <dbl> 0.000000, 0.000000, 0.000000, 5.9295…
## $ tfidf_original_title_bitch        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_bite         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_blood        <dbl> 1.561151, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_bloodlust    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_bloody       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_brain        <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_bubba        <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_cabin        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_chapter      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_come         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_creek        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_crimson      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_curse        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_dark         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_darkness     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_day          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_de           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_dead         <dbl> 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_original_title_deadly       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_death        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_demon        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_denominator  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_der          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_des          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_di           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_día          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_doo          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_door         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_dracula      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_dreams       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_el           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_end          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_eve          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_evil         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_eyes         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_farm         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_fear         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_firestarter  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_flats        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_frankenstein <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_get          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_ghost        <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_haunting     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_hell         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_horror       <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_house        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_ii           <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_il           <dbl> 0.00000, 0.00000, 0.00000, 0.00000, …
## $ tfidf_original_title_kill         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_killer       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_king         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_la           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_lake         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_last         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_le           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_legend       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_living       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_los          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_man          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_massacre     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_movie        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_new          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_night        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_one          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_part         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_run          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_s            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_screams      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_shadows      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_terror       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_v            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_vampire      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_vs           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_wait         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_wicked       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_woods        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_zombie       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `tfidf_original_title_ผี`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_の           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_人           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_劇場         <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_恐怖         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_original_title_版           <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_original_title_鬼           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Action                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Adventure             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Animation             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ genre_names_Comedy                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Crime                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ genre_names_Documentary           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Drama                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Family                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Fantasy               <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ genre_names_History               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Horror                <dbl> 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ genre_names_Music                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Mystery               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Romance               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Science.Fiction       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Thriller              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ genre_names_TV.Movie              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_War                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ genre_names_Western               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ original_language_en              <dbl> 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, …
## $ original_language_other           <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, …

# Boosted-tree regression on the xgboost engine; all six listed
# hyperparameters are left as tune() placeholders.
xgboost_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    sample_size = tune()
  ) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Bundle the recipe and the model specification into a single workflow.
xgboost_workflow <- workflow(
  preprocessor = xgboost_recipe,
  spec = xgboost_spec
)

# Evaluate 10 candidate hyperparameter sets on the cross-validation folds,
# with a parallel backend registered before tuning starts.
set.seed(56024)
doParallel::registerDoParallel()

xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_folds, grid = 10)

# Explore Results
# Best candidates ranked by RMSE.
xgboost_tune %>% show_best(metric = "rmse")

## # A tibble: 5 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1   581    15          2    0.0253        1.98e- 4       0.738 rmse   
## 2  1246     9          7    0.00902       7.48e-10       0.154 rmse   
## 3   861    25          9    0.0174        6.69e- 8       0.439 rmse   
## 4  1942    20         12    0.00384       1.60e+ 0       0.237 rmse   
## 5   357    40          3    0.0803        1.73e- 9       0.882 rmse   
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>

# Plot the tuning results across the candidate hyperparameter values.
xgboost_tune %>% autoplot()

# Plug the lowest-RMSE hyperparameters into the workflow. Despite its name,
# `final_rf` holds the finalized xgboost workflow. `metric` is passed by name:
# newer tune releases place it after `...`, so a positional "rmse" would no
# longer be matched to it.
final_rf <- xgboost_workflow %>%
  finalize_workflow(select_best(xgboost_tune, metric = "rmse"))

# Fit on the training portion of the split and evaluate on the test portion.
data_fit <- last_fit(final_rf, data_split)
data_fit

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits            id               .metrics .notes   .predictions .workflow 
##   <list>            <chr>            <list>   <list>   <list>       <list>    
## 1 <split [750/250]> train/test split <tibble> <tibble> <tibble>     <workflow>

# Evaluate Model

# Test-set metrics from the final fit.
data_fit %>% collect_metrics()

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      0.319  Preprocessor1_Model1
## 2 rsq     standard      0.0404 Preprocessor1_Model1

# Test-set predictions alongside the observed values.
data_fit %>% collect_predictions()

## # A tibble: 250 × 5
##    id               .pred  .row vote_average .config             
##    <chr>            <dbl> <int>        <dbl> <chr>               
##  1 train/test split  1.80     1         1.61 Preprocessor1_Model1
##  2 train/test split  1.78     3         1.79 Preprocessor1_Model1
##  3 train/test split  1.90     7         1.41 Preprocessor1_Model1
##  4 train/test split  1.78     9         2.03 Preprocessor1_Model1
##  5 train/test split  1.92    12         1.76 Preprocessor1_Model1
##  6 train/test split  1.72    15         1.79 Preprocessor1_Model1
##  7 train/test split  1.93    18         1.92 Preprocessor1_Model1
##  8 train/test split  1.78    21         2.09 Preprocessor1_Model1
##  9 train/test split  2.08    22         2.08 Preprocessor1_Model1
## 10 train/test split  1.76    25         1.79 Preprocessor1_Model1
## # ℹ 240 more rows

# Predicted against observed rating on the test set; the dashed line is
# geom_abline()'s default identity line (slope 1, intercept 0).
# `color` is used instead of `fill`: the default point shape has no fill, so
# `fill = "midnightblue"` left the points black.
collect_predictions(data_fit) %>%
  ggplot(aes(vote_average, .pred)) +
  geom_point(alpha = 0.5, color = "midnightblue") +
  geom_abline(lty = 2, color = "gray50")

# Pull the fitted workflow out of the last_fit() result and score the first
# row of the test set with it.
predict(extract_workflow(data_fit), new_data = data_test[1, ])

## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  1.80

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

# Model spec with the lowest-RMSE hyperparameters, used for variable
# importance. Two fixes over the earlier version:
#   * the metric is named explicitly, so select_best() no longer warns that
#     no `metric` was given;
#   * the `importance = "permutation"` engine argument is removed -- xgboost
#     reported it as an unused parameter, so it never affected the fit. The
#     xgboost engine set on `xgboost_spec` is carried over unchanged.
imp_spec <- xgboost_spec %>%
  tune::finalize_model(tune::select_best(xgboost_tune, metric = "rmse"))

# Refit on the training data and plot importance from the fitted model.
workflows::workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(imp_spec) %>%
  fit(data_train) %>%
  workflows::extract_fit_parsnip() %>%
  vip()

Apply Data 3

Colin Tracy

2023-09-27

Import Data

Explore Data

Build a Model

Evaluate Model