#Import Data:

horror <- horror_movies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv')
## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(horror)
Data summary
Name horror
Number of rows 32540
Number of columns 20
_______________________
Column type frequency:
character 10
Date 1
logical 1
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
original_title 0 1.00 1 191 0 30296 0
title 0 1.00 1 191 0 29563 0
original_language 0 1.00 2 2 0 97 0
overview 1286 0.96 1 1000 0 31020 0
tagline 19835 0.39 1 237 0 12513 0
poster_path 4474 0.86 30 32 0 28048 0
status 0 1.00 7 15 0 4 0
backdrop_path 18995 0.42 29 32 0 13536 0
genre_names 0 1.00 6 144 0 772 0
collection_name 30234 0.07 4 56 0 815 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
release_date 0 1 1950-01-01 2022-12-31 2012-12-09 10999

Variable type: logical

skim_variable n_missing complete_rate mean count
adult 0 1 0 FAL: 32540

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 445910.83 305744.67 17 146494.8 426521.00 707534.00 1033095.00 ▇▆▆▅▅
popularity 0 1.00 4.01 37.51 0 0.6 0.84 2.24 5088.58 ▇▁▁▁▁
vote_count 0 1.00 62.69 420.89 0 0.0 2.00 11.00 16900.00 ▇▁▁▁▁
vote_average 0 1.00 3.34 2.88 0 0.0 4.00 5.70 10.00 ▇▂▆▃▁
budget 0 1.00 543126.59 4542667.81 0 0.0 0.00 0.00 200000000.00 ▇▁▁▁▁
revenue 0 1.00 1349746.73 14430479.15 0 0.0 0.00 0.00 701842551.00 ▇▁▁▁▁
runtime 0 1.00 62.14 41.00 0 14.0 80.00 91.00 683.00 ▇▁▁▁▁
collection 30234 0.07 481534.88 324498.16 656 155421.0 471259.00 759067.25 1033032.00 ▇▅▅▅▅
data <- horror %>%
    
    # Treat Missing Values
    select(-tagline, -collection_name, -collection, -original_title, -release_date, -title, -poster_path, -backdrop_path, -status, -adult) %>%
    filter(budget != 0) %>%
    filter(revenue != 0) %>%
    na.omit() %>%
    
    # Log transform variables with pos-skewed distribution
    mutate(vote_average = log(vote_average + 1)) %>%
    
    separate_rows(genre_names) %>%
    
    mutate(across(where(is.logical), as.factor))

Goal: Build a regression model to predict average movie rating (vote_average). Using the horror_movies dataset.

#Explore Data:

data %>%
    ggplot(aes(vote_average)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Identify Good Predictors.

Budget

data %>%
    ggplot(aes(vote_average, budget)) +
    scale_y_log10() +
    geom_point()

data %>%
    ggplot(aes(vote_average, as.factor(genre_names))) +
    geom_boxplot()

Correlation Plot

# Step 1: Prepare Data
data_binarize_tbl <- data %>%
    select(-id, -overview) %>%
    binarize()
data_binarize_tbl %>%
    glimpse()
## Rows: 2,959
## Columns: 43
## $ original_language__en                           <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ original_language__es                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__fr                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__hi                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__ja                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__ko                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `original_language__-OTHER`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `popularity__-Inf_8.358`                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__8.358_15.602                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__15.602_29.402                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__29.402_Inf                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `vote_count__-Inf_149`                          <dbl> 1, 1, 1, 0, 0, 1, 1, 1…
## $ vote_count__149_567                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__567_1647                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__1647_Inf                            <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `vote_average__-Inf_1.85629799036563`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.85629799036563_1.96009478404727 <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ vote_average__1.96009478404727_2.02814824729229 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__2.02814824729229_Inf              <dbl> 1, 1, 1, 1, 1, 0, 0, 0…
## $ `budget__-Inf_1550000`                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `budget__1550000_7e+06`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `budget__7e+06_1.9e+07`                         <dbl> 1, 1, 1, 1, 1, 0, 0, 0…
## $ `budget__1.9e+07_Inf`                           <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ `revenue__-Inf_1189315`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ revenue__1189315_12294931                       <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ revenue__12294931_48713152.5                    <dbl> 1, 1, 1, 0, 0, 0, 0, 0…
## $ revenue__48713152.5_Inf                         <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `runtime__-Inf_89`                              <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ runtime__89_96                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__96_105                                 <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ runtime__105_Inf                                <dbl> 1, 1, 1, 0, 0, 0, 0, 0…
## $ genre_names__Action                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Adventure                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Comedy                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Crime                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Drama                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Fantasy                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Fiction                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Horror                             <dbl> 1, 0, 0, 1, 0, 1, 0, 0…
## $ genre_names__Mystery                            <dbl> 0, 1, 0, 0, 0, 0, 1, 0…
## $ genre_names__Science                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Thriller                           <dbl> 0, 0, 1, 0, 1, 0, 0, 1…
## $ `genre_names__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
# Step 2: Correlate
data_corr_tbl <- data_binarize_tbl %>%
    correlate(vote_average__2.02814824729229_Inf)

# Step 3: Plot
data_corr_tbl %>%
    plot_correlation_funnel()
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

# data <- sample_n(data, 100)

# Split into train and test dataset
set.seed(123)
data_split <- initial_split(data, strata = vote_average)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training dataset for cross-validation
set.seed(234)
data_cv <- vfold_cv(data_train, strata = vote_average)
data_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [1994/224]> Fold01
##  2 <split [1995/223]> Fold02
##  3 <split [1996/222]> Fold03
##  4 <split [1996/222]> Fold04
##  5 <split [1996/222]> Fold05
##  6 <split [1996/222]> Fold06
##  7 <split [1996/222]> Fold07
##  8 <split [1997/221]> Fold08
##  9 <split [1998/220]> Fold09
## 10 <split [1998/220]> Fold10
library(usemodels)
usemodels::use_xgboost(vote_average ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = vote_average ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(24411)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
#specify recipe
#Specifiy Recipie
xgboost_recipe <-
    recipe(formula = vote_average ~ ., data = data_train) %>%
    update_role(id, new_role = "id") %>%
    step_tokenize(overview) %>%
    step_tokenfilter(overview, max_tokens = 100) %>%
    step_tfidf(overview) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_YeoJohnson(runtime)

xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 2,218
## Columns: 154
## $ id                        <dbl> 25853, 55341, 55341, 11249, 11249, 440, 440,…
## $ popularity                <dbl> 378.390, 246.189, 246.189, 102.731, 102.731,…
## $ vote_count                <dbl> 220, 692, 692, 1252, 1252, 2334, 2334, 2334,…
## $ budget                    <dbl> 250000, 6200000, 6200000, 12000000, 12000000…
## $ revenue                   <dbl> 144008, 3600000, 3600000, 24829644, 24829644…
## $ runtime                   <dbl> 1979.365, 1945.252, 1945.252, 1525.409, 1525…
## $ vote_average              <dbl> 1.824549, 1.774952, 1.774952, 1.856298, 1.85…
## $ tfidf_overview_a          <dbl> 0.00000000, 0.03113613, 0.03113613, 0.000000…
## $ tfidf_overview_about      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_after      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_against    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_all        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_an         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_and        <dbl> 0.05096523, 0.00000000, 0.00000000, 0.065526…
## $ tfidf_overview_are        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.128197…
## $ tfidf_overview_as         <dbl> 0.00000000, 0.14246335, 0.14246335, 0.122111…
## $ tfidf_overview_at         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_back       <dbl> 0.00000000, 0.12307855, 0.12307855, 0.210991…
## $ tfidf_overview_be         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_becomes    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_been       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_before     <dbl> 0.0000000, 0.1257618, 0.1257618, 0.0000000, …
## $ tfidf_overview_begins     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_but        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_by         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.115218…
## $ tfidf_overview_can        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_city       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_dark       <dbl> 0.0000000, 0.1301812, 0.1301812, 0.0000000, …
## $ tfidf_overview_dead       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_deadly     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_death      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_discovers  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_evil       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_family     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_film       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_find       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_finds      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_for        <dbl> 0.00000000, 0.06240096, 0.06240096, 0.000000…
## $ tfidf_overview_friends    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_from       <dbl> 0.09433096, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_get        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_girl       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_group      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_has        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_have       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_he         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_her        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_him        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_his        <dbl> 0.07642058, 0.00000000, 0.00000000, 0.196510…
## $ tfidf_overview_home       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_house      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_overview_in         <dbl> 0.11721274, 0.04395478, 0.04395478, 0.000000…
## $ tfidf_overview_into       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_is         <dbl> 0.06859469, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_it         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_its        <dbl> 0.0000000, 0.2580545, 0.2580545, 0.0000000, …
## $ tfidf_overview_killer     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_life       <dbl> 0.14189981, 0.00000000, 0.00000000, 0.182442…
## $ tfidf_overview_lives      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_man        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_more       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_must       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_mysterious <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_new        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_night      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_not        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_now        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_of         <dbl> 0.14568390, 0.07284195, 0.07284195, 0.062435…
## $ tfidf_overview_old        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_on         <dbl> 0.17260504, 0.12945378, 0.12945378, 0.000000…
## $ tfidf_overview_once       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_one        <dbl> 0.12506056, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_only       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_out        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_over       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_people     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_secret     <dbl> 0.0000000, 0.1286503, 0.1286503, 0.0000000, …
## $ tfidf_overview_she        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_small      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_soon       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_that       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_the        <dbl> 0.04427295, 0.19922825, 0.19922825, 0.056922…
## $ tfidf_overview_their      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.112500…
## $ tfidf_overview_them       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_themselves <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_they       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_this       <dbl> 0.13574316, 0.00000000, 0.00000000, 0.174526…
## $ tfidf_overview_time       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.2167946, …
## $ tfidf_overview_to         <dbl> 0.04750568, 0.03562926, 0.03562926, 0.061078…
## $ tfidf_overview_town       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_two        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_up         <dbl> 0.00000000, 0.09987816, 0.09987816, 0.000000…
## $ tfidf_overview_was        <dbl> 0.1628263, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_what       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_when       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_where      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_which      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_while      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_who        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_wife       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_will       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_with       <dbl> 0.00000000, 0.06030985, 0.06030985, 0.000000…
## $ tfidf_overview_woman      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_world      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_year       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_years      <dbl> 0.14834278, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_young      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ original_language_ar      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_bn      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_cn      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_de      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_en      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ original_language_es      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_fa      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_fi      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_fr      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_hi      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_it      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ja      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_kn      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ko      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ml      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ms      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_nb      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_no      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_pl      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_pt      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ru      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_sv      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ta      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_te      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_th      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_tl      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_tr      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Action        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Adventure     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Animation     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Comedy        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Crime         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Documentary   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Drama         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Family        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Fantasy       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Fiction       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ genre_names_History       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Horror        <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,…
## $ genre_names_Movie         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Music         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Mystery       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Romance       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Science       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Thriller      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,…
## $ genre_names_War           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Western       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
#specify model
xgboost_spec <-
    boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
    set_mode("regression") %>%
    set_engine("xgboost")

# Combine recipe and model using workflow
xgboost_workflow <-
    workflow() %>%
    add_recipe(xgboost_recipe) %>%
    add_model(xgboost_spec)

# Tune hyperparameters
set.seed(344)
xgboost_tune <-
    tune_grid(xgboost_workflow,
              resamples = data_cv,
              grid = 5)
## → A | warning: ! There are new levels in a factor: `pt` and `nb`.
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x2

There were issues with some computations   A: x3

There were issues with some computations   A: x4

There were issues with some computations   A: x5

                                                 
→ B | warning: ! There are new levels in a factor: `Documentary`.
## There were issues with some computations   A: x5

There were issues with some computations   A: x5   B: x1

There were issues with some computations   A: x5   B: x2

There were issues with some computations   A: x5   B: x3

There were issues with some computations   A: x5   B: x4

There were issues with some computations   A: x5   B: x5

                                                         
→ C | warning: ! There are new levels in a factor: `Movie`.
## There were issues with some computations   A: x5   B: x5

There were issues with some computations   A: x5   B: x5   C: x1

There were issues with some computations   A: x5   B: x5   C: x2

There were issues with some computations   A: x5   B: x5   C: x3

There were issues with some computations   A: x5   B: x5   C: x4

There were issues with some computations   A: x5   B: x5   C: x5

                                                                 
→ D | warning: ! There are new levels in a factor: `bn`., ! There are new levels in a factor: `War`.
## There were issues with some computations   A: x5   B: x5   C: x5

There were issues with some computations   A: x5   B: x5   C: x5   D: x1

There were issues with some computations   A: x5   B: x5   C: x5   D: x2

There were issues with some computations   A: x5   B: x5   C: x5   D: x3

There were issues with some computations   A: x5   B: x5   C: x5   D: x4

There were issues with some computations   A: x5   B: x5   C: x5   D: x5

                                                                         
→ E | warning: ! There are new levels in a factor: `ar` and `ms`., ! There are new levels in a factor: `Western`.
## There were issues with some computations   A: x5   B: x5   C: x5   D: x5

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x1

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x2

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x3

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x4

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x5

                                                                                 
→ F | warning: ! There are new levels in a factor: `cn`.
## There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x5

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

#Conclusion

The first thing that I did was make sure that all missing values were elimin ated from the data set and I also treated values with the na.omit function to lighten the overall data points in the set.

I also added the tokenizefilter argument under therecipe step and I set that as 100.

These are all things that I have been working on with Proffesor Lee and fixing as I went along.