#Import Data:

horror <- horror_movies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv')

## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): original_title, title, original_language, overview, tagline, post...
## dbl   (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl   (1): adult
## date  (1): release_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(horror)

Data summary
Name	horror
Number of rows	32540
Number of columns	20
_______________________
Column type frequency:
character	10
Date	1
logical	1
numeric	8
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
original_title	0	1.00	1	191	30296
title	0	1.00	1	191	29563
original_language	0	1.00	2	2	97
overview	1286	0.96	1	1000	31020
tagline	19835	0.39	1	237	12513
poster_path	4474	0.86	30	32	28048
status	0	1.00	7	15	4
backdrop_path	18995	0.42	29	32	13536
genre_names	0	1.00	6	144	772
collection_name	30234	0.07	4	56	815

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
release_date	0	1	1950-01-01	2022-12-31	2012-12-09	10999

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
adult	0	1	0	FAL: 32540

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1.00	445910.83	305744.67	17	146494.8	426521.00	707534.00	1033095.00	▇▆▆▅▅
popularity	0	1.00	4.01	37.51	0	0.6	0.84	2.24	5088.58	▇▁▁▁▁
vote_count	0	1.00	62.69	420.89	0	0.0	2.00	11.00	16900.00	▇▁▁▁▁
vote_average	0	1.00	3.34	2.88	0	0.0	4.00	5.70	10.00	▇▂▆▃▁
budget	0	1.00	543126.59	4542667.81	0	0.0	0.00	0.00	200000000.00	▇▁▁▁▁
revenue	0	1.00	1349746.73	14430479.15	0	0.0	0.00	0.00	701842551.00	▇▁▁▁▁
runtime	0	1.00	62.14	41.00	0	14.0	80.00	91.00	683.00	▇▁▁▁▁
collection	30234	0.07	481534.88	324498.16	656	155421.0	471259.00	759067.25	1033032.00	▇▅▅▅▅

data <- horror %>%
    
    # Treat Missing Values
    select(-tagline, -collection_name, -collection, -original_title, -release_date, -title, -poster_path, -backdrop_path, -status, -adult) %>%
    filter(budget != 0) %>%
    filter(revenue != 0) %>%
    na.omit() %>%
    
    # Log transform variables with pos-skewed distribution
    mutate(vote_average = log(vote_average + 1)) %>%
    
    separate_rows(genre_names) %>%
    
    mutate(across(where(is.logical), as.factor))

Goal: Build a regression model to predict average movie rating (vote_average). Using the horror_movies dataset.

#Explore Data:

data %>%
    ggplot(aes(vote_average)) +
    geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Identify Good Predictors.

Budget

data %>%
    ggplot(aes(vote_average, budget)) +
    scale_y_log10() +
    geom_point()

data %>%
    ggplot(aes(vote_average, as.factor(genre_names))) +
    geom_boxplot()

Correlation Plot

# Step 1: Prepare Data
data_binarize_tbl <- data %>%
    select(-id, -overview) %>%
    binarize()
data_binarize_tbl %>%
    glimpse()

## Rows: 2,959
## Columns: 43
## $ original_language__en                           <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ original_language__es                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__fr                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__hi                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__ja                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ original_language__ko                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `original_language__-OTHER`                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `popularity__-Inf_8.358`                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__8.358_15.602                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__15.602_29.402                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ popularity__29.402_Inf                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `vote_count__-Inf_149`                          <dbl> 1, 1, 1, 0, 0, 1, 1, 1…
## $ vote_count__149_567                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__567_1647                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_count__1647_Inf                            <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `vote_average__-Inf_1.85629799036563`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.85629799036563_1.96009478404727 <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ vote_average__1.96009478404727_2.02814824729229 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__2.02814824729229_Inf              <dbl> 1, 1, 1, 1, 1, 0, 0, 0…
## $ `budget__-Inf_1550000`                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `budget__1550000_7e+06`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ `budget__7e+06_1.9e+07`                         <dbl> 1, 1, 1, 1, 1, 0, 0, 0…
## $ `budget__1.9e+07_Inf`                           <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ `revenue__-Inf_1189315`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ revenue__1189315_12294931                       <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ revenue__12294931_48713152.5                    <dbl> 1, 1, 1, 0, 0, 0, 0, 0…
## $ revenue__48713152.5_Inf                         <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ `runtime__-Inf_89`                              <dbl> 0, 0, 0, 0, 0, 1, 1, 1…
## $ runtime__89_96                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__96_105                                 <dbl> 0, 0, 0, 1, 1, 0, 0, 0…
## $ runtime__105_Inf                                <dbl> 1, 1, 1, 0, 0, 0, 0, 0…
## $ genre_names__Action                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Adventure                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Comedy                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Crime                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Drama                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Fantasy                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Fiction                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Horror                             <dbl> 1, 0, 0, 1, 0, 1, 0, 0…
## $ genre_names__Mystery                            <dbl> 0, 1, 0, 0, 0, 0, 1, 0…
## $ genre_names__Science                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ genre_names__Thriller                           <dbl> 0, 0, 1, 0, 1, 0, 0, 1…
## $ `genre_names__-OTHER`                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…

# Step 2: Correlate
data_corr_tbl <- data_binarize_tbl %>%
    correlate(vote_average__2.02814824729229_Inf)

# Step 3: Plot
data_corr_tbl %>%
    plot_correlation_funnel()

## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

# data <- sample_n(data, 100)

# Split into train and test dataset
set.seed(123)
data_split <- initial_split(data, strata = vote_average)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training dataset for cross-validation
set.seed(234)
data_cv <- vfold_cv(data_train, strata = vote_average)
data_cv

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [1994/224]> Fold01
##  2 <split [1995/223]> Fold02
##  3 <split [1996/222]> Fold03
##  4 <split [1996/222]> Fold04
##  5 <split [1996/222]> Fold05
##  6 <split [1996/222]> Fold06
##  7 <split [1996/222]> Fold07
##  8 <split [1997/221]> Fold08
##  9 <split [1998/220]> Fold09
## 10 <split [1998/220]> Fold10

library(usemodels)
usemodels::use_xgboost(vote_average ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = vote_average ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(24411)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

#specify recipe
#Specifiy Recipie
xgboost_recipe <-
    recipe(formula = vote_average ~ ., data = data_train) %>%
    update_role(id, new_role = "id") %>%
    step_tokenize(overview) %>%
    step_tokenfilter(overview, max_tokens = 100) %>%
    step_tfidf(overview) %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_YeoJohnson(runtime)

xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 2,218
## Columns: 154
## $ id                        <dbl> 25853, 55341, 55341, 11249, 11249, 440, 440,…
## $ popularity                <dbl> 378.390, 246.189, 246.189, 102.731, 102.731,…
## $ vote_count                <dbl> 220, 692, 692, 1252, 1252, 2334, 2334, 2334,…
## $ budget                    <dbl> 250000, 6200000, 6200000, 12000000, 12000000…
## $ revenue                   <dbl> 144008, 3600000, 3600000, 24829644, 24829644…
## $ runtime                   <dbl> 1979.365, 1945.252, 1945.252, 1525.409, 1525…
## $ vote_average              <dbl> 1.824549, 1.774952, 1.774952, 1.856298, 1.85…
## $ tfidf_overview_a          <dbl> 0.00000000, 0.03113613, 0.03113613, 0.000000…
## $ tfidf_overview_about      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_after      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_against    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_all        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_an         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_and        <dbl> 0.05096523, 0.00000000, 0.00000000, 0.065526…
## $ tfidf_overview_are        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.128197…
## $ tfidf_overview_as         <dbl> 0.00000000, 0.14246335, 0.14246335, 0.122111…
## $ tfidf_overview_at         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_back       <dbl> 0.00000000, 0.12307855, 0.12307855, 0.210991…
## $ tfidf_overview_be         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_becomes    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_been       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_before     <dbl> 0.0000000, 0.1257618, 0.1257618, 0.0000000, …
## $ tfidf_overview_begins     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_but        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_by         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.115218…
## $ tfidf_overview_can        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_city       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_dark       <dbl> 0.0000000, 0.1301812, 0.1301812, 0.0000000, …
## $ tfidf_overview_dead       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_deadly     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_death      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_discovers  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_evil       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_family     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_film       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_find       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_finds      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_for        <dbl> 0.00000000, 0.06240096, 0.06240096, 0.000000…
## $ tfidf_overview_friends    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_from       <dbl> 0.09433096, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_get        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_girl       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_group      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_has        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_have       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_he         <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_her        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_him        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_his        <dbl> 0.07642058, 0.00000000, 0.00000000, 0.196510…
## $ tfidf_overview_home       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_house      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_overview_in         <dbl> 0.11721274, 0.04395478, 0.04395478, 0.000000…
## $ tfidf_overview_into       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_is         <dbl> 0.06859469, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_it         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_its        <dbl> 0.0000000, 0.2580545, 0.2580545, 0.0000000, …
## $ tfidf_overview_killer     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_life       <dbl> 0.14189981, 0.00000000, 0.00000000, 0.182442…
## $ tfidf_overview_lives      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_man        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_more       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_must       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_mysterious <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_new        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_night      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_not        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_now        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_of         <dbl> 0.14568390, 0.07284195, 0.07284195, 0.062435…
## $ tfidf_overview_old        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_on         <dbl> 0.17260504, 0.12945378, 0.12945378, 0.000000…
## $ tfidf_overview_once       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_one        <dbl> 0.12506056, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_only       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_out        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_over       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_people     <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_secret     <dbl> 0.0000000, 0.1286503, 0.1286503, 0.0000000, …
## $ tfidf_overview_she        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_small      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_soon       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_that       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_the        <dbl> 0.04427295, 0.19922825, 0.19922825, 0.056922…
## $ tfidf_overview_their      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.112500…
## $ tfidf_overview_them       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_themselves <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_they       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_this       <dbl> 0.13574316, 0.00000000, 0.00000000, 0.174526…
## $ tfidf_overview_time       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.2167946, …
## $ tfidf_overview_to         <dbl> 0.04750568, 0.03562926, 0.03562926, 0.061078…
## $ tfidf_overview_town       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_two        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_up         <dbl> 0.00000000, 0.09987816, 0.09987816, 0.000000…
## $ tfidf_overview_was        <dbl> 0.1628263, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_what       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_overview_when       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_where      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_which      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_while      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_who        <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_wife       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_will       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_with       <dbl> 0.00000000, 0.06030985, 0.06030985, 0.000000…
## $ tfidf_overview_woman      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_world      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_overview_year       <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_years      <dbl> 0.14834278, 0.00000000, 0.00000000, 0.000000…
## $ tfidf_overview_young      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.000000…
## $ original_language_ar      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_bn      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_cn      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_de      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_en      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ original_language_es      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_fa      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_fi      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_fr      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_hi      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_it      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ja      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_kn      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ko      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ml      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ms      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_nb      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_no      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_pl      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_pt      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ru      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_sv      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_ta      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_te      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_th      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_tl      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ original_language_tr      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Action        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Adventure     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Animation     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Comedy        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Crime         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Documentary   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Drama         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Family        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Fantasy       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Fiction       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ genre_names_History       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Horror        <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,…
## $ genre_names_Movie         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Music         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Mystery       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Romance       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Science       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Thriller      <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,…
## $ genre_names_War           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ genre_names_Western       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

#specify model
xgboost_spec <-
    boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
    set_mode("regression") %>%
    set_engine("xgboost")

# Combine recipe and model using workflow
xgboost_workflow <-
    workflow() %>%
    add_recipe(xgboost_recipe) %>%
    add_model(xgboost_spec)

# Tune hyperparameters
set.seed(344)
xgboost_tune <-
    tune_grid(xgboost_workflow,
              resamples = data_cv,
              grid = 5)

## → A | warning: ! There are new levels in a factor: `pt` and `nb`.

## 
There were issues with some computations   A: x1

There were issues with some computations   A: x2

There were issues with some computations   A: x3

There were issues with some computations   A: x4

There were issues with some computations   A: x5

                                                 
→ B | warning: ! There are new levels in a factor: `Documentary`.
## There were issues with some computations   A: x5

There were issues with some computations   A: x5   B: x1

There were issues with some computations   A: x5   B: x2

There were issues with some computations   A: x5   B: x3

There were issues with some computations   A: x5   B: x4

There were issues with some computations   A: x5   B: x5

                                                         
→ C | warning: ! There are new levels in a factor: `Movie`.
## There were issues with some computations   A: x5   B: x5

There were issues with some computations   A: x5   B: x5   C: x1

There were issues with some computations   A: x5   B: x5   C: x2

There were issues with some computations   A: x5   B: x5   C: x3

There were issues with some computations   A: x5   B: x5   C: x4

There were issues with some computations   A: x5   B: x5   C: x5

                                                                 
→ D | warning: ! There are new levels in a factor: `bn`., ! There are new levels in a factor: `War`.
## There were issues with some computations   A: x5   B: x5   C: x5

There were issues with some computations   A: x5   B: x5   C: x5   D: x1

There were issues with some computations   A: x5   B: x5   C: x5   D: x2

There were issues with some computations   A: x5   B: x5   C: x5   D: x3

There were issues with some computations   A: x5   B: x5   C: x5   D: x4

There were issues with some computations   A: x5   B: x5   C: x5   D: x5

                                                                         
→ E | warning: ! There are new levels in a factor: `ar` and `ms`., ! There are new levels in a factor: `Western`.
## There were issues with some computations   A: x5   B: x5   C: x5   D: x5

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x1

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x2

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x3

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x4

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x5

                                                                                 
→ F | warning: ! There are new levels in a factor: `cn`.
## There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x5

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

There were issues with some computations   A: x5   B: x5   C: x5   D: x5   E: x…

#Conclusion

The first thing that I did was make sure that all missing values were elimin ated from the data set and I also treated values with the na.omit function to lighten the overall data points in the set.

I also added the tokenizefilter argument under therecipe step and I set that as 100.

These are all things that I have been working on with Proffesor Lee and fixing as I went along.

Apply to Your Data 4: Horror Movies

Stephen Morris

2024-02-27

Build Models