library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(tidytext)
library(explore)
## Warning: package 'explore' was built under R version 4.2.3
library(spacyr)
## Warning: package 'spacyr' was built under R version 4.2.3
library(textrecipes)
## Loading required package: recipes
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.4 ✔ rsample 1.1.1
## ✔ dials 1.1.0 ✔ tune 1.0.1
## ✔ infer 1.0.4 ✔ workflows 1.1.2
## ✔ modeldata 1.0.1 ✔ workflowsets 1.0.0
## ✔ parsnip 1.1.0 ✔ yardstick 1.1.0
## Warning: package 'broom' was built under R version 4.2.3
## Warning: package 'parsnip' was built under R version 4.2.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ parsnip::get_dependency() masks spacyr::get_dependency()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(finetune)
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.2.3
Import Data
horror_movies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv')
## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): original_title, title, original_language, overview, tagline, post...
## dbl (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl (1): adult
## date (1): release_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- horror_movies %>%
mutate(vote_average = log1p(vote_average)) %>%
filter(!is.na(overview), vote_count != 0) %>%
separate_rows(genre_names, sep = ", ") %>%
filter(status == "Released") %>%
select(id, vote_average, genre_names, overview, runtime, original_language)
data <- data %>% sample_n(100)
Explore Data
data %>% glimpse()
data %>% skimr::skim()
data %>% select(id) %>% explore()
data %>% describe_all()
data %>% describe_cat(genre_names)
data %>% select(-id) %>% explore_all(target = vote_average)
data %>%
ggplot(aes(vote_average)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

data %>% count(original_language, sort = T)
## # A tibble: 17 × 2
## original_language n
## <chr> <int>
## 1 en 72
## 2 ja 7
## 3 zh 3
## 4 es 2
## 5 fr 2
## 6 it 2
## 7 ru 2
## 8 cn 1
## 9 da 1
## 10 is 1
## 11 ko 1
## 12 lv 1
## 13 ml 1
## 14 pt 1
## 15 sv 1
## 16 th 1
## 17 tl 1
data %>%
group_by(original_language) %>%
summarise(
n = n(),
avg_vote_average = mean(vote_average)
) %>%
ungroup() %>%
ggplot(aes(n, avg_vote_average)) +
#geom_point() +
geom_text(aes(label = original_language), check_overlap = TRUE) +
geom_hline(yintercept = mean(data$vote_average),
linewidth = 2, linetype = "dotted", color = "darkgray") +
scale_x_log10()

spacy_initialize(model = "en_core_web_sm")
# process documents and obtain a data.table
tidy_data <- data %>%
# Parse overview
mutate(overview_parsed = map(.x = .$overview, .f = ~spacy_parse(.x))) %>%
unnest(overview_parsed) %>%
# Select nouns and adjectives
filter(pos %in% c("ADJ", "NOUN"))
data_filtered <- tidy_data %>%
filter(str_detect(lemma, regex("[a-z]", ignore_case = TRUE))) %>%
group_by(lemma) %>%
summarise(
n = n(),
avg_vote_average = mean(vote_average)
) %>%
filter(n > 150)
data_filtered %>%
ggplot(aes(n, avg_vote_average)) +
# geom_point() +
geom_text(aes(label = lemma), check_overlap = TRUE) +
geom_hline(yintercept = mean(data_filtered$avg_vote_average),
linetype = "dotted", linewidth = 2, color = "darkgray") +
scale_x_log10()sp
data %>%
ggplot(aes(runtime, vote_average)) +
geom_jitter(alpha = 0.3)

Build a Model
set.seed(123)
data_split <- initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(234)
data_folds <- rsample::vfold_cv(data_train)
data_folds
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
use_xgboost(vote_average ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = vote_average ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(56024)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <-
recipe(formula = vote_average ~ ., data = data_train) %>%
recipes::update_role(id, new_role = "id") %>%
step_tokenize(overview) %>%
step_stopwords(overview) %>%
step_tokenfilter(overview, max_tokens = 100) %>%
step_tfidf(overview) %>%
step_other(original_language) %>%
step_dummy(genre_names, original_language, one_hot = TRUE) %>%
step_YeoJohnson(runtime)
xgboost_recipe %>% prep() %>% juice(new_data = NULL) %>% glimpse()
## Rows: 75
## Columns: 0
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune(), sample_size = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(56024)
doParallel::registerDoParallel()
xgboost_tune <-
tune_grid(xgboost_workflow, resamples = data_folds, grid = 10)
# Explore Results
show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 581 15 2 0.0253 1.98e- 4 0.738 rmse
## 2 1116 32 6 0.0399 1.10e- 6 0.479 rmse
## 3 1246 9 7 0.00902 7.48e-10 0.154 rmse
## 4 357 40 3 0.0803 1.73e- 9 0.882 rmse
## 5 861 25 9 0.0174 6.69e- 8 0.439 rmse
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
autoplot(xgboost_tune)

final_rf <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, "rmse"))
data_fit <- last_fit(final_rf, data_split)
## ! train/test split: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: TV Movie, Documentary
data_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [75/25]> train/test split <tibble> <tibble> <tibble> <workflow>
##
## There were issues with some computations:
##
## - Warning(s) x1: There are new levels in a factor: TV Movie, Documentary
##
## Run `show_notes(.Last.tune.result)` for more information.
Evaluate Model
collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.232 Preprocessor1_Model1
## 2 rsq standard 0.0483 Preprocessor1_Model1
collect_predictions(data_fit)
## # A tibble: 25 × 5
## id .pred .row vote_average .config
## <chr> <dbl> <int> <dbl> <chr>
## 1 train/test split 1.70 1 1.50 Preprocessor1_Model1
## 2 train/test split 1.88 2 1.99 Preprocessor1_Model1
## 3 train/test split 1.53 3 1.92 Preprocessor1_Model1
## 4 train/test split 1.77 10 2.01 Preprocessor1_Model1
## 5 train/test split 1.88 11 1.79 Preprocessor1_Model1
## 6 train/test split 1.97 24 2.20 Preprocessor1_Model1
## 7 train/test split 1.79 28 1.97 Preprocessor1_Model1
## 8 train/test split 1.88 35 1.84 Preprocessor1_Model1
## 9 train/test split 1.69 37 1.69 Preprocessor1_Model1
## 10 train/test split 1.88 44 1.57 Preprocessor1_Model1
## # ℹ 15 more rows
collect_predictions(data_fit) %>%
ggplot(aes(vote_average, .pred)) +
geom_point(alpha = 0.5, fill = "midnightblue") +
geom_abline(lty = 2, color = "gray50") +
coord_fixed()

data_fit %>%
extract_workflow() %>%
predict(data_test[1,])
## # A tibble: 1 × 1
## .pred
## <dbl>
## 1 1.70
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
imp_spec <- xgboost_spec %>%
tune::finalize_model(tune::select_best(xgboost_tune)) %>%
parsnip::set_engine("xgboost", importance = "permutation")
## Warning: No value of `metric` was given; metric 'rmse' will be used.
workflows::workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(imp_spec) %>%
fit(data_train) %>%
workflows::extract_fit_parsnip() %>%
vip()
## [04:51:53] WARNING: amalgamation/../src/learner.cc:627:
## Parameters: { "importance" } might not be used.
##
## This could be a false alarm, with some parameters getting used by language bindings but
## then being mistakenly passed down to XGBoost core, or some parameter actually being used
## but getting flagged wrongly here. Please open an issue if you find any such cases.
