library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
horror_movies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv')
## Rows: 32540 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): original_title, title, original_language, overview, tagline, post...
## dbl (8): id, popularity, vote_count, vote_average, budget, revenue, runtim...
## lgl (1): adult
## date (1): release_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Distribution of audience ratings (vote_average)
ggplot(horror_movies, aes(vote_average)) +
  geom_histogram(alpha = 0.8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Ratings across binned runtimes
horror_movies %>%
  filter(!is.na(runtime)) %>%
  mutate(runtime = cut(runtime, 10)) %>%
  ggplot(aes(runtime, vote_average, fill = runtime)) +
  geom_boxplot(alpha = 0.7, show.legend = FALSE)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
# Train/test split of title, rating, and tagline, stratified on the outcome
set.seed(123)
horror_split <-
  horror_movies %>%
  select(title, vote_average, tagline) %>%
  na.omit() %>%
  initial_split(strata = vote_average)
horror_train <- training(horror_split)
horror_test <- testing(horror_split)

# 10-fold cross-validation resamples of the training set
set.seed(234)
horror_folds <- vfold_cv(horror_train, strata = vote_average)
horror_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [8572/955]> Fold01
## 2 <split [8572/955]> Fold02
## 3 <split [8574/953]> Fold03
## 4 <split [8574/953]> Fold04
## 5 <split [8574/953]> Fold05
## 6 <split [8575/952]> Fold06
## 7 <split [8575/952]> Fold07
## 8 <split [8575/952]> Fold08
## 9 <split [8576/951]> Fold09
## 10 <split [8576/951]> Fold10
library(textrecipes)
# Custom tokenizer: split taglines on spaces, strip punctuation, trim and
# lowercase each token, and replace any remaining spaces with underscores
split_tagline <- function(x) {
  x %>%
    str_split(" ") %>%
    map(str_remove_all, "[:punct:]") %>%
    map(str_squish) %>%
    map(str_to_lower) %>%
    map(str_replace_all, " ", "_")
}
horror_rec <-
  recipe(vote_average ~ ., data = horror_train) %>%
  # keep title as an identifier rather than a predictor
  update_role(title, new_role = "id") %>%
  # tokenize taglines with the custom splitter, keep the 100 most common
  # tokens, and convert them to term-frequency features
  step_tokenize(tagline, custom_token = split_tagline) %>%
  step_tokenfilter(tagline, max_tokens = 100) %>%
  step_tf(tagline)
horror_prep <- prep(horror_rec)
bake(horror_prep, new_data = NULL)
## # A tibble: 9,527 × 102
## title vote_average tf_tagline_ tf_tagline_a tf_tagline_about tf_tagline_all
## <fct> <dbl> <int> <int> <int> <int>
## 1 Living… 2 0 0 0 0
## 2 Dahmer… 1.8 0 0 0 0
## 3 The Ca… 2 0 0 0 0
## 4 The Vi… 0 0 0 0 0
## 5 The Me… 0 0 0 0 0
## 6 The Ac… 0 0 0 0 0
## 7 Grimcu… 0 0 0 0 0
## 8 The Pr… 0 0 0 0 0
## 9 Bones … 0 0 0 0 0
## 10 Deadst… 0 0 0 0 0
## # ℹ 9,517 more rows
## # ℹ 96 more variables: tf_tagline_an <int>, tf_tagline_and <int>,
## # tf_tagline_are <int>, tf_tagline_at <int>, tf_tagline_back <int>,
## # tf_tagline_be <int>, tf_tagline_blood <int>, tf_tagline_but <int>,
## # tf_tagline_by <int>, tf_tagline_can <int>, tf_tagline_cant <int>,
## # tf_tagline_come <int>, tf_tagline_comes <int>, tf_tagline_dead <int>,
## # tf_tagline_death <int>, tf_tagline_die <int>, tf_tagline_do <int>, …
# Tunable boosted-tree specification for regression with the xgboost engine
xgb_spec <-
  boost_tree(
    trees = tune(),
    mtry = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
xgb_wf <- workflow(horror_rec, xgb_spec)
xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_tokenize()
## • step_tokenfilter()
## • step_tf()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## trees = tune()
## min_n = tune()
## learn_rate = 0.01
##
## Computational engine: xgboost
library(finetune)
doParallel::registerDoParallel()
# Tune with ANOVA racing: clearly poor parameter combinations are dropped early
set.seed(234)
xgb_horror_rs <-
  tune_race_anova(
    xgb_wf,
    horror_folds,
    grid = 20,
    control = control_race(verbose_elim = TRUE)
  )
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will minimize the rmse metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 5 eliminated; 15 candidates remain.
##
## ℹ Fold06: 6 eliminated; 9 candidates remain.
##
## ℹ Fold08: 3 eliminated; 6 candidates remain.
##
## ℹ Fold01: 0 eliminated; 6 candidates remain.
##
## ℹ Fold04: 1 eliminated; 5 candidates remain.
##
## ℹ Fold02: 0 eliminated; 5 candidates remain.
##
## ℹ Fold09: 1 eliminated; 4 candidates remain.
xgb_horror_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
## splits id .order .metrics .notes
## <list> <chr> <int> <list> <list>
## 1 <split [8574/953]> Fold03 1 <tibble [40 × 7]> <tibble [0 × 3]>
## 2 <split [8574/953]> Fold05 2 <tibble [40 × 7]> <tibble [0 × 3]>
## 3 <split [8576/951]> Fold10 3 <tibble [40 × 7]> <tibble [0 × 3]>
## 4 <split [8575/952]> Fold06 4 <tibble [30 × 7]> <tibble [0 × 3]>
## 5 <split [8575/952]> Fold08 5 <tibble [18 × 7]> <tibble [0 × 3]>
## 6 <split [8572/955]> Fold01 6 <tibble [12 × 7]> <tibble [0 × 3]>
## 7 <split [8574/953]> Fold04 7 <tibble [12 × 7]> <tibble [0 × 3]>
## 8 <split [8572/955]> Fold02 8 <tibble [10 × 7]> <tibble [0 × 3]>
## 9 <split [8576/951]> Fold09 9 <tibble [10 × 7]> <tibble [0 × 3]>
## 10 <split [8575/952]> Fold07 10 <tibble [8 × 7]> <tibble [0 × 3]>
plot_race(xgb_horror_rs)
show_best(xgb_horror_rs)
## Warning: No value of `metric` was given; metric 'rmse' will be used.
## # A tibble: 4 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 46 881 21 rmse standard 2.57 10 0.0117 Preprocessor1_Model10
## 2 28 628 15 rmse standard 2.58 10 0.0116 Preprocessor1_Model06
## 3 82 1006 24 rmse standard 2.58 10 0.0115 Preprocessor1_Model17
## 4 1 1571 40 rmse standard 2.58 10 0.0110 Preprocessor1_Model01
# Finalize the workflow with the lowest-RMSE parameters, then fit on the
# training set and evaluate once on the held-out test set
xgb_last <-
  xgb_wf %>%
  finalize_workflow(select_best(xgb_horror_rs, metric = "rmse")) %>%
  last_fit(horror_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [9527/3178]> train/test split <tibble> <tibble> <tibble> <workflow>
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
# Variable importance from the fitted workflow
xgb_fit <- extract_workflow(xgb_last)
vip(xgb_fit, geom = "point", num_features = 12)
library(SHAPforxgboost)
# SHAP values from the fitted xgboost booster and the baked predictor matrix
horror_shap <-
  shap.prep(
    xgb_model = extract_fit_engine(xgb_fit),
    X_train = bake(horror_prep,
      has_role("predictor"),
      new_data = NULL,
      composition = "matrix"
    )
  )
shap.plot.summary(horror_shap)
I had to remove the last code block with the SHAP plot because I could not get the Apply to knit with that chunk included; it was stuck loading for hours. I also did not add more than one predictor, as you had suggested in your comments on my previous Apply, because I had already completed this assignment before that feedback was posted. I had an away game today and was gone from 6:45 am to 8:45 pm, so I did not have time to add to this. I am also not able to come to class because I have another class required for my major scheduled for 2:30-4:10 on M/W, for which you gave me an override for the class conflict when I signed up for courses in the spring. I hope you will understand and take this into consideration when grading the assignment.
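If the SHAP chunk is the only thing preventing the document from knitting, one possible workaround (a sketch, assuming a random subset of rows is representative enough for the summary plot; the object names horror_pred_mat, sample_rows, and horror_shap_small are hypothetical and not from the analysis above) is to compute SHAP values on a sample of the baked predictor matrix, or to set the chunk option eval=FALSE so the code is displayed without running:

# Hypothetical speed-up: compute SHAP values on a subset of rows only
set.seed(345)
horror_pred_mat <- bake(horror_prep,
  has_role("predictor"),
  new_data = NULL,
  composition = "matrix"
)
sample_rows <- sample(nrow(horror_pred_mat), 2000)
horror_shap_small <- shap.prep(
  xgb_model = extract_fit_engine(xgb_fit),
  X_train = horror_pred_mat[sample_rows, ]
)
shap.plot.summary(horror_shap_small)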
What is the research question? Clearly state the research question you aim to address using the new dataset. How do the runtime and tagline of a horror movie relate to its average audience rating (vote_average), and which factors are the most significant predictors of that rating?
Describe the data briefly: Provide an overview of the new data set, highlighting its key characteristics and dimensions. The data set is about horror movies and has 32,540 observations of 20 variables, including vote_average, runtime, title, overview, release_date, budget, and tagline. It mixes numeric and character data, with 10 of the 20 variables stored as character, along with one logical and one date column.
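As a quick check of those dimensions and column types (a small sketch that was not part of the analysis above):

# Confirm 32,540 rows x 20 columns and count columns of each type
dim(horror_movies)
horror_movies %>%
  summarise(across(everything(), ~ class(.x)[1])) %>%
  pivot_longer(everything(), names_to = "column", values_to = "type") %>%
  count(type)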
What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the data set and their characteristics. The key variables are vote_average, runtime, and tagline. vote_average indicates a movie’s reception through audience ratings and is the outcome the model predicts. runtime measures the movie’s duration and is explored for its potential influence on ratings. tagline, the short catchphrase attached to each movie, is tokenized so that individual words can be tested as predictors of higher or lower ratings. vote_average and runtime are both numeric, while tagline is character data. Together, these variables, explored through the graphs above, show the correlations and trends that provide a baseline for predicting movie ratings with the machine learning model.
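For a rough sense of these three variables (again a sketch, not run in the analysis above):

# Simple summaries of the outcome and the two predictors of interest
horror_movies %>%
  summarise(
    mean_vote_average = mean(vote_average, na.rm = TRUE),
    median_runtime = median(runtime, na.rm = TRUE),
    prop_missing_tagline = mean(is.na(tagline))
  )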
What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new data set. One preparation step used in the code is na.omit(), which removes rows with missing values in the title, vote_average, or tagline columns. The value_squared() and value_log() transformations from the video were based on a “value” column that this data set does not have, so they were not applied here. Instead, the taglines are tokenized with the custom split_tagline() function inside step_tokenize(), step_tokenfilter() keeps only the top 100 tagline words, and step_tf() converts them to term-frequency features. This makes the tagline words more meaningful as predictors and keeps the analysis less cluttered.
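A quick way to confirm which preprocessing steps ended up in the trained recipe (a sketch, not shown in the output above) is to tidy the prepped recipe:

# List the trained recipe steps (tokenize, tokenfilter, tf)
tidy(horror_prep)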
What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question.
The machine learning model used in the analysis is XGBoost (gradient-boosted trees). This is specified in the code with set_engine("xgboost") and set_mode("regression") at the bottom of the seventh code chunk, which tell parsnip to fit a boosted-tree regression model with the xgboost engine. A regression model is appropriate because the outcome, vote_average, is numeric, and the boosted trees are used here to predict horror movie ratings from the tagline features.
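To report how the finalized xgboost model performs on the held-out test set, the metrics and predictions stored in xgb_last could be pulled out like this (a sketch; these lines were not run above):

# Test-set metrics (RMSE and R-squared by default) for the finalized workflow
collect_metrics(xgb_last)

# Predicted vs. observed ratings on the test set
collect_predictions(xgb_last) %>%
  ggplot(aes(vote_average, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.3)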