Goal: to predict the Youtube like count Click here for the data

.

Import Data

 youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(youtube)
Data summary
Name youtube
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
data <- youtube %>%

  
  # Treat missing values
  select(-thumbnail, -description, -favorite_count, -comment_count, -published_at, -category_id, -superbowl_ads_dot_com_url, -youtube_url, -id ,-etag, -channel_title) %>% 
   na.omit() %>% 
  
  # log transform variables with pos-skewed distribution
  mutate(like_count = log(like_count))

Explore Data

Identify good predictors

like_count

data %>%
  ggplot(aes(like_count, view_count)) +
  scale_y_log10() +
  geom_point()

data %>% 
  ggplot(aes(like_count, as.factor(brand))) +
  geom_boxplot() 
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

title

data %>%  
  
  # tokenism title
  unnest_tokens(output = word, input = brand) %>%
  
  # calculate avg rent per word
  group_by(word) %>%
  summarise(like_count = mean(like_count), 
            n   = n()) %>%
  ungroup() %>%

filter(n > 10, !str_detect(word, "\\d")) %>%
  slice_max(order_by = like_count, n = 20) %>% 
  
  # plot

    ggplot(aes(like_count, fct_reorder(word, like_count))) +
  geom_point() +


  
  labs(y = "word in Title")

# step 1: prepare data
data_binarized_tbl <- data %>%

  select(-dislike_count, -title) %>%
binarize() 

data_binarized_tbl  %>% glimpse() 
## Rows: 225
## Columns: 36
## $ `year__-Inf_2005`                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf                                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light                              <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola`                            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai                                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota                                 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__0                                      <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__1                                      <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__0                       <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__1                       <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__0                                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__1                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__0                                  <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__1                                  <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__0                                     <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__1                                     <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__0                                    <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__1                                    <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__0                                    <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__1                                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `view_count__-Inf_6641`                       <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ view_count__6641_43983                        <dbl> 0, 0, 0, 0, 1, 1, 0, 1, …
## $ view_count__43983_175482                      <dbl> 1, 1, 1, 0, 0, 0, 0, 0, …
## $ view_count__175482_Inf                        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `like_count__-Inf_2.94443897916644`           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.94443897916644_4.86753445045558 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.86753445045558_6.26720054854136 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26720054854136_Inf              <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
# step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
  correlate(like_count__6.26720054854136_Inf )

data_corr_tbl
## # A tibble: 36 × 3
##    feature    bin                               correlation
##    <fct>      <chr>                                   <dbl>
##  1 like_count 6.26720054854136_Inf                    1    
##  2 view_count 175482_Inf                              0.715
##  3 like_count -Inf_2.94443897916644                  -0.339
##  4 view_count -Inf_6641                              -0.335
##  5 like_count 4.86753445045558_6.26720054854136      -0.331
##  6 like_count 2.94443897916644_4.86753445045558      -0.327
##  7 view_count 6641_43983                             -0.308
##  8 brand      Doritos                                 0.281
##  9 brand      NFL                                     0.250
## 10 brand      Bud_Light                              -0.212
## # ℹ 26 more rows
# step 3: 
data_corr_tbl %>%
  plot_correlation_funnel()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

split data

# data <- sample_n(data, 100)

# Split into train and test dataset 
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training data set for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [151/17]> Fold01
##  2 <split [151/17]> Fold02
##  3 <split [151/17]> Fold03
##  4 <split [151/17]> Fold04
##  5 <split [151/17]> Fold05
##  6 <split [151/17]> Fold06
##  7 <split [151/17]> Fold07
##  8 <split [151/17]> Fold08
##  9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
library(usemodels)
usemodels::use_xgboost(like_count ~ ., data_train)
## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(18995)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify recipe
xgboost_recipe <- 
  xgboost_recipe <-
  recipe(formula = like_count ~ ., data = data_train) %>%
  recipes::update_role(title, new_role = "id variable") %>%
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  step_other(brand) %>%
  step_dummy(brand,  one_hot = TRUE) %>%
  step_log(view_count, like_count, dislike_count)

  

xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Warning in bake.step_log(x$steps[[i]], new_data = training): NaNs produced
## Rows: 168
## Columns: 120
## $ year                      <dbl> 2013, 2015, 2008, 2010, 2009, 2007, 2010, 20…
## $ funny                     <lgl> TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE…
## $ show_product_quickly      <lgl> TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, …
## $ patriotic                 <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE…
## $ celebrity                 <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRU…
## $ danger                    <lgl> TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALS…
## $ animals                   <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE…
## $ use_sex                   <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FAL…
## $ kind                      <fct> youtube#video, youtube#video, youtube#video,…
## $ view_count                <dbl> 8.366835, 4.369448, 9.565704, 11.422760, 10.…
## $ dislike_count             <dbl> -Inf, -Inf, 0.6931472, 2.6390573, 0.6931472,…
## $ like_count                <dbl> 1.1285084, NaN, 1.2763453, 1.7919953, 1.2517…
## $ tfidf_title_2000          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_2001          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_2002          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_2005          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_2007          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_2009          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_2010          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_2011          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_2012          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_2013          <dbl> 0.6437752, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_2014          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_2015          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_2016          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_2018          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_2019          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_2020          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_44            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_a             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_ad            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_ads           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_advertisement <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_all           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_and           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_baby          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_bears         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_beer          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_best          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.6346645, …
## $ tfidf_title_bestbuds      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_big           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_bowl          <dbl> 0.2488648, 0.0000000, 0.2488648, 0.1777606, …
## $ tfidf_title_britney       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_bud           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_budweiser     <dbl> 0.0000000, 0.6812715, 0.0000000, 0.0000000, …
## $ tfidf_title_camry         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_car           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cedric        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cindy         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_coca          <dbl> 0.0000000, 0.0000000, 0.5129899, 0.0000000, …
## $ tfidf_title_coke          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_cola          <dbl> 0.0000000, 0.0000000, 0.5002872, 0.0000000, …
## $ tfidf_title_commercial    <dbl> 0.2135681, 0.3559469, 0.2135681, 0.0000000, …
## $ tfidf_title_commercials   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_cool          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_crash         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_date          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_diet          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_dilly         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_dog           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_dogs          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_doritos       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_e             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_elantra       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_etrade        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_extended      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_factory       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_fantasy       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_featuring     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_flavor        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_fly           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_full          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_funny         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_game          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_genesis       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_girlfriend    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_great         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_happiness     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_hd            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_horse         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_hyundai       <dbl> 0.5002872, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_in            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_is            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_island        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_it            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_kia           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_light         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_lighta        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_love          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_new           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_nfl           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.4810423, …
## $ tfidf_title_of            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_official      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_on            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.5775788, …
## $ tfidf_title_one           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_pepsi         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_puppy         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_spot          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_starring      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_super         <dbl> 0.2488648, 0.0000000, 0.2488648, 0.1777606, …
## $ tfidf_title_superbowl     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_the           <dbl> 0.0000000, 0.6931472, 0.0000000, 0.2970631, …
## $ tfidf_title_toyota        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_trade         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_tv            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_usa           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.00…
## $ tfidf_title_version       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_vs            <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.00…
## $ tfidf_title_winner        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_with          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, …
## $ tfidf_title_x             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_title_xliv          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.5775788, …
## $ brand_Bud.Light           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Budweiser           <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,…
## $ brand_Coca.Cola           <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,…
## $ brand_Doritos             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Hynudai             <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,…
## $ brand_Pepsi               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Toyota              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_other               <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,…
# specify model
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), mtry = tune()) %>%  
  set_mode("classification") %>% 
  set_engine("xgboost") 

# combine recipe and model using workflow
xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

# tune hyperprarameters 
set.seed(344)
xgboost_tune <-
  tune_grid(xgboost_workflow, resamples = data_cv, grid = 5)
## i Creating pre-processing data to finalize 1 unknown parameter: "mtry"
## Warning in bake.step_log(x$steps[[i]], new_data = training): NaNs produced
## → A | warning: NaNs produced
## There were issues with some computations   A: x1                                                 → B | error:   For a classification model, the outcome should be a <factor>, not a
##                double vector.
## There were issues with some computations   A: x1There were issues with some computations   A: x2   B: x1There were issues with some computations   A: x3   B: x3There were issues with some computations   A: x4   B: x6There were issues with some computations   A: x5   B: x10There were issues with some computations   A: x6   B: x15There were issues with some computations   A: x7   B: x15There were issues with some computations   A: x8   B: x16There were issues with some computations   A: x9   B: x18There were issues with some computations   A: x10   B: x21There were issues with some computations   A: x11   B: x25There were issues with some computations   A: x12   B: x30There were issues with some computations   A: x13   B: x30There were issues with some computations   A: x14   B: x31There were issues with some computations   A: x15   B: x33There were issues with some computations   A: x16   B: x36There were issues with some computations   A: x17   B: x40There were issues with some computations   A: x18   B: x45There were issues with some computations   A: x19   B: x45There were issues with some computations   A: x20   B: x46There were issues with some computations   A: x21   B: x48There were issues with some computations   A: x22   B: x51There were issues with some computations   A: x23   B: x55There were issues with some computations   A: x25   B: x60There were issues with some computations   A: x26   B: x61There were issues with some computations   A: x27   B: x63There were issues with some computations   A: x28   B: x66There were issues with some computations   A: x29   B: x70There were issues with some computations   A: x30   B: x75There were issues with some computations   A: x31   B: x75There were issues with some computations   A: x32   B: x76There were issues with some computations   A: x33   B: x78There were issues with some computations   A: x34   B: x81There were issues with some computations   A: x35   B: x85There were issues with some computations   A: x36   B: x90There were issues with some computations   A: x37   B: x90There were issues with some computations   A: x38   B: x91There were issues with some computations   A: x39   B: x93There were issues with some computations   A: x40   B: x96There were issues with some computations   A: x41   B: x100There were issues with some computations   A: x42   B: x105There were issues with some computations   A: x43   B: x105There were issues with some computations   A: x44   B: x106There were issues with some computations   A: x45   B: x108There were issues with some computations   A: x46   B: x111There were issues with some computations   A: x47   B: x115There were issues with some computations   A: x48   B: x120There were issues with some computations   A: x49   B: x120There were issues with some computations   A: x50   B: x121There were issues with some computations   A: x51   B: x123There were issues with some computations   A: x52   B: x126There were issues with some computations   A: x54   B: x135There were issues with some computations   A: x55   B: x135There were issues with some computations   A: x56   B: x136There were issues with some computations   A: x57   B: x138There were issues with some computations   A: x58   B: x141There were issues with some computations   A: x59   B: x145There were issues with some computations   A: x60   B: x150
## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.
## There were issues with some computations   A: x60   B: x150