Import Data

# Download youtube.csv from the TidyTuesday 2021-03-02 data folder.
youtube <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv"
)

# Print a per-column summary of the imported data.
skimr::skim(youtube)
Data summary
Name youtube
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
# Build the analysis table: drop unused columns, discard incomplete rows,
# log-transform the outcome and convert categorical columns to factors.
data <- youtube %>%
  # Remove URL/metadata columns and the other engagement counts
  select(
    -c(
      superbowl_ads_dot_com_url, youtube_url, kind, etag, channel_title,
      comment_count, thumbnail, published_at, description, dislike_count,
      favorite_count, view_count
    )
  ) %>%
  # Keep only rows with no missing value in the remaining columns
  na.omit() %>%
  mutate(
    # log(x + 1) keeps rows with zero likes finite
    like_count = log(like_count + 1),
    category_id = as.factor(category_id),
    # Character columns become factors ...
    across(where(is.character), as.factor),
    # ... except title, which is turned back into plain text
    title = as.character(title),
    across(where(is.logical), as.factor)
  )

Explore Data

Identify good predictors.

funny

# Point plot of the (log) like count on x against the funny flag on y
ggplot(data, aes(x = like_count, y = funny)) +
  geom_point()

Title words

# Mean (log) like count for every word that appears in a video title
data %>%

    # Tokenize title: one row per word occurrence
    unnest_tokens(output = word, input = title) %>%

    # Average like_count per word; n counts the rows contributing to each word
    group_by(word) %>%
    summarise(like_count = mean(like_count),
              n          = n()) %>%
    ungroup() %>%

    # Plot the words ordered by their mean like_count
    ggplot(aes(like_count, fct_reorder(word, like_count))) +
    geom_point() +

    # The y axis shows title tokens, not brands
    labs(y = "Title words")

EDA Shortcut

# Step 1: Prepare Data
# Drop the id and free-text title columns, then binarize the remaining ones.
data_binarized_tbl <- binarize(select(data, -c(id, title)))

# Inspect the resulting columns
glimpse(data_binarized_tbl)
## Rows: 225
## Columns: 43
## $ `year__-Inf_2005`                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf                                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light                              <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola`                            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai                                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota                                 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE                                  <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE                                   <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE                   <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE                    <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE                              <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE                               <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE                                 <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE                                  <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE                                <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE                                 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE                                <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.99573227355399`           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.99573227355399_4.87519732320115 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.87519732320115_6.26909628370626 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26909628370626_Inf              <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id__1                                <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2                                <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22                               <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24                               <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27                               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
# Correlate the binarized columns against the highest like_count bin.
data_corr_tbl <- correlate(
    data_binarized_tbl,
    like_count__6.26909628370626_Inf
)

data_corr_tbl
## # A tibble: 43 × 3
##    feature     bin                               correlation
##    <fct>       <chr>                                   <dbl>
##  1 like_count  6.26909628370626_Inf                    1    
##  2 like_count  -Inf_2.99573227355399                  -0.339
##  3 like_count  4.87519732320115_6.26909628370626      -0.331
##  4 like_count  2.99573227355399_4.87519732320115      -0.327
##  5 brand       Doritos                                 0.281
##  6 brand       NFL                                     0.250
##  7 brand       Bud_Light                              -0.212
##  8 year        2015_Inf                                0.202
##  9 year        -Inf_2005                              -0.193
## 10 category_id 1                                       0.133
## # ℹ 33 more rows
# Step 3: Plot
# Draw the correlation funnel for data_corr_tbl.
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

Split Data

# Hold out a test set; the seed is fixed before the random split
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- rsample::training(data_split)
data_test  <- rsample::testing(data_split)

# Resample the training set for cross-validation, with its own seed
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [151/17]> Fold01
##  2 <split [151/17]> Fold02
##  3 <split [151/17]> Fold03
##  4 <split [151/17]> Fold04
##  5 <split [151/17]> Fold05
##  6 <split [151/17]> Fold06
##  7 <split [151/17]> Fold07
##  8 <split [151/17]> Fold08
##  9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
# Print a template xgboost recipe / model spec / workflow / tuning call for
# this formula and training set; the printed code is adapted by hand below.
library(usemodels)
usemodels::use_xgboost(like_count ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(18995)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify Recipe
xgboost_recipe <-
  recipe(formula = like_count ~ ., data = data_train) %>%
  # Give id its own role so it is not selected as a predictor
  recipes::update_role(id, new_role = "id variables") %>%
  # title -> tokens -> at most 100 tokens -> tf-idf columns
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  # One-hot encode the nominal predictors
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Remove zero-variance predictors
  step_zv(all_predictors())

# Preview the preprocessed training data
glimpse(juice(prep(xgboost_recipe)))
## Rows: 168
## Columns: 140
## $ year                        <dbl> 2013, 2015, 2008, 2010, 2009, 2007, 2010, …
## $ id                          <fct> WTf0XGpINJI, 7_EfXuGev24, 2_LWZe2BGaE, 6cM…
## $ like_count                  <dbl> 3.1354942, 0.0000000, 3.6109179, 6.0038871…
## $ tfidf_title_2000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2002            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2007            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2009            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2010            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2011            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2013            <dbl> 0.6437752, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2014            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2016            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2018            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2020            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_44              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_a               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_ad              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_ads             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_advertisement   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_all             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_and             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_baby            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_bears           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_beer            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_best            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.6346645…
## $ tfidf_title_bestbuds        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_big             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_bowl            <dbl> 0.2488648, 0.0000000, 0.2488648, 0.1777606…
## $ tfidf_title_britney         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_bud             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_budweiser       <dbl> 0.0000000, 0.6812715, 0.0000000, 0.0000000…
## $ tfidf_title_camry           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_car             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cedric          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cindy           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_coca            <dbl> 0.0000000, 0.0000000, 0.5129899, 0.0000000…
## $ tfidf_title_coke            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_cola            <dbl> 0.0000000, 0.0000000, 0.5002872, 0.0000000…
## $ tfidf_title_commercial      <dbl> 0.2135681, 0.3559469, 0.2135681, 0.0000000…
## $ tfidf_title_commercials     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cool            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_crash           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_date            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_diet            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_dilly           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_dog             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_dogs            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_doritos         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_e               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_elantra         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_etrade          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_extended        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_factory         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_fantasy         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_featuring       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_flavor          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_fly             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_full            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_funny           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_game            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_genesis         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_girlfriend      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_great           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_happiness       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_hd              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_horse           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_hyundai         <dbl> 0.5002872, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_in              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_is              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_island          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_it              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_kia             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_light           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_lighta          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_love            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_new             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_nfl             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.4810423…
## $ tfidf_title_of              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_official        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_on              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.5775788…
## $ tfidf_title_one             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_pepsi           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_puppy           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_spot            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_starring        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_super           <dbl> 0.2488648, 0.0000000, 0.2488648, 0.1777606…
## $ tfidf_title_superbowl       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_the             <dbl> 0.0000000, 0.6931472, 0.0000000, 0.2970631…
## $ tfidf_title_toyota          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_trade           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_tv              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_usa             <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_version         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_vs              <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.…
## $ tfidf_title_winner          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_with            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_x               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_xliv            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.5775788…
## $ brand_Bud.Light             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Budweiser             <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ brand_Coca.Cola             <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ brand_Doritos               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_E.Trade               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ brand_Hynudai               <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ brand_Kia                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ brand_NFL                   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand_Pepsi                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Toyota                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ funny_FALSE.                <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, …
## $ funny_TRUE.                 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ show_product_quickly_FALSE. <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ show_product_quickly_TRUE.  <dbl> 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ patriotic_FALSE.            <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ patriotic_TRUE.             <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ celebrity_FALSE.            <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, …
## $ celebrity_TRUE.             <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ danger_FALSE.               <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, …
## $ danger_TRUE.                <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, …
## $ animals_FALSE.              <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, …
## $ animals_TRUE.               <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, …
## $ use_sex_FALSE.              <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex_TRUE.               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ category_id_X1              <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id_X2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id_X10             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X15             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X17             <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id_X19             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X22             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ category_id_X23             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X24             <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ category_id_X25             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X26             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X27             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X29             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Specify Model
# Boosted trees on the xgboost engine in regression mode; every listed
# hyperparameter is left as a tune() placeholder.
xgboost_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    sample_size = tune()
  ) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Combine recipe and model using workflow
xgboost_workflow <-
  workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_spec)

# Tune hyperparameters
# Evaluate the workflow on the cross-validation folds with grid = 5.
set.seed(89984)
xgboost_tune <- tune_grid(
  xgboost_workflow,
  resamples = data_cv,
  grid = 5
)