Goal: to predict the Youtube like count Click here for the data

.

Import Data

 youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(youtube)
Data summary
Name youtube
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
# data <- youtube %>%
# 
#   
#   # Treat missing values
#   select(-thumbnail, -description, -favorite_count, -comment_count, -published_at, -category_id, -superbowl_ads_dot_com_url, -youtube_url, -id ,-etag,  -show_product_quickly, -patriotic, -celebrity, -danger, -animals, -use_sex, - channel_title, -kind) %>%
#   mutate(across(is.logical, as.factor)) %>%
#    na.omit() %>% 
#   
#   # log transform variables with pos-skewed distribution
#   mutate(like_count = log(like_count))

# Can't log transform like_count as it has non-positive values; Instead, you can address it in recipes step 
data <- youtube %>%

  # Treat missing values
  select(-thumbnail, -description, -favorite_count, -comment_count, -published_at, -category_id, -superbowl_ads_dot_com_url, -youtube_url, -id ,-etag,  -show_product_quickly, -patriotic, -celebrity, -danger, -animals, -use_sex, - channel_title, -kind, 
         - view_count, -dislike_count) %>%   # Remove these counts as they are not true predictors
  mutate(across(is.logical, as.factor)) %>%
   na.omit()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(is.logical, as.factor)`.
## Caused by warning:
## ! Use of bare predicate functions was deprecated in tidyselect 1.1.0.
## ℹ Please use wrap predicates in `where()` instead.
##   # Was:
##   data %>% select(is.logical)
## 
##   # Now:
##   data %>% select(where(is.logical))

Explore Data

Identify good predictors

like_count

data %>%
  ggplot(aes(like_count, view_count)) +
  scale_y_log10() +
  geom_point()
data %>% 
  ggplot(aes(like_count, as.factor(brand))) +
  geom_boxplot() 

title

data %>%  
  
  # tokenism title
  unnest_tokens(output = word, input = brand) %>%
  
  # calculate avg rent per word
  group_by(word) %>%
  summarise(like_count = mean(like_count), 
            n   = n()) %>%
  ungroup() %>%

filter(n > 10, !str_detect(word, "\\d")) %>%
  slice_max(order_by = like_count, n = 20) %>% 
  
  # plot

    ggplot(aes(like_count, fct_reorder(word, like_count))) +
  geom_point() +


  
  labs(y = "word in Title")

# step 1: prepare data
data_binarized_tbl <- data %>%

  # select(-dislike_count, -title) %>%
    select(-title) %>%    # count variables were removed in the cleaning step 
    binarize() 

data_binarized_tbl  %>% glimpse() 
## Rows: 225
## Columns: 20
## $ `year__-Inf_2005`     <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ year__2005_2010       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ year__2010_2015       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf        <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, …
## $ brand__Bud_Light      <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Budweiser      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ `brand__Coca-Cola`    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Doritos        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade`      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai        <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ brand__Kia            <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__NFL            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ brand__Pepsi          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota         <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ funny__FALSE          <dbl> 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, …
## $ funny__TRUE           <dbl> 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, …
## $ `like_count__-Inf_19` <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ like_count__19_130    <dbl> 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ like_count__130_527   <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ like_count__527_Inf   <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
# step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
  # correlate(like_count__6.26720054854136_Inf ) %>%
    correlate(like_count__527_Inf)   

data_corr_tbl
## # A tibble: 20 × 3
##    feature    bin       correlation
##    <fct>      <chr>           <dbl>
##  1 like_count 527_Inf      1       
##  2 like_count -Inf_19     -0.339   
##  3 like_count 130_527     -0.331   
##  4 like_count 19_130      -0.327   
##  5 brand      Doritos      0.281   
##  6 brand      NFL          0.250   
##  7 brand      Bud_Light   -0.212   
##  8 year       2015_Inf     0.202   
##  9 year       -Inf_2005   -0.193   
## 10 brand      Kia         -0.0909  
## 11 brand      Hynudai     -0.0857  
## 12 brand      Toyota       0.0754  
## 13 brand      E-Trade     -0.0451  
## 14 funny      FALSE        0.0184  
## 15 funny      TRUE        -0.0184  
## 16 year       2010_2015    0.0133  
## 17 year       2005_2010   -0.00992 
## 18 brand      Budweiser   -0.00579 
## 19 brand      Pepsi        0.000888
## 20 brand      Coca-Cola    0.000803
# step 3: 
data_corr_tbl %>%
  plot_correlation_funnel()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

split data

# No need to make the data small as yours is already small.
 # data <- sample_n(data, 100)

# Split into train and test dataset 
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training data set for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [151/17]> Fold01
##  2 <split [151/17]> Fold02
##  3 <split [151/17]> Fold03
##  4 <split [151/17]> Fold04
##  5 <split [151/17]> Fold05
##  6 <split [151/17]> Fold06
##  7 <split [151/17]> Fold07
##  8 <split [151/17]> Fold08
##  9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
library(usemodels)
usemodels::use_xgboost(like_count ~ ., data_train)
## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(18995)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify recipe
# xgboost_recipe <-
#   recipe(like_count ~ ., data = data_train) %>%
#   # recipes::update_role(like_count, new_role = "id variable") %>%
#   step_tokenize(title) %>%
#   step_tokenfilter(title, max_tokens = 100) %>%
#   step_tfidf(title) %>%
#   step_other(brand, threshold = 0.01) %>%
#   step_dummy(brand, funny,  one_hot = TRUE) %>%
#   step_YeoJohnson(view_count,like_count)

xgboost_recipe <-
  recipe(like_count ~ ., data = data_train) %>%
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  step_other(brand, threshold = 0.01) %>%
  step_dummy(brand, funny, one_hot = TRUE) %>%
  # 1. Remove variables with zero variance (constants)
  step_zv(all_predictors()) %>% 
  # 2. Normalize numeric predictors (now safe from zero-variance issues)
  step_normalize(all_numeric_predictors()) %>% 
  # 3. Transform the outcome
  step_YeoJohnson(like_count)

xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 168
## Columns: 114
## $ year                      <dbl> 0.5154043, 0.8590071, -0.3436029, 0.0000000,…
## $ like_count                <dbl> 2.9685828, 0.0000000, 3.3907606, 5.4116392, …
## $ tfidf_title_2000          <dbl> -0.1087598, -0.1087598, -0.1087598, -0.10875…
## $ tfidf_title_2001          <dbl> -0.1514581, -0.1514581, -0.1514581, -0.15145…
## $ tfidf_title_2002          <dbl> -0.1278554, -0.1278554, -0.1278554, -0.12785…
## $ tfidf_title_2005          <dbl> -0.1462442, -0.1462442, -0.1462442, -0.14624…
## $ tfidf_title_2007          <dbl> -0.1089823, -0.1089823, -0.1089823, -0.10898…
## $ tfidf_title_2009          <dbl> -0.1339065, -0.1339065, -0.1339065, -0.13390…
## $ tfidf_title_2010          <dbl> -0.1371154, -0.1371154, -0.1371154, -0.13711…
## $ tfidf_title_2011          <dbl> -0.1015255, -0.1015255, -0.1015255, -0.10152…
## $ tfidf_title_2012          <dbl> -0.2180951, -0.2180951, -0.2180951, -0.21809…
## $ tfidf_title_2013          <dbl> 5.1647302, -0.2037133, -0.2037133, -0.203713…
## $ tfidf_title_2014          <dbl> -0.2438312, -0.2438312, -0.2438312, -0.24383…
## $ tfidf_title_2015          <dbl> -0.1516157, -0.1516157, -0.1516157, -0.15161…
## $ tfidf_title_2016          <dbl> -0.1323708, -0.1323708, -0.1323708, -0.13237…
## $ tfidf_title_2018          <dbl> -0.1948139, -0.1948139, -0.1948139, -0.19481…
## $ tfidf_title_2019          <dbl> -0.1645455, -0.1645455, -0.1645455, -0.16454…
## $ tfidf_title_2020          <dbl> -0.1700777, -0.1700777, -0.1700777, -0.17007…
## $ tfidf_title_44            <dbl> -0.1093612, -0.1093612, -0.1093612, -0.10936…
## $ tfidf_title_a             <dbl> -0.2060617, -0.2060617, -0.2060617, -0.20606…
## $ tfidf_title_ad            <dbl> -0.3647755, -0.3647755, -0.3647755, -0.36477…
## $ tfidf_title_ads           <dbl> -0.1339669, -0.1339669, -0.1339669, -0.13396…
## $ tfidf_title_advertisement <dbl> -0.107287, -0.107287, -0.107287, -0.107287, …
## $ tfidf_title_all           <dbl> -0.1083241, -0.1083241, -0.1083241, -0.10832…
## $ tfidf_title_and           <dbl> -0.1321017, -0.1321017, -0.1321017, -0.13210…
## $ tfidf_title_baby          <dbl> -0.1198909, -0.1198909, -0.1198909, -0.11989…
## $ tfidf_title_bears         <dbl> -0.1037587, -0.1037587, -0.1037587, -0.10375…
## $ tfidf_title_beer          <dbl> -0.107287, -0.107287, -0.107287, -0.107287, …
## $ tfidf_title_best          <dbl> -0.1094371, -0.1094371, -0.1094371, 9.083278…
## $ tfidf_title_bestbuds      <dbl> -0.1049399, -0.1049399, -0.1049399, -0.10493…
## $ tfidf_title_big           <dbl> -0.1717156, -0.1717156, -0.1717156, -0.17171…
## $ tfidf_title_bowl          <dbl> 1.3995365, -0.7506182, 1.3995365, 0.7852066,…
## $ tfidf_title_britney       <dbl> -0.1079305, -0.1079305, -0.1079305, -0.10793…
## $ tfidf_title_bud           <dbl> -0.4864483, -0.4864483, -0.4864483, -0.48644…
## $ tfidf_title_budweiser     <dbl> -0.3459334, 1.3019205, -0.3459334, -0.345933…
## $ tfidf_title_camry         <dbl> -0.1094371, -0.1094371, -0.1094371, -0.10943…
## $ tfidf_title_car           <dbl> -0.1089823, -0.1089823, -0.1089823, -0.10898…
## $ tfidf_title_cedric        <dbl> -0.1688008, -0.1688008, -0.1688008, -0.16880…
## $ tfidf_title_cindy         <dbl> -0.1091109, -0.1091109, -0.1091109, -0.10911…
## $ tfidf_title_coca          <dbl> -0.2708385, -0.2708385, 2.6794296, -0.270838…
## $ tfidf_title_coke          <dbl> -0.1959223, -0.1959223, -0.1959223, -0.19592…
## $ tfidf_title_cola          <dbl> -0.2824949, -0.2824949, 2.6073696, -0.282494…
## $ tfidf_title_commercial    <dbl> 0.56011212, 1.49839080, 0.56011212, -0.84730…
## $ tfidf_title_commercials   <dbl> -0.107287, -0.107287, -0.107287, -0.107287, …
## $ tfidf_title_cool          <dbl> -0.1341561, -0.1341561, -0.1341561, -0.13415…
## $ tfidf_title_crash         <dbl> -0.1550743, -0.1550743, -0.1550743, -0.15507…
## $ tfidf_title_date          <dbl> -0.107287, -0.107287, -0.107287, -0.107287, …
## $ tfidf_title_diet          <dbl> -0.1459409, -0.1459409, -0.1459409, -0.14594…
## $ tfidf_title_dilly         <dbl> -0.07715167, -0.07715167, -0.07715167, -0.07…
## $ tfidf_title_dog           <dbl> -0.1941356, -0.1941356, -0.1941356, -0.19413…
## $ tfidf_title_dogs          <dbl> -0.09216796, -0.09216796, -0.09216796, -0.09…
## $ tfidf_title_doritos       <dbl> -0.2827732, -0.2827732, -0.2827732, -0.28277…
## $ tfidf_title_e             <dbl> -0.1511986, -0.1511986, -0.1511986, -0.15119…
## $ tfidf_title_elantra       <dbl> -0.1083241, -0.1083241, -0.1083241, -0.10832…
## $ tfidf_title_etrade        <dbl> -0.1409018, -0.1409018, -0.1409018, -0.14090…
## $ tfidf_title_extended      <dbl> -0.1674721, -0.1674721, -0.1674721, -0.16747…
## $ tfidf_title_factory       <dbl> -0.1091109, -0.1091109, -0.1091109, -0.10911…
## $ tfidf_title_fantasy       <dbl> -0.1344381, -0.1344381, -0.1344381, -0.13443…
## $ tfidf_title_featuring     <dbl> -0.1037587, -0.1037587, -0.1037587, -0.10375…
## $ tfidf_title_flavor        <dbl> -0.09216796, -0.09216796, -0.09216796, -0.09…
## $ tfidf_title_fly           <dbl> -0.1091918, -0.1091918, -0.1091918, -0.10919…
## $ tfidf_title_full          <dbl> -0.1087598, -0.1087598, -0.1087598, -0.10875…
## $ tfidf_title_funny         <dbl> -0.1473298, -0.1473298, -0.1473298, -0.14732…
## $ tfidf_title_game          <dbl> -0.1870394, -0.1870394, -0.1870394, -0.18703…
## $ tfidf_title_genesis       <dbl> -0.1037587, -0.1037587, -0.1037587, -0.10375…
## $ tfidf_title_girlfriend    <dbl> -0.1083241, -0.1083241, -0.1083241, -0.10832…
## $ tfidf_title_great         <dbl> -0.09776579, -0.09776579, -0.09776579, -0.09…
## $ tfidf_title_happiness     <dbl> -0.1313108, -0.1313108, -0.1313108, -0.13131…
## $ tfidf_title_hd            <dbl> -0.1945391, -0.1945391, -0.1945391, -0.19453…
## $ tfidf_title_horse         <dbl> -0.1079305, -0.1079305, -0.1079305, -0.10793…
## $ tfidf_title_hyundai       <dbl> 1.7447629, -0.2371829, -0.2371829, -0.237182…
## $ tfidf_title_in            <dbl> -0.1004947, -0.1004947, -0.1004947, -0.10049…
## $ tfidf_title_is            <dbl> -0.1317287, -0.1317287, -0.1317287, -0.13172…
## $ tfidf_title_island        <dbl> -0.1344381, -0.1344381, -0.1344381, -0.13443…
## $ tfidf_title_it            <dbl> -0.1015255, -0.1015255, -0.1015255, -0.10152…
## $ tfidf_title_kia           <dbl> -0.190631, -0.190631, -0.190631, -0.190631, …
## $ tfidf_title_light         <dbl> -0.4673781, -0.4673781, -0.4673781, -0.46737…
## $ tfidf_title_lighta        <dbl> -0.1344381, -0.1344381, -0.1344381, -0.13443…
## $ tfidf_title_love          <dbl> -0.1120992, -0.1120992, -0.1120992, -0.11209…
## $ tfidf_title_new           <dbl> -0.1825341, -0.1825341, -0.1825341, -0.18253…
## $ tfidf_title_nfl           <dbl> -0.1499735, -0.1499735, -0.1499735, 1.104576…
## $ tfidf_title_of            <dbl> -0.1120992, -0.1120992, -0.1120992, -0.11209…
## $ tfidf_title_official      <dbl> -0.1690023, -0.1690023, -0.1690023, -0.16900…
## $ tfidf_title_on            <dbl> -0.1340605, -0.1340605, -0.1340605, 6.978201…
## $ tfidf_title_one           <dbl> -0.1191012, -0.1191012, -0.1191012, -0.11910…
## $ tfidf_title_pepsi         <dbl> -0.301344, -0.301344, -0.301344, -0.301344, …
## $ tfidf_title_puppy         <dbl> -0.1037587, -0.1037587, -0.1037587, -0.10375…
## $ tfidf_title_spot          <dbl> -0.1302495, -0.1302495, -0.1302495, -0.13024…
## $ tfidf_title_starring      <dbl> -0.1291036, -0.1291036, -0.1291036, -0.12910…
## $ tfidf_title_super         <dbl> 1.3995365, -0.7506182, 1.3995365, 0.7852066,…
## $ tfidf_title_superbowl     <dbl> -0.2295741, -0.2295741, -0.2295741, -0.22957…
## $ tfidf_title_the           <dbl> -0.348641, 3.706434, -0.348641, 1.389248, -0…
## $ tfidf_title_toyota        <dbl> -0.1701628, -0.1701628, -0.1701628, -0.17016…
## $ tfidf_title_trade         <dbl> -0.1511986, -0.1511986, -0.1511986, -0.15119…
## $ tfidf_title_tv            <dbl> -0.1950976, -0.1950976, -0.1950976, -0.19509…
## $ tfidf_title_usa           <dbl> -0.148928, -0.148928, -0.148928, -0.148928, …
## $ tfidf_title_version       <dbl> -0.1542293, -0.1542293, -0.1542293, -0.15422…
## $ tfidf_title_vs            <dbl> -0.1536271, -0.1536271, -0.1536271, -0.15362…
## $ tfidf_title_winner        <dbl> -0.1550743, -0.1550743, -0.1550743, -0.15507…
## $ tfidf_title_with          <dbl> -0.1502229, -0.1502229, -0.1502229, -0.15022…
## $ tfidf_title_x             <dbl> -0.1228548, -0.1228548, -0.1228548, -0.12285…
## $ tfidf_title_xliv          <dbl> -0.1273842, -0.1273842, -0.1273842, 10.37099…
## $ brand_Bud.Light           <dbl> -0.5664918, -0.5664918, -0.5664918, -0.56649…
## $ brand_Budweiser           <dbl> -0.4070315, 2.4421887, -0.4070315, -0.407031…
## $ brand_Coca.Cola           <dbl> -0.3345335, -0.3345335, 2.9714444, -0.334533…
## $ brand_Doritos             <dbl> -0.3768379, -0.3768379, -0.3768379, -0.37683…
## $ brand_E.Trade             <dbl> -0.2229403, -0.2229403, -0.2229403, -0.22294…
## $ brand_Hynudai             <dbl> 3.1842245, -0.3121789, -0.3121789, -0.312178…
## $ brand_Kia                 <dbl> -0.2229403, -0.2229403, -0.2229403, -0.22294…
## $ brand_NFL                 <dbl> -0.2078929, -0.2078929, -0.2078929, 4.781536…
## $ brand_Pepsi               <dbl> -0.3453776, -0.3453776, -0.3453776, -0.34537…
## $ brand_Toyota              <dbl> -0.2372063, -0.2372063, -0.2372063, -0.23720…
## $ funny_FALSE.              <dbl> -0.6490043, 1.5316502, -0.6490043, 1.5316502…
## $ funny_TRUE.               <dbl> 0.6490043, -1.5316502, 0.6490043, -1.5316502…
# Specify model
xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Combine recipe and model using workflow
xgboost_workflow <-
  workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_spec)

# Tune hyperparameters
set.seed(344)
xgboost_tune <-
  tune_grid(
    xgboost_workflow,
    resamples = data_cv,
    grid = 5
  )
## i Creating pre-processing data to finalize 1 unknown parameter: "mtry"
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
##    mtry trees min_n learn_rate .metric .estimator  mean     n std_err .config   
##   <int> <int> <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>     
## 1     1  1500    11    0.00422 rmse    standard    2.11    10  0.0818 pre0_mod1…
## 2    57     1     2    0.0750  rmse    standard    2.11    10  0.0823 pre0_mod3…
## 3    85   500    30    0.001   rmse    standard    2.13    10  0.0810 pre0_mod4…
## 4   113  2000    21    0.0178  rmse    standard    2.14    10  0.136  pre0_mod5…
## 5    29  1000    40    0.316   rmse    standard    2.18    10  0.0948 pre0_mod2…
# Update the model by selecting the best hyperparameters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
  tune::select_best(xgboost_tune, metric = "rmse"))


# Fit the model on the entire training data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config        
##   <chr>   <chr>          <dbl> <chr>          
## 1 rmse    standard       2.07  pre0_mod0_post0
## 2 rsq     standard       0.135 pre0_mod0_post0
tune::collect_predictions(data_fit) %>%
  # ggplot(aes(price, .pred)) +
    ggplot(aes(like_count, .pred)) +
  geom_point(alpha = 0.3, fill = "midnightblue") +
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()