Goal: What factors lead to the most youtube likes?

Import Data

likes <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')

skimr::skim(likes)
Data summary
Name likes
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
Youtube <- likes %>%
    select(-superbowl_ads_dot_com_url, -youtube_url, -dislike_count, -view_count, -comment_count, -thumbnail) %>%
    na.omit() %>%

    # Transform data
    mutate(like_count = log(like_count + 1)) %>%
    mutate(across(where(is.logical), as.factor)) %>%
    mutate(across(where(is.character), as.factor)) %>%
    # Keep title as character as it's not a categorical variable, but a string
    mutate(title = as.character(title)) %>%
    mutate(description = as.character(description)) %>%
    mutate(category_id = as.factor(category_id))

Explore Data

Identify Good predictors

funny

Youtube %>%
    ggplot(aes(like_count, as.numeric(funny))) +
    scale_x_log10() +
    geom_point() 
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

brand

Youtube %>%
    ggplot(aes(x = brand, y = like_count)) +
    scale_y_log10() +
    geom_boxplot()
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Youtube %>%
    ggplot(aes(like_count, as.numeric(celebrity))) +
    scale_x_log10() +
    geom_point() +
    labs(
        title = "2 = True, 1 = False",
        x = "like count",
        y = "celebrity sponsorship"
    )
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

title

Youtube %>%
  # Group by brand
  group_by(brand) %>%
  
  # Calculate average like count per brand
  summarize(like_count = mean(like_count, na.rm = TRUE),
            n = n()) %>%
  
  # Calculate brands with highest like count
  filter(n > 10) %>%
  slice_max(order_by = like_count, n = 20) %>%
  
  # Plot
  ggplot(aes(x = like_count, y = fct_reorder(brand, like_count))) +
  geom_point() +
  labs(
    title = "Top 6 Brands with Highest Avg Like Count",
    x = "Average Like Count",
    y = "Brand"
  ) 

EDA Shortcut

data_binarized_table <- Youtube %>%
    select(-id, -kind, -etag, -published_at, -channel_title, category_id, -title) %>%
    binarize()


data_binarized_table %>% glimpse
## Rows: 194
## Columns: 45
## $ `year__-Inf_2006`                                    <dbl> 0, 0, 1, 0, 1, 0,…
## $ year__2006_2010                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2010_2014.75                                   <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2014.75_Inf                                    <dbl> 1, 1, 0, 1, 0, 1,…
## $ brand__Bud_Light                                     <dbl> 0, 1, 1, 0, 1, 0,…
## $ brand__Budweiser                                     <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__Coca-Cola`                                   <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Doritos                                       <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__E-Trade`                                     <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Hynudai                                       <dbl> 0, 0, 0, 1, 0, 0,…
## $ brand__Kia                                           <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__NFL                                           <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Pepsi                                         <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Toyota                                        <dbl> 1, 0, 0, 0, 0, 1,…
## $ funny__FALSE                                         <dbl> 1, 0, 0, 1, 0, 0,…
## $ funny__TRUE                                          <dbl> 0, 1, 1, 0, 1, 1,…
## $ show_product_quickly__FALSE                          <dbl> 1, 0, 1, 0, 0, 0,…
## $ show_product_quickly__TRUE                           <dbl> 0, 1, 0, 1, 1, 1,…
## $ patriotic__FALSE                                     <dbl> 1, 1, 1, 1, 1, 1,…
## $ patriotic__TRUE                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ celebrity__FALSE                                     <dbl> 1, 0, 1, 1, 1, 0,…
## $ celebrity__TRUE                                      <dbl> 0, 1, 0, 0, 0, 1,…
## $ danger__FALSE                                        <dbl> 1, 0, 0, 1, 0, 0,…
## $ danger__TRUE                                         <dbl> 0, 1, 1, 0, 1, 1,…
## $ animals__FALSE                                       <dbl> 1, 1, 0, 1, 0, 0,…
## $ animals__TRUE                                        <dbl> 0, 0, 1, 0, 1, 1,…
## $ use_sex__FALSE                                       <dbl> 1, 1, 1, 1, 0, 1,…
## $ use_sex__TRUE                                        <dbl> 0, 0, 0, 0, 1, 0,…
## $ `like_count__-Inf_3.42502492661324`                  <dbl> 0, 0, 0, 1, 1, 0,…
## $ like_count__3.42502492661324_5.11191520361373        <dbl> 0, 0, 1, 0, 0, 1,…
## $ like_count__5.11191520361373_6.38645168838416        <dbl> 0, 1, 0, 0, 0, 0,…
## $ like_count__6.38645168838416_Inf                     <dbl> 1, 0, 0, 0, 0, 0,…
## $ description__Bud_Lighta_Cedric_a_Island_Fantasy_2005 <dbl> 0, 0, 0, 0, 0, 0,…
## $ `description__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1,…
## $ category_id__1                                       <dbl> 1, 0, 0, 0, 0, 1,…
## $ category_id__2                                       <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__10                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__15                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__17                                      <dbl> 0, 0, 1, 0, 0, 0,…
## $ category_id__22                                      <dbl> 0, 0, 0, 1, 0, 0,…
## $ category_id__23                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__24                                      <dbl> 0, 0, 0, 0, 1, 0,…
## $ category_id__25                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__27                                      <dbl> 0, 1, 0, 0, 0, 0,…
## $ `category_id__-OTHER`                                <dbl> 0, 0, 0, 0, 0, 0,…
# Step 2: Correlate
data_corr_table <- data_binarized_table %>% 
    correlate( like_count__6.38645168838416_Inf   )

data_corr_table
## # A tibble: 45 × 3
##    feature    bin                               correlation
##    <fct>      <chr>                                   <dbl>
##  1 like_count 6.38645168838416_Inf                    1    
##  2 like_count -Inf_3.42502492661324                  -0.338
##  3 like_count 3.42502492661324_5.11191520361373      -0.333
##  4 like_count 5.11191520361373_6.38645168838416      -0.333
##  5 brand      NFL                                     0.297
##  6 brand      Doritos                                 0.241
##  7 year       2014.75_Inf                             0.235
##  8 brand      Bud_Light                              -0.229
##  9 year       -Inf_2006                              -0.170
## 10 brand      Hynudai                                -0.145
## # ℹ 35 more rows
# Step 3: Plot
data_corr_table %>%
    plot_correlation_funnel()
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Split Data

# Split into train and test data set
set.seed(123)
youtube_split <- initial_split(Youtube)
youtube_train <- training(youtube_split)
youtube_test <- testing(youtube_split)

# Further split training data set for cross-validation
set.seed(234)
youtube_cv <- vfold_cv(youtube_train)
youtube_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [130/15]> Fold01
##  2 <split [130/15]> Fold02
##  3 <split [130/15]> Fold03
##  4 <split [130/15]> Fold04
##  5 <split [130/15]> Fold05
##  6 <split [131/14]> Fold06
##  7 <split [131/14]> Fold07
##  8 <split [131/14]> Fold08
##  9 <split [131/14]> Fold09
## 10 <split [131/14]> Fold10
library(usemodels)
usemodels::use_xgboost(like_count~., data = youtube_train)
## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = youtube_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(25718)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_youtube_recipe1 <- 
  recipe(like_count ~ title + category_id + brand + funny, data = youtube_train) %>%
    step_other(category_id, threshold = 0.05) %>%  
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_dummy(category_id, brand, funny, one_hot = TRUE) %>%
    step_zv(all_numeric_predictors()) %>%
    step_normalize(all_numeric_predictors()) 

xgboost_youtube_recipe1 %>% prep() %>% juice() %>% glimpse()
## Rows: 145
## Columns: 120
## $ like_count              <dbl> 4.262680, 5.805135, 6.466145, 1.386294, 6.3801…
## $ tfidf_title_2000        <dbl> -0.117854, -0.117854, -0.117854, -0.117854, -0…
## $ tfidf_title_2001        <dbl> -0.111728, -0.111728, -0.111728, -0.111728, -0…
## $ tfidf_title_2002        <dbl> -0.1413508, -0.1413508, 8.5014851, -0.1413508,…
## $ tfidf_title_2005        <dbl> -0.1616337, -0.1616337, -0.1616337, 4.8098280,…
## $ tfidf_title_2006        <dbl> -0.1162285, -0.1162285, -0.1162285, -0.1162285…
## $ tfidf_title_2007        <dbl> -0.1640177, -0.1640177, -0.1640177, -0.1640177…
## $ tfidf_title_2008        <dbl> -0.1166532, -0.1166532, -0.1166532, -0.1166532…
## $ tfidf_title_2009        <dbl> -0.1861636, -0.1861636, -0.1861636, -0.1861636…
## $ tfidf_title_2010        <dbl> -0.2002679, -0.2002679, -0.2002679, -0.2002679…
## $ tfidf_title_2012        <dbl> -0.1860765, -0.1860765, -0.1860765, -0.1860765…
## $ tfidf_title_2013        <dbl> -0.1809111, -0.1809111, -0.1809111, -0.1809111…
## $ tfidf_title_2014        <dbl> -0.2029139, -0.2029139, -0.2029139, -0.2029139…
## $ tfidf_title_2015        <dbl> -0.1416749, -0.1416749, -0.1416749, -0.1416749…
## $ tfidf_title_2016        <dbl> -0.1663372, -0.1663372, -0.1663372, -0.1663372…
## $ tfidf_title_2017        <dbl> -0.1052647, -0.1052647, -0.1052647, -0.1052647…
## $ tfidf_title_2018        <dbl> -0.1815229, -0.1815229, -0.1815229, -0.1815229…
## $ tfidf_title_2019        <dbl> -0.1655967, -0.1655967, -0.1655967, -0.1655967…
## $ tfidf_title_2020        <dbl> -0.1660449, -0.1660449, -0.1660449, -0.1660449…
## $ tfidf_title_44          <dbl> -0.1605625, -0.1605625, -0.1605625, -0.1605625…
## $ tfidf_title_a           <dbl> -0.2041022, -0.2041022, -0.2041022, -0.2041022…
## $ tfidf_title_ad          <dbl> 2.7521488, -0.3592667, -0.3592667, -0.3592667,…
## $ tfidf_title_ads         <dbl> -0.1162285, -0.1162285, -0.1162285, -0.1162285…
## $ tfidf_title_babies      <dbl> -0.1155343, -0.1155343, -0.1155343, -0.1155343…
## $ tfidf_title_baby        <dbl> -0.1741991, -0.1741991, -0.1741991, -0.1741991…
## $ tfidf_title_best        <dbl> -0.1445696, -0.1445696, -0.1445696, -0.1445696…
## $ tfidf_title_big         <dbl> -0.1398253, -0.1398253, -0.1398253, -0.1398253…
## $ tfidf_title_black       <dbl> -0.117854, -0.117854, -0.117854, -0.117854, -0…
## $ tfidf_title_bowl        <dbl> -0.8223737, -0.8223737, -0.8223737, -0.8223737…
## $ tfidf_title_bud         <dbl> -0.4520338, -0.4520338, -0.4520338, -0.4520338…
## $ tfidf_title_budweiser   <dbl> -0.3540843, -0.3540843, 1.2328706, -0.3540843,…
## $ tfidf_title_camry       <dbl> -0.117854, -0.117854, -0.117854, -0.117854, -0…
## $ tfidf_title_car         <dbl> -0.1177585, -0.1177585, -0.1177585, -0.1177585…
## $ tfidf_title_cedric      <dbl> -0.1820238, -0.1820238, -0.1820238, -0.1820238…
## $ tfidf_title_cindy       <dbl> -0.1155343, -0.1155343, -0.1155343, 9.9359461,…
## $ tfidf_title_clydesdale  <dbl> -0.1093193, -0.1093193, -0.1093193, -0.1093193…
## $ tfidf_title_coca        <dbl> -0.1945627, 4.3556941, -0.1945627, -0.1945627,…
## $ tfidf_title_coke        <dbl> -0.1666848, -0.1666848, -0.1666848, -0.1666848…
## $ tfidf_title_cola        <dbl> -0.2088434, 4.2444359, -0.2088434, -0.2088434,…
## $ tfidf_title_commercial  <dbl> -0.844154141, -0.844154141, -0.844154141, -0.8…
## $ tfidf_title_commercials <dbl> -0.1155343, -0.1155343, -0.1155343, -0.1155343…
## $ tfidf_title_cool        <dbl> -0.1175893, -0.1175893, -0.1175893, -0.1175893…
## $ tfidf_title_crash       <dbl> -0.1445438, -0.1445438, -0.1445438, -0.1445438…
## $ tfidf_title_crown       <dbl> -0.117854, -0.117854, -0.117854, -0.117854, -0…
## $ tfidf_title_date        <dbl> -0.1155343, -0.1155343, -0.1155343, -0.1155343…
## $ tfidf_title_diet        <dbl> 6.5241800, -0.1445438, -0.1445438, 6.5241800, …
## $ tfidf_title_dog         <dbl> -0.1746674, -0.1746674, -0.1746674, -0.1746674…
## $ tfidf_title_doritos     <dbl> -0.2414289, -0.2414289, -0.2414289, -0.2414289…
## $ tfidf_title_e           <dbl> -0.2004979, -0.2004979, -0.2004979, -0.2004979…
## $ tfidf_title_elantra     <dbl> -0.1147953, -0.1147953, -0.1147953, -0.1147953…
## $ tfidf_title_epic        <dbl> -0.117854, -0.117854, -0.117854, -0.117854, -0…
## $ tfidf_title_etrade      <dbl> -0.1492281, -0.1492281, -0.1492281, -0.1492281…
## $ tfidf_title_exclusive   <dbl> -0.1447042, -0.1447042, -0.1447042, -0.1447042…
## $ tfidf_title_extended    <dbl> -0.1444393, -0.1444393, -0.1444393, -0.1444393…
## $ tfidf_title_factory     <dbl> -0.1162285, -0.1162285, -0.1162285, -0.1162285…
## $ tfidf_title_fantasy     <dbl> -0.1448484, -0.1448484, -0.1448484, -0.1448484…
## $ tfidf_title_featuring   <dbl> -0.111728, -0.111728, -0.111728, -0.111728, -0…
## $ tfidf_title_flavor      <dbl> -0.09785011, -0.09785011, -0.09785011, -0.0978…
## $ tfidf_title_fly         <dbl> -0.117854, -0.117854, -0.117854, -0.117854, -0…
## $ tfidf_title_full        <dbl> -0.1155343, -0.1155343, -0.1155343, -0.1155343…
## $ tfidf_title_funny       <dbl> -0.1548356, -0.1548356, -0.1548356, -0.1548356…
## $ tfidf_title_game        <dbl> -0.1619201, -0.1619201, -0.1619201, -0.1619201…
## $ tfidf_title_genesis     <dbl> -0.1175893, -0.1175893, -0.1175893, -0.1175893…
## $ tfidf_title_hd          <dbl> -0.1923869, -0.1923869, -0.1923869, -0.1923869…
## $ tfidf_title_hyundai     <dbl> -0.2503641, -0.2503641, -0.2503641, -0.2503641…
## $ tfidf_title_is          <dbl> -0.1434604, -0.1434604, -0.1434604, -0.1434604…
## $ tfidf_title_island      <dbl> -0.1448484, -0.1448484, -0.1448484, -0.1448484…
## $ tfidf_title_it          <dbl> -0.1093193, 10.9865869, -0.1093193, -0.1093193…
## $ tfidf_title_jackie      <dbl> 9.3177979, -0.1171232, -0.1171232, -0.1171232,…
## $ tfidf_title_kia         <dbl> -0.2176276, -0.2176276, -0.2176276, -0.2176276…
## $ tfidf_title_king        <dbl> -0.1166532, -0.1166532, -0.1166532, -0.1166532…
## $ tfidf_title_legends     <dbl> -0.1142881, -0.1142881, -0.1142881, -0.1142881…
## $ tfidf_title_life        <dbl> -0.113922, -0.113922, -0.113922, -0.113922, -0…
## $ tfidf_title_light       <dbl> -0.4272945, -0.4272945, -0.4272945, -0.4272945…
## $ tfidf_title_lighta      <dbl> -0.1448484, -0.1448484, -0.1448484, -0.1448484…
## $ tfidf_title_love        <dbl> -0.1207265, -0.1207265, -0.1207265, -0.1207265…
## $ tfidf_title_meter       <dbl> -0.117854, -0.117854, -0.117854, -0.117854, -0…
## $ tfidf_title_monkey      <dbl> -0.1439811, -0.1439811, -0.1439811, -0.1439811…
## $ tfidf_title_new         <dbl> -0.1791731, -0.1791731, -0.1791731, -0.1791731…
## $ tfidf_title_nfl         <dbl> -0.1381801, -0.1381801, -0.1381801, -0.1381801…
## $ tfidf_title_of          <dbl> -0.138621, -0.138621, -0.138621, -0.138621, -0…
## $ tfidf_title_official    <dbl> -0.1421136, -0.1421136, -0.1421136, -0.1421136…
## $ tfidf_title_on          <dbl> -0.117854, -0.117854, -0.117854, -0.117854, -0…
## $ tfidf_title_one         <dbl> -0.1443393, -0.1443393, -0.1443393, -0.1443393…
## $ tfidf_title_optima      <dbl> -0.1088667, -0.1088667, -0.1088667, -0.1088667…
## $ tfidf_title_pepsi       <dbl> 1.6090111, -0.2816421, -0.2816421, 1.6090111, …
## $ tfidf_title_puppy       <dbl> -0.1131191, -0.1131191, -0.1131191, -0.1131191…
## $ tfidf_title_respect     <dbl> -0.1039988, -0.1039988, 11.4276357, -0.1039988…
## $ tfidf_title_ride        <dbl> -0.1171232, -0.1171232, -0.1171232, -0.1171232…
## $ tfidf_title_spot        <dbl> -0.1421136, -0.1421136, -0.1421136, -0.1421136…
## $ tfidf_title_super       <dbl> -0.8223737, -0.8223737, -0.8223737, -0.8223737…
## $ tfidf_title_superbowl   <dbl> -0.2921432, -0.2921432, -0.2921432, -0.2921432…
## $ tfidf_title_the         <dbl> -0.3286748, -0.3286748, -0.3286748, -0.3286748…
## $ tfidf_title_toyota      <dbl> -0.1715512, -0.1715512, -0.1715512, -0.1715512…
## $ tfidf_title_trade       <dbl> -0.2004979, -0.2004979, -0.2004979, -0.2004979…
## $ tfidf_title_tv          <dbl> -0.15743, -0.15743, -0.15743, -0.15743, -0.157…
## $ tfidf_title_usa         <dbl> -0.1660449, -0.1660449, 7.2043138, -0.1660449,…
## $ tfidf_title_version     <dbl> -0.1655967, -0.1655967, -0.1655967, -0.1655967…
## $ tfidf_title_winner      <dbl> -0.1445438, -0.1445438, -0.1445438, -0.1445438…
## $ tfidf_title_xliii       <dbl> -0.1827813, -0.1827813, -0.1827813, -0.1827813…
## $ tfidf_title_xliv        <dbl> -0.1637861, -0.1637861, -0.1637861, -0.1637861…
## $ category_id_X1          <dbl> -0.3509638, -0.3509638, -0.3509638, 2.8296460,…
## $ category_id_X2          <dbl> -0.3257809, -0.3257809, -0.3257809, -0.3257809…
## $ category_id_X17         <dbl> -0.3257809, -0.3257809, -0.3257809, -0.3257809…
## $ category_id_X22         <dbl> -0.2855231, 3.4781900, -0.2855231, -0.2855231,…
## $ category_id_X23         <dbl> -0.4658117, -0.4658117, -0.4658117, -0.4658117…
## $ category_id_X24         <dbl> 1.3129652, -0.7563821, 1.3129652, -0.7563821, …
## $ category_id_other       <dbl> -0.2855231, -0.2855231, -0.2855231, -0.2855231…
## $ brand_Bud.Light         <dbl> -0.5409352, -0.5409352, -0.5409352, -0.5409352…
## $ brand_Budweiser         <dbl> -0.4548588, -0.4548588, 2.1833224, -0.4548588,…
## $ brand_Coca.Cola         <dbl> -0.2563593, 3.8738736, -0.2563593, -0.2563593,…
## $ brand_Doritos           <dbl> -0.3127389, -0.3127389, -0.3127389, -0.3127389…
## $ brand_E.Trade           <dbl> -0.2712254, -0.2712254, -0.2712254, -0.2712254…
## $ brand_Hynudai           <dbl> -0.3509638, -0.3509638, -0.3509638, -0.3509638…
## $ brand_Kia               <dbl> -0.2563593, -0.2563593, -0.2563593, -0.2563593…
## $ brand_NFL               <dbl> -0.2244433, -0.2244433, -0.2244433, -0.2244433…
## $ brand_Pepsi             <dbl> 2.9337513, -0.3385098, -0.3385098, 2.9337513, …
## $ brand_Toyota            <dbl> -0.2408141, -0.2408141, -0.2408141, -0.2408141…
## $ funny_FALSE.            <dbl> -0.7010226, 1.4166498, 1.4166498, -0.7010226, …
## $ funny_TRUE.             <dbl> 0.7010226, -1.4166498, -1.4166498, 0.7010226, …
# Specify Model
xgboost_spec_youtube1 <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
    loss_reduction = tune(), sample_size = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

# Combine Recipe and Model Using Workflow
xgboost_workflow_youtube1 <- 
  workflow() %>% 
  add_recipe(xgboost_youtube_recipe1) %>% 
  add_model(xgboost_spec_youtube1)

# Tune Hyperparameters
set.seed(678)
tuned_youtube1 <-
  tune_grid(xgboost_workflow_youtube1,
            resamples = youtube_cv,
            grid = 5)
youtube_recipe_rf <- recipe(like_count ~ description + brand + funny + celebrity, data = youtube_train) %>%
    step_tokenize(description) %>%
    step_tokenfilter(description, max_tokens = 50) %>%
    step_stopwords(description) %>%
    step_tf(description) %>%
    step_dummy(celebrity, brand, funny, one_hot = TRUE) %>%
    step_zv(all_numeric_predictors()) %>%
    step_normalize(all_numeric_predictors()) 

youtube_recipe_rf %>% prep() %>% juice() %>% glimpse() 
## Rows: 145
## Columns: 43
## $ like_count                     <dbl> 4.262680, 5.805135, 6.466145, 1.386294,…
## $ tf_description_ad              <dbl> -0.3509638, -0.3509638, -0.3509638, -0.…
## $ tf_description_ads             <dbl> -0.1994087, -0.1994087, -0.1994087, -0.…
## $ tf_description_best            <dbl> -0.1678486, -0.1678486, -0.1678486, -0.…
## $ tf_description_bowl            <dbl> 0.2257614, -0.6356965, 0.2257614, -0.63…
## $ tf_description_bud             <dbl> -0.4037686, -0.4037686, -0.4037686, -0.…
## $ tf_description_budweiser       <dbl> -0.2541298, -0.2541298, 0.8625012, -0.2…
## $ tf_description_channel         <dbl> -0.1772426, -0.1772426, -0.1772426, -0.…
## $ tf_description_commercial      <dbl> 0.3950380, -0.6098832, 0.3950380, -0.60…
## $ tf_description_commercials     <dbl> -0.2358729, -0.2358729, -0.2358729, -0.…
## $ tf_description_game            <dbl> -0.3034305, -0.3034305, -0.3034305, -0.…
## $ tf_description_http            <dbl> -0.3018556, -0.3018556, -0.3018556, -0.…
## $ tf_description_https           <dbl> -0.2905620, -0.2905620, -0.2905620, 0.8…
## $ tf_description_hyundai         <dbl> -0.1643681, -0.1643681, -0.1643681, -0.…
## $ tf_description_kia             <dbl> -0.1602503, -0.1602503, -0.1602503, -0.…
## $ tf_description_light           <dbl> -0.3851398, -0.3851398, -0.3851398, -0.…
## $ tf_description_new             <dbl> -0.2797697, -0.2797697, 1.3428945, -0.2…
## $ tf_description_nfl             <dbl> -0.175743, -0.175743, -0.175743, -0.175…
## $ tf_description_one             <dbl> -0.3369666, -0.3369666, -0.3369666, -0.…
## $ tf_description_pepsi           <dbl> 1.0353574, -0.1851859, -0.1851859, 1.03…
## $ tf_description_super           <dbl> 0.2189659, -0.6391437, 0.2189659, -0.63…
## $ tf_description_superbowl       <dbl> -0.305571, -0.305571, -0.305571, -0.305…
## $ tf_description_team            <dbl> -0.1906577, -0.1906577, -0.1906577, -0.…
## $ tf_description_today           <dbl> -0.1433916, -0.1433916, -0.1433916, -0.…
## $ tf_description_toyota          <dbl> -0.1657364, -0.1657364, -0.1657364, -0.…
## $ tf_description_us              <dbl> -0.2070453, -0.2070453, -0.2070453, -0.…
## $ tf_description_watch           <dbl> -0.2567387, -0.2567387, -0.2567387, -0.…
## $ tf_description_www.nfl.com     <dbl> -0.1678486, -0.1678486, -0.1678486, -0.…
## $ tf_description_www.youtube.com <dbl> -0.1991718, -0.1991718, -0.1991718, -0.…
## $ celebrity_FALSE.               <dbl> -1.5098423, 0.6577531, 0.6577531, -1.50…
## $ celebrity_TRUE.                <dbl> 1.5098423, -0.6577531, -0.6577531, 1.50…
## $ brand_Bud.Light                <dbl> -0.5409352, -0.5409352, -0.5409352, -0.…
## $ brand_Budweiser                <dbl> -0.4548588, -0.4548588, 2.1833224, -0.4…
## $ brand_Coca.Cola                <dbl> -0.2563593, 3.8738736, -0.2563593, -0.2…
## $ brand_Doritos                  <dbl> -0.3127389, -0.3127389, -0.3127389, -0.…
## $ brand_E.Trade                  <dbl> -0.2712254, -0.2712254, -0.2712254, -0.…
## $ brand_Hynudai                  <dbl> -0.3509638, -0.3509638, -0.3509638, -0.…
## $ brand_Kia                      <dbl> -0.2563593, -0.2563593, -0.2563593, -0.…
## $ brand_NFL                      <dbl> -0.2244433, -0.2244433, -0.2244433, -0.…
## $ brand_Pepsi                    <dbl> 2.9337513, -0.3385098, -0.3385098, 2.93…
## $ brand_Toyota                   <dbl> -0.2408141, -0.2408141, -0.2408141, -0.…
## $ funny_FALSE.                   <dbl> -0.7010226, 1.4166498, 1.4166498, -0.70…
## $ funny_TRUE.                    <dbl> 0.7010226, -1.4166498, -1.4166498, 0.70…
rf_model <- rand_forest(mtry = tune(), trees = tune(), min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("regression")

rf_workflow <- workflow() %>%
  add_recipe(youtube_recipe_rf) %>%
  add_model(rf_model)

# Tune Hyperparameters
set.seed(123)
tuned_youtube_rf <-
  tune_grid(rf_workflow,
            resamples = youtube_cv,
            grid = 5)
## Warning: package 'stopwords' was built under R version 4.4.3
# Update model by selecting best hyperparameters
rf_fw_youtube <- tune::finalize_workflow(rf_workflow,
                        tune::select_best(tuned_youtube_rf, metric = "rmse"))

# Fit model on entire training data and test it on test data
youtube_fit <- tune::last_fit(rf_fw_youtube, youtube_split)
tune::collect_metrics(youtube_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      2.45   Preprocessor1_Model1
## 2 rsq     standard      0.0188 Preprocessor1_Model1
tune::collect_metrics(youtube_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      2.45   Preprocessor1_Model1
## 2 rsq     standard      0.0188 Preprocessor1_Model1
tune::collect_predictions(youtube_fit) %>%
    ggplot(aes(like_count, .pred)) +
    geom_point(alpha = 0.3, fill = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") + 
    coord_fixed()

In my updated data set, I decided to experiment with random forest. I replaced xgboost with this. In addition I included the use of celebrity with this set, and I replaced title with description as a predictor. These changes improved the rmse from 2.62 to 2.44 and decreased R squared from .0668 to .02.