Goal: What factors lead to the most youtube likes?

Import Data

likes <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')

skimr::skim(likes)
Data summary
Name likes
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
Youtube <- likes %>%
    select(-thumbnail, -superbowl_ads_dot_com_url, -youtube_url, -dislike_count, -view_count, -comment_count, -description) %>%
    na.omit() %>%

    # Transform data
    mutate(like_count = log(like_count + 1)) %>%
    mutate(across(where(is.logical), as.factor)) %>%
    mutate(across(where(is.character), as.factor)) %>%
    # Keep title as character as it's not a categorical variable, but a string
    mutate(title = as.character(title)) %>%
    mutate(category_id = as.factor(category_id)) 

Explore Data

Identify Good predictors

funny

Youtube %>%
    ggplot(aes(like_count, as.numeric(funny))) +
    scale_x_log10() +
    geom_point() 
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

brand

Youtube %>%
    ggplot(aes(x = brand, y = like_count)) +
    scale_y_log10() +
    geom_boxplot()
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

title

Youtube %>%
  # Group by brand
  group_by(brand) %>%
  
  # Calculate average like count per brand
  summarize(like_count = mean(like_count, na.rm = TRUE),
            n = n()) %>%
  
  # Calculate brands with highest like count
  filter(n > 10) %>%
  slice_max(order_by = like_count, n = 20) %>%
  
  # Plot
  ggplot(aes(x = like_count, y = fct_reorder(brand, like_count))) +
  geom_point() +
  labs(
    title = "Top 6 Brands with Highest Avg Like Count",
    x = "Average Like Count",
    y = "Brand"
  ) 

EDA Shortcut

data_binarized_table <- Youtube %>%
    select(-id, -kind, -etag, -published_at, -channel_title, category_id, -title) %>%
    binarize()


data_binarized_table %>% glimpse
## Rows: 225
## Columns: 43
## $ `year__-Inf_2005`                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf                                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light                              <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola`                            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai                                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota                                 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE                                  <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE                                   <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE                   <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE                    <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE                              <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE                               <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE                                 <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE                                  <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE                                <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE                                 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE                                <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.99573227355399`           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.99573227355399_4.87519732320115 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.87519732320115_6.26909628370626 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26909628370626_Inf              <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id__1                                <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2                                <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22                               <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24                               <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27                               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
data_corr_table <- data_binarized_table %>% 
    correlate(  like_count__6.26909628370626_Inf)

data_corr_table
## # A tibble: 43 × 3
##    feature     bin                               correlation
##    <fct>       <chr>                                   <dbl>
##  1 like_count  6.26909628370626_Inf                    1    
##  2 like_count  -Inf_2.99573227355399                  -0.339
##  3 like_count  4.87519732320115_6.26909628370626      -0.331
##  4 like_count  2.99573227355399_4.87519732320115      -0.327
##  5 brand       Doritos                                 0.281
##  6 brand       NFL                                     0.250
##  7 brand       Bud_Light                              -0.212
##  8 year        2015_Inf                                0.202
##  9 year        -Inf_2005                              -0.193
## 10 category_id 1                                       0.133
## # ℹ 33 more rows
# Step 3: Plot
data_corr_table %>%
    plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Split Data

# Split into train and test data set
set.seed(123)
youtube_split <- initial_split(Youtube)
youtube_train <- training(youtube_split)
youtube_test <- testing(youtube_split)

# Further split training data set for cross-validation
set.seed(234)
youtube_cv <- vfold_cv(youtube_train)
youtube_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [151/17]> Fold01
##  2 <split [151/17]> Fold02
##  3 <split [151/17]> Fold03
##  4 <split [151/17]> Fold04
##  5 <split [151/17]> Fold05
##  6 <split [151/17]> Fold06
##  7 <split [151/17]> Fold07
##  8 <split [151/17]> Fold08
##  9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
library(usemodels)
usemodels::use_xgboost(like_count~., data = youtube_train)
## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = youtube_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(18555)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_youtube_recipe1 <- 
  recipe(like_count ~ title + category_id + brand + funny, data = youtube_train) %>%
    step_other(category_id, threshold = 0.05) %>%  
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title) %>%
    step_dummy(category_id, brand, funny, one_hot = TRUE) %>%
    step_zv(all_numeric_predictors()) %>%
    step_normalize(all_numeric_predictors()) 

xgboost_youtube_recipe1 %>% prep() %>% juice() %>% glimpse()
## Rows: 168
## Columns: 120
## $ like_count              <dbl> 4.700480, 5.805135, 5.123964, 6.466145, 1.6094…
## $ tfidf_title_2000        <dbl> -0.1336354, -0.1336354, -0.1336354, -0.1336354…
## $ tfidf_title_2001        <dbl> -0.1296382, -0.1296382, -0.1296382, -0.1296382…
## $ tfidf_title_2002        <dbl> -0.1312009, -0.1312009, -0.1312009, 9.1635120,…
## $ tfidf_title_2005        <dbl> -0.1609131, -0.1609131, -0.1609131, -0.1609131…
## $ tfidf_title_2006        <dbl> -0.1079305, -0.1079305, 7.4472054, -0.1079305,…
## $ tfidf_title_2007        <dbl> -0.188392, -0.188392, -0.188392, -0.188392, -0…
## $ tfidf_title_2008        <dbl> -0.1091109, -0.1091109, -0.1091109, -0.1091109…
## $ tfidf_title_2009        <dbl> -0.1542525, -0.1542525, -0.1542525, -0.1542525…
## $ tfidf_title_2010        <dbl> -0.1157272, -0.1157272, -0.1157272, -0.1157272…
## $ tfidf_title_2011        <dbl> -0.1026157, -0.1026157, -0.1026157, -0.1026157…
## $ tfidf_title_2012        <dbl> -0.1659273, -0.1659273, -0.1659273, -0.1659273…
## $ tfidf_title_2013        <dbl> -0.198471, -0.198471, -0.198471, -0.198471, -0…
## $ tfidf_title_2014        <dbl> -0.256473, -0.256473, -0.256473, -0.256473, -0…
## $ tfidf_title_2015        <dbl> -0.1515659, -0.1515659, -0.1515659, -0.1515659…
## $ tfidf_title_2016        <dbl> -0.1715368, -0.1715368, -0.1715368, -0.1715368…
## $ tfidf_title_2018        <dbl> -0.196099, -0.196099, -0.196099, -0.196099, -0…
## $ tfidf_title_2019        <dbl> -0.1533443, -0.1533443, -0.1533443, -0.1533443…
## $ tfidf_title_2020        <dbl> -0.1522508, -0.1522508, -0.1522508, -0.1522508…
## $ tfidf_title_44          <dbl> -0.1257324, -0.1257324, -0.1257324, -0.1257324…
## $ tfidf_title_a           <dbl> -0.2222684, -0.2222684, -0.2222684, -0.2222684…
## $ tfidf_title_ad          <dbl> -0.3519862, -0.3519862, -0.3519862, -0.3519862…
## $ tfidf_title_and         <dbl> -0.1321017, -0.1321017, -0.1321017, -0.1321017…
## $ tfidf_title_babies      <dbl> -0.107287, -0.107287, -0.107287, -0.107287, -0…
## $ tfidf_title_baby        <dbl> -0.1171696, -0.1171696, -0.1171696, -0.1171696…
## $ tfidf_title_beer        <dbl> -0.106132, -0.106132, -0.106132, -0.106132, -0…
## $ tfidf_title_best        <dbl> -0.1091918, -0.1091918, -0.1091918, -0.1091918…
## $ tfidf_title_big         <dbl> -0.1548758, -0.1548758, -0.1548758, -0.1548758…
## $ tfidf_title_bowl        <dbl> -0.7971586, -0.7971586, 0.7201022, -0.7971586,…
## $ tfidf_title_britney     <dbl> -0.1079305, -0.1079305, -0.1079305, -0.1079305…
## $ tfidf_title_bud         <dbl> -0.4447197, -0.4447197, 0.6695668, -0.4447197,…
## $ tfidf_title_budweiser   <dbl> -0.3463570, -0.3463570, -0.3463570, 1.1177438,…
## $ tfidf_title_cedric      <dbl> -0.186015, -0.186015, -0.186015, -0.186015, 4.…
## $ tfidf_title_cindy       <dbl> -0.1091109, -0.1091109, -0.1091109, -0.1091109…
## $ tfidf_title_clydesdale  <dbl> -0.1304693, -0.1304693, -0.1304693, -0.1304693…
## $ tfidf_title_coca        <dbl> -0.2648954, 4.1042070, -0.2648954, -0.2648954,…
## $ tfidf_title_coke        <dbl> -0.1514581, -0.1514581, -0.1514581, -0.1514581…
## $ tfidf_title_cola        <dbl> -0.2648954, 4.1042070, -0.2648954, -0.2648954,…
## $ tfidf_title_commercial  <dbl> 1.4121698, -0.8403560, -0.8403560, -0.8403560,…
## $ tfidf_title_commercials <dbl> -0.1518487, -0.1518487, -0.1518487, -0.1518487…
## $ tfidf_title_cool        <dbl> -0.1094371, -0.1094371, -0.1094371, -0.1094371…
## $ tfidf_title_crash       <dbl> -0.1554245, -0.1554245, -0.1554245, -0.1554245…
## $ tfidf_title_date        <dbl> -0.107287, -0.107287, -0.107287, -0.107287, -0…
## $ tfidf_title_diet        <dbl> -0.1292761, -0.1292761, -0.1292761, -0.1292761…
## $ tfidf_title_dilly       <dbl> -0.07715167, -0.07715167, -0.07715167, -0.0771…
## $ tfidf_title_dog         <dbl> -0.107287, -0.107287, -0.107287, -0.107287, -0…
## $ tfidf_title_doritos     <dbl> -0.2927531, -0.2927531, -0.2927531, -0.2927531…
## $ tfidf_title_down        <dbl> -0.1020634, -0.1020634, -0.1020634, -0.1020634…
## $ tfidf_title_e           <dbl> -0.1731492, -0.1731492, -0.1731492, -0.1731492…
## $ tfidf_title_elantra     <dbl> -0.1091109, -0.1091109, -0.1091109, -0.1091109…
## $ tfidf_title_epic        <dbl> -0.106132, -0.106132, -0.106132, -0.106132, -0…
## $ tfidf_title_etrade      <dbl> -0.1244886, -0.1244886, -0.1244886, -0.1244886…
## $ tfidf_title_extended    <dbl> -0.1307519, -0.1307519, -0.1307519, -0.1307519…
## $ tfidf_title_factory     <dbl> -0.1079305, -0.1079305, -0.1079305, -0.1079305…
## $ tfidf_title_fantasy     <dbl> -0.1557083, -0.1557083, -0.1557083, -0.1557083…
## $ tfidf_title_featuring   <dbl> -0.1037587, -0.1037587, -0.1037587, -0.1037587…
## $ tfidf_title_flavor      <dbl> -0.09088885, -0.09088885, -0.09088885, -0.0908…
## $ tfidf_title_funny       <dbl> 9.3221631, -0.1436779, -0.1436779, -0.1436779,…
## $ tfidf_title_game        <dbl> -0.1738631, -0.1738631, -0.1738631, -0.1738631…
## $ tfidf_title_genesis     <dbl> -0.1055369, -0.1055369, -0.1055369, -0.1055369…
## $ tfidf_title_girlfriend  <dbl> -0.1083241, -0.1083241, -0.1083241, -0.1083241…
## $ tfidf_title_halftime    <dbl> -0.07715167, -0.07715167, -0.07715167, -0.0771…
## $ tfidf_title_hd          <dbl> -0.2088777, -0.2088777, -0.2088777, -0.2088777…
## $ tfidf_title_hyundai     <dbl> -0.2503603, -0.2503603, -0.2503603, -0.2503603…
## $ tfidf_title_inside      <dbl> -0.09776579, -0.09776579, -0.09776579, -0.0977…
## $ tfidf_title_is          <dbl> -0.1498258, -0.1498258, -0.1498258, -0.1498258…
## $ tfidf_title_island      <dbl> -0.1557083, -0.1557083, -0.1557083, -0.1557083…
## $ tfidf_title_it          <dbl> -0.1015255, 11.8378694, -0.1015255, -0.1015255…
## $ tfidf_title_journey     <dbl> -0.1015255, -0.1015255, -0.1015255, -0.1015255…
## $ tfidf_title_kia         <dbl> -0.2145999, -0.2145999, -0.2145999, -0.2145999…
## $ tfidf_title_king        <dbl> -0.1083241, -0.1083241, -0.1083241, -0.1083241…
## $ tfidf_title_legends     <dbl> -0.1315009, -0.1315009, -0.1315009, -0.1315009…
## $ tfidf_title_light       <dbl> -0.4205641, -0.4205641, 0.7320422, -0.4205641,…
## $ tfidf_title_lighta      <dbl> -0.1557083, -0.1557083, -0.1557083, -0.1557083…
## $ tfidf_title_love        <dbl> -0.1120992, -0.1120992, -0.1120992, -0.1120992…
## $ tfidf_title_meter       <dbl> -0.1087598, -0.1087598, -0.1087598, -0.1087598…
## $ tfidf_title_monkey      <dbl> -0.1094371, -0.1094371, -0.1094371, -0.1094371…
## $ tfidf_title_new         <dbl> -0.1629016, -0.1629016, -0.1629016, -0.1629016…
## $ tfidf_title_nfl         <dbl> -0.1035113, -0.1035113, -0.1035113, -0.1035113…
## $ tfidf_title_of          <dbl> -0.1155596, -0.1155596, -0.1155596, -0.1155596…
## $ tfidf_title_official    <dbl> -0.204087, -0.204087, -0.204087, -0.204087, -0…
## $ tfidf_title_on          <dbl> -0.1091109, -0.1091109, 8.3511788, -0.1091109,…
## $ tfidf_title_one         <dbl> -0.1015255, -0.1015255, -0.1015255, -0.1015255…
## $ tfidf_title_party       <dbl> -0.1020634, -0.1020634, -0.1020634, -0.1020634…
## $ tfidf_title_pepsi       <dbl> 2.3489833, -0.2902986, -0.2902986, -0.2902986,…
## $ tfidf_title_puppy       <dbl> -0.07715167, -0.07715167, -0.07715167, -0.0771…
## $ tfidf_title_respect     <dbl> -0.09776579, -0.09776579, -0.09776579, 12.2207…
## $ tfidf_title_spot        <dbl> -0.1164506, -0.1164506, -0.1164506, -0.1164506…
## $ tfidf_title_starring    <dbl> -0.123236, -0.123236, -0.123236, -0.123236, -0…
## $ tfidf_title_super       <dbl> -0.7971586, -0.7971586, 0.7201022, -0.7971586,…
## $ tfidf_title_superbowl   <dbl> -0.2286568, -0.2286568, -0.2286568, -0.2286568…
## $ tfidf_title_team        <dbl> -0.1292761, -0.1292761, -0.1292761, -0.1292761…
## $ tfidf_title_the         <dbl> -0.3171074, -0.3171074, 1.6224815, -0.3171074,…
## $ tfidf_title_toyota      <dbl> -0.1975127, -0.1975127, -0.1975127, -0.1975127…
## $ tfidf_title_trade       <dbl> -0.1731492, -0.1731492, -0.1731492, -0.1731492…
## $ tfidf_title_tv          <dbl> -0.1500656, -0.1500656, -0.1500656, -0.1500656…
## $ tfidf_title_usa         <dbl> -0.1730928, -0.1730928, -0.1730928, 6.9776275,…
## $ tfidf_title_vs          <dbl> -0.132349, -0.132349, -0.132349, -0.132349, -0…
## $ tfidf_title_winner      <dbl> -0.1554245, -0.1554245, -0.1554245, -0.1554245…
## $ tfidf_title_with        <dbl> -0.1510437, -0.1510437, -0.1510437, -0.1510437…
## $ tfidf_title_xliii       <dbl> -0.1294588, -0.1294588, -0.1294588, -0.1294588…
## $ category_id_X1          <dbl> -0.2887416, -0.2887416, -0.2887416, -0.2887416…
## $ category_id_X2          <dbl> -0.2372063, -0.2372063, -0.2372063, -0.2372063…
## $ category_id_X17         <dbl> -0.2765234, -0.2765234, -0.2765234, -0.2765234…
## $ category_id_X22         <dbl> 2.2294031, 2.2294031, -0.4458806, -0.4458806, …
## $ category_id_X23         <dbl> -0.4458806, -0.4458806, 2.2294031, -0.4458806,…
## $ category_id_X24         <dbl> -0.7527948, -0.7527948, -0.7527948, 1.3204762,…
## $ category_id_other       <dbl> -0.3345335, -0.3345335, -0.3345335, -0.3345335…
## $ brand_Bud.Light         <dbl> -0.5298694, -0.5298694, -0.5298694, -0.5298694…
## $ brand_Budweiser         <dbl> -0.4266248, -0.4266248, 2.3300277, 2.3300277, …
## $ brand_Coca.Cola         <dbl> -0.3234758, 3.0730201, -0.3234758, -0.3234758,…
## $ brand_Doritos           <dbl> -0.3870245, -0.3870245, -0.3870245, -0.3870245…
## $ brand_E.Trade           <dbl> -0.2229403, -0.2229403, -0.2229403, -0.2229403…
## $ brand_Hynudai           <dbl> -0.3560305, -0.3560305, -0.3560305, -0.3560305…
## $ brand_Kia               <dbl> -0.2372063, -0.2372063, -0.2372063, -0.2372063…
## $ brand_NFL               <dbl> -0.1746203, -0.1746203, -0.1746203, -0.1746203…
## $ brand_Pepsi             <dbl> 2.7920290, -0.3560305, -0.3560305, -0.3560305,…
## $ brand_Toyota            <dbl> -0.2078929, -0.2078929, -0.2078929, -0.2078929…
## $ funny_FALSE.            <dbl> -0.6582574, 1.5101199, -0.6582574, 1.5101199, …
## $ funny_TRUE.             <dbl> 0.6582574, -1.5101199, 0.6582574, -1.5101199, …
# Specify Model
xgboost_spec_youtube1 <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
    loss_reduction = tune(), sample_size = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

# Combine Recipe and Model Using Workflow
xgboost_workflow_youtube1 <- 
  workflow() %>% 
  add_recipe(xgboost_youtube_recipe1) %>% 
  add_model(xgboost_spec_youtube1)

# Tune Hyperparameters
set.seed(678)
tuned_youtube1 <-
  tune_grid(xgboost_workflow_youtube1,
            resamples = youtube_cv,
            grid = 5)
tune::show_best(tuned_youtube1, metric = "rmse")
## # A tibble: 5 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1   403     5          1    0.0243        1.15e- 6       0.201 rmse   
## 2   861    39          6    0.00296       7.32e+ 0       0.697 rmse   
## 3  1777    18          8    0.152         1.91e- 5       0.380 rmse   
## 4   390    30         13    0.00459       8.81e- 2       0.465 rmse   
## 5  1205    13         10    0.0559        3.00e-10       0.828 rmse   
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>
# Update model by selecting best hyperparameters
xgboost_fw_youtube1 <- tune::finalize_workflow(xgboost_workflow_youtube1,
                        tune::select_best(tuned_youtube1, metric = "rmse"))

# Fit model on entire training data and test it on test data
youtube_fit <- tune::last_fit(xgboost_fw_youtube1, youtube_split)
tune::collect_metrics(youtube_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      2.62   Preprocessor1_Model1
## 2 rsq     standard      0.0668 Preprocessor1_Model1
tune::collect_predictions(youtube_fit) %>%
    ggplot(aes(like_count, .pred)) +
    geom_point(alpha = 0.3, fill = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") + 
    coord_fixed()