Superbowl commercials: Build a regression model to predict the Youtube like count (like_count). Use the youtube dataset.

Import Data

youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean data

youtube %>% glimpse()
## Rows: 247
## Columns: 25
## $ year                      <dbl> 2018, 2020, 2006, 2018, 2003, 2020, 2020, 20…
## $ brand                     <chr> "Toyota", "Bud Light", "Bud Light", "Hynudai…
## $ superbowl_ads_dot_com_url <chr> "https://superbowl-ads.com/good-odds-toyota/…
## $ youtube_url               <chr> "https://www.youtube.com/watch?v=zeBZvwYQ-hA…
## $ funny                     <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, …
## $ show_product_quickly      <lgl> FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE,…
## $ patriotic                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ celebrity                 <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE…
## $ danger                    <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE,…
## $ animals                   <lgl> FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE,…
## $ use_sex                   <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FAL…
## $ id                        <chr> "zeBZvwYQ-hA", "nbbp0VW7z8w", "yk0MQD5YgV8",…
## $ kind                      <chr> "youtube#video", "youtube#video", "youtube#v…
## $ etag                      <chr> "rn-ggKNly38Cl0C3CNjNnUH9xUw", "1roDoK-SYqSp…
## $ view_count                <dbl> 173929, 47752, 142310, 198, 13741, 23636, 30…
## $ like_count                <dbl> 1233, 485, 129, 2, 20, 115, 1470, 78, 342, 7…
## $ dislike_count             <dbl> 38, 14, 15, 0, 3, 11, 384, 6, 7, 0, 14, 0, 2…
## $ favorite_count            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ comment_count             <dbl> NA, 14, 9, 0, 2, 13, 227, 6, 30, 0, 8, 1, 13…
## $ published_at              <dttm> 2018-02-03 11:29:14, 2020-01-31 21:04:13, 2…
## $ title                     <chr> "Toyota Super Bowl Commercial 2018 Good Odds…
## $ description               <chr> "Toyota Super Bowl Commercial 2018 Good Odds…
## $ thumbnail                 <chr> "https://i.ytimg.com/vi/zeBZvwYQ-hA/sddefaul…
## $ channel_title             <chr> "Funny Commercials", "VCU Brandcenter", "Joh…
## $ category_id               <dbl> 1, 27, 17, 22, 24, 1, 24, 2, 24, 24, 24, 24,…
youtube %>% skimr::skim()
Data summary
Name Piped data
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
data_clean <- youtube %>% 
    
    # Treat missing values
    filter(!is.na(like_count)) %>%
    
    # Convert to factor
    mutate(category_id = factor(category_id)) %>%
    
    # Log Transform like_count
    mutate(like_count = log10(like_count + 1))

Explore Data

data_clean %>% select(-id) %>% explore()
data_clean %>% describe_all()
data_clean %>% describe_cat(author)
data_clean %>% select(-id) %>% explore_all(target = like_count)

target variable

Check the distribution

data_clean %>%
    
    ggplot(aes(like_count)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

data_clean %>% count(category_id, sort = TRUE)
## # A tibble: 13 × 2
##    category_id     n
##    <fct>       <int>
##  1 24             80
##  2 23             37
##  3 22             36
##  4 1              18
##  5 17             17
##  6 2              16
##  7 10              6
##  8 25              5
##  9 27              4
## 10 15              3
## 11 19              1
## 12 26              1
## 13 29              1

category_id

data_clean %>%
    
    group_by(category_id) %>%
    summarise(avg_like_count = mean(like_count)) %>%
    ungroup() %>%
    
    slice_max(order_by = avg_like_count, n = 10) %>%
    
    ggplot(aes(avg_like_count, fct_reorder(category_id, avg_like_count))) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "Top Categories by Like Counts",
         x = "Avg. Like Counts", y = NULL)

title

data_clean %>% 
    tidytext::unnest_tokens(word, title) %>%
    anti_join(stop_words) %>%
  
    group_by(word) %>%
      summarise(
        n = n(),
        avg_like_count = mean(like_count)
      ) %>%
      ggplot(aes(n, avg_like_count)) +
      geom_hline(
        yintercept = mean(data_clean$like_count), lty = 2,
        color = "gray50", size = 1.5
      ) +
      geom_jitter(color = "midnightblue", alpha = 0.7) +
      geom_text(aes(label = word),
        check_overlap = TRUE, family = "IBMPlexSans",
        vjust = "top", hjust = "left"
      ) +
      scale_x_log10() 
## Joining with `by = join_by(word)`

logical variables

Examine logical variables: The plot shows these logical variables have no predictive power.

data_clean %>%
    select(like_count, funny:use_sex) %>%
    pivot_longer(cols = funny:use_sex) %>%
    
    # Plot
    ggplot(aes(value, like_count)) +
    geom_boxplot() +
    facet_wrap(~name)

brand

The plot shows brand may be a good predictor.

data_clean %>%
    ggplot(aes(brand, like_count)) +
    geom_boxplot()

Description

It may be a good predictor.

data_clean %>% 
    tidytext::unnest_tokens(word, description) %>%
    anti_join(stop_words) %>%
  
    group_by(word) %>%
      summarise(
        n = n(),
        avg_like_count = mean(like_count)
      ) %>%
      ggplot(aes(n, avg_like_count)) +
      geom_hline(
        yintercept = mean(data_clean$like_count), lty = 2,
        color = "gray50", linewidth = 1.5
      ) +
      geom_jitter(color = "midnightblue", alpha = 0.7) +
      geom_text(aes(label = word),
        check_overlap = TRUE, family = "IBMPlexSans",
        vjust = "top", hjust = "left"
      ) +
      scale_x_log10() 
## Joining with `by = join_by(word)`

channel title

data_clean %>% count(channel_title, sort = T)
## # A tibble: 181 × 2
##    channel_title               n
##    <chr>                   <int>
##  1 NFL                         5
##  2 omon007                     5
##  3 BudBowlXLII                 4
##  4 reggiep08v2                 4
##  5 Coca-Cola                   3
##  6 Funny Commercials           3
##  7 John Keehler                3
##  8 The Hall of Advertising     3
##  9 USA TODAY                   3
## 10 World Hyundai Matteson      3
## # ℹ 171 more rows
data_clean %>%
    
    group_by(channel_title) %>%
    summarise(avg_like_count = mean(like_count)) %>%
    ungroup() %>%
    
    slice_max(order_by = avg_like_count, n = 10) %>%
    
    ggplot(aes(avg_like_count, fct_reorder(channel_title, avg_like_count))) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "Top Categories by Like Counts",
         x = "Avg. Like Counts", y = NULL)

year

data_clean %>% count(year, sort = T)
## # A tibble: 21 × 2
##     year     n
##    <dbl> <int>
##  1  2009    15
##  2  2012    15
##  3  2007    14
##  4  2013    14
##  5  2008    13
##  6  2010    13
##  7  2018    13
##  8  2001    12
##  9  2019    12
## 10  2004    11
## # ℹ 11 more rows
data_clean  %>%
    ggplot(aes(year)) +
    geom_bar()

data_clean %>%
    
    group_by(year) %>%
    summarise(avg_like_count = mean(like_count)) %>%
    ungroup() %>%
    
    # slice_max(order_by = avg_like_count, n = 10) %>%
    
    ggplot(aes(year, avg_like_count)) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "Top Categories by Like Counts",
         x = "Avg. Like Counts", y = NULL)

data_processed <- data_clean %>%
    
    select(id, like_count, category_id, description, brand, channel_title, year)

Build a Model

# Try with a small dataset first to be sure the code is error-free
# data_processed <- sample_n(data_processed, 100)


set.seed(123)
data_split <- initial_split(data_processed, strata = like_count)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- bootstraps(data_train, strata = like_count)
data_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits           id         
##    <list>           <chr>      
##  1 <split [168/62]> Bootstrap01
##  2 <split [168/65]> Bootstrap02
##  3 <split [168/58]> Bootstrap03
##  4 <split [168/59]> Bootstrap04
##  5 <split [168/62]> Bootstrap05
##  6 <split [168/66]> Bootstrap06
##  7 <split [168/58]> Bootstrap07
##  8 <split [168/67]> Bootstrap08
##  9 <split [168/67]> Bootstrap09
## 10 <split [168/62]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)
use_xgboost(like_count ~., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(12984)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <- 
  recipe(formula = like_count ~ ., data = data_train) %>%
    recipes::update_role(id, new_role = "id") %>%
    step_tokenize(description) %>%
    step_stopwords(description)%>%
    step_tokenfilter(description, max_tokens = 100) %>%
    step_tfidf(description) %>%
    step_other(channel_title, threshold = 0.01) %>%
    step_dummy(all_nominal_predictors())

xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()
## Rows: 168
## Columns: 142
## $ id                                    <fct> J0xugdotpp8, agISXMN4tng, be5DH6…
## $ year                                  <dbl> 2020, 2010, 2018, 2013, 2010, 20…
## $ like_count                            <dbl> 0.9030900, 1.0000000, 0.4771213,…
## $ tfidf_description_2                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2012                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2013                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_2014                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_2018                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2019                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2020                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_ad                  <dbl> 0.0000000, 0.2079442, 0.0000000,…
## $ tfidf_description_ads                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_arnold              <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_beer                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_ben                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_best                <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_big                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_bowl                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_bud                 <dbl> 0.00000000, 0.19459101, 0.000000…
## $ tfidf_description_budweiser           <dbl> 2.44234704, 0.00000000, 0.000000…
## $ tfidf_description_callner             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_can                 <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_car                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_channel             <dbl> 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_description_check               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_coca                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_cola                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_comedy              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_commercial          <dbl> 0.00000000, 0.12656664, 1.265666…
## $ tfidf_description_commercials         <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_compilation         <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_copyright           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_day                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_december            <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_directed            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_director            <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_doritos             <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_e                   <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_epic                <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_fail                <dbl> 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_description_fails               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_failure             <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_favorite            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_film                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_first               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_follow              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_full                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_funniest            <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_funny               <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_game                <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_go                  <dbl> 0.000000000, 0.000000000, 0.0000…
## $ tfidf_description_good                <dbl> 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_description_hilarious           <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_http                <dbl> 0.0000000, 0.3718635, 0.0000000,…
## $ tfidf_description_https               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_hyundai             <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_instagram           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_january             <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_kia                 <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_light               <dbl> 0.00000000, 0.40876287, 0.000000…
## $ tfidf_description_like                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_lol                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_man                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_method              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_monthly             <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_music               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_new                 <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_nfl                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_noob                <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_one                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_ownage              <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_owned               <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_pepsi               <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_producer            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_production          <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_scenes              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_see                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_spot                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_subscribe           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_super               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_superbowl           <dbl> 0.00000000, 0.25649494, 0.000000…
## $ tfidf_description_supervisor          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_team                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_today               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_top                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_toyota              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_twitter             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_twitter.com         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_us                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_use                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_v                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_videos              <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_viewed              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_visit               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_watch               <dbl> 0.00000, 0.00000, 0.00000, 0.000…
## $ tfidf_description_world               <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_www.facebook.com    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_www.nfl.com         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_www.youtube.com     <dbl> 0.0000000, 0.4884694, 0.0000000,…
## $ tfidf_description_xli                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_xlviii              <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_year                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ category_id_X2                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X10                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X15                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X17                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X19                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X22                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ category_id_X23                       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,…
## $ category_id_X24                       <dbl> 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,…
## $ category_id_X25                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X26                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X27                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X29                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Budweiser                       <dbl> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,…
## $ brand_Coca.Cola                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ brand_Doritos                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_E.Trade                         <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,…
## $ brand_Hynudai                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Kia                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_NFL                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Pepsi                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Toyota                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_alohawarriorchief       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_BudBowlXLII             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_ClearBroadcasting       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Coca.Cola               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Funny.Commercials       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Iulian.Craciun          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_NFL                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_omon007                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Peek.of.the.Net         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_reggiep08v2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_SuperBowlsSpots         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_The.Hall.of.Advertising <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Tomi.Jaya               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Trailer.Tube            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ channel_title_Unknown.Classics.       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_USA.TODAY               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_viralstuff              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_World.Hyundai.Matteson  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_other                   <dbl> 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,…
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(15793)
doParallel::registerDoParallel()
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_folds, 
            grid = 10)

Explore Results

show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
##   trees min_n tree_depth learn_rate .metric .estimator  mean     n std_err
##   <int> <int>      <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1  1719    12          5    0.00252 rmse    standard    1.05    25  0.0162
## 2   356    38          7    0.0248  rmse    standard    1.08    25  0.0183
## 3  1461     7          3    0.00631 rmse    standard    1.08    25  0.0123
## 4  1819     3         13    0.0166  rmse    standard    1.13    25  0.0193
## 5   662    17          2    0.0747  rmse    standard    1.16    25  0.0156
## # ℹ 1 more variable: .config <chr>
# How did all the possible parameter combinations do?
autoplot(xgboost_tune)

We can finalize our random forest workflow with the best performing parameters.

final_rf <- xgboost_workflow %>% 
    finalize_workflow(select_best(xgboost_tune, "rmse"))

The function last_fit() fits this finalized random forest one last time to the training data and evaluates one last time on the testing data.

data_fit <- last_fit(final_rf, data_split)
data_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits           id               .metrics .notes   .predictions .workflow 
##   <list>           <chr>            <list>   <list>   <list>       <list>    
## 1 <split [168/57]> train/test split <tibble> <tibble> <tibble>     <workflow>

Evaluate model

collect_metrics(data_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.972 Preprocessor1_Model1
## 2 rsq     standard       0.259 Preprocessor1_Model1
collect_predictions(data_fit)
## # A tibble: 57 × 5
##    id               .pred  .row like_count .config             
##    <chr>            <dbl> <int>      <dbl> <chr>               
##  1 train/test split 2.80      2      2.69  Preprocessor1_Model1
##  2 train/test split 2.27      3      2.11  Preprocessor1_Model1
##  3 train/test split 1.42      4      0.477 Preprocessor1_Model1
##  4 train/test split 1.72      5      1.32  Preprocessor1_Model1
##  5 train/test split 1.79      9      2.54  Preprocessor1_Model1
##  6 train/test split 0.936    28      1.36  Preprocessor1_Model1
##  7 train/test split 1.80     30      2.37  Preprocessor1_Model1
##  8 train/test split 2.27     31      2.98  Preprocessor1_Model1
##  9 train/test split 2.46     32      1.61  Preprocessor1_Model1
## 10 train/test split 2.57     33      2.08  Preprocessor1_Model1
## # ℹ 47 more rows
collect_predictions(data_fit) %>%
    ggplot(aes(like_count, .pred)) +
    geom_point(alpha = 0.5, fill = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") +
    coord_fixed()