Apply it to your Data 1

Superbowl commercials: Build a regression model to predict the Youtube like count (like_count). Use the youtube dataset.

Import Data

youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv')

## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean data

missing values: like_count
positively skewed distribution: like_count

youtube %>% glimpse()

## Rows: 247
## Columns: 25
## $ year                      <dbl> 2018, 2020, 2006, 2018, 2003, 2020, 2020, 20…
## $ brand                     <chr> "Toyota", "Bud Light", "Bud Light", "Hynudai…
## $ superbowl_ads_dot_com_url <chr> "https://superbowl-ads.com/good-odds-toyota/…
## $ youtube_url               <chr> "https://www.youtube.com/watch?v=zeBZvwYQ-hA…
## $ funny                     <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, …
## $ show_product_quickly      <lgl> FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE,…
## $ patriotic                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ celebrity                 <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE…
## $ danger                    <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE,…
## $ animals                   <lgl> FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE,…
## $ use_sex                   <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FAL…
## $ id                        <chr> "zeBZvwYQ-hA", "nbbp0VW7z8w", "yk0MQD5YgV8",…
## $ kind                      <chr> "youtube#video", "youtube#video", "youtube#v…
## $ etag                      <chr> "rn-ggKNly38Cl0C3CNjNnUH9xUw", "1roDoK-SYqSp…
## $ view_count                <dbl> 173929, 47752, 142310, 198, 13741, 23636, 30…
## $ like_count                <dbl> 1233, 485, 129, 2, 20, 115, 1470, 78, 342, 7…
## $ dislike_count             <dbl> 38, 14, 15, 0, 3, 11, 384, 6, 7, 0, 14, 0, 2…
## $ favorite_count            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ comment_count             <dbl> NA, 14, 9, 0, 2, 13, 227, 6, 30, 0, 8, 1, 13…
## $ published_at              <dttm> 2018-02-03 11:29:14, 2020-01-31 21:04:13, 2…
## $ title                     <chr> "Toyota Super Bowl Commercial 2018 Good Odds…
## $ description               <chr> "Toyota Super Bowl Commercial 2018 Good Odds…
## $ thumbnail                 <chr> "https://i.ytimg.com/vi/zeBZvwYQ-hA/sddefaul…
## $ channel_title             <chr> "Funny Commercials", "VCU Brandcenter", "Joh…
## $ category_id               <dbl> 1, 27, 17, 22, 24, 1, 24, 2, 24, 24, 24, 24,…

youtube %>% skimr::skim()

Data summary
Name	Piped data
Number of rows	247
Number of columns	25
_______________________
Column type frequency:
character	10
logical	7
numeric	7
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
brand	0	1.00	3	9	10
superbowl_ads_dot_com_url	0	1.00	34	120	244
youtube_url	11	0.96	43	43	233
id	11	0.96	11	11	233
kind	16	0.94	13	13	1
etag	16	0.94	27	27	228
title	16	0.94	6	99	228
description	50	0.80	3	3527	194
thumbnail	129	0.48	48	48	118
channel_title	16	0.94	3	37	185

Variable type: logical

skim_variable	complete_rate	mean	count
funny	1	0.69	TRU: 171, FAL: 76
show_product_quickly	1	0.68	TRU: 169, FAL: 78
patriotic	1	0.17	FAL: 206, TRU: 41
celebrity	1	0.29	FAL: 176, TRU: 71
danger	1	0.30	FAL: 172, TRU: 75
animals	1	0.37	FAL: 155, TRU: 92
use_sex	1	0.27	FAL: 181, TRU: 66

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2010.19	5.86	2000	2005	2010	2015.00	2020	▇▇▇▇▆
view_count	16	0.94	1407556.46	11971111.01	10	6431	41379	170015.50	176373378	▇▁▁▁▁
like_count	22	0.91	4146.03	23920.40	0	19	130	527.00	275362	▇▁▁▁▁
dislike_count	22	0.91	833.54	6948.52	0	1	7	24.00	92990	▇▁▁▁▁
favorite_count	16	0.94	0.00	0.00	0	0	0	0.00	0	▁▁▇▁▁
comment_count	25	0.90	188.64	986.46	0	1	10	50.75	9190	▇▁▁▁▁
category_id	16	0.94	19.32	8.00	1	17	23	24.00	29	▃▁▂▆▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
published_at	16	0.94	2006-02-06 10:02:36	2021-01-27 13:11:29	2013-01-31 09:13:55	227

data_clean <- youtube %>% 
    
    # Treat missing values
    filter(!is.na(like_count)) %>%
    
    # Convert to factor
    mutate(category_id = factor(category_id)) %>%
    
    # Log Transform like_count
    mutate(like_count = log10(like_count + 1))

Explore Data

data_clean %>% select(-id) %>% explore()
data_clean %>% describe_all()
data_clean %>% describe_cat(author)
data_clean %>% select(-id) %>% explore_all(target = like_count)

target variable

Check the distribution

data_clean %>%
    
    ggplot(aes(like_count)) +
    geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

data_clean %>% count(category_id, sort = TRUE)

## # A tibble: 13 × 2
##    category_id     n
##    <fct>       <int>
##  1 24             80
##  2 23             37
##  3 22             36
##  4 1              18
##  5 17             17
##  6 2              16
##  7 10              6
##  8 25              5
##  9 27              4
## 10 15              3
## 11 19              1
## 12 26              1
## 13 29              1

category_id

data_clean %>%
    
    group_by(category_id) %>%
    summarise(avg_like_count = mean(like_count)) %>%
    ungroup() %>%
    
    slice_max(order_by = avg_like_count, n = 10) %>%
    
    ggplot(aes(avg_like_count, fct_reorder(category_id, avg_like_count))) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "Top Categories by Like Counts",
         x = "Avg. Like Counts", y = NULL)

title

data_clean %>% 
    tidytext::unnest_tokens(word, title) %>%
    anti_join(stop_words) %>%
  
    group_by(word) %>%
      summarise(
        n = n(),
        avg_like_count = mean(like_count)
      ) %>%
      ggplot(aes(n, avg_like_count)) +
      geom_hline(
        yintercept = mean(data_clean$like_count), lty = 2,
        color = "gray50", size = 1.5
      ) +
      geom_jitter(color = "midnightblue", alpha = 0.7) +
      geom_text(aes(label = word),
        check_overlap = TRUE, family = "IBMPlexSans",
        vjust = "top", hjust = "left"
      ) +
      scale_x_log10()

## Joining with `by = join_by(word)`

logical variables

Examine logical variables: The plot shows these logical variables have no predictive power.

data_clean %>%
    select(like_count, funny:use_sex) %>%
    pivot_longer(cols = funny:use_sex) %>%
    
    # Plot
    ggplot(aes(value, like_count)) +
    geom_boxplot() +
    facet_wrap(~name)

brand

The plot shows brand may be a good predictor.

data_clean %>%
    ggplot(aes(brand, like_count)) +
    geom_boxplot()

Description

It may be a good predictor.

data_clean %>% 
    tidytext::unnest_tokens(word, description) %>%
    anti_join(stop_words) %>%
  
    group_by(word) %>%
      summarise(
        n = n(),
        avg_like_count = mean(like_count)
      ) %>%
      ggplot(aes(n, avg_like_count)) +
      geom_hline(
        yintercept = mean(data_clean$like_count), lty = 2,
        color = "gray50", linewidth = 1.5
      ) +
      geom_jitter(color = "midnightblue", alpha = 0.7) +
      geom_text(aes(label = word),
        check_overlap = TRUE, family = "IBMPlexSans",
        vjust = "top", hjust = "left"
      ) +
      scale_x_log10()

## Joining with `by = join_by(word)`

channel title

data_clean %>% count(channel_title, sort = T)

## # A tibble: 181 × 2
##    channel_title               n
##    <chr>                   <int>
##  1 NFL                         5
##  2 omon007                     5
##  3 BudBowlXLII                 4
##  4 reggiep08v2                 4
##  5 Coca-Cola                   3
##  6 Funny Commercials           3
##  7 John Keehler                3
##  8 The Hall of Advertising     3
##  9 USA TODAY                   3
## 10 World Hyundai Matteson      3
## # ℹ 171 more rows

data_clean %>%
    
    group_by(channel_title) %>%
    summarise(avg_like_count = mean(like_count)) %>%
    ungroup() %>%
    
    slice_max(order_by = avg_like_count, n = 10) %>%
    
    ggplot(aes(avg_like_count, fct_reorder(channel_title, avg_like_count))) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "Top Categories by Like Counts",
         x = "Avg. Like Counts", y = NULL)

year

data_clean %>% count(year, sort = T)

## # A tibble: 21 × 2
##     year     n
##    <dbl> <int>
##  1  2009    15
##  2  2012    15
##  3  2007    14
##  4  2013    14
##  5  2008    13
##  6  2010    13
##  7  2018    13
##  8  2001    12
##  9  2019    12
## 10  2004    11
## # ℹ 11 more rows

data_clean  %>%
    ggplot(aes(year)) +
    geom_bar()

data_clean %>%
    
    group_by(year) %>%
    summarise(avg_like_count = mean(like_count)) %>%
    ungroup() %>%
    
    # slice_max(order_by = avg_like_count, n = 10) %>%
    
    ggplot(aes(year, avg_like_count)) +
    geom_col(fill = "midnightblue") +
    
    labs(title = "Top Categories by Like Counts",
         x = "Avg. Like Counts", y = NULL)

data_processed <- data_clean %>%
    
    select(id, like_count, category_id, description, brand, channel_title, year)

Build a Model

# Try with a small dataset first to be sure the code is error-free
# data_processed <- sample_n(data_processed, 100)


set.seed(123)
data_split <- initial_split(data_processed, strata = like_count)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- bootstraps(data_train, strata = like_count)
data_folds

## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits           id         
##    <list>           <chr>      
##  1 <split [168/62]> Bootstrap01
##  2 <split [168/65]> Bootstrap02
##  3 <split [168/58]> Bootstrap03
##  4 <split [168/59]> Bootstrap04
##  5 <split [168/62]> Bootstrap05
##  6 <split [168/66]> Bootstrap06
##  7 <split [168/58]> Bootstrap07
##  8 <split [168/67]> Bootstrap08
##  9 <split [168/67]> Bootstrap09
## 10 <split [168/62]> Bootstrap10
## # ℹ 15 more rows

library(usemodels)
use_xgboost(like_count ~., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(12984)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

xgboost_recipe <- 
  recipe(formula = like_count ~ ., data = data_train) %>%
    recipes::update_role(id, new_role = "id") %>%
    step_tokenize(description) %>%
    step_stopwords(description)%>%
    step_tokenfilter(description, max_tokens = 100) %>%
    step_tfidf(description) %>%
    step_other(channel_title, threshold = 0.01) %>%
    step_dummy(all_nominal_predictors())

xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()

## Rows: 168
## Columns: 142
## $ id                                    <fct> J0xugdotpp8, agISXMN4tng, be5DH6…
## $ year                                  <dbl> 2020, 2010, 2018, 2013, 2010, 20…
## $ like_count                            <dbl> 0.9030900, 1.0000000, 0.4771213,…
## $ tfidf_description_2                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2012                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2013                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_2014                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_2018                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2019                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2020                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_ad                  <dbl> 0.0000000, 0.2079442, 0.0000000,…
## $ tfidf_description_ads                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_arnold              <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_beer                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_ben                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_best                <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_big                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_bowl                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_bud                 <dbl> 0.00000000, 0.19459101, 0.000000…
## $ tfidf_description_budweiser           <dbl> 2.44234704, 0.00000000, 0.000000…
## $ tfidf_description_callner             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_can                 <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_car                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_channel             <dbl> 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_description_check               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_coca                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_cola                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_comedy              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_commercial          <dbl> 0.00000000, 0.12656664, 1.265666…
## $ tfidf_description_commercials         <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_compilation         <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_copyright           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_day                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_december            <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_directed            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_director            <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_doritos             <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_e                   <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_epic                <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_fail                <dbl> 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_description_fails               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_failure             <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_favorite            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_film                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_first               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_follow              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_full                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_funniest            <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_funny               <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_game                <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_go                  <dbl> 0.000000000, 0.000000000, 0.0000…
## $ tfidf_description_good                <dbl> 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_description_hilarious           <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_http                <dbl> 0.0000000, 0.3718635, 0.0000000,…
## $ tfidf_description_https               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_hyundai             <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_instagram           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_january             <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_kia                 <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_light               <dbl> 0.00000000, 0.40876287, 0.000000…
## $ tfidf_description_like                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_lol                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_man                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_method              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_monthly             <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_music               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_new                 <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_nfl                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_noob                <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_one                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_ownage              <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_owned               <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_pepsi               <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_producer            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_production          <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_scenes              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_see                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_spot                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_subscribe           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_super               <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_superbowl           <dbl> 0.00000000, 0.25649494, 0.000000…
## $ tfidf_description_supervisor          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_team                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_today               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_top                 <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_toyota              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_twitter             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_twitter.com         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_us                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_use                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_v                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_videos              <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_viewed              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_visit               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_watch               <dbl> 0.00000, 0.00000, 0.00000, 0.000…
## $ tfidf_description_world               <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_www.facebook.com    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_www.nfl.com         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_www.youtube.com     <dbl> 0.0000000, 0.4884694, 0.0000000,…
## $ tfidf_description_xli                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_xlviii              <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_year                <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ category_id_X2                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X10                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X15                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X17                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X19                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X22                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ category_id_X23                       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,…
## $ category_id_X24                       <dbl> 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,…
## $ category_id_X25                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X26                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X27                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X29                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Budweiser                       <dbl> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,…
## $ brand_Coca.Cola                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ brand_Doritos                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_E.Trade                         <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,…
## $ brand_Hynudai                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Kia                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_NFL                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Pepsi                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Toyota                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_alohawarriorchief       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_BudBowlXLII             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_ClearBroadcasting       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Coca.Cola               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Funny.Commercials       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Iulian.Craciun          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_NFL                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_omon007                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Peek.of.the.Net         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_reggiep08v2             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_SuperBowlsSpots         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_The.Hall.of.Advertising <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Tomi.Jaya               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Trailer.Tube            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ channel_title_Unknown.Classics.       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_USA.TODAY               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_viralstuff              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_World.Hyundai.Matteson  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_other                   <dbl> 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,…

xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(15793)
doParallel::registerDoParallel()
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_folds, 
            grid = 10)

Explore Results

show_best(xgboost_tune, metric = "rmse")

## # A tibble: 5 × 10
##   trees min_n tree_depth learn_rate .metric .estimator  mean     n std_err
##   <int> <int>      <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1  1719    12          5    0.00252 rmse    standard    1.05    25  0.0162
## 2   356    38          7    0.0248  rmse    standard    1.08    25  0.0183
## 3  1461     7          3    0.00631 rmse    standard    1.08    25  0.0123
## 4  1819     3         13    0.0166  rmse    standard    1.13    25  0.0193
## 5   662    17          2    0.0747  rmse    standard    1.16    25  0.0156
## # ℹ 1 more variable: .config <chr>

# How did all the possible parameter combinations do?
autoplot(xgboost_tune)

We can finalize our random forest workflow with the best performing parameters.

final_rf <- xgboost_workflow %>% 
    finalize_workflow(select_best(xgboost_tune, "rmse"))

The function last_fit() fits this finalized random forest one last time to the training data and evaluates one last time on the testing data.

data_fit <- last_fit(final_rf, data_split)
data_fit

## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits           id               .metrics .notes   .predictions .workflow 
##   <list>           <chr>            <list>   <list>   <list>       <list>    
## 1 <split [168/57]> train/test split <tibble> <tibble> <tibble>     <workflow>

Evaluate model

collect_metrics(data_fit)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.972 Preprocessor1_Model1
## 2 rsq     standard       0.259 Preprocessor1_Model1

collect_predictions(data_fit)

## # A tibble: 57 × 5
##    id               .pred  .row like_count .config             
##    <chr>            <dbl> <int>      <dbl> <chr>               
##  1 train/test split 2.80      2      2.69  Preprocessor1_Model1
##  2 train/test split 2.27      3      2.11  Preprocessor1_Model1
##  3 train/test split 1.42      4      0.477 Preprocessor1_Model1
##  4 train/test split 1.72      5      1.32  Preprocessor1_Model1
##  5 train/test split 1.79      9      2.54  Preprocessor1_Model1
##  6 train/test split 0.936    28      1.36  Preprocessor1_Model1
##  7 train/test split 1.80     30      2.37  Preprocessor1_Model1
##  8 train/test split 2.27     31      2.98  Preprocessor1_Model1
##  9 train/test split 2.46     32      1.61  Preprocessor1_Model1
## 10 train/test split 2.57     33      2.08  Preprocessor1_Model1
## # ℹ 47 more rows

collect_predictions(data_fit) %>%
    ggplot(aes(like_count, .pred)) +
    geom_point(alpha = 0.5, fill = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") +
    coord_fixed()