Goal: predict the YouTube like count of Super Bowl ads. The data comes from the TidyTuesday 2021-03-02 dataset and is imported from the URL below.

Import Data
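The code below assumes that the following packages are loaded in a setup chunk (not shown); they provide the import, text-mining, correlation-funnel, and tidymodels functions used throughout this post:

library(tidyverse)         # read_csv, dplyr, ggplot2, forcats, stringr
library(tidytext)          # unnest_tokens
library(correlationfunnel) # binarize, correlate, plot_correlation_funnel
library(tidymodels)        # rsample, recipes, parsnip, workflows, tune
library(textrecipes)       # step_tokenize, step_tokenfilter, step_tfidf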

youtube <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(youtube)
Data summary
Name youtube
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
data <- youtube %>%
    
    # Drop columns not needed as predictors and remove rows with missing values
    select(-superbowl_ads_dot_com_url, -youtube_url, -kind, -etag, -favorite_count, -thumbnail, -channel_title) %>%
    na.omit()

Clean Data

data <- data %>% 
    
    # Treat missing values
    filter(!is.na(like_count)) %>%
    
    # Convert to factor
    mutate(category_id = factor(category_id)) %>% 
    
    # Log Transform like_count
    mutate(like_count = log10(like_count + 1)) %>%
    
    mutate(across(where(is.logical), as.numeric))
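A quick check that the log10 transform removed the heavy right skew of like_count (a sketch; output not shown):

data %>%
    ggplot(aes(like_count)) +
    geom_histogram(bins = 30) +
    labs(title = "Distribution of log10(like_count + 1)",
         x = "log10(like_count + 1)",
         y = "Count")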

Explore Data

data %>% skimr::skim()
Data summary
Name Piped data
Number of rows 190
Number of columns 18
_______________________
Column type frequency:
character 4
factor 1
numeric 12
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1 3 9 0 10 0
id 0 1 11 11 0 187 0
title 0 1 6 99 0 187 0
description 0 1 11 3527 0 187 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
category_id 0 1 FALSE 12 24: 71, 23: 34, 22: 21, 1: 17

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2010.10 5.73 2000 2006.00 2010.00 2014.75 2020.00 ▆▇▇▆▆
funny 0 1 0.69 0.46 0 0.00 1.00 1.00 1.00 ▃▁▁▁▇
show_product_quickly 0 1 0.70 0.46 0 0.00 1.00 1.00 1.00 ▃▁▁▁▇
patriotic 0 1 0.15 0.36 0 0.00 0.00 0.00 1.00 ▇▁▁▁▂
celebrity 0 1 0.30 0.46 0 0.00 0.00 1.00 1.00 ▇▁▁▁▃
danger 0 1 0.31 0.46 0 0.00 0.00 1.00 1.00 ▇▁▁▁▃
animals 0 1 0.36 0.48 0 0.00 0.00 1.00 1.00 ▇▁▁▁▅
use_sex 0 1 0.28 0.45 0 0.00 0.00 1.00 1.00 ▇▁▁▁▃
view_count 0 1 1698142.99 13187377.43 10 10484.75 58515.50 219180.25 176373378.00 ▇▁▁▁▁
like_count 0 1 2.18 1.08 0 1.52 2.22 2.77 5.44 ▃▆▇▂▁
dislike_count 0 1 984.25 7554.84 0 2.00 8.50 37.00 92990.00 ▇▁▁▁▁
comment_count 0 1 219.33 1063.61 0 2.00 15.00 65.00 9190.00 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 0 1 2006-02-06 10:02:36 2021-01-27 13:11:29 2012-02-14 12:03:48 186

Identify good predictors

Likes per year

data %>%
    ggplot(aes(year, like_count)) +
    geom_col() +
    labs(title = "Likes by year",
         x = "Year",
         y = "Likes")

Likes for brands

data %>% 
    ggplot(aes(brand, like_count)) +
    geom_boxplot()

Likes by ad characteristic

# List ad characteristics
characteristics <- c("funny", "show_product_quickly", "patriotic", "celebrity", "danger", "animals", "use_sex")

# Sum likes for each combination of characteristics
likes_by_cat <- data %>%
    select(all_of(characteristics), like_count) %>%
    group_by(dplyr::across(all_of(characteristics))) %>%
    summarise(total_likes = sum(like_count, na.rm = TRUE)) %>%
    ungroup()

# Reshape data and keep only rows where the characteristic is present
likes_long <- likes_by_cat %>%
    pivot_longer(cols = all_of(characteristics), names_to = "characteristic", values_to = "is_true") %>%
    filter(is_true == 1)

# Plot data
likes_long %>%
    ggplot(aes(x = characteristic, y = total_likes)) +
    geom_col() +
    labs(title = "Likes by characteristic",
         x = "Characteristic",
         y = "Likes")

Likes per Category

data %>% count(category_id, sort = TRUE)
## # A tibble: 12 × 2
##    category_id     n
##    <fct>       <int>
##  1 24             71
##  2 23             34
##  3 22             21
##  4 1              17
##  5 17             16
##  6 2              14
##  7 10              6
##  8 25              4
##  9 15              3
## 10 27              2
## 11 19              1
## 12 26              1
data %>% group_by(category_id) %>%
    summarise(avg_like_count = mean(like_count)) %>%
    ungroup() %>%
    
    slice_max(order_by = avg_like_count, n = 10) %>%
    
    ggplot(aes(avg_like_count, fct_reorder(category_id, avg_like_count))) +
    geom_col() +
    
    labs(title = "Top Categories by Like Counts",
         x = "Avg. Like Counts", y = NULL)

Title

data %>%

    # Tokenize title
    unnest_tokens(output = word, input = title) %>%

    # Calculate average like count per word
    group_by(word) %>%
    summarise(like_count = mean(like_count),
              n          = n()) %>%
    ungroup() %>%

    # Keep words that occur more than 10 times and contain no digits
    filter(n > 10, !str_detect(word, "\\d")) %>%
    slice_max(order_by = like_count, n = 20) %>%

    # Plot
    ggplot(aes(like_count, fct_reorder(word, like_count))) +
    geom_point() +

    labs(y = "Words in title")

EDA shortcut

The correlationfunnel package offers a quick way to spot promising predictors: binarize() bins every variable into binary features, and correlate() measures how strongly each bin correlates with a chosen target bin.

# Step 1: Prepare data

data_binarized_tbl <- data %>%
    select(-published_at) %>%
    binarize()

data_binarized_tbl %>% glimpse()
## Rows: 190
## Columns: 61
## $ `year__-Inf_2006`                                    <dbl> 0, 1, 0, 1, 0, 0,…
## $ year__2006_2010                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2010_2014.75                                   <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2014.75_Inf                                    <dbl> 1, 0, 1, 0, 1, 1,…
## $ brand__Bud_Light                                     <dbl> 1, 1, 0, 1, 0, 0,…
## $ brand__Budweiser                                     <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__Coca-Cola`                                   <dbl> 0, 0, 0, 0, 0, 1,…
## $ brand__Doritos                                       <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__E-Trade`                                     <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Hynudai                                       <dbl> 0, 0, 1, 0, 0, 0,…
## $ brand__Kia                                           <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__NFL                                           <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Pepsi                                         <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Toyota                                        <dbl> 0, 0, 0, 0, 1, 0,…
## $ funny__0                                             <dbl> 0, 0, 1, 0, 0, 0,…
## $ funny__1                                             <dbl> 1, 1, 0, 1, 1, 1,…
## $ show_product_quickly__0                              <dbl> 0, 1, 0, 0, 0, 1,…
## $ show_product_quickly__1                              <dbl> 1, 0, 1, 1, 1, 0,…
## $ patriotic__0                                         <dbl> 1, 1, 1, 1, 1, 1,…
## $ patriotic__1                                         <dbl> 0, 0, 0, 0, 0, 0,…
## $ celebrity__0                                         <dbl> 0, 1, 1, 1, 0, 0,…
## $ celebrity__1                                         <dbl> 1, 0, 0, 0, 1, 1,…
## $ danger__0                                            <dbl> 0, 0, 1, 0, 0, 1,…
## $ danger__1                                            <dbl> 1, 1, 0, 1, 1, 0,…
## $ animals__0                                           <dbl> 1, 0, 1, 0, 0, 0,…
## $ animals__1                                           <dbl> 0, 1, 0, 1, 1, 1,…
## $ use_sex__0                                           <dbl> 1, 1, 1, 0, 1, 1,…
## $ use_sex__1                                           <dbl> 0, 0, 0, 1, 0, 0,…
## $ id__Q5Hu_FBUIsk                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ `id__-OTHER`                                         <dbl> 1, 1, 1, 1, 1, 1,…
## $ `view_count__-Inf_10484.75`                          <dbl> 0, 0, 1, 0, 0, 0,…
## $ view_count__10484.75_58515.5                         <dbl> 1, 0, 0, 1, 1, 0,…
## $ view_count__58515.5_219180.25                        <dbl> 0, 1, 0, 0, 0, 0,…
## $ view_count__219180.25_Inf                            <dbl> 0, 0, 0, 0, 0, 1,…
## $ `like_count__-Inf_1.51851393987789`                  <dbl> 0, 0, 1, 1, 0, 0,…
## $ like_count__1.51851393987789_2.22007656488678        <dbl> 0, 1, 0, 0, 1, 0,…
## $ like_count__2.22007656488678_2.77066783242838        <dbl> 1, 0, 0, 0, 0, 0,…
## $ like_count__2.77066783242838_Inf                     <dbl> 0, 0, 0, 0, 0, 1,…
## $ `dislike_count__-Inf_2`                              <dbl> 0, 0, 1, 0, 0, 0,…
## $ dislike_count__2_8.5                                 <dbl> 0, 0, 0, 1, 0, 0,…
## $ dislike_count__8.5_37                                <dbl> 1, 1, 0, 0, 1, 0,…
## $ dislike_count__37_Inf                                <dbl> 0, 0, 0, 0, 0, 1,…
## $ `comment_count__-Inf_2`                              <dbl> 0, 0, 1, 1, 0, 0,…
## $ comment_count__2_15                                  <dbl> 1, 1, 0, 0, 1, 0,…
## $ comment_count__15_65                                 <dbl> 0, 0, 0, 0, 0, 0,…
## $ comment_count__65_Inf                                <dbl> 0, 0, 0, 0, 0, 1,…
## $ title__Bud_Lighta_Cedric_a_Island_Fantasy_2005       <dbl> 0, 0, 0, 0, 0, 0,…
## $ `title__-OTHER`                                      <dbl> 1, 1, 1, 1, 1, 1,…
## $ description__Bud_Lighta_Cedric_a_Island_Fantasy_2005 <dbl> 0, 0, 0, 0, 0, 0,…
## $ `description__-OTHER`                                <dbl> 1, 1, 1, 1, 1, 1,…
## $ category_id__1                                       <dbl> 0, 0, 0, 0, 1, 0,…
## $ category_id__2                                       <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__10                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__15                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__17                                      <dbl> 0, 1, 0, 0, 0, 0,…
## $ category_id__22                                      <dbl> 0, 0, 1, 0, 0, 0,…
## $ category_id__23                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__24                                      <dbl> 0, 0, 0, 1, 0, 1,…
## $ category_id__25                                      <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__27                                      <dbl> 1, 0, 0, 0, 0, 0,…
## $ `category_id__-OTHER`                                <dbl> 0, 0, 0, 0, 0, 0,…
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
    correlate(like_count__2.77066783242838_Inf)

data_corr_tbl
## # A tibble: 61 × 3
##    feature       bin                               correlation
##    <fct>         <chr>                                   <dbl>
##  1 like_count    2.77066783242838_Inf                    1    
##  2 view_count    219180.25_Inf                           0.777
##  3 comment_count 65_Inf                                  0.761
##  4 dislike_count 37_Inf                                  0.649
##  5 comment_count -Inf_2                                 -0.362
##  6 dislike_count -Inf_2                                 -0.362
##  7 like_count    -Inf_1.51851393987789                  -0.343
##  8 view_count    -Inf_10484.75                          -0.338
##  9 view_count    10484.75_58515.5                       -0.333
## 10 like_count    2.22007656488678_2.77066783242838      -0.333
## # ℹ 51 more rows
# Step 3: Plot
data_corr_tbl %>%
    plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

data <- data %>%
    
    select(id, like_count, category_id, description, brand, year, funny, show_product_quickly, patriotic, celebrity, danger, animals, use_sex, title)

Build Models

Split Data

# Split into train and test dataset
set.seed(123)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Create bootstrap resamples of the training data for tuning and resampled evaluation
data_cv <- bootstraps(data_train, strata = like_count)
data_cv
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits           id         
##    <list>           <chr>      
##  1 <split [142/53]> Bootstrap01
##  2 <split [142/57]> Bootstrap02
##  3 <split [142/49]> Bootstrap03
##  4 <split [142/52]> Bootstrap04
##  5 <split [142/51]> Bootstrap05
##  6 <split [142/55]> Bootstrap06
##  7 <split [142/52]> Bootstrap07
##  8 <split [142/50]> Bootstrap08
##  9 <split [142/56]> Bootstrap09
## 10 <split [142/46]> Bootstrap10
## # ℹ 15 more rows
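The resamples above are bootstrap samples rather than cross-validation folds. If k-fold cross-validation is preferred, rsample's vfold_cv() takes the same stratification argument (a sketch; the rest of this post keeps using data_cv):

set.seed(123)
data_folds <- vfold_cv(data_train, v = 10, strata = like_count)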
library(usemodels)
use_xgboost(like_count ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(57912)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

Preprocessing and xgboost setup

 xgboost_recipe <-
     recipe(formula = like_count ~ ., data = data_train) %>%
     recipes::update_role(id, new_role = "id") %>%
     step_tokenize(description) %>%
     step_tokenfilter(description, max_tokens = 100) %>%
     step_tokenize(title) %>%
     step_tokenfilter(title, max_tokens = 50) %>%
     step_tfidf(description) %>%
     step_tfidf(title) %>%
     step_dummy(all_nominal_predictors())
xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()
## Rows: 142
## Columns: 180
## $ id                                <fct> Zb3fhsfs6ZU, JJmqCKtJnxM, lbkafMhmvM…
## $ year                              <dbl> 2006, 2000, 2020, 2009, 2016, 2001, …
## $ funny                             <dbl> 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, …
## $ show_product_quickly              <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, …
## $ patriotic                         <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ celebrity                         <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ danger                            <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ animals                           <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, …
## $ use_sex                           <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
## $ like_count                        <dbl> 3.338456, 4.395169, 5.244104, 2.8830…
## $ tfidf_description_2               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_2012            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_2014            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_2018            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_2019            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_2020            <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_a               <dbl> 0.46926686, 0.00000000, 0.04022287, …
## $ tfidf_description_ad              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_ads             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_all             <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_an              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_and             <dbl> 0.00000000, 0.00000000, 0.04385530, …
## $ tfidf_description_are             <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_as              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_assistant       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_at              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_be              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_beer            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_ben             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_best            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_big             <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_bowl            <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_bud             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_budweiser       <dbl> 0.00000000, 1.09240103, 0.00000000, …
## $ tfidf_description_but             <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_by              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_callner         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_channel         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_comedy          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_commercial      <dbl> 0.00000000, 0.60116606, 0.00000000, …
## $ tfidf_description_commercials     <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_de              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_do              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_doritos         <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_during          <dbl> 0.45354924, 0.00000000, 0.00000000, …
## $ tfidf_description_facebook        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_favorite        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_featuring       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_fire            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_follow          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_for             <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_from            <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_full            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_funny           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_game            <dbl> 0.39136593, 0.00000000, 0.00000000, …
## $ tfidf_description_he              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_here            <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_his             <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_http            <dbl> 0.00000000, 0.00000000, 0.27130257, …
## $ tfidf_description_https           <dbl> 0.00000000, 0.00000000, 0.11953509, …
## $ tfidf_description_hyundai         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_in              <dbl> 0.00000000, 0.00000000, 0.04912298, …
## $ tfidf_description_instagram       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_is              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_it              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.2…
## $ tfidf_description_kia             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_light           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_like            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_more            <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_most            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_network         <dbl> 0.00000000, 0.00000000, 0.09659985, …
## $ tfidf_description_new             <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_nfl             <dbl> 0.00000000, 0.00000000, 0.69897969, …
## $ tfidf_description_of              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_on              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_one             <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_other           <dbl> 0.00000000, 0.00000000, 0.07291560, …
## $ tfidf_description_our             <dbl> 0.00000000, 0.00000000, 0.07521550, …
## $ tfidf_description_out             <dbl> 0.00000000, 0.00000000, 0.07521550, …
## $ tfidf_description_pepsi           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_podcasts        <dbl> 0.0000000, 0.0000000, 0.1027804, 0.0…
## $ tfidf_description_producer        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_production      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_see             <dbl> 0.43875711, 0.00000000, 0.00000000, …
## $ tfidf_description_subscribe       <dbl> 0.00000000, 0.00000000, 0.08374839, …
## $ tfidf_description_super           <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_superbowl       <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_that            <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_the             <dbl> 0.00000000, 0.00000000, 0.09580728, …
## $ tfidf_description_this            <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_time            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.3…
## $ tfidf_description_to              <dbl> 0.25582256, 0.00000000, 0.08771059, …
## $ tfidf_description_today           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_toyota          <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_description_twitter         <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_description_twitter.com     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_up              <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_us              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_viewed          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_visit           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_was             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_watch           <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_when            <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_description_with            <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_world           <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_www.nfl.com     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_www.youtube.com <dbl> 0.0000000, 0.0000000, 0.4665078, 0.0…
## $ tfidf_description_xli             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_you             <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_your            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2000                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2005                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_2007                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_2009                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_2010                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2012                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_2013                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_2014                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2015                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_2016                  <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_2018                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_2019                  <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_2020                  <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_a                     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_ad                    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_baby                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.7…
## $ tfidf_title_best                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_big                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_bowl                  <dbl> 0.0000000, 0.0000000, 0.3005830, 0.0…
## $ tfidf_title_bud                   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_budweiser             <dbl> 1.8658674, 0.9329337, 0.0000000, 0.0…
## $ tfidf_title_cedric                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_coca                  <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_coke                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_cola                  <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_commercial            <dbl> 0.0000000, 0.4986299, 0.2493149, 0.1…
## $ tfidf_title_crash                 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_dog                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_doritos               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_e                     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_etrade                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.7…
## $ tfidf_title_game                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_hd                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_hyundai               <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_kia                   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_light                 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_new                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_nfl                   <dbl> 0.0000000, 0.0000000, 0.8993281, 0.0…
## $ tfidf_title_of                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_official              <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_pepsi                 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_super                 <dbl> 0.0000000, 0.0000000, 0.3005830, 0.0…
## $ tfidf_title_superbowl             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.5…
## $ tfidf_title_the                   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_toyota                <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_trade                 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_tv                    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_version               <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_winner                <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_title_xliii                 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.6…
## $ category_id_X2                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id_X10                   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id_X15                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X17                   <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X19                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X22                   <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id_X23                   <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id_X24                   <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ category_id_X25                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X26                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X27                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Budweiser                   <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand_Coca.Cola                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand_Doritos                     <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ brand_E.Trade                     <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ brand_Hynudai                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Kia                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_NFL                         <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Pepsi                       <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Toyota                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
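Many of the tf-idf features above are stop words ("a", "and", "the", ...). textrecipes also provides step_stopwords(), which could be added after tokenization to drop them; a sketch of such an alternative recipe (not used in the models below):

xgboost_recipe_alt <-
    recipe(formula = like_count ~ ., data = data_train) %>%
    recipes::update_role(id, new_role = "id") %>%
    step_tokenize(description, title) %>%
    step_stopwords(description, title) %>%
    step_tokenfilter(description, max_tokens = 100) %>%
    step_tokenfilter(title, max_tokens = 50) %>%
    step_tfidf(description, title) %>%
    step_dummy(all_nominal_predictors())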
xgboost_spec <-
    boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), sample_size = tune()) %>%
   set_mode("regression") %>%
   set_engine("xgboost")

 xgboost_workflow <-
   workflow() %>%
   add_recipe(xgboost_recipe) %>%
   add_model(xgboost_spec)

 set.seed(64548)
 doParallel::registerDoParallel()
 xgboost_tune <-
   tune_grid(xgboost_workflow, resamples = data_cv,
             grid = 15)
## Warning: package 'xgboost' was built under R version 4.3.3
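Before selecting the best parameters, tune's autoplot() method gives a quick visual overview of how the sampled hyperparameter values relate to the performance metrics (a sketch; output not shown):

autoplot(xgboost_tune)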

Try different models: Random Forest and SVM

ranger_spec <-
  rand_forest(trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

ranger_spec 
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 500
## 
## Computational engine: ranger
svm_spec <-
    svm_linear() %>%
    set_engine("LiblineaR") %>%
    set_mode("regression")

svm_spec
## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR
ranger_wf <- workflow(xgboost_recipe, ranger_spec)
svm_wf <- workflow(xgboost_recipe, svm_spec)
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

svm_rs <- fit_resamples(
  svm_wf,
  resamples = data_cv,
  control = contrl_preds
)
## Warning: package 'LiblineaR' was built under R version 4.3.3
ranger_rs <- fit_resamples(
  ranger_wf,
  resamples = data_cv,
  control = contrl_preds
)
collect_metrics(svm_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1.12     25  0.0209 Preprocessor1_Model1
## 2 rsq     standard   0.134    25  0.0161 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1.06     25  0.0210 Preprocessor1_Model1
## 2 rsq     standard   0.116    25  0.0142 Preprocessor1_Model1
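To compare the two resampled models side by side (a sketch; output not shown):

bind_rows(
    collect_metrics(svm_rs) %>% mutate(model = "SVM"),
    collect_metrics(ranger_rs) %>% mutate(model = "random forest")
) %>%
    select(model, .metric, mean, std_err)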

# Evaluate and plot Random Forest and SVM predictions

bind_rows(
  collect_predictions(svm_rs) %>%
    mutate(mod = "SVM"), collect_predictions(ranger_rs) %>%
    mutate(mod = "ranger")) %>%
    
  ggplot(aes(like_count, .pred)) +
    geom_abline(lty = 2, color = "gray50") +
    geom_point(alpha = 0.3, color = "midnightblue") +
    facet_wrap(vars(mod)) +
    coord_fixed()

Evaluate xgboost

tune::show_best(xgboost_tune, metric=NULL)
## Warning in tune::show_best(xgboost_tune, metric = NULL): No value of `metric`
## was given; "rmse" will be used.
## # A tibble: 5 × 11
##   trees min_n tree_depth learn_rate sample_size .metric .estimator  mean     n
##   <int> <int>      <int>      <dbl>       <dbl> <chr>   <chr>      <dbl> <int>
## 1   559     6          9    0.00616       0.971 rmse    standard    1.10    25
## 2  1568    12          5    0.00141       0.886 rmse    standard    1.11    25
## 3  1235    18         14    0.00202       0.621 rmse    standard    1.11    25
## 4  1346    24         14    0.00716       0.252 rmse    standard    1.11    25
## 5   954    37          8    0.0112        0.337 rmse    standard    1.12    25
## # ℹ 2 more variables: std_err <dbl>, .config <chr>
# Update the model by selecting the best hyperparameters.
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                        tune::select_best(xgboost_tune, metric = "rmse"))

# Fit the model on the entire training data and test it on the test data.
data_fit <- tune::last_fit(xgboost_fw, data_split)

Plot xgboost

tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard      1.04   Preprocessor1_Model1
## 2 rsq     standard      0.0624 Preprocessor1_Model1
tune::collect_predictions(data_fit) %>%
    ggplot(aes(like_count, .pred)) +
    geom_point(alpha = 0.3, fill = "midnightblue") +
    geom_abline(lty = 2, color = "grey50") +
    coord_fixed()
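To see which features drive the boosted-tree predictions, the vip package (assumed to be installed; not used elsewhere in this post) can plot variable importance from the finalized xgboost fit:

library(vip)

data_fit %>%
    extract_workflow() %>%
    extract_fit_parsnip() %>%
    vip(num_features = 20)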

Finalize Models

final_fitted <- last_fit(ranger_wf, data_split)
collect_metrics(final_fitted)  
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.936 Preprocessor1_Model1
## 2 rsq     standard       0.165 Preprocessor1_Model1
final_wf <- extract_workflow(final_fitted)
predict(final_wf, data_test)
## # A tibble: 48 × 1
##    .pred
##    <dbl>
##  1  2.03
##  2  2.24
##  3  2.32
##  4  2.01
##  5  3.79
##  6  2.16
##  7  2.26
##  8  2.81
##  9  2.18
## 10  2.23
## # ℹ 38 more rows
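Since like_count was modeled as log10(like_count + 1), these predictions are on the log scale; a sketch to back-transform them to raw like counts:

predict(final_wf, data_test) %>%
    mutate(.pred_likes = 10^.pred - 1)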

Changes to improve the model

I made the following changes to my model: I included the columns funny, show_product_quickly, patriotic, celebrity, danger, animals, use_sex, and title. The characteristic columns were boolean values (TRUE/FALSE), so I converted them to numeric. I also included title as a predictor, tokenizing it with step_tokenize() and keeping the 50 most frequent tokens with step_tokenfilter(). Furthermore, I added learn_rate = tune() and sample_size = tune() as additional tuning parameters.

I also tried two other machine learning models, Random Forest and SVM. Random Forest performs slightly better than xgboost. However, it is only a very small improvement compared to last week's "Apply to your data".