Import Data

youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv')

## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl   (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl   (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm  (1): published_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

skimr::skim(youtube)

Data summary
Name	youtube
Number of rows	247
Number of columns	25
_______________________
Column type frequency:
character	10
logical	7
numeric	7
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
brand	0	1.00	3	9	10
superbowl_ads_dot_com_url	0	1.00	34	120	244
youtube_url	11	0.96	43	43	233
id	11	0.96	11	11	233
kind	16	0.94	13	13	1
etag	16	0.94	27	27	228
title	16	0.94	6	99	228
description	50	0.80	3	3527	194
thumbnail	129	0.48	48	48	118
channel_title	16	0.94	3	37	185

Variable type: logical

skim_variable	complete_rate	mean	count
funny	1	0.69	TRU: 171, FAL: 76
show_product_quickly	1	0.68	TRU: 169, FAL: 78
patriotic	1	0.17	FAL: 206, TRU: 41
celebrity	1	0.29	FAL: 176, TRU: 71
danger	1	0.30	FAL: 172, TRU: 75
animals	1	0.37	FAL: 155, TRU: 92
use_sex	1	0.27	FAL: 181, TRU: 66

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2010.19	5.86	2000	2005	2010	2015.00	2020	▇▇▇▇▆
view_count	16	0.94	1407556.46	11971111.01	10	6431	41379	170015.50	176373378	▇▁▁▁▁
like_count	22	0.91	4146.03	23920.40	0	19	130	527.00	275362	▇▁▁▁▁
dislike_count	22	0.91	833.54	6948.52	0	1	7	24.00	92990	▇▁▁▁▁
favorite_count	16	0.94	0.00	0.00	0	0	0	0.00	0	▁▁▇▁▁
comment_count	25	0.90	188.64	986.46	0	1	10	50.75	9190	▇▁▁▁▁
category_id	16	0.94	19.32	8.00	1	17	23	24.00	29	▃▁▂▆▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
published_at	16	0.94	2006-02-06 10:02:36	2021-01-27 13:11:29	2013-01-31 09:13:55	227

data <- youtube %>%
  
  # Treat Missing Values
  select(-thumbnail, -channel_title, -description, -etag, -category_id, -youtube_url, -kind) %>%
  na.omit() %>%

# log transform variables with pos-skewed distributions
mutate(like_count = log(like_count+1))

Explore Data

Identify Good Predictors

view count

youtube %>%
  ggplot(aes(like_count, view_count)) +
  scale_y_log10() +
  geom_point()

## Warning: Removed 22 rows containing missing values (`geom_point()`).

dislike count

youtube %>%
  ggplot(aes(like_count, as.factor(dislike_count))) +
  geom_point()

## Warning: Removed 22 rows containing missing values (`geom_point()`).

title

youtube %>%
  
  # tokenize title
  unnest_tokens(output = word, input = title) %>%
  
  # calculate avg view per like
  group_by(word) %>%
  summarise(like_count = mean(like_count),
            n     = n()) %>%
  ungroup() %>%
  
  filter(n > 5, !str_detect(word, "\\d")) %>%
  slice_max(order_by = like_count, n = 20) %>%
  
  # Plot
  ggplot(aes(like_count, fct_reorder(word, like_count))) +
  geom_point()

EDA Shortcuts

# Step 1: Prepare Data
data_binarized_tbl <- data %>%
  select(-title, -id, -published_at, -show_product_quickly, -year, -brand, -superbowl_ads_dot_com_url) %>%
  binarize()

data_binarized_tbl %>% glimpse()

## Rows: 219
## Columns: 28
## $ funny__0                                      <dbl> 0, 0, 1, 0, 0, 0, 1, 0, …
## $ funny__1                                      <dbl> 1, 1, 0, 1, 1, 1, 0, 1, …
## $ patriotic__0                                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__1                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__0                                  <dbl> 0, 1, 1, 1, 0, 0, 0, 0, …
## $ celebrity__1                                  <dbl> 1, 0, 0, 0, 1, 1, 1, 1, …
## $ danger__0                                     <dbl> 0, 0, 1, 0, 0, 1, 1, 1, …
## $ danger__1                                     <dbl> 1, 1, 0, 1, 1, 0, 0, 0, …
## $ animals__0                                    <dbl> 1, 0, 1, 0, 0, 0, 1, 0, …
## $ animals__1                                    <dbl> 0, 1, 0, 1, 1, 1, 0, 1, …
## $ use_sex__0                                    <dbl> 1, 1, 1, 0, 1, 1, 1, 1, …
## $ use_sex__1                                    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `view_count__-Inf_6577`                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ view_count__6577_41828                        <dbl> 0, 0, 0, 1, 1, 0, 1, 1, …
## $ view_count__41828_176014.5                    <dbl> 1, 1, 0, 0, 0, 0, 0, 0, …
## $ view_count__176014.5_Inf                      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `like_count__-Inf_2.97008562636022`           <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ like_count__2.97008562636022_4.87519732320115 <dbl> 0, 1, 0, 1, 1, 0, 1, 0, …
## $ like_count__4.87519732320115_6.17686255739442 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ like_count__6.17686255739442_Inf              <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ `dislike_count__-Inf_1`                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ dislike_count__1_7                            <dbl> 0, 0, 0, 1, 0, 0, 1, 1, …
## $ dislike_count__7_24                           <dbl> 1, 1, 0, 0, 1, 0, 0, 0, …
## $ dislike_count__24_Inf                         <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `comment_count__-Inf_1`                       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ comment_count__1_11                           <dbl> 0, 1, 0, 1, 0, 0, 1, 0, …
## $ comment_count__11_51.5                        <dbl> 1, 0, 0, 0, 1, 0, 0, 1, …
## $ comment_count__51.5_Inf                       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …

# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
  correlate(like_count__6.17686255739442_Inf)

data_corr_tbl

## # A tibble: 28 × 3
##    feature       bin                               correlation
##    <fct>         <chr>                                   <dbl>
##  1 like_count    6.17686255739442_Inf                    1    
##  2 comment_count 51.5_Inf                                0.806
##  3 view_count    176014.5_Inf                            0.733
##  4 dislike_count 24_Inf                                  0.695
##  5 comment_count -Inf_1                                 -0.372
##  6 dislike_count -Inf_1                                 -0.352
##  7 dislike_count 1_7                                    -0.348
##  8 view_count    -Inf_6577                              -0.335
##  9 like_count    -Inf_2.97008562636022                  -0.335
## 10 like_count    2.97008562636022_4.87519732320115      -0.335
## # ℹ 18 more rows

# Step 3: Plot
data_corr_tbl %>%
  plot_correlation_funnel()

Build Models

# data <- sample_n(data, 100)

# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [147/17]> Fold01
##  2 <split [147/17]> Fold02
##  3 <split [147/17]> Fold03
##  4 <split [147/17]> Fold04
##  5 <split [148/16]> Fold05
##  6 <split [148/16]> Fold06
##  7 <split [148/16]> Fold07
##  8 <split [148/16]> Fold08
##  9 <split [148/16]> Fold09
## 10 <split [148/16]> Fold10

library(usemodels)
usemodels::use_xgboost(like_count ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(81602)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify Recipe
xgboost_recipe <- 
    recipe(formula = like_count ~ ., data = data_train) %>% 
    recipes::update_role(id, new_role = "id variable") %>%
    step_select(-animals, -funny, -danger, -use_sex, -patriotic, -celebrity, -show_product_quickly,          -year, -brand, -superbowl_ads_dot_com_url) %>%
    step_tokenize(title) %>% 
    step_tokenfilter(title, max_tokens = 100) %>% 
    step_tfidf(title) %>% 
    step_date(published_at, keep_original_cols = FALSE) %>% 
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% 
    step_YeoJohnson(view_count, dislike_count, comment_count)

xgboost_recipe %>% prep() %>% juice() %>% glimpse()

## Rows: 164
## Columns: 126
## $ id                      <fct> sl8ooTIMk2w, HtBZvl7dIu4, ecqiZn2DDFQ, yQ_nU0_…
## $ view_count              <dbl> 12.705674, 20.686035, 16.109205, 12.869695, 11…
## $ dislike_count           <dbl> 2.1099882, 4.1906048, 3.0446622, 1.6019271, 0.…
## $ favorite_count          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ comment_count           <dbl> 3.4294870, 5.5455775, 3.8674433, 1.9313973, 0.…
## $ like_count              <dbl> 6.523562, 10.787751, 7.233455, 4.779123, 2.944…
## $ tfidf_title_2001        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2005        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_2006        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2007        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2008        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2009        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2010        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2011        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2012        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2013        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2014        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2015        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2016        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2017        <dbl> 0.0000000, 0.7364734, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2018        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2019        <dbl> 0.8801152, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2020        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_44          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_a           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_ad          <dbl> 0.5913209, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_ads         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_and         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_baby        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_best        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_big         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_black       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_body        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bowl        <dbl> 0.0000000, 0.2011376, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_britney     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bud         <dbl> 0.0000000, 0.0000000, 0.3384774, 0.5641290, 0.…
## $ tfidf_title_budweiser   <dbl> 0.0000000, 0.3065021, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_camry       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_car         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_cedric      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cindy       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_clydesdale  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_coca        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_coke        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cola        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_commercial  <dbl> 0.0000000, 0.1705648, 0.2046778, 0.3411296, 0.…
## $ tfidf_title_commercials <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cool        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_crash       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_crown       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_date        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_diet        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_dilly       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_dog         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_dogs        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_doritos     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_down        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_elantra     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_etrade      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_exclusive   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_factory     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_fantasy     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_featuring   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_flavor      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_fly         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_for         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_ft          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_funny       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_game        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_genesis     <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_girlfriend  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_great       <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 2.20942, 0…
## $ tfidf_title_halftime    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_happiness   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_hd          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_horse       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hyundai     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_in          <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0…
## $ tfidf_title_island      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_jackie      <dbl> 0.0000000, 0.0000000, 0.8837681, 0.0000000, 0.…
## $ tfidf_title_journey     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_kia         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_legends     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_light       <dbl> 0.0000000, 0.0000000, 0.3573392, 0.5955654, 0.…
## $ tfidf_title_lighta      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_love        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_max         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_meter       <dbl> 1.10471, 0.00000, 0.00000, 0.00000, 0.00000, 0…
## $ tfidf_title_new         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_nfl         <dbl> 0.7989384, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_of          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_official    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_on          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_pepsi       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_spot        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_super       <dbl> 0.0000000, 0.2011376, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_superbowl   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_the         <dbl> 0.0000000, 0.3557840, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_toyota      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_tv          <dbl> 0.0000000, 0.0000000, 0.7040922, 0.0000000, 0.…
## $ tfidf_title_vs          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_winner      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_with        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_xliii       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_xliv        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_you         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_year       <int> 2019, 2017, 2008, 2007, 2010, 2009, 2014, 2008…
## $ published_at_dow_Sun    <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0…
## $ published_at_dow_Mon    <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
## $ published_at_dow_Tue    <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_dow_Wed    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_dow_Thu    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0…
## $ published_at_dow_Fri    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ published_at_dow_Sat    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ published_at_month_Jan  <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Feb  <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1…
## $ published_at_month_Mar  <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Apr  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_May  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Jun  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Jul  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Aug  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Sep  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Oct  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0…
## $ published_at_month_Nov  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ published_at_month_Dec  <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…

# Specify Model
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

# Combine Recipe and Model using workflow
xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

# Tune hyperparameters
set.seed(93275)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv,
            grid      = 5)

## i Creating pre-processing data to finalize unknown parameter: mtry

Evaluate Models

tune::show_best(xgboost_tune, metric = "rmse")

## # A tibble: 5 × 10
##    mtry trees min_n learn_rate .metric .estimator  mean     n std_err .config   
##   <int> <int> <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>     
## 1    91   605     2    0.0359  rmse    standard   0.625    10  0.0363 Preproces…
## 2    32  1109    10    0.213   rmse    standard   0.774    10  0.0557 Preproces…
## 3   112  1306    19    0.00328 rmse    standard   0.811    10  0.0737 Preproces…
## 4    73  1774    37    0.0195  rmse    standard   1.02     10  0.0878 Preproces…
## 5    21   183    28    0.00203 rmse    standard   3.75     10  0.196  Preproces…

# update the model by selecting the best hyperparameters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                        tune::select_best(xgboost_tune, metric = "rmse"))


# Fit the model on the entire training data and test it on the test data set.
data_fit <- tune::last_fit(xgboost_fw, data_split)

tune::collect_metrics(data_fit)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.576 Preprocessor1_Model1
## 2 rsq     standard       0.922 Preprocessor1_Model1

tune::collect_predictions(data_fit) %>%
  ggplot(aes(like_count, .pred)) +
  geom_point(alpha = 0.4, fill = "midnightblue") +
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()

rf_spec <-
  rand_forest(trees = 500) %>%
  set_mode("regression")

rf_spec

## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 500
## 
## Computational engine: ranger

svm_spec <-
  svm_linear() %>%
  set_mode("regression")

svm_spec

## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR

svm_wf <- workflow(xgboost_recipe, svm_spec)
rf_wf <- workflow(xgboost_recipe, rf_spec)

doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

svm_rs <- fit_resamples(
  svm_wf,
  resamples = data_cv,
  control = contrl_preds
)

ranger_rs <- fit_resamples(
  rf_wf,
  resamples = data_cv,
  control = contrl_preds
)

collect_metrics(svm_rs)

## # A tibble: 2 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   2.62      10  0.168  Preprocessor1_Model1
## 2 rsq     standard   0.0628    10  0.0199 Preprocessor1_Model1

collect_metrics(ranger_rs)

## # A tibble: 2 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   0.952    10  0.0896 Preprocessor1_Model1
## 2 rsq     standard   0.897    10  0.0144 Preprocessor1_Model1

Apply To Your Data 4

Jason Zink

2024-02-29

Import Data

Explore Data

Build Models

Evaluate Models