# Load the data with tidytuesdayR (either the date or the year/week form works)
tuesdata <- tidytuesdayR::tt_load('2021-03-02')
# tuesdata <- tidytuesdayR::tt_load(2021, week = 10)
youtube <- tuesdata$youtube

# Or read the csv file directly
youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- youtube %>%
  # Drop columns we won't model on
  select(-thumbnail, -ends_with("url"), -etag, -kind, -favorite_count,
         -published_at, -view_count, -comment_count, -description, -dislike_count) %>%
  # Remove rows with missing values
  na.omit() %>%
  # Log-transform the target variable
  mutate(like_count = log(like_count + 1)) %>%
  # category_id is a factor
  mutate(category_id = as.factor(category_id)) %>%
  # Convert logicals to factors
  mutate(across(where(is.logical), factor)) %>%
  # Convert selected characters to factors
  mutate(across(c(id, channel_title, brand), factor))
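As a quick sanity check on the log-transformed target (a minimal sketch; ggplot2 is assumed to already be loaded via the tidyverse):
# Distribution of log(like_count + 1) after the transformation above
data %>%
  ggplot(aes(like_count)) +
  geom_histogram(bins = 30)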
# Step 1: Prepare Data
library(lubridate)
library(correlationfunnel)   # binarize(), correlate(), plot_correlation_funnel()

data_binarized_tbl <- data %>%
  select(-id, -title) %>%
  binarize()
data_binarized_tbl %>% glimpse()
## Rows: 225
## Columns: 54
## $ `year__-Inf_2005` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.99573227355399` <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.99573227355399_4.87519732320115 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.87519732320115_6.26909628370626 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26909628370626_Inf <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ channel_title__BudBowlXLII <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `channel_title__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ channel_title__Funny_Commercials <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ channel_title__John_Keehler <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ channel_title__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__omon007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__reggiep08v2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__The_Hall_of_Advertising <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__USA_TODAY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__World_Hyundai_Matteson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `channel_title__-OTHER` <dbl> 0, 1, 0, 1, 1, 0, 0, 1, …
## $ category_id__1 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
  correlate(like_count__6.26909628370626_Inf)

data_corr_tbl
## # A tibble: 54 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 6.26909628370626_Inf 1
## 2 like_count -Inf_2.99573227355399 -0.339
## 3 like_count 4.87519732320115_6.26909628370626 -0.331
## 4 like_count 2.99573227355399_4.87519732320115 -0.327
## 5 brand Doritos 0.281
## 6 channel_title NFL 0.262
## 7 brand NFL 0.250
## 8 brand Bud_Light -0.212
## 9 year 2015_Inf 0.202
## 10 channel_title Coca-Cola 0.202
## # ℹ 44 more rows
# Step 3: Plot
data_corr_tbl %>%
  plot_correlation_funnel()
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
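The warning just means ggrepel dropped some feature labels to avoid overplotting. If the funnel is the default static ggplot and the package relies on ggrepel's default overlap limit, raising that limit globally is one way to label every point (an optional tweak, not part of the original workflow):
# Let ggrepel attempt to place every label, at the cost of a busier plot
options(ggrepel.max.overlaps = Inf)
data_corr_tbl %>%
  plot_correlation_funnel()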
Split Data
# Split into training and test sets
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Resample the training set for 10-fold cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [151/17]> Fold01
## 2 <split [151/17]> Fold02
## 3 <split [151/17]> Fold03
## 4 <split [151/17]> Fold04
## 5 <split [151/17]> Fold05
## 6 <split [151/17]> Fold06
## 7 <split [151/17]> Fold07
## 8 <split [151/17]> Fold08
## 9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
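Ten folds times ten candidate configurations means a fair number of model fits during tuning below. Registering a parallel backend first can speed this up (an optional sketch, assuming the doParallel package is installed; newer tune versions prefer the future framework instead):
# Optional: let tuning use multiple cores
library(doParallel)
registerDoParallel(cores = parallel::detectCores() - 1)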
usemodels::use_xgboost(like_count ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = like_count ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(18995)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)   # step_tokenize(), step_tokenfilter(), step_tf()

xgboost_recipe <-
  recipe(formula = like_count ~ ., data = data_train) %>%
  # Keep the video id around without treating it as a predictor
  recipes::update_role(id, new_role = "id variable") %>%
  # Turn the ad title into term-frequency features for the 50 most common tokens
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 50) %>%
  step_tf(title) %>%
  # Lump rare channels into an "other" level
  step_other(channel_title) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_zv(all_predictors())

xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 168
## Columns: 92
## $ year <dbl> 2013, 2015, 2008, 2010, 2009, 2007, 2010, …
## $ id <fct> WTf0XGpINJI, 7_EfXuGev24, 2_LWZe2BGaE, 6cM…
## $ like_count <dbl> 3.1354942, 0.0000000, 3.6109179, 6.0038871…
## $ tf_title_2001 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2005 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2010 <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2012 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2013 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ tf_title_2014 <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2015 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2018 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ tf_title_2019 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2020 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_a <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_ad <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, …
## $ tf_title_big <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ tf_title_bowl <int> 1, 0, 1, 1, 1, 0, 2, 1, 0, 0, 0, 1, 0, 0, …
## $ tf_title_bud <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_budweiser <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ tf_title_cedric <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_coca <int> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ tf_title_coke <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ tf_title_cola <int> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ tf_title_commercial <int> 1, 1, 1, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0, …
## $ tf_title_crash <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_diet <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_dog <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_doritos <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_e <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_etrade <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_extended <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, …
## $ tf_title_funny <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_game <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ tf_title_hd <int> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ tf_title_hyundai <int> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ tf_title_kia <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ tf_title_light <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_new <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_nfl <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ tf_title_official <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ tf_title_pepsi <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_spot <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_super <int> 1, 0, 1, 1, 1, 0, 2, 1, 0, 0, 0, 1, 0, 0, …
## $ tf_title_superbowl <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_the <int> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ tf_title_toyota <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_trade <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_tv <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_usa <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_version <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_vs <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ tf_title_winner <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_with <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ brand_Bud.Light <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Budweiser <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ brand_Coca.Cola <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ brand_Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_E.Trade <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ brand_Hynudai <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ brand_Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ brand_NFL <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand_Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Toyota <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ funny_FALSE. <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, …
## $ funny_TRUE. <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ show_product_quickly_FALSE. <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ show_product_quickly_TRUE. <dbl> 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ patriotic_FALSE. <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ patriotic_TRUE. <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ celebrity_FALSE. <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, …
## $ celebrity_TRUE. <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ danger_FALSE. <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, …
## $ danger_TRUE. <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, …
## $ animals_FALSE. <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, …
## $ animals_TRUE. <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, …
## $ use_sex_FALSE. <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex_TRUE. <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ channel_title_omon007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ category_id_X1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id_X10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X17 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id_X19 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X22 <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ category_id_X23 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X24 <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ category_id_X25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X26 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X27 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X29 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Tunable xgboost specification (regression, unlike the classification template above)
xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

xgboost_workflow <-
  workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_spec)

# Tune over the cross-validation folds with 10 candidate parameter sets
set.seed(30220)
xgboost_tune <-
  tune_grid(xgboost_workflow, resamples = data_cv, grid = 10)
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
## standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations   A: x10
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 445 2 4 0.00681 1.90e- 9 0.5 rmse
## 2 889 18 1 0.0245 3.16e+ 1 0.8 rmse
## 3 223 35 8 0.0880 8.80e- 2 0.2 rmse
## 4 1111 23 15 0.0129 1 e-10 0.3 rmse
## 5 2000 10 5 0.0464 2.45e- 4 0.1 rmse
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
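Before finalizing, it can help to see how each hyperparameter relates to RMSE across the grid; tune provides an autoplot method for this (a quick sketch):
autoplot(xgboost_tune, metric = "rmse")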
# Finalize the workflow with the best hyperparameters (by RMSE)
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                                      tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the final model on the full training set and evaluate it on the test set
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 2.17 Preprocessor1_Model1
## 2 rsq standard 0.257 Preprocessor1_Model1
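The test-set RMSE is on the log(like_count + 1) scale and the R-squared is modest, so it is worth checking which features the model actually leans on. One option (a sketch, assuming the vip package is installed and that extract_fit_parsnip() is available for last_fit() results in your tune version) is:
# Variable importance for the final xgboost fit
library(vip)
data_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 15)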
# Predicted vs. observed log like counts on the test set
tune::collect_predictions(data_fit) %>%
  ggplot(aes(like_count, .pred)) +
  geom_point(alpha = 0.3, color = "gold") +
  geom_abline(lty = 2, color = "red") +
  coord_fixed()