Import Data

youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')

skimr::skim(youtube)
Data summary
Name youtube
Number of rows 247
Number of columns 25
_______________________
Column type frequency:
character 10
logical 7
numeric 7
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
brand 0 1.00 3 9 0 10 0
superbowl_ads_dot_com_url 0 1.00 34 120 0 244 0
youtube_url 11 0.96 43 43 0 233 0
id 11 0.96 11 11 0 233 0
kind 16 0.94 13 13 0 1 0
etag 16 0.94 27 27 0 228 0
title 16 0.94 6 99 0 228 0
description 50 0.80 3 3527 0 194 0
thumbnail 129 0.48 48 48 0 118 0
channel_title 16 0.94 3 37 0 185 0

Variable type: logical

skim_variable n_missing complete_rate mean count
funny 0 1 0.69 TRU: 171, FAL: 76
show_product_quickly 0 1 0.68 TRU: 169, FAL: 78
patriotic 0 1 0.17 FAL: 206, TRU: 41
celebrity 0 1 0.29 FAL: 176, TRU: 71
danger 0 1 0.30 FAL: 172, TRU: 75
animals 0 1 0.37 FAL: 155, TRU: 92
use_sex 0 1 0.27 FAL: 181, TRU: 66

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2010.19 5.86 2000 2005 2010 2015.00 2020 ▇▇▇▇▆
view_count 16 0.94 1407556.46 11971111.01 10 6431 41379 170015.50 176373378 ▇▁▁▁▁
like_count 22 0.91 4146.03 23920.40 0 19 130 527.00 275362 ▇▁▁▁▁
dislike_count 22 0.91 833.54 6948.52 0 1 7 24.00 92990 ▇▁▁▁▁
favorite_count 16 0.94 0.00 0.00 0 0 0 0.00 0 ▁▁▇▁▁
comment_count 25 0.90 188.64 986.46 0 1 10 50.75 9190 ▇▁▁▁▁
category_id 16 0.94 19.32 8.00 1 17 23 24.00 29 ▃▁▂▆▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
published_at 16 0.94 2006-02-06 10:02:36 2021-01-27 13:11:29 2013-01-31 09:13:55 227
data <- youtube %>% 
    
    # Treat missing values
    select(-superbowl_ads_dot_com_url, -youtube_url, -kind, -etag, -channel_title, -comment_count, -thumbnail, -published_at, -description, -dislike_count, -favorite_count, -view_count) %>% 
    na.omit() %>% 
    mutate(like_count = log(like_count + 1)) %>% 
    mutate(category_id = as.factor(category_id)) %>% 
    mutate(across(where(is.character),as.factor)) %>%
    mutate(title = as.character(title)) %>% 
    mutate(across(where(is.logical),as.factor))

Explore Data

Identify good predictors.

celebrity

data %>% 
    ggplot(aes(like_count, funny)) + 
    geom_point()

Brand

data %>% 
    
    # tokenize title
    unnest_tokens(output = word, input = title) %>%
    
    # calculate avg rent per word
    group_by(word) %>%
    summarise(like_count = mean(like_count),
              n          = n()) %>%
    ungroup() %>%
    
    # Plot 
    ggplot(aes(like_count, fct_reorder(word, like_count))) + 
    geom_point() + 
    
    labs(y = "Brands")

EDA Shortcut

# Step 1: Prepare Data
data_binarized_tbl <- data %>% 
    select(-id, -title) %>% 
    binarize()

data_binarized_tbl %>% glimpse()
## Rows: 225
## Columns: 43
## $ `year__-Inf_2005`                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf                                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light                              <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola`                            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai                                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota                                 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE                                  <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE                                   <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE                   <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE                    <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE                              <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE                               <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE                                 <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE                                  <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE                                <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE                                 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE                                <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.99573227355399`           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.99573227355399_4.87519732320115 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.87519732320115_6.26909628370626 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26909628370626_Inf              <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id__1                                <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2                                <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22                               <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24                               <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27                               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>% 
    correlate(like_count__6.26909628370626_Inf)

data_corr_tbl
## # A tibble: 43 × 3
##    feature     bin                               correlation
##    <fct>       <chr>                                   <dbl>
##  1 like_count  6.26909628370626_Inf                    1    
##  2 like_count  -Inf_2.99573227355399                  -0.339
##  3 like_count  4.87519732320115_6.26909628370626      -0.331
##  4 like_count  2.99573227355399_4.87519732320115      -0.327
##  5 brand       Doritos                                 0.281
##  6 brand       NFL                                     0.250
##  7 brand       Bud_Light                              -0.212
##  8 year        2015_Inf                                0.202
##  9 year        -Inf_2005                              -0.193
## 10 category_id 1                                       0.133
## # ℹ 33 more rows
# Step 3: Plot
data_corr_tbl %>%
    plot_correlation_funnel()
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

Split Data

# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Further split training dataset for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [151/17]> Fold01
##  2 <split [151/17]> Fold02
##  3 <split [151/17]> Fold03
##  4 <split [151/17]> Fold04
##  5 <split [151/17]> Fold05
##  6 <split [151/17]> Fold06
##  7 <split [151/17]> Fold07
##  8 <split [151/17]> Fold08
##  9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
library(usemodels)
usemodels::use_xgboost(like_count ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(18995)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify Recipe 
xgboost_recipe <- recipe(formula = like_count ~ ., data = data_train) %>% 
  recipes::update_role(id, new_role = "id variables") %>% 
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 50) %>% 
    step_tfidf(title) %>%
    step_dummy(all_nominal_predictors(),one_hot = TRUE) %>%
    step_zv(all_predictors()) 
  

xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 168
## Columns: 90
## $ year                        <dbl> 2013, 2015, 2008, 2010, 2009, 2007, 2010, …
## $ id                          <fct> WTf0XGpINJI, 7_EfXuGev24, 2_LWZe2BGaE, 6cM…
## $ like_count                  <dbl> 3.1354942, 0.0000000, 3.6109179, 6.0038871…
## $ tfidf_title_2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2010            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2013            <dbl> 0.6437752, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2014            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2018            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2020            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_a               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_ad              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_big             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_bowl            <dbl> 0.2488648, 0.0000000, 0.2488648, 0.3110810…
## $ tfidf_title_bud             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_budweiser       <dbl> 0.0000000, 0.6812715, 0.0000000, 0.0000000…
## $ tfidf_title_cedric          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_coca            <dbl> 0.0000000, 0.0000000, 0.5129899, 0.0000000…
## $ tfidf_title_coke            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_cola            <dbl> 0.0000000, 0.0000000, 0.5002872, 0.0000000…
## $ tfidf_title_commercial      <dbl> 0.2135681, 0.3559469, 0.2135681, 0.0000000…
## $ tfidf_title_crash           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_diet            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_dog             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_doritos         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_e               <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.0000…
## $ tfidf_title_etrade          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_extended        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_funny           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_game            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_hd              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_hyundai         <dbl> 0.5002872, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_kia             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_light           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_new             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_nfl             <dbl> 0.000000, 0.000000, 0.000000, 0.841824, 0.…
## $ tfidf_title_official        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_pepsi           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_spot            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_super           <dbl> 0.2488648, 0.0000000, 0.2488648, 0.3110810…
## $ tfidf_title_superbowl       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_the             <dbl> 0.0000000, 0.6931472, 0.0000000, 0.5198604…
## $ tfidf_title_toyota          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_trade           <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.0000…
## $ tfidf_title_tv              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_usa             <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.…
## $ tfidf_title_version         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_vs              <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.…
## $ tfidf_title_winner          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_with            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ brand_Bud.Light             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Budweiser             <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ brand_Coca.Cola             <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ brand_Doritos               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_E.Trade               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ brand_Hynudai               <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ brand_Kia                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ brand_NFL                   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand_Pepsi                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Toyota                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ funny_FALSE.                <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, …
## $ funny_TRUE.                 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ show_product_quickly_FALSE. <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ show_product_quickly_TRUE.  <dbl> 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ patriotic_FALSE.            <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ patriotic_TRUE.             <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ celebrity_FALSE.            <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, …
## $ celebrity_TRUE.             <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ danger_FALSE.               <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, …
## $ danger_TRUE.                <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, …
## $ animals_FALSE.              <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, …
## $ animals_TRUE.               <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, …
## $ use_sex_FALSE.              <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex_TRUE.               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ category_id_X1              <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id_X2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id_X10             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X15             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X17             <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id_X19             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X22             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ category_id_X23             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X24             <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ category_id_X25             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X26             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X27             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X29             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Specify Model
xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
    loss_reduction = tune(), sample_size = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

# Combine recipe and model using workflow
xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

# Tune hyperparameters
set.seed(89984)
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_cv, 
            grid = 10)
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1  1022    38          3    0.137     0.00320            0.242 rmse   
## 2  1839    30          9    0.0112    0.0000000169       0.165 rmse   
## 3   991    25          1    0.00202   0.0000692          0.937 rmse   
## 4  1581    21          5    0.00924   0.0000000485       0.459 rmse   
## 5   631    15         12    0.0263    0.187              0.749 rmse   
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>
# Update the model by selecting the best hyperparameters.
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                        tune::select_best(xgboost_tune, metric = "rmse"))

# Fit the model on the entire training data and test it on the test data.
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard        2.50 Preprocessor1_Model1
## 2 rsq     standard       NA    Preprocessor1_Model1
tune::collect_predictions(data_fit) %>% 
    ggplot(aes(like_count, .pred)) + 
    geom_point(alpha = 0.3, fill = "midnightblue") + 
    geom_abline(lty = 2, color = "gray50") + 
    coord_fixed()

There were a few different things that I played with when trying to improve my workflow, first I changed tfidf(title) to tfidf(description), and when looking through I couldn’t really notice anything that changed throughout. The next thing I did was reduce the max tokens from 100 to 50, and I noticed that the ammount of rows stayed the same at 168 but the number of columns changed from 140 to 90, which would make sense as I reduced the max tokens by 50. The next thing that I did was changing the grid by improving it from =5 to =10, this ended up doing a few things. It first made it so I wasn’t able to see the estimate for the rsq metric where before it was 0.126 and now is listed as N/A, and the rmse metric changing from 2.36 to 2.5. Along with this the numbers all ended up changing from their original values and the mean category being the most surprising to me as before it varied from 2.54-4.32 and now all of the mean values fall between 2.51-2.57. Addistionally the graph changed where as before it clearly labeled the trend of the values in correlation to the line plot, where as now its a flat line with no values on the Y axis and everything clumped together on the X axis making it extremely difficult to read.