Import Data

# Read the TidyTuesday (2021-03-02) Super Bowl ads data straight from GitHub.
youtube <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv"
)

# Summarise every column (type, missingness, distribution) before cleaning.
skimr::skim(youtube)

Data summary
Name	youtube
Number of rows	247
Number of columns	25
_______________________
Column type frequency:
character	10
logical	7
numeric	7
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
brand	0	1.00	3	9	10
superbowl_ads_dot_com_url	0	1.00	34	120	244
youtube_url	11	0.96	43	43	233
id	11	0.96	11	11	233
kind	16	0.94	13	13	1
etag	16	0.94	27	27	228
title	16	0.94	6	99	228
description	50	0.80	3	3527	194
thumbnail	129	0.48	48	48	118
channel_title	16	0.94	3	37	185

Variable type: logical

skim_variable	complete_rate	mean	count
funny	1	0.69	TRU: 171, FAL: 76
show_product_quickly	1	0.68	TRU: 169, FAL: 78
patriotic	1	0.17	FAL: 206, TRU: 41
celebrity	1	0.29	FAL: 176, TRU: 71
danger	1	0.30	FAL: 172, TRU: 75
animals	1	0.37	FAL: 155, TRU: 92
use_sex	1	0.27	FAL: 181, TRU: 66

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2010.19	5.86	2000	2005	2010	2015.00	2020	▇▇▇▇▆
view_count	16	0.94	1407556.46	11971111.01	10	6431	41379	170015.50	176373378	▇▁▁▁▁
like_count	22	0.91	4146.03	23920.40	0	19	130	527.00	275362	▇▁▁▁▁
dislike_count	22	0.91	833.54	6948.52	0	1	7	24.00	92990	▇▁▁▁▁
favorite_count	16	0.94	0.00	0.00	0	0	0	0.00	0	▁▁▇▁▁
comment_count	25	0.90	188.64	986.46	0	1	10	50.75	9190	▇▁▁▁▁
category_id	16	0.94	19.32	8.00	1	17	23	24.00	29	▃▁▂▆▇

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
published_at	16	0.94	2006-02-06 10:02:36	2021-01-27 13:11:29	2013-01-31 09:13:55	227

# Build the modelling table: drop unused columns, remove incomplete rows,
# and set the column types used in the rest of the analysis.
data <- youtube %>%
  # Drop URL/metadata columns and the other engagement counts.
  select(
    -superbowl_ads_dot_com_url, -youtube_url, -kind, -etag, -channel_title,
    -comment_count, -thumbnail, -published_at, -description, -dislike_count,
    -favorite_count, -view_count
  ) %>%
  # Keep only rows that are complete in the remaining columns.
  na.omit() %>%
  mutate(
    # Adding 1 before the log keeps ads with zero likes finite.
    like_count = log(like_count + 1),
    category_id = as.factor(category_id)
  ) %>%
  # All character columns become factors ...
  mutate(across(where(is.character), as.factor)) %>%
  # ... except `title`, which must stay text so it can be tokenized later.
  mutate(title = as.character(title)) %>%
  mutate(across(where(is.logical), as.factor))

Explore Data

Identify good predictors.

funny

# Scatter plot of the log like count (x) against the `funny` flag (y).
ggplot(data, aes(x = like_count, y = funny)) +
  geom_point()

Title words

data %>%
  # Tokenize the title: one row per word per ad.
  unnest_tokens(output = word, input = title) %>%
  # Mean log like count per word, plus the number of rows the word occurs in.
  group_by(word) %>%
  summarise(
    like_count = mean(like_count),
    n = n()
  ) %>%
  ungroup() %>%
  # Plot the words ordered by their mean log like count.
  ggplot(aes(like_count, fct_reorder(word, like_count))) +
  geom_point() +
  # The y axis shows tokenized title words, not brands.
  labs(y = "Title word")

EDA Shortcut

# Step 1: Prepare Data
# Drop the identifier and the free-text title, then binarize what is left
# into 0/1 indicator columns.
data_binarized_tbl <- binarize(select(data, -id, -title))

glimpse(data_binarized_tbl)

## Rows: 225
## Columns: 43
## $ `year__-Inf_2005`                             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf                                <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light                              <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola`                            <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai                                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota                                 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE                                  <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE                                   <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE                   <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE                    <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE                              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE                              <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE                               <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE                                 <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE                                  <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE                                <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE                                 <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE                                <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE                                 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.99573227355399`           <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.99573227355399_4.87519732320115 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.87519732320115_6.26909628370626 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26909628370626_Inf              <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id__1                                <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2                                <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17                               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22                               <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24                               <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27                               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER`                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …

# Step 2: Correlate
# Correlate every indicator column with the highest like_count bin.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  target = like_count__6.26909628370626_Inf
)

data_corr_tbl

## # A tibble: 43 × 3
##    feature     bin                               correlation
##    <fct>       <chr>                                   <dbl>
##  1 like_count  6.26909628370626_Inf                    1    
##  2 like_count  -Inf_2.99573227355399                  -0.339
##  3 like_count  4.87519732320115_6.26909628370626      -0.331
##  4 like_count  2.99573227355399_4.87519732320115      -0.327
##  5 brand       Doritos                                 0.281
##  6 brand       NFL                                     0.250
##  7 brand       Bud_Light                              -0.212
##  8 year        2015_Inf                                0.202
##  9 year        -Inf_2005                              -0.193
## 10 category_id 1                                       0.133
## # ℹ 33 more rows

# Step 3: Plot
# Draw the correlation funnel for the table computed in step 2.
plot_correlation_funnel(data_corr_tbl)

## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Build Models

Split Data

# Hold out a test set using the initial_split() defaults; the seed makes the
# split reproducible.
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

# Resample the training set into cross-validation folds (separate seed).
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv

## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [151/17]> Fold01
##  2 <split [151/17]> Fold02
##  3 <split [151/17]> Fold03
##  4 <split [151/17]> Fold04
##  5 <split [151/17]> Fold05
##  6 <split [151/17]> Fold06
##  7 <split [151/17]> Fold07
##  8 <split [151/17]> Fold08
##  9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10

# Print boilerplate xgboost recipe/spec/workflow code for this formula; the
# printed scaffold is adapted by hand further down.
library(usemodels)
use_xgboost(formula = like_count ~ ., data = data_train)

## xgboost_recipe <- 
##   recipe(formula = like_count ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(18995)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

# Specify Recipe
xgboost_recipe <- recipe(formula = like_count ~ ., data = data_train) %>%
  # Keep `id` in the data but give it a non-predictor role.
  recipes::update_role(id, new_role = "id variables") %>%
  # Turn the title text into tf-idf features, keeping at most 50 tokens.
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 50) %>%
  step_tfidf(title) %>%
  # One-hot encode the nominal predictors.
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Remove zero-variance predictors.
  step_zv(all_predictors())

# Inspect the preprocessed training data. bake(new_data = NULL) is the
# current replacement for the superseded juice().
xgboost_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()

## Rows: 168
## Columns: 90
## $ year                        <dbl> 2013, 2015, 2008, 2010, 2009, 2007, 2010, …
## $ id                          <fct> WTf0XGpINJI, 7_EfXuGev24, 2_LWZe2BGaE, 6cM…
## $ like_count                  <dbl> 3.1354942, 0.0000000, 3.6109179, 6.0038871…
## $ tfidf_title_2001            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2005            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2010            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2012            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2013            <dbl> 0.6437752, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2014            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2015            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2018            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2019            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2020            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_a               <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_ad              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_big             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_bowl            <dbl> 0.2488648, 0.0000000, 0.2488648, 0.3110810…
## $ tfidf_title_bud             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_budweiser       <dbl> 0.0000000, 0.6812715, 0.0000000, 0.0000000…
## $ tfidf_title_cedric          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_coca            <dbl> 0.0000000, 0.0000000, 0.5129899, 0.0000000…
## $ tfidf_title_coke            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_cola            <dbl> 0.0000000, 0.0000000, 0.5002872, 0.0000000…
## $ tfidf_title_commercial      <dbl> 0.2135681, 0.3559469, 0.2135681, 0.0000000…
## $ tfidf_title_crash           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_diet            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_dog             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_doritos         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_e               <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.0000…
## $ tfidf_title_etrade          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_extended        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_funny           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_game            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_hd              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_hyundai         <dbl> 0.5002872, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_kia             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_light           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_new             <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_nfl             <dbl> 0.000000, 0.000000, 0.000000, 0.841824, 0.…
## $ tfidf_title_official        <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_pepsi           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_spot            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_super           <dbl> 0.2488648, 0.0000000, 0.2488648, 0.3110810…
## $ tfidf_title_superbowl       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_the             <dbl> 0.0000000, 0.6931472, 0.0000000, 0.5198604…
## $ tfidf_title_toyota          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_trade           <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.0000…
## $ tfidf_title_tv              <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_usa             <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.…
## $ tfidf_title_version         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_vs              <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.…
## $ tfidf_title_winner          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_with            <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ brand_Bud.Light             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Budweiser             <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ brand_Coca.Cola             <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ brand_Doritos               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_E.Trade               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ brand_Hynudai               <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ brand_Kia                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ brand_NFL                   <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand_Pepsi                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Toyota                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ funny_FALSE.                <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, …
## $ funny_TRUE.                 <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ show_product_quickly_FALSE. <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ show_product_quickly_TRUE.  <dbl> 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ patriotic_FALSE.            <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ patriotic_TRUE.             <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ celebrity_FALSE.            <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, …
## $ celebrity_TRUE.             <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ danger_FALSE.               <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, …
## $ danger_TRUE.                <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, …
## $ animals_FALSE.              <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, …
## $ animals_TRUE.               <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, …
## $ use_sex_FALSE.              <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex_TRUE.               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ category_id_X1              <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id_X2              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id_X10             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X15             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X17             <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id_X19             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X22             <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ category_id_X23             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X24             <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ category_id_X25             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X26             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X27             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X29             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

# Specify Model
# Boosted-tree regression on the xgboost engine; every listed hyperparameter
# is marked with tune() so it is chosen during tuning.
xgboost_spec <- boost_tree(
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune()
) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Bundle the preprocessing recipe and the model spec into one workflow.
xgboost_workflow <- add_model(
  add_recipe(workflow(), xgboost_recipe),
  xgboost_spec
)

# Tune hyperparameters: evaluate 10 candidate parameter sets on the
# cross-validation folds. The seed is set immediately before tuning.
set.seed(89984)
xgboost_tune <- tune_grid(
  object = xgboost_workflow,
  resamples = data_cv,
  grid = 10
)

# Show the best candidates ranked by RMSE.
xgboost_tune %>%
  tune::show_best(metric = "rmse")

## # A tibble: 5 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1  1022    38          3    0.137     0.00320            0.242 rmse   
## 2  1839    30          9    0.0112    0.0000000169       0.165 rmse   
## 3   991    25          1    0.00202   0.0000692          0.937 rmse   
## 4  1581    21          5    0.00924   0.0000000485       0.459 rmse   
## 5   631    15         12    0.0263    0.187              0.749 rmse   
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>

# Plug the hyperparameters with the best RMSE into the workflow.
xgboost_fw <- xgboost_workflow %>%
  tune::finalize_workflow(tune::select_best(xgboost_tune, metric = "rmse"))

# Fit on the full training set and evaluate once on the held-out test set.
data_fit <- xgboost_fw %>%
  tune::last_fit(data_split)

# Test-set metrics.
# NOTE(review): an NA rsq usually means the predictions are constant —
# confirm against the prediction plot.
data_fit %>%
  tune::collect_metrics()

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard        2.50 Preprocessor1_Model1
## 2 rsq     standard       NA    Preprocessor1_Model1

# Predicted vs. observed log like counts from the final fit; points on the
# dashed identity line would be perfect predictions.
tune::collect_predictions(data_fit) %>%
  ggplot(aes(like_count, .pred)) +
  # Use `color`, not `fill`: the default point shape has no fill, so
  # `fill = "midnightblue"` had no visible effect.
  geom_point(alpha = 0.3, color = "midnightblue") +
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()

I experimented with a few things while trying to improve my workflow. First, I changed tfidf(title) to tfidf(description), but when I looked through the results I couldn’t see any noticeable change. Next, I reduced the max tokens from 100 to 50: the number of rows stayed the same at 168, but the number of columns dropped from 140 to 90, which makes sense because I reduced the max tokens by 50. Finally, I increased the grid from 5 to 10, and this had several effects. The estimate for the rsq metric, which had been 0.126, is now listed as NA, and the rmse metric changed from 2.36 to 2.5. The other numbers in the tuning results also changed from their original values; the mean column surprised me the most, since it previously ranged from 2.54 to 4.32 and now all of the mean values fall between 2.51 and 2.57. Additionally, the graph changed: before, the points clearly showed the trend of the values relative to the reference line, whereas now it is a flat line with no values on the y-axis and everything clumped together along the x-axis, which makes it extremely difficult to read.

Apply 1

Alden Dimick

2025-02-18

Import Data

Explore Data

Build Models