# Load the data with tidytuesdayR (either the date or the year/week form works)
tuesdata <- tidytuesdayR::tt_load('2021-03-02')
# tuesdata <- tidytuesdayR::tt_load(2021, week = 10)
youtube <- tuesdata$youtube

# Or read the csv file directly
youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- youtube %>%
  # Drop columns we won't model on
  select(-thumbnail, -ends_with("url"), -etag, -kind, -favorite_count,
         -published_at, -view_count, -comment_count, -description, -dislike_count) %>%
  # Remove rows with missing values
  na.omit() %>%
  # Log-transform the target variable
  mutate(like_count = log(like_count + 1)) %>%
  # category_id is a factor
  mutate(category_id = as.factor(category_id)) %>%
  # Convert logicals to factors
  mutate(across(where(is.logical), factor)) %>%
  # Convert selected characters to factors
  mutate(across(c(id, channel_title, brand), factor))
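As a quick sanity check on the log-transformed target (a minimal sketch; ggplot2 is assumed to already be loaded via the tidyverse):
# Distribution of log(like_count + 1) after the transformation above
data %>%
  ggplot(aes(like_count)) +
  geom_histogram(bins = 30)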
# Step 1: Prepare Data
library(lubridate)
library(correlationfunnel)   # binarize(), correlate(), plot_correlation_funnel()

data_binarized_tbl <- data %>%
  select(-id, -title) %>%
  binarize()
data_binarized_tbl %>% glimpse()
## Rows: 225
## Columns: 54
## $ `year__-Inf_2005` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.99573227355399` <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.99573227355399_4.87519732320115 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.87519732320115_6.26909628370626 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26909628370626_Inf <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ channel_title__BudBowlXLII <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `channel_title__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ channel_title__Funny_Commercials <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ channel_title__John_Keehler <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ channel_title__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__omon007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__reggiep08v2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__The_Hall_of_Advertising <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__USA_TODAY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title__World_Hyundai_Matteson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `channel_title__-OTHER` <dbl> 0, 1, 0, 1, 1, 0, 0, 1, …
## $ category_id__1 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
  correlate(like_count__6.26909628370626_Inf)

data_corr_tbl
## # A tibble: 54 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 6.26909628370626_Inf 1
## 2 like_count -Inf_2.99573227355399 -0.339
## 3 like_count 4.87519732320115_6.26909628370626 -0.331
## 4 like_count 2.99573227355399_4.87519732320115 -0.327
## 5 brand Doritos 0.281
## 6 channel_title NFL 0.262
## 7 brand NFL 0.250
## 8 brand Bud_Light -0.212
## 9 year 2015_Inf 0.202
## 10 channel_title Coca-Cola 0.202
## # ℹ 44 more rows
# Step 3: Plot
data_corr_tbl %>%
  plot_correlation_funnel()
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
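The warning just means ggrepel dropped some feature labels to avoid overplotting. If the funnel is the default static ggplot and the package relies on ggrepel's default overlap limit, raising that limit globally is one way to label every point (an optional tweak, not part of the original workflow):
# Let ggrepel attempt to place every label, at the cost of a busier plot
options(ggrepel.max.overlaps = Inf)
data_corr_tbl %>%
  plot_correlation_funnel()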
Split Data
# Split into training and test sets
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)

# Resample the training set for 10-fold cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [151/17]> Fold01
## 2 <split [151/17]> Fold02
## 3 <split [151/17]> Fold03
## 4 <split [151/17]> Fold04
## 5 <split [151/17]> Fold05
## 6 <split [151/17]> Fold06
## 7 <split [151/17]> Fold07
## 8 <split [151/17]> Fold08
## 9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
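Ten folds times ten candidate configurations means a fair number of model fits during tuning below. Registering a parallel backend first can speed this up (an optional sketch, assuming the doParallel package is installed; newer tune versions prefer the future framework instead):
# Optional: let tuning use multiple cores
library(doParallel)
registerDoParallel(cores = parallel::detectCores() - 1)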
usemodels::use_xgboost(like_count ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = like_count ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(18995)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(textrecipes)   # step_tokenize(), step_tokenfilter(), step_tf()

xgboost_recipe <-
  recipe(formula = like_count ~ ., data = data_train) %>%
  # Keep the video id around without treating it as a predictor
  recipes::update_role(id, new_role = "id variable") %>%
  # Turn the ad title into term-frequency features for the 50 most common tokens
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 50) %>%
  step_tf(title) %>%
  # Lump rare channels into an "other" level
  step_other(channel_title) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_zv(all_predictors())

xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 168
## Columns: 92
## $ year <dbl> 2013, 2015, 2008, 2010, 2009, 2007, 2010, …
## $ id <fct> WTf0XGpINJI, 7_EfXuGev24, 2_LWZe2BGaE, 6cM…
## $ like_count <dbl> 3.1354942, 0.0000000, 3.6109179, 6.0038871…
## $ tf_title_2001 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2005 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2010 <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2012 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2013 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ tf_title_2014 <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2015 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2018 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ tf_title_2019 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_2020 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_a <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_ad <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, …
## $ tf_title_big <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ tf_title_bowl <int> 1, 0, 1, 1, 1, 0, 2, 1, 0, 0, 0, 1, 0, 0, …
## $ tf_title_bud <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_budweiser <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ tf_title_cedric <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_coca <int> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ tf_title_coke <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ tf_title_cola <int> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ tf_title_commercial <int> 1, 1, 1, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0, …
## $ tf_title_crash <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_diet <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_dog <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_doritos <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_e <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_etrade <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_extended <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, …
## $ tf_title_funny <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_game <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ tf_title_hd <int> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, …
## $ tf_title_hyundai <int> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ tf_title_kia <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ tf_title_light <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_new <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_nfl <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ tf_title_official <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ tf_title_pepsi <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_spot <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_super <int> 1, 0, 1, 1, 1, 0, 2, 1, 0, 0, 0, 1, 0, 0, …
## $ tf_title_superbowl <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_the <int> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ tf_title_toyota <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_trade <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_tv <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ tf_title_usa <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_version <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_vs <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ tf_title_winner <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tf_title_with <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ brand_Bud.Light <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Budweiser <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ brand_Coca.Cola <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ brand_Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_E.Trade <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ brand_Hynudai <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ brand_Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ brand_NFL <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand_Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Toyota <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ funny_FALSE. <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, …
## $ funny_TRUE. <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ show_product_quickly_FALSE. <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ show_product_quickly_TRUE. <dbl> 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ patriotic_FALSE. <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ patriotic_TRUE. <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ celebrity_FALSE. <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, …
## $ celebrity_TRUE. <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ danger_FALSE. <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, …
## $ danger_TRUE. <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, …
## $ animals_FALSE. <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, …
## $ animals_TRUE. <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, …
## $ use_sex_FALSE. <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex_TRUE. <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ channel_title_omon007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ channel_title_other <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ category_id_X1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id_X10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X17 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id_X19 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X22 <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ category_id_X23 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X24 <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ category_id_X25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X26 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X27 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X29 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Tunable xgboost specification (regression, unlike the classification template above)
xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

xgboost_workflow <-
  workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_spec)

# Tune over the cross-validation folds with 10 candidate parameter sets
set.seed(30220)
xgboost_tune <-
  tune_grid(xgboost_workflow, resamples = data_cv, grid = 10)
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
## standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations   A: x10
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 445 2 4 0.00681 1.90e- 9 0.5 rmse
## 2 889 18 1 0.0245 3.16e+ 1 0.8 rmse
## 3 223 35 8 0.0880 8.80e- 2 0.2 rmse
## 4 1111 23 15 0.0129 1 e-10 0.3 rmse
## 5 2000 10 5 0.0464 2.45e- 4 0.1 rmse
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
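Before finalizing, it can help to see how each hyperparameter relates to RMSE across the grid; tune provides an autoplot method for this (a quick sketch):
autoplot(xgboost_tune, metric = "rmse")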
# Finalize the workflow with the best hyperparameters (by RMSE)
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
                                      tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the final model on the full training set and evaluate it on the test set
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 2.17 Preprocessor1_Model1
## 2 rsq standard 0.257 Preprocessor1_Model1
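The test-set RMSE is on the log(like_count + 1) scale and the R-squared is modest, so it is worth checking which features the model actually leans on. One option (a sketch, assuming the vip package is installed and that extract_fit_parsnip() is available for last_fit() results in your tune version) is:
# Variable importance for the final xgboost fit
library(vip)
data_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 15)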
# Predicted vs. observed log like counts on the test set
tune::collect_predictions(data_fit) %>%
  ggplot(aes(like_count, .pred)) +
  geom_point(alpha = 0.3, color = "gold") +
  geom_abline(lty = 2, color = "red") +
  coord_fixed()