# Download the TidyTuesday 2021-03-02 YouTube data set as a tibble.
youtube <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv"
)

# Print a per-column summary (missingness and type-specific statistics).
skimr::skim(youtube)
Name | youtube |
Number of rows | 247 |
Number of columns | 25 |
_______________________ | |
Column type frequency: | |
character | 10 |
logical | 7 |
numeric | 7 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
brand | 0 | 1.00 | 3 | 9 | 0 | 10 | 0 |
superbowl_ads_dot_com_url | 0 | 1.00 | 34 | 120 | 0 | 244 | 0 |
youtube_url | 11 | 0.96 | 43 | 43 | 0 | 233 | 0 |
id | 11 | 0.96 | 11 | 11 | 0 | 233 | 0 |
kind | 16 | 0.94 | 13 | 13 | 0 | 1 | 0 |
etag | 16 | 0.94 | 27 | 27 | 0 | 228 | 0 |
title | 16 | 0.94 | 6 | 99 | 0 | 228 | 0 |
description | 50 | 0.80 | 3 | 3527 | 0 | 194 | 0 |
thumbnail | 129 | 0.48 | 48 | 48 | 0 | 118 | 0 |
channel_title | 16 | 0.94 | 3 | 37 | 0 | 185 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
funny | 0 | 1 | 0.69 | TRU: 171, FAL: 76 |
show_product_quickly | 0 | 1 | 0.68 | TRU: 169, FAL: 78 |
patriotic | 0 | 1 | 0.17 | FAL: 206, TRU: 41 |
celebrity | 0 | 1 | 0.29 | FAL: 176, TRU: 71 |
danger | 0 | 1 | 0.30 | FAL: 172, TRU: 75 |
animals | 0 | 1 | 0.37 | FAL: 155, TRU: 92 |
use_sex | 0 | 1 | 0.27 | FAL: 181, TRU: 66 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1.00 | 2010.19 | 5.86 | 2000 | 2005 | 2010 | 2015.00 | 2020 | ▇▇▇▇▆ |
view_count | 16 | 0.94 | 1407556.46 | 11971111.01 | 10 | 6431 | 41379 | 170015.50 | 176373378 | ▇▁▁▁▁ |
like_count | 22 | 0.91 | 4146.03 | 23920.40 | 0 | 19 | 130 | 527.00 | 275362 | ▇▁▁▁▁ |
dislike_count | 22 | 0.91 | 833.54 | 6948.52 | 0 | 1 | 7 | 24.00 | 92990 | ▇▁▁▁▁ |
favorite_count | 16 | 0.94 | 0.00 | 0.00 | 0 | 0 | 0 | 0.00 | 0 | ▁▁▇▁▁ |
comment_count | 25 | 0.90 | 188.64 | 986.46 | 0 | 1 | 10 | 50.75 | 9190 | ▇▁▁▁▁ |
category_id | 16 | 0.94 | 19.32 | 8.00 | 1 | 17 | 23 | 24.00 | 29 | ▃▁▂▆▇ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
published_at | 16 | 0.94 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2013-01-31 09:13:55 | 227 |
# Build the modelling data set from the raw import.
data <- youtube %>%
  # Drop URL/metadata columns and every engagement count except like_count
  select(
    -c(
      superbowl_ads_dot_com_url, youtube_url, kind, etag, channel_title,
      comment_count, thumbnail, published_at, description,
      dislike_count, favorite_count, view_count
    )
  ) %>%
  # Treat missing values: keep only rows that are complete in what remains
  na.omit() %>%
  mutate(
    # +1 keeps ads with zero likes finite: log(0 + 1) = 0
    like_count = log(like_count + 1),
    category_id = as.factor(category_id)
  ) %>%
  # Character and logical columns are disjoint, so convert both in one pass
  mutate(across(where(is.character) | where(is.logical), as.factor)) %>%
  # title must remain text because it is tokenized later
  mutate(title = as.character(title))
Identify good predictors.
Funny
# Scatter of like_count (x) against the funny flag (y).
ggplot(data, aes(x = like_count, y = funny)) +
  geom_point()
Brand
# Mean like_count for every word that appears in an ad title.
data %>%
  # Tokenize title: one row per (ad, word)
  unnest_tokens(output = word, input = title) %>%
  # Average the (log-transformed) like count per word
  group_by(word) %>%
  summarise(
    like_count = mean(like_count),
    # Number of rows the word appears in; kept for inspection, not plotted
    n = n()
  ) %>%
  ungroup() %>%
  # Plot words ordered by their mean like count
  ggplot(aes(like_count, fct_reorder(word, like_count))) +
  geom_point() +
  # The y axis shows title words, not brands
  labs(x = "Mean log(like_count + 1)", y = "Title words")
EDA Shortcut
# Step 1: Prepare Data
# Drop the identifier and free-text columns, then binarize what is left.
data_binarized_tbl <- binarize(select(data, -id, -title))

glimpse(data_binarized_tbl)
## Rows: 225
## Columns: 43
## $ `year__-Inf_2005` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.99573227355399` <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.99573227355399_4.87519732320115 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.87519732320115_6.26909628370626 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26909628370626_Inf <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id__1 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
# Correlate every binary feature with the highest like_count bin.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  like_count__6.26909628370626_Inf
)

data_corr_tbl
## # A tibble: 43 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 6.26909628370626_Inf 1
## 2 like_count -Inf_2.99573227355399 -0.339
## 3 like_count 4.87519732320115_6.26909628370626 -0.331
## 4 like_count 2.99573227355399_4.87519732320115 -0.327
## 5 brand Doritos 0.281
## 6 brand NFL 0.250
## 7 brand Bud_Light -0.212
## 8 year 2015_Inf 0.202
## 9 year -Inf_2005 -0.193
## 10 category_id 1 0.133
## # ℹ 33 more rows
# Step 3: Plot
plot_correlation_funnel(data_corr_tbl)
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Split Data
# Hold out a test set; 3/4 of the rows go to training (the rsample default).
set.seed(1234)
data_split <- rsample::initial_split(data, prop = 3 / 4)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

# Resample the training set into 10 cross-validation folds (the default v).
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train, v = 10)

data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [151/17]> Fold01
## 2 <split [151/17]> Fold02
## 3 <split [151/17]> Fold03
## 4 <split [151/17]> Fold04
## 5 <split [151/17]> Fold05
## 6 <split [151/17]> Fold06
## 7 <split [151/17]> Fold07
## 8 <split [151/17]> Fold08
## 9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
library(usemodels)

# Print template xgboost recipe/spec/workflow code for this formula and data.
use_xgboost(formula = like_count ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = like_count ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(18995)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify Recipe
xgboost_recipe <- recipe(formula = like_count ~ ., data = data_train) %>%
  # Keep id in the data but take it out of the predictor set
  recipes::update_role(id, new_role = "id variables") %>%
  # Turn title into tf-idf features, keeping at most 50 tokens
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 50) %>%
  step_tfidf(title) %>%
  # One-hot encode the factor predictors, then drop zero-variance columns
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_zv(all_predictors())

# Inspect the preprocessed training data. bake(new_data = NULL) is the
# current replacement for the superseded juice().
xgboost_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 168
## Columns: 90
## $ year <dbl> 2013, 2015, 2008, 2010, 2009, 2007, 2010, …
## $ id <fct> WTf0XGpINJI, 7_EfXuGev24, 2_LWZe2BGaE, 6cM…
## $ like_count <dbl> 3.1354942, 0.0000000, 3.6109179, 6.0038871…
## $ tfidf_title_2001 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2005 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2010 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2013 <dbl> 0.6437752, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2014 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2018 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_2020 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_a <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_ad <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_big <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_bowl <dbl> 0.2488648, 0.0000000, 0.2488648, 0.3110810…
## $ tfidf_title_bud <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_budweiser <dbl> 0.0000000, 0.6812715, 0.0000000, 0.0000000…
## $ tfidf_title_cedric <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_coca <dbl> 0.0000000, 0.0000000, 0.5129899, 0.0000000…
## $ tfidf_title_coke <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_cola <dbl> 0.0000000, 0.0000000, 0.5002872, 0.0000000…
## $ tfidf_title_commercial <dbl> 0.2135681, 0.3559469, 0.2135681, 0.0000000…
## $ tfidf_title_crash <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_diet <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_dog <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_doritos <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_e <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.0000…
## $ tfidf_title_etrade <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_extended <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_funny <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_game <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_title_hd <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_hyundai <dbl> 0.5002872, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_kia <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_light <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_new <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_nfl <dbl> 0.000000, 0.000000, 0.000000, 0.841824, 0.…
## $ tfidf_title_official <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_pepsi <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_spot <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_super <dbl> 0.2488648, 0.0000000, 0.2488648, 0.3110810…
## $ tfidf_title_superbowl <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_the <dbl> 0.0000000, 0.6931472, 0.0000000, 0.5198604…
## $ tfidf_title_toyota <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_trade <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.0000…
## $ tfidf_title_tv <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ tfidf_title_usa <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.…
## $ tfidf_title_version <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_vs <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.…
## $ tfidf_title_winner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_title_with <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000…
## $ brand_Bud.Light <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Budweiser <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ brand_Coca.Cola <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, …
## $ brand_Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_E.Trade <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ brand_Hynudai <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ brand_Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ brand_NFL <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand_Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Toyota <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ funny_FALSE. <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, …
## $ funny_TRUE. <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ show_product_quickly_FALSE. <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, …
## $ show_product_quickly_TRUE. <dbl> 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ patriotic_FALSE. <dbl> 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, …
## $ patriotic_TRUE. <dbl> 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ celebrity_FALSE. <dbl> 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, …
## $ celebrity_TRUE. <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ danger_FALSE. <dbl> 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, …
## $ danger_TRUE. <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, …
## $ animals_FALSE. <dbl> 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, …
## $ animals_TRUE. <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, …
## $ use_sex_FALSE. <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex_TRUE. <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ category_id_X1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id_X10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X17 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id_X19 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X22 <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
## $ category_id_X23 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X24 <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, …
## $ category_id_X25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X26 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X27 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X29 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Specify Model
# Boosted-tree regression on the xgboost engine; all six hyperparameters are
# left as tune() placeholders.
xgboost_spec <- boost_tree(
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune()
) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

# Combine recipe and model using workflow
xgboost_workflow <- workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_spec)
# Tune hyperparameters
# Evaluate a grid of 10 candidates on the cross-validation folds; the seed is
# set immediately before tuning.
set.seed(89984)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 10)

# Best candidates ranked by RMSE
xgboost_tune %>%
  tune::show_best(metric = "rmse")
## # A tibble: 5 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 1022 38 3 0.137 0.00320 0.242 rmse
## 2 1839 30 9 0.0112 0.0000000169 0.165 rmse
## 3 991 25 1 0.00202 0.0000692 0.937 rmse
## 4 1581 21 5 0.00924 0.0000000485 0.459 rmse
## 5 631 15 12 0.0263 0.187 0.749 rmse
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
# Plug the hyperparameters with the lowest RMSE into the workflow.
xgboost_fw <- tune::finalize_workflow(
  x = xgboost_workflow,
  parameters = tune::select_best(xgboost_tune, metric = "rmse")
)

# Fit on the full training set and evaluate once on the held-out test set.
data_fit <- tune::last_fit(xgboost_fw, split = data_split)

tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 2.50 Preprocessor1_Model1
## 2 rsq standard NA Preprocessor1_Model1
# Plot the test-set predictions against the observed like_count values.
tune::collect_predictions(data_fit) %>%
  ggplot(aes(like_count, .pred)) +
  # color, not fill: the default point shape has no fill, so fill was ignored
  geom_point(alpha = 0.3, color = "midnightblue") +
  # Dashed identity line (slope 1, intercept 0) marks perfect predictions
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()
There were a few different things that I played with when trying to improve my workflow. First, I changed tfidf(title) to tfidf(description), and when looking through the results I couldn’t really notice anything that had changed. Next, I reduced the max tokens from 100 to 50, and I noticed that the number of rows stayed the same at 168 but the number of columns changed from 140 to 90, which makes sense as I reduced the max tokens by 50. The next thing I did was increase the grid from 5 to 10, which ended up doing a few things. First, I was no longer able to see the estimate for the rsq metric: before it was 0.126 and now it is listed as NA. The rmse metric also changed, from 2.36 to 2.5. Along with this, the numbers all ended up changing from their original values, with the mean column being the most surprising to me: before it varied from 2.54 to 4.32, and now all of the mean values fall between 2.51 and 2.57. Additionally, the graph changed: whereas before it clearly showed the trend of the values relative to the line, now it’s a flat line with no values on the y-axis and everything clumped together on the x-axis, making it extremely difficult to read.