Goal: to predict the Youtube like count Click here for the data
.
youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(youtube)
| Name | youtube |
| Number of rows | 247 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| logical | 7 |
| numeric | 7 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| brand | 0 | 1.00 | 3 | 9 | 0 | 10 | 0 |
| superbowl_ads_dot_com_url | 0 | 1.00 | 34 | 120 | 0 | 244 | 0 |
| youtube_url | 11 | 0.96 | 43 | 43 | 0 | 233 | 0 |
| id | 11 | 0.96 | 11 | 11 | 0 | 233 | 0 |
| kind | 16 | 0.94 | 13 | 13 | 0 | 1 | 0 |
| etag | 16 | 0.94 | 27 | 27 | 0 | 228 | 0 |
| title | 16 | 0.94 | 6 | 99 | 0 | 228 | 0 |
| description | 50 | 0.80 | 3 | 3527 | 0 | 194 | 0 |
| thumbnail | 129 | 0.48 | 48 | 48 | 0 | 118 | 0 |
| channel_title | 16 | 0.94 | 3 | 37 | 0 | 185 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| funny | 0 | 1 | 0.69 | TRU: 171, FAL: 76 |
| show_product_quickly | 0 | 1 | 0.68 | TRU: 169, FAL: 78 |
| patriotic | 0 | 1 | 0.17 | FAL: 206, TRU: 41 |
| celebrity | 0 | 1 | 0.29 | FAL: 176, TRU: 71 |
| danger | 0 | 1 | 0.30 | FAL: 172, TRU: 75 |
| animals | 0 | 1 | 0.37 | FAL: 155, TRU: 92 |
| use_sex | 0 | 1 | 0.27 | FAL: 181, TRU: 66 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2010.19 | 5.86 | 2000 | 2005 | 2010 | 2015.00 | 2020 | ▇▇▇▇▆ |
| view_count | 16 | 0.94 | 1407556.46 | 11971111.01 | 10 | 6431 | 41379 | 170015.50 | 176373378 | ▇▁▁▁▁ |
| like_count | 22 | 0.91 | 4146.03 | 23920.40 | 0 | 19 | 130 | 527.00 | 275362 | ▇▁▁▁▁ |
| dislike_count | 22 | 0.91 | 833.54 | 6948.52 | 0 | 1 | 7 | 24.00 | 92990 | ▇▁▁▁▁ |
| favorite_count | 16 | 0.94 | 0.00 | 0.00 | 0 | 0 | 0 | 0.00 | 0 | ▁▁▇▁▁ |
| comment_count | 25 | 0.90 | 188.64 | 986.46 | 0 | 1 | 10 | 50.75 | 9190 | ▇▁▁▁▁ |
| category_id | 16 | 0.94 | 19.32 | 8.00 | 1 | 17 | 23 | 24.00 | 29 | ▃▁▂▆▇ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| published_at | 16 | 0.94 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2013-01-31 09:13:55 | 227 |
# data <- youtube %>%
#
#
# # Treat missing values
# select(-thumbnail, -description, -favorite_count, -comment_count, -published_at, -category_id, -superbowl_ads_dot_com_url, -youtube_url, -id ,-etag, -show_product_quickly, -patriotic, -celebrity, -danger, -animals, -use_sex, - channel_title, -kind) %>%
# mutate(across(is.logical, as.factor)) %>%
# na.omit() %>%
#
# # log transform variables with pos-skewed distribution
# mutate(like_count = log(like_count))
# Can't log transform like_count as it has non-positive values; Instead, you can address it in recipes step
data <- youtube %>%
# Treat missing values
select(-thumbnail, -description, -favorite_count, -comment_count, -published_at, -category_id, -superbowl_ads_dot_com_url, -youtube_url, -id ,-etag, -show_product_quickly, -patriotic, -celebrity, -danger, -animals, -use_sex, - channel_title, -kind,
- view_count, -dislike_count) %>% # Remove these counts as they are not true predictors
mutate(across(is.logical, as.factor)) %>%
na.omit()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(is.logical, as.factor)`.
## Caused by warning:
## ! Use of bare predicate functions was deprecated in tidyselect 1.1.0.
## ℹ Please use wrap predicates in `where()` instead.
## # Was:
## data %>% select(is.logical)
##
## # Now:
## data %>% select(where(is.logical))
Identify good predictors
like_count
data %>%
ggplot(aes(like_count, view_count)) +
scale_y_log10() +
geom_point()
data %>%
ggplot(aes(like_count, as.factor(brand))) +
geom_boxplot()
title
data %>%
# tokenism title
unnest_tokens(output = word, input = brand) %>%
# calculate avg rent per word
group_by(word) %>%
summarise(like_count = mean(like_count),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(word, "\\d")) %>%
slice_max(order_by = like_count, n = 20) %>%
# plot
ggplot(aes(like_count, fct_reorder(word, like_count))) +
geom_point() +
labs(y = "word in Title")
# step 1: prepare data
data_binarized_tbl <- data %>%
# select(-dislike_count, -title) %>%
select(-title) %>% # count variables were removed in the cleaning step
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 225
## Columns: 20
## $ `year__-Inf_2005` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ year__2005_2010 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ year__2010_2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf <dbl> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, …
## $ brand__Bud_Light <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Budweiser <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ `brand__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ brand__Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ brand__Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ funny__FALSE <dbl> 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, …
## $ funny__TRUE <dbl> 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, …
## $ `like_count__-Inf_19` <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, …
## $ like_count__19_130 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, …
## $ like_count__130_527 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ like_count__527_Inf <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, …
# step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
# correlate(like_count__6.26720054854136_Inf ) %>%
correlate(like_count__527_Inf)
data_corr_tbl
## # A tibble: 20 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 527_Inf 1
## 2 like_count -Inf_19 -0.339
## 3 like_count 130_527 -0.331
## 4 like_count 19_130 -0.327
## 5 brand Doritos 0.281
## 6 brand NFL 0.250
## 7 brand Bud_Light -0.212
## 8 year 2015_Inf 0.202
## 9 year -Inf_2005 -0.193
## 10 brand Kia -0.0909
## 11 brand Hynudai -0.0857
## 12 brand Toyota 0.0754
## 13 brand E-Trade -0.0451
## 14 funny FALSE 0.0184
## 15 funny TRUE -0.0184
## 16 year 2010_2015 0.0133
## 17 year 2005_2010 -0.00992
## 18 brand Budweiser -0.00579
## 19 brand Pepsi 0.000888
## 20 brand Coca-Cola 0.000803
# step 3:
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
## Please report the issue at
## <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
## Please report the issue at
## <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
split data
# No need to make the data small as yours is already small.
# data <- sample_n(data, 100)
# Split into train and test dataset
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training data set for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [151/17]> Fold01
## 2 <split [151/17]> Fold02
## 3 <split [151/17]> Fold03
## 4 <split [151/17]> Fold04
## 5 <split [151/17]> Fold05
## 6 <split [151/17]> Fold06
## 7 <split [151/17]> Fold07
## 8 <split [151/17]> Fold08
## 9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
library(usemodels)
usemodels::use_xgboost(like_count ~ ., data_train)
## xgboost_recipe <-
## recipe(formula = like_count ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(18995)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify recipe
# xgboost_recipe <-
# recipe(like_count ~ ., data = data_train) %>%
# # recipes::update_role(like_count, new_role = "id variable") %>%
# step_tokenize(title) %>%
# step_tokenfilter(title, max_tokens = 100) %>%
# step_tfidf(title) %>%
# step_other(brand, threshold = 0.01) %>%
# step_dummy(brand, funny, one_hot = TRUE) %>%
# step_YeoJohnson(view_count,like_count)
xgboost_recipe <-
recipe(like_count ~ ., data = data_train) %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tfidf(title) %>%
step_other(brand, threshold = 0.01) %>%
step_dummy(brand, funny, one_hot = TRUE) %>%
# 1. Remove variables with zero variance (constants)
step_zv(all_predictors()) %>%
# 2. Normalize numeric predictors (now safe from zero-variance issues)
step_normalize(all_numeric_predictors()) %>%
# 3. Transform the outcome
step_YeoJohnson(like_count)
xgboost_recipe %>% prep() %>% juice() %>% glimpse()
## Rows: 168
## Columns: 114
## $ year <dbl> 0.5154043, 0.8590071, -0.3436029, 0.0000000,…
## $ like_count <dbl> 2.9685828, 0.0000000, 3.3907606, 5.4116392, …
## $ tfidf_title_2000 <dbl> -0.1087598, -0.1087598, -0.1087598, -0.10875…
## $ tfidf_title_2001 <dbl> -0.1514581, -0.1514581, -0.1514581, -0.15145…
## $ tfidf_title_2002 <dbl> -0.1278554, -0.1278554, -0.1278554, -0.12785…
## $ tfidf_title_2005 <dbl> -0.1462442, -0.1462442, -0.1462442, -0.14624…
## $ tfidf_title_2007 <dbl> -0.1089823, -0.1089823, -0.1089823, -0.10898…
## $ tfidf_title_2009 <dbl> -0.1339065, -0.1339065, -0.1339065, -0.13390…
## $ tfidf_title_2010 <dbl> -0.1371154, -0.1371154, -0.1371154, -0.13711…
## $ tfidf_title_2011 <dbl> -0.1015255, -0.1015255, -0.1015255, -0.10152…
## $ tfidf_title_2012 <dbl> -0.2180951, -0.2180951, -0.2180951, -0.21809…
## $ tfidf_title_2013 <dbl> 5.1647302, -0.2037133, -0.2037133, -0.203713…
## $ tfidf_title_2014 <dbl> -0.2438312, -0.2438312, -0.2438312, -0.24383…
## $ tfidf_title_2015 <dbl> -0.1516157, -0.1516157, -0.1516157, -0.15161…
## $ tfidf_title_2016 <dbl> -0.1323708, -0.1323708, -0.1323708, -0.13237…
## $ tfidf_title_2018 <dbl> -0.1948139, -0.1948139, -0.1948139, -0.19481…
## $ tfidf_title_2019 <dbl> -0.1645455, -0.1645455, -0.1645455, -0.16454…
## $ tfidf_title_2020 <dbl> -0.1700777, -0.1700777, -0.1700777, -0.17007…
## $ tfidf_title_44 <dbl> -0.1093612, -0.1093612, -0.1093612, -0.10936…
## $ tfidf_title_a <dbl> -0.2060617, -0.2060617, -0.2060617, -0.20606…
## $ tfidf_title_ad <dbl> -0.3647755, -0.3647755, -0.3647755, -0.36477…
## $ tfidf_title_ads <dbl> -0.1339669, -0.1339669, -0.1339669, -0.13396…
## $ tfidf_title_advertisement <dbl> -0.107287, -0.107287, -0.107287, -0.107287, …
## $ tfidf_title_all <dbl> -0.1083241, -0.1083241, -0.1083241, -0.10832…
## $ tfidf_title_and <dbl> -0.1321017, -0.1321017, -0.1321017, -0.13210…
## $ tfidf_title_baby <dbl> -0.1198909, -0.1198909, -0.1198909, -0.11989…
## $ tfidf_title_bears <dbl> -0.1037587, -0.1037587, -0.1037587, -0.10375…
## $ tfidf_title_beer <dbl> -0.107287, -0.107287, -0.107287, -0.107287, …
## $ tfidf_title_best <dbl> -0.1094371, -0.1094371, -0.1094371, 9.083278…
## $ tfidf_title_bestbuds <dbl> -0.1049399, -0.1049399, -0.1049399, -0.10493…
## $ tfidf_title_big <dbl> -0.1717156, -0.1717156, -0.1717156, -0.17171…
## $ tfidf_title_bowl <dbl> 1.3995365, -0.7506182, 1.3995365, 0.7852066,…
## $ tfidf_title_britney <dbl> -0.1079305, -0.1079305, -0.1079305, -0.10793…
## $ tfidf_title_bud <dbl> -0.4864483, -0.4864483, -0.4864483, -0.48644…
## $ tfidf_title_budweiser <dbl> -0.3459334, 1.3019205, -0.3459334, -0.345933…
## $ tfidf_title_camry <dbl> -0.1094371, -0.1094371, -0.1094371, -0.10943…
## $ tfidf_title_car <dbl> -0.1089823, -0.1089823, -0.1089823, -0.10898…
## $ tfidf_title_cedric <dbl> -0.1688008, -0.1688008, -0.1688008, -0.16880…
## $ tfidf_title_cindy <dbl> -0.1091109, -0.1091109, -0.1091109, -0.10911…
## $ tfidf_title_coca <dbl> -0.2708385, -0.2708385, 2.6794296, -0.270838…
## $ tfidf_title_coke <dbl> -0.1959223, -0.1959223, -0.1959223, -0.19592…
## $ tfidf_title_cola <dbl> -0.2824949, -0.2824949, 2.6073696, -0.282494…
## $ tfidf_title_commercial <dbl> 0.56011212, 1.49839080, 0.56011212, -0.84730…
## $ tfidf_title_commercials <dbl> -0.107287, -0.107287, -0.107287, -0.107287, …
## $ tfidf_title_cool <dbl> -0.1341561, -0.1341561, -0.1341561, -0.13415…
## $ tfidf_title_crash <dbl> -0.1550743, -0.1550743, -0.1550743, -0.15507…
## $ tfidf_title_date <dbl> -0.107287, -0.107287, -0.107287, -0.107287, …
## $ tfidf_title_diet <dbl> -0.1459409, -0.1459409, -0.1459409, -0.14594…
## $ tfidf_title_dilly <dbl> -0.07715167, -0.07715167, -0.07715167, -0.07…
## $ tfidf_title_dog <dbl> -0.1941356, -0.1941356, -0.1941356, -0.19413…
## $ tfidf_title_dogs <dbl> -0.09216796, -0.09216796, -0.09216796, -0.09…
## $ tfidf_title_doritos <dbl> -0.2827732, -0.2827732, -0.2827732, -0.28277…
## $ tfidf_title_e <dbl> -0.1511986, -0.1511986, -0.1511986, -0.15119…
## $ tfidf_title_elantra <dbl> -0.1083241, -0.1083241, -0.1083241, -0.10832…
## $ tfidf_title_etrade <dbl> -0.1409018, -0.1409018, -0.1409018, -0.14090…
## $ tfidf_title_extended <dbl> -0.1674721, -0.1674721, -0.1674721, -0.16747…
## $ tfidf_title_factory <dbl> -0.1091109, -0.1091109, -0.1091109, -0.10911…
## $ tfidf_title_fantasy <dbl> -0.1344381, -0.1344381, -0.1344381, -0.13443…
## $ tfidf_title_featuring <dbl> -0.1037587, -0.1037587, -0.1037587, -0.10375…
## $ tfidf_title_flavor <dbl> -0.09216796, -0.09216796, -0.09216796, -0.09…
## $ tfidf_title_fly <dbl> -0.1091918, -0.1091918, -0.1091918, -0.10919…
## $ tfidf_title_full <dbl> -0.1087598, -0.1087598, -0.1087598, -0.10875…
## $ tfidf_title_funny <dbl> -0.1473298, -0.1473298, -0.1473298, -0.14732…
## $ tfidf_title_game <dbl> -0.1870394, -0.1870394, -0.1870394, -0.18703…
## $ tfidf_title_genesis <dbl> -0.1037587, -0.1037587, -0.1037587, -0.10375…
## $ tfidf_title_girlfriend <dbl> -0.1083241, -0.1083241, -0.1083241, -0.10832…
## $ tfidf_title_great <dbl> -0.09776579, -0.09776579, -0.09776579, -0.09…
## $ tfidf_title_happiness <dbl> -0.1313108, -0.1313108, -0.1313108, -0.13131…
## $ tfidf_title_hd <dbl> -0.1945391, -0.1945391, -0.1945391, -0.19453…
## $ tfidf_title_horse <dbl> -0.1079305, -0.1079305, -0.1079305, -0.10793…
## $ tfidf_title_hyundai <dbl> 1.7447629, -0.2371829, -0.2371829, -0.237182…
## $ tfidf_title_in <dbl> -0.1004947, -0.1004947, -0.1004947, -0.10049…
## $ tfidf_title_is <dbl> -0.1317287, -0.1317287, -0.1317287, -0.13172…
## $ tfidf_title_island <dbl> -0.1344381, -0.1344381, -0.1344381, -0.13443…
## $ tfidf_title_it <dbl> -0.1015255, -0.1015255, -0.1015255, -0.10152…
## $ tfidf_title_kia <dbl> -0.190631, -0.190631, -0.190631, -0.190631, …
## $ tfidf_title_light <dbl> -0.4673781, -0.4673781, -0.4673781, -0.46737…
## $ tfidf_title_lighta <dbl> -0.1344381, -0.1344381, -0.1344381, -0.13443…
## $ tfidf_title_love <dbl> -0.1120992, -0.1120992, -0.1120992, -0.11209…
## $ tfidf_title_new <dbl> -0.1825341, -0.1825341, -0.1825341, -0.18253…
## $ tfidf_title_nfl <dbl> -0.1499735, -0.1499735, -0.1499735, 1.104576…
## $ tfidf_title_of <dbl> -0.1120992, -0.1120992, -0.1120992, -0.11209…
## $ tfidf_title_official <dbl> -0.1690023, -0.1690023, -0.1690023, -0.16900…
## $ tfidf_title_on <dbl> -0.1340605, -0.1340605, -0.1340605, 6.978201…
## $ tfidf_title_one <dbl> -0.1191012, -0.1191012, -0.1191012, -0.11910…
## $ tfidf_title_pepsi <dbl> -0.301344, -0.301344, -0.301344, -0.301344, …
## $ tfidf_title_puppy <dbl> -0.1037587, -0.1037587, -0.1037587, -0.10375…
## $ tfidf_title_spot <dbl> -0.1302495, -0.1302495, -0.1302495, -0.13024…
## $ tfidf_title_starring <dbl> -0.1291036, -0.1291036, -0.1291036, -0.12910…
## $ tfidf_title_super <dbl> 1.3995365, -0.7506182, 1.3995365, 0.7852066,…
## $ tfidf_title_superbowl <dbl> -0.2295741, -0.2295741, -0.2295741, -0.22957…
## $ tfidf_title_the <dbl> -0.348641, 3.706434, -0.348641, 1.389248, -0…
## $ tfidf_title_toyota <dbl> -0.1701628, -0.1701628, -0.1701628, -0.17016…
## $ tfidf_title_trade <dbl> -0.1511986, -0.1511986, -0.1511986, -0.15119…
## $ tfidf_title_tv <dbl> -0.1950976, -0.1950976, -0.1950976, -0.19509…
## $ tfidf_title_usa <dbl> -0.148928, -0.148928, -0.148928, -0.148928, …
## $ tfidf_title_version <dbl> -0.1542293, -0.1542293, -0.1542293, -0.15422…
## $ tfidf_title_vs <dbl> -0.1536271, -0.1536271, -0.1536271, -0.15362…
## $ tfidf_title_winner <dbl> -0.1550743, -0.1550743, -0.1550743, -0.15507…
## $ tfidf_title_with <dbl> -0.1502229, -0.1502229, -0.1502229, -0.15022…
## $ tfidf_title_x <dbl> -0.1228548, -0.1228548, -0.1228548, -0.12285…
## $ tfidf_title_xliv <dbl> -0.1273842, -0.1273842, -0.1273842, 10.37099…
## $ brand_Bud.Light <dbl> -0.5664918, -0.5664918, -0.5664918, -0.56649…
## $ brand_Budweiser <dbl> -0.4070315, 2.4421887, -0.4070315, -0.407031…
## $ brand_Coca.Cola <dbl> -0.3345335, -0.3345335, 2.9714444, -0.334533…
## $ brand_Doritos <dbl> -0.3768379, -0.3768379, -0.3768379, -0.37683…
## $ brand_E.Trade <dbl> -0.2229403, -0.2229403, -0.2229403, -0.22294…
## $ brand_Hynudai <dbl> 3.1842245, -0.3121789, -0.3121789, -0.312178…
## $ brand_Kia <dbl> -0.2229403, -0.2229403, -0.2229403, -0.22294…
## $ brand_NFL <dbl> -0.2078929, -0.2078929, -0.2078929, 4.781536…
## $ brand_Pepsi <dbl> -0.3453776, -0.3453776, -0.3453776, -0.34537…
## $ brand_Toyota <dbl> -0.2372063, -0.2372063, -0.2372063, -0.23720…
## $ funny_FALSE. <dbl> -0.6490043, 1.5316502, -0.6490043, 1.5316502…
## $ funny_TRUE. <dbl> 0.6490043, -1.5316502, 0.6490043, -1.5316502…
# Specify model
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), mtry = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine recipe and model using workflow
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
# Tune hyperparameters
set.seed(344)
xgboost_tune <-
tune_grid(
xgboost_workflow,
resamples = data_cv,
grid = 5
)
## i Creating pre-processing data to finalize 1 unknown parameter: "mtry"
tune::show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
## mtry trees min_n learn_rate .metric .estimator mean n std_err .config
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 1500 11 0.00422 rmse standard 2.11 10 0.0818 pre0_mod1…
## 2 57 1 2 0.0750 rmse standard 2.11 10 0.0823 pre0_mod3…
## 3 85 500 30 0.001 rmse standard 2.13 10 0.0810 pre0_mod4…
## 4 113 2000 21 0.0178 rmse standard 2.14 10 0.136 pre0_mod5…
## 5 29 1000 40 0.316 rmse standard 2.18 10 0.0948 pre0_mod2…
# Update the model by selecting the best hyperparameters
xgboost_fw <- tune::finalize_workflow(xgboost_workflow,
tune::select_best(xgboost_tune, metric = "rmse"))
# Fit the model on the entire training data and test it on the test data
data_fit <- tune::last_fit(xgboost_fw, data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 2.07 pre0_mod0_post0
## 2 rsq standard 0.135 pre0_mod0_post0
tune::collect_predictions(data_fit) %>%
# ggplot(aes(price, .pred)) +
ggplot(aes(like_count, .pred)) +
geom_point(alpha = 0.3, fill = "midnightblue") +
geom_abline(lty = 2, color = "gray50") +
coord_fixed()