# Read the TidyTuesday 2021-03-02 `youtube.csv` file directly from GitHub.
youtube <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv"
)
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column of `youtube` (missingness, ranges, distributions)
# to decide which columns to keep and how to treat them.
skimr::skim(youtube)
| Name | youtube |
| Number of rows | 247 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| logical | 7 |
| numeric | 7 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| brand | 0 | 1.00 | 3 | 9 | 0 | 10 | 0 |
| superbowl_ads_dot_com_url | 0 | 1.00 | 34 | 120 | 0 | 244 | 0 |
| youtube_url | 11 | 0.96 | 43 | 43 | 0 | 233 | 0 |
| id | 11 | 0.96 | 11 | 11 | 0 | 233 | 0 |
| kind | 16 | 0.94 | 13 | 13 | 0 | 1 | 0 |
| etag | 16 | 0.94 | 27 | 27 | 0 | 228 | 0 |
| title | 16 | 0.94 | 6 | 99 | 0 | 228 | 0 |
| description | 50 | 0.80 | 3 | 3527 | 0 | 194 | 0 |
| thumbnail | 129 | 0.48 | 48 | 48 | 0 | 118 | 0 |
| channel_title | 16 | 0.94 | 3 | 37 | 0 | 185 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| funny | 0 | 1 | 0.69 | TRU: 171, FAL: 76 |
| show_product_quickly | 0 | 1 | 0.68 | TRU: 169, FAL: 78 |
| patriotic | 0 | 1 | 0.17 | FAL: 206, TRU: 41 |
| celebrity | 0 | 1 | 0.29 | FAL: 176, TRU: 71 |
| danger | 0 | 1 | 0.30 | FAL: 172, TRU: 75 |
| animals | 0 | 1 | 0.37 | FAL: 155, TRU: 92 |
| use_sex | 0 | 1 | 0.27 | FAL: 181, TRU: 66 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2010.19 | 5.86 | 2000 | 2005 | 2010 | 2015.00 | 2020 | ▇▇▇▇▆ |
| view_count | 16 | 0.94 | 1407556.46 | 11971111.01 | 10 | 6431 | 41379 | 170015.50 | 176373378 | ▇▁▁▁▁ |
| like_count | 22 | 0.91 | 4146.03 | 23920.40 | 0 | 19 | 130 | 527.00 | 275362 | ▇▁▁▁▁ |
| dislike_count | 22 | 0.91 | 833.54 | 6948.52 | 0 | 1 | 7 | 24.00 | 92990 | ▇▁▁▁▁ |
| favorite_count | 16 | 0.94 | 0.00 | 0.00 | 0 | 0 | 0 | 0.00 | 0 | ▁▁▇▁▁ |
| comment_count | 25 | 0.90 | 188.64 | 986.46 | 0 | 1 | 10 | 50.75 | 9190 | ▇▁▁▁▁ |
| category_id | 16 | 0.94 | 19.32 | 8.00 | 1 | 17 | 23 | 24.00 | 29 | ▃▁▂▆▇ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| published_at | 16 | 0.94 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2013-01-31 09:13:55 | 227 |
# Build the modelling data set from the raw `youtube` table.
data <- youtube %>%
  # Treat missing values: drop the sparsely populated / metadata columns
  # first, so na.omit() only removes rows that are incomplete in a column
  # that is actually kept.
  select(
    -thumbnail, -channel_title, -description, -etag, -category_id,
    -youtube_url, -kind
  ) %>%
  na.omit() %>%
  # Log-transform the positively skewed like counts. like_count contains
  # zeros, and log(0) is -Inf, which xgboost rejects ("Input data contains
  # `inf` or `nan`"). Clamping to 1 before taking the log maps zero likes to
  # 0 and leaves every other (finite) value exactly as before.
  mutate(like_count = log(pmax(like_count, 1)))
Identify Good Predictors
view count
# Scatter plot of like count (x) against view count (y, log10 axis),
# drawn from the raw `youtube` table.
ggplot(data = youtube, mapping = aes(x = like_count, y = view_count)) +
  geom_point() +
  scale_y_log10()
## Warning: Removed 22 rows containing missing values (`geom_point()`).
dislike count
# Scatter plot of like count (x) against dislike count (y), with the
# dislike count converted to a factor so each distinct value is its own level.
ggplot(
  data = youtube,
  mapping = aes(x = like_count, y = as.factor(dislike_count))
) +
  geom_point()
## Warning: Removed 22 rows containing missing values (`geom_point()`).
title
# Which title words go together with the highest average like count?
youtube %>%
  # Tokenize the title: one row per (video, word)
  unnest_tokens(output = word, input = title) %>%
  # Average like count per word. like_count has missing values in `youtube`,
  # so drop them here; otherwise a single NA turns a word's mean into NA.
  group_by(word) %>%
  summarise(
    like_count = mean(like_count, na.rm = TRUE),
    n = n()
  ) %>%
  ungroup() %>%
  # Keep words that occur more than 5 times and contain no digits
  filter(n > 5, !str_detect(word, "\\d")) %>%
  # Top 20 words by average like count
  slice_max(order_by = like_count, n = 20) %>%
  # Plot, ordering the words by their average like count
  ggplot(aes(like_count, fct_reorder(word, like_count))) +
  geom_point()
EDA Shortcuts
# Step 1: Prepare Data
# Remove the identifier, text, date and other columns that should not enter
# the correlation funnel, then binarize the remaining columns.
data_binarized_tbl <- binarize(
  select(
    data,
    -c(
      title, id, published_at, show_product_quickly, year, brand,
      superbowl_ads_dot_com_url
    )
  )
)
glimpse(data_binarized_tbl)
## Rows: 219
## Columns: 28
## $ funny__0 <dbl> 0, 0, 1, 0, 0, 0, 1, 0, …
## $ funny__1 <dbl> 1, 1, 0, 1, 1, 1, 0, 1, …
## $ patriotic__0 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__0 <dbl> 0, 1, 1, 1, 0, 0, 0, 0, …
## $ celebrity__1 <dbl> 1, 0, 0, 0, 1, 1, 1, 1, …
## $ danger__0 <dbl> 0, 0, 1, 0, 0, 1, 1, 1, …
## $ danger__1 <dbl> 1, 1, 0, 1, 1, 0, 0, 0, …
## $ animals__0 <dbl> 1, 0, 1, 0, 0, 0, 1, 0, …
## $ animals__1 <dbl> 0, 1, 0, 1, 1, 1, 0, 1, …
## $ use_sex__0 <dbl> 1, 1, 1, 0, 1, 1, 1, 1, …
## $ use_sex__1 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ `view_count__-Inf_6577` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ view_count__6577_41828 <dbl> 0, 0, 0, 1, 1, 0, 1, 1, …
## $ view_count__41828_176014.5 <dbl> 1, 1, 0, 0, 0, 0, 0, 0, …
## $ view_count__176014.5_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `like_count__-Inf_2.9174053685313` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ like_count__2.9174053685313_4.86753445045558 <dbl> 0, 1, 0, 1, 1, 0, 1, 0, …
## $ like_count__4.86753445045558_6.17478337258445 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ like_count__6.17478337258445_Inf <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ `dislike_count__-Inf_1` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ dislike_count__1_7 <dbl> 0, 0, 0, 1, 0, 0, 1, 1, …
## $ dislike_count__7_24 <dbl> 1, 1, 0, 0, 1, 0, 0, 0, …
## $ dislike_count__24_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ `comment_count__-Inf_1` <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ comment_count__1_11 <dbl> 0, 1, 0, 1, 0, 0, 1, 0, …
## $ comment_count__11_51.5 <dbl> 1, 0, 0, 0, 1, 0, 0, 1, …
## $ comment_count__51.5_Inf <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
# Step 2: Correlate
# Correlate every binarized feature with the uppermost like_count bin.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  target = like_count__6.17478337258445_Inf
)
data_corr_tbl
## # A tibble: 28 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 6.17478337258445_Inf 1
## 2 comment_count 51.5_Inf 0.806
## 3 view_count 176014.5_Inf 0.733
## 4 dislike_count 24_Inf 0.695
## 5 comment_count -Inf_1 -0.372
## 6 dislike_count -Inf_1 -0.352
## 7 dislike_count 1_7 -0.348
## 8 view_count -Inf_6577 -0.335
## 9 like_count -Inf_2.9174053685313 -0.335
## 10 like_count 2.9174053685313_4.86753445045558 -0.335
## # ℹ 18 more rows
# Step 3: Plot
# Draw the correlation funnel from the correlations computed in step 2.
plot_correlation_funnel(data_corr_tbl)
# Hold out a test set; seeded so the split is reproducible
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Resample the training set into cross-validation folds (separate seed)
set.seed(2345)
data_cv <- data_train %>% rsample::vfold_cv()
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [147/17]> Fold01
## 2 <split [147/17]> Fold02
## 3 <split [147/17]> Fold03
## 4 <split [147/17]> Fold04
## 5 <split [148/16]> Fold05
## 6 <split [148/16]> Fold06
## 7 <split [148/16]> Fold07
## 8 <split [148/16]> Fold08
## 9 <split [148/16]> Fold09
## 10 <split [148/16]> Fold10
# Print a code template (recipe, model spec, workflow, tuning call) for an
# xgboost model of this formula; the template is only printed, not run.
library(usemodels)
# NOTE(review): the template is requested for `like_count` as outcome, while
# the recipe written afterwards models `view_count` -- confirm which outcome
# is intended.
usemodels::use_xgboost(like_count ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = like_count ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(81602)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Specify Recipe
# Pre-processing for a model of view_count on the training data.
xgboost_recipe <-
  recipe(formula = view_count ~ ., data = data_train) %>%
  # Keep `id` in the data for bookkeeping but out of the predictor set
  recipes::update_role(id, new_role = "id variable") %>%
  # Drop the columns that are not used as predictors. step_rm() is the
  # dedicated removal step and replaces the negated step_select() call.
  step_rm(
    animals, funny, danger, use_sex, patriotic, celebrity,
    show_product_quickly, year, brand, superbowl_ads_dot_com_url
  ) %>%
  # Turn the title into tf-idf features over the 100 retained tokens
  step_tokenize(title) %>%
  step_tokenfilter(title, max_tokens = 100) %>%
  step_tfidf(title) %>%
  # Replace the timestamp with date-derived features
  step_date(published_at, keep_original_cols = FALSE) %>%
  # One-hot encode the remaining nominal predictors
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # NOTE(review): like_count was already log-transformed when `data` was
  # built -- confirm the additional Yeo-Johnson transform on it is intended.
  step_YeoJohnson(like_count, dislike_count, comment_count)

# Inspect the processed training set. bake(new_data = NULL) returns the
# prepped training data and supersedes juice().
xgboost_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Rows: 164
## Columns: 126
## $ id <fct> sl8ooTIMk2w, HtBZvl7dIu4, ecqiZn2DDFQ, yQ_nU0_…
## $ like_count <dbl> 6.5220928, 10.7877302, 7.2327331, 4.7706846, 2…
## $ dislike_count <dbl> 2.1099882, 4.1906048, 3.0446622, 1.6019271, 0.…
## $ favorite_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ comment_count <dbl> 3.4294870, 5.5455775, 3.8674433, 1.9313973, 0.…
## $ view_count <dbl> 77720, 28785122, 1060001, 88445, 27378, 1294, …
## $ tfidf_title_2001 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2005 <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_2006 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2007 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2008 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2009 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2010 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2011 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2012 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2013 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_2014 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2015 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2016 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2017 <dbl> 0.0000000, 0.7364734, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2018 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2019 <dbl> 0.8801152, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_2020 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_44 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_a <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_ad <dbl> 0.5913209, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_ads <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_and <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_baby <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_best <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_big <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_black <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_body <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bowl <dbl> 0.0000000, 0.2011376, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_britney <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_bud <dbl> 0.0000000, 0.0000000, 0.3384774, 0.5641290, 0.…
## $ tfidf_title_budweiser <dbl> 0.0000000, 0.3065021, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_camry <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_car <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_cedric <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cindy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_clydesdale <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_coca <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_coke <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cola <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_commercial <dbl> 0.0000000, 0.1705648, 0.2046778, 0.3411296, 0.…
## $ tfidf_title_commercials <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_cool <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_crash <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_crown <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_date <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_diet <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_dilly <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_dog <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_dogs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_doritos <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_down <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_elantra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_etrade <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_exclusive <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_factory <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_fantasy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_featuring <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_flavor <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_fly <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_for <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_ft <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_funny <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_game <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_genesis <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_title_girlfriend <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_great <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 2.20942, 0…
## $ tfidf_title_halftime <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_happiness <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_hd <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_horse <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_hyundai <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_in <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0…
## $ tfidf_title_island <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_jackie <dbl> 0.0000000, 0.0000000, 0.8837681, 0.0000000, 0.…
## $ tfidf_title_journey <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_kia <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_legends <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_light <dbl> 0.0000000, 0.0000000, 0.3573392, 0.5955654, 0.…
## $ tfidf_title_lighta <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_love <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_max <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_meter <dbl> 1.10471, 0.00000, 0.00000, 0.00000, 0.00000, 0…
## $ tfidf_title_new <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_nfl <dbl> 0.7989384, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_of <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_official <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_on <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_pepsi <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_spot <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_super <dbl> 0.0000000, 0.2011376, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_superbowl <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_the <dbl> 0.0000000, 0.3557840, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_toyota <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_tv <dbl> 0.0000000, 0.0000000, 0.7040922, 0.0000000, 0.…
## $ tfidf_title_vs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_winner <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ tfidf_title_with <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_xliii <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_xliv <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ tfidf_title_you <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_year <int> 2019, 2017, 2008, 2007, 2010, 2009, 2014, 2008…
## $ published_at_dow_Sun <dbl> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0…
## $ published_at_dow_Mon <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
## $ published_at_dow_Tue <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_dow_Wed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_dow_Thu <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0…
## $ published_at_dow_Fri <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ published_at_dow_Sat <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ published_at_month_Jan <dbl> 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Feb <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1…
## $ published_at_month_Mar <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Apr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_May <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Jun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Jul <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Aug <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Sep <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ published_at_month_Oct <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0…
## $ published_at_month_Nov <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ published_at_month_Dec <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
# Specify Model
# Boosted-tree regression on the xgboost engine; the four listed
# hyperparameters are left as tune() placeholders for the grid search.
xgboost_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = tune()
  ) %>%
  set_mode("regression") %>%
  set_engine("xgboost")
# Bundle the model specification and the pre-processing recipe into a
# single workflow object.
xgboost_workflow <- workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_recipe)
# Tune hyperparameters
# Evaluate 5 candidate parameter sets on the cross-validation folds; the
# seed is set immediately before the call so the run is reproducible.
set.seed(81602)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)
## i Creating pre-processing data to finalize unknown parameter: mtry
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## Warning in optimize(yj_obj, interval = limits, maximum = TRUE, dat = dat, :
## NA/Inf replaced by maximum positive value
## → A | warning: NA/Inf replaced by maximum positive value
##
There were issues with some computations A: x1
→ B | error: [10:45:12] src/data/data.cc:1104: Check failed: valid: Input data contains `inf` or `nan`
## Stack trace:
## [bt] (0) 1 xgboost.so 0x0000000115b66734 dmlc::LogMessageFatal::~LogMessageFatal() + 116
## [bt] (1) 2 xgboost.so 0x0000000115c0a673 unsigned long long xgboost::SparsePage::Push<xgboost::data::DenseAdapterBatch>(xgboost::data::DenseAdapterBatch const&, float, int) + 1235
## [bt] (2) 3 xgboost.so 0x0000000115bfafd2 xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int) + 466
## [bt] (3) 4 xgboost.so 0x0000000115c09e75 xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) + 53
## [bt] (4) 5 xgboost.so 0x0000000115d58eee XGDMatrixCreateFromMat_om
## There were issues with some computations A: x1
There were issues with some computations A: x1 B: x1
There were issues with some computations A: x1 B: x5
There were issues with some computations A: x2 B: x5
→ C | error: [10:45:13] src/data/data.cc:1104: Check failed: valid: Input data contains `inf` or `nan`
## Stack trace:
## [bt] (0) 1 xgboost.so 0x0000000115b66734 dmlc::LogMessageFatal::~LogMessageFatal() + 116
## [bt] (1) 2 xgboost.so 0x0000000115c0a673 unsigned long long xgboost::SparsePage::Push<xgboost::data::DenseAdapterBatch>(xgboost::data::DenseAdapterBatch const&, float, int) + 1235
## [bt] (2) 3 xgboost.so 0x0000000115bfafd2 xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int) + 466
## [bt] (3) 4 xgboost.so 0x0000000115c09e75 xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) + 53
## [bt] (4) 5 xgboost.so 0x0000000115d58eee XGDMatrixCreateFromMat_om
## There were issues with some computations A: x2 B: x5
There were issues with some computations A: x2 B: x5 C: x4
There were issues with some computations A: x3 B: x5 C: x5
There were issues with some computations A: x3 B: x5 C: x8
There were issues with some computations A: x4 B: x5 C: x10
→ D | error: [10:45:14] src/data/data.cc:1104: Check failed: valid: Input data contains `inf` or `nan`
## Stack trace:
## [bt] (0) 1 xgboost.so 0x0000000115b66734 dmlc::LogMessageFatal::~LogMessageFatal() + 116
## [bt] (1) 2 xgboost.so 0x0000000115c0a673 unsigned long long xgboost::SparsePage::Push<xgboost::data::DenseAdapterBatch>(xgboost::data::DenseAdapterBatch const&, float, int) + 1235
## [bt] (2) 3 xgboost.so 0x0000000115bfafd2 xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int) + 466
## [bt] (3) 4 xgboost.so 0x0000000115c09e75 xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) + 53
## [bt] (4) 5 xgboost.so 0x0000000115d58eee XGDMatrixCreateFromMat_om
## There were issues with some computations A: x4 B: x5 C: x10
There were issues with some computations A: x4 B: x5 C: x10 D: x2
There were issues with some computations A: x4 B: x5 C: x10 D: x4
There were issues with some computations A: x5 B: x5 C: x10 D: x5
There were issues with some computations A: x5 B: x5 C: x10 D: x8
There were issues with some computations A: x6 B: x5 C: x10 D: x10
→ E | error: [10:45:15] src/data/data.cc:1104: Check failed: valid: Input data contains `inf` or `nan`
## Stack trace:
## [bt] (0) 1 xgboost.so 0x0000000115b66734 dmlc::LogMessageFatal::~LogMessageFatal() + 116
## [bt] (1) 2 xgboost.so 0x0000000115c0a673 unsigned long long xgboost::SparsePage::Push<xgboost::data::DenseAdapterBatch>(xgboost::data::DenseAdapterBatch const&, float, int) + 1235
## [bt] (2) 3 xgboost.so 0x0000000115bfafd2 xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int) + 466
## [bt] (3) 4 xgboost.so 0x0000000115c09e75 xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) + 53
## [bt] (4) 5 xgboost.so 0x0000000115d58eee XGDMatrixCreateFromMat_om
## There were issues with some computations A: x6 B: x5 C: x10 D: x10
There were issues with some computations A: x6 B: x5 C: x10 D: x10 E:…
There were issues with some computations A: x7 B: x5 C: x10 D: x10 E:…
There were issues with some computations A: x7 B: x5 C: x10 D: x10 E:…
There were issues with some computations A: x7 B: x5 C: x10 D: x10 E:…
There were issues with some computations A: x8 B: x5 C: x10 D: x10 E:…
→ F | error: [10:45:16] src/data/data.cc:1104: Check failed: valid: Input data contains `inf` or `nan`
## Stack trace:
## [bt] (0) 1 xgboost.so 0x0000000115b66734 dmlc::LogMessageFatal::~LogMessageFatal() + 116
## [bt] (1) 2 xgboost.so 0x0000000115c0a673 unsigned long long xgboost::SparsePage::Push<xgboost::data::DenseAdapterBatch>(xgboost::data::DenseAdapterBatch const&, float, int) + 1235
## [bt] (2) 3 xgboost.so 0x0000000115bfafd2 xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int) + 466
## [bt] (3) 4 xgboost.so 0x0000000115c09e75 xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) + 53
## [bt] (4) 5 xgboost.so 0x0000000115d58eee XGDMatrixCreateFromMat_om
## There were issues with some computations A: x8 B: x5 C: x10 D: x10 E:…
There were issues with some computations A: x8 B: x5 C: x10 D: x10 E:…
There were issues with some computations A: x9 B: x5 C: x10 D: x10 E:…
There were issues with some computations A: x9 B: x5 C: x10 D: x10 E:…
There were issues with some computations A: x10 B: x5 C: x10 D: x10 E…
→ G | error: [10:45:17] src/data/data.cc:1104: Check failed: valid: Input data contains `inf` or `nan`
## Stack trace:
## [bt] (0) 1 xgboost.so 0x0000000115b66734 dmlc::LogMessageFatal::~LogMessageFatal() + 116
## [bt] (1) 2 xgboost.so 0x0000000115c0a673 unsigned long long xgboost::SparsePage::Push<xgboost::data::DenseAdapterBatch>(xgboost::data::DenseAdapterBatch const&, float, int) + 1235
## [bt] (2) 3 xgboost.so 0x0000000115bfafd2 xgboost::data::SimpleDMatrix::SimpleDMatrix<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int) + 466
## [bt] (3) 4 xgboost.so 0x0000000115c09e75 xgboost::DMatrix* xgboost::DMatrix::Create<xgboost::data::DenseAdapter>(xgboost::data::DenseAdapter*, float, int, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) + 53
## [bt] (4) 5 xgboost.so 0x0000000115d58eee XGDMatrixCreateFromMat_om
## There were issues with some computations A: x10 B: x5 C: x10 D: x10 E…
There were issues with some computations A: x10 B: x5 C: x10 D: x10 E…
There were issues with some computations A: x10 B: x5 C: x10 D: x10 E…
## Warning: All models failed. Run `show_notes(.Last.tune.result)` for more
## information.