Goal: The goal is to predict the Youtube like count. Click here for the data.
youtube <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skimr::skim(youtube)
Name | youtube |
Number of rows | 247 |
Number of columns | 25 |
_______________________ | |
Column type frequency: | |
character | 10 |
logical | 7 |
numeric | 7 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
brand | 0 | 1.00 | 3 | 9 | 0 | 10 | 0 |
superbowl_ads_dot_com_url | 0 | 1.00 | 34 | 120 | 0 | 244 | 0 |
youtube_url | 11 | 0.96 | 43 | 43 | 0 | 233 | 0 |
id | 11 | 0.96 | 11 | 11 | 0 | 233 | 0 |
kind | 16 | 0.94 | 13 | 13 | 0 | 1 | 0 |
etag | 16 | 0.94 | 27 | 27 | 0 | 228 | 0 |
title | 16 | 0.94 | 6 | 99 | 0 | 228 | 0 |
description | 50 | 0.80 | 3 | 3527 | 0 | 194 | 0 |
thumbnail | 129 | 0.48 | 48 | 48 | 0 | 118 | 0 |
channel_title | 16 | 0.94 | 3 | 37 | 0 | 185 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
funny | 0 | 1 | 0.69 | TRU: 171, FAL: 76 |
show_product_quickly | 0 | 1 | 0.68 | TRU: 169, FAL: 78 |
patriotic | 0 | 1 | 0.17 | FAL: 206, TRU: 41 |
celebrity | 0 | 1 | 0.29 | FAL: 176, TRU: 71 |
danger | 0 | 1 | 0.30 | FAL: 172, TRU: 75 |
animals | 0 | 1 | 0.37 | FAL: 155, TRU: 92 |
use_sex | 0 | 1 | 0.27 | FAL: 181, TRU: 66 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1.00 | 2010.19 | 5.86 | 2000 | 2005 | 2010 | 2015.00 | 2020 | ▇▇▇▇▆ |
view_count | 16 | 0.94 | 1407556.46 | 11971111.01 | 10 | 6431 | 41379 | 170015.50 | 176373378 | ▇▁▁▁▁ |
like_count | 22 | 0.91 | 4146.03 | 23920.40 | 0 | 19 | 130 | 527.00 | 275362 | ▇▁▁▁▁ |
dislike_count | 22 | 0.91 | 833.54 | 6948.52 | 0 | 1 | 7 | 24.00 | 92990 | ▇▁▁▁▁ |
favorite_count | 16 | 0.94 | 0.00 | 0.00 | 0 | 0 | 0 | 0.00 | 0 | ▁▁▇▁▁ |
comment_count | 25 | 0.90 | 188.64 | 986.46 | 0 | 1 | 10 | 50.75 | 9190 | ▇▁▁▁▁ |
category_id | 16 | 0.94 | 19.32 | 8.00 | 1 | 17 | 23 | 24.00 | 29 | ▃▁▂▆▇ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
published_at | 16 | 0.94 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2013-01-31 09:13:55 | 227 |
data <- youtube %>%
# Treat missing values
select(-superbowl_ads_dot_com_url, -youtube_url, -kind, -etag, -favorite_count, -thumbnail, -channel_title) %>%
na.omit()
data <- data %>%
# Treat missing values
filter(!is.na(like_count)) %>%
# Convert to factor
mutate(category_id = factor(category_id)) %>%
# Log Transform like_count
mutate(like_count = log10(like_count + 1))
data %>% skimr::skim()
Name | Piped data |
Number of rows | 190 |
Number of columns | 18 |
_______________________ | |
Column type frequency: | |
character | 4 |
factor | 1 |
logical | 7 |
numeric | 5 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
brand | 0 | 1 | 3 | 9 | 0 | 10 | 0 |
id | 0 | 1 | 11 | 11 | 0 | 187 | 0 |
title | 0 | 1 | 6 | 99 | 0 | 187 | 0 |
description | 0 | 1 | 11 | 3527 | 0 | 187 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
category_id | 0 | 1 | FALSE | 12 | 24: 71, 23: 34, 22: 21, 1: 17 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
funny | 0 | 1 | 0.69 | TRU: 131, FAL: 59 |
show_product_quickly | 0 | 1 | 0.70 | TRU: 133, FAL: 57 |
patriotic | 0 | 1 | 0.15 | FAL: 161, TRU: 29 |
celebrity | 0 | 1 | 0.30 | FAL: 133, TRU: 57 |
danger | 0 | 1 | 0.31 | FAL: 131, TRU: 59 |
animals | 0 | 1 | 0.36 | FAL: 121, TRU: 69 |
use_sex | 0 | 1 | 0.28 | FAL: 136, TRU: 54 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1 | 2010.10 | 5.73 | 2000 | 2006.00 | 2010.00 | 2014.75 | 2020.00 | ▆▇▇▆▆ |
view_count | 0 | 1 | 1698142.99 | 13187377.43 | 10 | 10484.75 | 58515.50 | 219180.25 | 176373378.00 | ▇▁▁▁▁ |
like_count | 0 | 1 | 2.18 | 1.08 | 0 | 1.52 | 2.22 | 2.77 | 5.44 | ▃▆▇▂▁ |
dislike_count | 0 | 1 | 984.25 | 7554.84 | 0 | 2.00 | 8.50 | 37.00 | 92990.00 | ▇▁▁▁▁ |
comment_count | 0 | 1 | 219.33 | 1063.61 | 0 | 2.00 | 15.00 | 65.00 | 9190.00 | ▇▁▁▁▁ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
published_at | 0 | 1 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2012-02-14 12:03:48 | 186 |
Identify good predictors
likes per year
data %>%
ggplot(aes(year, like_count)) +
geom_col() +
labs(title = "Likes by year",
x = "Year",
y = "Likes")
Likes for brands
data %>%
ggplot(aes(brand, like_count)) +
geom_boxplot()
Likes for categories
# List categories
characteristics <- c("funny", "show_product_quickly", "patriotic", "celebrity", "danger", "animals", "use_sex")
# Select TRUE values and count likes
likes_by_cat <- data %>%
select(all_of(characteristics), like_count) %>%
group_by(dplyr::across(all_of(characteristics))) %>%
summarise(total_likes = sum(like_count, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'funny', 'show_product_quickly',
## 'patriotic', 'celebrity', 'danger', 'animals'. You can override using the
## `.groups` argument.
# Reshape Data
likes_long <- likes_by_cat %>%
pivot_longer(cols = all_of(characteristics), names_to = "characteristic", values_to = "is_true") %>%
filter(is_true)
# Plot Data
likes_long %>%
ggplot(aes(x = characteristic, y = total_likes)) +
geom_col() +
labs(title = "Likes by category",
x = "Category",
y = "Likes")
data %>% count(category_id, sort = TRUE)
## # A tibble: 12 × 2
## category_id n
## <fct> <int>
## 1 24 71
## 2 23 34
## 3 22 21
## 4 1 17
## 5 17 16
## 6 2 14
## 7 10 6
## 8 25 4
## 9 15 3
## 10 27 2
## 11 19 1
## 12 26 1
data %>% group_by(category_id) %>%
summarise(avg_like_count = mean(like_count)) %>%
ungroup() %>%
slice_max(order_by = avg_like_count, n = 10) %>%
ggplot(aes(avg_like_count, fct_reorder(category_id, avg_like_count))) +
geom_col() +
labs(title = "Top Categories by Like Counts",
x = "Avg. Like Counts", y = NULL)
title
data %>%
# tockenize title
unnest_tokens(output = word, input = title) %>%
# calculate avg rent per word
group_by(word) %>%
summarise(like_count = mean(like_count),
n = n()) %>%
ungroup() %>%
filter(n > 10, !str_detect(word, "\\d")) %>%
slice_max(order_by = like_count, n = 20) %>%
#Plot
ggplot(aes(like_count, fct_reorder(word, like_count))) +
geom_point() +
labs(y = "words in Title")
EDA shortcut
# Step 1: Prepare data
data_binarized_tbl <- data %>%
select(-published_at) %>%
binarize()
data_binarized_tbl %>% glimpse()
## Rows: 190
## Columns: 61
## $ `year__-Inf_2006` <dbl> 0, 1, 0, 1, 0, 0,…
## $ year__2006_2010 <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2010_2014.75 <dbl> 0, 0, 0, 0, 0, 0,…
## $ year__2014.75_Inf <dbl> 1, 0, 1, 0, 1, 1,…
## $ brand__Bud_Light <dbl> 1, 1, 0, 1, 0, 0,…
## $ brand__Budweiser <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 1,…
## $ brand__Doritos <dbl> 0, 0, 0, 0, 0, 0,…
## $ `brand__E-Trade` <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Hynudai <dbl> 0, 0, 1, 0, 0, 0,…
## $ brand__Kia <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__NFL <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Pepsi <dbl> 0, 0, 0, 0, 0, 0,…
## $ brand__Toyota <dbl> 0, 0, 0, 0, 1, 0,…
## $ funny__0 <dbl> 0, 0, 1, 0, 0, 0,…
## $ funny__1 <dbl> 1, 1, 0, 1, 1, 1,…
## $ show_product_quickly__0 <dbl> 0, 1, 0, 0, 0, 1,…
## $ show_product_quickly__1 <dbl> 1, 0, 1, 1, 1, 0,…
## $ patriotic__0 <dbl> 1, 1, 1, 1, 1, 1,…
## $ patriotic__1 <dbl> 0, 0, 0, 0, 0, 0,…
## $ celebrity__0 <dbl> 0, 1, 1, 1, 0, 0,…
## $ celebrity__1 <dbl> 1, 0, 0, 0, 1, 1,…
## $ danger__0 <dbl> 0, 0, 1, 0, 0, 1,…
## $ danger__1 <dbl> 1, 1, 0, 1, 1, 0,…
## $ animals__0 <dbl> 1, 0, 1, 0, 0, 0,…
## $ animals__1 <dbl> 0, 1, 0, 1, 1, 1,…
## $ use_sex__0 <dbl> 1, 1, 1, 0, 1, 1,…
## $ use_sex__1 <dbl> 0, 0, 0, 1, 0, 0,…
## $ id__Q5Hu_FBUIsk <dbl> 0, 0, 0, 0, 0, 0,…
## $ `id__-OTHER` <dbl> 1, 1, 1, 1, 1, 1,…
## $ `view_count__-Inf_10484.75` <dbl> 0, 0, 1, 0, 0, 0,…
## $ view_count__10484.75_58515.5 <dbl> 1, 0, 0, 1, 1, 0,…
## $ view_count__58515.5_219180.25 <dbl> 0, 1, 0, 0, 0, 0,…
## $ view_count__219180.25_Inf <dbl> 0, 0, 0, 0, 0, 1,…
## $ `like_count__-Inf_1.51851393987789` <dbl> 0, 0, 1, 1, 0, 0,…
## $ like_count__1.51851393987789_2.22007656488678 <dbl> 0, 1, 0, 0, 1, 0,…
## $ like_count__2.22007656488678_2.77066783242838 <dbl> 1, 0, 0, 0, 0, 0,…
## $ like_count__2.77066783242838_Inf <dbl> 0, 0, 0, 0, 0, 1,…
## $ `dislike_count__-Inf_2` <dbl> 0, 0, 1, 0, 0, 0,…
## $ dislike_count__2_8.5 <dbl> 0, 0, 0, 1, 0, 0,…
## $ dislike_count__8.5_37 <dbl> 1, 1, 0, 0, 1, 0,…
## $ dislike_count__37_Inf <dbl> 0, 0, 0, 0, 0, 1,…
## $ `comment_count__-Inf_2` <dbl> 0, 0, 1, 1, 0, 0,…
## $ comment_count__2_15 <dbl> 1, 1, 0, 0, 1, 0,…
## $ comment_count__15_65 <dbl> 0, 0, 0, 0, 0, 0,…
## $ comment_count__65_Inf <dbl> 0, 0, 0, 0, 0, 1,…
## $ title__Bud_Lighta_Cedric_a_Island_Fantasy_2005 <dbl> 0, 0, 0, 0, 0, 0,…
## $ `title__-OTHER` <dbl> 1, 1, 1, 1, 1, 1,…
## $ description__Bud_Lighta_Cedric_a_Island_Fantasy_2005 <dbl> 0, 0, 0, 0, 0, 0,…
## $ `description__-OTHER` <dbl> 1, 1, 1, 1, 1, 1,…
## $ category_id__1 <dbl> 0, 0, 0, 0, 1, 0,…
## $ category_id__2 <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__10 <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__15 <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__17 <dbl> 0, 1, 0, 0, 0, 0,…
## $ category_id__22 <dbl> 0, 0, 1, 0, 0, 0,…
## $ category_id__23 <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__24 <dbl> 0, 0, 0, 1, 0, 1,…
## $ category_id__25 <dbl> 0, 0, 0, 0, 0, 0,…
## $ category_id__27 <dbl> 1, 0, 0, 0, 0, 0,…
## $ `category_id__-OTHER` <dbl> 0, 0, 0, 0, 0, 0,…
# Step 2: Correlate
data_corr_tbl <- data_binarized_tbl %>%
correlate(like_count__2.77066783242838_Inf)
data_corr_tbl
## # A tibble: 61 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 2.77066783242838_Inf 1
## 2 view_count 219180.25_Inf 0.777
## 3 comment_count 65_Inf 0.761
## 4 dislike_count 37_Inf 0.649
## 5 comment_count -Inf_2 -0.362
## 6 dislike_count -Inf_2 -0.362
## 7 like_count -Inf_1.51851393987789 -0.343
## 8 view_count -Inf_10484.75 -0.338
## 9 view_count 10484.75_58515.5 -0.333
## 10 like_count 2.22007656488678_2.77066783242838 -0.333
## # ℹ 51 more rows
# Step 3: Plot
data_corr_tbl %>%
plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
data <- data %>%
select(id, like_count, category_id, description, brand, year)
Split Data
# Split into train and test dataset
set.seed(123)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training dataset for cross-validation
data_cv <- bootstraps(data_train, strata = like_count)
data_cv
## # Bootstrap sampling using stratification
## # A tibble: 25 × 2
## splits id
## <list> <chr>
## 1 <split [142/53]> Bootstrap01
## 2 <split [142/57]> Bootstrap02
## 3 <split [142/49]> Bootstrap03
## 4 <split [142/52]> Bootstrap04
## 5 <split [142/51]> Bootstrap05
## 6 <split [142/55]> Bootstrap06
## 7 <split [142/52]> Bootstrap07
## 8 <split [142/50]> Bootstrap08
## 9 <split [142/56]> Bootstrap09
## 10 <split [142/46]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)
use_xgboost(like_count ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = like_count ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(57912)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <-
recipe(formula = like_count ~ ., data = data_train) %>%
recipes::update_role(id, new_role = "id") %>%
step_tokenize(description) %>%
step_tokenfilter(description, max_tokens = 100) %>%
step_tfidf(description) %>%
step_dummy(all_nominal_predictors())
xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()
## Rows: 142
## Columns: 123
## $ id <fct> Zb3fhsfs6ZU, JJmqCKtJnxM, lbkafMhmvM…
## $ year <dbl> 2006, 2000, 2020, 2009, 2016, 2001, …
## $ like_count <dbl> 3.338456, 4.395169, 5.244104, 2.8830…
## $ tfidf_description_2 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_2012 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_2018 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_2019 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_2020 <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_a <dbl> 0.46926686, 0.00000000, 0.04022287, …
## $ tfidf_description_ad <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_ads <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_all <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_an <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_and <dbl> 0.00000000, 0.00000000, 0.04385530, …
## $ tfidf_description_are <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_as <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_assistant <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_at <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_be <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_beer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_ben <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_best <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_big <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_bowl <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_bud <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_budweiser <dbl> 0.00000000, 1.09240103, 0.00000000, …
## $ tfidf_description_but <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_by <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_callner <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_channel <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_comedy <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_commercial <dbl> 0.00000000, 0.60116606, 0.00000000, …
## $ tfidf_description_commercials <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_de <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_do <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_doritos <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_during <dbl> 0.45354924, 0.00000000, 0.00000000, …
## $ tfidf_description_facebook <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_favorite <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_featuring <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_fire <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_follow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_for <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_from <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_full <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_funny <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_game <dbl> 0.39136593, 0.00000000, 0.00000000, …
## $ tfidf_description_he <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_here <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_his <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_http <dbl> 0.00000000, 0.00000000, 0.27130257, …
## $ tfidf_description_https <dbl> 0.00000000, 0.00000000, 0.11953509, …
## $ tfidf_description_hyundai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_in <dbl> 0.00000000, 0.00000000, 0.04912298, …
## $ tfidf_description_instagram <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_is <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_it <dbl> 0.0000000, 0.0000000, 0.0000000, 0.2…
## $ tfidf_description_kia <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_light <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_like <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_more <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_most <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_network <dbl> 0.00000000, 0.00000000, 0.09659985, …
## $ tfidf_description_new <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_nfl <dbl> 0.00000000, 0.00000000, 0.69897969, …
## $ tfidf_description_of <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_on <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_one <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_other <dbl> 0.00000000, 0.00000000, 0.07291560, …
## $ tfidf_description_our <dbl> 0.00000000, 0.00000000, 0.07521550, …
## $ tfidf_description_out <dbl> 0.00000000, 0.00000000, 0.07521550, …
## $ tfidf_description_pepsi <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_podcasts <dbl> 0.0000000, 0.0000000, 0.1027804, 0.0…
## $ tfidf_description_producer <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_production <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_see <dbl> 0.43875711, 0.00000000, 0.00000000, …
## $ tfidf_description_subscribe <dbl> 0.00000000, 0.00000000, 0.08374839, …
## $ tfidf_description_super <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_superbowl <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_that <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_the <dbl> 0.00000000, 0.00000000, 0.09580728, …
## $ tfidf_description_this <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_time <dbl> 0.0000000, 0.0000000, 0.0000000, 0.3…
## $ tfidf_description_to <dbl> 0.25582256, 0.00000000, 0.08771059, …
## $ tfidf_description_today <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_toyota <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_description_twitter <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_description_twitter.com <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_up <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_us <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_viewed <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0…
## $ tfidf_description_visit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_was <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_watch <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_when <dbl> 0.000000, 0.000000, 0.000000, 0.0000…
## $ tfidf_description_with <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_world <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_www.nfl.com <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_www.youtube.com <dbl> 0.0000000, 0.0000000, 0.4665078, 0.0…
## $ tfidf_description_xli <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tfidf_description_you <dbl> 0.00000000, 0.00000000, 0.00000000, …
## $ tfidf_description_your <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id_X10 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id_X15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X17 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X19 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X22 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id_X23 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id_X24 <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ category_id_X25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X26 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id_X27 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Budweiser <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand_Coca.Cola <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand_Doritos <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ brand_E.Trade <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ brand_Hynudai <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_NFL <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Pepsi <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ brand_Toyota <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(64548)
doParallel::registerDoParallel()
xgboost_tune <-
tune_grid(xgboost_workflow, resamples = data_cv,
grid = 5)
## Warning: package 'xgboost' was built under R version 4.3.3