Superbowl commercials: Build a regression model to predict the Youtube like count (like_count). Use the youtube dataset.
youtube <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-02/youtube.csv')
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
youtube %>% glimpse()
## Rows: 247
## Columns: 25
## $ year <dbl> 2018, 2020, 2006, 2018, 2003, 2020, 2020, 20…
## $ brand <chr> "Toyota", "Bud Light", "Bud Light", "Hynudai…
## $ superbowl_ads_dot_com_url <chr> "https://superbowl-ads.com/good-odds-toyota/…
## $ youtube_url <chr> "https://www.youtube.com/watch?v=zeBZvwYQ-hA…
## $ funny <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, …
## $ show_product_quickly <lgl> FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE,…
## $ patriotic <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ celebrity <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE…
## $ danger <lgl> FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE,…
## $ animals <lgl> FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE,…
## $ use_sex <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FAL…
## $ id <chr> "zeBZvwYQ-hA", "nbbp0VW7z8w", "yk0MQD5YgV8",…
## $ kind <chr> "youtube#video", "youtube#video", "youtube#v…
## $ etag <chr> "rn-ggKNly38Cl0C3CNjNnUH9xUw", "1roDoK-SYqSp…
## $ view_count <dbl> 173929, 47752, 142310, 198, 13741, 23636, 30…
## $ like_count <dbl> 1233, 485, 129, 2, 20, 115, 1470, 78, 342, 7…
## $ dislike_count <dbl> 38, 14, 15, 0, 3, 11, 384, 6, 7, 0, 14, 0, 2…
## $ favorite_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ comment_count <dbl> NA, 14, 9, 0, 2, 13, 227, 6, 30, 0, 8, 1, 13…
## $ published_at <dttm> 2018-02-03 11:29:14, 2020-01-31 21:04:13, 2…
## $ title <chr> "Toyota Super Bowl Commercial 2018 Good Odds…
## $ description <chr> "Toyota Super Bowl Commercial 2018 Good Odds…
## $ thumbnail <chr> "https://i.ytimg.com/vi/zeBZvwYQ-hA/sddefaul…
## $ channel_title <chr> "Funny Commercials", "VCU Brandcenter", "Joh…
## $ category_id <dbl> 1, 27, 17, 22, 24, 1, 24, 2, 24, 24, 24, 24,…
youtube %>% skimr::skim()
Name | Piped data |
Number of rows | 247 |
Number of columns | 25 |
_______________________ | |
Column type frequency: | |
character | 10 |
logical | 7 |
numeric | 7 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
brand | 0 | 1.00 | 3 | 9 | 0 | 10 | 0 |
superbowl_ads_dot_com_url | 0 | 1.00 | 34 | 120 | 0 | 244 | 0 |
youtube_url | 11 | 0.96 | 43 | 43 | 0 | 233 | 0 |
id | 11 | 0.96 | 11 | 11 | 0 | 233 | 0 |
kind | 16 | 0.94 | 13 | 13 | 0 | 1 | 0 |
etag | 16 | 0.94 | 27 | 27 | 0 | 228 | 0 |
title | 16 | 0.94 | 6 | 99 | 0 | 228 | 0 |
description | 50 | 0.80 | 3 | 3527 | 0 | 194 | 0 |
thumbnail | 129 | 0.48 | 48 | 48 | 0 | 118 | 0 |
channel_title | 16 | 0.94 | 3 | 37 | 0 | 185 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
funny | 0 | 1 | 0.69 | TRU: 171, FAL: 76 |
show_product_quickly | 0 | 1 | 0.68 | TRU: 169, FAL: 78 |
patriotic | 0 | 1 | 0.17 | FAL: 206, TRU: 41 |
celebrity | 0 | 1 | 0.29 | FAL: 176, TRU: 71 |
danger | 0 | 1 | 0.30 | FAL: 172, TRU: 75 |
animals | 0 | 1 | 0.37 | FAL: 155, TRU: 92 |
use_sex | 0 | 1 | 0.27 | FAL: 181, TRU: 66 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1.00 | 2010.19 | 5.86 | 2000 | 2005 | 2010 | 2015.00 | 2020 | ▇▇▇▇▆ |
view_count | 16 | 0.94 | 1407556.46 | 11971111.01 | 10 | 6431 | 41379 | 170015.50 | 176373378 | ▇▁▁▁▁ |
like_count | 22 | 0.91 | 4146.03 | 23920.40 | 0 | 19 | 130 | 527.00 | 275362 | ▇▁▁▁▁ |
dislike_count | 22 | 0.91 | 833.54 | 6948.52 | 0 | 1 | 7 | 24.00 | 92990 | ▇▁▁▁▁ |
favorite_count | 16 | 0.94 | 0.00 | 0.00 | 0 | 0 | 0 | 0.00 | 0 | ▁▁▇▁▁ |
comment_count | 25 | 0.90 | 188.64 | 986.46 | 0 | 1 | 10 | 50.75 | 9190 | ▇▁▁▁▁ |
category_id | 16 | 0.94 | 19.32 | 8.00 | 1 | 17 | 23 | 24.00 | 29 | ▃▁▂▆▇ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
published_at | 16 | 0.94 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2013-01-31 09:13:55 | 227 |
data_clean <- youtube %>%
# Treat missing values
filter(!is.na(like_count)) %>%
# Convert to factor
mutate(category_id = factor(category_id)) %>%
# Log Transform like_count
mutate(like_count = log10(like_count + 1))
data_clean %>% select(-id) %>% explore()
data_clean %>% describe_all()
data_clean %>% describe_cat(author)
data_clean %>% select(-id) %>% explore_all(target = like_count)
Check the distribution
data_clean %>%
ggplot(aes(like_count)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
data_clean %>% count(category_id, sort = TRUE)
## # A tibble: 13 × 2
## category_id n
## <fct> <int>
## 1 24 80
## 2 23 37
## 3 22 36
## 4 1 18
## 5 17 17
## 6 2 16
## 7 10 6
## 8 25 5
## 9 27 4
## 10 15 3
## 11 19 1
## 12 26 1
## 13 29 1
data_clean %>%
group_by(category_id) %>%
summarise(avg_like_count = mean(like_count)) %>%
ungroup() %>%
slice_max(order_by = avg_like_count, n = 10) %>%
ggplot(aes(avg_like_count, fct_reorder(category_id, avg_like_count))) +
geom_col(fill = "midnightblue") +
labs(title = "Top Categories by Like Counts",
x = "Avg. Like Counts", y = NULL)
data_clean %>%
tidytext::unnest_tokens(word, title) %>%
anti_join(stop_words) %>%
group_by(word) %>%
summarise(
n = n(),
avg_like_count = mean(like_count)
) %>%
ggplot(aes(n, avg_like_count)) +
geom_hline(
yintercept = mean(data_clean$like_count), lty = 2,
color = "gray50", size = 1.5
) +
geom_jitter(color = "midnightblue", alpha = 0.7) +
geom_text(aes(label = word),
check_overlap = TRUE, family = "IBMPlexSans",
vjust = "top", hjust = "left"
) +
scale_x_log10()
## Joining with `by = join_by(word)`
Examine logical variables: The plot shows these logical variables have no predictive power.
data_clean %>%
select(like_count, funny:use_sex) %>%
pivot_longer(cols = funny:use_sex) %>%
# Plot
ggplot(aes(value, like_count)) +
geom_boxplot() +
facet_wrap(~name)
The plot shows brand may be a good predictor.
data_clean %>%
ggplot(aes(brand, like_count)) +
geom_boxplot()
It may be a good predictor.
data_clean %>%
tidytext::unnest_tokens(word, description) %>%
anti_join(stop_words) %>%
group_by(word) %>%
summarise(
n = n(),
avg_like_count = mean(like_count)
) %>%
ggplot(aes(n, avg_like_count)) +
geom_hline(
yintercept = mean(data_clean$like_count), lty = 2,
color = "gray50", linewidth = 1.5
) +
geom_jitter(color = "midnightblue", alpha = 0.7) +
geom_text(aes(label = word),
check_overlap = TRUE, family = "IBMPlexSans",
vjust = "top", hjust = "left"
) +
scale_x_log10()
## Joining with `by = join_by(word)`
data_clean %>% count(channel_title, sort = T)
## # A tibble: 181 × 2
## channel_title n
## <chr> <int>
## 1 NFL 5
## 2 omon007 5
## 3 BudBowlXLII 4
## 4 reggiep08v2 4
## 5 Coca-Cola 3
## 6 Funny Commercials 3
## 7 John Keehler 3
## 8 The Hall of Advertising 3
## 9 USA TODAY 3
## 10 World Hyundai Matteson 3
## # ℹ 171 more rows
data_clean %>%
group_by(channel_title) %>%
summarise(avg_like_count = mean(like_count)) %>%
ungroup() %>%
slice_max(order_by = avg_like_count, n = 10) %>%
ggplot(aes(avg_like_count, fct_reorder(channel_title, avg_like_count))) +
geom_col(fill = "midnightblue") +
labs(title = "Top Categories by Like Counts",
x = "Avg. Like Counts", y = NULL)
data_clean %>% count(year, sort = T)
## # A tibble: 21 × 2
## year n
## <dbl> <int>
## 1 2009 15
## 2 2012 15
## 3 2007 14
## 4 2013 14
## 5 2008 13
## 6 2010 13
## 7 2018 13
## 8 2001 12
## 9 2019 12
## 10 2004 11
## # ℹ 11 more rows
data_clean %>%
ggplot(aes(year)) +
geom_bar()
data_clean %>%
group_by(year) %>%
summarise(avg_like_count = mean(like_count)) %>%
ungroup() %>%
# slice_max(order_by = avg_like_count, n = 10) %>%
ggplot(aes(year, avg_like_count)) +
geom_col(fill = "midnightblue") +
labs(title = "Top Categories by Like Counts",
x = "Avg. Like Counts", y = NULL)
data_processed <- data_clean %>%
select(id, like_count, category_id, description, brand, channel_title, year)
# Try with a small dataset first to be sure the code is error-free
# data_processed <- sample_n(data_processed, 100)
set.seed(123)
data_split <- initial_split(data_processed, strata = like_count)
data_train <- training(data_split)
data_test <- testing(data_split)
set.seed(234)
data_folds <- bootstraps(data_train, strata = like_count)
data_folds
## # Bootstrap sampling using stratification
## # A tibble: 25 × 2
## splits id
## <list> <chr>
## 1 <split [168/62]> Bootstrap01
## 2 <split [168/65]> Bootstrap02
## 3 <split [168/58]> Bootstrap03
## 4 <split [168/59]> Bootstrap04
## 5 <split [168/62]> Bootstrap05
## 6 <split [168/66]> Bootstrap06
## 7 <split [168/58]> Bootstrap07
## 8 <split [168/67]> Bootstrap08
## 9 <split [168/67]> Bootstrap09
## 10 <split [168/62]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)
use_xgboost(like_count ~., data = data_train)
## xgboost_recipe <-
## recipe(formula = like_count ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(12984)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_recipe <-
recipe(formula = like_count ~ ., data = data_train) %>%
recipes::update_role(id, new_role = "id") %>%
step_tokenize(description) %>%
step_stopwords(description)%>%
step_tokenfilter(description, max_tokens = 100) %>%
step_tfidf(description) %>%
step_other(channel_title, threshold = 0.01) %>%
step_dummy(all_nominal_predictors())
xgboost_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()
## Rows: 168
## Columns: 142
## $ id <fct> J0xugdotpp8, agISXMN4tng, be5DH6…
## $ year <dbl> 2020, 2010, 2018, 2013, 2010, 20…
## $ like_count <dbl> 0.9030900, 1.0000000, 0.4771213,…
## $ tfidf_description_2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2012 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2013 <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_2014 <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_2018 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2019 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_2020 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_ad <dbl> 0.0000000, 0.2079442, 0.0000000,…
## $ tfidf_description_ads <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_arnold <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_beer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_ben <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_best <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_big <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_bowl <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_bud <dbl> 0.00000000, 0.19459101, 0.000000…
## $ tfidf_description_budweiser <dbl> 2.44234704, 0.00000000, 0.000000…
## $ tfidf_description_callner <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_can <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_car <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_channel <dbl> 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_description_check <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_coca <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_cola <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_comedy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_commercial <dbl> 0.00000000, 0.12656664, 1.265666…
## $ tfidf_description_commercials <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_compilation <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_copyright <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_day <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_december <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_directed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_director <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_doritos <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_e <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_epic <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_fail <dbl> 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_description_fails <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_failure <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_favorite <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_film <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_first <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_follow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_full <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_funniest <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_funny <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_game <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_go <dbl> 0.000000000, 0.000000000, 0.0000…
## $ tfidf_description_good <dbl> 0.000000, 0.000000, 0.000000, 0.…
## $ tfidf_description_hilarious <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_http <dbl> 0.0000000, 0.3718635, 0.0000000,…
## $ tfidf_description_https <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_hyundai <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_instagram <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_january <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_kia <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_light <dbl> 0.00000000, 0.40876287, 0.000000…
## $ tfidf_description_like <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_lol <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_man <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_method <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_monthly <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_music <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_new <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_nfl <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_noob <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_one <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_ownage <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_owned <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_pepsi <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_producer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_production <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_scenes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_see <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_spot <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_subscribe <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_super <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ tfidf_description_superbowl <dbl> 0.00000000, 0.25649494, 0.000000…
## $ tfidf_description_supervisor <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_team <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_today <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_top <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_toyota <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_twitter <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_twitter.com <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_us <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_use <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_v <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_videos <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_viewed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_visit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_watch <dbl> 0.00000, 0.00000, 0.00000, 0.000…
## $ tfidf_description_world <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_www.facebook.com <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_www.nfl.com <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_www.youtube.com <dbl> 0.0000000, 0.4884694, 0.0000000,…
## $ tfidf_description_xli <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ tfidf_description_xlviii <dbl> 0.00000000, 0.00000000, 0.000000…
## $ tfidf_description_year <dbl> 0.0000000, 0.0000000, 0.0000000,…
## $ category_id_X2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X17 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X19 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X22 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ category_id_X23 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,…
## $ category_id_X24 <dbl> 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,…
## $ category_id_X25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X26 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X27 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ category_id_X29 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Budweiser <dbl> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,…
## $ brand_Coca.Cola <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ brand_Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_E.Trade <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,…
## $ brand_Hynudai <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ brand_Toyota <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_alohawarriorchief <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_BudBowlXLII <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_ClearBroadcasting <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Coca.Cola <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Funny.Commercials <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Iulian.Craciun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_omon007 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Peek.of.the.Net <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_reggiep08v2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_SuperBowlsSpots <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_The.Hall.of.Advertising <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Tomi.Jaya <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_Trailer.Tube <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ channel_title_Unknown.Classics. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_USA.TODAY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_viralstuff <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_World.Hyundai.Matteson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ channel_title_other <dbl> 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,…
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(15793)
doParallel::registerDoParallel()
xgboost_tune <-
tune_grid(xgboost_workflow,
resamples = data_folds,
grid = 10)
show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 1719 12 5 0.00252 rmse standard 1.05 25 0.0162
## 2 356 38 7 0.0248 rmse standard 1.08 25 0.0183
## 3 1461 7 3 0.00631 rmse standard 1.08 25 0.0123
## 4 1819 3 13 0.0166 rmse standard 1.13 25 0.0193
## 5 662 17 2 0.0747 rmse standard 1.16 25 0.0156
## # ℹ 1 more variable: .config <chr>
# How did all the possible parameter combinations do?
autoplot(xgboost_tune)
We can finalize our random forest workflow with the best performing parameters.
final_rf <- xgboost_workflow %>%
finalize_workflow(select_best(xgboost_tune, "rmse"))
The function last_fit() fits this finalized random forest one last time to the training data and evaluates one last time on the testing data.
data_fit <- last_fit(final_rf, data_split)
data_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [168/57]> train/test split <tibble> <tibble> <tibble> <workflow>
collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.972 Preprocessor1_Model1
## 2 rsq standard 0.259 Preprocessor1_Model1
collect_predictions(data_fit)
## # A tibble: 57 × 5
## id .pred .row like_count .config
## <chr> <dbl> <int> <dbl> <chr>
## 1 train/test split 2.80 2 2.69 Preprocessor1_Model1
## 2 train/test split 2.27 3 2.11 Preprocessor1_Model1
## 3 train/test split 1.42 4 0.477 Preprocessor1_Model1
## 4 train/test split 1.72 5 1.32 Preprocessor1_Model1
## 5 train/test split 1.79 9 2.54 Preprocessor1_Model1
## 6 train/test split 0.936 28 1.36 Preprocessor1_Model1
## 7 train/test split 1.80 30 2.37 Preprocessor1_Model1
## 8 train/test split 2.27 31 2.98 Preprocessor1_Model1
## 9 train/test split 2.46 32 1.61 Preprocessor1_Model1
## 10 train/test split 2.57 33 2.08 Preprocessor1_Model1
## # ℹ 47 more rows
collect_predictions(data_fit) %>%
ggplot(aes(like_count, .pred)) +
geom_point(alpha = 0.5, fill = "midnightblue") +
geom_abline(lty = 2, color = "gray50") +
coord_fixed()