Goal: What factors lead to the most youtube likes?
likes <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv')
skimr::skim(likes)
Name | likes |
Number of rows | 247 |
Number of columns | 25 |
_______________________ | |
Column type frequency: | |
character | 10 |
logical | 7 |
numeric | 7 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
brand | 0 | 1.00 | 3 | 9 | 0 | 10 | 0 |
superbowl_ads_dot_com_url | 0 | 1.00 | 34 | 120 | 0 | 244 | 0 |
youtube_url | 11 | 0.96 | 43 | 43 | 0 | 233 | 0 |
id | 11 | 0.96 | 11 | 11 | 0 | 233 | 0 |
kind | 16 | 0.94 | 13 | 13 | 0 | 1 | 0 |
etag | 16 | 0.94 | 27 | 27 | 0 | 228 | 0 |
title | 16 | 0.94 | 6 | 99 | 0 | 228 | 0 |
description | 50 | 0.80 | 3 | 3527 | 0 | 194 | 0 |
thumbnail | 129 | 0.48 | 48 | 48 | 0 | 118 | 0 |
channel_title | 16 | 0.94 | 3 | 37 | 0 | 185 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
funny | 0 | 1 | 0.69 | TRU: 171, FAL: 76 |
show_product_quickly | 0 | 1 | 0.68 | TRU: 169, FAL: 78 |
patriotic | 0 | 1 | 0.17 | FAL: 206, TRU: 41 |
celebrity | 0 | 1 | 0.29 | FAL: 176, TRU: 71 |
danger | 0 | 1 | 0.30 | FAL: 172, TRU: 75 |
animals | 0 | 1 | 0.37 | FAL: 155, TRU: 92 |
use_sex | 0 | 1 | 0.27 | FAL: 181, TRU: 66 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1.00 | 2010.19 | 5.86 | 2000 | 2005 | 2010 | 2015.00 | 2020 | ▇▇▇▇▆ |
view_count | 16 | 0.94 | 1407556.46 | 11971111.01 | 10 | 6431 | 41379 | 170015.50 | 176373378 | ▇▁▁▁▁ |
like_count | 22 | 0.91 | 4146.03 | 23920.40 | 0 | 19 | 130 | 527.00 | 275362 | ▇▁▁▁▁ |
dislike_count | 22 | 0.91 | 833.54 | 6948.52 | 0 | 1 | 7 | 24.00 | 92990 | ▇▁▁▁▁ |
favorite_count | 16 | 0.94 | 0.00 | 0.00 | 0 | 0 | 0 | 0.00 | 0 | ▁▁▇▁▁ |
comment_count | 25 | 0.90 | 188.64 | 986.46 | 0 | 1 | 10 | 50.75 | 9190 | ▇▁▁▁▁ |
category_id | 16 | 0.94 | 19.32 | 8.00 | 1 | 17 | 23 | 24.00 | 29 | ▃▁▂▆▇ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
published_at | 16 | 0.94 | 2006-02-06 10:02:36 | 2021-01-27 13:11:29 | 2013-01-31 09:13:55 | 227 |
Youtube <- likes %>%
select(-thumbnail, -superbowl_ads_dot_com_url, -youtube_url, -dislike_count, -view_count, -comment_count, -description) %>%
na.omit() %>%
# Transform data
mutate(like_count = log(like_count + 1)) %>%
mutate(across(where(is.logical), as.factor)) %>%
mutate(across(where(is.character), as.factor)) %>%
# Keep title as character as it's not a categorical variable, but a string
mutate(title = as.character(title)) %>%
mutate(category_id = as.factor(category_id))
Identify Good predictors
funny
Youtube %>%
ggplot(aes(like_count, as.numeric(funny))) +
scale_x_log10() +
geom_point()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
brand
Youtube %>%
ggplot(aes(x = brand, y = like_count)) +
scale_y_log10() +
geom_boxplot()
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
title
Youtube %>%
# Group by brand
group_by(brand) %>%
# Calculate average like count per brand
summarize(like_count = mean(like_count, na.rm = TRUE),
n = n()) %>%
# Calculate brands with highest like count
filter(n > 10) %>%
slice_max(order_by = like_count, n = 20) %>%
# Plot
ggplot(aes(x = like_count, y = fct_reorder(brand, like_count))) +
geom_point() +
labs(
title = "Top 6 Brands with Highest Avg Like Count",
x = "Average Like Count",
y = "Brand"
)
EDA Shortcut
data_binarized_table <- Youtube %>%
select(-id, -kind, -etag, -published_at, -channel_title, category_id, -title) %>%
binarize()
data_binarized_table %>% glimpse
## Rows: 225
## Columns: 43
## $ `year__-Inf_2005` <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ year__2005_2010 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ year__2010_2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ year__2015_Inf <dbl> 1, 1, 0, 1, 0, 1, 1, 1, …
## $ brand__Bud_Light <dbl> 0, 1, 1, 0, 1, 0, 0, 0, …
## $ brand__Budweiser <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__Coca-Cola` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, …
## $ brand__Doritos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `brand__E-Trade` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Hynudai <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ brand__Kia <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ brand__NFL <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Pepsi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ brand__Toyota <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ funny__FALSE <dbl> 1, 0, 0, 1, 0, 0, 0, 1, …
## $ funny__TRUE <dbl> 0, 1, 1, 0, 1, 1, 1, 0, …
## $ show_product_quickly__FALSE <dbl> 1, 0, 1, 0, 0, 0, 1, 1, …
## $ show_product_quickly__TRUE <dbl> 0, 1, 0, 1, 1, 1, 0, 0, …
## $ patriotic__FALSE <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ patriotic__TRUE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ celebrity__FALSE <dbl> 1, 0, 1, 1, 1, 0, 0, 0, …
## $ celebrity__TRUE <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ danger__FALSE <dbl> 1, 0, 0, 1, 0, 0, 1, 1, …
## $ danger__TRUE <dbl> 0, 1, 1, 0, 1, 1, 0, 0, …
## $ animals__FALSE <dbl> 1, 1, 0, 1, 0, 0, 0, 1, …
## $ animals__TRUE <dbl> 0, 0, 1, 0, 1, 1, 1, 0, …
## $ use_sex__FALSE <dbl> 1, 1, 1, 1, 0, 1, 1, 1, …
## $ use_sex__TRUE <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ `like_count__-Inf_2.99573227355399` <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ like_count__2.99573227355399_4.87519732320115 <dbl> 0, 0, 1, 0, 1, 1, 0, 1, …
## $ like_count__4.87519732320115_6.26909628370626 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ like_count__6.26909628370626_Inf <dbl> 1, 0, 0, 0, 0, 0, 1, 0, …
## $ category_id__1 <dbl> 1, 0, 0, 0, 0, 1, 0, 0, …
## $ category_id__2 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, …
## $ category_id__10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__15 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__17 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, …
## $ category_id__22 <dbl> 0, 0, 0, 1, 0, 0, 0, 0, …
## $ category_id__23 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__24 <dbl> 0, 0, 0, 0, 1, 0, 1, 0, …
## $ category_id__25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ category_id__27 <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ `category_id__-OTHER` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
# Step 2: Correlate
data_corr_table <- data_binarized_table %>%
correlate( like_count__6.26909628370626_Inf)
data_corr_table
## # A tibble: 43 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 like_count 6.26909628370626_Inf 1
## 2 like_count -Inf_2.99573227355399 -0.339
## 3 like_count 4.87519732320115_6.26909628370626 -0.331
## 4 like_count 2.99573227355399_4.87519732320115 -0.327
## 5 brand Doritos 0.281
## 6 brand NFL 0.250
## 7 brand Bud_Light -0.212
## 8 year 2015_Inf 0.202
## 9 year -Inf_2005 -0.193
## 10 category_id 1 0.133
## # ℹ 33 more rows
# Step 3: Plot
data_corr_table %>%
plot_correlation_funnel()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Split into train and test data set
set.seed(123)
youtube_split <- initial_split(Youtube)
youtube_train <- training(youtube_split)
youtube_test <- testing(youtube_split)
# Further split training data set for cross-validation
set.seed(234)
youtube_cv <- vfold_cv(youtube_train)
youtube_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [151/17]> Fold01
## 2 <split [151/17]> Fold02
## 3 <split [151/17]> Fold03
## 4 <split [151/17]> Fold04
## 5 <split [151/17]> Fold05
## 6 <split [151/17]> Fold06
## 7 <split [151/17]> Fold07
## 8 <split [151/17]> Fold08
## 9 <split [152/16]> Fold09
## 10 <split [152/16]> Fold10
library(usemodels)
usemodels::use_xgboost(like_count~., data = youtube_train)
## xgboost_recipe <-
## recipe(formula = like_count ~ ., data = youtube_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(18555)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_youtube_recipe1 <-
recipe(like_count ~ title + category_id + brand + funny, data = youtube_train) %>%
step_other(category_id, threshold = 0.05) %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tfidf(title) %>%
step_dummy(category_id, brand, funny, one_hot = TRUE) %>%
step_zv(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())
xgboost_youtube_recipe1 %>% prep() %>% juice() %>% glimpse()
## Rows: 168
## Columns: 120
## $ like_count <dbl> 4.700480, 5.805135, 5.123964, 6.466145, 1.6094…
## $ tfidf_title_2000 <dbl> -0.1336354, -0.1336354, -0.1336354, -0.1336354…
## $ tfidf_title_2001 <dbl> -0.1296382, -0.1296382, -0.1296382, -0.1296382…
## $ tfidf_title_2002 <dbl> -0.1312009, -0.1312009, -0.1312009, 9.1635120,…
## $ tfidf_title_2005 <dbl> -0.1609131, -0.1609131, -0.1609131, -0.1609131…
## $ tfidf_title_2006 <dbl> -0.1079305, -0.1079305, 7.4472054, -0.1079305,…
## $ tfidf_title_2007 <dbl> -0.188392, -0.188392, -0.188392, -0.188392, -0…
## $ tfidf_title_2008 <dbl> -0.1091109, -0.1091109, -0.1091109, -0.1091109…
## $ tfidf_title_2009 <dbl> -0.1542525, -0.1542525, -0.1542525, -0.1542525…
## $ tfidf_title_2010 <dbl> -0.1157272, -0.1157272, -0.1157272, -0.1157272…
## $ tfidf_title_2011 <dbl> -0.1026157, -0.1026157, -0.1026157, -0.1026157…
## $ tfidf_title_2012 <dbl> -0.1659273, -0.1659273, -0.1659273, -0.1659273…
## $ tfidf_title_2013 <dbl> -0.198471, -0.198471, -0.198471, -0.198471, -0…
## $ tfidf_title_2014 <dbl> -0.256473, -0.256473, -0.256473, -0.256473, -0…
## $ tfidf_title_2015 <dbl> -0.1515659, -0.1515659, -0.1515659, -0.1515659…
## $ tfidf_title_2016 <dbl> -0.1715368, -0.1715368, -0.1715368, -0.1715368…
## $ tfidf_title_2018 <dbl> -0.196099, -0.196099, -0.196099, -0.196099, -0…
## $ tfidf_title_2019 <dbl> -0.1533443, -0.1533443, -0.1533443, -0.1533443…
## $ tfidf_title_2020 <dbl> -0.1522508, -0.1522508, -0.1522508, -0.1522508…
## $ tfidf_title_44 <dbl> -0.1257324, -0.1257324, -0.1257324, -0.1257324…
## $ tfidf_title_a <dbl> -0.2222684, -0.2222684, -0.2222684, -0.2222684…
## $ tfidf_title_ad <dbl> -0.3519862, -0.3519862, -0.3519862, -0.3519862…
## $ tfidf_title_and <dbl> -0.1321017, -0.1321017, -0.1321017, -0.1321017…
## $ tfidf_title_babies <dbl> -0.107287, -0.107287, -0.107287, -0.107287, -0…
## $ tfidf_title_baby <dbl> -0.1171696, -0.1171696, -0.1171696, -0.1171696…
## $ tfidf_title_beer <dbl> -0.106132, -0.106132, -0.106132, -0.106132, -0…
## $ tfidf_title_best <dbl> -0.1091918, -0.1091918, -0.1091918, -0.1091918…
## $ tfidf_title_big <dbl> -0.1548758, -0.1548758, -0.1548758, -0.1548758…
## $ tfidf_title_bowl <dbl> -0.7971586, -0.7971586, 0.7201022, -0.7971586,…
## $ tfidf_title_britney <dbl> -0.1079305, -0.1079305, -0.1079305, -0.1079305…
## $ tfidf_title_bud <dbl> -0.4447197, -0.4447197, 0.6695668, -0.4447197,…
## $ tfidf_title_budweiser <dbl> -0.3463570, -0.3463570, -0.3463570, 1.1177438,…
## $ tfidf_title_cedric <dbl> -0.186015, -0.186015, -0.186015, -0.186015, 4.…
## $ tfidf_title_cindy <dbl> -0.1091109, -0.1091109, -0.1091109, -0.1091109…
## $ tfidf_title_clydesdale <dbl> -0.1304693, -0.1304693, -0.1304693, -0.1304693…
## $ tfidf_title_coca <dbl> -0.2648954, 4.1042070, -0.2648954, -0.2648954,…
## $ tfidf_title_coke <dbl> -0.1514581, -0.1514581, -0.1514581, -0.1514581…
## $ tfidf_title_cola <dbl> -0.2648954, 4.1042070, -0.2648954, -0.2648954,…
## $ tfidf_title_commercial <dbl> 1.4121698, -0.8403560, -0.8403560, -0.8403560,…
## $ tfidf_title_commercials <dbl> -0.1518487, -0.1518487, -0.1518487, -0.1518487…
## $ tfidf_title_cool <dbl> -0.1094371, -0.1094371, -0.1094371, -0.1094371…
## $ tfidf_title_crash <dbl> -0.1554245, -0.1554245, -0.1554245, -0.1554245…
## $ tfidf_title_date <dbl> -0.107287, -0.107287, -0.107287, -0.107287, -0…
## $ tfidf_title_diet <dbl> -0.1292761, -0.1292761, -0.1292761, -0.1292761…
## $ tfidf_title_dilly <dbl> -0.07715167, -0.07715167, -0.07715167, -0.0771…
## $ tfidf_title_dog <dbl> -0.107287, -0.107287, -0.107287, -0.107287, -0…
## $ tfidf_title_doritos <dbl> -0.2927531, -0.2927531, -0.2927531, -0.2927531…
## $ tfidf_title_down <dbl> -0.1020634, -0.1020634, -0.1020634, -0.1020634…
## $ tfidf_title_e <dbl> -0.1731492, -0.1731492, -0.1731492, -0.1731492…
## $ tfidf_title_elantra <dbl> -0.1091109, -0.1091109, -0.1091109, -0.1091109…
## $ tfidf_title_epic <dbl> -0.106132, -0.106132, -0.106132, -0.106132, -0…
## $ tfidf_title_etrade <dbl> -0.1244886, -0.1244886, -0.1244886, -0.1244886…
## $ tfidf_title_extended <dbl> -0.1307519, -0.1307519, -0.1307519, -0.1307519…
## $ tfidf_title_factory <dbl> -0.1079305, -0.1079305, -0.1079305, -0.1079305…
## $ tfidf_title_fantasy <dbl> -0.1557083, -0.1557083, -0.1557083, -0.1557083…
## $ tfidf_title_featuring <dbl> -0.1037587, -0.1037587, -0.1037587, -0.1037587…
## $ tfidf_title_flavor <dbl> -0.09088885, -0.09088885, -0.09088885, -0.0908…
## $ tfidf_title_funny <dbl> 9.3221631, -0.1436779, -0.1436779, -0.1436779,…
## $ tfidf_title_game <dbl> -0.1738631, -0.1738631, -0.1738631, -0.1738631…
## $ tfidf_title_genesis <dbl> -0.1055369, -0.1055369, -0.1055369, -0.1055369…
## $ tfidf_title_girlfriend <dbl> -0.1083241, -0.1083241, -0.1083241, -0.1083241…
## $ tfidf_title_halftime <dbl> -0.07715167, -0.07715167, -0.07715167, -0.0771…
## $ tfidf_title_hd <dbl> -0.2088777, -0.2088777, -0.2088777, -0.2088777…
## $ tfidf_title_hyundai <dbl> -0.2503603, -0.2503603, -0.2503603, -0.2503603…
## $ tfidf_title_inside <dbl> -0.09776579, -0.09776579, -0.09776579, -0.0977…
## $ tfidf_title_is <dbl> -0.1498258, -0.1498258, -0.1498258, -0.1498258…
## $ tfidf_title_island <dbl> -0.1557083, -0.1557083, -0.1557083, -0.1557083…
## $ tfidf_title_it <dbl> -0.1015255, 11.8378694, -0.1015255, -0.1015255…
## $ tfidf_title_journey <dbl> -0.1015255, -0.1015255, -0.1015255, -0.1015255…
## $ tfidf_title_kia <dbl> -0.2145999, -0.2145999, -0.2145999, -0.2145999…
## $ tfidf_title_king <dbl> -0.1083241, -0.1083241, -0.1083241, -0.1083241…
## $ tfidf_title_legends <dbl> -0.1315009, -0.1315009, -0.1315009, -0.1315009…
## $ tfidf_title_light <dbl> -0.4205641, -0.4205641, 0.7320422, -0.4205641,…
## $ tfidf_title_lighta <dbl> -0.1557083, -0.1557083, -0.1557083, -0.1557083…
## $ tfidf_title_love <dbl> -0.1120992, -0.1120992, -0.1120992, -0.1120992…
## $ tfidf_title_meter <dbl> -0.1087598, -0.1087598, -0.1087598, -0.1087598…
## $ tfidf_title_monkey <dbl> -0.1094371, -0.1094371, -0.1094371, -0.1094371…
## $ tfidf_title_new <dbl> -0.1629016, -0.1629016, -0.1629016, -0.1629016…
## $ tfidf_title_nfl <dbl> -0.1035113, -0.1035113, -0.1035113, -0.1035113…
## $ tfidf_title_of <dbl> -0.1155596, -0.1155596, -0.1155596, -0.1155596…
## $ tfidf_title_official <dbl> -0.204087, -0.204087, -0.204087, -0.204087, -0…
## $ tfidf_title_on <dbl> -0.1091109, -0.1091109, 8.3511788, -0.1091109,…
## $ tfidf_title_one <dbl> -0.1015255, -0.1015255, -0.1015255, -0.1015255…
## $ tfidf_title_party <dbl> -0.1020634, -0.1020634, -0.1020634, -0.1020634…
## $ tfidf_title_pepsi <dbl> 2.3489833, -0.2902986, -0.2902986, -0.2902986,…
## $ tfidf_title_puppy <dbl> -0.07715167, -0.07715167, -0.07715167, -0.0771…
## $ tfidf_title_respect <dbl> -0.09776579, -0.09776579, -0.09776579, 12.2207…
## $ tfidf_title_spot <dbl> -0.1164506, -0.1164506, -0.1164506, -0.1164506…
## $ tfidf_title_starring <dbl> -0.123236, -0.123236, -0.123236, -0.123236, -0…
## $ tfidf_title_super <dbl> -0.7971586, -0.7971586, 0.7201022, -0.7971586,…
## $ tfidf_title_superbowl <dbl> -0.2286568, -0.2286568, -0.2286568, -0.2286568…
## $ tfidf_title_team <dbl> -0.1292761, -0.1292761, -0.1292761, -0.1292761…
## $ tfidf_title_the <dbl> -0.3171074, -0.3171074, 1.6224815, -0.3171074,…
## $ tfidf_title_toyota <dbl> -0.1975127, -0.1975127, -0.1975127, -0.1975127…
## $ tfidf_title_trade <dbl> -0.1731492, -0.1731492, -0.1731492, -0.1731492…
## $ tfidf_title_tv <dbl> -0.1500656, -0.1500656, -0.1500656, -0.1500656…
## $ tfidf_title_usa <dbl> -0.1730928, -0.1730928, -0.1730928, 6.9776275,…
## $ tfidf_title_vs <dbl> -0.132349, -0.132349, -0.132349, -0.132349, -0…
## $ tfidf_title_winner <dbl> -0.1554245, -0.1554245, -0.1554245, -0.1554245…
## $ tfidf_title_with <dbl> -0.1510437, -0.1510437, -0.1510437, -0.1510437…
## $ tfidf_title_xliii <dbl> -0.1294588, -0.1294588, -0.1294588, -0.1294588…
## $ category_id_X1 <dbl> -0.2887416, -0.2887416, -0.2887416, -0.2887416…
## $ category_id_X2 <dbl> -0.2372063, -0.2372063, -0.2372063, -0.2372063…
## $ category_id_X17 <dbl> -0.2765234, -0.2765234, -0.2765234, -0.2765234…
## $ category_id_X22 <dbl> 2.2294031, 2.2294031, -0.4458806, -0.4458806, …
## $ category_id_X23 <dbl> -0.4458806, -0.4458806, 2.2294031, -0.4458806,…
## $ category_id_X24 <dbl> -0.7527948, -0.7527948, -0.7527948, 1.3204762,…
## $ category_id_other <dbl> -0.3345335, -0.3345335, -0.3345335, -0.3345335…
## $ brand_Bud.Light <dbl> -0.5298694, -0.5298694, -0.5298694, -0.5298694…
## $ brand_Budweiser <dbl> -0.4266248, -0.4266248, 2.3300277, 2.3300277, …
## $ brand_Coca.Cola <dbl> -0.3234758, 3.0730201, -0.3234758, -0.3234758,…
## $ brand_Doritos <dbl> -0.3870245, -0.3870245, -0.3870245, -0.3870245…
## $ brand_E.Trade <dbl> -0.2229403, -0.2229403, -0.2229403, -0.2229403…
## $ brand_Hynudai <dbl> -0.3560305, -0.3560305, -0.3560305, -0.3560305…
## $ brand_Kia <dbl> -0.2372063, -0.2372063, -0.2372063, -0.2372063…
## $ brand_NFL <dbl> -0.1746203, -0.1746203, -0.1746203, -0.1746203…
## $ brand_Pepsi <dbl> 2.7920290, -0.3560305, -0.3560305, -0.3560305,…
## $ brand_Toyota <dbl> -0.2078929, -0.2078929, -0.2078929, -0.2078929…
## $ funny_FALSE. <dbl> -0.6582574, 1.5101199, -0.6582574, 1.5101199, …
## $ funny_TRUE. <dbl> 0.6582574, -1.5101199, 0.6582574, -1.5101199, …
# Specify Model
xgboost_spec_youtube1 <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune(), sample_size = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
# Combine Recipe and Model Using Workflow
xgboost_workflow_youtube1 <-
workflow() %>%
add_recipe(xgboost_youtube_recipe1) %>%
add_model(xgboost_spec_youtube1)
# Tune Hyperparameters
set.seed(678)
tuned_youtube1 <-
tune_grid(xgboost_workflow_youtube1,
resamples = youtube_cv,
grid = 5)
tune::show_best(tuned_youtube1, metric = "rmse")
## # A tibble: 5 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 403 5 1 0.0243 1.15e- 6 0.201 rmse
## 2 861 39 6 0.00296 7.32e+ 0 0.697 rmse
## 3 1777 18 8 0.152 1.91e- 5 0.380 rmse
## 4 390 30 13 0.00459 8.81e- 2 0.465 rmse
## 5 1205 13 10 0.0559 3.00e-10 0.828 rmse
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
# Update model by selecting best hyperparameters
xgboost_fw_youtube1 <- tune::finalize_workflow(xgboost_workflow_youtube1,
tune::select_best(tuned_youtube1, metric = "rmse"))
# Fit model on entire training data and test it on test data
youtube_fit <- tune::last_fit(xgboost_fw_youtube1, youtube_split)
tune::collect_metrics(youtube_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 2.62 Preprocessor1_Model1
## 2 rsq standard 0.0668 Preprocessor1_Model1
tune::collect_predictions(youtube_fit) %>%
ggplot(aes(like_count, .pred)) +
geom_point(alpha = 0.3, fill = "midnightblue") +
geom_abline(lty = 2, color = "gray50") +
coord_fixed()